add filter algorithms
This commit is contained in:
parent
d9f8b5d36f
commit
4e4a712718
|
@ -6,5 +6,5 @@ defmodule Readability do
|
||||||
def title(html) when is_binary(html), do: parse(html) |> title
|
def title(html) when is_binary(html), do: parse(html) |> title
|
||||||
def title(html_tree), do: TitleFinder.title(html_tree)
|
def title(html_tree), do: TitleFinder.title(html_tree)
|
||||||
|
|
||||||
defp parse(raw_html), do: Floki.parse(raw_html)
|
def parse(raw_html), do: Floki.parse(raw_html)
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,94 @@
|
||||||
|
defmodule Readability.ContentFinder do
  @moduledoc """
  ContentFinder uses a variety of metrics for finding the content
  that is most likely to be the stuff a user wants to read.
  Then return it wrapped up in a div.
  """

  # Named regex table. This module currently reads only :unlikelyCandidatesRe,
  # :okMaybeItsACandidateRe and :divToPElementsRe; the other entries are not
  # referenced here.
  # NOTE(review): the `&nbsp;?` alternative in :killBreaksRe was reconstructed from
  # a rendered entity — confirm against the upstream source.
  @regexes [
    unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
    positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
    divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
    replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
    trimRe: ~r/^\s+|\s+$/,
    normalizeRe: ~r/\s{2,}/,
    killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
    videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
  ]

  @type html_tree :: tuple | list

  @doc """
  Runs the content-extraction pipeline over a parsed HTML tree.

  The tree is cleaned (script/style removal, unlikely-candidate removal,
  misused-div conversion), then passed through best-candidate selection and
  relative-URI fixing. The last two steps currently return their input
  unchanged. `options` is accepted but not used yet.
  """
  @spec content(html_tree) :: html_tree
  @spec content(html_tree, Keyword.t) :: html_tree
  def content(html_tree, _options \\ []) do
    html_tree
    |> prepare_candidates
    |> select_best_candidate
    |> fix_relative_uris
  end

  # Strips <script>/<style> nodes, drops unlikely candidates and turns
  # block-free divs into paragraphs.
  defp prepare_candidates(html_tree) do
    html_tree
    |> Floki.filter_out("script")
    |> Floki.filter_out("style")
    |> remove_unlikely_candidates
    |> transform_misused_divs_into_paragraphs
  end

  @doc """
  Remove unlikely tag nodes

  Walks the tree and drops every element whose tag name plus `id`/`class`
  values match the unlikely-candidates regex without also matching the
  "maybe a candidate" regex. The `html` element is never dropped.

  Text nodes and empty lists are returned as they are. An element that is
  itself unlikely yields `nil`; inside a list such elements are simply omitted.
  Any other node (for example a comment tuple, if the parser emits one) is
  passed through untouched.
  """
  @spec remove_unlikely_candidates(html_tree | binary) :: html_tree | binary | nil
  def remove_unlikely_candidates(content) when is_binary(content), do: content
  def remove_unlikely_candidates([]), do: []

  def remove_unlikely_candidates([h | t]) do
    case remove_unlikely_candidates(h) do
      # nil marks a removed element, so it is left out of the rebuilt list
      nil -> remove_unlikely_candidates(t)
      html_tree -> [html_tree | remove_unlikely_candidates(t)]
    end
  end

  def remove_unlikely_candidates({tag_name, attrs, inner_tree}) do
    if unlikely_candidate?(tag_name, attrs) do
      nil
    else
      {tag_name, attrs, remove_unlikely_candidates(inner_tree)}
    end
  end

  def remove_unlikely_candidates(other), do: other

  # True when the tag name concatenated with the element's id and class values
  # looks like page chrome rather than content. Only attributes named exactly
  # "id" or "class" (case-insensitive) contribute, so names that merely contain
  # those letters (e.g. "width") are ignored.
  defp unlikely_candidate?("html", _attrs), do: false

  defp unlikely_candidate?(tag_name, attrs) do
    id_and_class =
      attrs
      |> Enum.filter(fn {name, _value} -> String.downcase(name) in ["id", "class"] end)
      |> Enum.map_join("", fn {_name, value} -> value end)

    candidate = tag_name <> id_and_class

    candidate =~ @regexes[:unlikelyCandidatesRe] and
      not (candidate =~ @regexes[:okMaybeItsACandidateRe])
  end

  @doc """
  Renames every `div` that contains no block-level markup to `p`.

  A `div` counts as misused when the raw HTML of its children does not match
  the block-element regex (`a`, `blockquote`, `dl`, `div`, `img`, `ol`, `p`,
  `pre`, `table`, `ul`). Attributes and children are kept, and children are
  processed recursively. Text nodes, empty lists and any node that is not an
  element 3-tuple are returned unchanged.
  """
  @spec transform_misused_divs_into_paragraphs(html_tree | binary) :: html_tree | binary
  def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content
  def transform_misused_divs_into_paragraphs([]), do: []

  def transform_misused_divs_into_paragraphs([h | t]) do
    [transform_misused_divs_into_paragraphs(h) | transform_misused_divs_into_paragraphs(t)]
  end

  def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree}) do
    # The result of `if` must be bound explicitly: a rebinding made inside the
    # `if` body is not visible after it.
    new_tag = if misused_divs?(tag_name, inner_tree), do: "p", else: tag_name
    {new_tag, attrs, transform_misused_divs_into_paragraphs(inner_tree)}
  end

  def transform_misused_divs_into_paragraphs(other), do: other

  # A div is "misused" when its rendered children contain none of the
  # block-level opening tags listed in :divToPElementsRe.
  defp misused_divs?("div", inner_tree) do
    !(Floki.raw_html(inner_tree) =~ @regexes[:divToPElementsRe])
  end

  defp misused_divs?(_, _), do: false

  # Placeholder: returns the tree unchanged.
  defp select_best_candidate(html_tree) do
    html_tree
  end

  # Placeholder: returns the tree unchanged.
  defp fix_relative_uris(html_tree) do
    html_tree
  end
end
|
|
@ -0,0 +1,25 @@
|
||||||
|
defmodule Readability.Helper do
  @moduledoc """
  Utilities
  """

  @type html_tree :: tuple | list

  @doc """
  Renames every element whose tag name equals `selector` to `tag`.

  `selector` is compared against the tag name for exact equality; it is not a
  CSS selector. Attributes and children are kept, and children are processed
  recursively. Text nodes, empty lists and any node that is not an element
  3-tuple are returned unchanged.

  ## Examples

      iex> Readability.Helper.change_tag({"font", [], ["a"]}, "font", "span")
      {"span", [], ["a"]}

  """
  @spec change_tag(html_tree | String.t, String.t, String.t) :: html_tree | String.t
  def change_tag(content, _selector, _tag) when is_binary(content), do: content
  def change_tag([], _selector, _tag), do: []

  def change_tag([h | t], selector, tag) do
    [change_tag(h, selector, tag) | change_tag(t, selector, tag)]
  end

  # `tag_name` appears twice in the head, so this clause only matches when the
  # element's tag equals the requested selector.
  def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
    {tag, attrs, change_tag(inner_tree, tag_name, tag)}
  end

  def change_tag({tag_name, attrs, inner_tree}, selector, tag) do
    {tag_name, attrs, change_tag(inner_tree, selector, tag)}
  end

  def change_tag(other, _selector, _tag), do: other
end
|
|
@ -8,6 +8,12 @@ defmodule Readability.TitleFinder do
|
||||||
|
|
||||||
@type html_tree :: tuple | list
|
@type html_tree :: tuple | list
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Find proper title
|
||||||
|
"""
|
||||||
|
|
||||||
|
@spec title(html_tree) :: binary
|
||||||
|
|
||||||
def title(html_tree) do
|
def title(html_tree) do
|
||||||
maybe_title = tag_title(html_tree)
|
maybe_title = tag_title(html_tree)
|
||||||
if length(String.split(maybe_title, " ")) <= 4 do
|
if length(String.split(maybe_title, " ")) <= 4 do
|
|
@ -0,0 +1,65 @@
|
||||||
|
defmodule Readability.ContentFinderTest do
  use ExUnit.Case, async: true

  doctest Readability.ContentFinder

  # Page whose header, nav and disqus div should be dropped, while the article
  # is kept even though its class is "community".
  @unlikely_sample """
  <html>
  <body>
  <header>HEADER</header>
  <nav>NAV</nav>
  <article class="community">ARTICLE</article>
  <div class="disqus">SOCIAL</div>
  </body>
  </html>
  """

  test "remove unlikely tag nodes" do
    article = {"article", [{"class", "community"}], ["ARTICLE"]}
    expected = {"html", [], [{"body", [], [article]}]}

    parsed = Readability.parse(@unlikely_sample)
    result = Readability.ContentFinder.remove_unlikely_candidates(parsed)

    assert expected == result
  end

  # First div holds only inline content; second div already wraps a paragraph.
  @misused_sample """
  <html>
  <body>
  <div>
  <span>here</span>
  </div>
  <div>
  <p>not here</p>
  </div>
  </body>
  </html>
  """

  test "transform misused div tag" do
    converted = {"p", [], [{"span", [], ["here"]}]}
    untouched = {"div", [], [{"p", [], ["not here"]}]}
    expected = {"html", [], [{"body", [], [converted, untouched]}]}

    parsed = Readability.parse(@misused_sample)
    result = Readability.ContentFinder.transform_misused_divs_into_paragraphs(parsed)

    assert expected == result
  end

  # Reads ./test/fixtures/<name>.html and returns its contents; a failed read
  # surfaces as a match error.
  def read_html(name) do
    path = "./test/fixtures/#{name}.html"
    {:ok, contents} = File.read(path)
    contents
  end
end
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,13 @@
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<code><pre>
|
||||||
|
root
|
||||||
|
indented
|
||||||
|
</pre></code>
|
||||||
|
|
||||||
|
<pre><code>
|
||||||
|
second
|
||||||
|
indented
|
||||||
|
</code></pre>
|
||||||
|
</body>
|
||||||
|
</html>
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,31 @@
|
||||||
|
defmodule Readability.HelperTest do
  use ExUnit.Case, async: true

  import Readability, only: :functions
  alias Readability.Helper

  # Nested and sibling <font> elements, so the rename is exercised at several
  # depths of the tree.
  @sample """
  <html>
  <body>
  <p>
  <font>a</font>
  <p>
  <font>abc</font>
  </p>
  </p>
  <p>
  <font>b</font>
  </p>
  </body>
  </html>
  """

  test "change font tag to span" do
    # The expected tree is built by rewriting the markup textually and parsing
    # it, independently of Helper.change_tag/3.
    expected =
      @sample
      |> String.replace("font", "span")
      |> Floki.parse

    result = Helper.change_tag(parse(@sample), "font", "span")
    assert expected == result
  end
end
|
|
@ -1,7 +1,7 @@
|
||||||
defmodule Readability.TitleFinderTest do
|
defmodule Readability.TitleFinderTest do
|
||||||
use ExUnit.Case, async: true
|
use ExUnit.Case, async: true
|
||||||
|
|
||||||
doctest Readability
|
doctest Readability.TitleFinder
|
||||||
|
|
||||||
@html """
|
@html """
|
||||||
<html>
|
<html>
|
||||||
|
|
Loading…
Reference in New Issue