add filter algorithms

This commit is contained in:
keepcosmos 2016-04-17 15:28:33 +09:00
parent d9f8b5d36f
commit 4e4a712718
12 changed files with 3482 additions and 2 deletions

View File

@ -6,5 +6,5 @@ defmodule Readability do
def title(html) when is_binary(html), do: parse(html) |> title def title(html) when is_binary(html), do: parse(html) |> title
def title(html_tree), do: TitleFinder.title(html_tree) def title(html_tree), do: TitleFinder.title(html_tree)
defp parse(raw_html), do: Floki.parse(raw_html) def parse(raw_html), do: Floki.parse(raw_html)
end end

View File

@ -0,0 +1,94 @@
defmodule Readability.ContentFinder do
  @moduledoc """
  ContentFinder uses a variety of metrics for finding the content
  that is most likely to be the stuff a user wants to read.
  Then return it wrapped up in a div.
  """

  @regexes [
    unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
    positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
    divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
    replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
    trimRe: ~r/^\s+|\s+$/,
    normalizeRe: ~r/\s{2,}/,
    killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
    videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
  ]

  @type html_tree :: tuple | list

  @doc """
  Runs the content-extraction pipeline over a parsed HTML tree.

  The tree is stripped of `script`/`style` nodes and unlikely candidates,
  misused `div` tags are turned into paragraphs, and the result is passed
  through best-candidate selection and relative-URI fixing. Both of those
  last two steps are currently pass-through placeholders in this module.

  The second argument (options) is accepted but not used yet.
  """
  @spec content(html_tree) :: html_tree
  @spec content(html_tree, Keyword.t()) :: html_tree
  def content(html_tree, _options \\ []) do
    html_tree
    |> prepare_candidates()
    |> select_best_candidate()
    |> fix_relative_uris()
  end

  defp prepare_candidates(html_tree) do
    html_tree
    |> Floki.filter_out("script")
    |> Floki.filter_out("style")
    |> remove_unlikely_candidates()
    |> transform_misused_divs_into_paragraphs()
  end

  @doc """
  Remove unlikely tag nodes.

  Walks the tree and drops every element whose tag name plus `id`/`class`
  values match the "unlikely candidate" pattern without also matching the
  "maybe a candidate" pattern. The `html` element is never dropped.

  Text nodes and non-element nodes (comments, doctype, ...) are returned
  untouched. Returns `nil` when the given root element itself is removed.
  """
  @spec remove_unlikely_candidates(html_tree | binary) :: html_tree | binary | nil
  def remove_unlikely_candidates(content) when is_binary(content), do: content
  def remove_unlikely_candidates([]), do: []

  def remove_unlikely_candidates([h | t]) do
    case remove_unlikely_candidates(h) do
      # a removed node simply disappears from its parent's children
      nil -> remove_unlikely_candidates(t)
      html_tree -> [html_tree | remove_unlikely_candidates(t)]
    end
  end

  def remove_unlikely_candidates({tag_name, attrs, inner_tree}) when is_binary(tag_name) do
    if unlikely_candidate?(tag_name, attrs) do
      nil
    else
      {tag_name, attrs, remove_unlikely_candidates(inner_tree)}
    end
  end

  # Comments, doctype and other non-element nodes carry no candidate
  # information; keep them instead of raising FunctionClauseError.
  def remove_unlikely_candidates(other), do: other

  defp unlikely_candidate?("html", _attrs), do: false

  defp unlikely_candidate?(tag_name, attrs) do
    # Only the values of the `id` and `class` attributes take part in the
    # heuristic. Matching the attribute name exactly avoids picking up
    # unrelated attributes such as `width` or `data-widget`, whose names
    # merely contain "id".
    idclass_str =
      attrs
      |> Enum.filter(fn {name, _value} -> String.downcase(name) in ["id", "class"] end)
      |> Enum.map_join("", fn {_name, value} -> value end)

    str = tag_name <> idclass_str

    str =~ @regexes[:unlikelyCandidatesRe] and not (str =~ @regexes[:okMaybeItsACandidateRe])
  end

  @doc """
  Turns `div` elements that are used like paragraphs into `p` elements.

  A `div` counts as misused when its rendered inner HTML contains none of
  the block-level tags listed in the `divToPElementsRe` pattern. Text nodes
  and non-element nodes are returned untouched.
  """
  @spec transform_misused_divs_into_paragraphs(html_tree | binary) :: html_tree | binary
  def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content
  def transform_misused_divs_into_paragraphs([]), do: []

  def transform_misused_divs_into_paragraphs([h | t]) do
    [transform_misused_divs_into_paragraphs(h) | transform_misused_divs_into_paragraphs(t)]
  end

  def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree})
      when is_binary(tag_name) do
    # Bind the result of the `if`: a rebinding made inside the `if` body
    # is not visible after it, so the tag would otherwise never change.
    new_tag = if misused_divs?(tag_name, inner_tree), do: "p", else: tag_name
    {new_tag, attrs, transform_misused_divs_into_paragraphs(inner_tree)}
  end

  # Comments, doctype and other non-element nodes are passed through.
  def transform_misused_divs_into_paragraphs(other), do: other

  defp misused_divs?("div", inner_tree) do
    !(Floki.raw_html(inner_tree) =~ @regexes[:divToPElementsRe])
  end

  defp misused_divs?(_, _), do: false

  # Placeholder: candidate scoring is not implemented yet.
  defp select_best_candidate(html_tree) do
    html_tree
  end

  # Placeholder: relative URI rewriting is not implemented yet.
  defp fix_relative_uris(html_tree) do
    html_tree
  end
end

25
lib/readability/helper.ex Normal file
View File

@ -0,0 +1,25 @@
defmodule Readability.Helper do
  @moduledoc """
  Utilities
  """

  @type html_tree :: tuple | list

  @doc """
  Renames every element called `tag_name` in the tree to `tag`.

  The tree is walked recursively; attributes and children of renamed
  elements are preserved. Matching is done on the exact tag name, not on
  a CSS selector. Text nodes and non-element nodes (comments, doctype, ...)
  are returned unchanged.
  """
  @spec change_tag(html_tree, String.t(), String.t()) :: html_tree
  # The repeated `tag_name` binding makes this clause match only elements
  # whose tag equals the requested name.
  def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
    {tag, attrs, change_tag(inner_tree, tag_name, tag)}
  end

  def change_tag({tag_name, attrs, html_tree}, selector, tag) when is_binary(tag_name) do
    {tag_name, attrs, change_tag(html_tree, selector, tag)}
  end

  def change_tag([h | t], selector, tag) do
    [change_tag(h, selector, tag) | change_tag(t, selector, tag)]
  end

  def change_tag([], _selector, _tag), do: []
  def change_tag(content, _selector, _tag) when is_binary(content), do: content

  # Comments, doctype and other non-element nodes have no tag to rename;
  # keep them instead of raising FunctionClauseError.
  def change_tag(other, _selector, _tag), do: other
end

View File

@ -8,6 +8,12 @@ defmodule Readability.TitleFinder do
@type html_tree :: tuple | list @type html_tree :: tuple | list
@doc """
Find proper title
"""
@spec title(html_tree) :: binary
def title(html_tree) do def title(html_tree) do
maybe_title = tag_title(html_tree) maybe_title = tag_title(html_tree)
if length(String.split(maybe_title, " ")) <= 4 do if length(String.split(maybe_title, " ")) <= 4 do

View File

@ -0,0 +1,65 @@
defmodule Readability.ContentFinderTest do
  use ExUnit.Case, async: true

  doctest Readability.ContentFinder

  # Markup mixing nodes that should be dropped (header, nav, disqus div)
  # with an article that must survive despite its "community" class.
  @unlikely_sample """
  <html>
  <body>
  <header>HEADER</header>
  <nav>NAV</nav>
  <article class="community">ARTICLE</article>
  <div class="disqus">SOCIAL</div>
  </body>
  </html>
  """

  test "remove unlikely tag nodes" do
    article = {"article", [{"class", "community"}], ["ARTICLE"]}
    expected = {"html", [], [{"body", [], [article]}]}

    parsed = Readability.parse(@unlikely_sample)
    actual = Readability.ContentFinder.remove_unlikely_candidates(parsed)

    assert expected == actual
  end

  # The first div only wraps inline content and should become a paragraph;
  # the second one already contains a paragraph and must stay a div.
  @misused_sample """
  <html>
  <body>
  <div>
  <span>here</span>
  </div>
  <div>
  <p>not here</p>
  </div>
  </body>
  </html>
  """

  test "transform misused div tag" do
    converted = {"p", [], [{"span", [], ["here"]}]}
    untouched = {"div", [], [{"p", [], ["not here"]}]}
    expected = {"html", [], [{"body", [], [converted, untouched]}]}

    parsed = Readability.parse(@misused_sample)
    actual = Readability.ContentFinder.transform_misused_divs_into_paragraphs(parsed)

    assert expected == actual
  end

  # Reads the fixture `test/fixtures/<name>.html` and returns its contents.
  def read_html(name) do
    path = "./test/fixtures/#{name}.html"
    {:ok, contents} = File.read(path)
    contents
  end
end

2066
test/fixtures/bbc.html vendored Normal file

File diff suppressed because it is too large Load Diff

13
test/fixtures/code.html vendored Normal file
View File

@ -0,0 +1,13 @@
<html>
<body>
<code><pre>
root
indented
</pre></code>
<pre><code>
second
indented
</code></pre>
</body>
</html>

58
test/fixtures/nytimes.html vendored Normal file

File diff suppressed because one or more lines are too long

1122
test/fixtures/thesun.html vendored Normal file

File diff suppressed because it is too large Load Diff

31
test/helper_text.exs Normal file
View File

@ -0,0 +1,31 @@
defmodule Readability.HelperTest do
  use ExUnit.Case, async: true

  alias Readability.Helper

  # Nested and sibling <font> elements, so the rename has to be applied at
  # every depth of the tree.
  @sample """
  <html>
  <body>
  <p>
  <font>a</font>
  <p>
  <font>abc</font>
  </p>
  </p>
  <p>
  <font>b</font>
  </p>
  </body>
  </html>
  """

  test "change font tag to span" do
    # Build the expectation by renaming the tags in the raw markup and
    # parsing that, then compare with renaming inside the parsed tree.
    expected =
      @sample
      |> String.replace("font", "span")
      |> Floki.parse()

    result =
      @sample
      |> Readability.parse()
      |> Helper.change_tag("font", "span")

    assert expected == result
  end
end

View File

@ -1,7 +1,7 @@
defmodule Readability.TitleFinderTest do defmodule Readability.TitleFinderTest do
use ExUnit.Case, async: true use ExUnit.Case, async: true
doctest Readability doctest Readability.TitleFinder
@html """ @html """
<html> <html>