add filter algorithms
This commit is contained in:
parent
d9f8b5d36f
commit
4e4a712718
|
@ -6,5 +6,5 @@ defmodule Readability do
|
|||
# Raw HTML given as a binary: parse it into a tree, then reuse the tree clause.
def title(html) when is_binary(html) do
  html
  |> parse()
  |> title()
end

# Anything else is treated as an already-parsed tree and handed to TitleFinder.
def title(html_tree) do
  TitleFinder.title(html_tree)
end
|
||||
|
||||
defp parse(raw_html), do: Floki.parse(raw_html)
|
||||
def parse(raw_html), do: Floki.parse(raw_html)
|
||||
end
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
defmodule Readability.ContentFinder do
|
||||
@moduledoc """
|
||||
ContentFinder uses a variety of metrics for finding the content
|
||||
that is most likely to be the stuff a user wants to read.
|
||||
Then return it wrapped up in a div.
|
||||
"""
|
||||
|
||||
@regexes [ unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
||||
okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
|
||||
positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
|
||||
negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
|
||||
divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
||||
replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
|
||||
replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
|
||||
trimRe: ~r/^\s+|\s+$/,
|
||||
normalizeRe: ~r/\s{2,}/,
|
||||
killBreaksRe: ~r/(<br\s*\/?>(\s| ?)*){1,}/,
|
||||
videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
||||
]
|
||||
|
||||
@type html_tree :: tuple | list
|
||||
|
||||
@doc """
Finds the readable content of a parsed HTML tree.

The tree goes through three stages in order: `preapre_cadidates/1` cleans it
up, `select_best_candidate/1` picks the candidate, and `fix_relative_uris/1`
post-processes the result.

The second argument is an options list. It is accepted for forward
compatibility but is not read yet.
"""
@spec content(html_tree) :: html_tree
@spec content(html_tree, Keyword.t) :: html_tree

def content(html_tree, _options \\ []) do
  # Every stage must feed the next one: the value returned here has to be the
  # selected candidate, not the merely prepared tree.
  html_tree
  |> preapre_cadidates()
  |> select_best_candidate()
  |> fix_relative_uris()
end
|
||||
|
||||
# Prepares a tree for scoring: filters out "script" and "style" nodes, removes
# unlikely candidates, then converts misused divs into paragraphs (in that order).
defp preapre_cadidates(html_tree) do
  without_scripts = Floki.filter_out(html_tree, "script")
  without_styles = Floki.filter_out(without_scripts, "style")
  likely_nodes = remove_unlikely_candidates(without_styles)

  transform_misused_divs_into_paragraphs(likely_nodes)
end
|
||||
|
||||
@doc """
Remove unlikely tag nodes.

Walks the tree and drops every element flagged as an unlikely candidate,
together with everything nested inside it. Text nodes are returned as they
are. Nodes that are not elements with a string tag name (for example the
`{:comment, text}` tuples produced for HTML comments) are also returned
untouched instead of raising.

Returns `nil` when the node passed in is itself an unlikely candidate.
"""
@spec remove_unlikely_candidates(html_tree | binary) :: html_tree | binary | nil

def remove_unlikely_candidates(content) when is_binary(content), do: content

def remove_unlikely_candidates(nodes) when is_list(nodes) do
  # A removed child comes back as nil; filter those out of the sibling list.
  nodes
  |> Enum.map(&remove_unlikely_candidates/1)
  |> Enum.reject(&is_nil/1)
end

def remove_unlikely_candidates({tag_name, attrs, inner_tree}) when is_binary(tag_name) do
  if unlikely_candidate?(tag_name, attrs) do
    nil
  else
    {tag_name, attrs, remove_unlikely_candidates(inner_tree)}
  end
end

# Comments, processing instructions and other non-element nodes carry no
# id/class information to judge, so they are kept unchanged.
def remove_unlikely_candidates(other), do: other
|
||||
# Returns true when an element looks like page chrome rather than content.
#
# The tag name and the values of the `id` and `class` attributes are
# concatenated into one string. The element is unlikely when that string
# matches :unlikelyCandidatesRe, does not match :okMaybeItsACandidateRe,
# and the tag is not "html".
defp unlikely_candidate?(tag_name, attrs) do
  # Compare the attribute name exactly (ignoring case). A substring test would
  # also pick up attributes such as `width` or `hidden`, which merely contain
  # the letters "id".
  idclass_values =
    for {name, value} <- attrs, String.downcase(name) in ["id", "class"], do: value

  str = tag_name <> Enum.join(idclass_values, "")

  str =~ @regexes[:unlikelyCandidatesRe] &&
    !(str =~ @regexes[:okMaybeItsACandidateRe]) &&
    tag_name != "html"
end
|
||||
|
||||
@doc """
Renames misused `div` elements to `p`.

Walks the whole tree. Every element that `misused_divs?/2` flags gets the tag
name `"p"`; its attributes are kept and its children are processed
recursively. Text nodes and non-element nodes (such as comments) are returned
unchanged.
"""
@spec transform_misused_divs_into_paragraphs(html_tree | binary) :: html_tree | binary

def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content

def transform_misused_divs_into_paragraphs(nodes) when is_list(nodes) do
  Enum.map(nodes, &transform_misused_divs_into_paragraphs/1)
end

def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree})
    when is_binary(tag_name) do
  # Bind the outcome of the `if` itself: a variable rebound inside the `if`
  # body is not visible after the `if` ends.
  new_tag = if misused_divs?(tag_name, inner_tree), do: "p", else: tag_name

  {new_tag, attrs, transform_misused_divs_into_paragraphs(inner_tree)}
end

# Comments and other non-element nodes have nothing to rename.
def transform_misused_divs_into_paragraphs(other), do: other
|
||||
# A "div" is misused when the HTML rendered from its children contains none of
# the opening tags matched by the :divToPElementsRe pattern.
defp misused_divs?("div", inner_tree) do
  inner_html = Floki.raw_html(inner_tree)
  not Regex.match?(@regexes[:divToPElementsRe], inner_html)
end

# Any tag other than "div" is never considered misused.
defp misused_divs?(_tag_name, _inner_tree), do: false
|
||||
|
||||
# Currently returns the tree it is given, unchanged.
# NOTE(review): looks like a placeholder for candidate scoring; confirm.
defp select_best_candidate(html_tree), do: html_tree
|
||||
|
||||
# Currently returns the tree it is given, unchanged.
# NOTE(review): looks like a placeholder for URI rewriting; confirm.
defp fix_relative_uris(html_tree), do: html_tree
|
||||
end
|
|
@ -0,0 +1,25 @@
|
|||
defmodule Readability.Helper do
  @moduledoc """
  Utilities for working with parsed HTML trees.
  """

  @type html_tree :: tuple | list

  @doc """
  Change existing tags by selector.

  Walks `html_tree` and renames every element whose tag name equals
  `selector` to `tag`. Attributes and children are kept, and children are
  processed recursively. Text nodes and nodes that are not
  `{tag_name, attrs, children}` elements (for example `{:comment, text}`)
  are returned unchanged.

  `selector` is compared with the tag name for equality; it is not a CSS
  selector.
  """
  @spec change_tag(html_tree | binary, String.t, String.t) :: html_tree | binary

  # `tag_name` appears twice in the head, so this clause only matches when the
  # element's tag equals the selector.
  def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
    {tag, attrs, change_tag(inner_tree, tag_name, tag)}
  end

  def change_tag({tag_name, attrs, inner_tree}, selector, tag) do
    {tag_name, attrs, change_tag(inner_tree, selector, tag)}
  end

  def change_tag(nodes, selector, tag) when is_list(nodes) do
    Enum.map(nodes, &change_tag(&1, selector, tag))
  end

  # Text, comments and any other leaf node: nothing to rename.
  def change_tag(other, _selector, _tag), do: other
end
|
|
@ -8,6 +8,12 @@ defmodule Readability.TitleFinder do
|
|||
|
||||
@type html_tree :: tuple | list
|
||||
|
||||
@doc """
|
||||
Find proper title
|
||||
"""
|
||||
|
||||
@spec title(html_tree) :: binary
|
||||
|
||||
def title(html_tree) do
|
||||
maybe_title = tag_title(html_tree)
|
||||
if length(String.split(maybe_title, " ")) <= 4 do
|
|
@ -0,0 +1,65 @@
|
|||
defmodule Readability.ContentFinderTest do
|
||||
use ExUnit.Case, async: true
|
||||
|
||||
doctest Readability.ContentFinder
|
||||
|
||||
|
||||
@unlikey_sample """
|
||||
<html>
|
||||
<body>
|
||||
<header>HEADER</header>
|
||||
<nav>NAV</nav>
|
||||
<article class="community">ARTICLE</article>
|
||||
<div class="disqus">SOCIAL</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
test "remove unlikely tag nodes" do
  # header, nav and the "disqus" div are expected to go. The article stays
  # even though its class is "community", because "article" is on the
  # okMaybeItsACandidateRe list.
  parsed = Readability.parse(@unlikey_sample)
  result = Readability.ContentFinder.remove_unlikely_candidates(parsed)

  article = {"article", [{"class", "community"}], ["ARTICLE"]}
  expected = {"html", [], [{"body", [], [article]}]}

  assert expected == result
end
|
||||
|
||||
@misused_sample """
|
||||
<html>
|
||||
<body>
|
||||
<div>
|
||||
<span>here</span>
|
||||
</div>
|
||||
<div>
|
||||
<p>not here</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
test "transform misused div tag" do
  # The first div only wraps a span, so it should become a p. The second div
  # already contains a p and must stay a div.
  converted = {"p", [], [{"span", [], ["here"]}]}
  untouched = {"div", [], [{"p", [], ["not here"]}]}
  expected = {"html", [], [{"body", [], [converted, untouched]}]}

  parsed = Readability.parse(@misused_sample)
  result = Readability.ContentFinder.transform_misused_divs_into_paragraphs(parsed)

  assert expected == result
end
|
||||
|
||||
|
||||
# Reads the fixture test/fixtures/<name>.html and returns its contents.
# The match is assertive on purpose: a missing fixture fails the test run.
def read_html(name) do
  fixture_path = "./test/fixtures/#{name}.html"
  {:ok, contents} = File.read(fixture_path)
  contents
end
|
||||
end
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,13 @@
|
|||
<html>
|
||||
<body>
|
||||
<code><pre>
|
||||
root
|
||||
indented
|
||||
</pre></code>
|
||||
|
||||
<pre><code>
|
||||
second
|
||||
indented
|
||||
</code></pre>
|
||||
</body>
|
||||
</html>
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,31 @@
|
|||
defmodule Readability.HelperTest do
|
||||
use ExUnit.Case, async: true
|
||||
|
||||
import Readability, only: :functions
|
||||
alias Readability.Helper
|
||||
|
||||
@sample """
|
||||
<html>
|
||||
<body>
|
||||
<p>
|
||||
<font>a</fond>
|
||||
<p>
|
||||
<font>abc</font>
|
||||
</p>
|
||||
</p>
|
||||
<p>
|
||||
<font>b</font>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
test "change font tag to span" do
  # The expectation is built by renaming the tag in the raw markup first and
  # parsing afterwards; change_tag must reach the same tree from the parsed side.
  renamed_markup = String.replace(@sample, ~r/font/, "span")
  expected = Floki.parse(renamed_markup)

  result =
    @sample
    |> parse()
    |> Helper.change_tag("font", "span")

  assert expected == result
end
|
||||
end
|
|
@ -1,7 +1,7 @@
|
|||
defmodule Readability.TitleFinderTest do
|
||||
use ExUnit.Case, async: true
|
||||
|
||||
doctest Readability
|
||||
doctest Readability.TitleFinder
|
||||
|
||||
@html """
|
||||
<html>
|
||||
|
|
Loading…
Reference in New Issue