add candidate builder

add test
This commit is contained in:
keepcosmos 2016-04-17 21:26:51 +09:00
parent 4e4a712718
commit b131d7effa
26 changed files with 4592 additions and 4661 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@
/deps
erl_crash.dump
*.ez
.credo.exs

View File

@ -1,58 +0,0 @@
defmodule Readability.Document do
  @moduledoc """
  Prototype document helpers.

  Reads a fixed fixture page from disk and exposes helpers that return its
  normalized html, its title and its content (scripts and styles removed).
  """

  @default_options [retry_length: 250,
                    min_text_length: 25,
                    remove_unlikely_candidates: true,
                    weight_classes: true,
                    clean_conditionally: true,
                    remove_empty_nodes: true,
                    min_image_width: 130,
                    min_image_height: 80,
                    ignore_image_format: [],
                    blacklist: nil,
                    whitelist: nil
                   ]

  @regexes [unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
            okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
            positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
            negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
            divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
            replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
            replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
            trimRe: ~r/^\s+|\s+$/,
            normalizeRe: ~r/\s{2,}/,
            killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
            videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
           ]

  @doc """
  Returns the `html` element of the fixture page with comments filtered out.

  Before parsing, runs of two or more `<br>` tags become `</p><p>` and
  `<font>` tags become `<span>` tags.
  """
  def html do
    page()
    |> String.replace(@regexes[:replaceBrsRe], "</p><p>")
    # "\\1" is the back-reference to the optional "/" captured by
    # replaceFontsRe; a bare "\1" inside a double-quoted string is a string
    # escape, not a back-reference, and would corrupt the generated tag.
    |> String.replace(@regexes[:replaceFontsRe], "<\\1span>")
    |> Floki.find("html")
    |> Floki.filter_out(:comment)
  end

  @doc """
  Returns the text of the page's `title` element.
  """
  def title do
    html() |> Floki.find("title") |> Floki.text()
  end

  @doc """
  Returns the page html with `script` and `style` elements filtered out.
  """
  def content do
    html()
    |> Floki.filter_out("script")
    |> Floki.filter_out("style")
  end

  @doc """
  Reads the fixture page from `test/features/nytimes.html`.

  Raises a `MatchError` when the file cannot be read.
  """
  def page do
    {:ok, contents} = File.read("test/features/nytimes.html")
    contents
  end

  @doc """
  Returns the default options keyword list.
  """
  def default_options do
    @default_options
  end

  @doc """
  Returns the keyword list of regexes used by this module.
  """
  def regexes do
    @regexes
  end
end

View File

@ -1,10 +1,86 @@
defmodule Readability do
  @moduledoc """
  Entry point for extracting the readable parts of an html page: its title
  and the tree that most likely holds the article.
  """

  alias Readability.TitleFinder
  alias Readability.ArticleBuilder

  @default_options [retry_length: 250,
                    min_text_length: 25,
                    remove_unlikely_candidates: true,
                    weight_classes: true,
                    clean_conditionally: true,
                    remove_empty_nodes: true,
                    min_image_width: 130,
                    min_image_height: 80,
                    ignore_image_format: [],
                    blacklist: nil,
                    whitelist: nil
                   ]

  @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
            ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
            positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
            negative: ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
            div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
            replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
            replace_fonts: ~r/<(\/?)font[^>]*>/i,
            normalize: ~r/\s{2,}/,
            video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i
           ]

  @type html_tree :: tuple | list
  @type options :: list

  @doc """
  Finds the title of a page.

  Accepts raw html (a binary), which is parsed first, or an already parsed
  html tree.
  """
  @spec title(binary | html_tree) :: binary
  def title(html) when is_binary(html), do: html |> parse() |> title()
  def title(html_tree), do: TitleFinder.title(html_tree)

  @doc """
  Using a variety of metrics (content score, classname, element types), find
  the content that is most likely to be the stuff a user wants to read.

  `opts` is merged over the default options, so callers only pass the keys
  they want to override. The result is the html tree built by
  `Readability.ArticleBuilder.build/2`.
  """
  @spec content(binary, options) :: html_tree
  def content(raw_html, opts \\ @default_options) do
    opts = Keyword.merge(@default_options, opts)

    raw_html
    |> parse()
    |> ArticleBuilder.build(opts)
  end

  @doc """
  Normalizes raw html and parses it into an html tree (tuple or list).

  Runs of two or more `<br>` tags become paragraph breaks, `<font>` tags
  become `<span>` tags, repeated whitespace collapses to a single space and
  comments are filtered out of the parsed tree.
  """
  @spec parse(binary) :: html_tree
  def parse(raw_html) do
    raw_html
    |> String.replace(@regexes[:replace_brs], "</p><p>")
    # "\\1" is the regex back-reference to the optional "/" captured by
    # :replace_fonts, so closing tags stay closing tags.
    |> String.replace(@regexes[:replace_fonts], "<\\1span>")
    |> String.replace(@regexes[:normalize], " ")
    |> Floki.parse()
    |> Floki.filter_out(:comment)
  end

  @doc """
  Returns the raw html binary for an html tree.
  """
  @spec raw_html(html_tree) :: binary
  def raw_html(html_tree) do
    Floki.raw_html(html_tree)
  end

  @doc """
  Returns only the text of an html tree.

  A newline is inserted before every closing `p`, `div`, `article` and
  heading tag before the text is extracted.
  """
  @spec readable_text(html_tree) :: binary
  def readable_text(html_tree) do
    tags_to_br = ~r/<\/(p|div|article|h\d)/i
    html_str = raw_html(html_tree)

    tags_to_br
    |> Regex.replace(html_str, &("\n#{&1}"))
    |> Floki.parse()
    |> Floki.text()
  end

  @doc """
  Same as `readable_text/1`; kept under its original (misspelled) name so
  existing callers keep working.
  """
  @spec readabl_text(html_tree) :: binary
  def readabl_text(html_tree), do: readable_text(html_tree)

  @doc """
  Returns the keyword list of regexes shared by the readability modules.
  """
  def regexes, do: @regexes

  @doc """
  Returns the default options keyword list.
  """
  def default_options, do: @default_options
end

View File

@ -0,0 +1,100 @@
defmodule Readability.ArticleBuilder do
  @moduledoc """
  Builds the article tree for readability.
  """

  alias Readability.Helper
  alias Readability.Sanitizer
  alias Readability.Candidate
  alias Readability.CandidateFinder
  alias Readability.Candidate.Cleaner
  alias Readability.Candidate.Scoring

  @type html_tree :: tuple | list
  @type options :: list

  @doc """
  Prepare the article node for display.
  Clean out any inline styles, iframes, forms, strip extraneous <p> tags, etc.

  When the text of the sanitized article is shorter than
  `opts[:retry_length]`, the build is retried on the original tree with the
  next option relaxed (see `next_try_opts/1`), until no option is left.
  """
  @spec build(html_tree, options) :: html_tree
  def build(html_tree, opts) do
    prepared =
      html_tree
      |> Helper.remove_tag(fn({tag, _, _}) -> tag in ["script", "style"] end)
      |> remove_unlikely(opts[:remove_unlikely_candidates])
      |> Cleaner.transform_misused_div_to_p()

    candidates = CandidateFinder.find(prepared, opts)
    article = find_article(candidates, prepared)
    sanitized = Sanitizer.sanitize(article, candidates, opts)

    if Helper.text_length(sanitized) < opts[:retry_length] do
      case next_try_opts(opts) do
        nil -> sanitized
        # Retry from the untouched input, not from the cleaned tree.
        relaxed_opts -> build(html_tree, relaxed_opts)
      end
    else
      sanitized
    end
  end

  # Returns the cleaned tree as a value: a rebinding inside `if` would not be
  # visible to the caller.
  defp remove_unlikely(html_tree, enabled) do
    if enabled, do: Cleaner.remove_unlikely_tree(html_tree), else: html_tree
  end

  # Relaxes one option per retry, in a fixed order; nil when nothing is left.
  defp next_try_opts(opts) do
    cond do
      opts[:remove_unlikely_candidates] ->
        Keyword.put(opts, :remove_unlikely_candidates, false)
      opts[:weight_classes] ->
        Keyword.put(opts, :weight_classes, false)
      opts[:clean_conditionally] ->
        Keyword.put(opts, :clean_conditionally, false)
      true -> nil
    end
  end

  # Wraps the trees selected around the best candidate in a bare div.
  defp find_article(candidates, html_tree) do
    best_candidate =
      CandidateFinder.find_best_candidate(candidates) || fallback_candidate(html_tree)

    {"div", [], find_article_trees(best_candidate, candidates)}
  end

  # Used when no candidate was found: the first body element, or an empty
  # candidate when the tree has no body at all.
  defp fallback_candidate(html_tree) do
    case Floki.find(html_tree, "body") do
      [body | _] -> %Candidate{html_tree: body}
      _ -> %Candidate{}
    end
  end

  # Keeps the candidates at the best candidate's depth that are the best
  # candidate itself, score above the threshold, or pass append?/1.
  defp find_article_trees(best_candidate, candidates) do
    score_threshold = max(10, best_candidate.score * 0.2)

    for candidate <- candidates,
        candidate.tree_depth == best_candidate.tree_depth,
        candidate == best_candidate
          || candidate.score >= score_threshold
          || append?(candidate) do
      to_article_tag(candidate.html_tree)
    end
  end

  # Paragraph heuristic: long text with few links, or short link-free text
  # containing a period followed by a space or the end of the text.
  defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
    link_density = Scoring.calc_link_density(html_tree)
    inner_text = Floki.text(html_tree)
    inner_length = String.length(inner_text)

    (inner_length > 80 && link_density < 0.25)
    || (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
  end
  defp append?(_), do: false

  # Anything that is not already a p or div is emitted as a div.
  defp to_article_tag({tag, attrs, inner_tree} = html_tree) do
    if tag in ["p", "div"] do
      html_tree
    else
      {"div", attrs, inner_tree}
    end
  end
end

View File

@ -0,0 +1,6 @@
defmodule Readability.Candidate do
  @moduledoc """
  A candidate is an html tree that may turn out to be (part of) the article.

  It carries the tree itself, the score computed for it and the depth at
  which the tree was found.
  """

  defstruct html_tree: {}, score: 0, tree_depth: 0

  # Referenced as `Candidate.t` by the specs of the modules using candidates.
  @type t :: %__MODULE__{
    html_tree: tuple | list,
    score: number,
    tree_depth: non_neg_integer
  }
end

View File

@ -0,0 +1,48 @@
defmodule Readability.Candidate.Cleaner do
  @moduledoc """
  Cleans an html tree to prepare candidates.
  It transforms misused tags and removes unlikely candidates.
  """

  alias Readability.Helper

  @type html_tree :: tuple | list

  @doc """
  Transforms misused `<div>`s, i.e. divs that do not contain other block
  elements, into `<p>`s. Text nodes are returned untouched.
  """
  @spec transform_misused_div_to_p(html_tree) :: html_tree
  def transform_misused_div_to_p(content) when is_binary(content), do: content
  def transform_misused_div_to_p([]), do: []
  def transform_misused_div_to_p([h|t]) do
    [transform_misused_div_to_p(h)|transform_misused_div_to_p(t)]
  end
  def transform_misused_div_to_p({tag, attrs, inner_tree}) do
    # The new tag must be the value of the `if`; rebinding `tag` inside it
    # would be discarded.
    new_tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
    {new_tag, attrs, transform_misused_div_to_p(inner_tree)}
  end

  @doc """
  Removes every tree whose tag name and id/class values look like an
  unlikely candidate.
  """
  @spec remove_unlikely_tree(html_tree) :: html_tree
  def remove_unlikely_tree(html_tree) do
    Helper.remove_tag(html_tree, &unlikely_tree?/1)
  end

  # A div is misused when its rendered inner html opens no block-level tag.
  defp misused_divs?("div", inner_tree) do
    !(Floki.raw_html(inner_tree) =~ Readability.regexes()[:div_to_p_elements])
  end
  defp misused_divs?(_, _), do: false

  defp unlikely_tree?({tag, attrs, _}) do
    # Only the id and class attributes take part; a substring match on the
    # attribute name would also pick up names such as "width".
    idclass_str =
      attrs
      |> Enum.filter(&(String.downcase(elem(&1, 0)) in ["id", "class"]))
      |> Enum.map_join("", &elem(&1, 1))

    str = tag <> idclass_str

    str =~ Readability.regexes()[:unlikely_candidate]
    && !(str =~ Readability.regexes()[:ok_maybe_its_a_candidate])
    && tag != "html"
  end
end

View File

@ -0,0 +1,89 @@
defmodule Readability.Candidate.Scoring do
  @moduledoc """
  Scores html trees.
  """

  alias Readability.Helper

  @element_scores %{"div" => 5,
                    "blockquote" => 3,
                    "form" => -3,
                    "th" => -5
                   }

  @type html_tree :: tuple | list
  @type options :: list

  @doc """
  Scores an html tree from its own tag and attributes, the text of its
  candidate children and grandchildren, and its link density.

  Options:

    * `:weight_classes` - when truthy, the class/id weight of the node is
      added to its score
  """
  @spec calc_score(html_tree, options) :: number
  def calc_score(html_tree, opts \\ []) do
    score =
      calc_node_score(html_tree, opts)
      + calc_children_content_score(html_tree)
      + calc_grand_children_content_score(html_tree)

    # The more of the text sits inside links, the less the tree is worth.
    score * (1 - calc_link_density(html_tree))
  end

  # One base point, one point per comma-separated piece of text, and up to
  # three points for text length (one per 100 characters).
  defp calc_content_score(html_tree) do
    inner_text = Floki.text(html_tree)
    split_score = inner_text |> String.split(",") |> length
    length_score = min(String.length(inner_text) / 100, 3)

    1 + split_score + length_score
  end

  defp calc_node_score({tag, attrs, _}, opts) do
    # Computed as a value: rebinding the score inside `if` would be lost.
    class_score = if opts[:weight_classes], do: class_weight(attrs), else: 0
    class_score + (@element_scores[tag] || 0)
  end
  defp calc_node_score([h|t], opts) do
    calc_node_score(h, opts) + calc_node_score(t, opts)
  end
  defp calc_node_score([], _), do: 0

  @doc """
  Weighs the `class` and `id` attribute values: +25 for each value matching
  the positive regex, -25 for each value matching the negative regex.
  """
  @spec class_weight(list) :: integer
  def class_weight(attrs) do
    class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1)
    id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1)

    value_weight(class) + value_weight(id)
  end

  defp value_weight(value) do
    positive = if value =~ Readability.regexes()[:positive], do: 25, else: 0
    negative = if value =~ Readability.regexes()[:negative], do: 25, else: 0

    positive - negative
  end

  @doc """
  Returns the share of the tree's text that sits inside `a` elements, or `0`
  when the tree has no text.
  """
  @spec calc_link_density(html_tree) :: number
  def calc_link_density(html_tree) do
    text_length = html_tree |> Floki.text() |> String.length()

    if text_length == 0 do
      0
    else
      link_length = html_tree |> Floki.find("a") |> Floki.text() |> String.length()
      link_length / text_length
    end
  end

  defp calc_children_content_score({_, _, children_tree}) do
    children_tree
    |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
    |> calc_content_score
  end

  # Grandchildren count for half of what direct children count.
  defp calc_grand_children_content_score({_, _, children_tree}) do
    score =
      children_tree
      |> Enum.filter(&is_tuple/1)
      |> Enum.map(&elem(&1, 2))
      |> List.flatten
      |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
      |> calc_content_score

    score / 2
  end
end

View File

@ -0,0 +1,60 @@
defmodule Readability.CandidateFinder do
  @moduledoc """
  The engine that builds and finds candidates.
  It traverses the html tree, scoring the nodes that qualify as candidates.
  """

  alias Readability.Helper
  alias Readability.Candidate
  alias Readability.Candidate.Scoring

  @type html_tree :: tuple | list
  @type options :: list

  @doc """
  Collects, in document order, a scored candidate for every node of the tree
  that qualifies, recording the depth at which each one was found.
  """
  @spec find(html_tree, options, number) :: [Candidate.t]
  def find(_, opts \\ [], tree_depth \\ 0)
  def find(text, _, _) when is_binary(text), do: []
  def find([], _, _), do: []
  def find([head | rest], opts, tree_depth) do
    find(head, opts, tree_depth) ++ find(rest, opts, tree_depth)
  end
  def find({_tag, _attrs, children} = node, opts, tree_depth) do
    if candidate?(node) do
      own = %Candidate{html_tree: node,
                       score: Scoring.calc_score(node, opts),
                       tree_depth: tree_depth}
      [own | find(children, opts, tree_depth + 1)]
    else
      find(children, opts, tree_depth + 1)
    end
  end

  @doc """
  Returns the candidate with the highest score, or `nil` for an empty list.
  """
  @spec find_best_candidate([Candidate.t]) :: Candidate.t | nil
  def find_best_candidate([]), do: nil
  def find_best_candidate(candidates) do
    Enum.max_by(candidates, &(&1.score))
  end

  # A node qualifies when it, or something nested at most two levels below
  # it, is a candidate tag.
  defp candidate?(_, depth \\ 0)
  defp candidate?(_, depth) when depth > 2, do: false
  defp candidate?([], _), do: false
  defp candidate?([head | rest], depth) do
    candidate?(head, depth) || candidate?(rest, depth)
  end
  defp candidate?(text, _) when is_binary(text), do: false
  defp candidate?({_, _, children} = node, depth) do
    Helper.candidate_tag?(node) || candidate?(children, depth + 1)
  end
end

View File

@ -1,94 +0,0 @@
defmodule Readability.ContentFinder do
  @moduledoc """
  ContentFinder uses a variety of metrics for finding the content
  that is most likely to be the stuff a user wants to read.
  Then return it wrapped up in a div.
  """

  @regexes [unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
            okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
            positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
            negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
            divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
            replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
            replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
            trimRe: ~r/^\s+|\s+$/,
            normalizeRe: ~r/\s{2,}/,
            killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
            videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
           ]

  @type html_tree :: tuple | list

  @doc """
  Prepares the candidates of an html tree: scripts, styles and unlikely
  nodes are removed and misused divs become paragraphs.

  Candidate selection and uri fixing are pass-through stubs for now, so the
  prepared tree is what gets returned. The options are not used yet.
  """
  @spec content(html_tree) :: html_tree
  @spec content(html_tree, list) :: html_tree
  def content(html_tree, _options \\ []) do
    candidate = prepare_candidates(html_tree)
    _best_candidate = select_best_candidate(candidate)

    fix_relative_uris(candidate)
  end

  defp prepare_candidates(html_tree) do
    html_tree
    |> Floki.filter_out("script")
    |> Floki.filter_out("style")
    |> remove_unlikely_candidates
    |> transform_misused_divs_into_paragraphs
  end

  @doc """
  Removes unlikely tag nodes.

  A removed top-level tuple yields `nil`; inside a list the node is dropped.
  """
  @spec remove_unlikely_candidates(html_tree) :: html_tree
  def remove_unlikely_candidates(content) when is_binary(content), do: content
  def remove_unlikely_candidates([]), do: []
  def remove_unlikely_candidates([h|t]) do
    case remove_unlikely_candidates(h) do
      nil -> remove_unlikely_candidates(t)
      html_tree -> [html_tree|remove_unlikely_candidates(t)]
    end
  end
  def remove_unlikely_candidates({tag_name, attrs, inner_tree}) do
    if unlikely_candidate?(tag_name, attrs) do
      nil
    else
      {tag_name, attrs, remove_unlikely_candidates(inner_tree)}
    end
  end

  defp unlikely_candidate?(tag_name, attrs) do
    # Only the id and class attributes take part; a substring match on the
    # attribute name would also pick up names such as "width".
    idclass_str =
      attrs
      |> Enum.filter(&(String.downcase(elem(&1, 0)) in ["id", "class"]))
      |> Enum.map_join("", &elem(&1, 1))

    str = tag_name <> idclass_str

    str =~ @regexes[:unlikelyCandidatesRe]
    && !(str =~ @regexes[:okMaybeItsACandidateRe])
    && tag_name != "html"
  end

  @doc """
  Turns divs that contain no block-level element into paragraphs.
  """
  @spec transform_misused_divs_into_paragraphs(html_tree) :: html_tree
  def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content
  def transform_misused_divs_into_paragraphs([]), do: []
  def transform_misused_divs_into_paragraphs([h|t]) do
    [transform_misused_divs_into_paragraphs(h)|transform_misused_divs_into_paragraphs(t)]
  end
  def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree}) do
    # The new tag must be the value of the `if`; rebinding `tag_name` inside
    # it would be discarded.
    new_tag = if misused_divs?(tag_name, inner_tree), do: "p", else: tag_name
    {new_tag, attrs, transform_misused_divs_into_paragraphs(inner_tree)}
  end

  defp misused_divs?("div", inner_tree) do
    !(Floki.raw_html(inner_tree) =~ @regexes[:divToPElementsRe])
  end
  defp misused_divs?(_, _), do: false

  # Stub: returns the tree unchanged.
  defp select_best_candidate(html_tree) do
    html_tree
  end

  # Stub: returns the tree unchanged.
  defp fix_relative_uris(html_tree) do
    html_tree
  end
end

View File

@ -1,25 +1,93 @@
defmodule Readability.Helper do
  @moduledoc """
  Helpers for parsing, updating, removing html tree
  """

  @type html_tree :: tuple | list

  # Tags that can become article candidates.
  @candidate_tags ["p", "td"]

  @doc """
  Change existing tags by selector.

  Every node whose tag name equals `selector` is renamed to `tag`; text nodes
  are returned untouched.
  """
  @spec change_tag(html_tree, String.t, String.t) :: html_tree
  def change_tag(content, _, _) when is_binary(content), do: content
  def change_tag([], _, _), do: []
  def change_tag([h|t], selector, tag) do
    [change_tag(h, selector, tag)|change_tag(t, selector, tag)]
  end
  # The repeated `tag_name` binding makes this clause match only nodes whose
  # tag equals the selector.
  def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
    {tag, attrs, change_tag(inner_tree, tag_name, tag)}
  end
  def change_tag({tag_name, attrs, html_tree}, selector, tag) do
    {tag_name, attrs, change_tag(html_tree, selector, tag)}
  end

  @doc """
  Remove attributes from every node of the tree.

  `target_attr` is an attribute name, a list of attribute names or a regex
  matched against attribute names. Any other target leaves the attributes
  untouched.
  """
  @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
  def remove_attrs(content, _) when is_binary(content), do: content
  def remove_attrs([], _), do: []
  def remove_attrs([h|t], t_attrs) do
    [remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)]
  end
  def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
    kept_attrs = Enum.reject(attrs, &attr_matches?(&1, target_attr))
    {tag_name, kept_attrs, remove_attrs(inner_tree, target_attr)}
  end

  # Dispatch on the kind of target through clauses: a predicate rebound
  # inside `cond` would never reach the Enum.reject/2 call.
  defp attr_matches?(attr, target) when is_binary(target), do: elem(attr, 0) == target
  defp attr_matches?(attr, %Regex{} = target), do: elem(attr, 0) =~ target
  defp attr_matches?(attr, targets) when is_list(targets), do: elem(attr, 0) in targets
  defp attr_matches?(_, _), do: false

  @doc """
  Remove tags.

  `fun` receives every `{tag, attrs, inner_tree}` node; a truthy result drops
  the node together with its subtree. A removed top-level tuple yields `nil`.
  """
  @spec remove_tag(html_tree, fun) :: html_tree
  def remove_tag(content, _) when is_binary(content), do: content
  def remove_tag([], _), do: []
  def remove_tag([h|t], fun) do
    node = remove_tag(h, fun)
    if is_nil(node) do
      remove_tag(t, fun)
    else
      [node|remove_tag(t, fun)]
    end
  end
  def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do
    if fun.(html_tree) do
      nil
    else
      {tag, attrs, remove_tag(inner_tree, fun)}
    end
  end

  @doc """
  Count only text length, ignoring leading and trailing whitespace.
  """
  @spec text_length(html_tree) :: number
  def text_length(html_tree) do
    html_tree |> Floki.text |> String.strip |> String.length
  end

  @doc """
  Check html_tree can be candidate or not.

  A candidate is a `p` or `td` node whose text is at least the default
  `:min_text_length` long.
  """
  @spec candidate_tag?(html_tree) :: boolean
  def candidate_tag?({tag, _, _} = html_tree) when tag in @candidate_tags do
    # Matching the tag name directly is equivalent to the plain type
    # selectors "p" and "td", without tokenizing selectors on every call.
    text_length(html_tree) >= Readability.default_options()[:min_text_length]
  end
  def candidate_tag?(_), do: false
end

View File

@ -0,0 +1,85 @@
defmodule Readability.Sanitizer do
  @moduledoc """
  Clean an element of all tags of type "tag" if they look fishy.
  "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
  """

  alias Readability.Helper
  alias Readability.Candidate
  alias Readability.Candidate.Scoring

  @type html_tree :: tuple | list

  @doc """
  Sanitizes the article html tree.

  Removes suspicious headlines, non-video form/object/iframe/embed tags and
  empty paragraphs; conditionally cleans tables, lists and divs when
  `opts[:clean_conditionally]` is truthy; finally strips `style` attributes.
  """
  @spec sanitize(html_tree, [Candidate.t], list) :: html_tree
  def sanitize(html_tree, candidates, opts \\ []) do
    html_tree
    |> Helper.remove_tag(&clean_headline_tag?/1)
    |> Helper.remove_tag(&clean_unlikely_tag?/1)
    |> Helper.remove_tag(&clean_empty_p?/1)
    |> clean_conditionally(candidates, opts)
    |> Helper.remove_attrs("style")
  end

  # Returns the cleaned tree as a value: a rebinding inside `if` would not be
  # visible to the caller.
  defp clean_conditionally(html_tree, candidates, opts) do
    if opts[:clean_conditionally] do
      Helper.remove_tag(html_tree, conditionally_cleaning_fn(candidates, opts))
    else
      html_tree
    end
  end

  defp conditionally_cleaning_fn(candidates, opts) do
    # The minimum text length is an option, not a regex; looking it up among
    # the regexes yields nil, and any number compares lower than nil.
    min_text_length =
      opts[:min_text_length] || Readability.default_options()[:min_text_length]

    fn({tag, attrs, _} = tree) ->
      tag in ["table", "ul", "div"]
      && fishy?(tree, tag, Scoring.class_weight(attrs), candidates, min_text_length)
    end
  end

  defp fishy?(tree, tag, weight, candidates, min_text_length) do
    same_tree = Enum.find(candidates, %Candidate{}, &(&1.html_tree == tree))

    cond do
      weight + same_tree.score < 0 ->
        true
      length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
        # If there are not very many commas, and the number of
        # non-paragraph elements is more than paragraphs or other
        # ominous signs, remove the element.
        ominous?(tree, tag == "ul", weight, min_text_length)
      true ->
        false
    end
  end

  defp ominous?(tree, list?, weight, min_text_length) do
    p_len = count(tree, "p")
    img_len = count(tree, "img")
    li_len = count(tree, "li")
    input_len = count(tree, "input")
    # Embeds pointing at a known video host do not count against the tree.
    embed_len = tree |> Floki.find("embed") |> Enum.reject(&video?/1) |> length
    link_density = Scoring.calc_link_density(tree)
    content_len = Helper.text_length(tree)

    img_len > p_len # too many images
    || (!list? && li_len > p_len) # more <li>s than <p>s
    || input_len > (p_len / 3) # less than 3x <p>s than <input>s
    || (!list? && content_len < min_text_length && img_len != 1) # too short a content length without a single image
    || (weight < 25 && link_density > 0.2) # too many links for its weight
    || (weight >= 25 && link_density > 0.5) # too many links for its weight
    || ((embed_len == 1 && content_len < 75) || embed_len > 1) # <embed>s with too short a content length, or too many <embed>s
  end

  defp count(tree, selector) do
    tree |> Floki.find(selector) |> length
  end

  # True when any attribute value of the node matches the video regex. The
  # regex is matched against the joined attribute values, never the node
  # tuple itself.
  defp video?({_, attrs, _}) do
    attrs_str = Enum.map_join(attrs, "", &elem(&1, 1))
    attrs_str =~ Readability.regexes()[:video]
  end

  defp clean_headline_tag?({tag, attrs, _} = html_tree) do
    tag =~ ~r/^h\d{1}$/
    && (Scoring.class_weight(attrs) < 0 || Scoring.calc_link_density(html_tree) > 0.33)
  end

  defp clean_unlikely_tag?({tag, _, _} = html_tree) do
    tag =~ ~r/form|object|iframe|embed/ && !video?(html_tree)
  end

  defp clean_empty_p?({tag, _, _} = html_tree) do
    tag == "p" && Helper.text_length(html_tree) == 0
  end
end

View File

@ -11,9 +11,7 @@ defmodule Readability.TitleFinder do
@doc """
Find proper title
"""
@spec title(html_tree) :: binary
def title(html_tree) do
maybe_title = tag_title(html_tree)
if length(String.split(maybe_title, " ")) <= 4 do
@ -25,42 +23,36 @@ defmodule Readability.TitleFinder do
@doc """
Find title from title tag
"""
@spec tag_title(html_tree) :: binary
def tag_title(html_tree) do
html_tree
|> Floki.find("title")
|> to_clean_text
|> clean_title
end
@doc """
Find title from og:title property of meta tag
"""
@spec og_title(html_tree) :: binary
def og_title(html_tree) do
html_tree
|> Floki.find("meta[property=og:title]")
|> Floki.attribute("content")
|> to_clean_text
|> clean_title
end
@doc """
Find title from h tag
"""
@spec h_tag_title(html_tree, String.t) :: binary
def h_tag_title(html_tree, selector \\@h_tag_selector) do
def h_tag_title(html_tree, selector \\ @h_tag_selector) do
html_tree
|> Floki.find(selector)
|> hd
|> to_clean_text
|> clean_title
end
defp to_clean_text(html_tree) do
defp clean_title(html_tree) do
title_text = html_tree
|> Floki.text
|> String.split(@title_suffix)

File diff suppressed because it is too large Load Diff

View File

@ -1,522 +0,0 @@
# encoding: utf-8
require 'rubygems'
require 'nokogiri'
require 'guess_html_encoding'
module Readability
class Document
DEFAULT_OPTIONS = {
:retry_length => 250,
:min_text_length => 25,
:remove_unlikely_candidates => true,
:weight_classes => true,
:clean_conditionally => true,
:remove_empty_nodes => true,
:min_image_width => 130,
:min_image_height => 80,
:ignore_image_format => [],
:blacklist => nil,
:whitelist => nil
}.freeze
REGEXES = {
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
:positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
:negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
:replaceFontsRe => /<(\/?)font[^>]*>/i,
:trimRe => /^\s+|\s+$/,
:normalizeRe => /\s{2,}/,
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}
attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
# Builds a document from the raw HTML +input+.
#
# +options+ is merged over DEFAULT_OPTIONS. Unless an :encoding option is
# given, the input encoding is guessed (skipped when :do_not_guess_encoding
# is set) and recorded in the options. Runs of <br> become paragraph breaks
# and <font> tags become <span> tags before the HTML is parsed.
def initialize(input, options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @input = input

  # Feature-detect encoding support instead of pattern-matching RUBY_VERSION:
  # the old /^(1\.9|2)/ check silently skipped this branch on Ruby 3 and later.
  if @input.respond_to?(:encoding) && !@options[:encoding]
    @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
    @options[:encoding] = @input.encoding.to_s
  end

  @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
  @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
  @weight_classes = @options[:weight_classes]
  @clean_conditionally = @options[:clean_conditionally]
  @best_candidate_has_image = true
  make_html
  handle_exclusions!(@options[:whitelist], @options[:blacklist])
end
# Returns the URLs of the images found in +content+ that meet the size and
# format criteria checked by image_meets_criteria?.
#
# Unless +reload+ is true, the +content+ argument is replaced by the best
# candidate's element. When no image qualifies and the content searched was
# not the whole document, the search is repeated once on @html.
#
# Raises a RuntimeError when the fastimage gem cannot be loaded.
def images(content=nil, reload=false)
# fastimage is only required when this feature is actually used.
begin
require 'fastimage'
rescue LoadError
raise "Please install fastimage in order to use the #images feature."
end
@best_candidate_has_image = false if reload
prepare_candidates
list_images = []
tested_images = []
content = @best_candidate[:elem] unless reload
return list_images if content.nil?
elements = content.css("img").map(&:attributes)
elements.each do |element|
next unless element["src"]
url = element["src"].value
# Missing height/width attributes count as 0.
height = element["height"].nil? ? 0 : element["height"].value.to_i
width = element["width"].nil? ? 0 : element["width"].value.to_i
# Only absolute http(s) URLs with an unknown dimension are measured
# through get_image_size; everything else uses the declared attributes.
if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
image = get_image_size(url)
next unless image
else
image = {:width => width, :height => height}
end
# The format is the file extension of the URL without its dot.
image[:format] = File.extname(url).gsub(".", "")
# Each URL is judged only once.
if tested_images.include?(url)
debug("Image was tested: #{url}")
next
end
tested_images.push(url)
if image_meets_criteria?(image)
list_images << url
else
debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
end
end
# Fall back to the whole document when the searched content had no
# acceptable image.
(list_images.empty? and content != @html) ? images(@html, true) : list_images
end
# Same as images_with_fqdn_uris, but operates on the document itself (@html)
# rather than on a copy of it.
def images_with_fqdn_uris!(source_uri)
  document = @html
  images_with_fqdn_uris(document, source_uri)
end
# Rewrites host-less image sources in +document+ (a copy of @html by default)
# against the scheme, host and port of +source_uri+, then returns the
# qualifying images of that document via images(document, true).
#
# Images whose src cannot be parsed as a URI are removed from the document.
def images_with_fqdn_uris(document = @html.dup, source_uri)
  uri = URI.parse(source_uri)
  base = "#{uri.scheme}://#{uri.host}:#{uri.port}/"

  document.css("img").each do |elem|
    begin
      elem['src'] = URI.join(base, elem['src']).to_s if URI.parse(elem['src']).host.nil?
    rescue URI::InvalidURIError
      elem.remove
    end
  end

  images(document, true)
end
# Measures the image at +url+ with FastImage.
#
# Returns a hash with :width and :height, or nil (after logging through
# debug) when the size cannot be determined or any error is raised.
def get_image_size(url)
  begin
    dimensions = FastImage.size(url)
    w, h = dimensions
    raise "Couldn't get size." if [w, h].any?(&:nil?)
    {:width => w, :height => h}
  rescue => e
    debug("Image error: #{e}")
    nil
  end
end
# Tells whether +image+ (a hash with :width, :height and :format) is
# acceptable: its format must not be listed in the :ignore_image_format
# option and both dimensions must reach the configured minimums (0 when a
# minimum is not configured).
def image_meets_criteria?(image)
  ignored_formats = options[:ignore_image_format]
  return false if ignored_formats.include?(image[:format].downcase)

  min_width = options[:min_image_width] || 0
  min_height = options[:min_image_height] || 0
  image[:width] >= min_width && image[:height] >= min_height
end
# Returns the text of the first <title> element, or nil when the document
# has none.
def title
  node = @html.css("title").first
  return nil unless node
  node.text
end
# Look through the @html document looking for the author.
# Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted)
# Returns nil if no author is detected.
#
# Sources are tried in this order; the first usable value wins:
#   1. <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
#   2. an element with class "fn" nested in an element with class "vcard", e.g.
#      <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
#   3. <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
#      TODO: strip out the (rel)?
#   4. the element with id "author"
def author
  @html.xpath('//meta[@name = "dc.creator"]').each do |element|
    return element['content'].strip if element['content']
  end

  text_sources = [
    '//*[contains(@class, "vcard")]//*[contains(@class, "fn")]',
    '//a[@rel = "author"]',
    '//*[@id = "author"]'
  ]
  text_sources.each do |xpath|
    @html.xpath(xpath).each do |element|
      return element.text.strip if element.text
    end
  end

  # Explicit nil, so the documented "no author" result does not depend on
  # what the last lookup happens to evaluate to.
  nil
end
# Returns the sanitized article.
#
# Passing false disables the removal of unlikely candidates for this run.
# When the text of the extracted article is shorter than the :retry_length
# option, one more restriction is switched off (unlikely-candidate removal,
# then class weighting, then conditional cleaning), the document is rebuilt
# with make_html and the extraction is run again. Once nothing is left to
# relax, the sanitized article is returned as is.
def content(remove_unlikely_candidates = :default)
@remove_unlikely_candidates = false if remove_unlikely_candidates == false
prepare_candidates
article = get_article(@candidates, @best_candidate)
cleaned_article = sanitize(article, @candidates, options)
# The length check reads the text of +article+, not of +cleaned_article+.
if article.text.strip.length < options[:retry_length]
if @remove_unlikely_candidates
@remove_unlikely_candidates = false
elsif @weight_classes
@weight_classes = false
elsif @clean_conditionally
@clean_conditionally = false
else
# nothing we can do
return cleaned_article
end
# Rebuild the document and retry with the relaxed settings.
make_html
content
else
cleaned_article
end
end
# Builds the article: a new <div> holding copies of the best candidate and of
# the siblings that look related to it.
#
# A sibling is kept when it is the best candidate itself, when its candidate
# score reaches the sibling threshold (at least 10, or 20% of the best
# score), or when it is a <p> that passes the text heuristics below.
def get_article(candidates, best_candidate)
  # Now that we have the top candidate, look through its siblings for content that might also be related.
  # Things like preambles, content split by ads that we removed, etc.
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
  output = Nokogiri::XML::Node.new('div', @html)

  best_candidate[:elem].parent.children.each do |sibling|
    append = sibling == best_candidate[:elem]
    append ||= candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

    # The paragraph heuristic may only add siblings; it must never discard a
    # sibling that already qualified above.
    if !append && sibling.name.downcase == "p"
      link_density = get_link_density(sibling)
      node_content = sibling.text
      node_length = node_content.length

      append = (node_length > 80 && link_density < 0.25) ||
               (node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/)
    end

    next unless append

    sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
    sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
    output << sibling_dup
  end

  output
end
# Picks the candidate with the highest content score, logging the top five
# and the winner through debug. Falls back to the document's <body> with a
# score of 0 when there is no candidate at all.
def select_best_candidate(candidates)
  sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

  debug("Top 5 candidates:")
  sorted_candidates.first(5).each do |candidate|
    debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
  end

  best_candidate = sorted_candidates.first
  best_candidate ||= { :elem => @html.css("body").first, :content_score => 0 }
  debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
  best_candidate
end
# Returns the share of +elem+'s text that sits inside <a> elements, as a
# Float. An element without any text has a density of 0.0 (dividing 0 by
# 0.0 would yield NaN, which makes every later comparison false).
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map(&:text).join("").length
  link_length / text_length.to_f
end
# Weighs the class and id of element +e+: -25 for each of the two values
# matching the negative regex, +25 for each matching the positive regex.
# Always 0 when class weighting is switched off.
def class_weight(e)
  return 0 unless @weight_classes

  [e[:class], e[:id]].inject(0) do |weight, value|
    if value && value != ""
      weight -= 25 if value =~ REGEXES[:negativeRe]
      weight += 25 if value =~ REGEXES[:positiveRe]
    end
    weight
  end
end
ELEMENT_SCORES = {
'div' => 5,
'blockquote' => 3,
'form' => -3,
'th' => -5
}.freeze
# Returns the initial score entry for +elem+: its class weight plus the
# per-tag bonus from ELEMENT_SCORES (0 for tags that are not listed).
def score_node(elem)
  base_score = class_weight(elem)
  tag_bonus = ELEMENT_SCORES.fetch(elem.name.downcase, 0)
  { :content_score => base_score + tag_bonus, :elem => elem }
end
# Prints +str+ when the :debug option is set.
def debug(str)
  return unless options[:debug]
  puts str
end
# Strips a content node down to whitelisted markup and returns it as an HTML string.
#
# node       - the root element to clean; it is modified in place.
# candidates - scored-candidate hash, passed through to #clean_conditionally.
# options    - accepted but not read in this body; all settings come from @options
#              (:remove_empty_nodes, :tags, :attributes).
#
# Returns the serialized node with runs of CR/LF/FF collapsed to a single "\n".
def sanitize(node, candidates, options = {})
# Drop headings that look like chrome: negative class/id weight or mostly link text.
node.css("h1, h2, h3, h4, h5, h6").each do |header|
header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
end
# Interactive and embedded elements are always removed.
node.css("form, object, iframe, embed").each do |elem|
elem.remove
end
if @options[:remove_empty_nodes]
# remove <p> tags that have no text content - this will also remove p tags that contain only images.
node.css("p").each do |elem|
elem.remove if elem.content.strip.empty?
end
end
# Conditionally clean <table>s, <ul>s, and <div>s
clean_conditionally(node, candidates, "table, ul, div")
# We'll sanitize all elements using a whitelist
base_whitelist = @options[:tags] || %w[div p]
# We'll add whitespace instead of block elements,
# so a<br>b will have a nice space between them
base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
# Use a hash for speed (don't want to make a million calls to include?)
whitelist = Hash.new
base_whitelist.each {|tag| whitelist[tag] = true }
replace_with_whitespace = Hash.new
base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
# Visit the root itself plus every descendant.
([node] + node.css("*")).each do |el|
# If element is in whitelist, delete all its attributes
# (attributes named in @options[:attributes] are kept)
if whitelist[el.node_name]
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
# Otherwise, replace the element with its contents
else
# If element is root, replace the node as a text node
# and stop: the whole result is now plain text.
if el.parent.nil?
node = Nokogiri::XML::Text.new(el.text, el.document)
break
else
# Block-level tags get a space on each side so adjacent text does not run together.
if replace_with_whitespace[el.node_name]
el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
else
el.swap(Nokogiri::XML::Text.new(el.text, el.document))
end
end
end
end
s = Nokogiri::XML::Node::SaveOptions
save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML
html = node.serialize(:save_with => save_opts)
# Get rid of duplicate whitespace
return html.gsub(/[\r\n\f]+/, "\n" )
end
# Removes elements matching selector that look like non-content.
#
# node       - root element to search under (modified in place).
# candidates - Hash of scored candidates, used to look up an element's content score.
# selector   - CSS selector for the elements to inspect.
#
# Does nothing unless @clean_conditionally is set. An element is removed when
# its class weight plus content score is negative, or, if it has fewer than
# 10 commas, when #clean_conditionally_reason? names a reason to drop it.
def clean_conditionally(node, candidates, selector)
  return unless @clean_conditionally

  node.css(selector).each do |el|
    weight = class_weight(el)
    scored = candidates[el]
    content_score = scored ? scored[:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
      next
    end

    # Comma-rich elements are assumed to be prose and are left alone.
    next unless el.text.count(",") < 10

    counts = %w[p img li a embed input].each_with_object({}) do |kind, tally|
      tally[kind] = el.css(kind).length
    end
    counts["li"] -= 100
    # For every img under a noscript tag discount one from the count to avoid double counting
    counts["img"] -= el.css("noscript").css("img").length

    # Count the text length excluding any surrounding whitespace
    content_length = el.text.strip.length
    link_density = get_link_density(el)

    reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
    next unless reason

    debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
    el.remove
  end
end
# Decides whether an element should be dropped and, if so, why.
#
# name           - lower-cased tag name of the element.
# counts         - Hash of descendant counts keyed by "p", "img", "li", "embed", "input".
# content_length - length of the element's stripped text.
# options        - Hash providing :min_text_length.
# weight         - the element's class/id weight.
# link_density   - fraction of the element's text that is link text.
#
# Returns a String describing the first rule that matched, or nil if none did.
def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
  images = counts["img"]
  paragraphs = counts["p"]

  return "too many images" if images > paragraphs && images > 1
  return "more <li>s than <p>s" if counts["li"] > paragraphs && name != "ul" && name != "ol"
  return "less than 3x <p>s than <input>s" if counts["input"] > (paragraphs / 3).to_i

  if content_length < options[:min_text_length] && images != 1
    return "too short a content length without a single image"
  end

  # Heavier-weighted elements are allowed a higher share of link text.
  link_limit = weight < 25 ? 0.2 : 0.5
  return "too many links for its weight (#{weight})" if link_density > link_limit

  embeds = counts["embed"]
  if (embeds == 1 && content_length < 75) || embeds > 1
    return "<embed>s with too short a content length, or too many <embed>s"
  end

  nil
end
private
# Applies the blacklist (elements to remove) and whitelist (elements to keep).
#
# whitelist - CSS selector; when given, <body> is replaced with the matching elements.
# blacklist - CSS selector; when given, every matching element is removed.
#
# Does nothing when both are nil/false. Otherwise re-serializes @html into
# @input after the changes.
def handle_exclusions!(whitelist, blacklist)
  return unless whitelist || blacklist

  if blacklist
    rejected = @html.css(blacklist)
    rejected.each { |e| e.remove } if rejected
  end

  if whitelist
    kept_markup = @html.css(whitelist).to_s
    body = @html.at_css('body')
    body.inner_html = kept_markup if body
  end

  @input = @html.to_s
end
# Parses @input into @html and strips all HTML comments from it.
#
# whitelist, blacklist - accepted but not read in this body.
#
# The document is parsed with the encoding from @options[:encoding].
def make_html(whitelist=nil, blacklist=nil)
  encoding = @options[:encoding]
  @html = Nokogiri::HTML(@input, nil, encoding)

  # In case document has no body, such as from empty string or redirect
  if @html.css('body').empty?
    @html = Nokogiri::HTML('<body />', nil, @options[:encoding])
  end

  # Remove html comment tags
  @html.xpath('//comment()').each { |comment| comment.remove }
end
# Cleans the parsed document and scores it, storing the results in
# @candidates and @best_candidate.
#
# Steps, in order: drop <script>/<style>, optionally drop unlikely
# candidates, convert misused <div>s to <p>s, score paragraphs, pick the best.
def prepare_candidates
  @html.css("script, style").each { |noise| noise.remove }

  remove_unlikely_candidates! if @remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  min_text_length = options[:min_text_length]
  @candidates = score_paragraphs(min_text_length)
  @best_candidate = select_best_candidate(@candidates)
end
# Removes elements that are unlikely to be content (elements that look like
# clear candidates are kept).
#
# An element is removed when its combined class+id string matches
# REGEXES[:unlikelyCandidatesRe], does not match REGEXES[:okMaybeItsACandidateRe],
# and the element is neither <html> nor <body>.
def remove_unlikely_candidates!
  @html.css("*").each do |elem|
    str = "#{elem[:class]}#{elem[:id]}"

    next unless str =~ REGEXES[:unlikelyCandidatesRe]
    next if str =~ REGEXES[:okMaybeItsACandidateRe]
    next if elem.name.downcase == 'html'
    next if elem.name.downcase == 'body'

    debug("Removing unlikely candidate - #{str}")
    elem.remove
  end
end
# Converts misused <div>s into <p>s.
#
# A <div> whose inner HTML does not match REGEXES[:divToPElementsRe] is
# renamed to <p> in place. All other elements are left untouched.
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |elem|
    # Non-div elements are skipped. Wrapping their text nodes in <p> is disabled:
    #   elem.children.each do |child|
    #     if child.text?
    #       debug("wrapping text node with a p")
    #       child.swap("<p>#{child.text}</p>")
    #     end
    #   end
    next unless elem.name.downcase == "div"

    # transform <div>s that do not contain other block elements into <p>s
    next if elem.inner_html =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
    elem.name = "p"
  end
end
# Scores the nodes that could hold the main content.
#
# min_text_length - <p>/<td> elements with less text than this are ignored.
#
# Every qualifying <p> or <td> awards points to its parent (full amount) and
# grandparent (half). Points are 1, plus the number of comma-separated
# segments, plus one per 100 characters of text capped at 3.
#
# Returns a Hash mapping each scored element to its { :content_score, :elem } record.
def score_paragraphs(min_text_length)
  candidates = {}

  @html.css("p,td").each do |elem|
    parent = elem.parent
    grand_parent = parent.respond_to?(:parent) ? parent.parent : nil
    text = elem.text

    # Paragraphs shorter than the minimum length don't count at all.
    next if text.length < min_text_length

    candidates[parent] ||= score_node(parent)
    candidates[grand_parent] ||= score_node(grand_parent) if grand_parent

    length_bonus = [(text.length / 100).to_i, 3].min
    points = 1 + text.split(',').length + length_bonus

    candidates[parent][:content_score] += points
    candidates[grand_parent][:content_score] += points / 2.0 if grand_parent
  end

  # Scale the final candidates score based on link density. Good content should have a
  # relatively small link density (5% or less) and be mostly unaffected by this operation.
  candidates.each do |elem, candidate|
    density = get_link_density(elem)
    candidate[:content_score] = candidate[:content_score] * (1 - density)
  end

  candidates
end
end
end

12
mix.exs
View File

@ -1,4 +1,7 @@
defmodule Readability.Mixfile do
@moduledoc """
"""
use Mix.Project
def project do
@ -15,7 +18,8 @@ defmodule Readability.Mixfile do
# Type "mix help compile.app" for more information
def application do
[applications: [:logger,
:floki
:floki,
:httpoison
]]
end
@ -29,6 +33,10 @@ defmodule Readability.Mixfile do
#
# Type "mix help deps" for more examples and options
defp deps do
[{:floki, "~> 0.8.0"}]
[{:floki, "~> 0.8.0"},
{:httpoison, "~> 0.8.0"},
{:credo, "~> 0.3", only: [:dev, :test]},
{:dialyxir, "~> 0.3", only: [:dev]}
]
end
end

View File

@ -1,2 +1,12 @@
%{"floki": {:hex, :floki, "0.8.0"},
"mochiweb_html": {:hex, :mochiweb_html, "2.13.0"}}
%{"bunt": {:hex, :bunt, "0.1.5"},
"certifi": {:hex, :certifi, "0.4.0"},
"credo": {:hex, :credo, "0.3.12"},
"dialyxir": {:hex, :dialyxir, "0.3.3"},
"floki": {:hex, :floki, "0.8.0"},
"hackney": {:hex, :hackney, "1.6.0"},
"httpoison": {:hex, :httpoison, "0.8.3"},
"idna": {:hex, :idna, "1.2.0"},
"metrics": {:hex, :metrics, "1.0.1"},
"mimerl": {:hex, :mimerl, "1.0.2"},
"mochiweb_html": {:hex, :mochiweb_html, "2.13.0"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.0"}}

1
test.html Normal file

File diff suppressed because one or more lines are too long

4539
test/fixtures/bbc.html vendored

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,31 +0,0 @@
defmodule Readability.HelperTest do
use ExUnit.Case, async: true
import Readability, only: :functions
alias Readability.Helper
@sample """
<html>
<body>
<p>
<font>a</fond>
<p>
<font>abc</font>
</p>
</p>
<p>
<font>b</font>
</p>
</body>
</html>
"""
test "change font tag to span" do
expectred = @sample
|> String.replace(~r/font/, "span")
|> Floki.parse