101 lines
2.9 KiB
Elixir
101 lines
2.9 KiB
Elixir
defmodule Readability.ArticleBuilder do
  @moduledoc """
  Builds the readable article tree out of a parsed HTML tree.

  The pipeline removes `<script>`/`<style>` nodes, optionally removes unlikely
  candidates, collects and ranks candidates, wraps the best candidate (plus
  other qualifying candidates found at the same tree depth) in a `<div>` and
  hands the result to `Readability.Sanitizer`.
  """

  alias Readability.Candidate
  alias Readability.Candidate.Cleaner
  alias Readability.Candidate.Scoring
  alias Readability.CandidateFinder
  alias Readability.Helper
  alias Readability.Sanitizer

  @type html_tree :: tuple | list
  @type options :: list

  @doc """
  Prepare the article node for display.

  Clean out any inline styles, iframes, forms, strip extraneous `<p>` tags, etc.

  ## Options read by this function

    * `:remove_unlikely_candidates` - when truthy, the tree is passed through
      `Cleaner.remove_unlikely_tree/1` before candidates are searched.
    * `:retry_length` - minimum text length of the sanitized article. When the
      result is shorter, the build is retried from the original tree with one
      more flag switched off, in this order: `:remove_unlikely_candidates`,
      `:weight_classes`, `:clean_conditionally`. Once none of them is truthy
      any more, the short result is returned as-is.

  The whole option list is also forwarded to `CandidateFinder.find/2` and
  `Sanitizer.sanitize/3`.
  """
  @spec build(html_tree, options) :: html_tree
  def build(html_tree, opts) do
    # Kept untouched so that a retry starts from the unmodified input.
    origin_tree = html_tree

    html_tree =
      Helper.remove_tag(html_tree, fn {tag, _, _} ->
        Enum.member?(["script", "style"], tag)
      end)

    # Bindings made inside an `if` body do not escape it, so the result of the
    # conditional has to be bound explicitly for the cleaning to take effect.
    html_tree =
      if opts[:remove_unlikely_candidates] do
        Cleaner.remove_unlikely_tree(html_tree)
      else
        html_tree
      end

    html_tree = Cleaner.transform_misused_div_to_p(html_tree)

    candidates = CandidateFinder.find(html_tree, opts)
    article = find_article(candidates, html_tree)
    sanitized = Sanitizer.sanitize(article, candidates, opts)

    if Helper.text_length(sanitized) < opts[:retry_length] do
      case next_try_opts(opts) do
        # Every flag is already off: accept the short result.
        nil -> sanitized
        relaxed_opts -> build(origin_tree, relaxed_opts)
      end
    else
      sanitized
    end
  end

  # Returns `opts` with the first still-truthy flag switched off, or `nil`
  # when `:remove_unlikely_candidates`, `:weight_classes` and
  # `:clean_conditionally` are all falsy already.
  defp next_try_opts(opts) do
    cond do
      opts[:remove_unlikely_candidates] ->
        Keyword.put(opts, :remove_unlikely_candidates, false)

      opts[:weight_classes] ->
        Keyword.put(opts, :weight_classes, false)

      opts[:clean_conditionally] ->
        Keyword.put(opts, :clean_conditionally, false)

      true ->
        nil
    end
  end

  # Wraps the article trees selected around the best candidate in a plain
  # `<div>`. When the finder yields no best candidate, a fallback candidate is
  # built instead.
  defp find_article(candidates, html_tree) do
    best_candidate =
      CandidateFinder.find_best_candidate(candidates) || fallback_candidate(html_tree)

    {"div", [], find_article_trees(best_candidate, candidates)}
  end

  # Builds a candidate from the first `<body>` node, or from the whole tree
  # when the document has no `<body>` at all. Only `:html_tree` is set; the
  # remaining fields keep the `Candidate` struct defaults.
  # NOTE(review): `find_article_trees/2` reads `.score` and `.tree_depth` from
  # this struct, so those defaults are presumably numeric - confirm in
  # `Readability.Candidate`.
  defp fallback_candidate(html_tree) do
    case Floki.find(html_tree, "body") do
      [body | _] -> %Candidate{html_tree: body}
      _ -> %Candidate{html_tree: html_tree}
    end
  end

  # Selects, among the candidates at the same tree depth as the best one,
  # those that are the best candidate itself, score at least
  # `max(10, 20% of the best score)`, or pass `append?/1`; each selected tree
  # is normalized with `to_article_tag/1`.
  defp find_article_trees(best_candidate, candidates) do
    score_threshold = max(10, best_candidate.score * 0.2)

    candidates
    |> Enum.filter(&(&1.tree_depth == best_candidate.tree_depth))
    |> Enum.filter(fn candidate ->
      candidate == best_candidate ||
        candidate.score >= score_threshold ||
        append?(candidate)
    end)
    |> Enum.map(&to_article_tag(&1.html_tree))
  end

  # Decides whether a low-scoring `<p>` candidate should still be kept:
  # either it is long (> 80 characters) with a link density below 0.25, or it
  # is short (< 80 characters), has no links and contains a period followed
  # by a space or the end of a line. A length of exactly 80 satisfies neither
  # branch. Anything that is not a `<p>` tuple is rejected.
  defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
    link_density = Scoring.calc_link_density(html_tree)
    inner_text = Floki.text(html_tree)
    inner_length = String.length(inner_text)

    (inner_length > 80 && link_density < 0.25) ||
      (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
  end

  defp append?(_), do: false

  # Leaves `<p>` and `<div>` nodes untouched; any other tag is turned into a
  # `<div>` that keeps the original attributes and children.
  defp to_article_tag({tag, _attrs, _inner_tree} = html_tree) when tag in ["p", "div"] do
    html_tree
  end

  defp to_article_tag({_tag, attrs, inner_tree}), do: {"div", attrs, inner_tree}
end
|