97 lines
3.2 KiB
Elixir
97 lines
3.2 KiB
Elixir
defmodule Readability.Sanitizer do
  @moduledoc """
  Clean an element of all tags of type "tag" if they look fishy.

  "Fishy" is an algorithm based on content length, classnames, link density,
  number of images & embeds, etc.
  """

  alias Readability.Helper
  alias Readability.Candidate
  alias Readability.Candidate.Scoring

  @type html_tree :: tuple | list

  # Only these container tags go through the conditional "fishy" check.
  @conditionally_cleaned_tags ["table", "ul", "div"]

  @doc """
  Sanitizes the article HTML tree.

  Three passes of `Helper.remove_tag/2` always run, dropping:

    * headline tags (`h` followed by one digit) whose class weight is negative
      or whose link density is above 0.33;
    * tags whose name matches `form|object|iframe|embed`, unless their
      attribute values match the video regex;
    * `p` tags whose text length is 0.

  When `opts[:clean_conditionally]` is truthy, a fourth pass removes `table`,
  `ul` and `div` elements that look like non-content (see the heuristics in
  the private helpers below). `candidates` is only used by that pass, to look
  up the score of a node that is itself a candidate.

  Returns the sanitized tree.
  """
  @spec sanitize(html_tree, [Candidate.t()], list) :: html_tree
  def sanitize(html_tree, candidates, opts \\ []) do
    html_tree =
      html_tree
      |> Helper.remove_tag(&clean_headline_tag?/1)
      |> Helper.remove_tag(&clean_unlikely_tag?/1)
      |> Helper.remove_tag(&clean_empty_p?/1)

    if opts[:clean_conditionally] do
      Helper.remove_tag(html_tree, conditionally_cleaning_fn(candidates))
    else
      html_tree
    end
  end

  # Builds the predicate handed to `Helper.remove_tag/2` for the conditional
  # pass. The predicate returns `true` when the node should be removed and
  # `false` otherwise (including for every tag outside table/ul/div).
  defp conditionally_cleaning_fn(candidates) do
    fn {tag, _attrs, _children} = tree ->
      tag in @conditionally_cleaned_tags and conditionally_clean?(tree, candidates)
    end
  end

  # Decides whether a table/ul/div node should be removed.
  defp conditionally_clean?({tag, attrs, _children} = tree, candidates) do
    weight = Scoring.class_weight(attrs)

    # A node that is not itself a candidate falls back to the default
    # `%Candidate{}` and therefore contributes that struct's default score.
    candidate = Enum.find(candidates, %Candidate{}, &(&1.html_tree == tree))

    cond do
      weight + candidate.score < 0 ->
        true

      # If there are not very many commas, and the number of non-paragraph
      # elements is more than paragraphs or other ominous signs, remove the
      # element.
      comma_count(tree) < 10 ->
        fishy?(tree, tag, weight)

      true ->
        false
    end
  end

  # Returns true when any of the "ominous sign" heuristics fires for `tree`.
  defp fishy?(tree, tag, weight) do
    list? = tag == "ul"

    p_len = count_tags(tree, "p")
    img_len = count_tags(tree, "img")
    li_len = count_tags(tree, "li")
    input_len = count_tags(tree, "input")

    # Embeds that look like videos are not held against the element.
    embed_len =
      tree
      |> Floki.find("embed")
      |> Enum.count(&(not video_embed?(&1)))

    link_density = Scoring.calc_link_density(tree)
    content_len = Helper.text_length(tree)

    cond do
      # too many images
      img_len > p_len ->
        true

      # more <li>s than <p>s (not checked when the element itself is a <ul>)
      not list? and li_len > p_len ->
        true

      # less than 3x <p>s than <input>s
      input_len > p_len / 3 ->
        true

      # too short a content length without a single image
      # NOTE(review): the threshold is read through `Readability.regexes/1`,
      # the same accessor used for the :video regex. Confirm it really returns
      # a number for :min_text_length; a number compared with a non-number
      # term is always smaller (Erlang term ordering), which would make this
      # check depend only on `list?` and `img_len`.
      not list? and content_len < Readability.regexes(:min_text_length) and img_len != 1 ->
        true

      # too many links for its weight
      weight < 25 and link_density > 0.2 ->
        true

      weight >= 25 and link_density > 0.5 ->
        true

      # a single <embed> with too short a content length
      embed_len == 1 and content_len < 75 ->
        true

      # too many <embed>s
      embed_len > 1 ->
        true

      true ->
        false
    end
  end

  # Number of commas in the text of `tree`.
  defp comma_count(tree) do
    length(Regex.scan(~r/\,/, Floki.text(tree)))
  end

  # Number of nodes matching `selector` inside `tree`.
  defp count_tags(tree, selector) do
    tree |> Floki.find(selector) |> length()
  end

  # True when the node's attribute values match the video regex.
  # `Floki.find/2` yields `{tag, attrs, children}` tuples, so the regex has to
  # be applied to the attribute text rather than to the node itself.
  defp video_embed?({_tag, attrs, _children}) do
    attrs_text(attrs) =~ Readability.regexes(:video)
  end

  # Concatenates all attribute values of a node into one string.
  defp attrs_text(attrs), do: Enum.map_join(attrs, "", &elem(&1, 1))

  # Headline tags (h + one digit) are removed when their class weight is
  # negative or when they consist mostly of links.
  defp clean_headline_tag?({tag, attrs, _} = html_tree) do
    tag =~ ~r/^h\d{1}$/ &&
      (Scoring.class_weight(attrs) < 0 || Scoring.calc_link_density(html_tree) > 0.33)
  end

  # form/object/iframe/embed-like tags are removed unless their attribute
  # values match the video regex.
  defp clean_unlikely_tag?({tag, attrs, _}) do
    tag =~ ~r/form|object|iframe|embed/ && !(attrs_text(attrs) =~ Readability.regexes(:video))
  end

  # Paragraphs without any text are removed.
  defp clean_empty_p?({tag, _, _} = html_tree) do
    tag == "p" && Helper.text_length(html_tree) == 0
  end
end
|