readability/lib/readability/sanitizer.ex

97 lines
3.2 KiB
Elixir

defmodule Readability.Sanitizer do
@moduledoc """
Cleans an article HTML tree by removing elements that look fishy.
"Fishy" is a heuristic based on content length, class names, link density, the number of images and embeds, etc.
"""
alias Readability.Helper
alias Readability.Candidate
alias Readability.Candidate.Scoring
@type html_tree :: tuple | list
@doc """
Sanitizes an article HTML tree.

The tree is passed through `Helper.remove_tag/2` three times, once for each of
the headline, unlikely-tag and empty-paragraph predicates, in that order.

When `opts[:clean_conditionally]` is truthy, one more `Helper.remove_tag/2`
pass is run with the predicate built from `candidates`; otherwise the tree is
returned as it stands after the first three passes.
"""
@spec sanitize(html_tree, [Candidate.t()], list) :: html_tree
def sanitize(html_tree, candidates, opts \\ []) do
  unconditional_checks = [
    &clean_headline_tag?/1,
    &clean_unlikely_tag?/1,
    &clean_empty_p?/1
  ]

  cleaned =
    Enum.reduce(unconditional_checks, html_tree, fn check, acc ->
      Helper.remove_tag(acc, check)
    end)

  if opts[:clean_conditionally] do
    Helper.remove_tag(cleaned, conditionally_cleaing_fn(candidates))
  else
    cleaned
  end
end
# Builds the predicate that `sanitize/3` hands to `Helper.remove_tag/2` when
# conditional cleaning is enabled.
#
# The returned function takes a `{tag, attrs, children}` node. For "table",
# "ul" and "div" nodes it returns `true`/`false` depending on whether the node
# looks fishy; for every other tag it returns `nil`.
#
# `candidates` is searched for an entry whose `html_tree` equals the node, so
# that its score can be added to the node's class weight. A default
# `%Candidate{}` is used when no entry matches.
defp conditionally_cleaing_fn(candidates) do
  fn {tag, attrs, _} = tree ->
    if tag in ["table", "ul", "div"] do
      weight = Scoring.class_weight(attrs)
      same_tree = Enum.find(candidates, %Candidate{}, &(&1.html_tree == tree))
      list? = tag == "ul"

      cond do
        weight + same_tree.score < 0 ->
          true

        length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
          # If there are not very many commas, and the number of
          # non-paragraph elements is more than paragraphs or other
          # ominous signs, remove the element.
          p_len = tree |> Floki.find("p") |> length()
          img_len = tree |> Floki.find("img") |> length()
          li_len = tree |> Floki.find("li") |> length()
          input_len = tree |> Floki.find("input") |> length()

          # Count only the <embed>s that are not videos. The video regex is
          # matched against the joined attribute values (the same approach as
          # clean_unlikely_tag?/1): `=~` needs a binary on its left-hand side,
          # so matching the node tuple itself would raise.
          embed_len =
            tree
            |> Floki.find("embed")
            |> Enum.count(fn {_embed_tag, embed_attrs, _embed_children} ->
              embed_values = Enum.map_join(embed_attrs, "", &elem(&1, 1))
              !(embed_values =~ Readability.regexes(:video))
            end)

          link_density = Scoring.calc_link_density(tree)
          content_len = Helper.text_length(tree)

          # NOTE(review): the content-length check below compares against
          # Readability.regexes(:min_text_length); presumably this should be a
          # numeric option rather than a regex lookup. Confirm against the
          # Readability module before changing it.

          # too many images
          img_len > p_len ||
            # more <li>s than <p>s (ignored for lists)
            (!list? && li_len > p_len) ||
            # more <input>s than a third of the <p>s
            input_len > p_len / 3 ||
            # too short a content length without a single image
            (!list? && content_len < Readability.regexes(:min_text_length) && img_len != 1) ||
            # too many links for a low class weight
            (weight < 25 && link_density > 0.2) ||
            # too many links even for a high class weight
            (weight >= 25 && link_density > 0.5) ||
            # one <embed> with too short a content length, or too many <embed>s
            ((embed_len == 1 && content_len < 75) || embed_len > 1)

        true ->
          false
      end
    end
  end
end
# Decides whether a headline node (h0-h9 by tag name) should be removed.
#
# Non-headline tags always yield `false`. A headline yields `true` when its
# class weight is negative or its link density exceeds 0.33.
defp clean_headline_tag?({tag, attrs, _} = html_tree) do
  if tag =~ ~r/^h\d{1}$/ do
    negative_weight? = Scoring.class_weight(attrs) < 0
    negative_weight? || Scoring.calc_link_density(html_tree) > 0.33
  else
    false
  end
end
# Flags <form>, <object>, <iframe> and <embed> nodes for removal unless their
# attribute values match `Readability.regexes(:video)`.
#
# The tag name is compared exactly. A substring match would also flag
# unrelated tags whose names merely contain one of these words (for example
# "platform" or "information").
defp clean_unlikely_tag?({tag, attrs, _}) do
  tag in ["form", "object", "iframe", "embed"] &&
    !(Enum.map_join(attrs, "", &elem(&1, 1)) =~ Readability.regexes(:video))
end
# Flags a <p> node for removal when `Helper.text_length/1` reports 0 for it.
# Any other three-element node is never flagged.
defp clean_empty_p?({"p", _, _} = paragraph), do: Helper.text_length(paragraph) == 0
defp clean_empty_p?({_, _, _}), do: false
end