readability/lib/readability/candidate/cleaner.ex

53 lines
1.5 KiB
Elixir

defmodule Readability.Candidate.Cleaner do
  @moduledoc """
  Cleans an HTML tree in preparation for candidate selection.

  It rewrites misused `<div>` tags as `<p>` tags and removes subtrees that are
  unlikely to be candidates.
  """

  alias Readability.Helper

  @type html_tree :: tuple | list

  @doc """
  Transforms misused `<div>`s into `<p>`s.

  A `<div>` counts as misused when the raw HTML of its children does not match
  `Readability.regexes(:div_to_p_elements)`.

  The tree is walked recursively:

    * binaries are returned unchanged;
    * lists are transformed element by element;
    * `{tag, attrs, children}` tuples keep their attributes, get their tag
      replaced by `"p"` when misused, and have their children transformed.
  """
  @spec transform_misused_div_to_p(html_tree) :: html_tree
  def transform_misused_div_to_p(content) when is_binary(content), do: content
  def transform_misused_div_to_p([]), do: []

  def transform_misused_div_to_p([h | t]) do
    [transform_misused_div_to_p(h) | transform_misused_div_to_p(t)]
  end

  def transform_misused_div_to_p({tag, attrs, inner_tree}) do
    # The check runs on the untransformed children, before recursing into them.
    tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
    {tag, attrs, transform_misused_div_to_p(inner_tree)}
  end

  @doc """
  Removes unlikely candidates from the HTML tree.

  Delegates to `Readability.Helper.remove_tag/2` with a predicate that flags a
  `{tag, attrs, children}` node when its tag name concatenated with its
  id/class attribute values matches `Readability.regexes(:unlikely_candidate)`,
  does not match `Readability.regexes(:ok_maybe_its_a_candidate)`, and the tag
  is not `"html"`.
  """
  @spec remove_unlikely_tree(html_tree) :: html_tree
  def remove_unlikely_tree(html_tree) do
    Helper.remove_tag(html_tree, &unlikely_tree?(&1))
  end

  # Only "div" tags can be misused; one qualifies when the raw HTML of its
  # children does not match the :div_to_p_elements regex.
  defp misused_divs?("div", inner_tree) do
    !(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements))
  end

  defp misused_divs?(_, _), do: false

  # Decides whether a node should be dropped, based on its tag name and the
  # values of its id/class-like attributes.
  defp unlikely_tree?({tag, attrs, _}) do
    # `Enum.filter_map/3` is deprecated; `Enum.filter/2` followed by
    # `Enum.map_join/3` produces the same string without the warning.
    #
    # NOTE(review): the attribute-name regex is unanchored, so it also matches
    # names that merely contain "id" or "class" (e.g. "width"). Kept as-is to
    # preserve behaviour; confirm whether an anchored match was intended.
    idclass_str =
      attrs
      |> Enum.filter(&(elem(&1, 0) =~ ~r/id|class/i))
      |> Enum.map_join("", &elem(&1, 1))

    str = tag <> idclass_str

    str =~ Readability.regexes(:unlikely_candidate) &&
      !(str =~ Readability.regexes(:ok_maybe_its_a_candidate)) && tag != "html"
  end
end