defmodule Readability.Candidate.Cleaner do
  @moduledoc """
  Cleans an HTML tree in preparation for candidate selection.

  It transforms misused tags and removes unlikely candidates.
  """

  alias Readability.Helper

  @type html_tree :: tuple | list

  @doc """
  Transforms misused `<div>`s that do not contain other block elements into `<p>`s.

  The tree is walked recursively: text nodes (binaries) and empty lists are
  returned unchanged, lists are transformed element by element, and for each
  `{tag, attrs, children}` node the tag is rewritten to `"p"` when it is a
  `"div"` whose rendered children do not match
  `Readability.regexes(:div_to_p_elements)`. Attributes are kept as they are.
  """
  @spec transform_misused_div_to_p(html_tree) :: html_tree
  def transform_misused_div_to_p(content) when is_binary(content), do: content
  def transform_misused_div_to_p([]), do: []

  def transform_misused_div_to_p([h | t]) do
    [transform_misused_div_to_p(h) | transform_misused_div_to_p(t)]
  end

  def transform_misused_div_to_p({tag, attrs, inner_tree}) do
    # The decision is made on the original (untransformed) children.
    tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
    {tag, attrs, transform_misused_div_to_p(inner_tree)}
  end

  @doc """
  Removes unlikely candidates from the HTML tree.

  Delegates to `Readability.Helper.remove_tag/2`, passing a predicate that
  flags a node when its tag name plus id/class attribute values match
  `Readability.regexes(:unlikely_candidate)` but not
  `Readability.regexes(:ok_maybe_its_a_candidate)`. A node whose tag is
  `"html"` is never flagged.
  """
  @spec remove_unlikely_tree(html_tree) :: html_tree
  def remove_unlikely_tree(html_tree) do
    Helper.remove_tag(html_tree, &unlikely_tree?/1)
  end

  # A "div" is misused when the raw HTML of its children does not match the
  # :div_to_p_elements regex. Any other tag is never considered misused.
  defp misused_divs?("div", inner_tree) do
    !(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements))
  end

  defp misused_divs?(_, _), do: false

  # Builds "<tag><matching attribute values...>" and tests it against the
  # candidate regexes.
  defp unlikely_tree?({tag, attrs, _}) do
    # Enum.filter_map/3 is deprecated; filter + map_join yields the same string.
    # NOTE(review): the pattern is unanchored, so any attribute whose name merely
    # contains "id" or "class" (e.g. "width") is included too. Left as-is to
    # preserve existing behaviour; confirm whether that is intended.
    idclass_str =
      attrs
      |> Enum.filter(fn attr -> elem(attr, 0) =~ ~r/id|class/i end)
      |> Enum.map_join("", fn attr -> elem(attr, 1) end)

    str = tag <> idclass_str

    str =~ Readability.regexes(:unlikely_candidate) &&
      !(str =~ Readability.regexes(:ok_maybe_its_a_candidate)) &&
      tag != "html"
  end
end