defmodule Readability.Sanitizer do
  @moduledoc """
  Clean an element of all tags of type "tag" if they look fishy.

  "Fishy" is an algorithm based on content length, classnames,
  link density, number of images & embeds, etc.
  """

  alias Readability.Candidate
  alias Readability.Candidate.Scoring
  alias Readability.Helper

  @type html_tree :: tuple | list

  # Tags that are candidates for conditional cleaning.
  @conditional_tags ~w(table ul div)

  # Tags removed outright unless their attributes look like a video embed.
  @unlikely_tags ~w(form object iframe embed)

  # Elements with at least this many commas are kept without further checks.
  @min_comma_count 10

  # Class weight at which the more lenient link-density limit applies.
  @heavy_weight 25
  @light_link_density 0.2
  @heavy_link_density 0.5

  # A lone non-video embed is tolerated only with at least this much text.
  @min_embed_content_length 75

  # Headlines whose link density exceeds this value are removed.
  @max_headline_link_density 0.33

  @doc """
  Sanitizes article html tree.

  Passes `html_tree` through `Readability.Helper.remove_tag/2` three times, with
  the headline, unlikely-tag and empty-paragraph predicates defined in this
  module. When `opts[:clean_conditionally]` is truthy, a fourth pass is run with
  the conditional-cleaning predicate built from `candidates`.
  """
  @spec sanitize(html_tree, [Candidate.t()], list) :: html_tree
  def sanitize(html_tree, candidates, opts \\ []) do
    cleaned =
      html_tree
      |> Helper.remove_tag(&clean_headline_tag?/1)
      |> Helper.remove_tag(&clean_unlikely_tag?/1)
      |> Helper.remove_tag(&clean_empty_p?/1)

    if opts[:clean_conditionally] do
      Helper.remove_tag(cleaned, conditionally_cleaning_fn(candidates))
    else
      cleaned
    end
  end

  # Builds the predicate used for the conditional-cleaning pass. The predicate
  # only considers `table`, `ul` and `div` nodes and always returns a boolean.
  defp conditionally_cleaning_fn(candidates) do
    fn {tag, attrs, _children} = tree ->
      if tag in @conditional_tags do
        weight = Scoring.class_weight(attrs)

        # Falls back to an empty candidate when this node was never scored.
        candidate = Enum.find(candidates, %Candidate{}, &(&1.html_tree == tree))

        cond do
          weight + candidate.score < 0 -> true
          comma_count(tree) < @min_comma_count -> fishy?(tree, weight, tag == "ul")
          true -> false
        end
      else
        false
      end
    end
  end

  # Number of commas in the text of `tree`.
  defp comma_count(tree) do
    length(Regex.scan(~r/\,/, Floki.text(tree)))
  end

  # If there are not very many commas, and the number of non-paragraph
  # elements is more than paragraphs or other ominous signs, remove the element.
  defp fishy?(tree, weight, list?) do
    p_len = count_tags(tree, "p")
    img_len = count_tags(tree, "img")
    li_len = count_tags(tree, "li")
    input_len = count_tags(tree, "input")

    # Floki.find/2 yields node tuples, so the video check has to look at each
    # node's attributes; applying `=~` to the tuple itself raises.
    embed_len = tree |> Floki.find("embed") |> Enum.count(&non_video_embed?/1)

    link_density = Scoring.calc_link_density(tree)
    content_len = Helper.text_length(tree)

    # In order:
    #   * too many images
    #   * more <li>s than <p>s (lists excepted)
    #   * less than 3x <p>s than <input>s
    #   * too short a content length without a single image (lists excepted)
    #   * too many links for a low weight
    #   * too many links for a high weight
    #   * one <embed> with too short a content length, or too many <embed>s
    #
    # NOTE(review): `:min_text_length` looks like an options key rather than a
    # regex key. If `Readability.regexes/1` returns nil for it, the comparison
    # below is always true under Erlang term ordering — confirm against
    # `Readability`.
    img_len > p_len ||
      (!list? && li_len > p_len) ||
      input_len > p_len / 3 ||
      (!list? && content_len < Readability.regexes(:min_text_length) && img_len != 1) ||
      (weight < @heavy_weight && link_density > @light_link_density) ||
      (weight >= @heavy_weight && link_density > @heavy_link_density) ||
      (embed_len == 1 && content_len < @min_embed_content_length) ||
      embed_len > 1
  end

  # Number of nodes under `tree` matching `selector`.
  defp count_tags(tree, selector) do
    tree |> Floki.find(selector) |> length()
  end

  # True for an embed node whose attributes do not match the video regex.
  defp non_video_embed?({_tag, attrs, _children}), do: not video_attrs?(attrs)

  # True when the concatenated attribute values match the video regex.
  defp video_attrs?(attrs) do
    attrs_str = Enum.map_join(attrs, "", &elem(&1, 1))
    attrs_str =~ Readability.regexes(:video)
  end

  # Headline tags (`h` followed by one digit) are removed when their class
  # weight is negative or they consist mostly of links.
  defp clean_headline_tag?({tag, attrs, _children} = html_tree) do
    tag =~ ~r/^h\d{1}$/ &&
      (Scoring.class_weight(attrs) < 0 ||
         Scoring.calc_link_density(html_tree) > @max_headline_link_density)
  end

  # Forms, objects, iframes and embeds are removed unless they look like a
  # video. Tags are compared exactly so that unrelated tags which merely
  # contain one of these names (e.g. "information") are left alone.
  defp clean_unlikely_tag?({tag, attrs, _children}) do
    tag in @unlikely_tags && !video_attrs?(attrs)
  end

  # Paragraphs without any text are removed.
  defp clean_empty_p?({tag, _attrs, _children} = html_tree) do
    tag == "p" && Helper.text_length(html_tree) == 0
  end
end