defmodule Readability.Sanitizer do
  @moduledoc """
  Cleans an element of all tags of type "tag" if they look fishy.

  "Fishy" is decided by an algorithm based on content length, class names,
  link density, number of images and embeds, etc.
  """

  alias Readability.Helper
  alias Readability.Candidate
  alias Readability.Candidate.Scoring

  @type html_tree :: tuple | list

  @doc """
  Sanitizes an article HTML tree.

  The tree is passed through `Helper.remove_tag/2` three times, once for each
  of the headline, unlikely-tag and empty-paragraph predicates, in that order.

  When `opts[:clean_conditionally]` is truthy, one more `Helper.remove_tag/2`
  pass runs with a function built from `candidates`; otherwise the result of
  the first three passes is returned unchanged.
  """
  @spec sanitize(html_tree, [Candidate.t()], list) :: html_tree
  def sanitize(html_tree, candidates, opts \\ []) do
    cleaned =
      html_tree
      |> Helper.remove_tag(&clean_headline_tag?/1)
      |> Helper.remove_tag(&clean_unlikely_tag?/1)
      |> Helper.remove_tag(&clean_empty_p?/1)

    # The candidate-based pass is opt-in.
    if opts[:clean_conditionally] do
      Helper.remove_tag(cleaned, conditionally_cleaing_fn(candidates))
    else
      cleaned
    end
  end

  defp conditionally_cleaing_fn(candidates) do
    fn {tag, attrs, _} = tree ->
      if Enum.any?(["table", "ul", "div"], &(&1 == tag)) do
        weight = Scoring.class_weight(attrs)

        same_tree =
          candidates
          |> Enum.find(%Candidate{}, &(&1.html_tree == tree))

        list? = tag == "ul"

        cond do
          weight + same_tree.score < 0 ->
            true

          length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
            # If there are not very many commas, and the number of
            # non-paragraph elements is more than paragraphs or other
            # ominous signs, remove the element.
            p_len = tree |> Floki.find("p") |> length
            img_len = tree |> Floki.find("img") |> length
            li_len = tree |> Floki.find("li") |> length
            input_len = tree |> Floki.find("input") |> length

            embed_len =
              tree
              |> Floki.find("embed")
              |> Enum.reject(&(&1 =~ Readability.regexes(:video)))
              |> length

            link_density = Scoring.calc_link_density(tree)
            conent_len = Helper.text_length(tree)

            # too many image
            # more
s # less than 3x
s than s # too short a content length without a single image # too many links for its weight (#{weight}) # too many links for its weight (#{weight}) #