# Source: readability/lib/readability/candidate/scoring.ex
#
# Original listing: 95 lines, 2.6 KiB, Elixir

defmodule Readability.Candidate.Scoring do
  @moduledoc """
  Computes a numeric score for an HTML tree.

  The score is built from a per-tag base value, an optional class/id weight,
  the text of the tree's candidate children and grandchildren, and is finally
  scaled down by the tree's link density.
  """

  alias Readability.Helper

  # Base score contributed by a node's tag name; tags not listed contribute 0.
  @element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5}

  @type html_tree :: tuple | list
  @type options :: list

  @doc """
  Scores `html_tree` from its tag, its children's and grandchildren's text and
  its link density.

  The result is

      (node_score + children_score + grand_children_score) * (1 - link_density)

  ## Options

    * `:weight_classes` - when truthy, the result of `class_weight/1` for the
      node's attributes is added to the node score.

  Although `html_tree` is typed as a tuple or a list, the children scoring
  helpers only match a `{tag, attrs, children}` tuple, so any other shape
  raises `FunctionClauseError`.
  """
  @spec calc_score(html_tree, options) :: number
  def calc_score(html_tree, opts \\ []) do
    content_score =
      calc_node_score(html_tree, opts) +
        calc_children_content_score(html_tree) +
        calc_grand_children_content_score(html_tree)

    content_score * (1 - calc_link_density(html_tree))
  end

  @doc """
  Returns the weight derived from the `"class"` and `"id"` entries of `attrs`.

  `attrs` is a list of `{name, value}` tuples; a missing attribute is treated
  as the empty string. Each of the two values adds 25 when it matches
  `Readability.regexes(:positive)` and subtracts 25 when it matches
  `Readability.regexes(:negative)`, so the result lies between -50 and 50.
  """
  @spec class_weight(list) :: integer
  def class_weight(attrs) do
    class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1)
    id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1)
    positive = Readability.regexes(:positive)
    negative = Readability.regexes(:negative)

    deltas =
      for value <- [class, id],
          {regex, delta} <- [{positive, 25}, {negative, -25}],
          value =~ regex,
          do: delta

    Enum.sum(deltas)
  end

  @doc """
  Returns the ratio of link text length to total text length in `html_tree`.

  Link text is the text of every `a` element found by `Floki.find/2`. When the
  tree has no text at all the integer `0` is returned, which avoids dividing
  by zero.
  """
  @spec calc_link_density(html_tree) :: number
  def calc_link_density(html_tree) do
    link_length =
      html_tree
      |> Floki.find("a")
      |> Floki.text()
      |> String.length()

    text_length =
      html_tree
      |> Floki.text()
      |> String.length()

    if text_length == 0 do
      0
    else
      link_length / text_length
    end
  end

  # Scores the text of `html_tree`: a base of 1, plus the number of
  # comma-separated segments (always at least 1, because splitting yields one
  # segment even without a comma), plus one point per 100 characters capped
  # at 3.
  defp calc_content_score(html_tree) do
    inner_text = Floki.text(html_tree)
    split_score = inner_text |> String.split(",") |> length()
    length_score = min(String.length(inner_text) / 100, 3)

    1 + split_score + length_score
  end

  # Score of a single element: its tag's base score plus, when the
  # `:weight_classes` option is truthy, the class/id weight of its attributes.
  defp calc_node_score({tag, attrs, _}, opts) do
    base = Map.get(@element_scores, tag, 0)

    if opts[:weight_classes], do: base + class_weight(attrs), else: base
  end

  # A list of nodes scores the sum of its members; the empty list scores 0.
  defp calc_node_score([h | t], opts) do
    calc_node_score(h, opts) + calc_node_score(t, opts)
  end

  defp calc_node_score([], _), do: 0

  # Content score of the direct children that are tuples accepted by
  # `Helper.candidate_tag?/1`.
  defp calc_children_content_score({_, _, children_tree}) do
    children_tree
    |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
    |> calc_content_score()
  end

  # Half the content score of the grandchildren that are tuples accepted by
  # `Helper.candidate_tag?/1`.
  defp calc_grand_children_content_score({_, _, children_tree}) do
    # Only 3-tuple children are matched, so text nodes and tuples of any other
    # size are skipped instead of raising as `elem(node, 2)` would on a
    # 2-tuple. This also replaces the deprecated `Enum.filter_map/3`.
    grand_children = for {_tag, _attrs, nodes} <- children_tree, do: nodes

    score =
      grand_children
      |> List.flatten()
      |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
      |> calc_content_score()

    score / 2
  end
end