95 lines
2.6 KiB
Elixir
95 lines
2.6 KiB
Elixir
defmodule Readability.Candidate.Scoring do
  @moduledoc """
  Scores a node of a parsed HTML tree.

  The score of a node combines:

    * a base score for the node's own tag, optionally adjusted by its
      `class` / `id` attributes,
    * a content score for its children accepted by
      `Readability.Helper.candidate_tag?/1`,
    * half of the same content score computed over its grandchildren,

  and is finally scaled down by the node's link density.
  """

  alias Readability.Helper

  # Base score contributed by the node's own tag; unlisted tags contribute 0.
  @element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5}

  # Amount added (positive match) or subtracted (negative match) per attribute.
  @class_weight_step 25

  @type html_tree :: tuple | list
  @type options :: list

  @doc """
  Scores `html_tree` from its own tag, the text of its children and
  grandchildren, and its link density.

  The children and grandchildren helpers only match `{tag, attrs, children}`
  tuples, so `html_tree` has to be a single element node of that shape.

  ## Options

    * `:weight_classes` - when truthy, the result of `class_weight/1` for the
      node's attributes is added to the node score.
  """
  @spec calc_score(html_tree, options) :: number
  def calc_score(html_tree, opts \\ []) do
    content_score =
      calc_node_score(html_tree, opts) +
        calc_children_content_score(html_tree) +
        calc_grand_children_content_score(html_tree)

    # A node whose text is entirely link text ends up with a score of 0.
    content_score * (1 - calc_link_density(html_tree))
  end

  @doc """
  Returns the weight derived from the `class` and `id` attributes in `attrs`.

  `attrs` is a list of `{name, value}` tuples. Each of the two attributes adds
  25 when it matches `Readability.regexes(:positive)` and subtracts 25 when it
  matches `Readability.regexes(:negative)`. A missing attribute is treated as
  an empty string.
  """
  @spec class_weight(list) :: integer
  def class_weight(attrs) do
    positive = Readability.regexes(:positive)
    negative = Readability.regexes(:negative)

    attr_weight(attr_value(attrs, "class"), positive, negative) +
      attr_weight(attr_value(attrs, "id"), positive, negative)
  end

  @doc """
  Returns the ratio of the text length found inside `a` elements to the total
  text length of `html_tree`.

  Returns `0` when the tree contains no text at all.
  """
  @spec calc_link_density(html_tree) :: number
  def calc_link_density(html_tree) do
    link_length = html_tree |> Floki.find("a") |> text_length()

    case text_length(html_tree) do
      # Avoid dividing by zero for nodes without any text.
      0 -> 0
      total_length -> link_length / total_length
    end
  end

  # Content score of a list of nodes: 1 base point, plus the number of
  # comma-separated segments of the text, plus one point per 100 characters
  # of text capped at 3.
  defp calc_content_score(html_tree) do
    inner_text = Floki.text(html_tree)
    split_score = inner_text |> String.split(",") |> length()
    length_score = min(String.length(inner_text) / 100, 3)

    1 + split_score + length_score
  end

  # Score of the node itself: tag base score plus, when the `:weight_classes`
  # option is set, the class/id weight. A list of nodes scores as the sum of
  # its elements.
  defp calc_node_score({tag, attrs, _children}, opts) do
    class_score = if opts[:weight_classes], do: class_weight(attrs), else: 0
    class_score + Map.get(@element_scores, tag, 0)
  end

  defp calc_node_score([head | tail], opts) do
    calc_node_score(head, opts) + calc_node_score(tail, opts)
  end

  defp calc_node_score([], _opts), do: 0

  # Content score of the direct children accepted as candidates.
  defp calc_children_content_score({_tag, _attrs, children_tree}) do
    children_tree
    |> Enum.filter(&candidate_node?/1)
    |> calc_content_score()
  end

  # Content score of the candidate grandchildren, counted at half weight.
  defp calc_grand_children_content_score({_tag, _attrs, children_tree}) do
    # Only element tuples carry a children list. Text nodes and tuples of any
    # other size (for example `{:comment, text}`) are skipped instead of being
    # indexed blindly with `elem/2`.
    grand_children =
      for {_child_tag, _child_attrs, nodes} <- children_tree, do: nodes

    score =
      grand_children
      |> List.flatten()
      |> Enum.filter(&candidate_node?/1)
      |> calc_content_score()

    score / 2
  end

  # True for tuple nodes that `Helper.candidate_tag?/1` accepts.
  defp candidate_node?(node), do: is_tuple(node) && Helper.candidate_tag?(node)

  # Value of the attribute `name` in `attrs`, or "" when it is absent.
  defp attr_value(attrs, name) do
    attrs |> List.keyfind(name, 0, {"", ""}) |> elem(1)
  end

  # Weight of one attribute value against the positive and negative regexes.
  defp attr_weight(value, positive, negative) do
    bonus = if value =~ positive, do: @class_weight_step, else: 0
    penalty = if value =~ negative, do: @class_weight_step, else: 0

    bonus - penalty
  end

  # Number of graphemes in the text content of `html_tree`.
  defp text_length(html_tree) do
    html_tree |> Floki.text() |> String.length()
  end
end
|