readability/lib/readability/candidate_finder.ex

69 lines
1.8 KiB
Elixir
Raw Normal View History

2016-04-17 12:26:51 +00:00
defmodule Readability.CandidateFinder do
@moduledoc """
The engine for building and finding candidates.
It traverses the HTML tree, searching, removing, and scoring nodes.
"""
alias Readability.Helper
alias Readability.Candidate
alias Readability.Candidate.Scoring
@type html_tree :: tuple | list
@type options :: list
@doc """
Finds the nodes of an HTML tree that should be meaningful article candidates.

The tree is walked depth-first. Every `{tag, attrs, children}` node for which
the private `candidate?/2` check holds is wrapped in a `%Candidate{}` carrying
the node itself, its score from `Scoring.calc_score/2`, and the depth at which
it was found. Children are always visited, whether or not their parent was
itself a candidate.

## Parameters

  * `html_tree` - a `{tag, attrs, children}` tuple, a text binary, or a list
    of such nodes.
  * `opts` - options passed through unchanged to `Scoring.calc_score/2`.
  * `tree_depth` - depth recorded on candidates found at this level (defaults
    to `0`); it grows by one for each level of element nesting.

## Returns

A flat list of `%Candidate{}` structs in document order, with a parent listed
before its descendants. Text nodes and empty lists yield `[]`.
"""
@spec find(html_tree, options, number) :: [Candidate.t()]
def find(html_tree, opts \\ [], tree_depth \\ 0)

# Sibling nodes share the same depth. flat_map concatenates each node's
# (already flat) result once, instead of re-flattening the accumulated list
# at every cons cell.
def find(nodes, opts, tree_depth) when is_list(nodes) do
  Enum.flat_map(nodes, &find(&1, opts, tree_depth))
end

# Text nodes can never be candidates and have no children to visit.
def find(text, _, _) when is_binary(text), do: []

def find({_tag, _attrs, inner_tree} = html_tree, opts, tree_depth) do
  if candidate?(html_tree) do
    candidate = %Candidate{
      html_tree: html_tree,
      score: Scoring.calc_score(html_tree, opts),
      tree_depth: tree_depth
    }

    # The parent is listed before the candidates found among its descendants.
    [candidate | find(inner_tree, opts, tree_depth + 1)]
  else
    find(inner_tree, opts, tree_depth + 1)
  end
end
@doc """
Finds the candidate with the highest score.

## Parameters

  * `candidates` - a list of candidates, each exposing a `score` field.

## Returns

The candidate whose `score` is greatest, or `nil` when `candidates` is empty.
When several candidates share the top score, the first one in the list is
returned (the behaviour of `Enum.max_by/2`).
"""
@spec find_best_candidate([Candidate.t()]) :: Candidate.t() | nil
def find_best_candidate([]), do: nil

def find_best_candidate(candidates) do
  Enum.max_by(candidates, & &1.score)
end
# Tells whether `node` itself, or any element nested at most two levels below
# it, satisfies `Helper.candidate_tag?/1`.
#
# `depth` counts element nesting relative to the node the check started from;
# it only grows when descending into a tuple's children, so siblings in a list
# are all inspected at the same depth.
defp candidate?(node, depth \\ 0)

# Stop looking once we are more than two levels below the starting node.
defp candidate?(_node, depth) when depth > 2, do: false

# A list qualifies as soon as one of its members does; an empty list never does.
defp candidate?(nodes, depth) when is_list(nodes) do
  Enum.any?(nodes, fn node -> candidate?(node, depth) end)
end

# Plain text is never a candidate.
defp candidate?(text, _depth) when is_binary(text), do: false

defp candidate?({_tag, _attrs, children} = element, depth) do
  cond do
    Helper.candidate_tag?(element) -> true
    true -> candidate?(children, depth + 1)
  end
end
end