readability/lib/readability/helper.ex

115 lines
3.1 KiB
Elixir
Raw Normal View History

2016-04-17 06:28:33 +00:00
defmodule Readability.Helper do
  @moduledoc """
  Helpers for parsing, updating and removing nodes of an HTML tree.

  The functions in this module walk a tree made of three kinds of values:

    * a binary, treated as a text node and always returned untouched,
    * a `{tag_name, attributes, children}` tuple, treated as an element,
    * a list of the above.
  """

  @type html_tree :: tuple | list

  # Tag names that `candidate_tag?/1` accepts.
  @candidate_tags ["p", "td"]

  @doc """
  Renames every element whose tag name equals `selector` to `tag`.

  The whole tree is walked recursively. Attributes and children of a renamed
  element are kept; text nodes are returned unchanged.
  """
  @spec change_tag(html_tree, String.t(), String.t()) :: html_tree
  def change_tag(content, _, _) when is_binary(content), do: content
  def change_tag([], _, _), do: []

  def change_tag([h | t], selector, tag) do
    [change_tag(h, selector, tag) | change_tag(t, selector, tag)]
  end

  # `tag_name` is bound twice in the head, so this clause only matches when
  # the element's tag name is equal to the selector.
  def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
    {tag, attrs, change_tag(inner_tree, tag_name, tag)}
  end

  def change_tag({tag_name, attrs, inner_tree}, selector, tag) do
    {tag_name, attrs, change_tag(inner_tree, selector, tag)}
  end

  @doc """
  Removes attributes from every element of the tree.

  `target_attr` selects the attributes to drop by name:

    * a binary removes the attribute with exactly that name,
    * a `Regex` removes every attribute whose name matches it,
    * a list removes every attribute whose name is a member of the list.

  Any other value removes nothing and the tree is returned with its
  attributes intact.
  """
  @spec remove_attrs(html_tree, String.t() | [String.t()] | Regex.t()) :: html_tree
  def remove_attrs(content, _) when is_binary(content), do: content
  def remove_attrs([], _), do: []

  def remove_attrs([h | t], target_attr) do
    [remove_attrs(h, target_attr) | remove_attrs(t, target_attr)]
  end

  def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
    kept_attrs = Enum.reject(attrs, attr_matcher(target_attr))
    {tag_name, kept_attrs, remove_attrs(inner_tree, target_attr)}
  end

  @doc """
  Removes every element for which `fun` returns a truthy value.

  `fun` receives the whole `{tag, attrs, children}` tuple. A removed element
  is dropped together with its children. Text nodes are never passed to `fun`
  and are always kept.

  When `html_tree` is a single element (not a list) and `fun` matches it,
  `nil` is returned.
  """
  @spec remove_tag(html_tree, fun) :: html_tree | nil
  def remove_tag(content, _) when is_binary(content), do: content
  def remove_tag([], _), do: []

  def remove_tag([h | t], fun) do
    case remove_tag(h, fun) do
      # The head element was removed, so it is skipped in the result.
      nil -> remove_tag(t, fun)
      node -> [node | remove_tag(t, fun)]
    end
  end

  def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do
    if fun.(html_tree) do
      nil
    else
      {tag, attrs, remove_tag(inner_tree, fun)}
    end
  end

  @doc """
  Returns the length of the tree's text with surrounding whitespace trimmed.

  The text is extracted with `Floki.text/1`.
  """
  @spec text_length(html_tree) :: number
  def text_length(html_tree) do
    html_tree |> Floki.text() |> String.trim() |> String.length()
  end

  @doc """
  Checks whether an element can be a candidate.

  Returns `true` when the tag is `"p"` or `"td"` and its text length is at
  least the `:min_text_length` entry of `Readability.default_options/0`.
  """
  @spec candidate_tag?(html_tree) :: boolean
  def candidate_tag?({tag, _, _} = html_tree) do
    tag in @candidate_tags and
      text_length(html_tree) >= Readability.default_options()[:min_text_length]
  end

  @doc """
  Normalizes raw HTML and parses it into an HTML tree (tuple or list).

  Before parsing, the patterns returned by `Readability.regexes/1` for
  `:replace_xml_version`, `:replace_brs`, `:replace_fonts` and `:normalize`
  are replaced in that order. The result is parsed with `Floki.parse/1` and
  passed through `Floki.filter_out/2` with `:comment`.
  """
  @spec normalize(binary) :: html_tree
  def normalize(raw_html) do
    raw_html
    |> String.replace(Readability.regexes(:replace_xml_version), "")
    |> String.replace(Readability.regexes(:replace_brs), "</p><p>")
    # The backslash must be escaped: "\1" in a double-quoted string is just
    # "1", which would emit "<1span>". "\\1" inserts capture group 1.
    # NOTE(review): assumes the :replace_fonts regex captures the optional
    # closing slash as group 1 - confirm against Readability.regexes/1.
    |> String.replace(Readability.regexes(:replace_fonts), "<\\1span>")
    |> String.replace(Readability.regexes(:normalize), " ")
    |> Floki.parse()
    |> Floki.filter_out(:comment)
  end

  # Builds the predicate `remove_attrs/2` uses to decide which attributes to
  # drop. Each attribute is a tuple whose first element is the name.
  defp attr_matcher(name) when is_binary(name) do
    fn attr -> elem(attr, 0) == name end
  end

  defp attr_matcher(%Regex{} = regex) do
    fn attr -> elem(attr, 0) =~ regex end
  end

  defp attr_matcher(names) when is_list(names) do
    fn attr -> elem(attr, 0) in names end
  end

  # Unsupported selector: match nothing, so no attribute is removed.
  defp attr_matcher(_other) do
    fn _attr -> false end
  end
end