defmodule Readability.Helper do
  @moduledoc """
  Helpers for parsing, updating, removing html tree.
  """

  # An html node is a `{tag, attrs, children}` tuple; a tree may also be a
  # bare list of nodes or a text binary.
  @type html_tree :: tuple | list

  @doc """
  Change existing tags by selector.

  Walks the tree and replaces the tag name of every node whose tag equals
  `selector` with `tag`, recursing into children. Text nodes pass through.
  """
  @spec change_tag(html_tree, String.t(), String.t()) :: html_tree
  def change_tag(content, _, _) when is_binary(content), do: content
  def change_tag([], _, _), do: []

  def change_tag([h | t], selector, tag) do
    [change_tag(h, selector, tag) | change_tag(t, selector, tag)]
  end

  # Matching node: the first and second parameters bind the same `tag_name`,
  # so this clause only fires when the node's tag equals the selector.
  def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
    {tag, attrs, change_tag(inner_tree, tag_name, tag)}
  end

  def change_tag({tag_name, attrs, html_tree}, selector, tag) do
    {tag_name, attrs, change_tag(html_tree, selector, tag)}
  end

  @doc """
  Remove html attributes.

  `target_attr` may be an exact attribute name, a list of names, or a regex
  matched against each attribute name. Matching attributes are stripped from
  every node in the tree.
  """
  @spec remove_attrs(html_tree, String.t() | [String.t()] | Regex.t()) :: html_tree
  def remove_attrs(content, _) when is_binary(content), do: content
  def remove_attrs([], _), do: []

  def remove_attrs([h | t], t_attrs) do
    [remove_attrs(h, t_attrs) | remove_attrs(t, t_attrs)]
  end

  def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
    # Pick the rejection predicate from the shape of `target_attr`.
    reject_fun =
      cond do
        is_binary(target_attr) ->
          fn attr -> elem(attr, 0) == target_attr end

        # `match?/2` on the struct replaces the deprecated `Regex.regex?/1`;
        # behavior is identical.
        match?(%Regex{}, target_attr) ->
          fn attr -> elem(attr, 0) =~ target_attr end

        is_list(target_attr) ->
          fn attr -> Enum.member?(target_attr, elem(attr, 0)) end

        true ->
          fn attr -> attr end
      end

    {tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
  end

  @doc """
  Remove tags.

  `fun` receives each `{tag, attrs, children}` node and returns true when the
  node (with its whole subtree) should be dropped.
  """
  @spec remove_tag(html_tree, fun) :: html_tree
  def remove_tag(content, _) when is_binary(content), do: content
  def remove_tag([], _), do: []

  def remove_tag([h | t], fun) do
    node = remove_tag(h, fun)

    # A removed node comes back as `[]`; drop it from the rebuilt list.
    if node == [] do
      remove_tag(t, fun)
    else
      [node | remove_tag(t, fun)]
    end
  end

  def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do
    if fun.(html_tree) do
      []
    else
      {tag, attrs, remove_tag(inner_tree, fun)}
    end
  end

  @doc """
  Count only text length.
  """
  @spec text_length(html_tree) :: number
  def text_length(html_tree) do
    html_tree |> Floki.text() |> String.trim() |> String.length()
  end

  @doc """
  Check html_tree can be candidate or not.

  A node is a candidate when it is a `p` or `td` tag whose trimmed text is at
  least the configured `:min_text_length`.
  """
  @spec candidate_tag?(html_tree) :: boolean
  def candidate_tag?({tag, _, _} = html_tree) do
    Enum.any?(["p", "td"], fn candidate_tag ->
      tag == candidate_tag &&
        text_length(html_tree) >= Readability.default_options()[:min_text_length]
    end)
  end

  @doc """
  Normalize and Parse to html tree (tuple or list) from binary html.

  Pass `url: base_url` in `opts` to rewrite relative `img` paths to absolute.
  """
  @spec normalize(binary, list) :: html_tree
  def normalize(raw_html, opts \\ []) do
    raw_html
    |> String.replace(Readability.regexes(:replace_xml_version), "")
    |> String.replace(Readability.regexes(:replace_brs), "</p><p>")
    # `\\1` keeps the captured `/` so both `<font>` and `</font>` become
    # span tags. (The corrupted source had `\1`, i.e. a literal 0x01 byte.)
    |> String.replace(Readability.regexes(:replace_fonts), "<\\1span>")
    |> String.replace(Readability.regexes(:normalize), " ")
    |> transform_img_paths(opts[:url])
    |> Floki.parse()
    |> Floki.filter_out(:comment)
    |> remove_tag(fn {tag, _, _} -> is_atom(tag) end)
  end

  # Turn relative `img` tag paths into absolute if possible.
  defp transform_img_paths(html_str, nil), do: html_str

  defp transform_img_paths(html_str, url) do
    Readability.regexes(:img_tag_src)
    |> Regex.replace(html_str, &build_img_path(url, &1, &2, &3, &4))
  end

  # Rewrite one `src` value: relative paths (no host) are prefixed with the
  # page's base url; absolute urls pass through untouched.
  defp build_img_path(url, _str, pre_src, src, post_src) do
    new_src =
      case URI.parse(src) do
        %URI{host: nil} ->
          base_url = base_url(url)
          scrubbed_src = String.trim_leading(src, "/")

          base_url <> "/" <> scrubbed_src

        _ ->
          src
      end

    pre_src <> new_src <> post_src
  end

  # Get the base url of a given url, including its scheme.
  # E.g: both http://elixir-lang.org/guides and elixir-lang.org/guides
  # would return http://elixir-lang.org
  defp base_url(url) do
    scheme_regex = ~r/^(https?:\/\/)?(.*)/i
    path_regex = ~r/^([^\/]+)(.*)/i

    url_without_scheme = Regex.replace(scheme_regex, url, "\\2")
    base_url = Regex.replace(path_regex, url_without_scheme, "\\1")

    # Default to http when the input url carries no scheme.
    scheme = URI.parse(url).scheme || "http"

    scheme <> "://" <> base_url
  end
end