diff --git a/.formatter.exs b/.formatter.exs
new file mode 100644
index 0000000..525446d
--- /dev/null
+++ b/.formatter.exs
@@ -0,0 +1,4 @@
+# Used by "mix format"
+[
+ inputs: ["mix.exs", "{config,lib,test}/**/*.{ex,exs}"]
+]
diff --git a/lib/readability.ex b/lib/readability.ex
index c50c715..6b0d819 100644
--- a/lib/readability.ex
+++ b/lib/readability.ex
@@ -34,32 +34,36 @@ defmodule Readability do
alias Readability.Summary
alias Readability.Helper
- @default_options [retry_length: 250,
- min_text_length: 25,
- remove_unlikely_candidates: true,
- weight_classes: true,
- clean_conditionally: true,
- remove_empty_nodes: true,
- min_image_width: 130,
- min_image_height: 80,
- ignore_image_format: [],
- blacklist: nil,
- whitelist: nil,
- page_url: nil
- ]
+ @default_options [
+ retry_length: 250,
+ min_text_length: 25,
+ remove_unlikely_candidates: true,
+ weight_classes: true,
+ clean_conditionally: true,
+ remove_empty_nodes: true,
+ min_image_width: 130,
+ min_image_height: 80,
+ ignore_image_format: [],
+ blacklist: nil,
+ whitelist: nil,
+ page_url: nil
+ ]
- @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
- ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
- positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
- negative: ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
- div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
- replace_brs: ~r/(
]*>[ \n\r\t]*){2,}/i,
- replace_fonts: ~r/<(\/?)font[^>]*>/i,
- replace_xml_version: ~r/<\?xml.*\?>/i,
- normalize: ~r/\s{2,}/,
- video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
- protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
- ]
+ @regexes [
+ unlikely_candidate:
+ ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
+ ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
+ positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+ negative:
+ ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
+ div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
+ replace_brs: ~r/(
]*>[ \n\r\t]*){2,}/i,
+ replace_fonts: ~r/<(\/?)font[^>]*>/i,
+ replace_xml_version: ~r/<\?xml.*\?>/i,
+ normalize: ~r/\s{2,}/,
+ video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
+ protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
+ ]
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s+charset=.*)?$/i
@@ -72,32 +76,30 @@ defmodule Readability do
@doc """
summarize the primary readable content of a webpage.
"""
- @spec summarize(url, options) :: Summary.t
+ @spec summarize(url, options) :: Summary.t()
def summarize(url, opts \\ []) do
- opts = Keyword.merge(opts, [page_url: url])
- httpoison_options = Application.get_env :readability, :httpoison_options, []
+ opts = Keyword.merge(opts, page_url: url)
+ httpoison_options = Application.get_env(:readability, :httpoison_options, [])
%{status_code: _, body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options)
case is_response_markup(headers) do
true ->
html_tree = Helper.normalize(raw)
- article_tree = html_tree
- |> ArticleBuilder.build(opts)
- %Summary{title: title(html_tree),
- authors: authors(html_tree),
- article_html: readable_html(article_tree),
- article_text: readable_text(article_tree)
+ article_tree =
+ html_tree
+ |> ArticleBuilder.build(opts)
+
+ %Summary{
+ title: title(html_tree),
+ authors: authors(html_tree),
+ article_html: readable_html(article_tree),
+ article_text: readable_text(article_tree)
}
_ ->
- %Summary{title: nil,
- authors: nil,
- article_html: nil,
- article_text: raw
- }
+ %Summary{title: nil, authors: nil, article_html: nil, article_text: raw}
end
-
end
@doc """
@@ -112,8 +114,10 @@ defmodule Readability do
def mime(headers \\ []) do
headers
|> Enum.find(
- {"Content-Type", "text/plain"}, # default
- fn({key, _}) -> String.downcase(key) == "content-type" end)
+ # default
+ {"Content-Type", "text/plain"},
+ fn {key, _} -> String.downcase(key) == "content-type" end
+ )
|> elem(1)
end
@@ -141,12 +145,12 @@ defmodule Readability do
"""
@spec title(binary | html_tree) :: binary
def title(raw_html) when is_binary(raw_html) do
- raw_html
- |> Helper.normalize
- |> title
+ raw_html
+ |> Helper.normalize()
+ |> title
end
- def title(html_tree), do: TitleFinder.title(html_tree)
+ def title(html_tree), do: TitleFinder.title(html_tree)
@doc """
Extract authors
@@ -173,8 +177,9 @@ defmodule Readability do
@spec article(binary, options) :: html_tree
def article(raw_html, opts \\ []) do
opts = Keyword.merge(@default_options, opts)
+
raw_html
- |> Helper.normalize
+ |> Helper.normalize()
|> ArticleBuilder.build(opts)
end
@@ -196,10 +201,11 @@ defmodule Readability do
# TODO: Remove image caption when extract only text
tags_to_br = ~r/<\/(p|div|article|h\d)/i
html_str = html_tree |> raw_html
- Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
- |> Floki.parse
- |> Floki.text
- |> String.strip
+
+ Regex.replace(tags_to_br, html_str, &"\n#{&1}")
+ |> Floki.parse()
+ |> Floki.text()
+ |> String.strip()
end
@doc """
@@ -207,7 +213,7 @@ defmodule Readability do
"""
@spec raw_html(html_tree) :: binary
def raw_html(html_tree) do
- html_tree |> Floki.raw_html
+ html_tree |> Floki.raw_html()
end
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex
index b071fc3..d9b6d1c 100644
--- a/lib/readability/article_builder.ex
+++ b/lib/readability/article_builder.ex
@@ -20,12 +20,18 @@ defmodule Readability.ArticleBuilder do
@spec build(html_tree, options) :: html_tree
def build(html_tree, opts) do
origin_tree = html_tree
- html_tree = html_tree
- |> Helper.remove_tag(fn({tag, _, _}) ->
- Enum.member?(["script", "style"], tag)
- end)
- html_tree = if opts[:remove_unlikely_candidates], do: Cleaner.remove_unlikely_tree(html_tree), else: html_tree
+ html_tree =
+ html_tree
+ |> Helper.remove_tag(fn {tag, _, _} ->
+ Enum.member?(["script", "style"], tag)
+ end)
+
+ html_tree =
+ if opts[:remove_unlikely_candidates],
+ do: Cleaner.remove_unlikely_tree(html_tree),
+ else: html_tree
+
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
candidates = CandidateFinder.find(html_tree, opts)
@@ -48,25 +54,34 @@ defmodule Readability.ArticleBuilder do
cond do
opts[:remove_unlikely_candidates] ->
Keyword.put(opts, :remove_unlikely_candidates, false)
+
opts[:weight_classes] ->
Keyword.put(opts, :weight_classes, false)
+
opts[:clean_conditionally] ->
Keyword.put(opts, :clean_conditionally, false)
- true -> nil
+
+ true ->
+ nil
end
end
defp find_article(candidates, html_tree) do
best_candidate = CandidateFinder.find_best_candidate(candidates)
- article_trees = if best_candidate do
- find_article_trees(best_candidate, candidates)
- else
- fallback_candidate = case html_tree |> Floki.find("body") do
- [tree|_] -> %Candidate{html_tree: tree}
- _ -> %Candidate{html_tree: {}}
- end
- find_article_trees(fallback_candidate, candidates)
- end
+
+ article_trees =
+ if best_candidate do
+ find_article_trees(best_candidate, candidates)
+ else
+ fallback_candidate =
+ case html_tree |> Floki.find("body") do
+ [tree | _] -> %Candidate{html_tree: tree}
+ _ -> %Candidate{html_tree: {}}
+ end
+
+ find_article_trees(fallback_candidate, candidates)
+ end
+
{"div", [], article_trees}
end
@@ -75,22 +90,21 @@ defmodule Readability.ArticleBuilder do
candidates
|> Enum.filter(&(&1.tree_depth == best_candidate.tree_depth))
- |> Enum.filter(fn(candidate) ->
- candidate == best_candidate
- || candidate.score >= score_threshold
- || append?(candidate)
- end)
- |> Enum.map(&(to_article_tag(&1.html_tree)))
+ |> Enum.filter(fn candidate ->
+ candidate == best_candidate || candidate.score >= score_threshold || append?(candidate)
+ end)
+ |> Enum.map(&to_article_tag(&1.html_tree))
end
defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
link_density = Scoring.calc_link_density(html_tree)
- inner_text = html_tree |> Floki.text
- inner_length = inner_text |> String.length
+ inner_text = html_tree |> Floki.text()
+ inner_length = inner_text |> String.length()
- (inner_length > 80 && link_density < 0.25)
- || (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
+ (inner_length > 80 && link_density < 0.25) ||
+ (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
end
+
defp append?(_), do: false
defp to_article_tag({tag, attrs, inner_tree} = html_tree) do
diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex
index 8350e4e..074ea67 100644
--- a/lib/readability/author_finder.ex
+++ b/lib/readability/author_finder.ex
@@ -11,21 +11,24 @@ defmodule Readability.AuthorFinder do
@spec find(html_tree) :: [binary]
def find(html_tree) do
author_names = find_by_meta_tag(html_tree)
+
if author_names do
split_author_names(author_names)
end
end
def find_by_meta_tag(html_tree) do
- names = html_tree
- |> Floki.find("meta[name*=author], meta[property*=author]")
- |> Enum.map(fn(meta) ->
- meta
- |> Floki.attribute("content")
- |> Enum.join(" ")
- |> String.strip
- end)
- |> Enum.reject(&(is_nil(&1) || String.length(&1) == 0))
+ names =
+ html_tree
+ |> Floki.find("meta[name*=author], meta[property*=author]")
+ |> Enum.map(fn meta ->
+ meta
+ |> Floki.attribute("content")
+ |> Enum.join(" ")
+ |> String.strip()
+ end)
+ |> Enum.reject(&(is_nil(&1) || String.length(&1) == 0))
+
if length(names) > 0 do
hd(names)
else
diff --git a/lib/readability/candidate/cleaner.ex b/lib/readability/candidate/cleaner.ex
index 4599984..f8e068f 100644
--- a/lib/readability/candidate/cleaner.ex
+++ b/lib/readability/candidate/cleaner.ex
@@ -14,9 +14,11 @@ defmodule Readability.Candidate.Cleaner do
@spec transform_misused_div_to_p(html_tree) :: html_tree
def transform_misused_div_to_p(content) when is_binary(content), do: content
def transform_misused_div_to_p([]), do: []
- def transform_misused_div_to_p([h|t]) do
- [transform_misused_div_to_p(h)|transform_misused_div_to_p(t)]
+
+ def transform_misused_div_to_p([h | t]) do
+ [transform_misused_div_to_p(h) | transform_misused_div_to_p(t)]
end
+
def transform_misused_div_to_p({tag, attrs, inner_tree}) do
tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
{tag, attrs, transform_misused_div_to_p(inner_tree)}
@@ -33,16 +35,18 @@ defmodule Readability.Candidate.Cleaner do
defp misused_divs?("div", inner_tree) do
!(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements))
end
+
defp misused_divs?(_, _), do: false
defp unlikely_tree?({tag, attrs, _}) do
- idclass_str = attrs
- |> Enum.filter_map(&(elem(&1, 0) =~ ~r/id|class/i), &(elem(&1, 1)))
- |> Enum.join("")
+ idclass_str =
+ attrs
+ |> Enum.filter_map(&(elem(&1, 0) =~ ~r/id|class/i), &elem(&1, 1))
+ |> Enum.join("")
+
str = tag <> idclass_str
- str =~ Readability.regexes(:unlikely_candidate)
- && !(str =~ Readability.regexes(:ok_maybe_its_a_candidate))
- && tag != "html"
+ str =~ Readability.regexes(:unlikely_candidate) &&
+ !(str =~ Readability.regexes(:ok_maybe_its_a_candidate)) && tag != "html"
end
end
diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex
index 38e85a0..792abf9 100644
--- a/lib/readability/candidate/scoring.ex
+++ b/lib/readability/candidate/scoring.ex
@@ -4,11 +4,7 @@ defmodule Readability.Candidate.Scoring do
"""
alias Readability.Helper
- @element_scores %{"div" => 5,
- "blockquote" => 3,
- "form" => -3,
- "th" => -5
- }
+ @element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5}
@type html_tree :: tuple | list
@type options :: list
@@ -20,15 +16,19 @@ defmodule Readability.Candidate.Scoring do
@spec calc_score(html_tree, options) :: number
def calc_score(html_tree, opts \\ []) do
score = calc_node_score(html_tree, opts)
- score = score + calc_children_content_score(html_tree) + calc_grand_children_content_score(html_tree)
+
+ score =
+ score + calc_children_content_score(html_tree) +
+ calc_grand_children_content_score(html_tree)
+
score * (1 - calc_link_density(html_tree))
end
defp calc_content_score(html_tree) do
score = 1
- inner_text = html_tree |> Floki.text
+ inner_text = html_tree |> Floki.text()
split_score = inner_text |> String.split(",") |> length
- length_score = [(String.length(inner_text) / 100), 3] |> Enum.min
+ length_score = [String.length(inner_text) / 100, 3] |> Enum.min()
score + split_score + length_score
end
@@ -37,9 +37,11 @@ defmodule Readability.Candidate.Scoring do
score = if opts[:weight_classes], do: score + class_weight(attrs), else: score
score + (@element_scores[tag] || 0)
end
- defp calc_node_score([h|t], opts) do
+
+ defp calc_node_score([h | t], opts) do
calc_node_score(h, opts) + calc_node_score(t, opts)
end
+
defp calc_node_score([], _), do: 0
def class_weight(attrs) do
@@ -55,14 +57,16 @@ defmodule Readability.Candidate.Scoring do
end
def calc_link_density(html_tree) do
- link_length = html_tree
- |> Floki.find("a")
- |> Floki.text
- |> String.length
+ link_length =
+ html_tree
+ |> Floki.find("a")
+ |> Floki.text()
+ |> String.length()
- text_length = html_tree
- |> Floki.text
- |> String.length
+ text_length =
+ html_tree
+ |> Floki.text()
+ |> String.length()
if text_length == 0 do
0
@@ -78,11 +82,13 @@ defmodule Readability.Candidate.Scoring do
end
defp calc_grand_children_content_score({_, _, children_tree}) do
- score = children_tree
- |> Enum.filter_map(&is_tuple(&1), &elem(&1, 2))
- |> List.flatten
- |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
- |> calc_content_score
+ score =
+ children_tree
+ |> Enum.filter_map(&is_tuple(&1), &elem(&1, 2))
+ |> List.flatten()
+ |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
+ |> calc_content_score
+
score / 2
end
end
diff --git a/lib/readability/candidate_finder.ex b/lib/readability/candidate_finder.ex
index 4f4896a..1f551cb 100644
--- a/lib/readability/candidate_finder.ex
+++ b/lib/readability/candidate_finder.ex
@@ -14,20 +14,26 @@ defmodule Readability.CandidateFinder do
@doc """
Find candidates that shuld be meaningful article by analysing nodes
"""
- @spec find(html_tree, options, number) :: [Candidate.t]
+ @spec find(html_tree, options, number) :: [Candidate.t()]
def find(_, opts \\ [], tree_depth \\ 0)
def find([], _, _), do: []
- def find([h|t], opts, tree_depth) do
+
+ def find([h | t], opts, tree_depth) do
[find(h, opts, tree_depth) | find(t, opts, tree_depth)]
- |> List.flatten
+ |> List.flatten()
end
+
def find(text, _, _) when is_binary(text), do: []
+
def find({tag, attrs, inner_tree}, opts, tree_depth) do
html_tree = {tag, attrs, inner_tree}
+
if candidate?(html_tree) do
- candidate = %Candidate{html_tree: html_tree,
- score: Scoring.calc_score(html_tree, opts),
- tree_depth: tree_depth}
+ candidate = %Candidate{
+ html_tree: html_tree,
+ score: Scoring.calc_score(html_tree, opts),
+ tree_depth: tree_depth
+ }
[candidate | find(inner_tree, opts, tree_depth + 1)]
else
@@ -38,18 +44,20 @@ defmodule Readability.CandidateFinder do
@doc """
Find the highest score candidate.
"""
- @spec find_best_candidate([Candidate.t]) :: Candidate.t
+ @spec find_best_candidate([Candidate.t()]) :: Candidate.t()
def find_best_candidate([]), do: nil
+
def find_best_candidate(candidates) do
candidates
- |> Enum.max_by(fn(candidate) -> candidate.score end)
+ |> Enum.max_by(fn candidate -> candidate.score end)
end
defp candidate?(_, depth \\ 0)
defp candidate?(_, depth) when depth > 2, do: false
- defp candidate?([h|t], depth), do: candidate?(h, depth) || candidate?(t, depth)
+ defp candidate?([h | t], depth), do: candidate?(h, depth) || candidate?(t, depth)
defp candidate?([], _), do: false
defp candidate?(text, _) when is_binary(text), do: false
+
defp candidate?({_, _, inner_tree} = html_tree, depth) do
if Helper.candidate_tag?(html_tree) do
true
diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex
index 012ee79..afce5dd 100644
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@@ -8,15 +8,18 @@ defmodule Readability.Helper do
@doc """
Change existing tags by selector
"""
- @spec change_tag(html_tree, String.t, String.t) :: html_tree
+ @spec change_tag(html_tree, String.t(), String.t()) :: html_tree
def change_tag(content, _, _) when is_binary(content), do: content
def change_tag([], _, _), do: []
- def change_tag([h|t], selector, tag) do
- [change_tag(h, selector, tag)|change_tag(t, selector, tag)]
+
+ def change_tag([h | t], selector, tag) do
+ [change_tag(h, selector, tag) | change_tag(t, selector, tag)]
end
+
def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
{tag, attrs, change_tag(inner_tree, tag_name, tag)}
end
+
def change_tag({tag_name, attrs, html_tree}, selector, tag) do
{tag_name, attrs, change_tag(html_tree, selector, tag)}
end
@@ -24,41 +27,50 @@ defmodule Readability.Helper do
@doc """
Remove html attributes
"""
- @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
+ @spec remove_attrs(html_tree, String.t() | [String.t()] | Regex.t()) :: html_tree
def remove_attrs(content, _) when is_binary(content), do: content
def remove_attrs([], _), do: []
- def remove_attrs([h|t], t_attrs) do
- [remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)]
+
+ def remove_attrs([h | t], t_attrs) do
+ [remove_attrs(h, t_attrs) | remove_attrs(t, t_attrs)]
end
+
def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
reject_fun =
cond do
is_binary(target_attr) ->
- fn(attr) -> elem(attr, 0) == target_attr end
+ fn attr -> elem(attr, 0) == target_attr end
+
Regex.regex?(target_attr) ->
- fn(attr) -> elem(attr, 0) =~ target_attr end
+ fn attr -> elem(attr, 0) =~ target_attr end
+
is_list(target_attr) ->
- fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
- true -> fn(attr) -> attr end
+ fn attr -> Enum.member?(target_attr, elem(attr, 0)) end
+
+ true ->
+ fn attr -> attr end
end
+
{tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
end
-
@doc """
Remove tags
"""
@spec remove_tag(html_tree, fun) :: html_tree
def remove_tag(content, _) when is_binary(content), do: content
def remove_tag([], _), do: []
- def remove_tag([h|t], fun) do
+
+ def remove_tag([h | t], fun) do
node = remove_tag(h, fun)
+
if is_nil(node) do
remove_tag(t, fun)
else
- [node|remove_tag(t, fun)]
+ [node | remove_tag(t, fun)]
end
end
+
def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do
if fun.(html_tree) do
nil
@@ -72,7 +84,7 @@ defmodule Readability.Helper do
"""
@spec text_length(html_tree) :: number
def text_length(html_tree) do
- html_tree |> Floki.text |> String.strip |> String.length
+ html_tree |> Floki.text() |> String.strip() |> String.length()
end
@doc """
@@ -80,9 +92,9 @@ defmodule Readability.Helper do
"""
@spec candidate_tag?(html_tree) :: boolean
def candidate_tag?({tag, _, _} = html_tree) do
- Enum.any?(["p", "td"], fn(candidate_tag) ->
- tag == candidate_tag
- && (text_length(html_tree)) >= Readability.default_options[:min_text_length]
+ Enum.any?(["p", "td"], fn candidate_tag ->
+ tag == candidate_tag &&
+ text_length(html_tree) >= Readability.default_options()[:min_text_length]
end)
end
@@ -96,7 +108,7 @@ defmodule Readability.Helper do
|> String.replace(Readability.regexes(:replace_brs), "
") |> String.replace(Readability.regexes(:replace_fonts), "<\1span>") |> String.replace(Readability.regexes(:normalize), " ") - |> Floki.parse + |> Floki.parse() |> Floki.filter_out(:comment) end end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex index 42fa90a..3605f8d 100644 --- a/lib/readability/sanitizer.ex +++ b/lib/readability/sanitizer.ex @@ -13,12 +13,13 @@ defmodule Readability.Sanitizer do @doc """ Sanitizes article html tree """ - @spec sanitize(html_tree, [Candidate.t], list) :: html_tree - def sanitize(html_tree, candidates, opts \\ []) do - html_tree = html_tree - |> Helper.remove_tag(&clean_headline_tag?(&1)) - |> Helper.remove_tag(&clean_unlikely_tag?(&1)) - |> Helper.remove_tag(&clean_empty_p?(&1)) + @spec sanitize(html_tree, [Candidate.t()], list) :: html_tree + def sanitize(html_tree, candidates, opts \\ []) do + html_tree = + html_tree + |> Helper.remove_tag(&clean_headline_tag?(&1)) + |> Helper.remove_tag(&clean_unlikely_tag?(&1)) + |> Helper.remove_tag(&clean_empty_p?(&1)) if opts[:clean_conditionally] do html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) @@ -28,15 +29,19 @@ defmodule Readability.Sanitizer do end defp conditionally_cleaing_fn(candidates) do - fn({tag, attrs, _} = tree) -> + fn {tag, attrs, _} = tree -> if Enum.any?(["table", "ul", "div"], &(&1 == tag)) do weight = Scoring.class_weight(attrs) - same_tree = candidates - |> Enum.find(%Candidate{}, &(&1.html_tree == tree)) + + same_tree = + candidates + |> Enum.find(%Candidate{}, &(&1.html_tree == tree)) + list? = tag == "ul" + cond do - weight + same_tree.score < 0 - -> true + weight + same_tree.score < 0 -> + true length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 -> # If there are not very many commas, and the number of @@ -46,35 +51,42 @@ defmodule Readability.Sanitizer do img_len = tree |> Floki.find("img") |> length li_len = tree |> Floki.find("li") |> length input_len = tree |> Floki.find("input") |> length - embed_len = tree - |> Floki.find("embed") - |> Enum.reject(&(&1 =~ Readability.regexes(:video))) - |> length - link_density = Scoring.calc_link_density(tree) + embed_len = + tree + |> Floki.find("embed") + |> Enum.reject(&(&1 =~ Readability.regexes(:video))) + |> length + + link_density = Scoring.calc_link_density(tree) conent_len = Helper.text_length(tree) - img_len > p_len # too many image - || (!list? && li_len > p_len) # more
s - || input_len > (p_len / 3) # less than 3x
s than s - || (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) # too short a content length without a single image - || (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight}) - || (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight}) - || ((embed_len == 1 && conent_len < 75) || embed_len > 1) #