Add Elixir 1.6 formatter config file and format the codebase
This commit is contained in:
parent
307152202b
commit
b2f8a3b4da
|
@ -0,0 +1,4 @@
|
||||||
|
# Used by "mix format"
|
||||||
|
[
|
||||||
|
inputs: ["mix.exs", "{config,lib,test}/**/*.{ex,exs}"]
|
||||||
|
]
|
|
@ -34,32 +34,36 @@ defmodule Readability do
|
||||||
alias Readability.Summary
|
alias Readability.Summary
|
||||||
alias Readability.Helper
|
alias Readability.Helper
|
||||||
|
|
||||||
@default_options [retry_length: 250,
|
@default_options [
|
||||||
min_text_length: 25,
|
retry_length: 250,
|
||||||
remove_unlikely_candidates: true,
|
min_text_length: 25,
|
||||||
weight_classes: true,
|
remove_unlikely_candidates: true,
|
||||||
clean_conditionally: true,
|
weight_classes: true,
|
||||||
remove_empty_nodes: true,
|
clean_conditionally: true,
|
||||||
min_image_width: 130,
|
remove_empty_nodes: true,
|
||||||
min_image_height: 80,
|
min_image_width: 130,
|
||||||
ignore_image_format: [],
|
min_image_height: 80,
|
||||||
blacklist: nil,
|
ignore_image_format: [],
|
||||||
whitelist: nil,
|
blacklist: nil,
|
||||||
page_url: nil
|
whitelist: nil,
|
||||||
]
|
page_url: nil
|
||||||
|
]
|
||||||
|
|
||||||
@regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
@regexes [
|
||||||
ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
|
unlikely_candidate:
|
||||||
positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
|
~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
||||||
negative: ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
|
ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
|
||||||
div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
|
||||||
replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
|
negative:
|
||||||
replace_fonts: ~r/<(\/?)font[^>]*>/i,
|
~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
|
||||||
replace_xml_version: ~r/<\?xml.*\?>/i,
|
div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
||||||
normalize: ~r/\s{2,}/,
|
replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
|
||||||
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
replace_fonts: ~r/<(\/?)font[^>]*>/i,
|
||||||
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
replace_xml_version: ~r/<\?xml.*\?>/i,
|
||||||
]
|
normalize: ~r/\s{2,}/,
|
||||||
|
video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||||
|
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
||||||
|
]
|
||||||
|
|
||||||
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s+charset=.*)?$/i
|
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s+charset=.*)?$/i
|
||||||
|
|
||||||
|
@ -72,32 +76,30 @@ defmodule Readability do
|
||||||
@doc """
|
@doc """
|
||||||
summarize the primary readable content of a webpage.
|
summarize the primary readable content of a webpage.
|
||||||
"""
|
"""
|
||||||
@spec summarize(url, options) :: Summary.t
|
@spec summarize(url, options) :: Summary.t()
|
||||||
def summarize(url, opts \\ []) do
|
def summarize(url, opts \\ []) do
|
||||||
opts = Keyword.merge(opts, [page_url: url])
|
opts = Keyword.merge(opts, page_url: url)
|
||||||
httpoison_options = Application.get_env :readability, :httpoison_options, []
|
httpoison_options = Application.get_env(:readability, :httpoison_options, [])
|
||||||
%{status_code: _, body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options)
|
%{status_code: _, body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options)
|
||||||
|
|
||||||
case is_response_markup(headers) do
|
case is_response_markup(headers) do
|
||||||
true ->
|
true ->
|
||||||
html_tree = Helper.normalize(raw)
|
html_tree = Helper.normalize(raw)
|
||||||
article_tree = html_tree
|
|
||||||
|> ArticleBuilder.build(opts)
|
|
||||||
|
|
||||||
%Summary{title: title(html_tree),
|
article_tree =
|
||||||
authors: authors(html_tree),
|
html_tree
|
||||||
article_html: readable_html(article_tree),
|
|> ArticleBuilder.build(opts)
|
||||||
article_text: readable_text(article_tree)
|
|
||||||
|
%Summary{
|
||||||
|
title: title(html_tree),
|
||||||
|
authors: authors(html_tree),
|
||||||
|
article_html: readable_html(article_tree),
|
||||||
|
article_text: readable_text(article_tree)
|
||||||
}
|
}
|
||||||
|
|
||||||
_ ->
|
_ ->
|
||||||
%Summary{title: nil,
|
%Summary{title: nil, authors: nil, article_html: nil, article_text: raw}
|
||||||
authors: nil,
|
|
||||||
article_html: nil,
|
|
||||||
article_text: raw
|
|
||||||
}
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
|
@ -112,8 +114,10 @@ defmodule Readability do
|
||||||
def mime(headers \\ []) do
|
def mime(headers \\ []) do
|
||||||
headers
|
headers
|
||||||
|> Enum.find(
|
|> Enum.find(
|
||||||
{"Content-Type", "text/plain"}, # default
|
# default
|
||||||
fn({key, _}) -> String.downcase(key) == "content-type" end)
|
{"Content-Type", "text/plain"},
|
||||||
|
fn {key, _} -> String.downcase(key) == "content-type" end
|
||||||
|
)
|
||||||
|> elem(1)
|
|> elem(1)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -141,12 +145,12 @@ defmodule Readability do
|
||||||
"""
|
"""
|
||||||
@spec title(binary | html_tree) :: binary
|
@spec title(binary | html_tree) :: binary
|
||||||
def title(raw_html) when is_binary(raw_html) do
|
def title(raw_html) when is_binary(raw_html) do
|
||||||
raw_html
|
raw_html
|
||||||
|> Helper.normalize
|
|> Helper.normalize()
|
||||||
|> title
|
|> title
|
||||||
end
|
end
|
||||||
def title(html_tree), do: TitleFinder.title(html_tree)
|
|
||||||
|
|
||||||
|
def title(html_tree), do: TitleFinder.title(html_tree)
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Extract authors
|
Extract authors
|
||||||
|
@ -173,8 +177,9 @@ defmodule Readability do
|
||||||
@spec article(binary, options) :: html_tree
|
@spec article(binary, options) :: html_tree
|
||||||
def article(raw_html, opts \\ []) do
|
def article(raw_html, opts \\ []) do
|
||||||
opts = Keyword.merge(@default_options, opts)
|
opts = Keyword.merge(@default_options, opts)
|
||||||
|
|
||||||
raw_html
|
raw_html
|
||||||
|> Helper.normalize
|
|> Helper.normalize()
|
||||||
|> ArticleBuilder.build(opts)
|
|> ArticleBuilder.build(opts)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -196,10 +201,11 @@ defmodule Readability do
|
||||||
# TODO: Remove image caption when extract only text
|
# TODO: Remove image caption when extract only text
|
||||||
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
||||||
html_str = html_tree |> raw_html
|
html_str = html_tree |> raw_html
|
||||||
Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
|
|
||||||
|> Floki.parse
|
Regex.replace(tags_to_br, html_str, &"\n#{&1}")
|
||||||
|> Floki.text
|
|> Floki.parse()
|
||||||
|> String.strip
|
|> Floki.text()
|
||||||
|
|> String.strip()
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
|
@ -207,7 +213,7 @@ defmodule Readability do
|
||||||
"""
|
"""
|
||||||
@spec raw_html(html_tree) :: binary
|
@spec raw_html(html_tree) :: binary
|
||||||
def raw_html(html_tree) do
|
def raw_html(html_tree) do
|
||||||
html_tree |> Floki.raw_html
|
html_tree |> Floki.raw_html()
|
||||||
end
|
end
|
||||||
|
|
||||||
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
|
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
|
||||||
|
|
|
@ -20,12 +20,18 @@ defmodule Readability.ArticleBuilder do
|
||||||
@spec build(html_tree, options) :: html_tree
|
@spec build(html_tree, options) :: html_tree
|
||||||
def build(html_tree, opts) do
|
def build(html_tree, opts) do
|
||||||
origin_tree = html_tree
|
origin_tree = html_tree
|
||||||
html_tree = html_tree
|
|
||||||
|> Helper.remove_tag(fn({tag, _, _}) ->
|
|
||||||
Enum.member?(["script", "style"], tag)
|
|
||||||
end)
|
|
||||||
|
|
||||||
html_tree = if opts[:remove_unlikely_candidates], do: Cleaner.remove_unlikely_tree(html_tree), else: html_tree
|
html_tree =
|
||||||
|
html_tree
|
||||||
|
|> Helper.remove_tag(fn {tag, _, _} ->
|
||||||
|
Enum.member?(["script", "style"], tag)
|
||||||
|
end)
|
||||||
|
|
||||||
|
html_tree =
|
||||||
|
if opts[:remove_unlikely_candidates],
|
||||||
|
do: Cleaner.remove_unlikely_tree(html_tree),
|
||||||
|
else: html_tree
|
||||||
|
|
||||||
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
|
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
|
||||||
|
|
||||||
candidates = CandidateFinder.find(html_tree, opts)
|
candidates = CandidateFinder.find(html_tree, opts)
|
||||||
|
@ -48,25 +54,34 @@ defmodule Readability.ArticleBuilder do
|
||||||
cond do
|
cond do
|
||||||
opts[:remove_unlikely_candidates] ->
|
opts[:remove_unlikely_candidates] ->
|
||||||
Keyword.put(opts, :remove_unlikely_candidates, false)
|
Keyword.put(opts, :remove_unlikely_candidates, false)
|
||||||
|
|
||||||
opts[:weight_classes] ->
|
opts[:weight_classes] ->
|
||||||
Keyword.put(opts, :weight_classes, false)
|
Keyword.put(opts, :weight_classes, false)
|
||||||
|
|
||||||
opts[:clean_conditionally] ->
|
opts[:clean_conditionally] ->
|
||||||
Keyword.put(opts, :clean_conditionally, false)
|
Keyword.put(opts, :clean_conditionally, false)
|
||||||
true -> nil
|
|
||||||
|
true ->
|
||||||
|
nil
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
defp find_article(candidates, html_tree) do
|
defp find_article(candidates, html_tree) do
|
||||||
best_candidate = CandidateFinder.find_best_candidate(candidates)
|
best_candidate = CandidateFinder.find_best_candidate(candidates)
|
||||||
article_trees = if best_candidate do
|
|
||||||
find_article_trees(best_candidate, candidates)
|
article_trees =
|
||||||
else
|
if best_candidate do
|
||||||
fallback_candidate = case html_tree |> Floki.find("body") do
|
find_article_trees(best_candidate, candidates)
|
||||||
[tree|_] -> %Candidate{html_tree: tree}
|
else
|
||||||
_ -> %Candidate{html_tree: {}}
|
fallback_candidate =
|
||||||
end
|
case html_tree |> Floki.find("body") do
|
||||||
find_article_trees(fallback_candidate, candidates)
|
[tree | _] -> %Candidate{html_tree: tree}
|
||||||
end
|
_ -> %Candidate{html_tree: {}}
|
||||||
|
end
|
||||||
|
|
||||||
|
find_article_trees(fallback_candidate, candidates)
|
||||||
|
end
|
||||||
|
|
||||||
{"div", [], article_trees}
|
{"div", [], article_trees}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -75,22 +90,21 @@ defmodule Readability.ArticleBuilder do
|
||||||
|
|
||||||
candidates
|
candidates
|
||||||
|> Enum.filter(&(&1.tree_depth == best_candidate.tree_depth))
|
|> Enum.filter(&(&1.tree_depth == best_candidate.tree_depth))
|
||||||
|> Enum.filter(fn(candidate) ->
|
|> Enum.filter(fn candidate ->
|
||||||
candidate == best_candidate
|
candidate == best_candidate || candidate.score >= score_threshold || append?(candidate)
|
||||||
|| candidate.score >= score_threshold
|
end)
|
||||||
|| append?(candidate)
|
|> Enum.map(&to_article_tag(&1.html_tree))
|
||||||
end)
|
|
||||||
|> Enum.map(&(to_article_tag(&1.html_tree)))
|
|
||||||
end
|
end
|
||||||
|
|
||||||
defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
|
defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
|
||||||
link_density = Scoring.calc_link_density(html_tree)
|
link_density = Scoring.calc_link_density(html_tree)
|
||||||
inner_text = html_tree |> Floki.text
|
inner_text = html_tree |> Floki.text()
|
||||||
inner_length = inner_text |> String.length
|
inner_length = inner_text |> String.length()
|
||||||
|
|
||||||
(inner_length > 80 && link_density < 0.25)
|
(inner_length > 80 && link_density < 0.25) ||
|
||||||
|| (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
|
(inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
|
||||||
end
|
end
|
||||||
|
|
||||||
defp append?(_), do: false
|
defp append?(_), do: false
|
||||||
|
|
||||||
defp to_article_tag({tag, attrs, inner_tree} = html_tree) do
|
defp to_article_tag({tag, attrs, inner_tree} = html_tree) do
|
||||||
|
|
|
@ -11,21 +11,24 @@ defmodule Readability.AuthorFinder do
|
||||||
@spec find(html_tree) :: [binary]
|
@spec find(html_tree) :: [binary]
|
||||||
def find(html_tree) do
|
def find(html_tree) do
|
||||||
author_names = find_by_meta_tag(html_tree)
|
author_names = find_by_meta_tag(html_tree)
|
||||||
|
|
||||||
if author_names do
|
if author_names do
|
||||||
split_author_names(author_names)
|
split_author_names(author_names)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def find_by_meta_tag(html_tree) do
|
def find_by_meta_tag(html_tree) do
|
||||||
names = html_tree
|
names =
|
||||||
|> Floki.find("meta[name*=author], meta[property*=author]")
|
html_tree
|
||||||
|> Enum.map(fn(meta) ->
|
|> Floki.find("meta[name*=author], meta[property*=author]")
|
||||||
meta
|
|> Enum.map(fn meta ->
|
||||||
|> Floki.attribute("content")
|
meta
|
||||||
|> Enum.join(" ")
|
|> Floki.attribute("content")
|
||||||
|> String.strip
|
|> Enum.join(" ")
|
||||||
end)
|
|> String.strip()
|
||||||
|> Enum.reject(&(is_nil(&1) || String.length(&1) == 0))
|
end)
|
||||||
|
|> Enum.reject(&(is_nil(&1) || String.length(&1) == 0))
|
||||||
|
|
||||||
if length(names) > 0 do
|
if length(names) > 0 do
|
||||||
hd(names)
|
hd(names)
|
||||||
else
|
else
|
||||||
|
|
|
@ -14,9 +14,11 @@ defmodule Readability.Candidate.Cleaner do
|
||||||
@spec transform_misused_div_to_p(html_tree) :: html_tree
|
@spec transform_misused_div_to_p(html_tree) :: html_tree
|
||||||
def transform_misused_div_to_p(content) when is_binary(content), do: content
|
def transform_misused_div_to_p(content) when is_binary(content), do: content
|
||||||
def transform_misused_div_to_p([]), do: []
|
def transform_misused_div_to_p([]), do: []
|
||||||
def transform_misused_div_to_p([h|t]) do
|
|
||||||
[transform_misused_div_to_p(h)|transform_misused_div_to_p(t)]
|
def transform_misused_div_to_p([h | t]) do
|
||||||
|
[transform_misused_div_to_p(h) | transform_misused_div_to_p(t)]
|
||||||
end
|
end
|
||||||
|
|
||||||
def transform_misused_div_to_p({tag, attrs, inner_tree}) do
|
def transform_misused_div_to_p({tag, attrs, inner_tree}) do
|
||||||
tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
|
tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
|
||||||
{tag, attrs, transform_misused_div_to_p(inner_tree)}
|
{tag, attrs, transform_misused_div_to_p(inner_tree)}
|
||||||
|
@ -33,16 +35,18 @@ defmodule Readability.Candidate.Cleaner do
|
||||||
defp misused_divs?("div", inner_tree) do
|
defp misused_divs?("div", inner_tree) do
|
||||||
!(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements))
|
!(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements))
|
||||||
end
|
end
|
||||||
|
|
||||||
defp misused_divs?(_, _), do: false
|
defp misused_divs?(_, _), do: false
|
||||||
|
|
||||||
defp unlikely_tree?({tag, attrs, _}) do
|
defp unlikely_tree?({tag, attrs, _}) do
|
||||||
idclass_str = attrs
|
idclass_str =
|
||||||
|> Enum.filter_map(&(elem(&1, 0) =~ ~r/id|class/i), &(elem(&1, 1)))
|
attrs
|
||||||
|> Enum.join("")
|
|> Enum.filter_map(&(elem(&1, 0) =~ ~r/id|class/i), &elem(&1, 1))
|
||||||
|
|> Enum.join("")
|
||||||
|
|
||||||
str = tag <> idclass_str
|
str = tag <> idclass_str
|
||||||
|
|
||||||
str =~ Readability.regexes(:unlikely_candidate)
|
str =~ Readability.regexes(:unlikely_candidate) &&
|
||||||
&& !(str =~ Readability.regexes(:ok_maybe_its_a_candidate))
|
!(str =~ Readability.regexes(:ok_maybe_its_a_candidate)) && tag != "html"
|
||||||
&& tag != "html"
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -4,11 +4,7 @@ defmodule Readability.Candidate.Scoring do
|
||||||
"""
|
"""
|
||||||
alias Readability.Helper
|
alias Readability.Helper
|
||||||
|
|
||||||
@element_scores %{"div" => 5,
|
@element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5}
|
||||||
"blockquote" => 3,
|
|
||||||
"form" => -3,
|
|
||||||
"th" => -5
|
|
||||||
}
|
|
||||||
|
|
||||||
@type html_tree :: tuple | list
|
@type html_tree :: tuple | list
|
||||||
@type options :: list
|
@type options :: list
|
||||||
|
@ -20,15 +16,19 @@ defmodule Readability.Candidate.Scoring do
|
||||||
@spec calc_score(html_tree, options) :: number
|
@spec calc_score(html_tree, options) :: number
|
||||||
def calc_score(html_tree, opts \\ []) do
|
def calc_score(html_tree, opts \\ []) do
|
||||||
score = calc_node_score(html_tree, opts)
|
score = calc_node_score(html_tree, opts)
|
||||||
score = score + calc_children_content_score(html_tree) + calc_grand_children_content_score(html_tree)
|
|
||||||
|
score =
|
||||||
|
score + calc_children_content_score(html_tree) +
|
||||||
|
calc_grand_children_content_score(html_tree)
|
||||||
|
|
||||||
score * (1 - calc_link_density(html_tree))
|
score * (1 - calc_link_density(html_tree))
|
||||||
end
|
end
|
||||||
|
|
||||||
defp calc_content_score(html_tree) do
|
defp calc_content_score(html_tree) do
|
||||||
score = 1
|
score = 1
|
||||||
inner_text = html_tree |> Floki.text
|
inner_text = html_tree |> Floki.text()
|
||||||
split_score = inner_text |> String.split(",") |> length
|
split_score = inner_text |> String.split(",") |> length
|
||||||
length_score = [(String.length(inner_text) / 100), 3] |> Enum.min
|
length_score = [String.length(inner_text) / 100, 3] |> Enum.min()
|
||||||
score + split_score + length_score
|
score + split_score + length_score
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -37,9 +37,11 @@ defmodule Readability.Candidate.Scoring do
|
||||||
score = if opts[:weight_classes], do: score + class_weight(attrs), else: score
|
score = if opts[:weight_classes], do: score + class_weight(attrs), else: score
|
||||||
score + (@element_scores[tag] || 0)
|
score + (@element_scores[tag] || 0)
|
||||||
end
|
end
|
||||||
defp calc_node_score([h|t], opts) do
|
|
||||||
|
defp calc_node_score([h | t], opts) do
|
||||||
calc_node_score(h, opts) + calc_node_score(t, opts)
|
calc_node_score(h, opts) + calc_node_score(t, opts)
|
||||||
end
|
end
|
||||||
|
|
||||||
defp calc_node_score([], _), do: 0
|
defp calc_node_score([], _), do: 0
|
||||||
|
|
||||||
def class_weight(attrs) do
|
def class_weight(attrs) do
|
||||||
|
@ -55,14 +57,16 @@ defmodule Readability.Candidate.Scoring do
|
||||||
end
|
end
|
||||||
|
|
||||||
def calc_link_density(html_tree) do
|
def calc_link_density(html_tree) do
|
||||||
link_length = html_tree
|
link_length =
|
||||||
|> Floki.find("a")
|
html_tree
|
||||||
|> Floki.text
|
|> Floki.find("a")
|
||||||
|> String.length
|
|> Floki.text()
|
||||||
|
|> String.length()
|
||||||
|
|
||||||
text_length = html_tree
|
text_length =
|
||||||
|> Floki.text
|
html_tree
|
||||||
|> String.length
|
|> Floki.text()
|
||||||
|
|> String.length()
|
||||||
|
|
||||||
if text_length == 0 do
|
if text_length == 0 do
|
||||||
0
|
0
|
||||||
|
@ -78,11 +82,13 @@ defmodule Readability.Candidate.Scoring do
|
||||||
end
|
end
|
||||||
|
|
||||||
defp calc_grand_children_content_score({_, _, children_tree}) do
|
defp calc_grand_children_content_score({_, _, children_tree}) do
|
||||||
score = children_tree
|
score =
|
||||||
|> Enum.filter_map(&is_tuple(&1), &elem(&1, 2))
|
children_tree
|
||||||
|> List.flatten
|
|> Enum.filter_map(&is_tuple(&1), &elem(&1, 2))
|
||||||
|> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
|
|> List.flatten()
|
||||||
|> calc_content_score
|
|> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
|
||||||
|
|> calc_content_score
|
||||||
|
|
||||||
score / 2
|
score / 2
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -14,20 +14,26 @@ defmodule Readability.CandidateFinder do
|
||||||
@doc """
|
@doc """
|
||||||
Find candidates that should be meaningful articles by analysing nodes
|
Find candidates that should be meaningful articles by analysing nodes
|
||||||
"""
|
"""
|
||||||
@spec find(html_tree, options, number) :: [Candidate.t]
|
@spec find(html_tree, options, number) :: [Candidate.t()]
|
||||||
def find(_, opts \\ [], tree_depth \\ 0)
|
def find(_, opts \\ [], tree_depth \\ 0)
|
||||||
def find([], _, _), do: []
|
def find([], _, _), do: []
|
||||||
def find([h|t], opts, tree_depth) do
|
|
||||||
|
def find([h | t], opts, tree_depth) do
|
||||||
[find(h, opts, tree_depth) | find(t, opts, tree_depth)]
|
[find(h, opts, tree_depth) | find(t, opts, tree_depth)]
|
||||||
|> List.flatten
|
|> List.flatten()
|
||||||
end
|
end
|
||||||
|
|
||||||
def find(text, _, _) when is_binary(text), do: []
|
def find(text, _, _) when is_binary(text), do: []
|
||||||
|
|
||||||
def find({tag, attrs, inner_tree}, opts, tree_depth) do
|
def find({tag, attrs, inner_tree}, opts, tree_depth) do
|
||||||
html_tree = {tag, attrs, inner_tree}
|
html_tree = {tag, attrs, inner_tree}
|
||||||
|
|
||||||
if candidate?(html_tree) do
|
if candidate?(html_tree) do
|
||||||
candidate = %Candidate{html_tree: html_tree,
|
candidate = %Candidate{
|
||||||
score: Scoring.calc_score(html_tree, opts),
|
html_tree: html_tree,
|
||||||
tree_depth: tree_depth}
|
score: Scoring.calc_score(html_tree, opts),
|
||||||
|
tree_depth: tree_depth
|
||||||
|
}
|
||||||
|
|
||||||
[candidate | find(inner_tree, opts, tree_depth + 1)]
|
[candidate | find(inner_tree, opts, tree_depth + 1)]
|
||||||
else
|
else
|
||||||
|
@ -38,18 +44,20 @@ defmodule Readability.CandidateFinder do
|
||||||
@doc """
|
@doc """
|
||||||
Find the highest score candidate.
|
Find the highest score candidate.
|
||||||
"""
|
"""
|
||||||
@spec find_best_candidate([Candidate.t]) :: Candidate.t
|
@spec find_best_candidate([Candidate.t()]) :: Candidate.t()
|
||||||
def find_best_candidate([]), do: nil
|
def find_best_candidate([]), do: nil
|
||||||
|
|
||||||
def find_best_candidate(candidates) do
|
def find_best_candidate(candidates) do
|
||||||
candidates
|
candidates
|
||||||
|> Enum.max_by(fn(candidate) -> candidate.score end)
|
|> Enum.max_by(fn candidate -> candidate.score end)
|
||||||
end
|
end
|
||||||
|
|
||||||
defp candidate?(_, depth \\ 0)
|
defp candidate?(_, depth \\ 0)
|
||||||
defp candidate?(_, depth) when depth > 2, do: false
|
defp candidate?(_, depth) when depth > 2, do: false
|
||||||
defp candidate?([h|t], depth), do: candidate?(h, depth) || candidate?(t, depth)
|
defp candidate?([h | t], depth), do: candidate?(h, depth) || candidate?(t, depth)
|
||||||
defp candidate?([], _), do: false
|
defp candidate?([], _), do: false
|
||||||
defp candidate?(text, _) when is_binary(text), do: false
|
defp candidate?(text, _) when is_binary(text), do: false
|
||||||
|
|
||||||
defp candidate?({_, _, inner_tree} = html_tree, depth) do
|
defp candidate?({_, _, inner_tree} = html_tree, depth) do
|
||||||
if Helper.candidate_tag?(html_tree) do
|
if Helper.candidate_tag?(html_tree) do
|
||||||
true
|
true
|
||||||
|
|
|
@ -8,15 +8,18 @@ defmodule Readability.Helper do
|
||||||
@doc """
|
@doc """
|
||||||
Change existing tags by selector
|
Change existing tags by selector
|
||||||
"""
|
"""
|
||||||
@spec change_tag(html_tree, String.t, String.t) :: html_tree
|
@spec change_tag(html_tree, String.t(), String.t()) :: html_tree
|
||||||
def change_tag(content, _, _) when is_binary(content), do: content
|
def change_tag(content, _, _) when is_binary(content), do: content
|
||||||
def change_tag([], _, _), do: []
|
def change_tag([], _, _), do: []
|
||||||
def change_tag([h|t], selector, tag) do
|
|
||||||
[change_tag(h, selector, tag)|change_tag(t, selector, tag)]
|
def change_tag([h | t], selector, tag) do
|
||||||
|
[change_tag(h, selector, tag) | change_tag(t, selector, tag)]
|
||||||
end
|
end
|
||||||
|
|
||||||
def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
|
def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
|
||||||
{tag, attrs, change_tag(inner_tree, tag_name, tag)}
|
{tag, attrs, change_tag(inner_tree, tag_name, tag)}
|
||||||
end
|
end
|
||||||
|
|
||||||
def change_tag({tag_name, attrs, html_tree}, selector, tag) do
|
def change_tag({tag_name, attrs, html_tree}, selector, tag) do
|
||||||
{tag_name, attrs, change_tag(html_tree, selector, tag)}
|
{tag_name, attrs, change_tag(html_tree, selector, tag)}
|
||||||
end
|
end
|
||||||
|
@ -24,41 +27,50 @@ defmodule Readability.Helper do
|
||||||
@doc """
|
@doc """
|
||||||
Remove html attributes
|
Remove html attributes
|
||||||
"""
|
"""
|
||||||
@spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
|
@spec remove_attrs(html_tree, String.t() | [String.t()] | Regex.t()) :: html_tree
|
||||||
def remove_attrs(content, _) when is_binary(content), do: content
|
def remove_attrs(content, _) when is_binary(content), do: content
|
||||||
def remove_attrs([], _), do: []
|
def remove_attrs([], _), do: []
|
||||||
def remove_attrs([h|t], t_attrs) do
|
|
||||||
[remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)]
|
def remove_attrs([h | t], t_attrs) do
|
||||||
|
[remove_attrs(h, t_attrs) | remove_attrs(t, t_attrs)]
|
||||||
end
|
end
|
||||||
|
|
||||||
def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
|
def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
|
||||||
reject_fun =
|
reject_fun =
|
||||||
cond do
|
cond do
|
||||||
is_binary(target_attr) ->
|
is_binary(target_attr) ->
|
||||||
fn(attr) -> elem(attr, 0) == target_attr end
|
fn attr -> elem(attr, 0) == target_attr end
|
||||||
|
|
||||||
Regex.regex?(target_attr) ->
|
Regex.regex?(target_attr) ->
|
||||||
fn(attr) -> elem(attr, 0) =~ target_attr end
|
fn attr -> elem(attr, 0) =~ target_attr end
|
||||||
|
|
||||||
is_list(target_attr) ->
|
is_list(target_attr) ->
|
||||||
fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
|
fn attr -> Enum.member?(target_attr, elem(attr, 0)) end
|
||||||
true -> fn(attr) -> attr end
|
|
||||||
|
true ->
|
||||||
|
fn attr -> attr end
|
||||||
end
|
end
|
||||||
|
|
||||||
{tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
|
{tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Remove tags
|
Remove tags
|
||||||
"""
|
"""
|
||||||
@spec remove_tag(html_tree, fun) :: html_tree
|
@spec remove_tag(html_tree, fun) :: html_tree
|
||||||
def remove_tag(content, _) when is_binary(content), do: content
|
def remove_tag(content, _) when is_binary(content), do: content
|
||||||
def remove_tag([], _), do: []
|
def remove_tag([], _), do: []
|
||||||
def remove_tag([h|t], fun) do
|
|
||||||
|
def remove_tag([h | t], fun) do
|
||||||
node = remove_tag(h, fun)
|
node = remove_tag(h, fun)
|
||||||
|
|
||||||
if is_nil(node) do
|
if is_nil(node) do
|
||||||
remove_tag(t, fun)
|
remove_tag(t, fun)
|
||||||
else
|
else
|
||||||
[node|remove_tag(t, fun)]
|
[node | remove_tag(t, fun)]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do
|
def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do
|
||||||
if fun.(html_tree) do
|
if fun.(html_tree) do
|
||||||
nil
|
nil
|
||||||
|
@ -72,7 +84,7 @@ defmodule Readability.Helper do
|
||||||
"""
|
"""
|
||||||
@spec text_length(html_tree) :: number
|
@spec text_length(html_tree) :: number
|
||||||
def text_length(html_tree) do
|
def text_length(html_tree) do
|
||||||
html_tree |> Floki.text |> String.strip |> String.length
|
html_tree |> Floki.text() |> String.strip() |> String.length()
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
|
@ -80,9 +92,9 @@ defmodule Readability.Helper do
|
||||||
"""
|
"""
|
||||||
@spec candidate_tag?(html_tree) :: boolean
|
@spec candidate_tag?(html_tree) :: boolean
|
||||||
def candidate_tag?({tag, _, _} = html_tree) do
|
def candidate_tag?({tag, _, _} = html_tree) do
|
||||||
Enum.any?(["p", "td"], fn(candidate_tag) ->
|
Enum.any?(["p", "td"], fn candidate_tag ->
|
||||||
tag == candidate_tag
|
tag == candidate_tag &&
|
||||||
&& (text_length(html_tree)) >= Readability.default_options[:min_text_length]
|
text_length(html_tree) >= Readability.default_options()[:min_text_length]
|
||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -96,7 +108,7 @@ defmodule Readability.Helper do
|
||||||
|> String.replace(Readability.regexes(:replace_brs), "</p><p>")
|
|> String.replace(Readability.regexes(:replace_brs), "</p><p>")
|
||||||
|> String.replace(Readability.regexes(:replace_fonts), "<\1span>")
|
|> String.replace(Readability.regexes(:replace_fonts), "<\1span>")
|
||||||
|> String.replace(Readability.regexes(:normalize), " ")
|
|> String.replace(Readability.regexes(:normalize), " ")
|
||||||
|> Floki.parse
|
|> Floki.parse()
|
||||||
|> Floki.filter_out(:comment)
|
|> Floki.filter_out(:comment)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -13,12 +13,13 @@ defmodule Readability.Sanitizer do
|
||||||
@doc """
|
@doc """
|
||||||
Sanitizes article html tree
|
Sanitizes article html tree
|
||||||
"""
|
"""
|
||||||
@spec sanitize(html_tree, [Candidate.t], list) :: html_tree
|
@spec sanitize(html_tree, [Candidate.t()], list) :: html_tree
|
||||||
def sanitize(html_tree, candidates, opts \\ []) do
|
def sanitize(html_tree, candidates, opts \\ []) do
|
||||||
html_tree = html_tree
|
html_tree =
|
||||||
|> Helper.remove_tag(&clean_headline_tag?(&1))
|
html_tree
|
||||||
|> Helper.remove_tag(&clean_unlikely_tag?(&1))
|
|> Helper.remove_tag(&clean_headline_tag?(&1))
|
||||||
|> Helper.remove_tag(&clean_empty_p?(&1))
|
|> Helper.remove_tag(&clean_unlikely_tag?(&1))
|
||||||
|
|> Helper.remove_tag(&clean_empty_p?(&1))
|
||||||
|
|
||||||
if opts[:clean_conditionally] do
|
if opts[:clean_conditionally] do
|
||||||
html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates))
|
html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates))
|
||||||
|
@ -28,15 +29,19 @@ defmodule Readability.Sanitizer do
|
||||||
end
|
end
|
||||||
|
|
||||||
defp conditionally_cleaing_fn(candidates) do
|
defp conditionally_cleaing_fn(candidates) do
|
||||||
fn({tag, attrs, _} = tree) ->
|
fn {tag, attrs, _} = tree ->
|
||||||
if Enum.any?(["table", "ul", "div"], &(&1 == tag)) do
|
if Enum.any?(["table", "ul", "div"], &(&1 == tag)) do
|
||||||
weight = Scoring.class_weight(attrs)
|
weight = Scoring.class_weight(attrs)
|
||||||
same_tree = candidates
|
|
||||||
|> Enum.find(%Candidate{}, &(&1.html_tree == tree))
|
same_tree =
|
||||||
|
candidates
|
||||||
|
|> Enum.find(%Candidate{}, &(&1.html_tree == tree))
|
||||||
|
|
||||||
list? = tag == "ul"
|
list? = tag == "ul"
|
||||||
|
|
||||||
cond do
|
cond do
|
||||||
weight + same_tree.score < 0
|
weight + same_tree.score < 0 ->
|
||||||
-> true
|
true
|
||||||
|
|
||||||
length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
|
length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
|
||||||
# If there are not very many commas, and the number of
|
# If there are not very many commas, and the number of
|
||||||
|
@ -46,35 +51,42 @@ defmodule Readability.Sanitizer do
|
||||||
img_len = tree |> Floki.find("img") |> length
|
img_len = tree |> Floki.find("img") |> length
|
||||||
li_len = tree |> Floki.find("li") |> length
|
li_len = tree |> Floki.find("li") |> length
|
||||||
input_len = tree |> Floki.find("input") |> length
|
input_len = tree |> Floki.find("input") |> length
|
||||||
embed_len = tree
|
|
||||||
|> Floki.find("embed")
|
|
||||||
|> Enum.reject(&(&1 =~ Readability.regexes(:video)))
|
|
||||||
|> length
|
|
||||||
|
|
||||||
link_density = Scoring.calc_link_density(tree)
|
embed_len =
|
||||||
|
tree
|
||||||
|
|> Floki.find("embed")
|
||||||
|
|> Enum.reject(&(&1 =~ Readability.regexes(:video)))
|
||||||
|
|> length
|
||||||
|
|
||||||
|
link_density = Scoring.calc_link_density(tree)
|
||||||
conent_len = Helper.text_length(tree)
|
conent_len = Helper.text_length(tree)
|
||||||
|
|
||||||
img_len > p_len # too many image
|
# too many image
|
||||||
|| (!list? && li_len > p_len) # more <li>s than <p>s
|
# more <li>s than <p>s
|
||||||
|| input_len > (p_len / 3) # less than 3x <p>s than <input>s
|
# less than 3x <p>s than <input>s
|
||||||
|| (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) # too short a content length without a single image
|
# too short a content length without a single image
|
||||||
|| (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight})
|
# too many links for its weight (#{weight})
|
||||||
|| (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight})
|
# too many links for its weight (#{weight})
|
||||||
|| ((embed_len == 1 && conent_len < 75) || embed_len > 1) # <embed>s with too short a content length, or too many <embed>s
|
# <embed>s with too short a content length, or too many <embed>s
|
||||||
|
img_len > p_len || (!list? && li_len > p_len) || input_len > p_len / 3 ||
|
||||||
|
(!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) ||
|
||||||
|
(weight < 25 && link_density > 0.2) || (weight >= 25 && link_density > 0.5) ||
|
||||||
|
((embed_len == 1 && conent_len < 75) || embed_len > 1)
|
||||||
|
|
||||||
true -> false
|
true ->
|
||||||
|
false
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
defp clean_headline_tag?({tag, attrs, _} = html_tree) do
|
defp clean_headline_tag?({tag, attrs, _} = html_tree) do
|
||||||
tag =~ ~r/^h\d{1}$/
|
tag =~ ~r/^h\d{1}$/ &&
|
||||||
&& (Scoring.class_weight(attrs) < 0 || Scoring.calc_link_density(html_tree) > 0.33)
|
(Scoring.class_weight(attrs) < 0 || Scoring.calc_link_density(html_tree) > 0.33)
|
||||||
end
|
end
|
||||||
|
|
||||||
defp clean_unlikely_tag?({tag, attrs, _}) do
|
defp clean_unlikely_tag?({tag, attrs, _}) do
|
||||||
attrs_str = attrs |> Enum.map(&(elem(&1, 1))) |> Enum.join("")
|
attrs_str = attrs |> Enum.map(&elem(&1, 1)) |> Enum.join("")
|
||||||
tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes(:video))
|
tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes(:video))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@ defmodule Readability.TitleFinder do
|
||||||
else
|
else
|
||||||
h_title
|
h_title
|
||||||
end
|
end
|
||||||
|
|
||||||
title when is_binary(title) ->
|
title when is_binary(title) ->
|
||||||
title
|
title
|
||||||
end
|
end
|
||||||
|
@ -54,7 +55,7 @@ defmodule Readability.TitleFinder do
|
||||||
@doc """
|
@doc """
|
||||||
Find title from h tag
|
Find title from h tag
|
||||||
"""
|
"""
|
||||||
@spec h_tag_title(html_tree, String.t) :: binary
|
@spec h_tag_title(html_tree, String.t()) :: binary
|
||||||
def h_tag_title(html_tree, selector \\ @h_tag_selector) do
|
def h_tag_title(html_tree, selector \\ @h_tag_selector) do
|
||||||
html_tree
|
html_tree
|
||||||
|> find_tag(selector)
|
|> find_tag(selector)
|
||||||
|
@ -65,6 +66,7 @@ defmodule Readability.TitleFinder do
|
||||||
case Floki.find(html_tree, selector) do
|
case Floki.find(html_tree, selector) do
|
||||||
[] ->
|
[] ->
|
||||||
[]
|
[]
|
||||||
|
|
||||||
matches when is_list(matches) ->
|
matches when is_list(matches) ->
|
||||||
hd(matches)
|
hd(matches)
|
||||||
end
|
end
|
||||||
|
@ -73,9 +75,11 @@ defmodule Readability.TitleFinder do
|
||||||
defp clean_title([]) do
|
defp clean_title([]) do
|
||||||
""
|
""
|
||||||
end
|
end
|
||||||
|
|
||||||
defp clean_title([title]) when is_binary(title) do
|
defp clean_title([title]) when is_binary(title) do
|
||||||
String.strip(title)
|
String.strip(title)
|
||||||
end
|
end
|
||||||
|
|
||||||
defp clean_title(html_tree) do
|
defp clean_title(html_tree) do
|
||||||
html_tree
|
html_tree
|
||||||
|> Floki.text()
|
|> Floki.text()
|
||||||
|
|
50
mix.exs
50
mix.exs
|
@ -10,24 +10,23 @@ defmodule Readability.Mixfile do
|
||||||
use Mix.Project
|
use Mix.Project
|
||||||
|
|
||||||
def project do
|
def project do
|
||||||
[app: :readability,
|
[
|
||||||
version: @version,
|
app: :readability,
|
||||||
elixir: "~> 1.3",
|
version: @version,
|
||||||
description: @description,
|
elixir: "~> 1.3",
|
||||||
package: package(),
|
description: @description,
|
||||||
build_embedded: Mix.env == :prod,
|
package: package(),
|
||||||
start_permanent: Mix.env == :prod,
|
build_embedded: Mix.env() == :prod,
|
||||||
deps: deps()]
|
start_permanent: Mix.env() == :prod,
|
||||||
|
deps: deps()
|
||||||
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
# Configuration for the OTP application
|
# Configuration for the OTP application
|
||||||
#
|
#
|
||||||
# Type "mix help compile.app" for more information
|
# Type "mix help compile.app" for more information
|
||||||
def application do
|
def application do
|
||||||
[applications: [:logger,
|
[applications: [:logger, :floki, :httpoison]]
|
||||||
:floki,
|
|
||||||
:httpoison
|
|
||||||
]]
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Dependencies can be Hex packages:
|
# Dependencies can be Hex packages:
|
||||||
|
@ -40,20 +39,25 @@ defmodule Readability.Mixfile do
|
||||||
#
|
#
|
||||||
# Type "mix help deps" for more examples and options
|
# Type "mix help deps" for more examples and options
|
||||||
defp deps do
|
defp deps do
|
||||||
[{:floki, "~> 0.18.0"},
|
[
|
||||||
{:httpoison, "~> 0.13.0"},
|
{:floki, "~> 0.18.0"},
|
||||||
{:ex_doc, "~> 0.14", only: :dev},
|
{:httpoison, "~> 0.13.0"},
|
||||||
{:credo, "~> 0.6.1", only: [:dev, :test]},
|
{:ex_doc, "~> 0.14", only: :dev},
|
||||||
{:dialyxir, "~> 0.3", only: [:dev]},
|
{:credo, "~> 0.6.1", only: [:dev, :test]},
|
||||||
{:mock, "~> 0.2.0", only: :test},
|
{:dialyxir, "~> 0.3", only: [:dev]},
|
||||||
|
{:mock, "~> 0.2.0", only: :test}
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
defp package do
|
defp package do
|
||||||
[files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"],
|
[
|
||||||
maintainers: ["Jaehyun Shin"],
|
files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"],
|
||||||
licenses: ["Apache 2.0"],
|
maintainers: ["Jaehyun Shin"],
|
||||||
links: %{"GitHub" => "https://github.com/keepcosmos/readability",
|
licenses: ["Apache 2.0"],
|
||||||
"Docs" => "https://hexdocs.pm/readability/Readability.html"}]
|
links: %{
|
||||||
|
"GitHub" => "https://github.com/keepcosmos/readability",
|
||||||
|
"Docs" => "https://hexdocs.pm/readability/Readability.html"
|
||||||
|
}
|
||||||
|
]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -29,14 +29,14 @@ defmodule Readability.Candidate.CleanerTest do
|
||||||
|
|
||||||
test "transform divs containing no block elements", %{html_tree: html_tree} do
|
test "transform divs containing no block elements", %{html_tree: html_tree} do
|
||||||
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
|
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
|
||||||
[{tag, _, _}|_] = html_tree |> Floki.find("#body")
|
[{tag, _, _} | _] = html_tree |> Floki.find("#body")
|
||||||
|
|
||||||
assert tag == "p"
|
assert tag == "p"
|
||||||
end
|
end
|
||||||
|
|
||||||
test "not transform divs that contain block elements", %{html_tree: html_tree} do
|
test "not transform divs that contain block elements", %{html_tree: html_tree} do
|
||||||
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
|
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
|
||||||
[{tag, _, _}|_] = html_tree |> Floki.find("#contains_blockquote")
|
[{tag, _, _} | _] = html_tree |> Floki.find("#contains_blockquote")
|
||||||
assert tag == "div"
|
assert tag == "div"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -26,23 +26,25 @@ defmodule Readability.HelperTest do
|
||||||
end
|
end
|
||||||
|
|
||||||
test "change font tag to span", %{html_tree: html_tree} do
|
test "change font tag to span", %{html_tree: html_tree} do
|
||||||
expectred = @sample |> String.replace(~r/font/, "span") |> Floki.parse
|
expectred = @sample |> String.replace(~r/font/, "span") |> Floki.parse()
|
||||||
result = Helper.change_tag(html_tree, "font", "span")
|
result = Helper.change_tag(html_tree, "font", "span")
|
||||||
assert result == expectred
|
assert result == expectred
|
||||||
end
|
end
|
||||||
|
|
||||||
test "remove tag", %{html_tree: html_tree} do
|
test "remove tag", %{html_tree: html_tree} do
|
||||||
expected = "<html><body></body></html>" |> parse
|
expected = "<html><body></body></html>" |> parse
|
||||||
result = html_tree
|
|
||||||
|> Helper.remove_tag(fn({tag, _, _}) ->
|
result =
|
||||||
tag == "p"
|
html_tree
|
||||||
end)
|
|> Helper.remove_tag(fn {tag, _, _} ->
|
||||||
|
tag == "p"
|
||||||
|
end)
|
||||||
|
|
||||||
assert result == expected
|
assert result == expected
|
||||||
end
|
end
|
||||||
|
|
||||||
test "inner text lengt", %{html_tree: html_tree} do
|
test "inner text lengt", %{html_tree: html_tree} do
|
||||||
result = html_tree |> Helper.text_length
|
result = html_tree |> Helper.text_length()
|
||||||
assert result == 5
|
assert result == 5
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -37,6 +37,7 @@ defmodule Readability.TitleFinderTest do
|
||||||
</head>
|
</head>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
title = Readability.TitleFinder.og_title(html)
|
title = Readability.TitleFinder.og_title(html)
|
||||||
assert title == "og title 1"
|
assert title == "og title 1"
|
||||||
end
|
end
|
||||||
|
@ -52,6 +53,7 @@ defmodule Readability.TitleFinderTest do
|
||||||
</head>
|
</head>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
title = Readability.TitleFinder.tag_title(html)
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
assert title == "Tag title"
|
assert title == "Tag title"
|
||||||
|
|
||||||
|
@ -62,6 +64,7 @@ defmodule Readability.TitleFinderTest do
|
||||||
</head>
|
</head>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
title = Readability.TitleFinder.tag_title(html)
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
assert title == "Tag title"
|
assert title == "Tag title"
|
||||||
|
|
||||||
|
@ -72,6 +75,7 @@ defmodule Readability.TitleFinderTest do
|
||||||
</head>
|
</head>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
title = Readability.TitleFinder.tag_title(html)
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
assert title == "Tag title-tag"
|
assert title == "Tag title-tag"
|
||||||
|
|
||||||
|
@ -82,6 +86,7 @@ defmodule Readability.TitleFinderTest do
|
||||||
</head>
|
</head>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
title = Readability.TitleFinder.tag_title(html)
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
assert title == "Tag title-tag-title"
|
assert title == "Tag title-tag-title"
|
||||||
|
|
||||||
|
@ -95,6 +100,7 @@ defmodule Readability.TitleFinderTest do
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
title = Readability.TitleFinder.tag_title(html)
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
assert title == "Tag title"
|
assert title == "Tag title"
|
||||||
end
|
end
|
||||||
|
@ -108,6 +114,7 @@ defmodule Readability.TitleFinderTest do
|
||||||
</head>
|
</head>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
title = Readability.TitleFinder.tag_title(html)
|
title = Readability.TitleFinder.tag_title(html)
|
||||||
assert title == "tag title 1"
|
assert title == "tag title 1"
|
||||||
end
|
end
|
||||||
|
@ -131,6 +138,7 @@ defmodule Readability.TitleFinderTest do
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
title = Readability.TitleFinder.h_tag_title(html)
|
title = Readability.TitleFinder.h_tag_title(html)
|
||||||
assert title == "header 1"
|
assert title == "header 1"
|
||||||
end
|
end
|
||||||
|
|
|
@ -6,12 +6,9 @@ defmodule ReadabilityHttpTest do
|
||||||
test "blank response is parsed as plain text" do
|
test "blank response is parsed as plain text" do
|
||||||
url = "https://tools.ietf.org/rfc/rfc2616.txt"
|
url = "https://tools.ietf.org/rfc/rfc2616.txt"
|
||||||
content = TestHelper.read_fixture("rfc2616.txt")
|
content = TestHelper.read_fixture("rfc2616.txt")
|
||||||
response = %HTTPoison.Response{
|
response = %HTTPoison.Response{status_code: 200, headers: [], body: content}
|
||||||
status_code: 200,
|
|
||||||
headers: [],
|
with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do
|
||||||
body: content}
|
|
||||||
|
|
||||||
with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
|
|
||||||
%Readability.Summary{article_text: result_text} = Readability.summarize(url)
|
%Readability.Summary{article_text: result_text} = Readability.summarize(url)
|
||||||
|
|
||||||
assert result_text =~ ~r/3 Protocol Parameters/
|
assert result_text =~ ~r/3 Protocol Parameters/
|
||||||
|
@ -21,12 +18,14 @@ defmodule ReadabilityHttpTest do
|
||||||
test "text/plain response is parsed as plain text" do
|
test "text/plain response is parsed as plain text" do
|
||||||
url = "https://tools.ietf.org/rfc/rfc2616.txt"
|
url = "https://tools.ietf.org/rfc/rfc2616.txt"
|
||||||
content = TestHelper.read_fixture("rfc2616.txt")
|
content = TestHelper.read_fixture("rfc2616.txt")
|
||||||
|
|
||||||
response = %HTTPoison.Response{
|
response = %HTTPoison.Response{
|
||||||
status_code: 200,
|
status_code: 200,
|
||||||
headers: [{"Content-Type", "text/plain"}],
|
headers: [{"Content-Type", "text/plain"}],
|
||||||
body: content}
|
body: content
|
||||||
|
}
|
||||||
with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
|
|
||||||
|
with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do
|
||||||
%Readability.Summary{article_text: result_text} = Readability.summarize(url)
|
%Readability.Summary{article_text: result_text} = Readability.summarize(url)
|
||||||
|
|
||||||
assert result_text =~ ~r/3 Protocol Parameters/
|
assert result_text =~ ~r/3 Protocol Parameters/
|
||||||
|
@ -38,13 +37,15 @@ defmodule ReadabilityHttpTest do
|
||||||
content = TestHelper.read_fixture("bbc.html")
|
content = TestHelper.read_fixture("bbc.html")
|
||||||
mimes = ["text/html", "application/xml", "application/xhtml+xml"]
|
mimes = ["text/html", "application/xml", "application/xhtml+xml"]
|
||||||
|
|
||||||
mimes |> Enum.each(fn(mime) ->
|
mimes
|
||||||
|
|> Enum.each(fn mime ->
|
||||||
response = %HTTPoison.Response{
|
response = %HTTPoison.Response{
|
||||||
status_code: 200,
|
status_code: 200,
|
||||||
headers: [{"Content-Type", mime}],
|
headers: [{"Content-Type", mime}],
|
||||||
body: content}
|
body: content
|
||||||
|
}
|
||||||
with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
|
|
||||||
|
with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do
|
||||||
%Readability.Summary{article_html: result_html} = Readability.summarize(url)
|
%Readability.Summary{article_html: result_html} = Readability.summarize(url)
|
||||||
|
|
||||||
assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
||||||
|
@ -55,12 +56,14 @@ defmodule ReadabilityHttpTest do
|
||||||
test "response with charset is parsed correctly" do
|
test "response with charset is parsed correctly" do
|
||||||
url = "https://news.bbc.co.uk/test.html"
|
url = "https://news.bbc.co.uk/test.html"
|
||||||
content = TestHelper.read_fixture("bbc.html")
|
content = TestHelper.read_fixture("bbc.html")
|
||||||
|
|
||||||
response = %HTTPoison.Response{
|
response = %HTTPoison.Response{
|
||||||
status_code: 200,
|
status_code: 200,
|
||||||
headers: [{"Content-Type", "text/html; charset=UTF-8"}],
|
headers: [{"Content-Type", "text/html; charset=UTF-8"}],
|
||||||
body: content}
|
body: content
|
||||||
|
}
|
||||||
with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
|
|
||||||
|
with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do
|
||||||
%Readability.Summary{article_html: result_html} = Readability.summarize(url)
|
%Readability.Summary{article_html: result_html} = Readability.summarize(url)
|
||||||
|
|
||||||
assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
||||||
|
@ -71,12 +74,14 @@ defmodule ReadabilityHttpTest do
|
||||||
# HTTP header keys are case insensitive (RFC2616 - Section 4.2)
|
# HTTP header keys are case insensitive (RFC2616 - Section 4.2)
|
||||||
url = "https://news.bbc.co.uk/test.html"
|
url = "https://news.bbc.co.uk/test.html"
|
||||||
content = TestHelper.read_fixture("bbc.html")
|
content = TestHelper.read_fixture("bbc.html")
|
||||||
|
|
||||||
response = %HTTPoison.Response{
|
response = %HTTPoison.Response{
|
||||||
status_code: 200,
|
status_code: 200,
|
||||||
headers: [{"content-Type", "text/html; charset=UTF-8"}],
|
headers: [{"content-Type", "text/html; charset=UTF-8"}],
|
||||||
body: content}
|
body: content
|
||||||
|
}
|
||||||
with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
|
|
||||||
|
with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do
|
||||||
%Readability.Summary{article_html: result_html} = Readability.summarize(url)
|
%Readability.Summary{article_html: result_html} = Readability.summarize(url)
|
||||||
|
|
||||||
assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
||||||
|
|
|
@ -7,7 +7,10 @@ defmodule ReadabilityTest do
|
||||||
nytimes = Readability.article(html, opts)
|
nytimes = Readability.article(html, opts)
|
||||||
|
|
||||||
nytimes_html = Readability.readable_html(nytimes)
|
nytimes_html = Readability.readable_html(nytimes)
|
||||||
assert nytimes_html =~ ~r/^<div><div><figure id=\"media-100000004245260\"><div><img src=\"https/
|
|
||||||
|
assert nytimes_html =~
|
||||||
|
~r/^<div><div><figure id=\"media-100000004245260\"><div><img src=\"https/
|
||||||
|
|
||||||
assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/
|
assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/
|
||||||
|
|
||||||
nytimes_text = Readability.readable_text(nytimes)
|
nytimes_text = Readability.readable_text(nytimes)
|
||||||
|
@ -66,12 +69,17 @@ defmodule ReadabilityTest do
|
||||||
|
|
||||||
pubmed_html = Readability.readable_html(pubmed)
|
pubmed_html = Readability.readable_html(pubmed)
|
||||||
|
|
||||||
assert pubmed_html =~ ~r/^<div><div><h4>BACKGROUND AND OBJECTIVES: <\/h4><p><abstracttext>Although strict blood pressure/
|
assert pubmed_html =~
|
||||||
assert pubmed_html =~ ~r/different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.<\/abstracttext><\/p><\/div><\/div>$/
|
~r/^<div><div><h4>BACKGROUND AND OBJECTIVES: <\/h4><p><abstracttext>Although strict blood pressure/
|
||||||
|
|
||||||
|
assert pubmed_html =~
|
||||||
|
~r/different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.<\/abstracttext><\/p><\/div><\/div>$/
|
||||||
|
|
||||||
pubmed_text = Readability.readable_text(pubmed)
|
pubmed_text = Readability.readable_text(pubmed)
|
||||||
|
|
||||||
assert pubmed_text =~ ~r/^BACKGROUND AND OBJECTIVES: \nAlthough strict blood pressure/
|
assert pubmed_text =~ ~r/^BACKGROUND AND OBJECTIVES: \nAlthough strict blood pressure/
|
||||||
assert pubmed_text =~ ~r/with different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.$/
|
|
||||||
|
assert pubmed_text =~
|
||||||
|
~r/with different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.$/
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue