diff --git a/.formatter.exs b/.formatter.exs new file mode 100644 index 0000000..525446d --- /dev/null +++ b/.formatter.exs @@ -0,0 +1,4 @@ +# Used by "mix format" +[ + inputs: ["mix.exs", "{config,lib,test}/**/*.{ex,exs}"] +] diff --git a/lib/readability.ex b/lib/readability.ex index c50c715..6b0d819 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -34,32 +34,36 @@ defmodule Readability do alias Readability.Summary alias Readability.Helper - @default_options [retry_length: 250, - min_text_length: 25, - remove_unlikely_candidates: true, - weight_classes: true, - clean_conditionally: true, - remove_empty_nodes: true, - min_image_width: 130, - min_image_height: 80, - ignore_image_format: [], - blacklist: nil, - whitelist: nil, - page_url: nil - ] + @default_options [ + retry_length: 250, + min_text_length: 25, + remove_unlikely_candidates: true, + weight_classes: true, + clean_conditionally: true, + remove_empty_nodes: true, + min_image_width: 130, + min_image_height: 80, + ignore_image_format: [], + blacklist: nil, + whitelist: nil, + page_url: nil + ] - @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, - ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i, - positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, - negative: ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i, - div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, - replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i, - replace_fonts: ~r/<(\/?)font[^>]*>/i, - replace_xml_version: ~r/<\?xml.*\?>/i, - normalize: ~r/\s{2,}/, - video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, - protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i - ] + @regexes [ + unlikely_candidate: + ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, + ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i, + positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, + negative: + ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i, + div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, + replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i, + replace_fonts: ~r/<(\/?)font[^>]*>/i, + replace_xml_version: ~r/<\?xml.*\?>/i, + normalize: ~r/\s{2,}/, + video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, + protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i + ] @markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s+charset=.*)?$/i @@ -72,32 +76,30 @@ defmodule Readability do @doc """ summarize the primary readable content of a webpage.
""" - @spec summarize(url, options) :: Summary.t + @spec summarize(url, options) :: Summary.t() def summarize(url, opts \\ []) do - opts = Keyword.merge(opts, [page_url: url]) - httpoison_options = Application.get_env :readability, :httpoison_options, [] + opts = Keyword.merge(opts, page_url: url) + httpoison_options = Application.get_env(:readability, :httpoison_options, []) %{status_code: _, body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options) case is_response_markup(headers) do true -> html_tree = Helper.normalize(raw) - article_tree = html_tree - |> ArticleBuilder.build(opts) - %Summary{title: title(html_tree), - authors: authors(html_tree), - article_html: readable_html(article_tree), - article_text: readable_text(article_tree) + article_tree = + html_tree + |> ArticleBuilder.build(opts) + + %Summary{ + title: title(html_tree), + authors: authors(html_tree), + article_html: readable_html(article_tree), + article_text: readable_text(article_tree) } _ -> - %Summary{title: nil, - authors: nil, - article_html: nil, - article_text: raw - } + %Summary{title: nil, authors: nil, article_html: nil, article_text: raw} end - end @doc """ @@ -112,8 +114,10 @@ defmodule Readability do def mime(headers \\ []) do headers |> Enum.find( - {"Content-Type", "text/plain"}, # default - fn({key, _}) -> String.downcase(key) == "content-type" end) + # default + {"Content-Type", "text/plain"}, + fn {key, _} -> String.downcase(key) == "content-type" end + ) |> elem(1) end @@ -141,12 +145,12 @@ defmodule Readability do """ @spec title(binary | html_tree) :: binary def title(raw_html) when is_binary(raw_html) do - raw_html - |> Helper.normalize - |> title + raw_html + |> Helper.normalize() + |> title end - def title(html_tree), do: TitleFinder.title(html_tree) + def title(html_tree), do: TitleFinder.title(html_tree) @doc """ Extract authors @@ -173,8 +177,9 @@ defmodule Readability do @spec article(binary, options) :: html_tree def article(raw_html, opts \\ []) do opts = Keyword.merge(@default_options, opts) + raw_html - |> Helper.normalize + |> Helper.normalize() |> ArticleBuilder.build(opts) end @@ -196,10 +201,11 @@ defmodule Readability do # TODO: Remove image caption when extract only text tags_to_br = ~r/<\/(p|div|article|h\d)/i html_str = html_tree |> raw_html - Regex.replace(tags_to_br, html_str, &("\n#{&1}")) - |> Floki.parse - |> Floki.text - |> String.strip + + Regex.replace(tags_to_br, html_str, &"\n#{&1}") + |> Floki.parse() + |> Floki.text() + |> String.strip() end @doc """ @@ -207,7 +213,7 @@ defmodule Readability do """ @spec raw_html(html_tree) :: binary def raw_html(html_tree) do - html_tree |> Floki.raw_html + html_tree |> Floki.raw_html() end def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html) diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex index b071fc3..d9b6d1c 100644 --- a/lib/readability/article_builder.ex +++ b/lib/readability/article_builder.ex @@ -20,12 +20,18 @@ defmodule Readability.ArticleBuilder do @spec build(html_tree, options) :: html_tree def build(html_tree, opts) do origin_tree = html_tree - html_tree = html_tree - |> Helper.remove_tag(fn({tag, _, _}) -> - Enum.member?(["script", "style"], tag) - end) - html_tree = if opts[:remove_unlikely_candidates], do: Cleaner.remove_unlikely_tree(html_tree), else: html_tree + html_tree = + html_tree + |> Helper.remove_tag(fn {tag, _, _} -> + Enum.member?(["script", "style"], tag) + end) + + html_tree = + if opts[:remove_unlikely_candidates], + do: 
Cleaner.remove_unlikely_tree(html_tree), + else: html_tree + html_tree = Cleaner.transform_misused_div_to_p(html_tree) candidates = CandidateFinder.find(html_tree, opts) @@ -48,25 +54,34 @@ defmodule Readability.ArticleBuilder do cond do opts[:remove_unlikely_candidates] -> Keyword.put(opts, :remove_unlikely_candidates, false) + opts[:weight_classes] -> Keyword.put(opts, :weight_classes, false) + opts[:clean_conditionally] -> Keyword.put(opts, :clean_conditionally, false) - true -> nil + + true -> + nil end end defp find_article(candidates, html_tree) do best_candidate = CandidateFinder.find_best_candidate(candidates) - article_trees = if best_candidate do - find_article_trees(best_candidate, candidates) - else - fallback_candidate = case html_tree |> Floki.find("body") do - [tree|_] -> %Candidate{html_tree: tree} - _ -> %Candidate{html_tree: {}} - end - find_article_trees(fallback_candidate, candidates) - end + + article_trees = + if best_candidate do + find_article_trees(best_candidate, candidates) + else + fallback_candidate = + case html_tree |> Floki.find("body") do + [tree | _] -> %Candidate{html_tree: tree} + _ -> %Candidate{html_tree: {}} + end + + find_article_trees(fallback_candidate, candidates) + end + {"div", [], article_trees} end @@ -75,22 +90,21 @@ defmodule Readability.ArticleBuilder do candidates |> Enum.filter(&(&1.tree_depth == best_candidate.tree_depth)) - |> Enum.filter(fn(candidate) -> - candidate == best_candidate - || candidate.score >= score_threshold - || append?(candidate) - end) - |> Enum.map(&(to_article_tag(&1.html_tree))) + |> Enum.filter(fn candidate -> + candidate == best_candidate || candidate.score >= score_threshold || append?(candidate) + end) + |> Enum.map(&to_article_tag(&1.html_tree)) end defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do link_density = Scoring.calc_link_density(html_tree) - inner_text = html_tree |> Floki.text - inner_length = inner_text |> String.length + inner_text = html_tree |> Floki.text() + inner_length = inner_text |> String.length() - (inner_length > 80 && link_density < 0.25) - || (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/) + (inner_length > 80 && link_density < 0.25) || + (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/) end + defp append?(_), do: false defp to_article_tag({tag, attrs, inner_tree} = html_tree) do diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex index 8350e4e..074ea67 100644 --- a/lib/readability/author_finder.ex +++ b/lib/readability/author_finder.ex @@ -11,21 +11,24 @@ defmodule Readability.AuthorFinder do @spec find(html_tree) :: [binary] def find(html_tree) do author_names = find_by_meta_tag(html_tree) + if author_names do split_author_names(author_names) end end def find_by_meta_tag(html_tree) do - names = html_tree - |> Floki.find("meta[name*=author], meta[property*=author]") - |> Enum.map(fn(meta) -> - meta - |> Floki.attribute("content") - |> Enum.join(" ") - |> String.strip - end) - |> Enum.reject(&(is_nil(&1) || String.length(&1) == 0)) + names = + html_tree + |> Floki.find("meta[name*=author], meta[property*=author]") + |> Enum.map(fn meta -> + meta + |> Floki.attribute("content") + |> Enum.join(" ") + |> String.strip() + end) + |> Enum.reject(&(is_nil(&1) || String.length(&1) == 0)) + if length(names) > 0 do hd(names) else diff --git a/lib/readability/candidate/cleaner.ex b/lib/readability/candidate/cleaner.ex index 4599984..f8e068f 100644 --- a/lib/readability/candidate/cleaner.ex +++ 
b/lib/readability/candidate/cleaner.ex @@ -14,9 +14,11 @@ defmodule Readability.Candidate.Cleaner do @spec transform_misused_div_to_p(html_tree) :: html_tree def transform_misused_div_to_p(content) when is_binary(content), do: content def transform_misused_div_to_p([]), do: [] - def transform_misused_div_to_p([h|t]) do - [transform_misused_div_to_p(h)|transform_misused_div_to_p(t)] + + def transform_misused_div_to_p([h | t]) do + [transform_misused_div_to_p(h) | transform_misused_div_to_p(t)] end + def transform_misused_div_to_p({tag, attrs, inner_tree}) do tag = if misused_divs?(tag, inner_tree), do: "p", else: tag {tag, attrs, transform_misused_div_to_p(inner_tree)} @@ -33,16 +35,18 @@ defmodule Readability.Candidate.Cleaner do defp misused_divs?("div", inner_tree) do !(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements)) end + defp misused_divs?(_, _), do: false defp unlikely_tree?({tag, attrs, _}) do - idclass_str = attrs - |> Enum.filter_map(&(elem(&1, 0) =~ ~r/id|class/i), &(elem(&1, 1))) - |> Enum.join("") + idclass_str = + attrs + |> Enum.filter_map(&(elem(&1, 0) =~ ~r/id|class/i), &elem(&1, 1)) + |> Enum.join("") + str = tag <> idclass_str - str =~ Readability.regexes(:unlikely_candidate) - && !(str =~ Readability.regexes(:ok_maybe_its_a_candidate)) - && tag != "html" + str =~ Readability.regexes(:unlikely_candidate) && + !(str =~ Readability.regexes(:ok_maybe_its_a_candidate)) && tag != "html" end end diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex index 38e85a0..792abf9 100644 --- a/lib/readability/candidate/scoring.ex +++ b/lib/readability/candidate/scoring.ex @@ -4,11 +4,7 @@ defmodule Readability.Candidate.Scoring do """ alias Readability.Helper - @element_scores %{"div" => 5, - "blockquote" => 3, - "form" => -3, - "th" => -5 - } + @element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5} @type html_tree :: tuple | list @type options :: list @@ -20,15 +16,19 @@ defmodule Readability.Candidate.Scoring do @spec calc_score(html_tree, options) :: number def calc_score(html_tree, opts \\ []) do score = calc_node_score(html_tree, opts) - score = score + calc_children_content_score(html_tree) + calc_grand_children_content_score(html_tree) + + score = + score + calc_children_content_score(html_tree) + + calc_grand_children_content_score(html_tree) + score * (1 - calc_link_density(html_tree)) end defp calc_content_score(html_tree) do score = 1 - inner_text = html_tree |> Floki.text + inner_text = html_tree |> Floki.text() split_score = inner_text |> String.split(",") |> length - length_score = [(String.length(inner_text) / 100), 3] |> Enum.min + length_score = [String.length(inner_text) / 100, 3] |> Enum.min() score + split_score + length_score end @@ -37,9 +37,11 @@ defmodule Readability.Candidate.Scoring do score = if opts[:weight_classes], do: score + class_weight(attrs), else: score score + (@element_scores[tag] || 0) end - defp calc_node_score([h|t], opts) do + + defp calc_node_score([h | t], opts) do calc_node_score(h, opts) + calc_node_score(t, opts) end + defp calc_node_score([], _), do: 0 def class_weight(attrs) do @@ -55,14 +57,16 @@ defmodule Readability.Candidate.Scoring do end def calc_link_density(html_tree) do - link_length = html_tree - |> Floki.find("a") - |> Floki.text - |> String.length + link_length = + html_tree + |> Floki.find("a") + |> Floki.text() + |> String.length() - text_length = html_tree - |> Floki.text - |> String.length + text_length = + html_tree + |> Floki.text() + |> 
String.length() if text_length == 0 do 0 @@ -78,11 +82,13 @@ defmodule Readability.Candidate.Scoring do end defp calc_grand_children_content_score({_, _, children_tree}) do - score = children_tree - |> Enum.filter_map(&is_tuple(&1), &elem(&1, 2)) - |> List.flatten - |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1))) - |> calc_content_score + score = + children_tree + |> Enum.filter_map(&is_tuple(&1), &elem(&1, 2)) + |> List.flatten() + |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1))) + |> calc_content_score + score / 2 end end diff --git a/lib/readability/candidate_finder.ex b/lib/readability/candidate_finder.ex index 4f4896a..1f551cb 100644 --- a/lib/readability/candidate_finder.ex +++ b/lib/readability/candidate_finder.ex @@ -14,20 +14,26 @@ defmodule Readability.CandidateFinder do @doc """ Find candidates that shuld be meaningful article by analysing nodes """ - @spec find(html_tree, options, number) :: [Candidate.t] + @spec find(html_tree, options, number) :: [Candidate.t()] def find(_, opts \\ [], tree_depth \\ 0) def find([], _, _), do: [] - def find([h|t], opts, tree_depth) do + + def find([h | t], opts, tree_depth) do [find(h, opts, tree_depth) | find(t, opts, tree_depth)] - |> List.flatten + |> List.flatten() end + def find(text, _, _) when is_binary(text), do: [] + def find({tag, attrs, inner_tree}, opts, tree_depth) do html_tree = {tag, attrs, inner_tree} + if candidate?(html_tree) do - candidate = %Candidate{html_tree: html_tree, - score: Scoring.calc_score(html_tree, opts), - tree_depth: tree_depth} + candidate = %Candidate{ + html_tree: html_tree, + score: Scoring.calc_score(html_tree, opts), + tree_depth: tree_depth + } [candidate | find(inner_tree, opts, tree_depth + 1)] else @@ -38,18 +44,20 @@ defmodule Readability.CandidateFinder do @doc """ Find the highest score candidate. 
""" - @spec find_best_candidate([Candidate.t]) :: Candidate.t + @spec find_best_candidate([Candidate.t()]) :: Candidate.t() def find_best_candidate([]), do: nil + def find_best_candidate(candidates) do candidates - |> Enum.max_by(fn(candidate) -> candidate.score end) + |> Enum.max_by(fn candidate -> candidate.score end) end defp candidate?(_, depth \\ 0) defp candidate?(_, depth) when depth > 2, do: false - defp candidate?([h|t], depth), do: candidate?(h, depth) || candidate?(t, depth) + defp candidate?([h | t], depth), do: candidate?(h, depth) || candidate?(t, depth) defp candidate?([], _), do: false defp candidate?(text, _) when is_binary(text), do: false + defp candidate?({_, _, inner_tree} = html_tree, depth) do if Helper.candidate_tag?(html_tree) do true diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index 012ee79..afce5dd 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -8,15 +8,18 @@ defmodule Readability.Helper do @doc """ Change existing tags by selector """ - @spec change_tag(html_tree, String.t, String.t) :: html_tree + @spec change_tag(html_tree, String.t(), String.t()) :: html_tree def change_tag(content, _, _) when is_binary(content), do: content def change_tag([], _, _), do: [] - def change_tag([h|t], selector, tag) do - [change_tag(h, selector, tag)|change_tag(t, selector, tag)] + + def change_tag([h | t], selector, tag) do + [change_tag(h, selector, tag) | change_tag(t, selector, tag)] end + def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do {tag, attrs, change_tag(inner_tree, tag_name, tag)} end + def change_tag({tag_name, attrs, html_tree}, selector, tag) do {tag_name, attrs, change_tag(html_tree, selector, tag)} end @@ -24,41 +27,50 @@ defmodule Readability.Helper do @doc """ Remove html attributes """ - @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree + @spec remove_attrs(html_tree, String.t() | [String.t()] | Regex.t()) :: html_tree def remove_attrs(content, _) when is_binary(content), do: content def remove_attrs([], _), do: [] - def remove_attrs([h|t], t_attrs) do - [remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)] + + def remove_attrs([h | t], t_attrs) do + [remove_attrs(h, t_attrs) | remove_attrs(t, t_attrs)] end + def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do reject_fun = cond do is_binary(target_attr) -> - fn(attr) -> elem(attr, 0) == target_attr end + fn attr -> elem(attr, 0) == target_attr end + Regex.regex?(target_attr) -> - fn(attr) -> elem(attr, 0) =~ target_attr end + fn attr -> elem(attr, 0) =~ target_attr end + is_list(target_attr) -> - fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end - true -> fn(attr) -> attr end + fn attr -> Enum.member?(target_attr, elem(attr, 0)) end + + true -> + fn attr -> attr end end + {tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)} end - @doc """ Remove tags """ @spec remove_tag(html_tree, fun) :: html_tree def remove_tag(content, _) when is_binary(content), do: content def remove_tag([], _), do: [] - def remove_tag([h|t], fun) do + + def remove_tag([h | t], fun) do node = remove_tag(h, fun) + if is_nil(node) do remove_tag(t, fun) else - [node|remove_tag(t, fun)] + [node | remove_tag(t, fun)] end end + def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do if fun.(html_tree) do nil @@ -72,7 +84,7 @@ defmodule Readability.Helper do """ @spec text_length(html_tree) :: number def text_length(html_tree) do - html_tree |> Floki.text |> String.strip |> String.length + html_tree |> 
Floki.text() |> String.strip() |> String.length() end @doc """ @@ -80,9 +92,9 @@ defmodule Readability.Helper do """ @spec candidate_tag?(html_tree) :: boolean def candidate_tag?({tag, _, _} = html_tree) do - Enum.any?(["p", "td"], fn(candidate_tag) -> - tag == candidate_tag - && (text_length(html_tree)) >= Readability.default_options[:min_text_length] + Enum.any?(["p", "td"], fn candidate_tag -> + tag == candidate_tag && + text_length(html_tree) >= Readability.default_options()[:min_text_length] end) end @@ -96,7 +108,7 @@ defmodule Readability.Helper do |> String.replace(Readability.regexes(:replace_brs), "
</p><p>
") |> String.replace(Readability.regexes(:replace_fonts), "<\1span>") |> String.replace(Readability.regexes(:normalize), " ") - |> Floki.parse + |> Floki.parse() |> Floki.filter_out(:comment) end end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex index 42fa90a..3605f8d 100644 --- a/lib/readability/sanitizer.ex +++ b/lib/readability/sanitizer.ex @@ -13,12 +13,13 @@ defmodule Readability.Sanitizer do @doc """ Sanitizes article html tree """ - @spec sanitize(html_tree, [Candidate.t], list) :: html_tree - def sanitize(html_tree, candidates, opts \\ []) do - html_tree = html_tree - |> Helper.remove_tag(&clean_headline_tag?(&1)) - |> Helper.remove_tag(&clean_unlikely_tag?(&1)) - |> Helper.remove_tag(&clean_empty_p?(&1)) + @spec sanitize(html_tree, [Candidate.t()], list) :: html_tree + def sanitize(html_tree, candidates, opts \\ []) do + html_tree = + html_tree + |> Helper.remove_tag(&clean_headline_tag?(&1)) + |> Helper.remove_tag(&clean_unlikely_tag?(&1)) + |> Helper.remove_tag(&clean_empty_p?(&1)) if opts[:clean_conditionally] do html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) @@ -28,15 +29,19 @@ defmodule Readability.Sanitizer do end defp conditionally_cleaing_fn(candidates) do - fn({tag, attrs, _} = tree) -> + fn {tag, attrs, _} = tree -> if Enum.any?(["table", "ul", "div"], &(&1 == tag)) do weight = Scoring.class_weight(attrs) - same_tree = candidates - |> Enum.find(%Candidate{}, &(&1.html_tree == tree)) + + same_tree = + candidates + |> Enum.find(%Candidate{}, &(&1.html_tree == tree)) + list? = tag == "ul" + cond do - weight + same_tree.score < 0 - -> true + weight + same_tree.score < 0 -> + true length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 -> # If there are not very many commas, and the number of @@ -46,35 +51,42 @@ defmodule Readability.Sanitizer do img_len = tree |> Floki.find("img") |> length li_len = tree |> Floki.find("li") |> length input_len = tree |> Floki.find("input") |> length - embed_len = tree - |> Floki.find("embed") - |> Enum.reject(&(&1 =~ Readability.regexes(:video))) - |> length - link_density = Scoring.calc_link_density(tree) + embed_len = + tree + |> Floki.find("embed") + |> Enum.reject(&(&1 =~ Readability.regexes(:video))) + |> length + + link_density = Scoring.calc_link_density(tree) conent_len = Helper.text_length(tree) - img_len > p_len # too many image - || (!list? && li_len > p_len) # more
<li>s than <p>s - || input_len > (p_len / 3) # less than 3x <p>s than <input>s - || (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) # too short a content length without a single image - || (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight}) - || (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight}) - || ((embed_len == 1 && conent_len < 75) || embed_len > 1) # <embed>s with too short a content length, or too many <embed>s + # too many image + # more <li>s than <p>s + # less than 3x <p>s than <input>s + # too short a content length without a single image + # too many links for its weight (#{weight}) + # too many links for its weight (#{weight}) + # <embed>s with too short a content length, or too many <embed>s + img_len > p_len || (!list? && li_len > p_len) || input_len > p_len / 3 || + (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) || + (weight < 25 && link_density > 0.2) || (weight >= 25 && link_density > 0.5) || + ((embed_len == 1 && conent_len < 75) || embed_len > 1) - true -> false + true -> + false end end end end defp clean_headline_tag?({tag, attrs, _} = html_tree) do - tag =~ ~r/^h\d{1}$/ - && (Scoring.class_weight(attrs) < 0 || Scoring.calc_link_density(html_tree) > 0.33) + tag =~ ~r/^h\d{1}$/ && + (Scoring.class_weight(attrs) < 0 || Scoring.calc_link_density(html_tree) > 0.33) end defp clean_unlikely_tag?({tag, attrs, _}) do - attrs_str = attrs |> Enum.map(&(elem(&1, 1))) |> Enum.join("") + attrs_str = attrs |> Enum.map(&elem(&1, 1)) |> Enum.join("") tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes(:video)) end diff --git a/lib/readability/title_finder.ex index 22c8d40..f8a37d6 100644 --- a/lib/readability/title_finder.ex +++ b/lib/readability/title_finder.ex @@ -23,6 +23,7 @@ defmodule Readability.TitleFinder do else h_title end + title when is_binary(title) -> title end @@ -54,7 +55,7 @@ @doc """ Find title from h tag """ - @spec h_tag_title(html_tree, String.t) :: binary + @spec h_tag_title(html_tree, String.t()) :: binary def h_tag_title(html_tree, selector \\ @h_tag_selector) do html_tree |> find_tag(selector) @@ -65,6 +66,7 @@ case Floki.find(html_tree, selector) do [] -> [] + matches when is_list(matches) -> hd(matches) end @@ -73,9 +75,11 @@ defp clean_title([]) do "" end + defp clean_title([title]) when is_binary(title) do String.strip(title) end + defp clean_title(html_tree) do html_tree |> Floki.text() diff --git a/mix.exs b/mix.exs index ce8ed92..891bbfb 100644 --- a/mix.exs +++ b/mix.exs @@ -10,24 +10,23 @@ defmodule Readability.Mixfile do use Mix.Project def project do - [app: :readability, - version: @version, - elixir: "~> 1.3", - description: @description, - package: package(), - build_embedded: Mix.env == :prod, - start_permanent: Mix.env == :prod, - deps: deps()] + [ + app: :readability, + version: @version, + elixir: "~> 1.3", + description: @description, + package: package(), + build_embedded: Mix.env() == :prod, + start_permanent: Mix.env() == :prod, + deps: deps() + ] end # Configuration for the OTP application # # Type "mix help compile.app" for more information def application do - [applications: [:logger, - :floki, - :httpoison - ]] + [applications: [:logger, :floki, :httpoison]] end # Dependencies can be Hex packages: # # Type "mix help deps" for more examples and options defp deps do - [{:floki, "~> 0.18.0"}, - {:httpoison, "~> 0.13.0"}, - {:ex_doc, "~> 0.14", only: :dev}, - {:credo, "~> 0.6.1", only: [:dev, :test]}, - {:dialyxir, "~> 0.3", only: [:dev]}, - {:mock, "~> 0.2.0", only: :test}, + [ + {:floki, "~> 0.18.0"}, + {:httpoison, "~> 0.13.0"}, + {:ex_doc, "~> 0.14", only: :dev}, + {:credo, "~> 0.6.1", only: [:dev, :test]}, + {:dialyxir, "~> 0.3", only: [:dev]}, + {:mock, "~> 0.2.0", only: :test} ] end defp package do - [files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"], - maintainers:
["Jaehyun Shin"], - licenses: ["Apache 2.0"], - links: %{"GitHub" => "https://github.com/keepcosmos/readability", - "Docs" => "https://hexdocs.pm/readability/Readability.html"}] + [ + files: ["lib", "mix.exs", "README*", "LICENSE*", "doc"], + maintainers: ["Jaehyun Shin"], + licenses: ["Apache 2.0"], + links: %{ + "GitHub" => "https://github.com/keepcosmos/readability", + "Docs" => "https://hexdocs.pm/readability/Readability.html" + } + ] end end diff --git a/test/readability/candidate/cleaner_test.exs b/test/readability/candidate/cleaner_test.exs index cb0aafe..cf00d95 100644 --- a/test/readability/candidate/cleaner_test.exs +++ b/test/readability/candidate/cleaner_test.exs @@ -29,14 +29,14 @@ defmodule Readability.Candidate.CleanerTest do test "transform divs containing no block elements", %{html_tree: html_tree} do html_tree = Cleaner.transform_misused_div_to_p(html_tree) - [{tag, _, _}|_] = html_tree |> Floki.find("#body") + [{tag, _, _} | _] = html_tree |> Floki.find("#body") assert tag == "p" end test "not transform divs that contain block elements", %{html_tree: html_tree} do html_tree = Cleaner.transform_misused_div_to_p(html_tree) - [{tag, _, _}|_] = html_tree |> Floki.find("#contains_blockquote") + [{tag, _, _} | _] = html_tree |> Floki.find("#contains_blockquote") assert tag == "div" end diff --git a/test/readability/helper_test.exs b/test/readability/helper_test.exs index d94d655..7eef33b 100644 --- a/test/readability/helper_test.exs +++ b/test/readability/helper_test.exs @@ -26,23 +26,25 @@ defmodule Readability.HelperTest do end test "change font tag to span", %{html_tree: html_tree} do - expectred = @sample |> String.replace(~r/font/, "span") |> Floki.parse + expectred = @sample |> String.replace(~r/font/, "span") |> Floki.parse() result = Helper.change_tag(html_tree, "font", "span") assert result == expectred end test "remove tag", %{html_tree: html_tree} do expected = "" |> parse - result = html_tree - |> Helper.remove_tag(fn({tag, _, _}) -> - tag == "p" - end) + + result = + html_tree + |> Helper.remove_tag(fn {tag, _, _} -> + tag == "p" + end) assert result == expected end test "inner text lengt", %{html_tree: html_tree} do - result = html_tree |> Helper.text_length + result = html_tree |> Helper.text_length() assert result == 5 end end diff --git a/test/readability/title_finder_test.exs b/test/readability/title_finder_test.exs index dc08ea4..513ad78 100644 --- a/test/readability/title_finder_test.exs +++ b/test/readability/title_finder_test.exs @@ -37,6 +37,7 @@ defmodule Readability.TitleFinderTest do """ + title = Readability.TitleFinder.og_title(html) assert title == "og title 1" end @@ -52,6 +53,7 @@ defmodule Readability.TitleFinderTest do """ + title = Readability.TitleFinder.tag_title(html) assert title == "Tag title" @@ -62,6 +64,7 @@ defmodule Readability.TitleFinderTest do """ + title = Readability.TitleFinder.tag_title(html) assert title == "Tag title" @@ -72,6 +75,7 @@ defmodule Readability.TitleFinderTest do """ + title = Readability.TitleFinder.tag_title(html) assert title == "Tag title-tag" @@ -82,6 +86,7 @@ defmodule Readability.TitleFinderTest do """ + title = Readability.TitleFinder.tag_title(html) assert title == "Tag title-tag-title" @@ -95,6 +100,7 @@ defmodule Readability.TitleFinderTest do """ + title = Readability.TitleFinder.tag_title(html) assert title == "Tag title" end @@ -108,6 +114,7 @@ defmodule Readability.TitleFinderTest do """ + title = Readability.TitleFinder.tag_title(html) assert title == "tag title 1" end @@ -131,6 +138,7 @@ 
defmodule Readability.TitleFinderTest do """ + title = Readability.TitleFinder.h_tag_title(html) assert title == "header 1" end diff --git a/test/readability_http_test.exs b/test/readability_http_test.exs index bb626fa..88a7971 100644 --- a/test/readability_http_test.exs +++ b/test/readability_http_test.exs @@ -6,12 +6,9 @@ defmodule ReadabilityHttpTest do test "blank response is parsed as plain text" do url = "https://tools.ietf.org/rfc/rfc2616.txt" content = TestHelper.read_fixture("rfc2616.txt") - response = %HTTPoison.Response{ - status_code: 200, - headers: [], - body: content} - - with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do + response = %HTTPoison.Response{status_code: 200, headers: [], body: content} + + with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do %Readability.Summary{article_text: result_text} = Readability.summarize(url) assert result_text =~ ~r/3 Protocol Parameters/ @@ -21,12 +18,14 @@ defmodule ReadabilityHttpTest do test "text/plain response is parsed as plain text" do url = "https://tools.ietf.org/rfc/rfc2616.txt" content = TestHelper.read_fixture("rfc2616.txt") + response = %HTTPoison.Response{ status_code: 200, headers: [{"Content-Type", "text/plain"}], - body: content} - - with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do + body: content + } + + with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do %Readability.Summary{article_text: result_text} = Readability.summarize(url) assert result_text =~ ~r/3 Protocol Parameters/ @@ -38,13 +37,15 @@ defmodule ReadabilityHttpTest do content = TestHelper.read_fixture("bbc.html") mimes = ["text/html", "application/xml", "application/xhtml+xml"] - mimes |> Enum.each(fn(mime) -> + mimes + |> Enum.each(fn mime -> response = %HTTPoison.Response{ status_code: 200, headers: [{"Content-Type", mime}], - body: content} - - with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do + body: content + } + + with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do %Readability.Summary{article_html: result_html} = Readability.summarize(url) assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/ @@ -55,12 +56,14 @@ defmodule ReadabilityHttpTest do test "response with charset is parsed correctly" do url = "https://news.bbc.co.uk/test.html" content = TestHelper.read_fixture("bbc.html") + response = %HTTPoison.Response{ status_code: 200, headers: [{"Content-Type", "text/html; charset=UTF-8"}], - body: content} - - with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do + body: content + } + + with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do %Readability.Summary{article_html: result_html} = Readability.summarize(url) assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/ @@ -71,12 +74,14 @@ defmodule ReadabilityHttpTest do # HTTP header keys are case insensitive (RFC2616 - Section 4.2) url = "https://news.bbc.co.uk/test.html" content = TestHelper.read_fixture("bbc.html") + response = %HTTPoison.Response{ status_code: 200, headers: [{"content-Type", "text/html; charset=UTF-8"}], - body: content} - - with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do + body: content + } + + with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do %Readability.Summary{article_html: result_html} = Readability.summarize(url) assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/ diff --git 
a/test/readability_test.exs b/test/readability_test.exs index 0b93d83..53039ba 100644 --- a/test/readability_test.exs +++ b/test/readability_test.exs @@ -7,7 +7,10 @@ defmodule ReadabilityTest do nytimes = Readability.article(html, opts) nytimes_html = Readability.readable_html(nytimes) - assert nytimes_html =~ ~r/^

    <\/div><\/div>$/ nytimes_text = Readability.readable_text(nytimes) @@ -66,12 +69,17 @@ defmodule ReadabilityTest do pubmed_html = Readability.readable_html(pubmed) - assert pubmed_html =~ ~r/^

    BACKGROUND AND OBJECTIVES: <\/h4>

    Although strict blood pressure/ - assert pubmed_html =~ ~r/different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.<\/abstracttext><\/p><\/div><\/div>$/ + assert pubmed_html =~ + ~r/^

    BACKGROUND AND OBJECTIVES: <\/h4>

    Although strict blood pressure/ + + assert pubmed_html =~ + ~r/different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.<\/abstracttext><\/p><\/div><\/div>$/ pubmed_text = Readability.readable_text(pubmed) assert pubmed_text =~ ~r/^BACKGROUND AND OBJECTIVES: \nAlthough strict blood pressure/ - assert pubmed_text =~ ~r/with different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.$/ + + assert pubmed_text =~ + ~r/with different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.$/ end end