From 1aa682a31afd5c85208712e809434f5e4807b835 Mon Sep 17 00:00:00 2001 From: keepcosmos Date: Sun, 5 Feb 2017 18:48:26 +0900 Subject: [PATCH] fix some bug and update deps --- .travis.yml | 1 + lib/readability.ex | 4 ++-- lib/readability/article_builder.ex | 10 +++++----- lib/readability/author_finder.ex | 2 +- lib/readability/candidate/cleaner.ex | 8 ++++---- lib/readability/candidate/scoring.ex | 11 +++++----- lib/readability/helper.ex | 30 ++++++++++++++-------------- lib/readability/sanitizer.ex | 12 +++++------ lib/readability/title_finder.ex | 5 ++++- mix.exs | 15 +++++++------- mix.lock | 19 +++++++++--------- 11 files changed, 60 insertions(+), 57 deletions(-) diff --git a/.travis.yml b/.travis.yml index b5c6fd6..f430b32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,3 +3,4 @@ language: elixir elixir: - 1.2.6 - 1.3.4 + - 1.4.1 diff --git a/lib/readability.ex b/lib/readability.ex index b33ecf2..d1cef25 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -138,7 +138,7 @@ defmodule Readability do @spec readable_html(html_tree) :: binary def readable_html(html_tree) do html_tree - |> Helper.remove_attrs(regexes[:protect_attrs]) + |> Helper.remove_attrs(regexes(:protect_attrs)) |> raw_html end @@ -166,7 +166,7 @@ defmodule Readability do def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html) - def regexes, do: @regexes + def regexes(key), do: @regexes[key] def default_options, do: @default_options end diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex index 4ccc7de..52b1fb5 100644 --- a/lib/readability/article_builder.ex +++ b/lib/readability/article_builder.ex @@ -25,9 +25,7 @@ defmodule Readability.ArticleBuilder do Enum.member?(["script", "style"], tag) end) - if opts[:remove_unlikely_candidates] do - html_tree = Cleaner.remove_unlikely_tree(html_tree) - end + html_tree = if opts[:remove_unlikely_candidates], do: Cleaner.remove_unlikely_tree(html_tree), else: html_tree html_tree = Cleaner.transform_misused_div_to_p(html_tree) candidates = CandidateFinder.find(html_tree, opts) @@ -61,8 +59,10 @@ defmodule Readability.ArticleBuilder do defp find_article(candidates, html_tree) do best_candidate = CandidateFinder.find_best_candidate(candidates) unless best_candidate do - tree = html_tree |> Floki.find("body") |> hd - best_candidate = %Candidate{html_tree: tree} + best_candidate = case html_tree |> Floki.find("body") do + [tree|_] -> %Candidate{html_tree: tree} + _ -> %Candidate{html_tree: {}} + end end article_trees = find_article_trees(best_candidate, candidates) {"div", [], article_trees} diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex index 3fb366c..8350e4e 100644 --- a/lib/readability/author_finder.ex +++ b/lib/readability/author_finder.ex @@ -22,7 +22,7 @@ defmodule Readability.AuthorFinder do |> Enum.map(fn(meta) -> meta |> Floki.attribute("content") - |> Floki.text + |> Enum.join(" ") |> String.strip end) |> Enum.reject(&(is_nil(&1) || String.length(&1) == 0)) diff --git a/lib/readability/candidate/cleaner.ex b/lib/readability/candidate/cleaner.ex index 41c65aa..4599984 100644 --- a/lib/readability/candidate/cleaner.ex +++ b/lib/readability/candidate/cleaner.ex @@ -18,7 +18,7 @@ defmodule Readability.Candidate.Cleaner do [transform_misused_div_to_p(h)|transform_misused_div_to_p(t)] end def transform_misused_div_to_p({tag, attrs, inner_tree}) do - if misused_divs?(tag, inner_tree), do: tag = "p" + tag = if misused_divs?(tag, inner_tree), do: "p", else: tag {tag, attrs, transform_misused_div_to_p(inner_tree)} end @@ -31,7 +31,7 @@ defmodule Readability.Candidate.Cleaner do end defp misused_divs?("div", inner_tree) do - !(Floki.raw_html(inner_tree) =~ Readability.regexes[:div_to_p_elements]) + !(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements)) end defp misused_divs?(_, _), do: false @@ -41,8 +41,8 @@ defmodule Readability.Candidate.Cleaner do |> Enum.join("") str = tag <> idclass_str - str =~ Readability.regexes[:unlikely_candidate] - && !(str =~ Readability.regexes[:ok_maybe_its_a_candidate]) + str =~ Readability.regexes(:unlikely_candidate) + && !(str =~ Readability.regexes(:ok_maybe_its_a_candidate)) && tag != "html" end end diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex index ed9edbb..38e85a0 100644 --- a/lib/readability/candidate/scoring.ex +++ b/lib/readability/candidate/scoring.ex @@ -34,7 +34,7 @@ defmodule Readability.Candidate.Scoring do defp calc_node_score({tag, attrs, _}, opts) do score = 0 - if opts[:weight_classes], do: score = score + class_weight(attrs) + score = if opts[:weight_classes], do: score + class_weight(attrs), else: score score + (@element_scores[tag] || 0) end defp calc_node_score([h|t], opts) do @@ -47,11 +47,10 @@ defmodule Readability.Candidate.Scoring do class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1) id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1) - if class =~ Readability.regexes[:positive], do: weight = weight + 25 - if id =~ Readability.regexes[:positive], do: weight = weight + 25 - if class =~ Readability.regexes[:negative], do: weight = weight - 25 - if id =~ Readability.regexes[:negative], do: weight = weight - 25 - + weight = if class =~ Readability.regexes(:positive), do: weight + 25, else: weight + weight = if id =~ Readability.regexes(:positive), do: weight + 25, else: weight + weight = if class =~ Readability.regexes(:negative), do: weight - 25, else: weight + weight = if id =~ Readability.regexes(:negative), do: weight - 25, else: weight weight end diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index f77cfff..1746812 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -31,16 +31,16 @@ defmodule Readability.Helper do [remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)] end def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do - reject_fun = fn(attr) -> attr end - cond do - is_binary(target_attr) -> - reject_fun = fn(attr) -> elem(attr, 0) == target_attr end - Regex.regex?(target_attr) -> - reject_fun = fn(attr) -> elem(attr, 0) =~ target_attr end - is_list(target_attr) -> - reject_fun = fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end - true -> nil - end + reject_fun = + cond do + is_binary(target_attr) -> + fn(attr) -> elem(attr, 0) == target_attr end + Regex.regex?(target_attr) -> + fn(attr) -> elem(attr, 0) =~ target_attr end + is_list(target_attr) -> + fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end + true -> fn(attr) -> attr end + end {tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)} end @@ -80,7 +80,7 @@ defmodule Readability.Helper do """ @spec candidate_tag?(html_tree) :: boolean def candidate_tag?(html_tree) do - Enum.any?(candidates_selector, fn(selector) -> + Enum.any?(candidates_selector(), fn(selector) -> Floki.Selector.match?(html_tree, selector) && (text_length(html_tree)) >= Readability.default_options[:min_text_length] end) @@ -92,10 +92,10 @@ defmodule Readability.Helper do @spec normalize(binary) :: html_tree def normalize(raw_html) do raw_html - |> String.replace(Readability.regexes[:replace_xml_version], "") - |> String.replace(Readability.regexes[:replace_brs], "

") - |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") - |> String.replace(Readability.regexes[:normalize], " ") + |> String.replace(Readability.regexes(:replace_xml_version), "") + |> String.replace(Readability.regexes(:replace_brs), "

") + |> String.replace(Readability.regexes(:replace_fonts), "<\1span>") + |> String.replace(Readability.regexes(:normalize), " ") |> Floki.parse |> Floki.filter_out(:comment) end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex index 2efceb5..42fa90a 100644 --- a/lib/readability/sanitizer.ex +++ b/lib/readability/sanitizer.ex @@ -21,10 +21,10 @@ defmodule Readability.Sanitizer do |> Helper.remove_tag(&clean_empty_p?(&1)) if opts[:clean_conditionally] do - html_tree = html_tree - |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) + html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) + else + html_tree end - html_tree end defp conditionally_cleaing_fn(candidates) do @@ -48,7 +48,7 @@ defmodule Readability.Sanitizer do input_len = tree |> Floki.find("input") |> length embed_len = tree |> Floki.find("embed") - |> Enum.reject(&(&1 =~ Readability.regexes[:video])) + |> Enum.reject(&(&1 =~ Readability.regexes(:video))) |> length link_density = Scoring.calc_link_density(tree) @@ -57,7 +57,7 @@ defmodule Readability.Sanitizer do img_len > p_len # too many image || (!list? && li_len > p_len) # more

  • s than

    s || input_len > (p_len / 3) # less than 3x

    s than s - || (!list? && conent_len < Readability.regexes[:min_text_length] && img_len != 1) # too short a content length without a single image + || (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) # too short a content length without a single image || (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight}) || (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight}) || ((embed_len == 1 && conent_len < 75) || embed_len > 1) # s with too short a content length, or too many s @@ -75,7 +75,7 @@ defmodule Readability.Sanitizer do defp clean_unlikely_tag?({tag, attrs, _}) do attrs_str = attrs |> Enum.map(&(elem(&1, 1))) |> Enum.join("") - tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes[:video]) + tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes(:video)) end defp clean_empty_p?({tag, _, _} = html_tree) do diff --git a/lib/readability/title_finder.ex b/lib/readability/title_finder.ex index 59ac9b3..1ce2da0 100644 --- a/lib/readability/title_finder.ex +++ b/lib/readability/title_finder.ex @@ -45,7 +45,7 @@ defmodule Readability.TitleFinder do @spec og_title(html_tree) :: binary def og_title(html_tree) do html_tree - |> find_tag("meta[property=og:title]") + |> find_tag("meta[property='og:title']") |> Floki.attribute("content") |> clean_title() end @@ -72,6 +72,9 @@ defmodule Readability.TitleFinder do defp clean_title([]) do "" end + defp clean_title([title]) when is_binary(title) do + String.strip(title) + end defp clean_title(html_tree) do html_tree |> Floki.text() diff --git a/mix.exs b/mix.exs index 73e370b..ef60fc3 100644 --- a/mix.exs +++ b/mix.exs @@ -2,7 +2,7 @@ defmodule Readability.Mixfile do @moduledoc """ """ - @version "0.6.2" + @version "0.7.0" @description """ Readability library for extracting and curating articles. """ @@ -14,10 +14,10 @@ defmodule Readability.Mixfile do version: @version, elixir: "~> 1.2", description: @description, - package: package, + package: package(), build_embedded: Mix.env == :prod, start_permanent: Mix.env == :prod, - deps: deps] + deps: deps()] end # Configuration for the OTP application @@ -40,11 +40,10 @@ defmodule Readability.Mixfile do # # Type "mix help deps" for more examples and options defp deps do - [{:floki, "~> 0.9.0"}, - {:httpoison, "~> 0.9.0"}, - {:earmark, "~> 0.1", only: :dev}, - {:ex_doc, "~> 0.11", only: :dev}, - {:credo, "~> 0.3", only: [:dev, :test]}, + [{:floki, "~> 0.13.1"}, + {:httpoison, "~> 0.11.0"}, + {:ex_doc, "~> 0.14", only: :dev}, + {:credo, "~> 0.6.1", only: [:dev, :test]}, {:dialyxir, "~> 0.3", only: [:dev]} ] end diff --git a/mix.lock b/mix.lock index a030395..04c7e15 100644 --- a/mix.lock +++ b/mix.lock @@ -1,14 +1,15 @@ -%{"bunt": {:hex, :bunt, "0.1.6", "5d95a6882f73f3b9969fdfd1953798046664e6f77ec4e486e6fafc7caad97c6f", [:mix], []}, - "certifi": {:hex, :certifi, "0.4.0", "a7966efb868b179023618d29a407548f70c52466bf1849b9e8ebd0e34b7ea11f", [:rebar3], []}, - "credo": {:hex, :credo, "0.4.5", "5c5daaf50a2a96068c0f21b6fbd382d206702efa8836a946eeab0b8ac25f5f22", [:mix], [{:bunt, "~> 0.1.6", [hex: :bunt, optional: false]}]}, +%{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []}, + "certifi": {:hex, :certifi, "0.7.0", "861a57f3808f7eb0c2d1802afeaae0fa5de813b0df0979153cbafcd853ababaf", [:rebar3], []}, + "credo": {:hex, :credo, "0.6.1", "a941e2591bd2bd2055dc92b810c174650b40b8290459c89a835af9d59ac4a5f8", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]}, "dialyxir": {:hex, :dialyxir, "0.3.5", "eaba092549e044c76f83165978979f60110dc58dd5b92fd952bf2312f64e9b14", [:mix], []}, - "earmark": {:hex, :earmark, "0.2.1", "ba6d26ceb16106d069b289df66751734802777a3cbb6787026dd800ffeb850f3", [:mix], []}, - "ex_doc": {:hex, :ex_doc, "0.12.0", "b774aabfede4af31c0301aece12371cbd25995a21bb3d71d66f5c2fe074c603f", [:mix], [{:earmark, "~> 0.2", [hex: :earmark, optional: false]}]}, - "floki": {:hex, :floki, "0.9.0", "e952ca71a453f7827ab5405106ac8d9ac5c9602d18aa5d2d893e5b9944e2499e", [:mix], [{:mochiweb_html, "~> 2.15", [hex: :mochiweb_html, optional: false]}]}, - "hackney": {:hex, :hackney, "1.6.1", "ddd22d42db2b50e6a155439c8811b8f6df61a4395de10509714ad2751c6da817", [:rebar3], [{:certifi, "0.4.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.0", [hex: :ssl_verify_fun, optional: false]}]}, - "httpoison": {:hex, :httpoison, "0.9.0", "68187a2daddfabbe7ca8f7d75ef227f89f0e1507f7eecb67e4536b3c516faddb", [:mix], [{:hackney, "~> 1.6.0", [hex: :hackney, optional: false]}]}, + "earmark": {:hex, :earmark, "1.1.1", "433136b7f2e99cde88b745b3a0cfc3fbc81fe58b918a09b40fce7f00db4d8187", [:mix], []}, + "ex_doc": {:hex, :ex_doc, "0.14.5", "c0433c8117e948404d93ca69411dd575ec6be39b47802e81ca8d91017a0cf83c", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]}, + "floki": {:hex, :floki, "0.13.1", "b3b287e02914cb41a66285071dade287165ed1915ab07903e18fb454fe961bad", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, optional: false]}]}, + "hackney": {:hex, :hackney, "1.6.5", "8c025ee397ac94a184b0743c73b33b96465e85f90a02e210e86df6cbafaa5065", [:rebar3], [{:certifi, "0.7.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]}, + "httpoison": {:hex, :httpoison, "0.11.0", "b9240a9c44fc46fcd8618d17898859ba09a3c1b47210b74316c0ffef10735e76", [:mix], [{:hackney, "~> 1.6.3", [hex: :hackney, optional: false]}]}, "idna": {:hex, :idna, "1.2.0", "ac62ee99da068f43c50dc69acf700e03a62a348360126260e87f2b54eced86b2", [:rebar3], []}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []}, "mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []}, + "mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []}, "mochiweb_html": {:hex, :mochiweb_html, "2.15.0", "d7402e967d7f9f2912f8befa813c37be62d5eeeddbbcb6fe986c44e01460d497", [:rebar3], []}, - "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.0", "edee20847c42e379bf91261db474ffbe373f8acb56e9079acb6038d4e0bf414f", [:rebar, :make], []}} + "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []}}