From 1aa682a31afd5c85208712e809434f5e4807b835 Mon Sep 17 00:00:00 2001
From: keepcosmos
Date: Sun, 5 Feb 2017 18:48:26 +0900
Subject: [PATCH] fix some bug and update deps
---
.travis.yml | 1 +
lib/readability.ex | 4 ++--
lib/readability/article_builder.ex | 10 +++++-----
lib/readability/author_finder.ex | 2 +-
lib/readability/candidate/cleaner.ex | 8 ++++----
lib/readability/candidate/scoring.ex | 11 +++++-----
lib/readability/helper.ex | 30 ++++++++++++++--------------
lib/readability/sanitizer.ex | 12 +++++------
lib/readability/title_finder.ex | 5 ++++-
mix.exs | 15 +++++++-------
mix.lock | 19 +++++++++---------
11 files changed, 60 insertions(+), 57 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index b5c6fd6..f430b32 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,3 +3,4 @@ language: elixir
elixir:
- 1.2.6
- 1.3.4
+ - 1.4.1
diff --git a/lib/readability.ex b/lib/readability.ex
index b33ecf2..d1cef25 100644
--- a/lib/readability.ex
+++ b/lib/readability.ex
@@ -138,7 +138,7 @@ defmodule Readability do
@spec readable_html(html_tree) :: binary
def readable_html(html_tree) do
html_tree
- |> Helper.remove_attrs(regexes[:protect_attrs])
+ |> Helper.remove_attrs(regexes(:protect_attrs))
|> raw_html
end
@@ -166,7 +166,7 @@ defmodule Readability do
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
- def regexes, do: @regexes
+ def regexes(key), do: @regexes[key]
def default_options, do: @default_options
end
diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex
index 4ccc7de..52b1fb5 100644
--- a/lib/readability/article_builder.ex
+++ b/lib/readability/article_builder.ex
@@ -25,9 +25,7 @@ defmodule Readability.ArticleBuilder do
Enum.member?(["script", "style"], tag)
end)
- if opts[:remove_unlikely_candidates] do
- html_tree = Cleaner.remove_unlikely_tree(html_tree)
- end
+ html_tree = if opts[:remove_unlikely_candidates], do: Cleaner.remove_unlikely_tree(html_tree), else: html_tree
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
candidates = CandidateFinder.find(html_tree, opts)
@@ -61,8 +59,10 @@ defmodule Readability.ArticleBuilder do
defp find_article(candidates, html_tree) do
best_candidate = CandidateFinder.find_best_candidate(candidates)
unless best_candidate do
- tree = html_tree |> Floki.find("body") |> hd
- best_candidate = %Candidate{html_tree: tree}
+ best_candidate = case html_tree |> Floki.find("body") do
+ [tree|_] -> %Candidate{html_tree: tree}
+ _ -> %Candidate{html_tree: {}}
+ end
end
article_trees = find_article_trees(best_candidate, candidates)
{"div", [], article_trees}
diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex
index 3fb366c..8350e4e 100644
--- a/lib/readability/author_finder.ex
+++ b/lib/readability/author_finder.ex
@@ -22,7 +22,7 @@ defmodule Readability.AuthorFinder do
|> Enum.map(fn(meta) ->
meta
|> Floki.attribute("content")
- |> Floki.text
+ |> Enum.join(" ")
|> String.strip
end)
|> Enum.reject(&(is_nil(&1) || String.length(&1) == 0))
diff --git a/lib/readability/candidate/cleaner.ex b/lib/readability/candidate/cleaner.ex
index 41c65aa..4599984 100644
--- a/lib/readability/candidate/cleaner.ex
+++ b/lib/readability/candidate/cleaner.ex
@@ -18,7 +18,7 @@ defmodule Readability.Candidate.Cleaner do
[transform_misused_div_to_p(h)|transform_misused_div_to_p(t)]
end
def transform_misused_div_to_p({tag, attrs, inner_tree}) do
- if misused_divs?(tag, inner_tree), do: tag = "p"
+ tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
{tag, attrs, transform_misused_div_to_p(inner_tree)}
end
@@ -31,7 +31,7 @@ defmodule Readability.Candidate.Cleaner do
end
defp misused_divs?("div", inner_tree) do
- !(Floki.raw_html(inner_tree) =~ Readability.regexes[:div_to_p_elements])
+ !(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements))
end
defp misused_divs?(_, _), do: false
@@ -41,8 +41,8 @@ defmodule Readability.Candidate.Cleaner do
|> Enum.join("")
str = tag <> idclass_str
- str =~ Readability.regexes[:unlikely_candidate]
- && !(str =~ Readability.regexes[:ok_maybe_its_a_candidate])
+ str =~ Readability.regexes(:unlikely_candidate)
+ && !(str =~ Readability.regexes(:ok_maybe_its_a_candidate))
&& tag != "html"
end
end
diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex
index ed9edbb..38e85a0 100644
--- a/lib/readability/candidate/scoring.ex
+++ b/lib/readability/candidate/scoring.ex
@@ -34,7 +34,7 @@ defmodule Readability.Candidate.Scoring do
defp calc_node_score({tag, attrs, _}, opts) do
score = 0
- if opts[:weight_classes], do: score = score + class_weight(attrs)
+ score = if opts[:weight_classes], do: score + class_weight(attrs), else: score
score + (@element_scores[tag] || 0)
end
defp calc_node_score([h|t], opts) do
@@ -47,11 +47,10 @@ defmodule Readability.Candidate.Scoring do
class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1)
id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1)
- if class =~ Readability.regexes[:positive], do: weight = weight + 25
- if id =~ Readability.regexes[:positive], do: weight = weight + 25
- if class =~ Readability.regexes[:negative], do: weight = weight - 25
- if id =~ Readability.regexes[:negative], do: weight = weight - 25
-
+ weight = if class =~ Readability.regexes(:positive), do: weight + 25, else: weight
+ weight = if id =~ Readability.regexes(:positive), do: weight + 25, else: weight
+ weight = if class =~ Readability.regexes(:negative), do: weight - 25, else: weight
+ weight = if id =~ Readability.regexes(:negative), do: weight - 25, else: weight
weight
end
diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex
index f77cfff..1746812 100644
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@@ -31,16 +31,16 @@ defmodule Readability.Helper do
[remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)]
end
def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
- reject_fun = fn(attr) -> attr end
- cond do
- is_binary(target_attr) ->
- reject_fun = fn(attr) -> elem(attr, 0) == target_attr end
- Regex.regex?(target_attr) ->
- reject_fun = fn(attr) -> elem(attr, 0) =~ target_attr end
- is_list(target_attr) ->
- reject_fun = fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
- true -> nil
- end
+ reject_fun =
+ cond do
+ is_binary(target_attr) ->
+ fn(attr) -> elem(attr, 0) == target_attr end
+ Regex.regex?(target_attr) ->
+ fn(attr) -> elem(attr, 0) =~ target_attr end
+ is_list(target_attr) ->
+ fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
+ true -> fn(attr) -> attr end
+ end
{tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
end
@@ -80,7 +80,7 @@ defmodule Readability.Helper do
"""
@spec candidate_tag?(html_tree) :: boolean
def candidate_tag?(html_tree) do
- Enum.any?(candidates_selector, fn(selector) ->
+ Enum.any?(candidates_selector(), fn(selector) ->
Floki.Selector.match?(html_tree, selector)
&& (text_length(html_tree)) >= Readability.default_options[:min_text_length]
end)
@@ -92,10 +92,10 @@ defmodule Readability.Helper do
@spec normalize(binary) :: html_tree
def normalize(raw_html) do
raw_html
- |> String.replace(Readability.regexes[:replace_xml_version], "")
- |> String.replace(Readability.regexes[:replace_brs], "
")
- |> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
- |> String.replace(Readability.regexes[:normalize], " ")
+ |> String.replace(Readability.regexes(:replace_xml_version), "")
+ |> String.replace(Readability.regexes(:replace_brs), "
")
+ |> String.replace(Readability.regexes(:replace_fonts), "<\1span>")
+ |> String.replace(Readability.regexes(:normalize), " ")
|> Floki.parse
|> Floki.filter_out(:comment)
end
diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex
index 2efceb5..42fa90a 100644
--- a/lib/readability/sanitizer.ex
+++ b/lib/readability/sanitizer.ex
@@ -21,10 +21,10 @@ defmodule Readability.Sanitizer do
|> Helper.remove_tag(&clean_empty_p?(&1))
if opts[:clean_conditionally] do
- html_tree = html_tree
- |> Helper.remove_tag(conditionally_cleaing_fn(candidates))
+ html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates))
+ else
+ html_tree
end
- html_tree
end
defp conditionally_cleaing_fn(candidates) do
@@ -48,7 +48,7 @@ defmodule Readability.Sanitizer do
input_len = tree |> Floki.find("input") |> length
embed_len = tree
|> Floki.find("embed")
- |> Enum.reject(&(&1 =~ Readability.regexes[:video]))
+ |> Enum.reject(&(&1 =~ Readability.regexes(:video)))
|> length
link_density = Scoring.calc_link_density(tree)
@@ -57,7 +57,7 @@ defmodule Readability.Sanitizer do
img_len > p_len # too many image
|| (!list? && li_len > p_len) # more <li>s than <p>s
|| input_len > (p_len / 3) # less than 3x <p>s than <input>s
- || (!list? && conent_len < Readability.regexes[:min_text_length] && img_len != 1) # too short a content length without a single image
+ || (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) # too short a content length without a single image
|| (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight})
|| (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight})
|| ((embed_len == 1 && conent_len < 75) || embed_len > 1) # <embed>s with too short a content length, or too many <embed>s
@@ -75,7 +75,7 @@ defmodule Readability.Sanitizer do
defp clean_unlikely_tag?({tag, attrs, _}) do
attrs_str = attrs |> Enum.map(&(elem(&1, 1))) |> Enum.join("")
- tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes[:video])
+ tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes(:video))
end
defp clean_empty_p?({tag, _, _} = html_tree) do
diff --git a/lib/readability/title_finder.ex b/lib/readability/title_finder.ex
index 59ac9b3..1ce2da0 100644
--- a/lib/readability/title_finder.ex
+++ b/lib/readability/title_finder.ex
@@ -45,7 +45,7 @@ defmodule Readability.TitleFinder do
@spec og_title(html_tree) :: binary
def og_title(html_tree) do
html_tree
- |> find_tag("meta[property=og:title]")
+ |> find_tag("meta[property='og:title']")
|> Floki.attribute("content")
|> clean_title()
end
@@ -72,6 +72,9 @@ defmodule Readability.TitleFinder do
defp clean_title([]) do
""
end
+ defp clean_title([title]) when is_binary(title) do
+ String.strip(title)
+ end
defp clean_title(html_tree) do
html_tree
|> Floki.text()
diff --git a/mix.exs b/mix.exs
index 73e370b..ef60fc3 100644
--- a/mix.exs
+++ b/mix.exs
@@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
@moduledoc """
"""
- @version "0.6.2"
+ @version "0.7.0"
@description """
Readability library for extracting and curating articles.
"""
@@ -14,10 +14,10 @@ defmodule Readability.Mixfile do
version: @version,
elixir: "~> 1.2",
description: @description,
- package: package,
+ package: package(),
build_embedded: Mix.env == :prod,
start_permanent: Mix.env == :prod,
- deps: deps]
+ deps: deps()]
end
# Configuration for the OTP application
@@ -40,11 +40,10 @@ defmodule Readability.Mixfile do
#
# Type "mix help deps" for more examples and options
defp deps do
- [{:floki, "~> 0.9.0"},
- {:httpoison, "~> 0.9.0"},
- {:earmark, "~> 0.1", only: :dev},
- {:ex_doc, "~> 0.11", only: :dev},
- {:credo, "~> 0.3", only: [:dev, :test]},
+ [{:floki, "~> 0.13.1"},
+ {:httpoison, "~> 0.11.0"},
+ {:ex_doc, "~> 0.14", only: :dev},
+ {:credo, "~> 0.6.1", only: [:dev, :test]},
{:dialyxir, "~> 0.3", only: [:dev]}
]
end
diff --git a/mix.lock b/mix.lock
index a030395..04c7e15 100644
--- a/mix.lock
+++ b/mix.lock
@@ -1,14 +1,15 @@
-%{"bunt": {:hex, :bunt, "0.1.6", "5d95a6882f73f3b9969fdfd1953798046664e6f77ec4e486e6fafc7caad97c6f", [:mix], []},
- "certifi": {:hex, :certifi, "0.4.0", "a7966efb868b179023618d29a407548f70c52466bf1849b9e8ebd0e34b7ea11f", [:rebar3], []},
- "credo": {:hex, :credo, "0.4.5", "5c5daaf50a2a96068c0f21b6fbd382d206702efa8836a946eeab0b8ac25f5f22", [:mix], [{:bunt, "~> 0.1.6", [hex: :bunt, optional: false]}]},
+%{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []},
+ "certifi": {:hex, :certifi, "0.7.0", "861a57f3808f7eb0c2d1802afeaae0fa5de813b0df0979153cbafcd853ababaf", [:rebar3], []},
+ "credo": {:hex, :credo, "0.6.1", "a941e2591bd2bd2055dc92b810c174650b40b8290459c89a835af9d59ac4a5f8", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]},
"dialyxir": {:hex, :dialyxir, "0.3.5", "eaba092549e044c76f83165978979f60110dc58dd5b92fd952bf2312f64e9b14", [:mix], []},
- "earmark": {:hex, :earmark, "0.2.1", "ba6d26ceb16106d069b289df66751734802777a3cbb6787026dd800ffeb850f3", [:mix], []},
- "ex_doc": {:hex, :ex_doc, "0.12.0", "b774aabfede4af31c0301aece12371cbd25995a21bb3d71d66f5c2fe074c603f", [:mix], [{:earmark, "~> 0.2", [hex: :earmark, optional: false]}]},
- "floki": {:hex, :floki, "0.9.0", "e952ca71a453f7827ab5405106ac8d9ac5c9602d18aa5d2d893e5b9944e2499e", [:mix], [{:mochiweb_html, "~> 2.15", [hex: :mochiweb_html, optional: false]}]},
- "hackney": {:hex, :hackney, "1.6.1", "ddd22d42db2b50e6a155439c8811b8f6df61a4395de10509714ad2751c6da817", [:rebar3], [{:certifi, "0.4.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.0", [hex: :ssl_verify_fun, optional: false]}]},
- "httpoison": {:hex, :httpoison, "0.9.0", "68187a2daddfabbe7ca8f7d75ef227f89f0e1507f7eecb67e4536b3c516faddb", [:mix], [{:hackney, "~> 1.6.0", [hex: :hackney, optional: false]}]},
+ "earmark": {:hex, :earmark, "1.1.1", "433136b7f2e99cde88b745b3a0cfc3fbc81fe58b918a09b40fce7f00db4d8187", [:mix], []},
+ "ex_doc": {:hex, :ex_doc, "0.14.5", "c0433c8117e948404d93ca69411dd575ec6be39b47802e81ca8d91017a0cf83c", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]},
+ "floki": {:hex, :floki, "0.13.1", "b3b287e02914cb41a66285071dade287165ed1915ab07903e18fb454fe961bad", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, optional: false]}]},
+ "hackney": {:hex, :hackney, "1.6.5", "8c025ee397ac94a184b0743c73b33b96465e85f90a02e210e86df6cbafaa5065", [:rebar3], [{:certifi, "0.7.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]},
+ "httpoison": {:hex, :httpoison, "0.11.0", "b9240a9c44fc46fcd8618d17898859ba09a3c1b47210b74316c0ffef10735e76", [:mix], [{:hackney, "~> 1.6.3", [hex: :hackney, optional: false]}]},
"idna": {:hex, :idna, "1.2.0", "ac62ee99da068f43c50dc69acf700e03a62a348360126260e87f2b54eced86b2", [:rebar3], []},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []},
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []},
+ "mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []},
"mochiweb_html": {:hex, :mochiweb_html, "2.15.0", "d7402e967d7f9f2912f8befa813c37be62d5eeeddbbcb6fe986c44e01460d497", [:rebar3], []},
- "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.0", "edee20847c42e379bf91261db474ffbe373f8acb56e9079acb6038d4e0bf414f", [:rebar, :make], []}}
+ "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []}}