Fix some bugs and update deps

This commit is contained in:
keepcosmos 2017-02-05 18:48:26 +09:00
parent 47af5e48de
commit 1aa682a31a
11 changed files with 60 additions and 57 deletions

View File

@ -3,3 +3,4 @@ language: elixir
elixir:
- 1.2.6
- 1.3.4
- 1.4.1

View File

@ -138,7 +138,7 @@ defmodule Readability do
@spec readable_html(html_tree) :: binary
def readable_html(html_tree) do
html_tree
|> Helper.remove_attrs(regexes[:protect_attrs])
|> Helper.remove_attrs(regexes(:protect_attrs))
|> raw_html
end
@ -166,7 +166,7 @@ defmodule Readability do
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
def regexes, do: @regexes
def regexes(key), do: @regexes[key]
def default_options, do: @default_options
end

View File

@ -25,9 +25,7 @@ defmodule Readability.ArticleBuilder do
Enum.member?(["script", "style"], tag)
end)
if opts[:remove_unlikely_candidates] do
html_tree = Cleaner.remove_unlikely_tree(html_tree)
end
html_tree = if opts[:remove_unlikely_candidates], do: Cleaner.remove_unlikely_tree(html_tree), else: html_tree
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
candidates = CandidateFinder.find(html_tree, opts)
@ -61,8 +59,10 @@ defmodule Readability.ArticleBuilder do
defp find_article(candidates, html_tree) do
best_candidate = CandidateFinder.find_best_candidate(candidates)
unless best_candidate do
tree = html_tree |> Floki.find("body") |> hd
best_candidate = %Candidate{html_tree: tree}
best_candidate = case html_tree |> Floki.find("body") do
[tree|_] -> %Candidate{html_tree: tree}
_ -> %Candidate{html_tree: {}}
end
end
article_trees = find_article_trees(best_candidate, candidates)
{"div", [], article_trees}

View File

@ -22,7 +22,7 @@ defmodule Readability.AuthorFinder do
|> Enum.map(fn(meta) ->
meta
|> Floki.attribute("content")
|> Floki.text
|> Enum.join(" ")
|> String.strip
end)
|> Enum.reject(&(is_nil(&1) || String.length(&1) == 0))

View File

@ -18,7 +18,7 @@ defmodule Readability.Candidate.Cleaner do
[transform_misused_div_to_p(h)|transform_misused_div_to_p(t)]
end
def transform_misused_div_to_p({tag, attrs, inner_tree}) do
if misused_divs?(tag, inner_tree), do: tag = "p"
tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
{tag, attrs, transform_misused_div_to_p(inner_tree)}
end
@ -31,7 +31,7 @@ defmodule Readability.Candidate.Cleaner do
end
defp misused_divs?("div", inner_tree) do
!(Floki.raw_html(inner_tree) =~ Readability.regexes[:div_to_p_elements])
!(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements))
end
defp misused_divs?(_, _), do: false
@ -41,8 +41,8 @@ defmodule Readability.Candidate.Cleaner do
|> Enum.join("")
str = tag <> idclass_str
str =~ Readability.regexes[:unlikely_candidate]
&& !(str =~ Readability.regexes[:ok_maybe_its_a_candidate])
str =~ Readability.regexes(:unlikely_candidate)
&& !(str =~ Readability.regexes(:ok_maybe_its_a_candidate))
&& tag != "html"
end
end

View File

@ -34,7 +34,7 @@ defmodule Readability.Candidate.Scoring do
defp calc_node_score({tag, attrs, _}, opts) do
score = 0
if opts[:weight_classes], do: score = score + class_weight(attrs)
score = if opts[:weight_classes], do: score + class_weight(attrs), else: score
score + (@element_scores[tag] || 0)
end
defp calc_node_score([h|t], opts) do
@ -47,11 +47,10 @@ defmodule Readability.Candidate.Scoring do
class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1)
id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1)
if class =~ Readability.regexes[:positive], do: weight = weight + 25
if id =~ Readability.regexes[:positive], do: weight = weight + 25
if class =~ Readability.regexes[:negative], do: weight = weight - 25
if id =~ Readability.regexes[:negative], do: weight = weight - 25
weight = if class =~ Readability.regexes(:positive), do: weight + 25, else: weight
weight = if id =~ Readability.regexes(:positive), do: weight + 25, else: weight
weight = if class =~ Readability.regexes(:negative), do: weight - 25, else: weight
weight = if id =~ Readability.regexes(:negative), do: weight - 25, else: weight
weight
end

View File

@ -31,16 +31,16 @@ defmodule Readability.Helper do
[remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)]
end
def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
reject_fun = fn(attr) -> attr end
cond do
is_binary(target_attr) ->
reject_fun = fn(attr) -> elem(attr, 0) == target_attr end
Regex.regex?(target_attr) ->
reject_fun = fn(attr) -> elem(attr, 0) =~ target_attr end
is_list(target_attr) ->
reject_fun = fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
true -> nil
end
reject_fun =
cond do
is_binary(target_attr) ->
fn(attr) -> elem(attr, 0) == target_attr end
Regex.regex?(target_attr) ->
fn(attr) -> elem(attr, 0) =~ target_attr end
is_list(target_attr) ->
fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
true -> fn(attr) -> attr end
end
{tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
end
@ -80,7 +80,7 @@ defmodule Readability.Helper do
"""
@spec candidate_tag?(html_tree) :: boolean
def candidate_tag?(html_tree) do
Enum.any?(candidates_selector, fn(selector) ->
Enum.any?(candidates_selector(), fn(selector) ->
Floki.Selector.match?(html_tree, selector)
&& (text_length(html_tree)) >= Readability.default_options[:min_text_length]
end)
@ -92,10 +92,10 @@ defmodule Readability.Helper do
@spec normalize(binary) :: html_tree
def normalize(raw_html) do
raw_html
|> String.replace(Readability.regexes[:replace_xml_version], "")
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|> String.replace(Readability.regexes[:normalize], " ")
|> String.replace(Readability.regexes(:replace_xml_version), "")
|> String.replace(Readability.regexes(:replace_brs), "</p><p>")
|> String.replace(Readability.regexes(:replace_fonts), "<\1span>")
|> String.replace(Readability.regexes(:normalize), " ")
|> Floki.parse
|> Floki.filter_out(:comment)
end

View File

@ -21,10 +21,10 @@ defmodule Readability.Sanitizer do
|> Helper.remove_tag(&clean_empty_p?(&1))
if opts[:clean_conditionally] do
html_tree = html_tree
|> Helper.remove_tag(conditionally_cleaing_fn(candidates))
html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates))
else
html_tree
end
html_tree
end
defp conditionally_cleaing_fn(candidates) do
@ -48,7 +48,7 @@ defmodule Readability.Sanitizer do
input_len = tree |> Floki.find("input") |> length
embed_len = tree
|> Floki.find("embed")
|> Enum.reject(&(&1 =~ Readability.regexes[:video]))
|> Enum.reject(&(&1 =~ Readability.regexes(:video)))
|> length
link_density = Scoring.calc_link_density(tree)
@ -57,7 +57,7 @@ defmodule Readability.Sanitizer do
img_len > p_len # too many image
|| (!list? && li_len > p_len) # more <li>s than <p>s
|| input_len > (p_len / 3) # less than 3x <p>s than <input>s
|| (!list? && conent_len < Readability.regexes[:min_text_length] && img_len != 1) # too short a content length without a single image
|| (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) # too short a content length without a single image
|| (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight})
|| (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight})
|| ((embed_len == 1 && conent_len < 75) || embed_len > 1) # <embed>s with too short a content length, or too many <embed>s
@ -75,7 +75,7 @@ defmodule Readability.Sanitizer do
defp clean_unlikely_tag?({tag, attrs, _}) do
attrs_str = attrs |> Enum.map(&(elem(&1, 1))) |> Enum.join("")
tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes[:video])
tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes(:video))
end
defp clean_empty_p?({tag, _, _} = html_tree) do

View File

@ -45,7 +45,7 @@ defmodule Readability.TitleFinder do
@spec og_title(html_tree) :: binary
def og_title(html_tree) do
html_tree
|> find_tag("meta[property=og:title]")
|> find_tag("meta[property='og:title']")
|> Floki.attribute("content")
|> clean_title()
end
@ -72,6 +72,9 @@ defmodule Readability.TitleFinder do
defp clean_title([]) do
""
end
defp clean_title([title]) when is_binary(title) do
String.strip(title)
end
defp clean_title(html_tree) do
html_tree
|> Floki.text()

15
mix.exs
View File

@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
@moduledoc """
"""
@version "0.6.2"
@version "0.7.0"
@description """
Readability library for extracting and curating articles.
"""
@ -14,10 +14,10 @@ defmodule Readability.Mixfile do
version: @version,
elixir: "~> 1.2",
description: @description,
package: package,
package: package(),
build_embedded: Mix.env == :prod,
start_permanent: Mix.env == :prod,
deps: deps]
deps: deps()]
end
# Configuration for the OTP application
@ -40,11 +40,10 @@ defmodule Readability.Mixfile do
#
# Type "mix help deps" for more examples and options
defp deps do
[{:floki, "~> 0.9.0"},
{:httpoison, "~> 0.9.0"},
{:earmark, "~> 0.1", only: :dev},
{:ex_doc, "~> 0.11", only: :dev},
{:credo, "~> 0.3", only: [:dev, :test]},
[{:floki, "~> 0.13.1"},
{:httpoison, "~> 0.11.0"},
{:ex_doc, "~> 0.14", only: :dev},
{:credo, "~> 0.6.1", only: [:dev, :test]},
{:dialyxir, "~> 0.3", only: [:dev]}
]
end

View File

@ -1,14 +1,15 @@
%{"bunt": {:hex, :bunt, "0.1.6", "5d95a6882f73f3b9969fdfd1953798046664e6f77ec4e486e6fafc7caad97c6f", [:mix], []},
"certifi": {:hex, :certifi, "0.4.0", "a7966efb868b179023618d29a407548f70c52466bf1849b9e8ebd0e34b7ea11f", [:rebar3], []},
"credo": {:hex, :credo, "0.4.5", "5c5daaf50a2a96068c0f21b6fbd382d206702efa8836a946eeab0b8ac25f5f22", [:mix], [{:bunt, "~> 0.1.6", [hex: :bunt, optional: false]}]},
%{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []},
"certifi": {:hex, :certifi, "0.7.0", "861a57f3808f7eb0c2d1802afeaae0fa5de813b0df0979153cbafcd853ababaf", [:rebar3], []},
"credo": {:hex, :credo, "0.6.1", "a941e2591bd2bd2055dc92b810c174650b40b8290459c89a835af9d59ac4a5f8", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]},
"dialyxir": {:hex, :dialyxir, "0.3.5", "eaba092549e044c76f83165978979f60110dc58dd5b92fd952bf2312f64e9b14", [:mix], []},
"earmark": {:hex, :earmark, "0.2.1", "ba6d26ceb16106d069b289df66751734802777a3cbb6787026dd800ffeb850f3", [:mix], []},
"ex_doc": {:hex, :ex_doc, "0.12.0", "b774aabfede4af31c0301aece12371cbd25995a21bb3d71d66f5c2fe074c603f", [:mix], [{:earmark, "~> 0.2", [hex: :earmark, optional: false]}]},
"floki": {:hex, :floki, "0.9.0", "e952ca71a453f7827ab5405106ac8d9ac5c9602d18aa5d2d893e5b9944e2499e", [:mix], [{:mochiweb_html, "~> 2.15", [hex: :mochiweb_html, optional: false]}]},
"hackney": {:hex, :hackney, "1.6.1", "ddd22d42db2b50e6a155439c8811b8f6df61a4395de10509714ad2751c6da817", [:rebar3], [{:certifi, "0.4.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.0", [hex: :ssl_verify_fun, optional: false]}]},
"httpoison": {:hex, :httpoison, "0.9.0", "68187a2daddfabbe7ca8f7d75ef227f89f0e1507f7eecb67e4536b3c516faddb", [:mix], [{:hackney, "~> 1.6.0", [hex: :hackney, optional: false]}]},
"earmark": {:hex, :earmark, "1.1.1", "433136b7f2e99cde88b745b3a0cfc3fbc81fe58b918a09b40fce7f00db4d8187", [:mix], []},
"ex_doc": {:hex, :ex_doc, "0.14.5", "c0433c8117e948404d93ca69411dd575ec6be39b47802e81ca8d91017a0cf83c", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]},
"floki": {:hex, :floki, "0.13.1", "b3b287e02914cb41a66285071dade287165ed1915ab07903e18fb454fe961bad", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, optional: false]}]},
"hackney": {:hex, :hackney, "1.6.5", "8c025ee397ac94a184b0743c73b33b96465e85f90a02e210e86df6cbafaa5065", [:rebar3], [{:certifi, "0.7.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]},
"httpoison": {:hex, :httpoison, "0.11.0", "b9240a9c44fc46fcd8618d17898859ba09a3c1b47210b74316c0ffef10735e76", [:mix], [{:hackney, "~> 1.6.3", [hex: :hackney, optional: false]}]},
"idna": {:hex, :idna, "1.2.0", "ac62ee99da068f43c50dc69acf700e03a62a348360126260e87f2b54eced86b2", [:rebar3], []},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []},
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []},
"mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []},
"mochiweb_html": {:hex, :mochiweb_html, "2.15.0", "d7402e967d7f9f2912f8befa813c37be62d5eeeddbbcb6fe986c44e01460d497", [:rebar3], []},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.0", "edee20847c42e379bf91261db474ffbe373f8acb56e9079acb6038d4e0bf414f", [:rebar, :make], []}}
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []}}