fix some bug and update deps
This commit is contained in:
parent
47af5e48de
commit
1aa682a31a
|
@ -3,3 +3,4 @@ language: elixir
|
|||
elixir:
|
||||
- 1.2.6
|
||||
- 1.3.4
|
||||
- 1.4.1
|
||||
|
|
|
@ -138,7 +138,7 @@ defmodule Readability do
|
|||
@spec readable_html(html_tree) :: binary
|
||||
def readable_html(html_tree) do
|
||||
html_tree
|
||||
|> Helper.remove_attrs(regexes[:protect_attrs])
|
||||
|> Helper.remove_attrs(regexes(:protect_attrs))
|
||||
|> raw_html
|
||||
end
|
||||
|
||||
|
@ -166,7 +166,7 @@ defmodule Readability do
|
|||
|
||||
def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html)
|
||||
|
||||
def regexes, do: @regexes
|
||||
def regexes(key), do: @regexes[key]
|
||||
|
||||
def default_options, do: @default_options
|
||||
end
|
||||
|
|
|
@ -25,9 +25,7 @@ defmodule Readability.ArticleBuilder do
|
|||
Enum.member?(["script", "style"], tag)
|
||||
end)
|
||||
|
||||
if opts[:remove_unlikely_candidates] do
|
||||
html_tree = Cleaner.remove_unlikely_tree(html_tree)
|
||||
end
|
||||
html_tree = if opts[:remove_unlikely_candidates], do: Cleaner.remove_unlikely_tree(html_tree), else: html_tree
|
||||
html_tree = Cleaner.transform_misused_div_to_p(html_tree)
|
||||
|
||||
candidates = CandidateFinder.find(html_tree, opts)
|
||||
|
@ -61,8 +59,10 @@ defmodule Readability.ArticleBuilder do
|
|||
defp find_article(candidates, html_tree) do
|
||||
best_candidate = CandidateFinder.find_best_candidate(candidates)
|
||||
unless best_candidate do
|
||||
tree = html_tree |> Floki.find("body") |> hd
|
||||
best_candidate = %Candidate{html_tree: tree}
|
||||
best_candidate = case html_tree |> Floki.find("body") do
|
||||
[tree|_] -> %Candidate{html_tree: tree}
|
||||
_ -> %Candidate{html_tree: {}}
|
||||
end
|
||||
end
|
||||
article_trees = find_article_trees(best_candidate, candidates)
|
||||
{"div", [], article_trees}
|
||||
|
|
|
@ -22,7 +22,7 @@ defmodule Readability.AuthorFinder do
|
|||
|> Enum.map(fn(meta) ->
|
||||
meta
|
||||
|> Floki.attribute("content")
|
||||
|> Floki.text
|
||||
|> Enum.join(" ")
|
||||
|> String.strip
|
||||
end)
|
||||
|> Enum.reject(&(is_nil(&1) || String.length(&1) == 0))
|
||||
|
|
|
@ -18,7 +18,7 @@ defmodule Readability.Candidate.Cleaner do
|
|||
[transform_misused_div_to_p(h)|transform_misused_div_to_p(t)]
|
||||
end
|
||||
def transform_misused_div_to_p({tag, attrs, inner_tree}) do
|
||||
if misused_divs?(tag, inner_tree), do: tag = "p"
|
||||
tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
|
||||
{tag, attrs, transform_misused_div_to_p(inner_tree)}
|
||||
end
|
||||
|
||||
|
@ -31,7 +31,7 @@ defmodule Readability.Candidate.Cleaner do
|
|||
end
|
||||
|
||||
defp misused_divs?("div", inner_tree) do
|
||||
!(Floki.raw_html(inner_tree) =~ Readability.regexes[:div_to_p_elements])
|
||||
!(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements))
|
||||
end
|
||||
defp misused_divs?(_, _), do: false
|
||||
|
||||
|
@ -41,8 +41,8 @@ defmodule Readability.Candidate.Cleaner do
|
|||
|> Enum.join("")
|
||||
str = tag <> idclass_str
|
||||
|
||||
str =~ Readability.regexes[:unlikely_candidate]
|
||||
&& !(str =~ Readability.regexes[:ok_maybe_its_a_candidate])
|
||||
str =~ Readability.regexes(:unlikely_candidate)
|
||||
&& !(str =~ Readability.regexes(:ok_maybe_its_a_candidate))
|
||||
&& tag != "html"
|
||||
end
|
||||
end
|
||||
|
|
|
@ -34,7 +34,7 @@ defmodule Readability.Candidate.Scoring do
|
|||
|
||||
defp calc_node_score({tag, attrs, _}, opts) do
|
||||
score = 0
|
||||
if opts[:weight_classes], do: score = score + class_weight(attrs)
|
||||
score = if opts[:weight_classes], do: score + class_weight(attrs), else: score
|
||||
score + (@element_scores[tag] || 0)
|
||||
end
|
||||
defp calc_node_score([h|t], opts) do
|
||||
|
@ -47,11 +47,10 @@ defmodule Readability.Candidate.Scoring do
|
|||
class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1)
|
||||
id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1)
|
||||
|
||||
if class =~ Readability.regexes[:positive], do: weight = weight + 25
|
||||
if id =~ Readability.regexes[:positive], do: weight = weight + 25
|
||||
if class =~ Readability.regexes[:negative], do: weight = weight - 25
|
||||
if id =~ Readability.regexes[:negative], do: weight = weight - 25
|
||||
|
||||
weight = if class =~ Readability.regexes(:positive), do: weight + 25, else: weight
|
||||
weight = if id =~ Readability.regexes(:positive), do: weight + 25, else: weight
|
||||
weight = if class =~ Readability.regexes(:negative), do: weight - 25, else: weight
|
||||
weight = if id =~ Readability.regexes(:negative), do: weight - 25, else: weight
|
||||
weight
|
||||
end
|
||||
|
||||
|
|
|
@ -31,16 +31,16 @@ defmodule Readability.Helper do
|
|||
[remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)]
|
||||
end
|
||||
def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
|
||||
reject_fun = fn(attr) -> attr end
|
||||
cond do
|
||||
is_binary(target_attr) ->
|
||||
reject_fun = fn(attr) -> elem(attr, 0) == target_attr end
|
||||
Regex.regex?(target_attr) ->
|
||||
reject_fun = fn(attr) -> elem(attr, 0) =~ target_attr end
|
||||
is_list(target_attr) ->
|
||||
reject_fun = fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
|
||||
true -> nil
|
||||
end
|
||||
reject_fun =
|
||||
cond do
|
||||
is_binary(target_attr) ->
|
||||
fn(attr) -> elem(attr, 0) == target_attr end
|
||||
Regex.regex?(target_attr) ->
|
||||
fn(attr) -> elem(attr, 0) =~ target_attr end
|
||||
is_list(target_attr) ->
|
||||
fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
|
||||
true -> fn(attr) -> attr end
|
||||
end
|
||||
{tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
|
||||
end
|
||||
|
||||
|
@ -80,7 +80,7 @@ defmodule Readability.Helper do
|
|||
"""
|
||||
@spec candidate_tag?(html_tree) :: boolean
|
||||
def candidate_tag?(html_tree) do
|
||||
Enum.any?(candidates_selector, fn(selector) ->
|
||||
Enum.any?(candidates_selector(), fn(selector) ->
|
||||
Floki.Selector.match?(html_tree, selector)
|
||||
&& (text_length(html_tree)) >= Readability.default_options[:min_text_length]
|
||||
end)
|
||||
|
@ -92,10 +92,10 @@ defmodule Readability.Helper do
|
|||
@spec normalize(binary) :: html_tree
|
||||
def normalize(raw_html) do
|
||||
raw_html
|
||||
|> String.replace(Readability.regexes[:replace_xml_version], "")
|
||||
|> String.replace(Readability.regexes[:replace_brs], "</p><p>")
|
||||
|> String.replace(Readability.regexes[:replace_fonts], "<\1span>")
|
||||
|> String.replace(Readability.regexes[:normalize], " ")
|
||||
|> String.replace(Readability.regexes(:replace_xml_version), "")
|
||||
|> String.replace(Readability.regexes(:replace_brs), "</p><p>")
|
||||
|> String.replace(Readability.regexes(:replace_fonts), "<\1span>")
|
||||
|> String.replace(Readability.regexes(:normalize), " ")
|
||||
|> Floki.parse
|
||||
|> Floki.filter_out(:comment)
|
||||
end
|
||||
|
|
|
@ -21,10 +21,10 @@ defmodule Readability.Sanitizer do
|
|||
|> Helper.remove_tag(&clean_empty_p?(&1))
|
||||
|
||||
if opts[:clean_conditionally] do
|
||||
html_tree = html_tree
|
||||
|> Helper.remove_tag(conditionally_cleaing_fn(candidates))
|
||||
html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates))
|
||||
else
|
||||
html_tree
|
||||
end
|
||||
html_tree
|
||||
end
|
||||
|
||||
defp conditionally_cleaing_fn(candidates) do
|
||||
|
@ -48,7 +48,7 @@ defmodule Readability.Sanitizer do
|
|||
input_len = tree |> Floki.find("input") |> length
|
||||
embed_len = tree
|
||||
|> Floki.find("embed")
|
||||
|> Enum.reject(&(&1 =~ Readability.regexes[:video]))
|
||||
|> Enum.reject(&(&1 =~ Readability.regexes(:video)))
|
||||
|> length
|
||||
|
||||
link_density = Scoring.calc_link_density(tree)
|
||||
|
@ -57,7 +57,7 @@ defmodule Readability.Sanitizer do
|
|||
img_len > p_len # too many image
|
||||
|| (!list? && li_len > p_len) # more <li>s than <p>s
|
||||
|| input_len > (p_len / 3) # less than 3x <p>s than <input>s
|
||||
|| (!list? && conent_len < Readability.regexes[:min_text_length] && img_len != 1) # too short a content length without a single image
|
||||
|| (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) # too short a content length without a single image
|
||||
|| (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight})
|
||||
|| (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight})
|
||||
|| ((embed_len == 1 && conent_len < 75) || embed_len > 1) # <embed>s with too short a content length, or too many <embed>s
|
||||
|
@ -75,7 +75,7 @@ defmodule Readability.Sanitizer do
|
|||
|
||||
defp clean_unlikely_tag?({tag, attrs, _}) do
|
||||
attrs_str = attrs |> Enum.map(&(elem(&1, 1))) |> Enum.join("")
|
||||
tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes[:video])
|
||||
tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes(:video))
|
||||
end
|
||||
|
||||
defp clean_empty_p?({tag, _, _} = html_tree) do
|
||||
|
|
|
@ -45,7 +45,7 @@ defmodule Readability.TitleFinder do
|
|||
@spec og_title(html_tree) :: binary
|
||||
def og_title(html_tree) do
|
||||
html_tree
|
||||
|> find_tag("meta[property=og:title]")
|
||||
|> find_tag("meta[property='og:title']")
|
||||
|> Floki.attribute("content")
|
||||
|> clean_title()
|
||||
end
|
||||
|
@ -72,6 +72,9 @@ defmodule Readability.TitleFinder do
|
|||
defp clean_title([]) do
|
||||
""
|
||||
end
|
||||
defp clean_title([title]) when is_binary(title) do
|
||||
String.strip(title)
|
||||
end
|
||||
defp clean_title(html_tree) do
|
||||
html_tree
|
||||
|> Floki.text()
|
||||
|
|
15
mix.exs
15
mix.exs
|
@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
|
|||
@moduledoc """
|
||||
"""
|
||||
|
||||
@version "0.6.2"
|
||||
@version "0.7.0"
|
||||
@description """
|
||||
Readability library for extracting and curating articles.
|
||||
"""
|
||||
|
@ -14,10 +14,10 @@ defmodule Readability.Mixfile do
|
|||
version: @version,
|
||||
elixir: "~> 1.2",
|
||||
description: @description,
|
||||
package: package,
|
||||
package: package(),
|
||||
build_embedded: Mix.env == :prod,
|
||||
start_permanent: Mix.env == :prod,
|
||||
deps: deps]
|
||||
deps: deps()]
|
||||
end
|
||||
|
||||
# Configuration for the OTP application
|
||||
|
@ -40,11 +40,10 @@ defmodule Readability.Mixfile do
|
|||
#
|
||||
# Type "mix help deps" for more examples and options
|
||||
defp deps do
|
||||
[{:floki, "~> 0.9.0"},
|
||||
{:httpoison, "~> 0.9.0"},
|
||||
{:earmark, "~> 0.1", only: :dev},
|
||||
{:ex_doc, "~> 0.11", only: :dev},
|
||||
{:credo, "~> 0.3", only: [:dev, :test]},
|
||||
[{:floki, "~> 0.13.1"},
|
||||
{:httpoison, "~> 0.11.0"},
|
||||
{:ex_doc, "~> 0.14", only: :dev},
|
||||
{:credo, "~> 0.6.1", only: [:dev, :test]},
|
||||
{:dialyxir, "~> 0.3", only: [:dev]}
|
||||
]
|
||||
end
|
||||
|
|
19
mix.lock
19
mix.lock
|
@ -1,14 +1,15 @@
|
|||
%{"bunt": {:hex, :bunt, "0.1.6", "5d95a6882f73f3b9969fdfd1953798046664e6f77ec4e486e6fafc7caad97c6f", [:mix], []},
|
||||
"certifi": {:hex, :certifi, "0.4.0", "a7966efb868b179023618d29a407548f70c52466bf1849b9e8ebd0e34b7ea11f", [:rebar3], []},
|
||||
"credo": {:hex, :credo, "0.4.5", "5c5daaf50a2a96068c0f21b6fbd382d206702efa8836a946eeab0b8ac25f5f22", [:mix], [{:bunt, "~> 0.1.6", [hex: :bunt, optional: false]}]},
|
||||
%{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []},
|
||||
"certifi": {:hex, :certifi, "0.7.0", "861a57f3808f7eb0c2d1802afeaae0fa5de813b0df0979153cbafcd853ababaf", [:rebar3], []},
|
||||
"credo": {:hex, :credo, "0.6.1", "a941e2591bd2bd2055dc92b810c174650b40b8290459c89a835af9d59ac4a5f8", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]},
|
||||
"dialyxir": {:hex, :dialyxir, "0.3.5", "eaba092549e044c76f83165978979f60110dc58dd5b92fd952bf2312f64e9b14", [:mix], []},
|
||||
"earmark": {:hex, :earmark, "0.2.1", "ba6d26ceb16106d069b289df66751734802777a3cbb6787026dd800ffeb850f3", [:mix], []},
|
||||
"ex_doc": {:hex, :ex_doc, "0.12.0", "b774aabfede4af31c0301aece12371cbd25995a21bb3d71d66f5c2fe074c603f", [:mix], [{:earmark, "~> 0.2", [hex: :earmark, optional: false]}]},
|
||||
"floki": {:hex, :floki, "0.9.0", "e952ca71a453f7827ab5405106ac8d9ac5c9602d18aa5d2d893e5b9944e2499e", [:mix], [{:mochiweb_html, "~> 2.15", [hex: :mochiweb_html, optional: false]}]},
|
||||
"hackney": {:hex, :hackney, "1.6.1", "ddd22d42db2b50e6a155439c8811b8f6df61a4395de10509714ad2751c6da817", [:rebar3], [{:certifi, "0.4.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.0", [hex: :ssl_verify_fun, optional: false]}]},
|
||||
"httpoison": {:hex, :httpoison, "0.9.0", "68187a2daddfabbe7ca8f7d75ef227f89f0e1507f7eecb67e4536b3c516faddb", [:mix], [{:hackney, "~> 1.6.0", [hex: :hackney, optional: false]}]},
|
||||
"earmark": {:hex, :earmark, "1.1.1", "433136b7f2e99cde88b745b3a0cfc3fbc81fe58b918a09b40fce7f00db4d8187", [:mix], []},
|
||||
"ex_doc": {:hex, :ex_doc, "0.14.5", "c0433c8117e948404d93ca69411dd575ec6be39b47802e81ca8d91017a0cf83c", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]},
|
||||
"floki": {:hex, :floki, "0.13.1", "b3b287e02914cb41a66285071dade287165ed1915ab07903e18fb454fe961bad", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, optional: false]}]},
|
||||
"hackney": {:hex, :hackney, "1.6.5", "8c025ee397ac94a184b0743c73b33b96465e85f90a02e210e86df6cbafaa5065", [:rebar3], [{:certifi, "0.7.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]},
|
||||
"httpoison": {:hex, :httpoison, "0.11.0", "b9240a9c44fc46fcd8618d17898859ba09a3c1b47210b74316c0ffef10735e76", [:mix], [{:hackney, "~> 1.6.3", [hex: :hackney, optional: false]}]},
|
||||
"idna": {:hex, :idna, "1.2.0", "ac62ee99da068f43c50dc69acf700e03a62a348360126260e87f2b54eced86b2", [:rebar3], []},
|
||||
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []},
|
||||
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []},
|
||||
"mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []},
|
||||
"mochiweb_html": {:hex, :mochiweb_html, "2.15.0", "d7402e967d7f9f2912f8befa813c37be62d5eeeddbbcb6fe986c44e01460d497", [:rebar3], []},
|
||||
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.0", "edee20847c42e379bf91261db474ffbe373f8acb56e9079acb6038d4e0bf414f", [:rebar, :make], []}}
|
||||
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []}}
|
||||
|
|
Loading…
Reference in New Issue