diff --git a/.travis.yml b/.travis.yml index b5c6fd6..f430b32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,3 +3,4 @@ language: elixir elixir: - 1.2.6 - 1.3.4 + - 1.4.1 diff --git a/lib/readability.ex b/lib/readability.ex index b33ecf2..d1cef25 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -138,7 +138,7 @@ defmodule Readability do @spec readable_html(html_tree) :: binary def readable_html(html_tree) do html_tree - |> Helper.remove_attrs(regexes[:protect_attrs]) + |> Helper.remove_attrs(regexes(:protect_attrs)) |> raw_html end @@ -166,7 +166,7 @@ defmodule Readability do def parse(raw_html) when is_binary(raw_html), do: Floki.parse(raw_html) - def regexes, do: @regexes + def regexes(key), do: @regexes[key] def default_options, do: @default_options end diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex index 4ccc7de..52b1fb5 100644 --- a/lib/readability/article_builder.ex +++ b/lib/readability/article_builder.ex @@ -25,9 +25,7 @@ defmodule Readability.ArticleBuilder do Enum.member?(["script", "style"], tag) end) - if opts[:remove_unlikely_candidates] do - html_tree = Cleaner.remove_unlikely_tree(html_tree) - end + html_tree = if opts[:remove_unlikely_candidates], do: Cleaner.remove_unlikely_tree(html_tree), else: html_tree html_tree = Cleaner.transform_misused_div_to_p(html_tree) candidates = CandidateFinder.find(html_tree, opts) @@ -61,8 +59,10 @@ defmodule Readability.ArticleBuilder do defp find_article(candidates, html_tree) do best_candidate = CandidateFinder.find_best_candidate(candidates) unless best_candidate do - tree = html_tree |> Floki.find("body") |> hd - best_candidate = %Candidate{html_tree: tree} + best_candidate = case html_tree |> Floki.find("body") do + [tree|_] -> %Candidate{html_tree: tree} + _ -> %Candidate{html_tree: {}} + end end article_trees = find_article_trees(best_candidate, candidates) {"div", [], article_trees} diff --git a/lib/readability/author_finder.ex b/lib/readability/author_finder.ex index 3fb366c..8350e4e 100644 --- a/lib/readability/author_finder.ex +++ b/lib/readability/author_finder.ex @@ -22,7 +22,7 @@ defmodule Readability.AuthorFinder do |> Enum.map(fn(meta) -> meta |> Floki.attribute("content") - |> Floki.text + |> Enum.join(" ") |> String.strip end) |> Enum.reject(&(is_nil(&1) || String.length(&1) == 0)) diff --git a/lib/readability/candidate/cleaner.ex b/lib/readability/candidate/cleaner.ex index 41c65aa..4599984 100644 --- a/lib/readability/candidate/cleaner.ex +++ b/lib/readability/candidate/cleaner.ex @@ -18,7 +18,7 @@ defmodule Readability.Candidate.Cleaner do [transform_misused_div_to_p(h)|transform_misused_div_to_p(t)] end def transform_misused_div_to_p({tag, attrs, inner_tree}) do - if misused_divs?(tag, inner_tree), do: tag = "p" + tag = if misused_divs?(tag, inner_tree), do: "p", else: tag {tag, attrs, transform_misused_div_to_p(inner_tree)} end @@ -31,7 +31,7 @@ defmodule Readability.Candidate.Cleaner do end defp misused_divs?("div", inner_tree) do - !(Floki.raw_html(inner_tree) =~ Readability.regexes[:div_to_p_elements]) + !(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements)) end defp misused_divs?(_, _), do: false @@ -41,8 +41,8 @@ defmodule Readability.Candidate.Cleaner do |> Enum.join("") str = tag <> idclass_str - str =~ Readability.regexes[:unlikely_candidate] - && !(str =~ Readability.regexes[:ok_maybe_its_a_candidate]) + str =~ Readability.regexes(:unlikely_candidate) + && !(str =~ Readability.regexes(:ok_maybe_its_a_candidate)) && tag != "html" end end diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex index ed9edbb..38e85a0 100644 --- a/lib/readability/candidate/scoring.ex +++ b/lib/readability/candidate/scoring.ex @@ -34,7 +34,7 @@ defmodule Readability.Candidate.Scoring do defp calc_node_score({tag, attrs, _}, opts) do score = 0 - if opts[:weight_classes], do: score = score + class_weight(attrs) + score = if opts[:weight_classes], do: score + class_weight(attrs), else: score score + (@element_scores[tag] || 0) end defp calc_node_score([h|t], opts) do @@ -47,11 +47,10 @@ defmodule Readability.Candidate.Scoring do class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1) id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1) - if class =~ Readability.regexes[:positive], do: weight = weight + 25 - if id =~ Readability.regexes[:positive], do: weight = weight + 25 - if class =~ Readability.regexes[:negative], do: weight = weight - 25 - if id =~ Readability.regexes[:negative], do: weight = weight - 25 - + weight = if class =~ Readability.regexes(:positive), do: weight + 25, else: weight + weight = if id =~ Readability.regexes(:positive), do: weight + 25, else: weight + weight = if class =~ Readability.regexes(:negative), do: weight - 25, else: weight + weight = if id =~ Readability.regexes(:negative), do: weight - 25, else: weight weight end diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index f77cfff..1746812 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -31,16 +31,16 @@ defmodule Readability.Helper do [remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)] end def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do - reject_fun = fn(attr) -> attr end - cond do - is_binary(target_attr) -> - reject_fun = fn(attr) -> elem(attr, 0) == target_attr end - Regex.regex?(target_attr) -> - reject_fun = fn(attr) -> elem(attr, 0) =~ target_attr end - is_list(target_attr) -> - reject_fun = fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end - true -> nil - end + reject_fun = + cond do + is_binary(target_attr) -> + fn(attr) -> elem(attr, 0) == target_attr end + Regex.regex?(target_attr) -> + fn(attr) -> elem(attr, 0) =~ target_attr end + is_list(target_attr) -> + fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end + true -> fn(attr) -> attr end + end {tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)} end @@ -80,7 +80,7 @@ defmodule Readability.Helper do """ @spec candidate_tag?(html_tree) :: boolean def candidate_tag?(html_tree) do - Enum.any?(candidates_selector, fn(selector) -> + Enum.any?(candidates_selector(), fn(selector) -> Floki.Selector.match?(html_tree, selector) && (text_length(html_tree)) >= Readability.default_options[:min_text_length] end) @@ -92,10 +92,10 @@ defmodule Readability.Helper do @spec normalize(binary) :: html_tree def normalize(raw_html) do raw_html - |> String.replace(Readability.regexes[:replace_xml_version], "") - |> String.replace(Readability.regexes[:replace_brs], "
") - |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") - |> String.replace(Readability.regexes[:normalize], " ") + |> String.replace(Readability.regexes(:replace_xml_version), "") + |> String.replace(Readability.regexes(:replace_brs), "
") + |> String.replace(Readability.regexes(:replace_fonts), "<\1span>") + |> String.replace(Readability.regexes(:normalize), " ") |> Floki.parse |> Floki.filter_out(:comment) end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex index 2efceb5..42fa90a 100644 --- a/lib/readability/sanitizer.ex +++ b/lib/readability/sanitizer.ex @@ -21,10 +21,10 @@ defmodule Readability.Sanitizer do |> Helper.remove_tag(&clean_empty_p?(&1)) if opts[:clean_conditionally] do - html_tree = html_tree - |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) + html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) + else + html_tree end - html_tree end defp conditionally_cleaing_fn(candidates) do @@ -48,7 +48,7 @@ defmodule Readability.Sanitizer do input_len = tree |> Floki.find("input") |> length embed_len = tree |> Floki.find("embed") - |> Enum.reject(&(&1 =~ Readability.regexes[:video])) + |> Enum.reject(&(&1 =~ Readability.regexes(:video))) |> length link_density = Scoring.calc_link_density(tree) @@ -57,7 +57,7 @@ defmodule Readability.Sanitizer do img_len > p_len # too many image || (!list? && li_len > p_len) # more
s || input_len > (p_len / 3) # less than 3x
s than s - || (!list? && conent_len < Readability.regexes[:min_text_length] && img_len != 1) # too short a content length without a single image + || (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) # too short a content length without a single image || (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight}) || (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight}) || ((embed_len == 1 && conent_len < 75) || embed_len > 1) #