From b131d7effa04e2f98a61d5858a053c41ae7e60fc Mon Sep 17 00:00:00 2001 From: keepcosmos Date: Sun, 17 Apr 2016 21:26:51 +0900 Subject: [PATCH] add candidate builder add test --- .gitignore | 1 + lib/document.ex | 58 - lib/readability.ex | 80 +- lib/readability/article_builder.ex | 100 + lib/readability/candidate.ex | 6 + lib/readability/candidate/cleaner.ex | 48 + lib/readability/candidate/scoring.ex | 89 + lib/readability/candidate_finder.ex | 60 + lib/readability/content_finder.ex | 94 - lib/readability/helper.ex | 84 +- lib/readability/sanitizer.ex | 85 + lib/readability/title_finder.ex | 18 +- lib/test.js | 1835 ------- lib/test.rb | 522 -- mix.exs | 12 +- mix.lock | 14 +- test.html | 1 + test/fixtures/bbc.html | 4539 +++++++++-------- test/fixtures/nytimes.html | 1352 ++++- test/helper_text.exs | 31 - test/readability/candidate/_builder.exs | 53 + .../candidate/_finder.ex} | 20 +- test/readability/candidate/cleaner_test.exs | 59 + test/readability/helper_test.exs | 48 + test/{ => readability}/title_finder_test.exs | 10 +- test/readability_test.exs | 34 +- 26 files changed, 4592 insertions(+), 4661 deletions(-) delete mode 100644 lib/document.ex create mode 100644 lib/readability/article_builder.ex create mode 100644 lib/readability/candidate.ex create mode 100644 lib/readability/candidate/cleaner.ex create mode 100644 lib/readability/candidate/scoring.ex create mode 100644 lib/readability/candidate_finder.ex delete mode 100644 lib/readability/content_finder.ex create mode 100644 lib/readability/sanitizer.ex delete mode 100644 lib/test.js delete mode 100644 lib/test.rb create mode 100644 test.html delete mode 100644 test/helper_text.exs create mode 100644 test/readability/candidate/_builder.exs rename test/{content_finder_test.ex => readability/candidate/_finder.ex} (68%) create mode 100644 test/readability/candidate/cleaner_test.exs create mode 100644 test/readability/helper_test.exs rename test/{ => readability}/title_finder_test.exs (100%) diff --git a/.gitignore b/.gitignore index 755b605..7d56110 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ /deps erl_crash.dump *.ez +.credo.exs diff --git a/lib/document.ex b/lib/document.ex deleted file mode 100644 index 5016756..0000000 --- a/lib/document.ex +++ /dev/null @@ -1,58 +0,0 @@ -defmodule Readability.Document do - @default_options [retry_length: 250, - min_text_length: 25, - remove_unlikely_candidates: true, - weight_classes: true, - clean_conditionally: true, - remove_empty_nodes: true, - min_image_width: 130, - min_image_height: 80, - ignore_image_format: [], - blacklist: nil, - whitelist: nil - ] - - @regexes [ unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, - okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i, - positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, - negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i, - divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, - replaceBrsRe: ~r/(]*>[ \n\r\t]*){2,}/i, - replaceFontsRe: ~r/<(\/?)font[^>]*>/i, - trimRe: ~r/^\s+|\s+$/, - normalizeRe: ~r/\s{2,}/, - killBreaksRe: ~r/((\s| ?)*){1,}/, - videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i - ] - - def html do - page - |> String.replace(@regexes[:replaceBrsRe], "
</p><p>
") - |> String.replace(@regexes[:replaceFontsRe], "<\1span>") - |> Floki.find("html") - |> Floki.filter_out(:comment) - end - - def title do - html |> Floki.find("title") |> Floki.text - end - - def content do - html - |> Floki.filter_out("script") - |> Floki.filter_out("style") - end - - def page do - {:ok, f} = File.read("test/features/nytimes.html") - f - end - - def default_options do - @default_options - end - - def regexes do - @regexes - end -end diff --git a/lib/readability.ex b/lib/readability.ex index 87e2840..720f42e 100644 --- a/lib/readability.ex +++ b/lib/readability.ex @@ -1,10 +1,86 @@ defmodule Readability do + @moduledoc """ + """ + alias Readability.TitleFinder + alias Readability.ArticleBuilder + + @default_options [retry_length: 250, + min_text_length: 25, + remove_unlikely_candidates: true, + weight_classes: true, + clean_conditionally: true, + remove_empty_nodes: true, + min_image_width: 130, + min_image_height: 80, + ignore_image_format: [], + blacklist: nil, + whitelist: nil + ] + + @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, + ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i, + positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, + negative: ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i, + div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, + replace_brs: ~r/(]*>[ \n\r\t]*){2,}/i, + replace_fonts: ~r/<(\/?)font[^>]*>/i, + normalize: ~r/\s{2,}/, + video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i + ] @type html_tree :: tuple | list + @type options :: list - def title(html) when is_binary(html), do: parse(html) |> title + def title(html) when is_binary(html), do: html |> parse |> title def title(html_tree), do: TitleFinder.title(html_tree) - def parse(raw_html), do: Floki.parse(raw_html) + @doc """ + Using a variety of metrics (content score, classname, element types), find the content that is + most likely to be the stuff a user wants to read + """ + @spec content(binary, options) :: binary + def content(raw_html, opts \\ @default_options) do + opts = Keyword.merge(@default_options, opts) + raw_html + |> parse + |> ArticleBuilder.build(opts) + end + + @doc """ + Normalize and Parse to html tree(tuple or list)) from binary html + """ + @spec parse(binary) :: html_tree + def parse(raw_html) do + raw_html + |> String.replace(Readability.regexes[:replace_brs], "
</p><p>
") + |> String.replace(Readability.regexes[:replace_fonts], "<\1span>") + |> String.replace(Readability.regexes[:normalize], " ") + |> Floki.parse + |> Floki.filter_out(:comment) + end + + @doc """ + return raw html binary from html tree tuple + """ + @spec raw_html(html_tree) :: binary + def raw_html(html_tree) do + html_tree |> Floki.raw_html + end + + @doc """ + return only text binary from html tree tuple + """ + @spec raw_html(html_tree) :: binary + def readabl_text(html_tree) do + tags_to_br = ~r/<\/(p|div|article|h\d)/i + html_str = html_tree |> raw_html + Regex.replace(tags_to_br, html_str, &("\n#{&1}")) + |> Floki.parse + |> Floki.text + end + + def regexes, do: @regexes + + def default_options, do: @default_options end diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex new file mode 100644 index 0000000..e7b6254 --- /dev/null +++ b/lib/readability/article_builder.ex @@ -0,0 +1,100 @@ +defmodule Readability.ArticleBuilder do + @moduledoc """ + build article for readability + """ + + alias Readability.Helper + alias Readability.Sanitizer + alias Readability.Candidate + alias Readability.CandidateFinder + alias Readability.Candidate.Cleaner + alias Readability.Candidate.Scoring + + @type html_tree :: tuple | list + @type options :: list + + @doc """ + Prepare the article node for display. + Clean out any inline styles, iframes, forms, strip extraneous
<p>
tags, etc. + """ + @spec build(html_tree, options) :: html_tree + def build(html_tree, opts) do + origin_tree = html_tree + html_tree = html_tree + |> Helper.remove_tag(fn({tag, _, _}) -> + Enum.member?(["script", "style"], tag) + end) + + if opts[:remove_unlikely_candidates] do + html_tree = Cleaner.remove_unlikely_tree(html_tree) + end + html_tree = Cleaner.transform_misused_div_to_p(html_tree) + + candidates = CandidateFinder.find(html_tree, opts) + article = find_article(candidates, html_tree) + + html_tree = Sanitizer.sanitize(article, candidates, opts) + + if Helper.text_length(html_tree) < opts[:retry_length] do + if opts = next_try_opts(opts) do + build(origin_tree, opts) + else + html_tree + end + else + html_tree + end + end + + defp next_try_opts(opts) do + cond do + opts[:remove_unlikely_candidates] -> + Keyword.put(opts, :remove_unlikely_candidates, false) + opts[:weight_classes] -> + Keyword.put(opts, :weight_classes, false) + opts[:clean_conditionally] -> + Keyword.put(opts, :clean_conditionally, false) + true -> nil + end + end + + defp find_article(candidates, html_tree) do + best_candidate = CandidateFinder.find_best_candidate(candidates) + unless best_candidate do + tree = html_tree |> Floki.find("body") |> hd + best_candidate = %Candidate{html_tree: tree} + end + article_trees = find_article_trees(best_candidate, candidates) + {"div", [], article_trees} + end + + defp find_article_trees(best_candidate, candidates) do + score_threshold = Enum.max([10, best_candidate.score * 0.2]) + + candidates + |> Enum.filter(&(&1.tree_depth == best_candidate.tree_depth)) + |> Enum.filter_map(fn(candidate) -> + candidate == best_candidate + || candidate.score >= score_threshold + || append?(candidate) + end, &(to_article_tag(&1.html_tree))) + end + + defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do + link_density = Scoring.calc_link_density(html_tree) + inner_text = html_tree |> Floki.text + inner_length = inner_text |> String.length + + (inner_length > 80 && link_density < 0.25) + || (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/) + end + defp append?(_), do: false + + defp to_article_tag({tag, attrs, inner_tree} = html_tree) do + if tag =~ ~r/^p$|^div$/ do + html_tree + else + {"div", attrs, inner_tree} + end + end +end diff --git a/lib/readability/candidate.ex b/lib/readability/candidate.ex new file mode 100644 index 0000000..7655d37 --- /dev/null +++ b/lib/readability/candidate.ex @@ -0,0 +1,6 @@ +defmodule Readability.Candidate do + @moduledoc """ + Candidate can be article + """ + defstruct html_tree: {}, score: 0, tree_depth: 0 +end diff --git a/lib/readability/candidate/cleaner.ex b/lib/readability/candidate/cleaner.ex new file mode 100644 index 0000000..41c65aa --- /dev/null +++ b/lib/readability/candidate/cleaner.ex @@ -0,0 +1,48 @@ +defmodule Readability.Candidate.Cleaner do + @moduledoc """ + Clean html tree for prepare candidates. + It transforms misused tags and removes unlikely candidates. + """ + + alias Readability.Helper + + @type html_tree :: tuple | list + + @doc """ + Transform misused divs
<div>
s that do not contain other block elements into
<p>
s + """ + @spec transform_misused_div_to_p(html_tree) :: html_tree + def transform_misused_div_to_p(content) when is_binary(content), do: content + def transform_misused_div_to_p([]), do: [] + def transform_misused_div_to_p([h|t]) do + [transform_misused_div_to_p(h)|transform_misused_div_to_p(t)] + end + def transform_misused_div_to_p({tag, attrs, inner_tree}) do + if misused_divs?(tag, inner_tree), do: tag = "p" + {tag, attrs, transform_misused_div_to_p(inner_tree)} + end + + @doc """ + Remove unlikely html tree + """ + @spec remove_unlikely_tree(html_tree) :: html_tree + def remove_unlikely_tree(html_tree) do + Helper.remove_tag(html_tree, &unlikely_tree?(&1)) + end + + defp misused_divs?("div", inner_tree) do + !(Floki.raw_html(inner_tree) =~ Readability.regexes[:div_to_p_elements]) + end + defp misused_divs?(_, _), do: false + + defp unlikely_tree?({tag, attrs, _}) do + idclass_str = attrs + |> Enum.filter_map(&(elem(&1, 0) =~ ~r/id|class/i), &(elem(&1, 1))) + |> Enum.join("") + str = tag <> idclass_str + + str =~ Readability.regexes[:unlikely_candidate] + && !(str =~ Readability.regexes[:ok_maybe_its_a_candidate]) + && tag != "html" + end +end diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex new file mode 100644 index 0000000..ed9edbb --- /dev/null +++ b/lib/readability/candidate/scoring.ex @@ -0,0 +1,89 @@ +defmodule Readability.Candidate.Scoring do + @moduledoc """ + Score html tree + """ + alias Readability.Helper + + @element_scores %{"div" => 5, + "blockquote" => 3, + "form" => -3, + "th" => -5 + } + + @type html_tree :: tuple | list + @type options :: list + + @doc """ + Score html tree by some algorithm that check children nodes, attributes, link densities, etcs.. + options -> weight_classes :: boolean, calculate weight class + """ + @spec calc_score(html_tree, options) :: number + def calc_score(html_tree, opts \\ []) do + score = calc_node_score(html_tree, opts) + score = score + calc_children_content_score(html_tree) + calc_grand_children_content_score(html_tree) + score * (1 - calc_link_density(html_tree)) + end + + defp calc_content_score(html_tree) do + score = 1 + inner_text = html_tree |> Floki.text + split_score = inner_text |> String.split(",") |> length + length_score = [(String.length(inner_text) / 100), 3] |> Enum.min + score + split_score + length_score + end + + defp calc_node_score({tag, attrs, _}, opts) do + score = 0 + if opts[:weight_classes], do: score = score + class_weight(attrs) + score + (@element_scores[tag] || 0) + end + defp calc_node_score([h|t], opts) do + calc_node_score(h, opts) + calc_node_score(t, opts) + end + defp calc_node_score([], _), do: 0 + + def class_weight(attrs) do + weight = 0 + class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1) + id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1) + + if class =~ Readability.regexes[:positive], do: weight = weight + 25 + if id =~ Readability.regexes[:positive], do: weight = weight + 25 + if class =~ Readability.regexes[:negative], do: weight = weight - 25 + if id =~ Readability.regexes[:negative], do: weight = weight - 25 + + weight + end + + def calc_link_density(html_tree) do + link_length = html_tree + |> Floki.find("a") + |> Floki.text + |> String.length + + text_length = html_tree + |> Floki.text + |> String.length + + if text_length == 0 do + 0 + else + link_length / text_length + end + end + + defp calc_children_content_score({_, _, children_tree}) do + children_tree + |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1))) + |> 
calc_content_score + end + + defp calc_grand_children_content_score({_, _, children_tree}) do + score = children_tree + |> Enum.filter_map(&is_tuple(&1), &elem(&1, 2)) + |> List.flatten + |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1))) + |> calc_content_score + score / 2 + end +end diff --git a/lib/readability/candidate_finder.ex b/lib/readability/candidate_finder.ex new file mode 100644 index 0000000..4f4896a --- /dev/null +++ b/lib/readability/candidate_finder.ex @@ -0,0 +1,60 @@ +defmodule Readability.CandidateFinder do + @moduledoc """ + The builing and finding candidates engine + It traverses the HTML tree searching, removing, socring nodes + """ + + alias Readability.Helper + alias Readability.Candidate + alias Readability.Candidate.Scoring + + @type html_tree :: tuple | list + @type options :: list + + @doc """ + Find candidates that shuld be meaningful article by analysing nodes + """ + @spec find(html_tree, options, number) :: [Candidate.t] + def find(_, opts \\ [], tree_depth \\ 0) + def find([], _, _), do: [] + def find([h|t], opts, tree_depth) do + [find(h, opts, tree_depth) | find(t, opts, tree_depth)] + |> List.flatten + end + def find(text, _, _) when is_binary(text), do: [] + def find({tag, attrs, inner_tree}, opts, tree_depth) do + html_tree = {tag, attrs, inner_tree} + if candidate?(html_tree) do + candidate = %Candidate{html_tree: html_tree, + score: Scoring.calc_score(html_tree, opts), + tree_depth: tree_depth} + + [candidate | find(inner_tree, opts, tree_depth + 1)] + else + find(inner_tree, opts, tree_depth + 1) + end + end + + @doc """ + Find the highest score candidate. + """ + @spec find_best_candidate([Candidate.t]) :: Candidate.t + def find_best_candidate([]), do: nil + def find_best_candidate(candidates) do + candidates + |> Enum.max_by(fn(candidate) -> candidate.score end) + end + + defp candidate?(_, depth \\ 0) + defp candidate?(_, depth) when depth > 2, do: false + defp candidate?([h|t], depth), do: candidate?(h, depth) || candidate?(t, depth) + defp candidate?([], _), do: false + defp candidate?(text, _) when is_binary(text), do: false + defp candidate?({_, _, inner_tree} = html_tree, depth) do + if Helper.candidate_tag?(html_tree) do + true + else + candidate?(inner_tree, depth + 1) + end + end +end diff --git a/lib/readability/content_finder.ex b/lib/readability/content_finder.ex deleted file mode 100644 index 43c5991..0000000 --- a/lib/readability/content_finder.ex +++ /dev/null @@ -1,94 +0,0 @@ -defmodule Readability.ContentFinder do - @moduledoc """ - ContentFinder uses a variety of metrics for finding the content - that is most likely to be the stuff a user wants to read. - Then return it wrapped up in a div. 
- """ - - @regexes [ unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, - okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i, - positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, - negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i, - divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, - replaceBrsRe: ~r/(]*>[ \n\r\t]*){2,}/i, - replaceFontsRe: ~r/<(\/?)font[^>]*>/i, - trimRe: ~r/^\s+|\s+$/, - normalizeRe: ~r/\s{2,}/, - killBreaksRe: ~r/((\s| ?)*){1,}/, - videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i - ] - - @type html_tree :: tuple | list - - @spec content(html_tree) :: html_tree - - def content(html_tree, options \\ []) do - candidate = html_tree - |> preapre_cadidates - - best_candidate = candidate - |> select_best_candidate - - candidate - |> fix_relative_uris - end - - defp preapre_cadidates(html_tree) do - html_tree - |> Floki.filter_out("script") - |> Floki.filter_out("style") - |> remove_unlikely_candidates - |> transform_misused_divs_into_paragraphs - end - - @doc """ - Remove unlikely tag nodes - """ - - @spec remove_unlikely_candidates(html_tree) :: html_tree - - def remove_unlikely_candidates(content) when is_binary(content), do: content - def remove_unlikely_candidates([]), do: [] - def remove_unlikely_candidates([h|t]) do - case remove_unlikely_candidates(h) do - nil -> remove_unlikely_candidates(t) - html_tree -> [html_tree|remove_unlikely_candidates(t)] - end - end - def remove_unlikely_candidates({tag_name, attrs, inner_tree}) do - cond do - unlikely_candidate?(tag_name, attrs) -> nil - true -> {tag_name, attrs, remove_unlikely_candidates(inner_tree)} - end - end - defp unlikely_candidate?(tag_name, attrs) do - idclass_str = attrs - |> Enum.filter_map(fn(attr) -> elem(attr, 0) =~ ~r/id|class/i end, - fn(attr) -> elem(attr, 1) end) - |> Enum.join("") - str = tag_name <> idclass_str - str =~ @regexes[:unlikelyCandidatesRe] && !(str =~ @regexes[:okMaybeItsACandidateRe]) && tag_name != "html" - end - - def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content - def transform_misused_divs_into_paragraphs([]), do: [] - def transform_misused_divs_into_paragraphs([h|t]) do - [transform_misused_divs_into_paragraphs(h)|transform_misused_divs_into_paragraphs(t)] - end - def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree} = html_tree) do - if misused_divs?(tag_name, inner_tree), do: tag_name = "p" - {tag_name, attrs, transform_misused_divs_into_paragraphs(inner_tree)} - end - defp misused_divs?("div", inner_tree) do - !(Floki.raw_html(inner_tree) =~ @regexes[:divToPElementsRe]) - end - defp misused_divs?(_, _), do: false - - defp select_best_candidate(html_tree) do - html_tree - end - - defp fix_relative_uris(html_tree) do - html_tree - end -end diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index 3650cf6..9551da3 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -1,25 +1,93 @@ defmodule Readability.Helper do @moduledoc """ - Utilities + Helpers for parsing, updating, removing html tree """ @type html_tree :: tuple | list @doc """ - change existing tags by selector + Change existing tags by selector """ - @spec change_tag(html_tree, String.t, String.t) :: 
html_tree - + def change_tag(content, _, _) when is_binary(content), do: content + def change_tag([], _, _), do: [] + def change_tag([h|t], selector, tag) do + [change_tag(h, selector, tag)|change_tag(t, selector, tag)] + end def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do {tag, attrs, change_tag(inner_tree, tag_name, tag)} end def change_tag({tag_name, attrs, html_tree}, selector, tag) do {tag_name, attrs, change_tag(html_tree, selector, tag)} end - def change_tag([h|t], selector, tag) do - [change_tag(h, selector, tag)|change_tag(t, selector, tag)] + + @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree + def remove_attrs(content, _) when is_binary(content), do: content + def remove_attrs([], _), do: [] + def remove_attrs([h|t], t_attrs) do + [remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)] + end + def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do + reject_fun = fn(attr) -> attr end + cond do + is_binary(target_attr) -> + reject_fun = fn(attr) -> elem(attr, 0) == target_attr end + Regex.regex?(target_attr) -> + reject_fun = fn(attr) -> elem(attr, 0) =~ target_attr end + is_list(target_attr) -> + reject_fun = fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end + true -> nil + end + {tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)} + end + + + @doc """ + Remove tags + """ + @spec remove_tag(html_tree, fun) :: html_tree + def remove_tag(content, _) when is_binary(content), do: content + def remove_tag([], _), do: [] + def remove_tag([h|t], fun) do + node = remove_tag(h, fun) + if is_nil(node) do + remove_tag(t, fun) + else + [node|remove_tag(t, fun)] + end + end + def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do + if fun.(html_tree) do + nil + else + {tag, attrs, remove_tag(inner_tree, fun)} + end + end + + @doc """ + count only text length + """ + @spec text_length(html_tree) :: number + def text_length(html_tree) do + html_tree |> Floki.text |> String.strip |> String.length + end + + @doc """ + Check html_tree can be candidate or not. + """ + @spec candidate_tag?(html_tree) :: boolean + def candidate_tag?(html_tree) do + Enum.any?(candidates_selector, fn(selector) -> + Floki.Selector.match?(html_tree, selector) + && (text_length(html_tree)) >= Readability.default_options[:min_text_length] + end) + end + + defp candidates_selector do + ["p", "td"] + |> Enum.map(fn(s) -> + tokens = Floki.SelectorTokenizer.tokenize(s) + Floki.SelectorParser.parse(tokens) + end) end - def change_tag([], selector, tag), do: [] - def change_tag(content, selector, tag) when is_binary(content), do: content end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex new file mode 100644 index 0000000..b8eae7a --- /dev/null +++ b/lib/readability/sanitizer.ex @@ -0,0 +1,85 @@ +defmodule Readability.Sanitizer do + @moduledoc """ + Clean an element of all tags of type "tag" if they look fishy. + "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 
+ """ + + alias Readability.Helper + alias Readability.Candidate + alias Readability.Candidate.Scoring + + @type html_tree :: tuple | list + + @doc """ + Sanitizes article html tree + """ + @spec sanitize(html_tree, [Candidate.t], list) :: html_tree + def sanitize(html_tree, candidates, opts \\ []) do + html_tree = html_tree + |> Helper.remove_tag(&clean_headline_tag?(&1)) + |> Helper.remove_tag(&clean_unlikely_tag?(&1)) + |> Helper.remove_tag(&clean_empty_p?(&1)) + + if opts[:clean_conditionally] do + html_tree = html_tree + |> Helper.remove_tag(conditionally_cleaing_fn(candidates)) + end + + html_tree |> Helper.remove_attrs("style") + end + + defp conditionally_cleaing_fn(candidates) do + fn({tag, attrs, _} = tree) -> + if Enum.any?(["table", "ul", "div"], &(&1 == tag)) do + weight = Scoring.class_weight(attrs) + same_tree = candidates + |> Enum.find(%Candidate{}, &(&1.html_tree == tree)) + list? = tag == "ul" + cond do + weight + same_tree.score < 0 + -> true + + length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 -> + # If there are not very many commas, and the number of + # non-paragraph elements is more than paragraphs or other + # ominous signs, remove the element. + p_len = tree |> Floki.find("p") |> length + img_len = tree |> Floki.find("img") |> length + li_len = tree |> Floki.find("li") |> length + input_len = tree |> Floki.find("input") |> length + embed_len = tree + |> Floki.find("embed") + |> Enum.reject(&(&1 =~ Readability.regexes[:video])) + |> length + + link_density = Scoring.calc_link_density(tree) + conent_len = Helper.text_length(tree) + + img_len > p_len # too many image + || (!list? && li_len > p_len) # more
<li>s than <p>s + || input_len > (p_len / 3) # less than 3x <p>
    s than s + || (!list? && conent_len < Readability.regexes[:min_text_length] && img_len != 1) # too short a content length without a single image + || (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight}) + || (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight}) + || ((embed_len == 1 && conent_len < 75) || embed_len > 1) # s with too short a content length, or too many s + + true -> false + end + end + end + end + + defp clean_headline_tag?({tag, attrs, _} = html_tree) do + tag =~ ~r/^h\d{1}$/ + && (Scoring.class_weight(attrs) < 0 || Scoring.calc_link_density(html_tree) > 0.33) + end + + defp clean_unlikely_tag?({tag, attrs, _}) do + attrs_str = attrs |> Enum.map(&(elem(&1, 1))) |> Enum.join("") + tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes[:video]) + end + + defp clean_empty_p?({tag, _, _} = html_tree) do + tag == "p" && Helper.text_length(html_tree) == 0 + end +end diff --git a/lib/readability/title_finder.ex b/lib/readability/title_finder.ex index ce26a17..7ebd8a2 100644 --- a/lib/readability/title_finder.ex +++ b/lib/readability/title_finder.ex @@ -11,9 +11,7 @@ defmodule Readability.TitleFinder do @doc """ Find proper title """ - @spec title(html_tree) :: binary - def title(html_tree) do maybe_title = tag_title(html_tree) if length(String.split(maybe_title, " ")) <= 4 do @@ -25,42 +23,36 @@ defmodule Readability.TitleFinder do @doc """ Find title from title tag """ - @spec tag_title(html_tree) :: binary - def tag_title(html_tree) do html_tree |> Floki.find("title") - |> to_clean_text + |> clean_title end @doc """ Find title from og:title property of meta tag """ - @spec og_title(html_tree) :: binary - def og_title(html_tree) do html_tree |> Floki.find("meta[property=og:title]") |> Floki.attribute("content") - |> to_clean_text + |> clean_title end @doc """ Find title from h tag """ - @spec h_tag_title(html_tree, String.t) :: binary - - def h_tag_title(html_tree, selector \\@h_tag_selector) do + def h_tag_title(html_tree, selector \\ @h_tag_selector) do html_tree |> Floki.find(selector) |> hd - |> to_clean_text + |> clean_title end - defp to_clean_text(html_tree) do + defp clean_title(html_tree) do title_text = html_tree |> Floki.text |> String.split(@title_suffix) diff --git a/lib/test.js b/lib/test.js deleted file mode 100644 index 53b4051..0000000 --- a/lib/test.js +++ /dev/null @@ -1,1835 +0,0 @@ -/*eslint-env es6:false*/ -/* - * Copyright (c) 2010 Arc90 Inc - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * This code is heavily based on Arc90's readability.js (1.7.1) script - * available at: http://code.google.com/p/arc90labs-readability - */ -var root = this; - -/** - * Public constructor. - * @param {Object} uri The URI descriptor object. - * @param {HTMLDocument} doc The document to parse. - * @param {Object} options The options object. 
- */ -var Readability = function(uri, doc, options) { - options = options || {}; - - this._uri = uri; - this._doc = doc; - this._biggestFrame = false; - this._articleByline = null; - this._articleDir = null; - - // Configureable options - this._debug = !!options.debug; - this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; - this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; - this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES; - - // Start with all flags set - this._flags = this.FLAG_STRIP_UNLIKELYS | - this.FLAG_WEIGHT_CLASSES | - this.FLAG_CLEAN_CONDITIONALLY; - - // The list of pages we've parsed in this call of readability, - // for autopaging. As a key store for easier searching. - this._parsedPages = {}; - - // A list of the ETag headers of pages we've parsed, in case they happen to match, - // we'll know it's a duplicate. - this._pageETags = {}; - - // Make an AJAX request for each page and append it to the document. - this._curPageNum = 1; - - var logEl; - - // Control whether log messages are sent to the console - if (this._debug) { - logEl = function(e) { - var rv = e.nodeName + " "; - if (e.nodeType == e.TEXT_NODE) { - return rv + '("' + e.textContent + '")'; - } - var classDesc = e.className && ("." + e.className.replace(/ /g, ".")); - var elDesc = ""; - if (e.id) - elDesc = "(#" + e.id + classDesc + ")"; - else if (classDesc) - elDesc = "(" + classDesc + ")"; - return rv + elDesc; - }; - this.log = function () { - if ("dump" in root) { - var msg = Array.prototype.map.call(arguments, function(x) { - return (x && x.nodeName) ? logEl(x) : x; - }).join(" "); - dump("Reader: (Readability) " + msg + "\n"); - } else if ("console" in root) { - var args = ["Reader: (Readability) "].concat(arguments); - console.log.apply(console, args); - } - }; - } else { - this.log = function () {}; - } -} - -Readability.prototype = { - FLAG_STRIP_UNLIKELYS: 0x1, - FLAG_WEIGHT_CLASSES: 0x2, - FLAG_CLEAN_CONDITIONALLY: 0x4, - - // Max number of nodes supported by this parser. Default: 0 (no limit) - DEFAULT_MAX_ELEMS_TO_PARSE: 0, - - // The number of top candidates to consider when analysing how - // tight the competition is among candidates. - DEFAULT_N_TOP_CANDIDATES: 5, - - // The maximum number of pages to loop through before we call - // it quits and just show a link. - DEFAULT_MAX_PAGES: 5, - - // Element tags to score by default. - DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), - - // All of the regular expressions in use within readability. - // Defined up here so we don't instantiate them repeatedly in loops. 
- REGEXPS: { - unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i, - okMaybeItsACandidate: /and|article|body|column|main|shadow/i, - positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, - negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, - extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, - byline: /byline|author|dateline|writtenby/i, - replaceFonts: /<(\/?)font[^>]*>/gi, - normalize: /\s{2,}/g, - videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i, - nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, - prevLink: /(prev|earl|old|new|<|«)/i, - whitespace: /^\s*$/, - hasContent: /\S$/, - }, - - DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], - - ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], - - /** - * Run any post-process modifications to article content as necessary. - * - * @param Element - * @return void - **/ - _postProcessContent: function(articleContent) { - // Readability cannot open relative uris so we convert them to absolute uris. - this._fixRelativeUris(articleContent); - }, - - /** - * Iterate over a NodeList, which doesn't natively fully implement the Array - * interface. - * - * For convenience, the current object context is applied to the provided - * iterate function. - * - * @param NodeList nodeList The NodeList. - * @param Function fn The iterate function. - * @return void - */ - _forEachNode: function(nodeList, fn) { - return Array.prototype.forEach.call(nodeList, fn, this); - }, - - /** - * Iterate over a NodeList, return true if any of the provided iterate - * function calls returns true, false otherwise. - * - * For convenience, the current object context is applied to the - * provided iterate function. - * - * @param NodeList nodeList The NodeList. - * @param Function fn The iterate function. - * @return Boolean - */ - _someNode: function(nodeList, fn) { - return Array.prototype.some.call(nodeList, fn, this); - }, - - /** - * Concat all nodelists passed as arguments. - * - * @return ...NodeList - * @return Array - */ - _concatNodeLists: function() { - var slice = Array.prototype.slice; - var args = slice.call(arguments); - var nodeLists = args.map(function(list) { - return slice.call(list); - }); - return Array.prototype.concat.apply([], nodeLists); - }, - - _getAllNodesWithTag: function(node, tagNames) { - if (node.querySelectorAll) { - return node.querySelectorAll(tagNames.join(',')); - } - return [].concat.apply([], tagNames.map(function(tag) { - return node.getElementsByTagName(tag); - })); - }, - - /** - * Converts each and uri in the given element to an absolute URI, - * ignoring #ref URIs. - * - * @param Element - * @return void - */ - _fixRelativeUris: function(articleContent) { - var scheme = this._uri.scheme; - var prePath = this._uri.prePath; - var pathBase = this._uri.pathBase; - - function toAbsoluteURI(uri) { - // If this is already an absolute URI, return it. - if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri)) - return uri; - - // Scheme-rooted relative URI. 
- if (uri.substr(0, 2) == "//") - return scheme + "://" + uri.substr(2); - - // Prepath-rooted relative URI. - if (uri[0] == "/") - return prePath + uri; - - // Dotslash relative URI. - if (uri.indexOf("./") === 0) - return pathBase + uri.slice(2); - - // Ignore hash URIs: - if (uri[0] == "#") - return uri; - - // Standard relative URI; add entire path. pathBase already includes a - // trailing "/". - return pathBase + uri; - } - - var links = articleContent.getElementsByTagName("a"); - this._forEachNode(links, function(link) { - var href = link.getAttribute("href"); - if (href) { - // Replace links with javascript: URIs with text content, since - // they won't work after scripts have been removed from the page. - if (href.indexOf("javascript:") === 0) { - var text = this._doc.createTextNode(link.textContent); - link.parentNode.replaceChild(text, link); - } else { - link.setAttribute("href", toAbsoluteURI(href)); - } - } - }); - - var imgs = articleContent.getElementsByTagName("img"); - this._forEachNode(imgs, function(img) { - var src = img.getAttribute("src"); - if (src) { - img.setAttribute("src", toAbsoluteURI(src)); - } - }); - }, - - /** - * Get the article title as an H1. - * - * @return void - **/ - _getArticleTitle: function() { - var doc = this._doc; - var curTitle = ""; - var origTitle = ""; - - try { - curTitle = origTitle = doc.title; - - // If they had an element with id "title" in their HTML - if (typeof curTitle !== "string") - curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]); - } catch(e) {} - - if (curTitle.match(/ [\|\-] /)) { - curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); - - if (curTitle.split(' ').length < 3) - curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); - } else if (curTitle.indexOf(': ') !== -1) { - // Check if we have an heading containing this exact string, so we - // could assume it's the full title. - var headings = this._concatNodeLists( - doc.getElementsByTagName('h1'), - doc.getElementsByTagName('h2') - ); - var match = this._someNode(headings, function(heading) { - return heading.textContent === curTitle; - }); - - // If we don't, let's extract the title out of the original title string. - if (!match) { - curTitle = origTitle.- substring(origTitle.lastIndexOf(':') + 1); - - // If the title is now too short, try the first colon instead: - if (curTitle.split(' ').length < 3) - curTitle = origTitle.substring(origTitle.indexOf(':') + 1); - } - } else if (curTitle.length > 150 || curTitle.length < 15) { - var hOnes = doc.getElementsByTagName('h1'); - - if (hOnes.length === 1) - curTitle = this._getInnerText(hOnes[0]); - } - - curTitle = curTitle.trim(); - - if (curTitle.split(' ').length <= 4) - curTitle = origTitle; - - return curTitle; - }, - - /** - * Prepare the HTML document for readability to scrape it. - * This includes things like stripping javascript, CSS, and handling terrible markup. - * - * @return void - **/ - _prepDocument: function() { - var doc = this._doc; - - // Remove all style tags in head - this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) { - styleNode.parentNode.removeChild(styleNode); - }); - - if (doc.body) { - this._replaceBrs(doc.body); - } - - this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) { - this._setNodeTag(fontNode, "SPAN"); - }); - }, - - /** - * Finds the next element, starting from the given node, and ignoring - * whitespace in between. If the given node is an element, the same node is - * returned. 
- */ - _nextElement: function (node) { - var next = node; - while (next - && (next.nodeType != Node.ELEMENT_NODE) - && this.REGEXPS.whitespace.test(next.textContent)) { - next = next.nextSibling; - } - return next; - }, - - /** - * Replaces 2 or more successive
<br> elements with a single <p>. - * Whitespace between <br> elements are ignored. For example: - *   <div>foo<br>bar<br> <br><br>abc</div> - * will become: - *   <div>foo<br>bar<p>abc</p></div>
    - */ - _replaceBrs: function (elem) { - this._forEachNode(elem.getElementsByTagName("br"), function(br) { - var next = br.nextSibling; - - // Whether 2 or more
<br> elements have been found and replaced with a - // <p> block. - var replaced = false; - - // If we find a <br> chain, remove the <br>s until we hit another element - // or non-whitespace. This leaves behind the first <br> in the chain - // (which will be replaced with a <p>
    later). - while ((next = this._nextElement(next)) && (next.tagName == "BR")) { - replaced = true; - var brSibling = next.nextSibling; - next.parentNode.removeChild(next); - next = brSibling; - } - - // If we removed a
<br> chain, replace the remaining <br> with a <p>
    . Add - // all sibling nodes as children of the
<p> until we hit another <br>
    - // chain. - if (replaced) { - var p = this._doc.createElement("p"); - br.parentNode.replaceChild(p, br); - - next = p.nextSibling; - while (next) { - // If we've hit another
<br><br>, we're done adding children to this <p>
    . - if (next.tagName == "BR") { - var nextElem = this._nextElement(next); - if (nextElem && nextElem.tagName == "BR") - break; - } - - // Otherwise, make this node a child of the new
<p>
    . - var sibling = next.nextSibling; - p.appendChild(next); - next = sibling; - } - } - }); - }, - - _setNodeTag: function (node, tag) { - this.log("_setNodeTag", node, tag); - if (node.__JSDOMParser__) { - node.localName = tag.toLowerCase(); - node.tagName = tag.toUpperCase(); - return node; - } - - var replacement = node.ownerDocument.createElement(tag); - while (node.firstChild) { - replacement.appendChild(node.firstChild); - } - node.parentNode.replaceChild(replacement, node); - if (node.readability) - replacement.readability = node.readability; - - for (var i = 0; i < node.attributes.length; i++) { - replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); - } - return replacement; - }, - - /** - * Prepare the article node for display. Clean out any inline styles, - * iframes, forms, strip extraneous
<p>
    tags, etc. - * - * @param Element - * @return void - **/ - _prepArticle: function(articleContent) { - this._cleanStyles(articleContent); - - // Clean out junk from the article content - this._cleanConditionally(articleContent, "form"); - this._clean(articleContent, "object"); - this._clean(articleContent, "embed"); - this._clean(articleContent, "h1"); - this._clean(articleContent, "footer"); - - // If there is only one h2, they are probably using it as a header - // and not a subheader, so remove it since we already have a header. - if (articleContent.getElementsByTagName('h2').length === 1) - this._clean(articleContent, "h2"); - - this._clean(articleContent, "iframe"); - this._cleanHeaders(articleContent); - - // Do these last as the previous stuff may have removed junk - // that will affect these - this._cleanConditionally(articleContent, "table"); - this._cleanConditionally(articleContent, "ul"); - this._cleanConditionally(articleContent, "div"); - - // Remove extra paragraphs - this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) { - var imgCount = paragraph.getElementsByTagName('img').length; - var embedCount = paragraph.getElementsByTagName('embed').length; - var objectCount = paragraph.getElementsByTagName('object').length; - // At this point, nasty iframes have been removed, only remain embedded video ones. - var iframeCount = paragraph.getElementsByTagName('iframe').length; - var totalCount = imgCount + embedCount + objectCount + iframeCount; - - if (totalCount === 0 && !this._getInnerText(paragraph, false)) - paragraph.parentNode.removeChild(paragraph); - }); - - this._forEachNode(articleContent.getElementsByTagName("br"), function(br) { - var next = this._nextElement(br.nextSibling); - if (next && next.tagName == "P") - br.parentNode.removeChild(br); - }); - }, - - /** - * Initialize a node with the readability object. Also checks the - * className/id for special names to add to its score. - * - * @param Element - * @return void - **/ - _initializeNode: function(node) { - node.readability = {"contentScore": 0}; - - switch(node.tagName) { - case 'DIV': - node.readability.contentScore += 5; - break; - - case 'PRE': - case 'TD': - case 'BLOCKQUOTE': - node.readability.contentScore += 3; - break; - - case 'ADDRESS': - case 'OL': - case 'UL': - case 'DL': - case 'DD': - case 'DT': - case 'LI': - case 'FORM': - node.readability.contentScore -= 3; - break; - - case 'H1': - case 'H2': - case 'H3': - case 'H4': - case 'H5': - case 'H6': - case 'TH': - node.readability.contentScore -= 5; - break; - } - - node.readability.contentScore += this._getClassWeight(node); - }, - - _removeAndGetNext: function(node) { - var nextNode = this._getNextNode(node, true); - node.parentNode.removeChild(node); - return nextNode; - }, - - /** - * Traverse the DOM from node to node, starting at the node passed in. - * Pass true for the second parameter to indicate this node itself - * (and its kids) are going away, and we want the next node over. - * - * Calling this in a loop will traverse the DOM depth-first. - */ - _getNextNode: function(node, ignoreSelfAndKids) { - // First check for kids if those aren't being ignored - if (!ignoreSelfAndKids && node.firstElementChild) { - return node.firstElementChild; - } - // Then for siblings... - if (node.nextElementSibling) { - return node.nextElementSibling; - } - // And finally, move up the parent chain *and* find a sibling - // (because this is depth-first traversal, we will have already - // seen the parent nodes themselves). 
- do { - node = node.parentNode; - } while (node && !node.nextElementSibling); - return node && node.nextElementSibling; - }, - - /** - * Like _getNextNode, but for DOM implementations with no - * firstElementChild/nextElementSibling functionality... - */ - _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) { - function nextSiblingEl(n) { - do { - n = n.nextSibling; - } while (n && n.nodeType !== n.ELEMENT_NODE); - return n; - } - // First check for kids if those aren't being ignored - if (!ignoreSelfAndKids && node.children[0]) { - return node.children[0]; - } - // Then for siblings... - var next = nextSiblingEl(node); - if (next) { - return next; - } - // And finally, move up the parent chain *and* find a sibling - // (because this is depth-first traversal, we will have already - // seen the parent nodes themselves). - do { - node = node.parentNode; - if (node) - next = nextSiblingEl(node); - } while (node && !next); - return node && next; - }, - - _checkByline: function(node, matchString) { - if (this._articleByline) { - return false; - } - - if (node.getAttribute !== undefined) { - var rel = node.getAttribute("rel"); - } - - if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { - this._articleByline = node.textContent.trim(); - return true; - } - - return false; - }, - - _getNodeAncestors: function(node, maxDepth) { - maxDepth = maxDepth || 0; - var i = 0, ancestors = []; - while (node.parentNode) { - ancestors.push(node.parentNode) - if (maxDepth && ++i === maxDepth) - break; - node = node.parentNode; - } - return ancestors; - }, - - /*** - * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is - * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. - * - * @param page a document to run upon. Needs to be a full document, complete with body. - * @return Element - **/ - _grabArticle: function (page) { - this.log("**** grabArticle ****"); - var doc = this._doc; - var isPaging = (page !== null ? true: false); - page = page ? page : this._doc.body; - - // We can't grab an article if we don't have a page! - if (!page) { - this.log("No body found in document. Abort."); - return null; - } - - var pageCacheHtml = page.innerHTML; - - // Check if any "dir" is set on the toplevel document element - this._articleDir = doc.documentElement.getAttribute("dir"); - - while (true) { - var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); - - // First, node prepping. Trash nodes that look cruddy (like ones with the - // class name "comment", etc), and turn divs into P tags where they have been - // used inappropriately (as in, where they contain no other block level elements.) - var elementsToScore = []; - var node = this._doc.documentElement; - - while (node) { - var matchString = node.className + " " + node.id; - - // Check to see if this node is a byline, and remove it if it is. 
- if (this._checkByline(node, matchString)) { - node = this._removeAndGetNext(node); - continue; - } - - // Remove unlikely candidates - if (stripUnlikelyCandidates) { - if (this.REGEXPS.unlikelyCandidates.test(matchString) && - !this.REGEXPS.okMaybeItsACandidate.test(matchString) && - node.tagName !== "BODY" && - node.tagName !== "A") { - this.log("Removing unlikely candidate - " + matchString); - node = this._removeAndGetNext(node); - continue; - } - } - - if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { - elementsToScore.push(node); - } - - // Turn all divs that don't have children block level elements into p's - if (node.tagName === "DIV") { - // Sites like http://mobile.slate.com encloses each paragraph with a DIV - // element. DIVs with only a P element inside and no text content can be - // safely converted into plain P elements to avoid confusing the scoring - // algorithm with DIVs with are, in practice, paragraphs. - if (this._hasSinglePInsideElement(node)) { - var newNode = node.children[0]; - node.parentNode.replaceChild(newNode, node); - node = newNode; - } else if (!this._hasChildBlockElement(node)) { - node = this._setNodeTag(node, "P"); - elementsToScore.push(node); - } else { - // EXPERIMENTAL - this._forEachNode(node.childNodes, function(childNode) { - if (childNode.nodeType === Node.TEXT_NODE) { - var p = doc.createElement('p'); - p.textContent = childNode.textContent; - p.style.display = 'inline'; - p.className = 'readability-styled'; - node.replaceChild(p, childNode); - } - }); - } - } - node = this._getNextNode(node); - } - - /** - * Loop through all paragraphs, and assign a score to them based on how content-y they look. - * Then add their score to their parent node. - * - * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - **/ - var candidates = []; - this._forEachNode(elementsToScore, function(elementToScore) { - if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined') - return; - - // If this paragraph is less than 25 characters, don't even count it. - var innerText = this._getInnerText(elementToScore); - if (innerText.length < 25) - return; - - // Exclude nodes with no ancestor. - var ancestors = this._getNodeAncestors(elementToScore, 3); - if (ancestors.length === 0) - return; - - var contentScore = 0; - - // Add a point for the paragraph itself as a base. - contentScore += 1; - - // Add points for any commas within this paragraph. - contentScore += innerText.split(',').length; - - // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += Math.min(Math.floor(innerText.length / 100), 3); - - // Initialize and score ancestors. - this._forEachNode(ancestors, function(ancestor, level) { - if (!ancestor.tagName) - return; - - if (typeof(ancestor.readability) === 'undefined') { - this._initializeNode(ancestor); - candidates.push(ancestor); - } - - // Node score divider: - // - parent: 1 (no division) - // - grandparent: 2 - // - great grandparent+: ancestor level * 3 - if (level === 0) - var scoreDivider = 1; - else if (level === 1) - scoreDivider = 2; - else - scoreDivider = level * 3; - ancestor.readability.contentScore += contentScore / scoreDivider; - }); - }); - - // After we've calculated scores, loop through all of the possible - // candidate nodes we found and find the one with the highest score. 
- var topCandidates = []; - for (var c = 0, cl = candidates.length; c < cl; c += 1) { - var candidate = candidates[c]; - - // Scale the final candidates score based on link density. Good content - // should have a relatively small link density (5% or less) and be mostly - // unaffected by this operation. - var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); - candidate.readability.contentScore = candidateScore; - - this.log('Candidate:', candidate, "with score " + candidateScore); - - for (var t = 0; t < this._nbTopCandidates; t++) { - var aTopCandidate = topCandidates[t]; - - if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { - topCandidates.splice(t, 0, candidate); - if (topCandidates.length > this._nbTopCandidates) - topCandidates.pop(); - break; - } - } - } - - var topCandidate = topCandidates[0] || null; - var neededToCreateTopCandidate = false; - - // If we still have no top candidate, just use the body as a last resort. - // We also have to copy the body node so it is something we can modify. - if (topCandidate === null || topCandidate.tagName === "BODY") { - // Move all of the page's children into topCandidate - topCandidate = doc.createElement("DIV"); - neededToCreateTopCandidate = true; - // Move everything (not just elements, also text nodes etc.) into the container - // so we even include text directly in the body: - var kids = page.childNodes; - while (kids.length) { - this.log("Moving child out:", kids[0]); - topCandidate.appendChild(kids[0]); - } - - page.appendChild(topCandidate); - - this._initializeNode(topCandidate); - } else if (topCandidate) { - // Because of our bonus system, parents of candidates might have scores - // themselves. They get half of the node. There won't be nodes with higher - // scores than our topCandidate, but if we see the score going *up* in the first - // few steps up the tree, that's a decent sign that there might be more content - // lurking in other places that we want to unify in. The sibling stuff - // below does some of that - but only if we've looked high enough up the DOM - // tree. - var parentOfTopCandidate = topCandidate.parentNode; - var lastScore = topCandidate.readability.contentScore; - // The scores shouldn't get too low. - var scoreThreshold = lastScore / 3; - while (parentOfTopCandidate && parentOfTopCandidate.readability) { - var parentScore = parentOfTopCandidate.readability.contentScore; - if (parentScore < scoreThreshold) - break; - if (parentScore > lastScore) { - // Alright! We found a better parent to use. - topCandidate = parentOfTopCandidate; - break; - } - lastScore = parentOfTopCandidate.readability.contentScore; - parentOfTopCandidate = parentOfTopCandidate.parentNode; - } - } - - // Now that we have the top candidate, look through its siblings for content - // that might also be related. Things like preambles, content split by ads - // that we removed, etc. - var articleContent = doc.createElement("DIV"); - if (isPaging) - articleContent.id = "readability-content"; - - var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); - var siblings = topCandidate.parentNode.children; - - for (var s = 0, sl = siblings.length; s < sl; s++) { - var sibling = siblings[s]; - var append = false; - - this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ''); - this.log("Sibling has score", sibling.readability ? 
sibling.readability.contentScore : 'Unknown'); - - if (sibling === topCandidate) { - append = true; - } else { - var contentBonus = 0; - - // Give a bonus if sibling nodes and top candidates have the example same classname - if (sibling.className === topCandidate.className && topCandidate.className !== "") - contentBonus += topCandidate.readability.contentScore * 0.2; - - if (sibling.readability && - ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { - append = true; - } else if (sibling.nodeName === "P") { - var linkDensity = this._getLinkDensity(sibling); - var nodeContent = this._getInnerText(sibling); - var nodeLength = nodeContent.length; - - if (nodeLength > 80 && linkDensity < 0.25) { - append = true; - } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && - nodeContent.search(/\.( |$)/) !== -1) { - append = true; - } - } - } - - if (append) { - this.log("Appending node:", sibling); - - if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { - // We have a node that isn't a common block level element, like a form or td tag. - // Turn it into a div so it doesn't get filtered out later by accident. - this.log("Altering sibling:", sibling, 'to div.'); - - sibling = this._setNodeTag(sibling, "DIV"); - } - - articleContent.appendChild(sibling); - // siblings is a reference to the children array, and - // sibling is removed from the array when we call appendChild(). - // As a result, we must revisit this index since the nodes - // have been shifted. - s -= 1; - sl -= 1; - } - } - - if (this._debug) - this.log("Article content pre-prep: " + articleContent.innerHTML); - // So we have all of the content that we need. Now we clean it up for presentation. - this._prepArticle(articleContent); - if (this._debug) - this.log("Article content post-prep: " + articleContent.innerHTML); - - if (this._curPageNum === 1) { - if (neededToCreateTopCandidate) { - // We already created a fake div thing, and there wouldn't have been any siblings left - // for the previous loop, so there's no point trying to create a new div, and then - // move all the children over. Just assign IDs and class names here. No need to append - // because that already happened anyway. - topCandidate.id = "readability-page-1"; - topCandidate.className = "page"; - } else { - var div = doc.createElement("DIV"); - div.id = "readability-page-1"; - div.className = "page"; - var children = articleContent.childNodes; - while (children.length) { - div.appendChild(children[0]); - } - articleContent.appendChild(div); - } - } - - if (this._debug) - this.log("Article content after paging: " + articleContent.innerHTML); - - // Now that we've gone through the full algorithm, check to see if - // we got any meaningful content. If we didn't, we may need to re-run - // grabArticle with different flags set. This gives us a higher likelihood of - // finding the content, and the sieve approach gives us a higher likelihood of - // finding the -right- content. 
- if (this._getInnerText(articleContent, true).length < 500) { - page.innerHTML = pageCacheHtml; - - if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { - this._removeFlag(this.FLAG_STRIP_UNLIKELYS); - } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { - this._removeFlag(this.FLAG_WEIGHT_CLASSES); - } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { - this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); - } else { - return null; - } - } else { - return articleContent; - } - } - }, - - /** - * Check whether the input string could be a byline. - * This verifies that the input is a string, and that the length - * is less than 100 chars. - * - * @param possibleByline {string} - a string to check whether its a byline. - * @return Boolean - whether the input string is a byline. - */ - _isValidByline: function(byline) { - if (typeof byline == 'string' || byline instanceof String) { - byline = byline.trim(); - return (byline.length > 0) && (byline.length < 100); - } - return false; - }, - - /** - * Attempts to get excerpt and byline metadata for the article. - * - * @return Object with optional "excerpt" and "byline" properties - */ - _getArticleMetadata: function() { - var metadata = {}; - var values = {}; - var metaElements = this._doc.getElementsByTagName("meta"); - - // Match "description", or Twitter's "twitter:description" (Cards) - // in name attribute. - var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi; - - // Match Facebook's Open Graph title & description properties. - var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi; - - // Find description tags. - this._forEachNode(metaElements, function(element) { - var elementName = element.getAttribute("name"); - var elementProperty = element.getAttribute("property"); - - if ([elementName, elementProperty].indexOf("author") !== -1) { - metadata.byline = element.getAttribute("content"); - return; - } - - var name = null; - if (namePattern.test(elementName)) { - name = elementName; - } else if (propertyPattern.test(elementProperty)) { - name = elementProperty; - } - - if (name) { - var content = element.getAttribute("content"); - if (content) { - // Convert to lowercase and remove any whitespace - // so we can match below. - name = name.toLowerCase().replace(/\s/g, ''); - values[name] = content.trim(); - } - } - }); - - if ("description" in values) { - metadata.excerpt = values["description"]; - } else if ("og:description" in values) { - // Use facebook open graph description. - metadata.excerpt = values["og:description"]; - } else if ("twitter:description" in values) { - // Use twitter cards description. - metadata.excerpt = values["twitter:description"]; - } - - if ("og:title" in values) { - // Use facebook open graph title. - metadata.title = values["og:title"]; - } else if ("twitter:title" in values) { - // Use twitter cards title. - metadata.title = values["twitter:title"]; - } - - return metadata; - }, - - /** - * Removes script tags from the document. 
- * - * @param Element - **/ - _removeScripts: function(doc) { - this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) { - scriptNode.nodeValue = ""; - scriptNode.removeAttribute('src'); - - if (scriptNode.parentNode) - scriptNode.parentNode.removeChild(scriptNode); - }); - this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) { - if (noscriptNode.parentNode) - noscriptNode.parentNode.removeChild(noscriptNode); - }); - }, - - /** - * Check if this node has only whitespace and a single P element - * Returns false if the DIV node contains non-empty text nodes - * or if it contains no P or more than 1 element. - * - * @param Element - **/ - _hasSinglePInsideElement: function(element) { - // There should be exactly 1 element child which is a P: - if (element.children.length != 1 || element.children[0].tagName !== "P") { - return false; - } - - // And there should be no text nodes with real content - return !this._someNode(element.childNodes, function(node) { - return node.nodeType === Node.TEXT_NODE && - this.REGEXPS.hasContent.test(node.textContent); - }); - }, - - /** - * Determine whether element has any children block level elements. - * - * @param Element - */ - _hasChildBlockElement: function (element) { - return this._someNode(element.childNodes, function(node) { - return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 || - this._hasChildBlockElement(node); - }); - }, - - /** - * Get the inner text of a node - cross browser compatibly. - * This also strips out any excess whitespace to be found. - * - * @param Element - * @param Boolean normalizeSpaces (default: true) - * @return string - **/ - _getInnerText: function(e, normalizeSpaces) { - normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; - var textContent = e.textContent.trim(); - - if (normalizeSpaces) { - return textContent.replace(this.REGEXPS.normalize, " "); - } else { - return textContent; - } - }, - - /** - * Get the number of times a string s appears in the node e. - * - * @param Element - * @param string - what to split on. Default is "," - * @return number (integer) - **/ - _getCharCount: function(e,s) { - s = s || ","; - return this._getInnerText(e).split(s).length - 1; - }, - - /** - * Remove the style attribute on every e and under. - * TODO: Test if getElementsByTagName(*) is faster. - * - * @param Element - * @return void - **/ - _cleanStyles: function(e) { - e = e || this._doc; - if (!e) - return; - var cur = e.firstChild; - - // Remove any root styles, if we're able. - if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') - e.removeAttribute('style'); - - // Go until there are no more child nodes - while (cur !== null) { - if (cur.nodeType === cur.ELEMENT_NODE) { - // Remove style attribute(s) : - if (cur.className !== "readability-styled") - cur.removeAttribute("style"); - - this._cleanStyles(cur); - } - - cur = cur.nextSibling; - } - }, - - /** - * Get the density of links as a percentage of the content - * This is the amount of text that is inside a link divided by the total text in the node. - * - * @param Element - * @return number (float) - **/ - _getLinkDensity: function(element) { - var textLength = this._getInnerText(element).length; - if (textLength === 0) - return 0; - - var linkLength = 0; - - // XXX implement _reduceNodeList? 
- this._forEachNode(element.getElementsByTagName("a"), function(linkNode) { - linkLength += this._getInnerText(linkNode).length; - }); - - return linkLength / textLength; - }, - - /** - * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. - * - * @author Dan Lacy - * @return string the base url - **/ - _findBaseUrl: function() { - var uri = this._uri; - var noUrlParams = uri.path.split("?")[0]; - var urlSlashes = noUrlParams.split("/").reverse(); - var cleanedSegments = []; - var possibleType = ""; - - for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) { - var segment = urlSlashes[i]; - - // Split off and save anything that looks like a file type. - if (segment.indexOf(".") !== -1) { - possibleType = segment.split(".")[1]; - - // If the type isn't alpha-only, it's probably not actually a file extension. - if (!possibleType.match(/[^a-zA-Z]/)) - segment = segment.split(".")[0]; - } - - // EW-CMS specific segment replacement. Ugly. - // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html - if (segment.indexOf(',00') !== -1) - segment = segment.replace(',00', ''); - - // If our first or second segment has anything looking like a page number, remove it. - if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) - segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); - - var del = false; - - // If this is purely a number, and it's the first or second segment, - // it's probably a page number. Remove it. - if (i < 2 && segment.match(/^\d{1,2}$/)) - del = true; - - // If this is the first segment and it's just "index", remove it. - if (i === 0 && segment.toLowerCase() === "index") - del = true; - - // If our first or second segment is smaller than 3 characters, - // and the first segment was purely alphas, remove it. - if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) - del = true; - - // If it's not marked for deletion, push it to cleanedSegments. - if (!del) - cleanedSegments.push(segment); - } - - // This is our final, cleaned, base article URL. - return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/"); - }, - - /** - * Look for any paging links that may occur within the document. - * - * @param body - * @return object (array) - **/ - _findNextPageLink: function(elem) { - var uri = this._uri; - var possiblePages = {}; - var allLinks = elem.getElementsByTagName('a'); - var articleBaseUrl = this._findBaseUrl(); - - // Loop through all links, looking for hints that they may be next-page links. - // Things like having "page" in their textContent, className or id, or being a child - // of a node with a page-y className or id. - // - // Also possible: levenshtein distance? longest common subsequence? - // - // After we do that, assign each page a score, and - for (var i = 0, il = allLinks.length; i < il; i += 1) { - var link = allLinks[i]; - var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); - - // If we've already seen this page, ignore it. - if (linkHref === "" || - linkHref === articleBaseUrl || - linkHref === uri.spec || - linkHref in this._parsedPages) { - continue; - } - - // If it's on a different domain, skip it. - if (uri.host !== linkHref.split(/\/+/g)[1]) - continue; - - var linkText = this._getInnerText(link); - - // If the linkText looks like it's not the next page, skip it. 
- if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25) - continue; - - // If the leftovers of the URL after removing the base URL don't contain - // any digits, it's certainly not a next page link. - var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); - if (!linkHrefLeftover.match(/\d/)) - continue; - - if (!(linkHref in possiblePages)) { - possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; - } else { - possiblePages[linkHref].linkText += ' | ' + linkText; - } - - var linkObj = possiblePages[linkHref]; - - // If the articleBaseUrl isn't part of this URL, penalize this link. It could - // still be the link, but the odds are lower. - // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html - if (linkHref.indexOf(articleBaseUrl) !== 0) - linkObj.score -= 25; - - var linkData = linkText + ' ' + link.className + ' ' + link.id; - if (linkData.match(this.REGEXPS.nextLink)) - linkObj.score += 50; - - if (linkData.match(/pag(e|ing|inat)/i)) - linkObj.score += 25; - - if (linkData.match(/(first|last)/i)) { - // -65 is enough to negate any bonuses gotten from a > or » in the text, - // If we already matched on "next", last is probably fine. - // If we didn't, then it's bad. Penalize. - if (!linkObj.linkText.match(this.REGEXPS.nextLink)) - linkObj.score -= 65; - } - - if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous)) - linkObj.score -= 50; - - if (linkData.match(this.REGEXPS.prevLink)) - linkObj.score -= 200; - - // If a parentNode contains page or paging or paginat - var parentNode = link.parentNode; - var positiveNodeMatch = false; - var negativeNodeMatch = false; - - while (parentNode) { - var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; - - if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { - positiveNodeMatch = true; - linkObj.score += 25; - } - - if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) { - // If this is just something like "footer", give it a negative. - // If it's something like "body-and-footer", leave it be. - if (!parentNodeClassAndId.match(this.REGEXPS.positive)) { - linkObj.score -= 25; - negativeNodeMatch = true; - } - } - - parentNode = parentNode.parentNode; - } - - // If the URL looks like it has paging in it, add to the score. - // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 - if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) - linkObj.score += 25; - - // If the URL contains negative values, give a slight decrease. - if (linkHref.match(this.REGEXPS.extraneous)) - linkObj.score -= 15; - - /** - * Minor punishment to anything that doesn't match our current URL. - * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. - * Dan, can you show me a counterexample where this is necessary? - * if (linkHref.indexOf(window.location.href) !== 0) { - * linkObj.score -= 1; - * } - **/ - - // If the link text can be parsed as a number, give it a minor bonus, with a slight - // bias towards lower numbered pages. This is so that pages that might not have 'next' - // in their text can still get scored, and sorted properly by score. - var linkTextAsNumber = parseInt(linkText, 10); - if (linkTextAsNumber) { - // Punish 1 since we're either already there, or it's probably - // before what we want anyways. 
- if (linkTextAsNumber === 1) { - linkObj.score -= 10; - } else { - linkObj.score += Math.max(0, 10 - linkTextAsNumber); - } - } - } - - // Loop thrugh all of our possible pages from above and find our top - // candidate for the next page URL. Require at least a score of 50, which - // is a relatively high confidence that this page is the next link. - var topPage = null; - for (var page in possiblePages) { - if (possiblePages.hasOwnProperty(page)) { - if (possiblePages[page].score >= 50 && - (!topPage || topPage.score < possiblePages[page].score)) - topPage = possiblePages[page]; - } - } - - if (topPage) { - var nextHref = topPage.href.replace(/\/$/,''); - - this.log('NEXT PAGE IS ' + nextHref); - this._parsedPages[nextHref] = true; - return nextHref; - } else { - return null; - } - }, - - _successfulRequest: function(request) { - return (request.status >= 200 && request.status < 300) || - request.status === 304 || - (request.status === 0 && request.responseText); - }, - - _ajax: function(url, options) { - var request = new XMLHttpRequest(); - - function respondToReadyState(readyState) { - if (request.readyState === 4) { - if (this._successfulRequest(request)) { - if (options.success) - options.success(request); - } else { - if (options.error) - options.error(request); - } - } - } - - if (typeof options === 'undefined') - options = {}; - - request.onreadystatechange = respondToReadyState; - - request.open('get', url, true); - request.setRequestHeader('Accept', 'text/html'); - - try { - request.send(options.postBody); - } catch (e) { - if (options.error) - options.error(); - } - - return request; - }, - - _appendNextPage: function(nextPageLink) { - var doc = this._doc; - this._curPageNum += 1; - - var articlePage = doc.createElement("DIV"); - articlePage.id = 'readability-page-' + this._curPageNum; - articlePage.className = 'page'; - articlePage.innerHTML = '

    §

    '; - - doc.getElementById("readability-content").appendChild(articlePage); - - if (this._curPageNum > this._maxPages) { - var nextPageMarkup = "
    "; - articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; - return; - } - - // Now that we've built the article page DOM element, get the page content - // asynchronously and load the cleaned content into the div we created for it. - (function(pageUrl, thisPage) { - this._ajax(pageUrl, { - success: function(r) { - - // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. - var eTag = r.getResponseHeader('ETag'); - if (eTag) { - if (eTag in this._pageETags) { - this.log("Exact duplicate page found via ETag. Aborting."); - articlePage.style.display = 'none'; - return; - } else { - this._pageETags[eTag] = 1; - } - } - - // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. - var page = doc.createElement("DIV"); - - // Do some preprocessing to our HTML to make it ready for appending. - // - Remove any script tags. Swap and reswap newlines with a unicode - // character because multiline regex doesn't work in javascript. - // - Turn any noscript tags into divs so that we can parse them. This - // allows us to find any next page links hidden via javascript. - // - Turn all double br's into p's - was handled by prepDocument in the original view. - // Maybe in the future abstract out prepDocument to work for both the original document - // and AJAX-added pages. - var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); - responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); - responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div'); - responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>'); - - page.innerHTML = responseHtml; - this._replaceBrs(page); - - // Reset all flags for the next page, as they will search through it and - // disable as necessary at the end of grabArticle. - this._flags = 0x1 | 0x2 | 0x4; - - var nextPageLink = this._findNextPageLink(page); - - // NOTE: if we end up supporting _appendNextPage(), we'll need to - // change this call to be async - var content = this._grabArticle(page); - - if (!content) { - this.log("No content found in page to append. Aborting."); - return; - } - - // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. - // Compare it against all of the the previous document's we've gotten. If the previous - // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. - var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null; - if (firstP && firstP.innerHTML.length > 100) { - for (var i = 1; i <= this._curPageNum; i += 1) { - var rPage = doc.getElementById('readability-page-' + i); - if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { - this.log('Duplicate of page ' + i + ' - skipping.'); - articlePage.style.display = 'none'; - this._parsedPages[pageUrl] = true; - return; - } - } - } - - this._removeScripts(content); - - thisPage.innerHTML = thisPage.innerHTML + content.innerHTML; - - // After the page has rendered, post process the content. This delay is necessary because, - // in webkit at least, offsetWidth is not set in time to determine image width. We have to - // wait a little bit for reflow to finish before we can fix floating images. 
- setTimeout((function() { - this._postProcessContent(thisPage); - }).bind(this), 500); - - - if (nextPageLink) - this._appendNextPage(nextPageLink); - } - }); - }).bind(this)(nextPageLink, articlePage); - }, - - /** - * Get an elements class/id weight. Uses regular expressions to tell if this - * element looks good or bad. - * - * @param Element - * @return number (Integer) - **/ - _getClassWeight: function(e) { - if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) - return 0; - - var weight = 0; - - // Look for a special classname - if (typeof(e.className) === 'string' && e.className !== '') { - if (this.REGEXPS.negative.test(e.className)) - weight -= 25; - - if (this.REGEXPS.positive.test(e.className)) - weight += 25; - } - - // Look for a special ID - if (typeof(e.id) === 'string' && e.id !== '') { - if (this.REGEXPS.negative.test(e.id)) - weight -= 25; - - if (this.REGEXPS.positive.test(e.id)) - weight += 25; - } - - return weight; - }, - - /** - * Clean a node of all elements of type "tag". - * (Unless it's a youtube/vimeo video. People love movies.) - * - * @param Element - * @param string tag to clean - * @return void - **/ - _clean: function(e, tag) { - var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; - - this._forEachNode(e.getElementsByTagName(tag), function(element) { - // Allow youtube and vimeo videos through as people usually want to see those. - if (isEmbed) { - var attributeValues = [].map.call(element.attributes, function(attr) { - return attr.value; - }).join("|"); - - // First, check the elements attributes to see if any of them contain youtube or vimeo - if (this.REGEXPS.videos.test(attributeValues)) - return; - - // Then check the elements inside this element for the same. - if (this.REGEXPS.videos.test(element.innerHTML)) - return; - } - - element.parentNode.removeChild(element); - }); - }, - - /** - * Check if a given node has one of its ancestor tag name matching the - * provided one. - * @param HTMLElement node - * @param String tagName - * @param Number maxDepth - * @return Boolean - */ - _hasAncestorTag: function(node, tagName, maxDepth) { - maxDepth = maxDepth || 3; - tagName = tagName.toUpperCase(); - var depth = 0; - while (node.parentNode) { - if (depth > maxDepth) - return false; - if (node.parentNode.tagName === tagName) - return true; - node = node.parentNode; - depth++; - } - return false; - }, - - /** - * Clean an element of all tags of type "tag" if they look fishy. - * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. - * - * @return void - **/ - _cleanConditionally: function(e, tag) { - if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) - return; - - var tagsList = e.getElementsByTagName(tag); - var curTagsLength = tagsList.length; - var isList = tag === "ul" || tag === "ol"; - - // Gather counts for other typical elements embedded within. - // Traverse backwards so we can remove nodes at the same time - // without effecting the traversal. - // - // TODO: Consider taking into account original contentScore here. 
- for (var i = curTagsLength-1; i >= 0; i -= 1) { - var weight = this._getClassWeight(tagsList[i]); - var contentScore = 0; - - this.log("Cleaning Conditionally", tagsList[i]); - - if (weight + contentScore < 0) { - tagsList[i].parentNode.removeChild(tagsList[i]); - } else if (this._getCharCount(tagsList[i],',') < 10) { - // If there are not very many commas, and the number of - // non-paragraph elements is more than paragraphs or other - // ominous signs, remove the element. - var p = tagsList[i].getElementsByTagName("p").length; - var img = tagsList[i].getElementsByTagName("img").length; - var li = tagsList[i].getElementsByTagName("li").length-100; - var input = tagsList[i].getElementsByTagName("input").length; - - var embedCount = 0; - var embeds = tagsList[i].getElementsByTagName("embed"); - for (var ei = 0, il = embeds.length; ei < il; ei += 1) { - if (!this.REGEXPS.videos.test(embeds[ei].src)) - embedCount += 1; - } - - var linkDensity = this._getLinkDensity(tagsList[i]); - var contentLength = this._getInnerText(tagsList[i]).length; - var toRemove = false; - if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) { - toRemove = true; - } else if (!isList && li > p) { - toRemove = true; - } else if (input > Math.floor(p/3)) { - toRemove = true; - } else if (!isList && contentLength < 25 && (img === 0 || img > 2)) { - toRemove = true; - } else if (!isList && weight < 25 && linkDensity > 0.2) { - toRemove = true; - } else if (weight >= 25 && linkDensity > 0.5) { - toRemove = true; - } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) { - toRemove = true; - } - - if (toRemove) { - tagsList[i].parentNode.removeChild(tagsList[i]); - } - } - } - }, - - /** - * Clean out spurious headers from an Element. Checks things like classnames and link density. - * - * @param Element - * @return void - **/ - _cleanHeaders: function(e) { - for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) { - var headers = e.getElementsByTagName('h' + headerIndex); - for (var i = headers.length - 1; i >= 0; i -= 1) { - if (this._getClassWeight(headers[i]) < 0) - headers[i].parentNode.removeChild(headers[i]); - } - } - }, - - _flagIsActive: function(flag) { - return (this._flags & flag) > 0; - }, - - _addFlag: function(flag) { - this._flags = this._flags | flag; - }, - - _removeFlag: function(flag) { - this._flags = this._flags & ~flag; - }, - - /** - * Decides whether or not the document is reader-able without parsing the whole thing. - * - * @return boolean Whether or not we suspect parse() will suceeed at returning an article object. - */ - isProbablyReaderable: function(helperIsVisible) { - var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]); - - // FIXME we should have a fallback for helperIsVisible, but this is - // problematic because of jsdom's elem.style handling - see - // https://github.com/mozilla/readability/pull/186 for context. 
- - var score = 0; - // This is a little cheeky, we use the accumulator 'score' to decide what to return from - // this callback: - return this._someNode(nodes, function(node) { - if (helperIsVisible && !helperIsVisible(node)) - return false; - var matchString = node.className + " " + node.id; - - if (this.REGEXPS.unlikelyCandidates.test(matchString) && - !this.REGEXPS.okMaybeItsACandidate.test(matchString)) { - return false; - } - - if (node.matches && node.matches("li p")) { - return false; - } - - var textContentLength = node.textContent.trim().length; - if (textContentLength < 140) { - return false; - } - - score += Math.sqrt(textContentLength - 140); - - if (score > 20) { - return true; - } - return false; - }); - }, - - /** - * Runs readability. - * - * Workflow: - * 1. Prep the document by removing script tags, css, etc. - * 2. Build readability's DOM tree. - * 3. Grab the article content from the current dom tree. - * 4. Replace the current DOM tree with the new one. - * 5. Read peacefully. - * - * @return void - **/ - parse: function () { - // Avoid parsing too large documents, as per configuration option - if (this._maxElemsToParse > 0) { - var numTags = this._doc.getElementsByTagName("*").length; - if (numTags > this._maxElemsToParse) { - throw new Error("Aborting parsing document; " + numTags + " elements found"); - } - } - - if (typeof this._doc.documentElement.firstElementChild === "undefined") { - this._getNextNode = this._getNextNodeNoElementProperties; - } - // Remove script tags from the document. - this._removeScripts(this._doc); - - // FIXME: Disabled multi-page article support for now as it - // needs more work on infrastructure. - - // Make sure this document is added to the list of parsed pages first, - // so we don't double up on the first page. - // this._parsedPages[uri.spec.replace(/\/$/, '')] = true; - - // Pull out any possible next page link first. - // var nextPageLink = this._findNextPageLink(doc.body); - - this._prepDocument(); - - var metadata = this._getArticleMetadata(); - var articleTitle = metadata.title || this._getArticleTitle(); - - var articleContent = this._grabArticle(); - if (!articleContent) - return null; - - this.log("Grabbed: " + articleContent.innerHTML); - - this._postProcessContent(articleContent); - - // if (nextPageLink) { - // // Append any additional pages after a small timeout so that people - // // can start reading without having to wait for this to finish processing. - // setTimeout((function() { - // this._appendNextPage(nextPageLink); - // }).bind(this), 500); - // } - - // If we haven't found an excerpt in the article's metadata, use the article's - // first paragraph as the excerpt. This is used for displaying a preview of - // the article's content. 
- if (!metadata.excerpt) { - var paragraphs = articleContent.getElementsByTagName("p"); - if (paragraphs.length > 0) { - metadata.excerpt = paragraphs[0].textContent.trim(); - } - } - - return { uri: this._uri, - title: articleTitle, - byline: metadata.byline || this._articleByline, - dir: this._articleDir, - content: articleContent.innerHTML, - length: articleContent.textContent.length, - excerpt: metadata.excerpt }; - } -}; diff --git a/lib/test.rb b/lib/test.rb deleted file mode 100644 index 813de6b..0000000 --- a/lib/test.rb +++ /dev/null @@ -1,522 +0,0 @@ -# encoding: utf-8 - -require 'rubygems' -require 'nokogiri' -require 'guess_html_encoding' - -module Readability - class Document - DEFAULT_OPTIONS = { - :retry_length => 250, - :min_text_length => 25, - :remove_unlikely_candidates => true, - :weight_classes => true, - :clean_conditionally => true, - :remove_empty_nodes => true, - :min_image_width => 130, - :min_image_height => 80, - :ignore_image_format => [], - :blacklist => nil, - :whitelist => nil - }.freeze - - REGEXES = { - :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, - :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, - :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, - :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, - :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, - :replaceBrsRe => /(]*>[ \n\r\t]*){2,}/i, - :replaceFontsRe => /<(\/?)font[^>]*>/i, - :trimRe => /^\s+|\s+$/, - :normalizeRe => /\s{2,}/, - :killBreaksRe => /((\s| ?)*){1,}/, - :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i - } - - attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image - - def initialize(input, options = {}) - @options = DEFAULT_OPTIONS.merge(options) - @input = input - - if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding] - @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding] - @options[:encoding] = @input.encoding.to_s - end - - @input = @input.gsub(REGEXES[:replaceBrsRe], '
</p><p>
    ').gsub(REGEXES[:replaceFontsRe], '<\1span>') - @remove_unlikely_candidates = @options[:remove_unlikely_candidates] - @weight_classes = @options[:weight_classes] - @clean_conditionally = @options[:clean_conditionally] - @best_candidate_has_image = true - make_html - handle_exclusions!(@options[:whitelist], @options[:blacklist]) - end - - def images(content=nil, reload=false) - begin - require 'fastimage' - rescue LoadError - raise "Please install fastimage in order to use the #images feature." - end - - @best_candidate_has_image = false if reload - - prepare_candidates - list_images = [] - tested_images = [] - content = @best_candidate[:elem] unless reload - - return list_images if content.nil? - elements = content.css("img").map(&:attributes) - - elements.each do |element| - next unless element["src"] - - url = element["src"].value - height = element["height"].nil? ? 0 : element["height"].value.to_i - width = element["width"].nil? ? 0 : element["width"].value.to_i - - if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?) - image = get_image_size(url) - next unless image - else - image = {:width => width, :height => height} - end - - image[:format] = File.extname(url).gsub(".", "") - - if tested_images.include?(url) - debug("Image was tested: #{url}") - next - end - - tested_images.push(url) - if image_meets_criteria?(image) - list_images << url - else - debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}") - end - end - - (list_images.empty? and content != @html) ? images(@html, true) : list_images - end - - def images_with_fqdn_uris!(source_uri) - images_with_fqdn_uris(@html, source_uri) - end - - def images_with_fqdn_uris(document = @html.dup, source_uri) - uri = URI.parse(source_uri) - host = uri.host - scheme = uri.scheme - port = uri.port # defaults to 80 - - base = "#{scheme}://#{host}:#{port}/" - - images = [] - document.css("img").each do |elem| - begin - elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil - images << elem['src'].to_s - rescue URI::InvalidURIError => exc - elem.remove - end - end - - images(document,true) - end - - def get_image_size(url) - w, h = FastImage.size(url) - raise "Couldn't get size." if w.nil? || h.nil? - {:width => w, :height => h} - rescue => e - debug("Image error: #{e}") - nil - end - - def image_meets_criteria?(image) - return false if options[:ignore_image_format].include?(image[:format].downcase) - image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0) - end - - def title - title = @html.css("title").first - title ? title.text : nil - end - - # Look through the @html document looking for the author - # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted) - # Returns nil if no author is detected - def author - # Let's grab this author: - # - author_elements = @html.xpath('//meta[@name = "dc.creator"]') - unless author_elements.empty? - author_elements.each do |element| - return element['content'].strip if element['content'] - end - end - - # Now let's try to grab this - # - #

    By
    - author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]') - unless author_elements.empty? - author_elements.each do |element| - return element.text.strip if element.text - end - end - - # Now let's try to grab this - # - # TODO: strip out the (rel)? - author_elements = @html.xpath('//a[@rel = "author"]') - unless author_elements.empty? - author_elements.each do |element| - return element.text.strip if element.text - end - end - - author_elements = @html.xpath('//*[@id = "author"]') - unless author_elements.empty? - author_elements.each do |element| - return element.text.strip if element.text - end - end - end - - def content(remove_unlikely_candidates = :default) - @remove_unlikely_candidates = false if remove_unlikely_candidates == false - - prepare_candidates - article = get_article(@candidates, @best_candidate) - - cleaned_article = sanitize(article, @candidates, options) - if article.text.strip.length < options[:retry_length] - if @remove_unlikely_candidates - @remove_unlikely_candidates = false - elsif @weight_classes - @weight_classes = false - elsif @clean_conditionally - @clean_conditionally = false - else - # nothing we can do - return cleaned_article - end - - make_html - content - else - cleaned_article - end - end - - def get_article(candidates, best_candidate) - # Now that we have the top candidate, look through its siblings for content that might also be related. - # Things like preambles, content split by ads that we removed, etc. - - sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max - output = Nokogiri::XML::Node.new('div', @html) - best_candidate[:elem].parent.children.each do |sibling| - append = false - append = true if sibling == best_candidate[:elem] - append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold - - if sibling.name.downcase == "p" - link_density = get_link_density(sibling) - node_content = sibling.text - node_length = node_content.length - - append = if node_length > 80 && link_density < 0.25 - true - elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/ - true - end - end - - if append - sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects - sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase) - output << sibling_dup - end - end - - output - end - - def select_best_candidate(candidates) - sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] } - - debug("Top 5 candidates:") - sorted_candidates[0...5].each do |candidate| - debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}") - end - - best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 } - debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}") - - best_candidate - end - - def get_link_density(elem) - link_length = elem.css("a").map(&:text).join("").length - text_length = elem.text.length - link_length / text_length.to_f - end - - def class_weight(e) - weight = 0 - return weight unless @weight_classes - - if e[:class] && e[:class] != "" - weight -= 25 if e[:class] =~ REGEXES[:negativeRe] - weight += 25 if e[:class] =~ REGEXES[:positiveRe] - end - - if e[:id] && e[:id] != "" - weight -= 25 if e[:id] =~ REGEXES[:negativeRe] - 
weight += 25 if e[:id] =~ REGEXES[:positiveRe] - end - - weight - end - - ELEMENT_SCORES = { - 'div' => 5, - 'blockquote' => 3, - 'form' => -3, - 'th' => -5 - }.freeze - - def score_node(elem) - content_score = class_weight(elem) - content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0) - { :content_score => content_score, :elem => elem } - end - - def debug(str) - puts str if options[:debug] - end - - def sanitize(node, candidates, options = {}) - node.css("h1, h2, h3, h4, h5, h6").each do |header| - header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33 - end - - node.css("form, object, iframe, embed").each do |elem| - elem.remove - end - - if @options[:remove_empty_nodes] - # remove
<p>
    tags that have no text content - this will also remove p tags that contain only images. - node.css("p").each do |elem| - elem.remove if elem.content.strip.empty? - end - end - - # Conditionally clean s,
      s, and
      s - clean_conditionally(node, candidates, "table, ul, div") - - # We'll sanitize all elements using a whitelist - base_whitelist = @options[:tags] || %w[div p] - # We'll add whitespace instead of block elements, - # so a
      b will have a nice space between them - base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center] - - # Use a hash for speed (don't want to make a million calls to include?) - whitelist = Hash.new - base_whitelist.each {|tag| whitelist[tag] = true } - replace_with_whitespace = Hash.new - base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true } - - ([node] + node.css("*")).each do |el| - # If element is in whitelist, delete all its attributes - if whitelist[el.node_name] - el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } - - # Otherwise, replace the element with its contents - else - # If element is root, replace the node as a text node - if el.parent.nil? - node = Nokogiri::XML::Text.new(el.text, el.document) - break - else - if replace_with_whitespace[el.node_name] - el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document)) - else - el.swap(Nokogiri::XML::Text.new(el.text, el.document)) - end - end - end - - end - - s = Nokogiri::XML::Node::SaveOptions - save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML - html = node.serialize(:save_with => save_opts) - - # Get rid of duplicate whitespace - return html.gsub(/[\r\n\f]+/, "\n" ) - end - - def clean_conditionally(node, candidates, selector) - return unless @clean_conditionally - node.css(selector).each do |el| - weight = class_weight(el) - content_score = candidates[el] ? candidates[el][:content_score] : 0 - name = el.name.downcase - - if weight + content_score < 0 - el.remove - debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.") - elsif el.text.count(",") < 10 - counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } - counts["li"] -= 100 - - # For every img under a noscript tag discount one from the count to avoid double counting - counts["img"] -= el.css("noscript").css("img").length - - content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace - link_density = get_link_density(el) - - reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) - if reason - debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.") - el.remove - end - end - end - end - - def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) - if (counts["img"] > counts["p"]) && (counts["img"] > 1) - "too many images" - elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" - "more
    • s than

      s" - elsif counts["input"] > (counts["p"] / 3).to_i - "less than 3x

      s than s" - elsif (content_length < options[:min_text_length]) && (counts["img"] != 1) - "too short a content length without a single image" - elsif weight < 25 && link_density > 0.2 - "too many links for its weight (#{weight})" - elsif weight >= 25 && link_density > 0.5 - "too many links for its weight (#{weight})" - elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1 - "s with too short a content length, or too many s" - else - nil - end - end - - private - - # 제거항목 추가항목을 지정한다. - def handle_exclusions!(whitelist, blacklist) - return unless whitelist || blacklist - - if blacklist - elems = @html.css(blacklist) - if elems - elems.each do |e| - e.remove - end - end - end - - if whitelist - elems = @html.css(whitelist).to_s - - if body = @html.at_css('body') - body.inner_html = elems - end - end - - @input = @html.to_s - end - - # 코멘트가 제거된 기본 html 노드 반환 - def make_html(whitelist=nil, blacklist=nil) - @html = Nokogiri::HTML(@input, nil, @options[:encoding]) - # In case document has no body, such as from empty string or redirect - @html = Nokogiri::HTML('', nil, @options[:encoding]) if @html.css('body').length == 0 - # Remove html comment tags - @html.xpath('//comment()').each { |i| i.remove } - end - - - def prepare_candidates - @html.css("script, style").each { |i| i.remove } - remove_unlikely_candidates! if @remove_unlikely_candidates - transform_misused_divs_into_paragraphs! - - @candidates = score_paragraphs(options[:min_text_length]) - @best_candidate = select_best_candidate(@candidates) - end - - # 가망없는 후보자를 제거한다. (명확한 후보자는 제외하고 제거한다.) - def remove_unlikely_candidates! - @html.css("*").each do |elem| - str = "#{elem[:class]}#{elem[:id]}" - if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body') - debug("Removing unlikely candidate - #{str}") - elem.remove - end - end - end - - # 잘못 사용되고 있는 DIV를 p로 변환한다. - def transform_misused_divs_into_paragraphs! - @html.css("*").each do |elem| - if elem.name.downcase == "div" - # transform
<div>
      s that do not contain other block elements into
<p>
      s - if elem.inner_html !~ REGEXES[:divToPElementsRe] - debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p"); - elem.name = "p" - end - else - # wrap text nodes in p tags -# elem.children.each do |child| -# if child.text? -# debug("wrapping text node with a p") -# child.swap("
<p>
      #{child.text}
</p>
      ") -# end -# end - end - end - end - - # 가능노드에 점수를 매긴다. - def score_paragraphs(min_text_length) - candidates = {} - @html.css("p,td").each do |elem| - parent_node = elem.parent - grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil - inner_text = elem.text - - # If this paragraph is less than 25 characters, don't even count it. - next if inner_text.length < min_text_length - - candidates[parent_node] ||= score_node(parent_node) - candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node - - content_score = 1 - content_score += inner_text.split(',').length - content_score += [(inner_text.length / 100).to_i, 3].min - - candidates[parent_node][:content_score] += content_score - candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node - end - - # Scale the final candidates score based on link density. Good content should have a - # relatively small link density (5% or less) and be mostly unaffected by this operation. - candidates.each do |elem, candidate| - candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem)) - end - - candidates - end - end -end diff --git a/mix.exs b/mix.exs index 2720fad..cd6a2a9 100644 --- a/mix.exs +++ b/mix.exs @@ -1,4 +1,7 @@ defmodule Readability.Mixfile do + @moduledoc """ + """ + use Mix.Project def project do @@ -15,7 +18,8 @@ defmodule Readability.Mixfile do # Type "mix help compile.app" for more information def application do [applications: [:logger, - :floki + :floki, + :httpoison ]] end @@ -29,6 +33,10 @@ defmodule Readability.Mixfile do # # Type "mix help deps" for more examples and options defp deps do - [{:floki, "~> 0.8.0"}] + [{:floki, "~> 0.8.0"}, + {:httpoison, "~> 0.8.0"}, + {:credo, "~> 0.3", only: [:dev, :test]}, + {:dialyxir, "~> 0.3", only: [:dev]} + ] end end diff --git a/mix.lock b/mix.lock index 7874674..9aaf1ae 100644 --- a/mix.lock +++ b/mix.lock @@ -1,2 +1,12 @@ -%{"floki": {:hex, :floki, "0.8.0"}, - "mochiweb_html": {:hex, :mochiweb_html, "2.13.0"}} +%{"bunt": {:hex, :bunt, "0.1.5"}, + "certifi": {:hex, :certifi, "0.4.0"}, + "credo": {:hex, :credo, "0.3.12"}, + "dialyxir": {:hex, :dialyxir, "0.3.3"}, + "floki": {:hex, :floki, "0.8.0"}, + "hackney": {:hex, :hackney, "1.6.0"}, + "httpoison": {:hex, :httpoison, "0.8.3"}, + "idna": {:hex, :idna, "1.2.0"}, + "metrics": {:hex, :metrics, "1.0.1"}, + "mimerl": {:hex, :mimerl, "1.0.2"}, + "mochiweb_html": {:hex, :mochiweb_html, "2.13.0"}, + "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.0"}} diff --git a/test.html b/test.html new file mode 100644 index 0000000..b20208c --- /dev/null +++ b/test.html @@ -0,0 +1 @@ +
      \"\"
      Buddhist monks performing as part of “Treasures From Korea: Arts and Culture of the Joseon Dynasty, 1392-1910,” at the Philadelphia Museum of Art. via Philadelphia Museum of Art

One afternoon in December, the Arts of Korea Gallery at the Metropolitan Museum of Art was abuzz, as dignitaries from the South Korean government inspected the display, while members of the press watched. Then, Oh Seung-je, the director of the Korean Cultural Service of New York, and Daniel H. Weiss, the Met’s president, ceremonially signed a pact establishing a long-term partnership between the Met and South Korea’s Ministry of Culture, Sports and Tourism.

      Along with the agreement came a $1 million gift to the Met from South Korea. It will pay for enhancement of the gallery, loans from Korean museums, a major exhibition of Korean art in 2018, new research, and residencies at the Met for curators from the National Museum of Korea.

      It was far from the first time the South Korean government or its affiliates, in tandem with Korean corporations, had rained money on American museums to ensure that Korean art was seen here. The Met’s Arts of Korea Gallery was created in 1998 with a gift from the Korea Foundation, an independent organization affiliated with South Korea’s Ministry of Foreign Affairs.

      Other countries, including Italy, Japan and the Netherlands, promote their visual culture in the United States — sponsoring museum exhibitions, lending valuable artworks and so on. What is striking about South Korea is that it has systematically invested in building and maintaining permanent places to see Korean art at American museums, and in underwriting exhibitions that promote the country’s ancient and contemporary art in the United States.

      “Since the American public only has rather limited opportunities to view and appreciate Korean artworks, concerted efforts are necessary to bring attention to the richness of Korea’s culture and arts,” Yoon Keum-jin, the executive vice president in the Korea Foundation’s Washington office, wrote in an email.

      Since its founding 25 years ago, the Korea Foundation alone has midwifed the birth of permanent Korean art galleries at 18 American museums, including the Museum of Fine Arts, Houston; the Cleveland Museum of Art; the Seattle Art Museum; the Los Angeles County Museum of Art; and the Museum of Fine Arts, Boston. It has spent more than $9 million to construct these galleries, Ms. Yoon said. Without such funding, the museums say, many of these galleries would probably not exist.

      Reasons for this effort are both political and cultural. “The Koreans have a tradition of centralized planning and the government being involved in long-term economic planning and investment,” said Timothy F. Rub, director of the Philadelphia Museum of Art, which received a grant from the Korea Foundation to upgrade its Korean gallery, along with other aid over the years. “They see the promulgation of culture as an instrument of economic policy.”

      They also perceive a real need. “Many are aware of Korea through the rapid development of its technology and electronics sectors, but it is a country equally abundant in the cultural arts,” said Mr. Oh of the Korean Culture Service.

      Historically, Mr. Rub added, Korea’s culture has been overshadowed. “Look at the collections in the United States of East Asian art. Japan and China are far more prominent than Korea.” Yet, he said, “look at the history of Korean art; it was very much prized by the Japanese and the Chinese.”

      \"\"
      Korea Foundation Day at the Museum of Fine Arts, Boston in 2012. Museum of Fine Arts, Boston

      For museums in the Korean fold, the support can be steady. Soon after the Houston museum received $470,000 to build its gallery, the Korea Foundation provided $30,000 for programming. In 2010 it gave the museum $150,000 to organize and present “Your Bright Future: 12 Contemporary Artists From Korea,” the first major American museum exhibition of contemporary Korean art in years. Since then, the Korea Foundation has donated more money for programming of Korean art.

      The Los Angeles County Museum of Art, which was a co-organizer of “Your Bright Future,” also received money from the Korea Foundation for the show, which was sponsored by a Korean container company.

      The Koreans have also proved receptive to American ideas. Some years ago, Mr. Rub approached the National Museum of Korea with the notion of showcasing the art of the Joseon dynasty, which ruled Korea from 1392 to 1910. That turned into a cultural exchange that included the Museum of Fine Arts, Houston, and the Los Angeles County Museum of Art. They each presented the Korean exhibition and sent an exhibition of American art to South Korea.

      South Korea also supported a series of workshops between 1999 and 2013. The Korea Foundation hosted curators from 26 countries, who traveled to South Korea to hear art experts, take field trips to cultural sites and learn technical information about Korean art and its proper display.

      Despite the current economic uncertainties that have prompted many governments to cut their funding of the arts, Mr. Oh said that South Korea would continue its overseas largess. “I believe that economic prosperity and cultural wealth go hand in hand,” he said in an email. “This is why it is important to even further promote the cultural arts during times of economic slowdown.”

      What’s more, he added, South Korea’s president, Park Geun-hye, has made cultural enrichment one of her major priorities.

      diff --git a/test/fixtures/bbc.html b/test/fixtures/bbc.html index 80d39bf..db47cc4 100644 --- a/test/fixtures/bbc.html +++ b/test/fixtures/bbc.html @@ -1,2066 +1,2557 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - BBC News - Submarine escape: A WWII survival tale from Kefalonia - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + Obama admits US gun laws are his 'biggest frustration' - BBC News + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + +
      + + +
      +
      +
      + + +
      + + + + +
      + +
      + + + + US & Canada + + + + +
      + +
      + + + US & Canada + + + + +
      +
      +

      Obama admits US gun laws are his 'biggest frustration'

      + + +
      +
        +
      • 24 July 2015
        +
      • +
      • From the section US & Canada
      • +
      • 941 comments
      • +
      +
      + +
      +
      Media caption Mr Obama told the BBC that gun control was his biggest frustration

      President Barack Obama has admitted that his failure to pass "common sense gun safety laws" in the US is the greatest frustration of his presidency.

      In an interview with the BBC, Mr Obama said it was "distressing" not to have made progress on the issue "even in the face of repeated mass killings".

      He vowed to keep trying, but the BBC's North America editor Jon Sopel said the president did not sound very confident.

      However, Mr Obama said race relations had improved during his presidency.

      Hours after the interview, a gunman opened fire at a cinema in the US state of Louisiana, killing two people and injuring several others before shooting himself.

      In a wide-ranging interview, President Obama also said:

        +
      • +The UK must stay in the EU to have influence on the world stage
      • +
      • He is confident the Iran nuclear deal will be passed by Congress
      • +
      • Syria needs a political solution in order to defeat the Islamic State group
      • +
      • He would speak "bluntly" against corruption and human rights violations in Kenya +
      • +
      • He would defend his advocacy of gay rights following protests in Kenya
      • +
      • Despite racial tensions, the US is becoming more diverse and more tolerant
      • +

      Read the full transcript of his interview

      Mr Obama lands in Kenya later on Friday for his first visit since becoming president.

      But with just 18 months left in power, he said gun control was the area where he has been "most frustrated and most stymied" since coming to power in 2009.

      "If you look at the number of Americans killed since 9/11 by terrorism, it's less than 100. If you look at the number that have been killed by gun violence, it's in the tens of thousands," Mr Obama said.

      Media caption Barack Obama: "Great Britain has always been our best partner"
      Gun control campaigners protest in McPhearson Square in Washington DC - 25 April 2013 +
      + + The president said he would continue fighting for greater gun control laws + +

      "For us not to be able to resolve that issue has been something that is distressing," he added.

      Mr Obama has pushed for stricter gun control throughout his presidency but has been unable to secure any significant changes to the laws.

      After nine African-American churchgoers were killed in South Carolina in June, he admitted "politics in this town" meant there were few options available.

      line

      Analysis: Jon Sopel, BBC News, Washington

      President Barack Obama participates in an interview with Jon Sopel of BBC in the Roosevelt Room of the White House - 23 July 2015

      Nine months ago, the president seemed like a spent force, after taking a beating in the midterm elections, during which members of his own party were reluctant to campaign on his record.

      But the man sat before me today was relaxed and confident, buoyed by a string of "wins" on healthcare, Cuba and Iran, after bitter and ongoing battles with his many critics.

      The only body swerve the president performed was when I asked him how many minds he had changed on the Iran nuclear deal after an intense sell aimed at Gulf allies and members of US Congress who remain implacably opposed.

      There was a momentary flicker across the president's face as if to say "You think you got me?" before his smile returned and he proceeded to talk about how Congress would come round.

      But notably, he did not give a direct answer to that question, which leaves me with the impression that he has persuaded precisely zero.

      Five things we learned from Obama interview

      The presidential body swerve

      line
      Media caption The BBC contrasts President Obama's reactions after mass shootings, with the levels of US gun ownership during his terms in office. (Video by David Botti)

      On race relations, Mr Obama said recent concerns around policing and mass incarcerations were "legitimate and deserve intense attention" but insisted progress had been made.

      Children growing up during the eight years of his presidency "will have a different view of race relations in this country and what's possible," he said.

      "There are going to be tensions that arise. But if you look at my daughters' generation, they have an attitude about race that's entirely different than even my generation."

      Talking about how he was feeling after his recent successes, he said "every president, every leader has strengths and weaknesses".

      "One of my strengths is I have a pretty even temperament. I don't get too high when it's high and I don't get too low when it's low," he said.

      Kenya is getting ready to welcome the US president (customers look at Obama shirts at a stall in Nairobi's Kibera slums, 23 July 2015)

      Kenya trip

      Mr Obama was speaking to the BBC at the White House before departing for Kenya.

      His father was Kenyan and the president is expected to meet relatives in Nairobi.

      Mr Obama has faced criticism in the country after the US legalised gay marriage. However, in his interview, the president said he would not fall silent on the issue.

      Media caption President Obama told the BBC he would deliver a blunt message on gay rights when he travelled to Africa

      "I am not a fan of discrimination and bullying of anybody on the basis of race, on the basis of religion, on the basis of sexual orientation or gender," he said.

      The president also admitted that some African governments, including Kenya's, needed to improve their records on human rights and democracy. However, he defended his decision to engage with and visit those governments.

      "Well, they're not ideal institutions. But what we found is, is that when we combined blunt talk with engagement, that gives us the best opportunity to influence and open up space for civil society."

      Mr Obama will become the first US president to address the African Union when he travels on to Ethiopia on Sunday.


      Submarine escape: A WWII survival tale from Kefalonia

      HMS Perseus was launched in May 1929

      Seventy years ago, off the Greek island of Kefalonia, the British submarine HMS Perseus hit an Italian mine, sparking one of the greatest and most controversial survival stories of World War II.

      The clear waters of the Mediterranean were a death trap for British submarines in World War II.

      Some were bombed from the air, others hunted with sonar and depth charges, and many, perhaps most, collided with mines.

      Two fifths of the subs that ventured into the Mediterranean were sunk and when a submarine sank it became a communal coffin - everyone on board died. That was the rule.

      In fact, during the whole of the war there were only four escapes from stricken British submarines. And the most remarkable of these took place on 6 December 1941, when HMS Perseus plummeted to the seabed.

      Enigma

      When she left the British submarine base at Malta at the end of November 1941, HMS Perseus had on board her 59 crew and two passengers, one of whom was John Capes, a 31-year-old Navy stoker en route to Alexandria.

      John Capes: Stoker on the Perseus

      Tall, dark, handsome and a bit of an enigma, Capes had been educated at Dulwich College, and as the son of a diplomat he would naturally have been officer class rather than one of the lowliest of the mechanics who looked after the engines.

      On the rough winter night of 6 December, Perseus was on the surface of the sea 3km (two miles) off the coast of Kefalonia, recharging her batteries under cover of darkness in preparation for another day underwater.

      According to newspaper articles Capes later wrote or contributed to, he was relaxing in a makeshift bunk converted from a spare torpedo tube when, with no warning, there was a devastating explosion.

      The boat twisted, plunged, and hit the bottom with what Capes described as a "nerve-shattering jolt".

      His bunk reared up and threw him across the compartment. The lights went out.


      Escape from the Deep

      • Louis de Bernieres returns to Kefalonia to tell the story of John Capes and HMS Perseus
      • Tim Clayton acted as a programme consultant
      • Broadcast on Friday 2 December 2011 at 1100 GMT on BBC Radio 4, or listen again on iPlayer

      Capes guessed they had hit a mine. Finding that he could stand, he groped for a torch. In the increasingly foul air and rising water of the engine room he found "the mangled bodies of a dozen dead".

      But that was as far as he could get. The engine room door was forced shut by the pressure of water on the other side. "It was creaking under the great pressure. Jets and trickles from the rubber joint were seeping through," said Capes.

      He dragged any stokers who showed signs of life towards the escape hatch and fitted them and himself with Davis Submarine Escape Apparatus, a rubber lung with an oxygen bottle, mouthpiece and goggles.


      British WWII submarine escapes

      Graphic: the depth at which British WWII submariners escaped
      • HMS Umpire sank near Norfolk, England on 19 July 1941. Escapees: 14-15
      • HMS Stratagem sank near Malacca, Malaysia on 22 November 1944. Escapees: 10
      • HMS Perseus sank near Kefalonia, Greece on 6 December 1941. Escapees: 1
      • HMS P32 sank near Tripoli, Libya on 18 August 1941 (but the wreck was discovered only in 1999). Escapees: 2

      This equipment had only been tested to a depth of 100ft (30m). The depth gauge showed just over 270ft, and as far as Capes knew, no-one had ever made an escape from such a depth.

      In fact the gauge was broken, over-estimating the depth by 100ft, but time was running out. It was difficult to breathe now.

      He flooded the compartment, lowered the canvas trunk beneath the escape hatch and with great difficulty released the damaged bolts on the hatch.

      He pushed his injured companions into the trunk, up through the hatch and away into the cold sea above. Then he took a last swig of rum from his blitz bottle, ducked under and passed through the hatch himself.

      "I let go, and the buoyant oxygen lifted me quickly upward. Suddenly I was alone in the middle of the great ocean.

      "The pain became frantic, my lungs and whole body as fit to burst apart. Agony made me dizzy. How long can I last?

      "Then, with the suddenness of certainty, I burst to the surface and wallowed in a slight swell with whitecaps here and there."

      But having made the deepest escape yet recorded, his ordeal was not over.

      His fellow injured stokers had not made it to the surface with him so he found himself alone in the middle of a cold December sea.

      In the darkness he spotted a band of white cliffs and realised he had no choice but to strike out for those.

      Story doubted

      The next morning, Capes was found unconscious by two fishermen on the shore of Kefalonia.

      For the following 18 months he was passed from house to house, to evade the Italian occupiers. He lost 70lb (32kg) in weight and dyed his hair black in an effort to blend in.

      He recalled later: "Always, at the moment of despair, some utterly poor but friendly and patriotic islander would risk the lives of all his family for my sake.

      Kostas Thoctarides and his dive team found the wreck of HMS Perseus in 1997

      "They even gave me one of their prize possessions, a donkey called Mareeka. There was one condition attached to her - I had to take a solemn vow not to eat her."

      He was finally taken off the island on a fishing boat in May 1943, in a clandestine operation organised by the Royal Navy.

      A dangerous, roundabout journey of 640km took him to Turkey and from there back to the submarine service in Alexandria.

      Despite being awarded a medal for his escape, Capes's story was so extraordinary that many people, both within and outside the Navy, doubted it.

      Was he really on the boat at all? After all, he was not on the crew list. And submarine commanders had been ordered to bolt escape hatches shut from the outside to prevent them lifting during depth charge attacks.

      There were no witnesses, he had a reputation as a great storyteller, and his own written accounts after the war varied in their details.

      And the depth gauge reading 270ft made his story all the harder to believe.

      John Capes died in 1985 but it was not until 1997 that his story was finally verified.

      In a series of dives to the wreck of Perseus, Kostas Thoctarides discovered Capes's empty torpedo tube bunk, the hatch and compartment exactly as he had described it, and finally, his blitz bottle from which he had taken that last fortifying swig of rum.

      Tim Clayton is the author of Sea Wolves: the Extraordinary Story of Britain's WW2 Submarines.

      BBC Radio 4's Escape from the Deep is broadcast on Friday 2 December 2011 at 1100 GMT. Or listen again on BBC iPlayer.

      diff --git a/test/fixtures/nytimes.html b/test/fixtures/nytimes.html index cf2d529..00da6e9 100644 --- a/test/fixtures/nytimes.html +++ b/test/fixtures/nytimes.html @@ -1,58 +1,1296 @@ -Health Care for a Changing Work Force - NYTimes.com -
      The New York Times


      December 1, 2011, 9:15 pm

      Health Care for a Changing Work Force

      Fixes

      Fixes looks at solutions to social problems and why they work.

      Sara Horowitz, the founder of the Freelancers Union, addressed union members at a 2009 forum on the issues affecting independent workers. Photo: Carolyn Silveira

      Big institutions are often slow to awaken to major social transformations. Microsoft was famously late to grasp the importance of the Internet. American auto manufacturers were slow to identify the demand for fuel-efficient cars. And today, the United States government is making a similar mistake: it still doesn’t seem to recognize that Americans no longer work the way they used to.

      Today, some 42 million people — about a third of the United States work force — do not have jobs in the traditional sense. They fall into a catchall category the government calls “contingent” workers. These people — independent contractors, freelancers, temp workers, part-timers, people between jobs — typically work on a project-to-project basis for a variety of clients, and most are outcasts from the traditional system of benefits that provide economic security to Americans. Even as the economy has changed, employment benefits are still based on an outdated industrial-era model in which workers are expected to stay with a single company for years, if not their whole careers.

      The industrial-era model of employer-based health care no longer applies.

      For most of the 20th century, it was efficient to link benefits to jobs this way. But today, more and more work falls outside the one-to-one, employee-to-employer relationship. Work is decentralized, workers are mobile, and working arrangements are fluid. However, the risks of life haven’t gone away: people still need protections. They just need a different system to distribute them. They need benefits that they can carry around, like their laptops. As things stand, millions of independent workers go without health and unemployment insurance, protection against discrimination and unpaid wages, and pension plans. It makes no sense.

      One of the social innovators to recognize this problem early and act on it was Sara Horowitz, the founder of the Freelancers Union, which has more than 165,000 members across all 50 states. At Fixes, we highlight practical applications of ideas that have the potential to achieve widespread impact. That means looking at how ideas take root in institutions that become part of the fabric of society.

      In the early 20th century, a landscape of new institutions — including the early labor unions and hundreds of civil society organizations like Rotary International, the Boy and Girl Scouts, and the N.A.A.C.P. — reshaped the American landscape. Today, the Freelancers Union offers a glimpse of the kind of social enterprise — mission-driven and pragmatic, market-savvy and cooperative — that is likely to proliferate in the coming years to meet the needs of a fast-changing work force and society.

      Horowitz had been a labor lawyer and union organizer when, in the early 1990s, she recognized that the number of people turning to independent work was on the rise. It was also clear that institutions had not yet been built to represent them in any meaningful way. (On one occasion, Horowitz found herself misclassified by an employer as an independent contractor — and quickly discovered that she received no job benefits.) Horowitz had the idea to create an organization to bring freelancers together so they could wield their power in the marketplace and in the political arena, much like AARP does for retirees.

      She quickly discovered that their biggest concern was the cost of health insurance. But there were other problems, too. Unlike traditional workers who receive unemployment benefits, independent contractors have to rely on their own resources to get through hard times. In 2009, Freelancers Union surveyed 3,000 members and found that more than 80 percent had gone jobless or underemployed during the year. More than 60 percent had used their credit cards or borrowed from friends and family to make ends meet, and 12 percent had to turn to food stamps. Close to 40 percent had given up, or downgraded, their health insurance protection.

      Another problem was getting paid. Some companies, like Time Inc., actually charge freelancers penalties if they request payment within 25 days. Freelancers Union found that 77 percent of its members had been cheated by a client during their careers and 40 percent had had trouble getting paid in 2009. The average wage loss was $6,000. The Department of Labor protects traditional workers from unpaid wages, but freelancers have no equivalent recourse. Then there were difficulties obtaining mortgages, the lack of access to 401(k) plans, and other issues.

      An insurance provider that stays viable by not seeking to maximize profits.

      Horowitz saw that she could attract a large membership if she could figure out how to provide health insurance at lower cost. Health insurance companies don’t have much love for freelancers. They prefer to serve large groups because it’s easier to deal with one corporate benefits manager than a multitude of individuals. And because insurers often lack reliable information about independent workers, they tend to assume that they are risky. As a result, premiums in the open marketplace for health insurance are higher and more volatile than those for employees. (The Affordable Care Act is designed to address this problem beginning in 2014 by subsidizing private insurance, but it applies only to people with low and moderate incomes.)

      Horowitz got the idea of grouping freelancers in New York State so they could purchase their health insurance together. It made sense in theory, but it had never been done. She worked closely with officials in Albany, notably Assemblyman Sheldon Silver, who was a strong ally, and Gregory Serio, the former superintendent of insurance for New York State, who had the authority to grant approval for “discretionary” insurance groups.

      “A lot of health insurers have looked at individual and sole proprietors as very expensive and risky to underwrite,” explained Serio. “Sara was able to foresee a trend [in the rise of independent work] before a lot of other people did. She went and found out that these people were not bad risks. Her creativity was in using existing concepts of insurance risk sharing and applying it to a community that has been ignored by the marketplace and, in fact, almost vilified by the marketplace.”

      Serio and Horowitz made an interesting team. “I was a conservative Republican from Nassau County working for George Pataki,” he told me. “And she was my liberal friend from Brooklyn.” But Serio found the idea of protecting freelancers appealing because his father had been a dentist who operated out of a second-floor walk-up office on Jamaica Avenue, in Woodhaven, Queens. “I grew up in a sole proprietor household,” he said. “If my father didn’t work, he didn’t get paid. And I knew what it was like seeing health insurance rates go up and up.”

      Today, the Freelancers Insurance Company (F.I.C.), which is wholly owned by the Freelancers Union (a nonprofit), has revenues of roughly $100 million and covers 25,000 independent workers and their family members in New York State, offering them premiums that the company calculates are more than a third below the open market rate. Close to 60 percent of its clients were previously uninsured or on COBRA (a temporary extension of their previous insurance). The renewal rate last year was 97 percent. (Disclosure: I have purchased health insurance from F.I.C. for a number of years.) The company was financed with $17 million in loans and grants from social investors, including the Rockefeller Foundation, the Robert Wood Johnson Foundation and the New York City Investment Fund. “Our freelancers have access to the best doctors and hospitals,” says Horowitz. “We have skilled human resource people, just like Fortune 500 companies. We’re able to watch out for our members.”

      How can the F.I.C. undercut market rates and still be a viable enterprise? The key is that while it seeks to be profitable, it does not seek to maximize profits. Its executives receive salaries that are below industry averages, and it has only one shareholder (the Freelancers Union) to satisfy. Those are fundamental differences. Silver, who is the speaker of the State Assembly, notes that the success of the F.I.C. makes it more difficult for traditional insurers to contend that they can’t deliver insurance at lower cost. “Duplicating the model and showing the ability of [the F.I.C.] to keep costs under control is something that we will be looking at,” he adds.

      Like many social goods, health insurance is often seen through a binary lens: either it must be handled by the government or it must be handled by the free market. But the F.I.C. is demonstrating that a middle way can work, too, and that it may be preferable to provide vital services like insurance through social-benefit companies, at least to certain customer groups. In fact, the Affordable Care Act has a provision to finance a new type of nonprofit health insurance company that would be run by its customers. It would be called a Consumer Operated and Oriented Plan (CO-OP). The Freelancers Union has proposed to establish CO-OPs in Florida, New Jersey, New York, Oregon and Washington.

      Because the F.I.C. has a close connection with freelancers, it can be more effective helping its members make good health care decisions. “We’re moving away from fee-for-service medicine to one where a primary care doctor aggressively coordinates care,” explains Horowitz. “We’re also trying to innovate with alternative care — promoting meditation, yoga, and nutrition which can have long-term beneficial effects.” In 2012, the organization will be opening up the Brooklyn Freelancers Medical Practice, a health center modeled on the medical-home approach and designed in partnership with a physician named Rushika Fernandopulle, who pioneered a team-based model of care that is attracting attention across the country.

      For now, the United States government doesn’t keep an accurate count of the independent work force. This is an oversight. It appears likely that this way of working will continue to grow. In cities with concentrations of knowledge workers, you find a proliferation of co-working spaces designed specifically for freelancers. And online marketplaces for freelancers like Etsy, oDesk and Elance are expanding rapidly.


      It’s not just hipsters who work like this. Forty-five percent of Freelancers Union members are over 40 years old. Not all follow this path by choice. Many freelancers are former employees, like journalists, who lost jobs. Recent college graduates, discovering that a degree is far from a job guarantee, are forced to be more entrepreneurial. And many companies, seeking to hold costs down, engage freelancers rather than hire full-time workers. All of these workers deserve the same protections accorded to others.

      “The industrial workers of the 20th century helped bring about the New Deal,” says Horowitz. “But the New Deal hasn’t evolved to include independent workers. I think this work force will help bring about the next New Deal — a framework of economic security that is parallel in its goals but led by a network of new institutions.”

      “The government can’t replace civil society,” she added. “So if the civil society organizations have control, it will be harder to have your benefits taken away — if you happen to lose an election.”

      On Wednesday, I’ll report on some of the other ways the Freelancers Union is helping to make independent work more secure. In the meantime, if you are a freelancer, or know someone who works this way, let us know about your experiences.
      Join Fixes on Facebook and follow updates on twitter.com/nytimesfixes.


      David Bornstein

      David Bornstein is the author of “How to Change the World,” which has been published in 20 languages, and “The Price of a Dream: The Story of the Grameen Bank,” and is co-author of “Social Entrepreneurship: What Everyone Needs to Know.” He is the founder of dowser.org, a media site that reports on social innovation.


      Buddhist monks performing as part of “Treasures From Korea: Arts and Culture of the Joseon Dynasty, 1392-1910,” at the Philadelphia Museum of Art. Credit: via Philadelphia Museum of Art

      One afternoon in December, the Arts of Korea Gallery at the Metropolitan Museum of Art was abuzz, as dignitaries from the South Korean government inspected the display, while members of the press watched. Then, Oh Seung-je, the director of the Korean Cultural Service of New York, and Daniel H. Weiss, the Met’s president, ceremonially signed a pact establishing a long-term partnership between the Met and South Korea’s Ministry of Culture, Sports and Tourism.

      Along with the agreement came a $1 million gift to the Met from South Korea. It will pay for enhancement of the gallery, loans from Korean museums, a major exhibition of Korean art in 2018, new research, and residencies at the Met for curators from the National Museum of Korea.

      It was far from the first time the South Korean government or its affiliates, in tandem with Korean corporations, had rained money on American museums to ensure that Korean art was seen here. The Met’s Arts of Korea Gallery was created in 1998 with a gift from the Korea Foundation, an independent organization affiliated with South Korea’s Ministry of Foreign Affairs.

      Other countries, including Italy, Japan and the Netherlands, promote their visual culture in the United States — sponsoring museum exhibitions, lending valuable artworks and so on. What is striking about South Korea is that it has systematically invested in building and maintaining permanent places to see Korean art at American museums, and in underwriting exhibitions that promote the country’s ancient and contemporary art in the United States.


      “Since the American public only has rather limited opportunities to view and appreciate Korean artworks, concerted efforts are necessary to bring attention to the richness of Korea’s culture and arts,” Yoon Keum-jin, the executive vice president in the Korea Foundation’s Washington office, wrote in an email.

      Since its founding 25 years ago, the Korea Foundation alone has midwifed the birth of permanent Korean art galleries at 18 American museums, including the Museum of Fine Arts, Houston; the Cleveland Museum of Art; the Seattle Art Museum; the Los Angeles County Museum of Art; and the Museum of Fine Arts, Boston. It has spent more than $9 million to construct these galleries, Ms. Yoon said. Without such funding, the museums say, many of these galleries would probably not exist.


      Reasons for this effort are both political and cultural. “The Koreans have a tradition of centralized planning and the government being involved in long-term economic planning and investment,” said Timothy F. Rub, director of the Philadelphia Museum of Art, which received a grant from the Korea Foundation to upgrade its Korean gallery, along with other aid over the years. “They see the promulgation of culture as an instrument of economic policy.”

      They also perceive a real need. “Many are aware of Korea through the rapid development of its technology and electronics sectors, but it is a country equally abundant in the cultural arts,” said Mr. Oh of the Korean Culture Service.

      Historically, Mr. Rub added, Korea’s culture has been overshadowed. “Look at the collections in the United States of East Asian art. Japan and China are far more prominent than Korea.” Yet, he said, “look at the history of Korean art; it was very much prized by the Japanese and the Chinese.”

      Korea Foundation Day at the Museum of Fine Arts, Boston in 2012. Credit: Museum of Fine Arts, Boston

      For museums in the Korean fold, the support can be steady. Soon after the Houston museum received $470,000 to build its gallery, the Korea Foundation provided $30,000 for programming. In 2010 it gave the museum $150,000 to organize and present “Your Bright Future: 12 Contemporary Artists From Korea,” the first major American museum exhibition of contemporary Korean art in years. Since then, the Korea Foundation has donated more money for programming of Korean art.

      The Los Angeles County Museum of Art, which was a co-organizer of “Your Bright Future,” also received money from the Korea Foundation for the show, which was sponsored by a Korean container company.

      The Koreans have also proved receptive to American ideas. Some years ago, Mr. Rub approached the National Museum of Korea with the notion of showcasing the art of the Joseon dynasty, which ruled Korea from 1392 to 1910. That turned into a cultural exchange that included the Museum of Fine Arts, Houston, and the Los Angeles County Museum of Art. They each presented the Korean exhibition and sent an exhibition of American art to South Korea.

      South Korea also supported a series of workshops between 1999 and 2013. The Korea Foundation hosted curators from 26 countries, who traveled to South Korea to hear art experts, take field trips to cultural sites and learn technical information about Korean art and its proper display.

      Despite the current economic uncertainties that have prompted many governments to cut their funding of the arts, Mr. Oh said that South Korea would continue its overseas largess. “I believe that economic prosperity and cultural wealth go hand in hand,” he said in an email. “This is why it is important to even further promote the cultural arts during times of economic slowdown.”

      What’s more, he added, South Korea’s president, Park Geun-hye, has made cultural enrichment one of her major priorities.

      +
      + + + + + + + + + + + + + + diff --git a/test/helper_text.exs b/test/helper_text.exs deleted file mode 100644 index 57aced2..0000000 --- a/test/helper_text.exs +++ /dev/null @@ -1,31 +0,0 @@ -defmodule Readability.HelperTest do - use ExUnit.Case, async: true - - import Readability, only: :functions - alias Readability.Helper - - @sample """ - - -

      - a -

      - abc -

      -

      -

      - b -

      - - - """ - - test "change font tag to span" do - expectred = @sample - |> String.replace(~r/font/, "span") - |> Floki.parse - - result = Helper.change_tag(parse(@sample), "font", "span") - assert expectred == result - end -end diff --git a/test/readability/candidate/_builder.exs b/test/readability/candidate/_builder.exs new file mode 100644 index 0000000..961b7a5 --- /dev/null +++ b/test/readability/candidate/_builder.exs @@ -0,0 +1,53 @@ +defmodule Readability.Candidate.BuilderTest.A do + use ExUnit.Case, async: true + import Readability, only: [parse: 1] + alias Readability.Candidate.Builder + + doctest Readability + + @sample """ +
      +
      +

      + Elixir is a dynamic, functional language designed for building scalable and maintainable applications. +

      +
      +
    +
    +
    +
    +

    + Elixir leverages the Erlang VM, known for running low-latency, distributed and fault-tolerant systems, while also being successfully used in web development and the embedded software domain. +

    +
    +
    +
    +
    + + not p, td node + +
    + + """ + + test "build candidate" do + candidates = Builder.build(parse(@sample)) + expected = parse(@sample) |> Floki.find(".candidate") |> length + assert length(candidates) == expected + + result = candidates + |> Enum.all?(fn(cand) -> + attrs = elem(cand.html_tree, 1) + "candidate" == attrs + |> List.keyfind("class", 0, {"", ""}) + |> elem(1) + end) + assert result == true + end + + test "sample" do + candidates = Builder.build(parse(@sample)) + end +end diff --git a/test/content_finder_test.ex b/test/readability/candidate/_finder.ex similarity index 68% rename from test/content_finder_test.ex rename to test/readability/candidate/_finder.ex index 73061e3..c1b51ac 100644 --- a/test/content_finder_test.ex +++ b/test/readability/candidate/_finder.ex @@ -1,8 +1,11 @@ -defmodule Readability.ContentFinderTest do +defmodule Readability.Candidate.FinderTest.A do use ExUnit.Case, async: true - doctest Readability.ContentFinder + doctest Readability.Candidate.Finder + alias Readability.Candidate.Finder + alias Readability.Candidate.MisusedTrasformer + alias Readability.Candidate.UnlikelyCandidatesRemover @unlikey_sample """ @@ -19,7 +22,7 @@ defmodule Readability.ContentFinderTest do expected = {"html", [], [ {"body", [], [ {"article", [{"class", "community"}], ["ARTICLE"]} ]} ]} result = @unlikey_sample |> Readability.parse - |> Readability.ContentFinder.remove_unlikely_candidates + |> UnlikelyCandidatesRemover.remove assert expected == result end @@ -53,10 +56,19 @@ defmodule Readability.ContentFinderTest do result = @misused_sample |> Readability.parse - |> Readability.ContentFinder.transform_misused_divs_into_paragraphs + |> MisusedTrasformer.transform assert expected == result end + @candidate_sample [{"div", + [], + [{"p", [], ["12345678901234567890123456"]}, + {"p", [], ["12345678901234567890123456"]} + ] + },{"div" + + }] + def read_html(name) do {:ok, body} = File.read("./test/fixtures/#{name}.html") diff --git a/test/readability/candidate/cleaner_test.exs b/test/readability/candidate/cleaner_test.exs new file mode 100644 index 0000000..28e113b --- /dev/null +++ b/test/readability/candidate/cleaner_test.exs @@ -0,0 +1,59 @@ +defmodule Readability.Candidate.CleanerTest do + use ExUnit.Case, async: true + + doctest Readability.Candidate.Cleaner + + alias Readability.Candidate.Cleaner + + @sample """ + + + title! + + +
    +

    a comment

    +
    real content
    +
    something in a table
    +
    + + + """ + + setup do + html_tree = Readability.parse(@sample) + {:ok, html_tree: html_tree} + end + + ### Transform misued div + + test "transform divs containing no block elements", %{html_tree: html_tree} do + html_tree = Cleaner.transform_misused_div_to_p(html_tree) + [{tag, _, _}|_] = html_tree |> Floki.find("#body") + + assert tag == "p" + end + + test "not transform divs that contain block elements", %{html_tree: html_tree} do + html_tree = Cleaner.transform_misused_div_to_p(html_tree) + [{tag, _, _}|_] = html_tree |> Floki.find("#contains_blockquote") + assert tag == "div" + end + + ### Remove unlikely tag + + test "remove things that have class comment", %{html_tree: html_tree} do + html_tree = Cleaner.remove_unlikely_tree(html_tree) + refute Floki.text(html_tree) =~ ~r/a comment/ + end + + test "not remove body tags", %{html_tree: html_tree} do + html_tree = Cleaner.remove_unlikely_tree(html_tree) + Floki.find(html_tree, "body") == [] + end + + test "not remove body tags", %{html_tree: html_tree} do + html_tree = Cleaner.remove_unlikely_tree(html_tree) + assert Floki.text(html_tree) =~ ~r/real content/ + end +end diff --git a/test/readability/helper_test.exs b/test/readability/helper_test.exs new file mode 100644 index 0000000..c25ba7b --- /dev/null +++ b/test/readability/helper_test.exs @@ -0,0 +1,48 @@ +defmodule Readability.HelperTest do + use ExUnit.Case, async: true + + import Readability, only: [parse: 1] + alias Readability.Helper + + @sample """ + + +

    + a +

    + abc +

    +

    +

    + b +

    + + + """ + + setup do + html_tree = Readability.parse(@sample) + {:ok, html_tree: html_tree} + end + + test "change font tag to span", %{html_tree: html_tree} do + expectred = @sample |> String.replace(~r/font/, "span") |> parse + result = Helper.change_tag(html_tree, "font", "span") + assert result == expectred + end + + test "remove tag", %{html_tree: html_tree} do + expected = "" |> parse + result = html_tree + |> Helper.remove_tag(fn({tag, _, _}) -> + tag == "p" + end) + + assert result == expected + end + + test "inner text lengt", %{html_tree: html_tree} do + result = html_tree |> Helper.text_length + assert result == 5 + end +end diff --git a/test/title_finder_test.exs b/test/readability/title_finder_test.exs similarity index 100% rename from test/title_finder_test.exs rename to test/readability/title_finder_test.exs index ea5098d..af724ba 100644 --- a/test/title_finder_test.exs +++ b/test/readability/title_finder_test.exs @@ -18,6 +18,11 @@ defmodule Readability.TitleFinderTest do """ + test "extract most proper title" do + title = Readability.TitleFinder.title(@html) + assert title == "og title" + end + test "extract og title" do title = Readability.TitleFinder.og_title(@html) assert title == "og title" @@ -37,9 +42,4 @@ defmodule Readability.TitleFinderTest do title = Readability.TitleFinder.h_tag_title(@html, "h2") assert title == "h2 title" end - - test "extract most proper title" do - title = Readability.TitleFinder.title(@html) - assert title == "og title" - end end diff --git a/test/readability_test.exs b/test/readability_test.exs index 7623e59..82af7c9 100644 --- a/test/readability_test.exs +++ b/test/readability_test.exs @@ -1,8 +1,34 @@ defmodule ReadabilityTest do - use ExUnit.Case - doctest Readability + use ExUnit.Case, async: true - test "the truth" do - assert 1 + 1 == 2 + @fixtures_path "./test/fixtures/" + + test "readability for NY Times" do + {:ok, nytimes} = File.read(@fixtures_path <> "nytimes.html") + opts = [clean_conditionally: false] + nytimes = Readability.content(nytimes, opts) + + nytimes_html = Readability.raw_html(nytimes) + assert nytimes_html =~ ~r/^
    / + assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/ + + nytimes_text = Readability.readabl_text(nytimes) + assert nytimes_text =~ ~r/^Buddhist monks performing as part of/ + assert nytimes_text =~ ~r/one of her major priorities.$/ + end + + test "readability for BBC" do + %{status_code: 200, body: body} = HTTPoison.get!("http://www.bbc.com/news/business-36108166") + Readability.content(body) |> Readability.readabl_text + end + + test "readability for medium" do + %{status_code: 200, body: body} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58#.d0xmzfd15") + IO.inspect Readability.content(body) |> Readability.readabl_text + end + + test "readability for buzzfeed" do + %{status_code: 200, body: body} = HTTPoison.get!("http://www.buzzfeed.com/salvadorhernandez/fbi-obtains-passcode-to-iphone-in-new-york-drops-case-agains#.koMMa21lj8") + IO.inspect Readability.content(body) |> Readability.readabl_text end end
    + too short content +
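
      For reference, a minimal usage sketch (not part of the patch) assembled from the calls exercised in test/readability_test.exs above; it assumes Readability.content/2, Readability.raw_html/1 and Readability.readabl_text/1 are available with the signatures the tests rely on.

      # Hedged sketch: function names are taken from the tests above, not confirmed against lib/.
      {:ok, html} = File.read("./test/fixtures/nytimes.html")

      # Extract the main article, skipping the conditional cleaning pass as the NYT fixture test does.
      article = Readability.content(html, clean_conditionally: false)

      article_html = Readability.raw_html(article)      # extracted article as an HTML string
      article_text = Readability.readabl_text(article)  # extracted article as plain text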