add candidate builder

add test
2016-04-17 21:26:51 +09:00 · 2016-04-17 21:26:51 +09:00 · b131d7effa
parent 4e4a712718
commit b131d7effa
26 changed files with 4592 additions and 4661 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,3 +3,4 @@
 /deps
 erl_crash.dump
 *.ez
+.credo.exs
--- a/lib/document.ex
+++ b/lib/document.ex
@ -1,58 +0,0 @@
-defmodule Readability.Document do
-  @default_options  [retry_length: 250,
-                     min_text_length: 25,
-                     remove_unlikely_candidates: true,
-                     weight_classes: true,
-                     clean_conditionally: true,
-                     remove_empty_nodes: true,
-                     min_image_width: 130,
-                     min_image_height: 80,
-                     ignore_image_format: [],
-                     blacklist: nil,
-                     whitelist: nil
-                   ]
-
-  @regexes [ unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
-             okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
-             positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
-             negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
-             divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
-             replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
-             replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
-             trimRe: ~r/^\s+|\s+$/,
-             normalizeRe: ~r/\s{2,}/,
-             killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
-             videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
-           ]
-
-  def html do
-    page
-    |> String.replace(@regexes[:replaceBrsRe], "</p><p>")
-    |> String.replace(@regexes[:replaceFontsRe], "<\1span>")
-    |> Floki.find("html")
-    |> Floki.filter_out(:comment)
-  end
-
-  def title do
-    html |> Floki.find("title") |> Floki.text
-  end
-
-  def content do
-    html
-    |> Floki.filter_out("script")
-    |> Floki.filter_out("style")
-  end
-
-  def page do
-    {:ok, f} = File.read("test/features/nytimes.html")
-    f
-  end
-
-  def default_options do
-    @default_options
-  end
-
-  def regexes do
-    @regexes
-  end
-end
--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -1,10 +1,86 @@
 defmodule Readability do
+  @moduledoc """
+  Entry points for extracting the readable part of an HTML page.
+
+  `title/1` finds the page title, `content/2` builds the article tree,
+  `parse/1` turns raw HTML into an html tree, and `raw_html/1` and
+  `readabl_text/1` render a tree back to HTML or to plain text.
+  """
+
  alias Readability.TitleFinder
+  alias Readability.ArticleBuilder
+
+  # Defaults that `content/2` merges caller-supplied options over.
+  # NOTE(review): in the code visible in this change only :retry_length,
+  # :min_text_length, :remove_unlikely_candidates, :weight_classes and
+  # :clean_conditionally are read; the remaining keys look unused so far.
+  @default_options [retry_length: 250,
+                    min_text_length: 25,
+                    remove_unlikely_candidates: true,
+                    weight_classes: true,
+                    clean_conditionally: true,
+                    remove_empty_nodes: true,
+                    min_image_width: 130,
+                    min_image_height: 80,
+                    ignore_image_format: [],
+                    blacklist: nil,
+                    whitelist: nil
+                   ]
+
+  # Patterns shared with the other Readability modules through `regexes/0`.
+  # The first four are matched against tag names and id/class values; the
+  # replace_* and normalize patterns are applied to raw HTML in `parse/1`.
+  @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
+            ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
+            positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+            negative: ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
+            div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
+            replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
+            replace_fonts: ~r/<(\/?)font[^>]*>/i,
+            normalize: ~r/\s{2,}/,
+            video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i
+           ]

  @type html_tree :: tuple | list
+  @type options :: list

-  def title(html) when is_binary(html), do: parse(html) |> title
+  def title(html) when is_binary(html), do: html |> parse |> title
  def title(html_tree), do: TitleFinder.title(html_tree)

-  def parse(raw_html), do: Floki.parse(raw_html)
+  @doc """
+  Using a variety of metrics (content score, classname, element types), find the content that is
+  most likely to be the stuff a user wants to read.
+
+  `opts` is merged over the module defaults, so callers only pass the keys they
+  want to override. The raw HTML is parsed with `parse/1` and handed to
+  `Readability.ArticleBuilder.build/2`. The result is an html tree, not a binary;
+  render it with `raw_html/1` or `readabl_text/1`.
+  """
+  @spec content(binary, options) :: html_tree
+  def content(raw_html, opts \\ @default_options) do
+    opts = Keyword.merge(@default_options, opts)
+
+    raw_html
+    |> parse
+    |> ArticleBuilder.build(opts)
+  end
+
+  @doc """
+  Normalizes raw HTML and parses it into an html tree (tuple or list).
+
+  Runs of two or more `<br>` tags become paragraph breaks, `<font>` tags become
+  `<span>` tags, runs of whitespace collapse to a single space, and comment
+  nodes are filtered out of the parsed tree.
+  """
+  @spec parse(binary) :: html_tree
+  def parse(raw_html) do
+    raw_html
+    |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
+    # "\\1" is the backreference to the optional "/" captured by the regex, so
+    # "<font ...>" becomes "<span>" and "</font>" becomes "</span>". A single
+    # backslash ("\1") is just the character "1" in an Elixir string.
+    |> String.replace(Readability.regexes[:replace_fonts], "<\\1span>")
+    |> String.replace(Readability.regexes[:normalize], " ")
+    |> Floki.parse
+    |> Floki.filter_out(:comment)
+  end
+
+  @doc """
+  Renders an html tree back into a raw HTML binary.
+  """
+  @spec raw_html(html_tree) :: binary
+  def raw_html(html_tree), do: Floki.raw_html(html_tree)
+
+  @doc """
+  Returns only the text of an html tree.
+
+  A newline is inserted in front of every closing `p`, `div`, `article` and
+  heading tag before the text is extracted, so block elements end up on
+  separate lines.
+  """
+  @spec readabl_text(html_tree) :: binary
+  def readabl_text(html_tree) do
+    tags_to_br = ~r/<\/(p|div|article|h\d)/i
+    html_str = html_tree |> raw_html
+
+    tags_to_br
+    |> Regex.replace(html_str, &("\n#{&1}"))
+    |> Floki.parse
+    |> Floki.text
+  end
+
+  @doc """
+  Returns the keyword list of regexes defined in this module.
+  """
+  def regexes, do: @regexes
+
+  @doc """
+  Returns the default option keyword list that `content/2` merges options over.
+  """
+  def default_options, do: @default_options
 end
--- a/lib/readability/article_builder.ex
+++ b/lib/readability/article_builder.ex
@ -0,0 +1,100 @@
+defmodule Readability.ArticleBuilder do
+  @moduledoc """
+  build article for readability
+  """
+
+  alias Readability.Helper
+  alias Readability.Sanitizer
+  alias Readability.Candidate
+  alias Readability.CandidateFinder
+  alias Readability.Candidate.Cleaner
+  alias Readability.Candidate.Scoring
+
+  @type html_tree :: tuple | list
+  @type options :: list
+
+  @doc """
+  Prepare the article node for display.
+  Clean out any inline styles, iframes, forms, strip extraneous <p> tags, etc.
+
+  Steps: strip `script`/`style` tags, optionally drop unlikely nodes, turn
+  misused `div`s into `p`s, score candidates, assemble the article and sanitize
+  it. When the resulting text is shorter than `opts[:retry_length]`, the build
+  is retried on the original tree with the next option relaxed, until no option
+  is left to relax.
+  """
+  @spec build(html_tree, options) :: html_tree
+  def build(html_tree, opts) do
+    origin_tree = html_tree
+
+    html_tree =
+      Helper.remove_tag(html_tree, fn({tag, _, _}) ->
+        Enum.member?(["script", "style"], tag)
+      end)
+
+    # Bind the value of the `if` itself: a rebinding made inside the branch is
+    # not visible after the block.
+    html_tree =
+      if opts[:remove_unlikely_candidates] do
+        Cleaner.remove_unlikely_tree(html_tree)
+      else
+        html_tree
+      end
+
+    html_tree = Cleaner.transform_misused_div_to_p(html_tree)
+
+    candidates = CandidateFinder.find(html_tree, opts)
+    article = find_article(candidates, html_tree)
+
+    sanitized = Sanitizer.sanitize(article, candidates, opts)
+
+    # nil when the article is long enough or when nothing is left to relax.
+    retry_opts =
+      if Helper.text_length(sanitized) < opts[:retry_length], do: next_try_opts(opts)
+
+    if retry_opts, do: build(origin_tree, retry_opts), else: sanitized
+  end
+
+  # Switches off the first still-enabled option, checked in the order
+  # :remove_unlikely_candidates, :weight_classes, :clean_conditionally.
+  # Returns nil once all three are already off.
+  defp next_try_opts(opts) do
+    relaxable = [:remove_unlikely_candidates, :weight_classes, :clean_conditionally]
+
+    case Enum.find(relaxable, &(opts[&1])) do
+      nil -> nil
+      key -> Keyword.put(opts, key, false)
+    end
+  end
+
+  # Wraps the article trees of the best candidate in a single <div>.
+  # When no candidate was found, the first <body> node stands in as the best
+  # candidate (with the default score and depth of the struct).
+  defp find_article(candidates, html_tree) do
+    best_candidate =
+      case CandidateFinder.find_best_candidate(candidates) do
+        nil ->
+          body = html_tree |> Floki.find("body") |> hd
+          %Candidate{html_tree: body}
+        candidate ->
+          candidate
+      end
+
+    {"div", [], find_article_trees(best_candidate, candidates)}
+  end
+
+  # Collects the trees that make up the article: candidates at the same depth
+  # as the best candidate that are the best candidate itself, score at least
+  # max(10, 20% of the best score), or pass `append?/1`.
+  defp find_article_trees(best_candidate, candidates) do
+    score_threshold = Enum.max([10, best_candidate.score * 0.2])
+
+    candidates
+    |> Enum.filter(&(&1.tree_depth == best_candidate.tree_depth))
+    |> Enum.filter(fn(candidate) ->
+         candidate == best_candidate
+         || candidate.score >= score_threshold
+         || append?(candidate)
+       end)
+    |> Enum.map(&(to_article_tag(&1.html_tree)))
+  end
+
+  # A <p> candidate is appended when it is long with few links, or short,
+  # link-free and contains a full stop followed by a space or the end of the text.
+  defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
+    density = Scoring.calc_link_density(html_tree)
+    text = Floki.text(html_tree)
+    text_size = String.length(text)
+
+    cond do
+      text_size > 80 && density < 0.25 -> true
+      text_size < 80 && density == 0 -> text =~ ~r/\.( |$)/
+      true -> false
+    end
+  end
+  defp append?(_), do: false
+
+  # Keeps <p> and <div> nodes as they are; any other tag is renamed to "div".
+  defp to_article_tag({tag, attrs, inner_tree} = html_tree) do
+    keep_tag? = Regex.match?(~r/^p$|^div$/, tag)
+    if keep_tag?, do: html_tree, else: {"div", attrs, inner_tree}
+  end
+end
--- a/lib/readability/candidate.ex
+++ b/lib/readability/candidate.ex
@ -0,0 +1,6 @@
+defmodule Readability.Candidate do
+  @moduledoc """
+  A node that can be (part of) the article, together with its score and its
+  depth in the html tree.
+  """
+  defstruct html_tree: {}, score: 0, tree_depth: 0
+
+  # Referenced as `Candidate.t` by the specs of the finder and sanitizer modules.
+  @type t :: %__MODULE__{html_tree: tuple | list, score: number, tree_depth: non_neg_integer}
+end
--- a/lib/readability/candidate/cleaner.ex
+++ b/lib/readability/candidate/cleaner.ex
@ -0,0 +1,48 @@
+defmodule Readability.Candidate.Cleaner do
+  @moduledoc """
+  Clean html tree for prepare candidates.
+  It transforms misused tags and removes unlikely candidates.
+  """
+
+  alias Readability.Helper
+
+  @type html_tree :: tuple | list
+
+  @doc """
+  Transform misused <div>s, i.e. those that do not contain other block elements, into <p>s.
+
+  Walks the whole tree; text nodes and empty lists are returned unchanged.
+  """
+  @spec transform_misused_div_to_p(html_tree) :: html_tree
+  def transform_misused_div_to_p(content) when is_binary(content), do: content
+  def transform_misused_div_to_p([]), do: []
+  def transform_misused_div_to_p([h|t]) do
+    [transform_misused_div_to_p(h)|transform_misused_div_to_p(t)]
+  end
+  def transform_misused_div_to_p({tag, attrs, inner_tree}) do
+    # Bind the value of the `if`: a rebinding inside the branch would be lost.
+    tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
+    {tag, attrs, transform_misused_div_to_p(inner_tree)}
+  end
+
+  @doc """
+  Removes every node that `unlikely_tree?/1` flags as an unlikely candidate.
+  """
+  @spec remove_unlikely_tree(html_tree) :: html_tree
+  def remove_unlikely_tree(html_tree) do
+    html_tree |> Helper.remove_tag(&unlikely_tree?/1)
+  end
+
+  # A <div> is misused when its inner html contains none of the block-level
+  # tags listed in the :div_to_p_elements regex. Other tags are never misused.
+  defp misused_divs?("div", inner_tree) do
+    inner_html = Floki.raw_html(inner_tree)
+    not Regex.match?(Readability.regexes[:div_to_p_elements], inner_html)
+  end
+  defp misused_divs?(_, _), do: false
+
+  # True when the tag name plus its id/class values match the unlikely-candidate
+  # regex without matching the "maybe a candidate" regex. <html> is never removed.
+  defp unlikely_tree?({tag, attrs, _}) do
+    # NOTE(review): the attribute-name regex is unanchored, so it also selects
+    # attributes whose name merely contains "id" or "class" (e.g. "width").
+    idclass_str = attrs
+                  |> Enum.filter(&(elem(&1, 0)  =~ ~r/id|class/i))
+                  |> Enum.map(&(elem(&1, 1)))
+                  |> Enum.join("")
+    str = tag <> idclass_str
+
+    str =~ Readability.regexes[:unlikely_candidate]
+      && !(str =~ Readability.regexes[:ok_maybe_its_a_candidate])
+      && tag != "html"
+  end
+end
--- a/lib/readability/candidate/scoring.ex
+++ b/lib/readability/candidate/scoring.ex
@ -0,0 +1,89 @@
+defmodule Readability.Candidate.Scoring do
+  @moduledoc """
+  Score html tree
+  """
+  alias Readability.Helper
+
+  @element_scores %{"div" => 5,
+                    "blockquote" => 3,
+                    "form" => -3,
+                    "th" => -5
+                  }
+
+  @type html_tree :: tuple | list
+  @type options :: list
+
+  @doc """
+  Scores an html tree from its own tag and attributes, the content of its
+  children and grandchildren, and its link density.
+  options -> weight_classes :: boolean, include the class/id weight
+  """
+  @spec calc_score(html_tree, options) :: number
+  def calc_score(html_tree, opts \\ []) do
+    content_score =
+      calc_node_score(html_tree, opts) +
+        calc_children_content_score(html_tree) +
+        calc_grand_children_content_score(html_tree)
+
+    content_score * (1 - calc_link_density(html_tree))
+  end
+
+  # Base score of 1, plus the number of comma-separated parts of the text,
+  # plus one point per 100 characters capped at 3.
+  defp calc_content_score(html_tree) do
+    text = Floki.text(html_tree)
+    comma_parts = text |> String.split(",") |> length
+    length_bonus = Enum.min([String.length(text) / 100, 3])
+
+    1 + comma_parts + length_bonus
+  end
+
+  # Score of a node from its tag (see @element_scores) plus, when the
+  # :weight_classes option is set, the weight of its class and id attributes.
+  # A list of nodes scores the sum of its members.
+  defp calc_node_score({tag, attrs, _}, opts) do
+    # Bind the value of the `if`: a rebinding inside the branch would be lost.
+    class_score = if opts[:weight_classes], do: class_weight(attrs), else: 0
+    class_score + (@element_scores[tag] || 0)
+  end
+  defp calc_node_score([h|t], opts) do
+    calc_node_score(h, opts) + calc_node_score(t, opts)
+  end
+  defp calc_node_score([], _), do: 0
+
+  @doc """
+  Weighs the `class` and `id` attribute values of a node.
+
+  Each value adds 25 when it matches the positive regex and subtracts 25 when
+  it matches the negative regex, so the result lies between -50 and 50.
+  Missing attributes count as empty strings.
+  """
+  def class_weight(attrs) do
+    class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1)
+    id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1)
+
+    # Accumulate through a reduce instead of rebinding `weight` inside `if`
+    # blocks, where the rebinding would not be visible afterwards.
+    [{class, :positive, 25}, {id, :positive, 25}, {class, :negative, -25}, {id, :negative, -25}]
+    |> Enum.reduce(0, fn({value, kind, delta}, weight) ->
+         if value =~ Readability.regexes[kind], do: weight + delta, else: weight
+       end)
+  end
+
+  @doc """
+  Ratio of the text length inside `a` tags to the total text length of the tree.
+  Returns 0 when the tree has no text.
+  """
+  def calc_link_density(html_tree) do
+    link_length = html_tree |> Floki.find("a") |> Floki.text |> String.length
+    text_length = html_tree |> Floki.text |> String.length
+
+    case text_length do
+      0 -> 0
+      _ -> link_length / text_length
+    end
+  end
+
+  # Content score of the direct children that are candidate tags.
+  defp calc_children_content_score({_, _, children_tree}) do
+    scorable =
+      for child <- children_tree, is_tuple(child) && Helper.candidate_tag?(child), do: child
+
+    calc_content_score(scorable)
+  end
+
+  # Half of the content score of the grandchildren that are candidate tags.
+  defp calc_grand_children_content_score({_, _, children_tree}) do
+    grand_children = children_tree
+                     |> Enum.filter(&is_tuple(&1))
+                     |> Enum.map(&elem(&1, 2))
+                     |> List.flatten
+                     |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
+
+    calc_content_score(grand_children) / 2
+  end
+end
--- a/lib/readability/candidate_finder.ex
+++ b/lib/readability/candidate_finder.ex
@ -0,0 +1,60 @@
+defmodule Readability.CandidateFinder do
+  @moduledoc """
+  The building and finding candidates engine
+  It traverses the HTML tree searching, removing, scoring nodes
+  """
+
+  alias Readability.Helper
+  alias Readability.Candidate
+  alias Readability.Candidate.Scoring
+
+  @type html_tree :: tuple | list
+  @type options :: list
+
+  @doc """
+  Finds the nodes that could be a meaningful part of the article.
+
+  Walks the tree and returns a flat list with one scored `Candidate` for every
+  node that passes `candidate?/1`, recording the depth at which it was found.
+  """
+  @spec find(html_tree, options, number) :: [Candidate.t]
+  def find(_, opts \\ [], tree_depth \\ 0)
+  def find([], _, _), do: []
+  def find(text, _, _) when is_binary(text), do: []
+  def find([h|t], opts, tree_depth) do
+    List.flatten([find(h, opts, tree_depth) | find(t, opts, tree_depth)])
+  end
+  def find({_, _, inner_tree} = html_tree, opts, tree_depth) do
+    nested = find(inner_tree, opts, tree_depth + 1)
+
+    if candidate?(html_tree) do
+      scored = %Candidate{html_tree: html_tree,
+                          score: Scoring.calc_score(html_tree, opts),
+                          tree_depth: tree_depth}
+      [scored | nested]
+    else
+      nested
+    end
+  end
+
+  @doc """
+  Find the highest score candidate.
+
+  Returns `nil` when the candidate list is empty.
+  """
+  @spec find_best_candidate([Candidate.t]) :: Candidate.t | nil
+  def find_best_candidate([]), do: nil
+  def find_best_candidate(candidates) do
+    Enum.max_by(candidates, fn(candidate) -> candidate.score end)
+  end
+
+  # A tree is a candidate when it, or a descendant at most two levels below it,
+  # is a candidate tag. Text nodes and empty lists never qualify.
+  defp candidate?(_, depth \\ 0)
+  defp candidate?(_, depth) when depth > 2, do: false
+  defp candidate?([], _), do: false
+  defp candidate?(text, _) when is_binary(text), do: false
+  defp candidate?([h|t], depth), do: candidate?(h, depth) || candidate?(t, depth)
+  defp candidate?({_, _, inner_tree} = html_tree, depth) do
+    Helper.candidate_tag?(html_tree) || candidate?(inner_tree, depth + 1)
+  end
+end
--- a/lib/readability/content_finder.ex
+++ b/lib/readability/content_finder.ex
@ -1,94 +0,0 @@
-defmodule Readability.ContentFinder do
-  @moduledoc """
-  ContentFinder uses a variety of metrics for finding the content
-  that is most likely to be the stuff a user wants to read.
-  Then return it wrapped up in a div.
-  """
-
-  @regexes [ unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
-             okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
-             positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
-             negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
-             divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
-             replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
-             replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
-             trimRe: ~r/^\s+|\s+$/,
-             normalizeRe: ~r/\s{2,}/,
-             killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
-             videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
-           ]
-
-  @type html_tree :: tuple | list
-
-  @spec content(html_tree) :: html_tree
-
-  def content(html_tree, options \\ []) do
-    candidate = html_tree
-                |> preapre_cadidates
-
-    best_candidate = candidate
-                     |> select_best_candidate
-
-    candidate
-    |> fix_relative_uris
-  end
-
-  defp preapre_cadidates(html_tree) do
-    html_tree
-    |> Floki.filter_out("script")
-    |> Floki.filter_out("style")
-    |> remove_unlikely_candidates
-    |> transform_misused_divs_into_paragraphs
-  end
-
-  @doc """
-  Remove unlikely tag nodes
-  """
-
-  @spec remove_unlikely_candidates(html_tree) :: html_tree
-
-  def remove_unlikely_candidates(content) when is_binary(content), do: content
-  def remove_unlikely_candidates([]), do: []
-  def remove_unlikely_candidates([h|t]) do
-    case remove_unlikely_candidates(h) do
-      nil -> remove_unlikely_candidates(t)
-      html_tree -> [html_tree|remove_unlikely_candidates(t)]
-    end
-  end
-  def remove_unlikely_candidates({tag_name, attrs, inner_tree}) do
-    cond do
-      unlikely_candidate?(tag_name, attrs) -> nil
-      true -> {tag_name, attrs, remove_unlikely_candidates(inner_tree)}
-    end
-  end
-  defp unlikely_candidate?(tag_name, attrs) do
-    idclass_str = attrs
-                  |> Enum.filter_map(fn(attr) -> elem(attr, 0) =~ ~r/id|class/i end,
-                                     fn(attr) -> elem(attr, 1) end)
-                  |> Enum.join("")
-    str = tag_name <> idclass_str
-    str =~ @regexes[:unlikelyCandidatesRe] && !(str =~ @regexes[:okMaybeItsACandidateRe]) && tag_name != "html"
-  end
-
-  def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content
-  def transform_misused_divs_into_paragraphs([]), do: []
-  def transform_misused_divs_into_paragraphs([h|t]) do
-    [transform_misused_divs_into_paragraphs(h)|transform_misused_divs_into_paragraphs(t)]
-  end
-  def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree} = html_tree) do
-    if misused_divs?(tag_name, inner_tree), do: tag_name = "p"
-    {tag_name, attrs, transform_misused_divs_into_paragraphs(inner_tree)}
-  end
-  defp misused_divs?("div", inner_tree) do
-    !(Floki.raw_html(inner_tree) =~ @regexes[:divToPElementsRe])
-  end
-  defp misused_divs?(_, _), do: false
-
-  defp select_best_candidate(html_tree) do
-    html_tree
-  end
-
-  defp fix_relative_uris(html_tree) do
-    html_tree
-  end
-end
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@ -1,25 +1,93 @@
 defmodule Readability.Helper do
  @moduledoc """
-  Utilities
+  Helpers for parsing, updating, removing html tree
  """

  @type html_tree :: tuple | list

  @doc """
-    change existing tags by selector
+  Change existing tags by selector
  """
-
  @spec change_tag(html_tree, String.t, String.t) :: html_tree
-
+  def change_tag(content, _, _) when is_binary(content), do: content
+  def change_tag([], _, _), do: []
+  def change_tag([h|t], selector, tag) do
+    [change_tag(h, selector, tag)|change_tag(t, selector, tag)]
+  end
  def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
    {tag, attrs, change_tag(inner_tree, tag_name, tag)}
  end
  def change_tag({tag_name, attrs, html_tree}, selector, tag) do
    {tag_name, attrs, change_tag(html_tree, selector, tag)}
  end
-  def change_tag([h|t], selector, tag) do
-    [change_tag(h, selector, tag)|change_tag(t, selector, tag)]
+
+  @doc """
+  Removes attributes from every node of the tree.
+
+  `target_attr` selects attributes by name: a string removes that attribute, a
+  list removes every listed attribute, and a regex removes every attribute
+  whose name matches. Any other value removes nothing.
+  """
+  @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
+  def remove_attrs(content, _) when is_binary(content), do: content
+  def remove_attrs([], _), do: []
+  def remove_attrs([h|t], t_attrs) do
+    [remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)]
+  end
+  def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
+    # Bind the value of the `cond`: rebinding `reject_fun` inside the branches
+    # would be lost, leaving a function that rejects every attribute.
+    reject_fun =
+      cond do
+        is_binary(target_attr) ->
+          fn(attr) -> elem(attr, 0) == target_attr end
+        Regex.regex?(target_attr) ->
+          fn(attr) -> elem(attr, 0) =~ target_attr end
+        is_list(target_attr) ->
+          fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
+        true ->
+          fn(_attr) -> false end
+      end
+    {tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
+  end
+
+
+  @doc """
+  Removes every node for which `fun` returns a truthy value, together with its
+  subtree. Text nodes are always kept. A removed root node yields `nil`.
+  """
+  @spec remove_tag(html_tree, fun) :: html_tree
+  def remove_tag(content, _) when is_binary(content), do: content
+  def remove_tag([], _), do: []
+  def remove_tag([h|t], fun) do
+    case remove_tag(h, fun) do
+      nil -> remove_tag(t, fun)
+      kept -> [kept|remove_tag(t, fun)]
+    end
+  end
+  def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do
+    if fun.(html_tree), do: nil, else: {tag, attrs, remove_tag(inner_tree, fun)}
+  end
+
+  @doc """
+  Length of the tree's text with leading and trailing whitespace stripped.
+  """
+  @spec text_length(html_tree) :: number
+  def text_length(html_tree) do
+    stripped = String.strip(Floki.text(html_tree))
+    String.length(stripped)
+  end
+
+  @doc """
+  Check html_tree can be candidate or not.
+
+  A node qualifies when it matches one of the candidate selectors and its
+  stripped text is at least the default `:min_text_length` long.
+  """
+  @spec candidate_tag?(html_tree) :: boolean
+  def candidate_tag?(html_tree) do
+    # The length check does not depend on the selector, so it runs at most once
+    # and only after a selector has matched.
+    Enum.any?(candidates_selector(), &Floki.Selector.match?(html_tree, &1))
+    && text_length(html_tree) >= Readability.default_options[:min_text_length]
+  end
+
+  # Builds the parsed Floki selectors for the tags that can be candidates
+  # ("p" and "td"). The selectors are re-tokenized and re-parsed on every call.
+  defp candidates_selector do
+    ["p", "td"]
+    |> Enum.map(fn(s) ->
+         tokens = Floki.SelectorTokenizer.tokenize(s)
+         Floki.SelectorParser.parse(tokens)
+       end)
  end
-  def change_tag([], selector, tag), do: []
-  def change_tag(content, selector, tag) when is_binary(content), do: content
 end
--- a/lib/readability/sanitizer.ex
+++ b/lib/readability/sanitizer.ex
@ -0,0 +1,85 @@
+defmodule Readability.Sanitizer do
+  @moduledoc """
+  Clean an element of all tags of type "tag" if they look fishy.
+  "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
+  """
+
+  alias Readability.Helper
+  alias Readability.Candidate
+  alias Readability.Candidate.Scoring
+
+  @type html_tree :: tuple | list
+
+  @doc """
+  Sanitizes article html tree.
+
+  Removes fishy headlines, form/object/iframe/embed tags that are not videos,
+  and empty paragraphs. When `opts[:clean_conditionally]` is set, tables, lists
+  and divs that look fishy are removed as well. Finally every `style` attribute
+  is stripped.
+  """
+  @spec sanitize(html_tree, [Candidate.t], list) :: html_tree
+  def sanitize(html_tree, candidates, opts  \\ []) do
+    html_tree = html_tree
+                |> Helper.remove_tag(&clean_headline_tag?(&1))
+                |> Helper.remove_tag(&clean_unlikely_tag?(&1))
+                |> Helper.remove_tag(&clean_empty_p?(&1))
+
+    # Bind the value of the `if`: a rebinding inside the branch would be lost.
+    html_tree =
+      if opts[:clean_conditionally] do
+        Helper.remove_tag(html_tree, conditionally_cleaing_fn(candidates))
+      else
+        html_tree
+      end
+
+    html_tree |> Helper.remove_attrs("style")
+  end
+
+  # Returns the predicate used for conditional cleaning. It only ever flags
+  # <table>, <ul> and <div> nodes; every other tag is kept.
+  defp conditionally_cleaing_fn(candidates) do
+    fn({tag, attrs, _} = tree) ->
+      if Enum.any?(["table", "ul", "div"], &(&1 == tag)) do
+        weight = Scoring.class_weight(attrs)
+        # Score of the matching candidate, or the struct default when the node
+        # was never scored.
+        same_tree = candidates
+                    |> Enum.find(%Candidate{}, &(&1.html_tree == tree))
+        list? = tag == "ul"
+        cond do
+          weight + same_tree.score < 0
+            -> true
+
+          length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
+            # If there are not very many commas, and the number of
+            # non-paragraph elements is more than paragraphs or other
+            # ominous signs, remove the element.
+            p_len = tree |> Floki.find("p") |> length
+            img_len = tree |> Floki.find("img") |> length
+            li_len = tree |> Floki.find("li") |> length
+            input_len = tree |> Floki.find("input") |> length
+            # Match the video regex against the rendered <embed> html; the node
+            # itself is a tuple and cannot be used with =~.
+            embed_len = tree
+                        |> Floki.find("embed")
+                        |> Enum.reject(&(Floki.raw_html(&1) =~ Readability.regexes[:video]))
+                        |> length
+
+            link_density =  Scoring.calc_link_density(tree)
+            conent_len = Helper.text_length(tree)
+            # :min_text_length lives in the default options, not in the regexes.
+            min_text_length = Readability.default_options[:min_text_length]
+
+            img_len > p_len                 # too many image
+            || (!list? && li_len > p_len)   # more <li>s than <p>s
+            || input_len > (p_len / 3)      # less than 3x <p>s than <input>s
+            || (!list? && conent_len < min_text_length && img_len != 1) # too short a content length without a single image
+            || (weight < 25 && link_density > 0.2) # too many links for its weight
+            || (weight >= 25 && link_density > 0.5) # too many links for its weight
+            || ((embed_len == 1 && conent_len < 75) || embed_len > 1) # <embed>s with too short a content length, or too many <embed>s
+
+          true -> false
+        end
+      else
+        false
+      end
+    end
+  end
+
+  # Headline tags (h0-h9) are removed when their class weight is negative or
+  # more than a third of their text is links.
+  defp clean_headline_tag?({tag, attrs, _} = html_tree) do
+    headline? = Regex.match?(~r/^h\d{1}$/, tag)
+
+    headline?
+    && (Scoring.class_weight(attrs) < 0 || Scoring.calc_link_density(html_tree) > 0.33)
+  end
+
+  # form/object/iframe/embed tags are removed unless one of their attribute
+  # values matches the video regex.
+  defp clean_unlikely_tag?({tag, attrs, _}) do
+    attr_values = Enum.map_join(attrs, "", &(elem(&1, 1)))
+
+    Regex.match?(~r/form|object|iframe|embed/, tag)
+    && !Regex.match?(Readability.regexes[:video], attr_values)
+  end
+
+  # Only <p> nodes without any (stripped) text are flagged.
+  defp clean_empty_p?({"p", _, _} = html_tree), do: Helper.text_length(html_tree) == 0
+  defp clean_empty_p?({_, _, _}), do: false
+end
--- a/lib/readability/title_finder.ex
+++ b/lib/readability/title_finder.ex
@ -11,9 +11,7 @@ defmodule Readability.TitleFinder do
  @doc """
  Find proper title
  """
-
  @spec title(html_tree) :: binary
-
  def title(html_tree) do
    maybe_title = tag_title(html_tree)
    if length(String.split(maybe_title, " ")) <= 4 do
@ -25,42 +23,36 @@ defmodule Readability.TitleFinder do
  @doc """
  Find title from title tag
  """
-
  @spec tag_title(html_tree) :: binary
-
  def tag_title(html_tree) do
    html_tree
    |> Floki.find("title")
-    |> to_clean_text
+    |> clean_title
  end

  @doc """
  Find title from og:title property of meta tag
  """
-
  @spec og_title(html_tree) :: binary
-
  def og_title(html_tree) do
    html_tree
    |> Floki.find("meta[property=og:title]")
    |> Floki.attribute("content")
-    |> to_clean_text
+    |> clean_title
  end

  @doc """
  Find title from h tag
  """
-
  @spec h_tag_title(html_tree, String.t) :: binary
-
-  def h_tag_title(html_tree, selector \\@h_tag_selector) do
+  def h_tag_title(html_tree, selector \\ @h_tag_selector) do
    html_tree
    |> Floki.find(selector)
    |> hd
-    |> to_clean_text
+    |> clean_title
  end

-  defp to_clean_text(html_tree) do
+  defp clean_title(html_tree) do
    title_text = html_tree
                 |> Floki.text
                 |> String.split(@title_suffix)
--- a/lib/test.js
+++ b/lib/test.js
--- a/lib/test.rb
+++ b/lib/test.rb
@ -1,522 +0,0 @@
-# encoding: utf-8
-
-require 'rubygems'
-require 'nokogiri'
-require 'guess_html_encoding'
-
-module Readability
-  class Document
-    DEFAULT_OPTIONS = {
-      :retry_length               => 250,
-      :min_text_length            => 25,
-      :remove_unlikely_candidates => true,
-      :weight_classes             => true,
-      :clean_conditionally        => true,
-      :remove_empty_nodes         => true,
-      :min_image_width            => 130,
-      :min_image_height           => 80,
-      :ignore_image_format        => [],
-      :blacklist                  => nil,
-      :whitelist                  => nil
-    }.freeze
-
-    REGEXES = {
-        :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
-        :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
-        :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
-        :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
-        :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
-        :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
-        :replaceFontsRe => /<(\/?)font[^>]*>/i,
-        :trimRe => /^\s+|\s+$/,
-        :normalizeRe => /\s{2,}/,
-        :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
-        :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
-    }
-
-    attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
-
-    def initialize(input, options = {})
-      @options = DEFAULT_OPTIONS.merge(options)
-      @input = input
-
-      if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding]
-        @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
-        @options[:encoding] = @input.encoding.to_s
-      end
-
-      @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
-      @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
-      @weight_classes = @options[:weight_classes]
-      @clean_conditionally = @options[:clean_conditionally]
-      @best_candidate_has_image = true
-      make_html
-      handle_exclusions!(@options[:whitelist], @options[:blacklist])
-    end
-
-    def images(content=nil, reload=false)
-      begin
-        require 'fastimage'
-      rescue LoadError
-        raise "Please install fastimage in order to use the #images feature."
-      end
-
-      @best_candidate_has_image = false if reload
-
-      prepare_candidates
-      list_images   = []
-      tested_images = []
-      content       = @best_candidate[:elem] unless reload
-
-      return list_images if content.nil?
-      elements = content.css("img").map(&:attributes)
-
-        elements.each do |element|
-          next unless element["src"]
-
-          url     = element["src"].value
-          height  = element["height"].nil?  ? 0 : element["height"].value.to_i
-          width   = element["width"].nil?   ? 0 : element["width"].value.to_i
-
-          if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
-            image   = get_image_size(url)
-            next unless image
-          else
-            image = {:width => width, :height => height}
-          end
-
-          image[:format] = File.extname(url).gsub(".", "")
-
-          if tested_images.include?(url)
-            debug("Image was tested: #{url}")
-            next
-          end
-
-          tested_images.push(url)
-          if image_meets_criteria?(image)
-            list_images << url
-          else
-            debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
-          end
-        end
-
-      (list_images.empty? and content != @html) ? images(@html, true) : list_images
-    end
-
-    def images_with_fqdn_uris!(source_uri)
-      images_with_fqdn_uris(@html, source_uri)
-    end
-
-    def images_with_fqdn_uris(document = @html.dup, source_uri)
-      uri = URI.parse(source_uri)
-      host = uri.host
-      scheme = uri.scheme
-      port = uri.port # defaults to 80
-
-      base = "#{scheme}://#{host}:#{port}/"
-
-      images = []
-      document.css("img").each do |elem|
-        begin
-          elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
-          images << elem['src'].to_s
-        rescue URI::InvalidURIError => exc
-          elem.remove
-        end
-      end
-
-      images(document,true)
-    end
-
-    def get_image_size(url)
-      w, h = FastImage.size(url)
-      raise "Couldn't get size." if w.nil? || h.nil?
-      {:width => w, :height => h}
-    rescue => e
-      debug("Image error: #{e}")
-      nil
-    end
-
-    def image_meets_criteria?(image)
-      return false if options[:ignore_image_format].include?(image[:format].downcase)
-      image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
-    end
-
-    def title
-      title = @html.css("title").first
-      title ? title.text : nil
-    end
-
-    # Look through the @html document looking for the author
-    # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted)
-    # Returns nil if no author is detected
-    def author
-      # Let's grab this author:
-      # <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
-      author_elements = @html.xpath('//meta[@name = "dc.creator"]')
-      unless author_elements.empty?
-        author_elements.each do |element|
-          return element['content'].strip if element['content']
-        end
-      end
-
-      # Now let's try to grab this
-      # <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
-      # <div class="author">By</div><div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div>
-      author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]')
-      unless author_elements.empty?
-        author_elements.each do |element|
-          return element.text.strip if element.text
-        end
-      end
-
-      # Now let's try to grab this
-      # <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
-      # TODO: strip out the (rel)?
-      author_elements = @html.xpath('//a[@rel = "author"]')
-      unless author_elements.empty?
-        author_elements.each do |element|
-          return element.text.strip if element.text
-        end
-      end
-
-      author_elements = @html.xpath('//*[@id = "author"]')
-      unless author_elements.empty?
-        author_elements.each do |element|
-          return element.text.strip if element.text
-        end
-      end
-    end
-
-    def content(remove_unlikely_candidates = :default)
-      @remove_unlikely_candidates = false if remove_unlikely_candidates == false
-
-      prepare_candidates
-      article = get_article(@candidates, @best_candidate)
-
-      cleaned_article = sanitize(article, @candidates, options)
-      if article.text.strip.length < options[:retry_length]
-        if @remove_unlikely_candidates
-          @remove_unlikely_candidates = false
-        elsif @weight_classes
-          @weight_classes = false
-        elsif @clean_conditionally
-          @clean_conditionally = false
-        else
-          # nothing we can do
-          return cleaned_article
-        end
-
-        make_html
-        content
-      else
-        cleaned_article
-      end
-    end
-
-    def get_article(candidates, best_candidate)
-      # Now that we have the top candidate, look through its siblings for content that might also be related.
-      # Things like preambles, content split by ads that we removed, etc.
-
-      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
-      output = Nokogiri::XML::Node.new('div', @html)
-      best_candidate[:elem].parent.children.each do |sibling|
-        append = false
-        append = true if sibling == best_candidate[:elem]
-        append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
-
-        if sibling.name.downcase == "p"
-          link_density = get_link_density(sibling)
-          node_content = sibling.text
-          node_length = node_content.length
-
-          append = if node_length > 80 && link_density < 0.25
-            true
-          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
-            true
-          end
-        end
-
-        if append
-          sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
-          sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
-          output << sibling_dup
-        end
-      end
-
-      output
-    end
-
-    def select_best_candidate(candidates)
-      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
-
-      debug("Top 5 candidates:")
-      sorted_candidates[0...5].each do |candidate|
-        debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
-      end
-
-      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
-      debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
-
-      best_candidate
-    end
-
-    def get_link_density(elem)
-      link_length = elem.css("a").map(&:text).join("").length
-      text_length = elem.text.length
-      link_length / text_length.to_f
-    end
-
-    def class_weight(e)
-      weight = 0
-      return weight unless @weight_classes
-
-      if e[:class] && e[:class] != ""
-        weight -= 25 if e[:class] =~ REGEXES[:negativeRe]
-        weight += 25 if e[:class] =~ REGEXES[:positiveRe]
-      end
-
-      if e[:id] && e[:id] != ""
-        weight -= 25 if e[:id] =~ REGEXES[:negativeRe]
-        weight += 25 if e[:id] =~ REGEXES[:positiveRe]
-      end
-
-      weight
-    end
-
-    ELEMENT_SCORES = {
-      'div' => 5,
-      'blockquote' => 3,
-      'form' => -3,
-      'th' => -5
-    }.freeze
-
-    def score_node(elem)
-      content_score = class_weight(elem)
-      content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0)
-      { :content_score => content_score, :elem => elem }
-    end
-
-    def debug(str)
-      puts str if options[:debug]
-    end
-
-    def sanitize(node, candidates, options = {})
-      node.css("h1, h2, h3, h4, h5, h6").each do |header|
-        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
-      end
-
-      node.css("form, object, iframe, embed").each do |elem|
-        elem.remove
-      end
-
-      if @options[:remove_empty_nodes]
-        # remove <p> tags that have no text content - this will also remove p tags that contain only images.
-        node.css("p").each do |elem|
-          elem.remove if elem.content.strip.empty?
-        end
-      end
-
-      # Conditionally clean <table>s, <ul>s, and <div>s
-      clean_conditionally(node, candidates, "table, ul, div")
-
-      # We'll sanitize all elements using a whitelist
-      base_whitelist = @options[:tags] || %w[div p]
-      # We'll add whitespace instead of block elements,
-      # so a<br>b will have a nice space between them
-      base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
-
-      # Use a hash for speed (don't want to make a million calls to include?)
-      whitelist = Hash.new
-      base_whitelist.each {|tag| whitelist[tag] = true }
-      replace_with_whitespace = Hash.new
-      base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
-
-      ([node] + node.css("*")).each do |el|
-        # If element is in whitelist, delete all its attributes
-        if whitelist[el.node_name]
-          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
-
-          # Otherwise, replace the element with its contents
-        else
-          # If element is root, replace the node as a text node
-          if el.parent.nil?
-            node = Nokogiri::XML::Text.new(el.text, el.document)
-            break
-          else
-            if replace_with_whitespace[el.node_name]
-              el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
-            else
-              el.swap(Nokogiri::XML::Text.new(el.text, el.document))
-            end
-          end
-        end
-
-      end
-
-      s = Nokogiri::XML::Node::SaveOptions
-      save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML
-      html = node.serialize(:save_with => save_opts)
-
-      # Get rid of duplicate whitespace
-      return html.gsub(/[\r\n\f]+/, "\n" )
-    end
-
-    def clean_conditionally(node, candidates, selector)
-      return unless @clean_conditionally
-      node.css(selector).each do |el|
-        weight = class_weight(el)
-        content_score = candidates[el] ? candidates[el][:content_score] : 0
-        name = el.name.downcase
-
-        if weight + content_score < 0
-          el.remove
-          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
-        elsif el.text.count(",") < 10
-          counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
-          counts["li"] -= 100
-
-          # For every img under a noscript tag discount one from the count to avoid double counting
-          counts["img"] -= el.css("noscript").css("img").length
-
-          content_length = el.text.strip.length  # Count the text length excluding any surrounding whitespace
-          link_density = get_link_density(el)
-
-          reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
-          if reason
-            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
-            el.remove
-          end
-        end
-      end
-    end
-
-    def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
-      if (counts["img"] > counts["p"]) && (counts["img"] > 1)
-        "too many images"
-      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
-        "more <li>s than <p>s"
-      elsif counts["input"] > (counts["p"] / 3).to_i
-        "less than 3x <p>s than <input>s"
-      elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
-        "too short a content length without a single image"
-      elsif weight < 25 && link_density > 0.2
-        "too many links for its weight (#{weight})"
-      elsif weight >= 25 && link_density > 0.5
-        "too many links for its weight (#{weight})"
-      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
-        "<embed>s with too short a content length, or too many <embed>s"
-      else
-        nil
-      end
-    end
-
-    private
-
-    # 제거항목 추가항목을 지정한다.
-    def handle_exclusions!(whitelist, blacklist)
-      return unless whitelist || blacklist
-
-      if blacklist
-        elems = @html.css(blacklist)
-        if elems
-          elems.each do |e|
-            e.remove
-          end
-        end
-      end
-
-      if whitelist
-        elems = @html.css(whitelist).to_s
-
-        if body = @html.at_css('body')
-          body.inner_html = elems
-        end
-      end
-
-      @input = @html.to_s
-    end
-
-    # 코멘트가 제거된 기본 html 노드 반환
-    def make_html(whitelist=nil, blacklist=nil)
-      @html = Nokogiri::HTML(@input, nil, @options[:encoding])
-      # In case document has no body, such as from empty string or redirect
-      @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
-      # Remove html comment tags
-      @html.xpath('//comment()').each { |i| i.remove }
-    end
-
-
-    def prepare_candidates
-      @html.css("script, style").each { |i| i.remove }
-      remove_unlikely_candidates! if @remove_unlikely_candidates
-      transform_misused_divs_into_paragraphs!
-
-      @candidates     = score_paragraphs(options[:min_text_length])
-      @best_candidate = select_best_candidate(@candidates)
-    end
-
-    # 가망없는 후보자를 제거한다. (명확한 후보자는 제외하고 제거한다.)
-    def remove_unlikely_candidates!
-      @html.css("*").each do |elem|
-        str = "#{elem[:class]}#{elem[:id]}"
-        if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body')
-          debug("Removing unlikely candidate - #{str}")
-          elem.remove
-        end
-      end
-    end
-
-    # 잘못 사용되고 있는 DIV를 p로 변환한다.
-    def transform_misused_divs_into_paragraphs!
-      @html.css("*").each do |elem|
-        if elem.name.downcase == "div"
-          # transform <div>s that do not contain other block elements into <p>s
-          if elem.inner_html !~ REGEXES[:divToPElementsRe]
-            debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
-            elem.name = "p"
-          end
-        else
-          # wrap text nodes in p tags
-#          elem.children.each do |child|
-#            if child.text?
-#              debug("wrapping text node with a p")
-#              child.swap("<p>#{child.text}</p>")
-#            end
-#          end
-        end
-      end
-    end
-
-    # 가능노드에 점수를 매긴다.
-    def score_paragraphs(min_text_length)
-      candidates = {}
-      @html.css("p,td").each do |elem|
-        parent_node = elem.parent
-        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
-        inner_text = elem.text
-
-        # If this paragraph is less than 25 characters, don't even count it.
-        next if inner_text.length < min_text_length
-
-        candidates[parent_node] ||= score_node(parent_node)
-        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
-
-        content_score = 1
-        content_score += inner_text.split(',').length
-        content_score += [(inner_text.length / 100).to_i, 3].min
-
-        candidates[parent_node][:content_score] += content_score
-        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
-      end
-
-      # Scale the final candidates score based on link density. Good content should have a
-      # relatively small link density (5% or less) and be mostly unaffected by this operation.
-      candidates.each do |elem, candidate|
-        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
-      end
-
-      candidates
-    end
-  end
-end
--- a/mix.exs
+++ b/mix.exs
@ -1,4 +1,7 @@
 defmodule Readability.Mixfile do
+  @moduledoc """
+  Mix project definition for Readability: project settings, applications and dependencies.
+  """
+
  use Mix.Project

  def project do
@ -15,7 +18,8 @@ defmodule Readability.Mixfile do
  # Type "mix help compile.app" for more information
  def application do
    [applications: [:logger,
-                    :floki
+                    :floki,
+                    :httpoison
                   ]]
  end

@ -29,6 +33,10 @@ defmodule Readability.Mixfile do
  #
  # Type "mix help deps" for more examples and options
  defp deps do
-    [{:floki, "~> 0.8.0"}]
+    [{:floki, "~> 0.8.0"},
+     {:httpoison, "~> 0.8.0"},
+     {:credo, "~> 0.3", only: [:dev, :test]},
+     {:dialyxir, "~> 0.3", only: [:dev]}
+    ]
  end
 end
--- a/mix.lock
+++ b/mix.lock
@ -1,2 +1,12 @@
-%{"floki": {:hex, :floki, "0.8.0"},
-  "mochiweb_html": {:hex, :mochiweb_html, "2.13.0"}}
+%{"bunt": {:hex, :bunt, "0.1.5"},
+  "certifi": {:hex, :certifi, "0.4.0"},
+  "credo": {:hex, :credo, "0.3.12"},
+  "dialyxir": {:hex, :dialyxir, "0.3.3"},
+  "floki": {:hex, :floki, "0.8.0"},
+  "hackney": {:hex, :hackney, "1.6.0"},
+  "httpoison": {:hex, :httpoison, "0.8.3"},
+  "idna": {:hex, :idna, "1.2.0"},
+  "metrics": {:hex, :metrics, "1.0.1"},
+  "mimerl": {:hex, :mimerl, "1.0.2"},
+  "mochiweb_html": {:hex, :mochiweb_html, "2.13.0"},
+  "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.0"}}
--- a/test.html
+++ b/test.html
--- a/test/fixtures/bbc.html
+++ b/test/fixtures/bbc.html
--- a/test/fixtures/nytimes.html
+++ b/test/fixtures/nytimes.html
--- a/test/helper_text.exs
+++ b/test/helper_text.exs
@ -1,31 +0,0 @@
-defmodule Readability.HelperTest do
-  use ExUnit.Case, async: true
-
-  import Readability, only: :functions
-  alias Readability.Helper
-
-  @sample """
-    <html>
-      <body>
-        <p>
-          <font>a</fond>
-          <p>
-            <font>abc</font>
-          </p>
-        </p>
-        <p>
-          <font>b</font>
-        </p>
-      </body>
-    </html>
-  """
-
-  test "change font tag to span" do
-    expectred = @sample
-                |> String.replace(~r/font/, "span")
-                |> Floki.parse
-
-    result = Helper.change_tag(parse(@sample), "font", "span")
-    assert expectred == result
-  end
-end
--- a/test/readability/candidate/_builder.exs
+++ b/test/readability/candidate/_builder.exs
@ -0,0 +1,53 @@
+defmodule Readability.Candidate.BuilderTest.A do
+  use ExUnit.Case, async: true
+  import Readability, only: [parse: 1]
+  alias Readability.Candidate.Builder
+
+  doctest Readability
+
+  # Every element expected to become a candidate carries class="candidate";
+  # the <td> with short text and the <span> branch must not produce one.
+  @sample """
+  <div id="1" class="candidate">
+    <div id="2" class="candidate">
+      <p id="3" class="candidate">
+        Elixir is a dynamic, functional language designed for building scalable and maintainable applications.
+      </p>
+    </div>
+    <td>
+      <a>too short content</a>
+    </td>
+    <div id="4">
+      <div id="5" class="candidate">
+        <div id="6" class="candidate">
+          <p id="7" class="candidate">
+            Elixir leverages the Erlang VM, known for running low-latency, distributed and fault-tolerant systems, while also being successfully used in web development and the embedded software domain.
+          </p>
+        </div>
+      </div>
+    </div>
+    <div>
+      <span>
+        not p, td node
+      </span>
+    </div>
+  </div>
+  """
+
+  test "build candidate" do
+    html_tree = parse(@sample)
+    candidates = Builder.build(html_tree)
+
+    # One candidate is expected per element marked with the "candidate" class.
+    expected = html_tree |> Floki.find(".candidate") |> length
+    assert length(candidates) == expected
+
+    # Each built candidate must wrap an element whose class is "candidate".
+    assert Enum.all?(candidates, fn(cand) ->
+      attrs = elem(cand.html_tree, 1)
+      {_, class} = List.keyfind(attrs, "class", 0, {"", ""})
+      class == "candidate"
+    end)
+  end
+
+  # Previously this test bound an unused variable and asserted nothing.
+  test "build returns a list" do
+    candidates = Builder.build(parse(@sample))
+    assert is_list(candidates)
+  end
+end
--- a/test/readability/candidate/_finder.ex
+++ b/test/readability/candidate/_finder.ex
@ -1,8 +1,11 @@
-defmodule Readability.ContentFinderTest do
+defmodule Readability.Candidate.FinderTest.A do
  use ExUnit.Case, async: true

-  doctest Readability.ContentFinder
+  doctest Readability.Candidate.Finder

+  alias Readability.Candidate.Finder
+  alias Readability.Candidate.MisusedTrasformer
+  alias Readability.Candidate.UnlikelyCandidatesRemover

  @unlikey_sample """
  <html>
@ -19,7 +22,7 @@ defmodule Readability.ContentFinderTest do
    expected = {"html", [], [ {"body", [], [ {"article", [{"class", "community"}], ["ARTICLE"]} ]} ]}
    result = @unlikey_sample
             |> Readability.parse
-             |> Readability.ContentFinder.remove_unlikely_candidates
+             |> UnlikelyCandidatesRemover.remove
    assert expected == result
  end

@ -53,10 +56,19 @@ defmodule Readability.ContentFinderTest do

    result = @misused_sample
             |> Readability.parse
-             |> Readability.ContentFinder.transform_misused_divs_into_paragraphs
+             |> MisusedTrasformer.transform
    assert expected == result
  end

+  @candidate_sample [{"div",
+                      [],
+                      [{"p", [], ["12345678901234567890123456"]},
+                       {"p", [], ["12345678901234567890123456"]}
+                      ]
+                    },{"div"
+
+                      }]
+

  def read_html(name) do
    {:ok, body} = File.read("./test/fixtures/#{name}.html")
--- a/test/readability/candidate/cleaner_test.exs
+++ b/test/readability/candidate/cleaner_test.exs
@ -0,0 +1,59 @@
+defmodule Readability.Candidate.CleanerTest do
+  use ExUnit.Case, async: true
+
+  doctest Readability.Candidate.Cleaner
+
+  alias Readability.Candidate.Cleaner
+
+  @sample """
+  <html>
+    <head>
+      <title>title!</title>
+    </head>
+    <body class='comment'>
+      <div>
+        <p class='comment'>a comment</p>
+        <div class='comment' id='body'>real content</div>
+        <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
+      </div>
+    </body>
+  </html>
+  """
+
+  # Each test receives a freshly parsed tree of @sample as `html_tree`.
+  setup do
+    html_tree = Readability.parse(@sample)
+    {:ok, html_tree: html_tree}
+  end
+
+  ### Transform misused div
+
+  test "transform divs containing no block elements", %{html_tree: html_tree} do
+    html_tree = Cleaner.transform_misused_div_to_p(html_tree)
+    [{tag, _, _}|_] = html_tree |> Floki.find("#body")
+
+    assert tag == "p"
+  end
+
+  test "not transform divs that contain block elements", %{html_tree: html_tree} do
+    html_tree = Cleaner.transform_misused_div_to_p(html_tree)
+    [{tag, _, _}|_] = html_tree |> Floki.find("#contains_blockquote")
+    assert tag == "div"
+  end
+
+  ### Remove unlikely tag
+
+  test "remove things that have class comment", %{html_tree: html_tree} do
+    html_tree = Cleaner.remove_unlikely_tree(html_tree)
+    refute Floki.text(html_tree) =~ ~r/a comment/
+  end
+
+  # The <body> in @sample has class='comment'; it must survive the cleanup.
+  # The comparison used to be a bare expression, so the test could never fail.
+  test "not remove body tags", %{html_tree: html_tree} do
+    html_tree = Cleaner.remove_unlikely_tree(html_tree)
+    refute Floki.find(html_tree, "body") == []
+  end
+
+  # Renamed: this test previously reused the name "not remove body tags",
+  # and ExUnit requires test names to be unique within a module.
+  test "keep real content in a div with class comment and id body", %{html_tree: html_tree} do
+    html_tree = Cleaner.remove_unlikely_tree(html_tree)
+    assert Floki.text(html_tree) =~ ~r/real content/
+  end
+end
--- a/test/readability/helper_test.exs
+++ b/test/readability/helper_test.exs
@ -0,0 +1,48 @@
+defmodule Readability.HelperTest do
+  use ExUnit.Case, async: true
+
+  import Readability, only: [parse: 1]
+  alias Readability.Helper
+
+  @sample """
+    <html>
+      <body>
+        <p>
+          <font>a</fond>
+          <p>
+            <font>abc</font>
+          </p>
+        </p>
+        <p>
+          <font>b</font>
+        </p>
+      </body>
+    </html>
+  """
+
+  # Each test receives a freshly parsed tree of @sample as `html_tree`.
+  setup do
+    {:ok, html_tree: parse(@sample)}
+  end
+
+  test "change font tag to span", %{html_tree: html_tree} do
+    expected = @sample |> String.replace(~r/font/, "span") |> parse
+    result = Helper.change_tag(html_tree, "font", "span")
+    assert result == expected
+  end
+
+  test "remove tag", %{html_tree: html_tree} do
+    expected = parse("<html><body></body></html>")
+    result = Helper.remove_tag(html_tree, fn({tag, _, _}) -> tag == "p" end)
+
+    assert result == expected
+  end
+
+  test "inner text length", %{html_tree: html_tree} do
+    assert Helper.text_length(html_tree) == 5
+  end
+end
--- a/test/readability/title_finder_test.exs
+++ b/test/readability/title_finder_test.exs
@ -18,6 +18,11 @@ defmodule Readability.TitleFinderTest do
  </html>
  """

+  test "extract most proper title" do
+    title = Readability.TitleFinder.title(@html)
+    assert title == "og title"
+  end
+
  test "extract og title" do
    title = Readability.TitleFinder.og_title(@html)
    assert title == "og title"
@ -37,9 +42,4 @@ defmodule Readability.TitleFinderTest do
    title = Readability.TitleFinder.h_tag_title(@html, "h2")
    assert title == "h2 title"
  end
-
-  test "extract most proper title" do
-    title = Readability.TitleFinder.title(@html)
-    assert title == "og title"
-  end
 end
--- a/test/readability_test.exs
+++ b/test/readability_test.exs
@ -1,8 +1,34 @@
 defmodule ReadabilityTest do
-  use ExUnit.Case
-  doctest Readability
+  use ExUnit.Case, async: true

-  test "the truth" do
-    assert 1 + 1 == 2
+  @fixtures_path "./test/fixtures/"
+
+  test "readability for NY Times" do
+    {:ok, nytimes} = File.read(@fixtures_path <> "nytimes.html")
+    opts = [clean_conditionally: false]
+    nytimes = Readability.content(nytimes, opts)
+
+    nytimes_html = Readability.raw_html(nytimes)
+    assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
+    assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/
+
+    nytimes_text = Readability.readabl_text(nytimes)
+    assert nytimes_text =~ ~r/^Buddhist monks performing as part of/
+    assert nytimes_text =~ ~r/one of her major priorities.$/
+  end
+
+  # Fetches a live page over the network; tagged so it can be skipped with
+  # `mix test --exclude external`. Asserts on the result instead of discarding it.
+  @tag :external
+  test "readability for BBC" do
+    %{status_code: 200, body: body} = HTTPoison.get!("http://www.bbc.com/news/business-36108166")
+    text = body |> Readability.content |> Readability.readabl_text
+    assert is_binary(text)
+  end
+
+  # Fetches a live page over the network; tagged so it can be skipped with
+  # `mix test --exclude external`. The debug IO.inspect is replaced by an assertion.
+  @tag :external
+  test "readability for medium" do
+    %{status_code: 200, body: body} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58#.d0xmzfd15")
+    text = body |> Readability.content |> Readability.readabl_text
+    assert is_binary(text)
+  end
+
+  # Fetches a live page over the network; tagged so it can be skipped with
+  # `mix test --exclude external`. The debug IO.inspect is replaced by an assertion.
+  @tag :external
+  test "readability for buzzfeed" do
+    %{status_code: 200, body: body} = HTTPoison.get!("http://www.buzzfeed.com/salvadorhernandez/fbi-obtains-passcode-to-iphone-in-new-york-drops-case-agains#.koMMa21lj8")
+    text = body |> Readability.content |> Readability.readabl_text
+    assert is_binary(text)
+  end
 end