add filter algorithms

2016-04-17 15:28:33 +09:00 · 2016-04-17 15:28:33 +09:00 · 4e4a712718
parent d9f8b5d36f
commit 4e4a712718
12 changed files with 3482 additions and 2 deletions
--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -6,5 +6,5 @@ defmodule Readability do
  # Raw HTML (a binary) is parsed first, then handed to the tree-based clause.
  def title(html) when is_binary(html) do
    html
    |> parse()
    |> title()
  end

  # Anything that is not a binary is passed straight to TitleFinder.
  def title(html_tree), do: TitleFinder.title(html_tree)
-  defp parse(raw_html), do: Floki.parse(raw_html)
+  def parse(raw_html), do: Floki.parse(raw_html)
 end
--- a/lib/readability/content_finder.ex
+++ b/lib/readability/content_finder.ex
@ -0,0 +1,94 @@
 defmodule Readability.ContentFinder do
   @moduledoc """
   ContentFinder uses a variety of metrics for finding the content
   that is most likely to be the stuff a user wants to read.
   Then return it wrapped up in a div.
   """

   # Only :unlikelyCandidatesRe, :okMaybeItsACandidateRe and :divToPElementsRe
   # are read by the code in this module; the remaining entries are unused here.
   @regexes [ unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
              okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
              positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
              negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
              divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
              replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
              replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
              trimRe: ~r/^\s+|\s+$/,
              normalizeRe: ~r/\s{2,}/,
              killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
              videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
            ]

   # Attribute names whose values take part in the unlikely-candidate check.
   @id_class_attrs ["id", "class"]

   @type html_tree :: tuple | list

   @doc """
   Cleans up `html_tree` and returns the resulting tree.

   The tree is stripped of `script`/`style` nodes and unlikely candidates,
   misused `div` elements are turned into paragraphs, and the result is passed
   through the candidate-selection and URI-fixing steps (both of which
   currently return the tree unchanged).

   The second argument is accepted for forward compatibility and is not used.
   """
   @spec content(html_tree) :: html_tree
   @spec content(html_tree, Keyword.t()) :: html_tree
   def content(html_tree, _options \\ []) do
     html_tree
     |> prepare_candidates()
     |> select_best_candidate()
     |> fix_relative_uris()
   end

   # Drops script/style nodes and unlikely candidates, then normalises divs.
   defp prepare_candidates(html_tree) do
     html_tree
     |> Floki.filter_out("script")
     |> Floki.filter_out("style")
     |> remove_unlikely_candidates()
     |> transform_misused_divs_into_paragraphs()
   end

   @doc """
   Remove unlikely tag nodes.

   Walks the tree recursively. An element is dropped (together with its
   children) when its tag name plus `id`/`class` values match the
   "unlikely candidate" pattern without also matching the
   "ok, maybe it's a candidate" pattern. The `html` element is never dropped.

   Text nodes and any node that is not a `{tag_name, attrs, children}` element
   with a binary tag name (for example `{:comment, text}`) are returned
   unchanged. Calling this directly on an element that is itself unlikely
   returns `nil`.
   """
   @spec remove_unlikely_candidates(html_tree) :: html_tree
   def remove_unlikely_candidates(content) when is_binary(content), do: content
   def remove_unlikely_candidates([]), do: []

   def remove_unlikely_candidates([h | t]) do
     # A nil result means the head node was classified as unlikely: skip it.
     case remove_unlikely_candidates(h) do
       nil -> remove_unlikely_candidates(t)
       html_tree -> [html_tree | remove_unlikely_candidates(t)]
     end
   end

   def remove_unlikely_candidates({tag_name, attrs, inner_tree}) when is_binary(tag_name) do
     if unlikely_candidate?(tag_name, attrs) do
       nil
     else
       {tag_name, attrs, remove_unlikely_candidates(inner_tree)}
     end
   end

   # Comments, doctypes and other non-element nodes are kept as they are
   # instead of raising FunctionClauseError.
   def remove_unlikely_candidates(other), do: other

   # True when the tag name combined with the element's id/class values looks
   # like page chrome (navigation, comments, ...) rather than content.
   defp unlikely_candidate?(tag_name, attrs) do
     # Only the exact `id` and `class` attributes are considered; a substring
     # match would also pull in values of attributes such as `width`.
     id_class_values =
       attrs
       |> Enum.filter(fn {name, _value} -> String.downcase(name) in @id_class_attrs end)
       |> Enum.map_join("", fn {_name, value} -> value end)

     str = tag_name <> id_class_values

     tag_name != "html" and
       str =~ @regexes[:unlikelyCandidatesRe] and
       not (str =~ @regexes[:okMaybeItsACandidateRe])
   end

   @doc """
   Turns every `div` that contains no block-level markup into a `p`.

   A `div` counts as misused when the raw HTML of its children contains none of
   the opening tags listed in the `divToPElementsRe` pattern. Attributes and
   children are preserved, and children are processed recursively.

   Text nodes and non-element nodes (for example `{:comment, text}`) are
   returned unchanged.
   """
   @spec transform_misused_divs_into_paragraphs(html_tree) :: html_tree
   def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content
   def transform_misused_divs_into_paragraphs([]), do: []

   def transform_misused_divs_into_paragraphs([h | t]) do
     [transform_misused_divs_into_paragraphs(h) | transform_misused_divs_into_paragraphs(t)]
   end

   def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree})
       when is_binary(tag_name) do
     # The new tag must be the value of the `if` expression: a rebinding made
     # inside the `if` body is not visible after the block.
     new_tag_name = if misused_divs?(tag_name, inner_tree), do: "p", else: tag_name
     {new_tag_name, attrs, transform_misused_divs_into_paragraphs(inner_tree)}
   end

   # Comments, doctypes and other non-element nodes pass through untouched.
   def transform_misused_divs_into_paragraphs(other), do: other

   # A div is misused when its children render to HTML with no block-level tag.
   defp misused_divs?("div", inner_tree) do
     !(Floki.raw_html(inner_tree) =~ @regexes[:divToPElementsRe])
   end

   defp misused_divs?(_, _), do: false

   # Placeholder: currently returns the tree unchanged.
   defp select_best_candidate(html_tree) do
     html_tree
   end

   # Placeholder: currently returns the tree unchanged.
   defp fix_relative_uris(html_tree) do
     html_tree
   end
 end
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@ -0,0 +1,25 @@
 defmodule Readability.Helper do
   @moduledoc """
   Utilities
   """

   @type html_tree :: tuple | list

   @doc """
   Renames every element whose tag name equals `selector` to `tag`.

   The tree is walked recursively; attributes and children are preserved.
   `selector` is compared against the tag name for equality only.

   Text nodes and any node that is not a three-element tuple or a list
   (for example `{:comment, text}`) are returned unchanged.
   """
   @spec change_tag(html_tree | binary, String.t(), String.t()) :: html_tree | binary
   # The repeated `tag_name` binding makes this clause match only when the
   # element's tag equals the selector.
   def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
     {tag, attrs, change_tag(inner_tree, tag_name, tag)}
   end

   def change_tag({tag_name, attrs, html_tree}, selector, tag) do
     {tag_name, attrs, change_tag(html_tree, selector, tag)}
   end

   def change_tag([h | t], selector, tag) do
     [change_tag(h, selector, tag) | change_tag(t, selector, tag)]
   end

   def change_tag([], _selector, _tag), do: []
   def change_tag(content, _selector, _tag) when is_binary(content), do: content

   # Comments, doctypes and other non-element nodes are kept as they are
   # instead of raising FunctionClauseError.
   def change_tag(other, _selector, _tag), do: other
 end
--- a/lib/readability/title_finder.ex
+++ b/lib/readability/title_finder.ex
@ -8,6 +8,12 @@ defmodule Readability.TitleFinder do
  @type html_tree :: tuple | list
  @doc """
  Find proper title
  """
  @spec title(html_tree) :: binary
  def title(html_tree) do
    maybe_title = tag_title(html_tree)
    if length(String.split(maybe_title, " ")) <= 4 do
--- a/test/content_finder_test.ex
+++ b/test/content_finder_test.ex
@ -0,0 +1,65 @@
 defmodule Readability.ContentFinderTest do
   use ExUnit.Case, async: true
   doctest Readability.ContentFinder

   alias Readability.ContentFinder

   # Page where only the <article> should survive the unlikely-candidate pass.
   @unlikely_sample """
   <html>
     <body>
       <header>HEADER</header>
       <nav>NAV</nav>
       <article class="community">ARTICLE</article>
       <div class="disqus">SOCIAL</div>
     </body>
   </html>
   """

   test "remove unlikely tag nodes" do
     article = {"article", [{"class", "community"}], ["ARTICLE"]}
     expected = {"html", [], [{"body", [], [article]}]}

     parsed = Readability.parse(@unlikely_sample)

     assert expected == ContentFinder.remove_unlikely_candidates(parsed)
   end

   # First div holds only inline markup; the second already wraps a paragraph.
   @misused_sample """
   <html>
     <body>
       <div>
         <span>here</span>
       </div>
       <div>
         <p>not here</p>
       </div>
     </body>
   </html>
   """

   test "transform misused div tag" do
     converted = {"p", [], [{"span", [], ["here"]}]}
     untouched = {"div", [], [{"p", [], ["not here"]}]}
     expected = {"html", [], [{"body", [], [converted, untouched]}]}

     parsed = Readability.parse(@misused_sample)

     assert expected == ContentFinder.transform_misused_divs_into_paragraphs(parsed)
   end

   # Reads the named fixture from ./test/fixtures and returns its contents.
   def read_html(name) do
     path = "./test/fixtures/#{name}.html"
     {:ok, contents} = File.read(path)
     contents
   end
 end
--- a/test/fixtures/bbc.html
+++ b/test/fixtures/bbc.html
--- a/test/fixtures/code.html
+++ b/test/fixtures/code.html
@ -0,0 +1,13 @@
 <html>
  <body>
    <code><pre>
 root
  indented
    </pre></code>
    <pre><code>
 second
  indented
    </code></pre>
  </body>
 </html>
--- a/test/fixtures/nytimes.html
+++ b/test/fixtures/nytimes.html
--- a/test/features/nytimes.html
+++ b/test/features/nytimes.html
--- a/test/fixtures/thesun.html
+++ b/test/fixtures/thesun.html
--- a/test/helper_text.exs
+++ b/test/helper_text.exs
@ -0,0 +1,31 @@
 defmodule Readability.HelperTest do
   use ExUnit.Case, async: true

   alias Readability.Helper

   # Nested and sibling <font> elements, so the rename has to reach every
   # depth of the tree.
   @sample """
     <html>
       <body>
         <p>
           <font>a</font>
           <p>
             <font>abc</font>
           </p>
         </p>
         <p>
           <font>b</font>
         </p>
       </body>
     </html>
   """

   test "change font tag to span" do
     # Build the expectation by renaming the tags in the markup itself and
     # parsing that, so it is independent of Helper.change_tag/3.
     expected =
       @sample
       |> String.replace(~r/font/, "span")
       |> Floki.parse

     result =
       @sample
       |> Readability.parse()
       |> Helper.change_tag("font", "span")

     assert expected == result
   end
 end
--- a/test/title_finder_test.exs
+++ b/test/title_finder_test.exs
@ -1,7 +1,7 @@
 defmodule Readability.TitleFinderTest do
  use ExUnit.Case, async: true
-  doctest Readability
+  doctest Readability.TitleFinder
  @html """
  <html>