add filter algorithms

2016-04-17 15:28:33 +09:00 · 2016-04-17 15:28:33 +09:00 · 4e4a712718
commit 4e4a712718
parent d9f8b5d36f
12 changed files with 3482 additions and 2 deletions
--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -6,5 +6,5 @@ defmodule Readability do
  def title(html) when is_binary(html), do: parse(html) |> title
  def title(html_tree), do: TitleFinder.title(html_tree)

-  defp parse(raw_html), do: Floki.parse(raw_html)
+  def parse(raw_html), do: Floki.parse(raw_html)
 end
--- a/lib/readability/content_finder.ex
+++ b/lib/readability/content_finder.ex
@ -0,0 +1,94 @@
+defmodule Readability.ContentFinder do
+  @moduledoc """
+  ContentFinder uses a variety of metrics for finding the content
+  that is most likely to be the stuff a user wants to read.
+  The goal is to return it wrapped up in a div.
+
+  Candidate scoring is not implemented yet: `content/2` currently returns
+  the whole cleaned-up tree.
+  """
+
+  @regexes [ unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
+             okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
+             positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+             negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
+             divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
+             replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
+             replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
+             trimRe: ~r/^\s+|\s+$/,
+             normalizeRe: ~r/\s{2,}/,
+             killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
+             videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
+           ]
+
+  @type html_tree :: tuple | list
+
+  @doc """
+  Cleans up `html_tree` and returns the tree that should hold the content.
+
+  `script` and `style` nodes are filtered out, unlikely candidates are
+  removed and misused `div`s are renamed to `p`. Best-candidate selection
+  and relative URI fixing are placeholders that return their input
+  unchanged, and the options are not used yet.
+  """
+
+  @spec content(html_tree) :: html_tree
+  @spec content(html_tree, Keyword.t) :: html_tree
+
+  def content(html_tree, _options \\ []) do
+    html_tree
+    |> prepare_candidates()
+    |> select_best_candidate()
+    |> fix_relative_uris()
+  end
+
+  # Strips nodes that can never be content and normalizes misused divs.
+  defp prepare_candidates(html_tree) do
+    html_tree
+    |> Floki.filter_out("script")
+    |> Floki.filter_out("style")
+    |> remove_unlikely_candidates()
+    |> transform_misused_divs_into_paragraphs()
+  end
+
+  @doc """
+  Remove unlikely tag nodes
+
+  An element is dropped, together with its children, when its tag name
+  joined with its `id`/`class` values matches the unlikely-candidates
+  pattern and does not match the maybe-a-candidate pattern. The `html`
+  element is never dropped. Text and other non-element nodes are kept as
+  they are. Returns `nil` when the node passed in is itself dropped.
+  """
+
+  @spec remove_unlikely_candidates(html_tree) :: html_tree
+
+  def remove_unlikely_candidates(content) when is_binary(content), do: content
+  def remove_unlikely_candidates([]), do: []
+  def remove_unlikely_candidates([h|t]) do
+    case remove_unlikely_candidates(h) do
+      # nil marks a removed node, so it is left out of the rebuilt list
+      nil -> remove_unlikely_candidates(t)
+      html_tree -> [html_tree|remove_unlikely_candidates(t)]
+    end
+  end
+  def remove_unlikely_candidates({tag_name, attrs, inner_tree}) when is_binary(tag_name) do
+    if unlikely_candidate?(tag_name, attrs) do
+      nil
+    else
+      {tag_name, attrs, remove_unlikely_candidates(inner_tree)}
+    end
+  end
+  # Non-element nodes (e.g. comments) are passed through instead of
+  # raising a FunctionClauseError.
+  def remove_unlikely_candidates(other), do: other
+
+  defp unlikely_candidate?(tag_name, attrs) do
+    # Only the id and class attributes take part in the match; the name is
+    # anchored so attributes such as "width" or "data-video-id" are ignored.
+    idclass_values = for {name, value} <- attrs, name =~ ~r/\A(id|class)\z/i, do: value
+    str = tag_name <> Enum.join(idclass_values, "")
+    str =~ @regexes[:unlikelyCandidatesRe] && !(str =~ @regexes[:okMaybeItsACandidateRe]) && tag_name != "html"
+  end
+
+  @doc """
+  Renames misused `div` elements to `p`.
+
+  A `div` counts as misused when the raw HTML of its children does not
+  match the `:divToPElementsRe` pattern. Attributes and children are kept,
+  and the children are processed the same way.
+  """
+
+  @spec transform_misused_divs_into_paragraphs(html_tree) :: html_tree
+
+  def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content
+  def transform_misused_divs_into_paragraphs([]), do: []
+  def transform_misused_divs_into_paragraphs([h|t]) do
+    [transform_misused_divs_into_paragraphs(h)|transform_misused_divs_into_paragraphs(t)]
+  end
+  def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree}) when is_binary(tag_name) do
+    # The new name has to be the value of the `if`: a rebinding made inside
+    # the `if` body is not visible after it.
+    tag_name = if misused_divs?(tag_name, inner_tree), do: "p", else: tag_name
+    {tag_name, attrs, transform_misused_divs_into_paragraphs(inner_tree)}
+  end
+  # Non-element nodes (e.g. comments) are passed through unchanged.
+  def transform_misused_divs_into_paragraphs(other), do: other
+
+  defp misused_divs?("div", inner_tree) do
+    !(Floki.raw_html(inner_tree) =~ @regexes[:divToPElementsRe])
+  end
+  defp misused_divs?(_, _), do: false
+
+  # Placeholder: scoring is not implemented, the tree is returned as is.
+  defp select_best_candidate(html_tree) do
+    html_tree
+  end
+
+  # Placeholder: URIs are not rewritten yet, the tree is returned as is.
+  defp fix_relative_uris(html_tree) do
+    html_tree
+  end
+end
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@ -0,0 +1,25 @@
+defmodule Readability.Helper do
+  @moduledoc """
+  Utilities
+  """
+
+  @type html_tree :: tuple | list
+
+  @doc """
+  Renames every element whose tag name equals `selector` to `tag`.
+
+  The whole tree is walked. Attributes and children are kept, and the
+  children are renamed the same way. `selector` is compared with the tag
+  name for equality; it is not a CSS selector. Text and other non-element
+  nodes are returned unchanged.
+  """
+
+  @spec change_tag(html_tree, String.t, String.t) :: html_tree
+
+  # Using `tag_name` twice in the head makes this clause match only when
+  # the element's tag name equals the requested one.
+  def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
+    {tag, attrs, change_tag(inner_tree, tag_name, tag)}
+  end
+  def change_tag({tag_name, attrs, html_tree}, selector, tag) do
+    {tag_name, attrs, change_tag(html_tree, selector, tag)}
+  end
+  def change_tag([h|t], selector, tag) do
+    [change_tag(h, selector, tag)|change_tag(t, selector, tag)]
+  end
+  def change_tag([], _selector, _tag), do: []
+  # Text and any other non-element node (e.g. comments) is left untouched
+  # instead of raising a FunctionClauseError.
+  def change_tag(content, _selector, _tag), do: content
+end
--- a/lib/readability/title_finder.ex
+++ b/lib/readability/title_finder.ex
@ -8,6 +8,12 @@ defmodule Readability.TitleFinder do

  @type html_tree :: tuple | list

+  @doc """
+  Find proper title
+  """
+
+  @spec title(html_tree) :: binary
+
  def title(html_tree) do
    maybe_title = tag_title(html_tree)
    if length(String.split(maybe_title, " ")) <= 4 do
--- a/test/content_finder_test.exs
+++ b/test/content_finder_test.exs
@ -0,0 +1,65 @@
+defmodule Readability.ContentFinderTest do
+  use ExUnit.Case, async: true
+
+  alias Readability.ContentFinder
+
+  doctest Readability.ContentFinder
+
+  # Nodes expected to be dropped (header, nav, disqus div) next to an
+  # article that has to be kept.
+  @unlikely_sample """
+  <html>
+    <body>
+      <header>HEADER</header>
+      <nav>NAV</nav>
+      <article class="community">ARTICLE</article>
+      <div class="disqus">SOCIAL</div>
+    </body>
+  </html>
+  """
+
+  test "remove unlikely tag nodes" do
+    article = {"article", [{"class", "community"}], ["ARTICLE"]}
+    expected = {"html", [], [{"body", [], [article]}]}
+
+    tree = Readability.parse(@unlikely_sample)
+
+    assert expected == ContentFinder.remove_unlikely_candidates(tree)
+  end
+
+  # The first div only wraps inline content, the second one wraps a paragraph.
+  @misused_div_sample """
+  <html>
+    <body>
+      <div>
+        <span>here</span>
+      </div>
+      <div>
+        <p>not here</p>
+      </div>
+    </body>
+  </html>
+  """
+
+  test "transform misused div tag" do
+    renamed_div = {"p", [], [{"span", [], ["here"]}]}
+    untouched_div = {"div", [], [{"p", [], ["not here"]}]}
+    expected = {"html", [], [{"body", [], [renamed_div, untouched_div]}]}
+
+    tree = Readability.parse(@misused_div_sample)
+
+    assert expected == ContentFinder.transform_misused_divs_into_paragraphs(tree)
+  end
+
+  # Reads test/fixtures/<name>.html and returns its contents.
+  def read_html(name) do
+    path = "./test/fixtures/#{name}.html"
+    {:ok, contents} = File.read(path)
+    contents
+  end
+end
--- a/test/fixtures/bbc.html
+++ b/test/fixtures/bbc.html
--- a/test/fixtures/code.html
+++ b/test/fixtures/code.html
@ -0,0 +1,13 @@
+<html>
+  <body>
+    <code><pre>
+root
+  indented
+    </pre></code>
+
+    <pre><code>
+second
+  indented
+    </code></pre>
+  </body>
+</html>
--- a/test/fixtures/nytimes.html
+++ b/test/fixtures/nytimes.html
--- a/test/features/nytimes.html
+++ b/test/features/nytimes.html
--- a/test/fixtures/thesun.html
+++ b/test/fixtures/thesun.html
--- a/test/helper_test.exs
+++ b/test/helper_test.exs
@ -0,0 +1,31 @@
+defmodule Readability.HelperTest do
+  use ExUnit.Case, async: true
+
+  alias Readability.Helper
+
+  # Nested and sibling <font> elements, so the rename has to reach every
+  # level of the tree.
+  @sample """
+    <html>
+      <body>
+        <p>
+          <font>a</font>
+          <p>
+            <font>abc</font>
+          </p>
+        </p>
+        <p>
+          <font>b</font>
+        </p>
+      </body>
+    </html>
+  """
+
+  test "change font tag to span" do
+    # The expected tree is built by renaming the tags in the markup itself.
+    expected = @sample
+               |> String.replace("font", "span")
+               |> Floki.parse
+
+    result = Helper.change_tag(Readability.parse(@sample), "font", "span")
+    assert expected == result
+  end
+end
--- a/test/title_finder_test.exs
+++ b/test/title_finder_test.exs
@ -1,7 +1,7 @@
 defmodule Readability.TitleFinderTest do
  use ExUnit.Case, async: true

-  doctest Readability
+  doctest Readability.TitleFinder

  @html """
  <html>