Add fixture-based tests for BBC, Medium and BuzzFeed

2016-04-24 14:32:43 +09:00 · 2016-04-24 14:32:43 +09:00 · d8677a599c
parent f69365c4a4
commit d8677a599c
9 changed files with 7272 additions and 2114 deletions
--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -73,11 +73,13 @@ defmodule Readability do
  """
  @spec readabl_text(html_tree) :: binary
  def readabl_text(html_tree) do
+    # TODO: Remove image captions when extracting only text
+    # Matches the start of a closing tag of a block-level element: </p, </div, </article, </h1 ... </h9.
    tags_to_br = ~r/<\/(p|div|article|h\d)/i
    html_str = html_tree |> raw_html
+    # Insert a newline in front of every matched closing tag before the markup is reduced to
+    # text, presumably so the text of adjacent blocks does not run together.
    Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
    |> Floki.parse
    |> Floki.text
+    |> String.strip
  end

  def regexes, do: @regexes # returns the @regexes module attribute, which is defined outside this hunk
--- a/test/fixtures/bbc.html
+++ b/test/fixtures/bbc.html
--- a/test/fixtures/buzzfeed.html
+++ b/test/fixtures/buzzfeed.html
--- a/test/fixtures/medium.html
+++ b/test/fixtures/medium.html
--- a/test/readability/candidate/_builder.exs
+++ b/test/readability/candidate/_builder.exs
@ -1,53 +0,0 @@
-defmodule Readability.Candidate.BuilderTest.A do
-  use ExUnit.Case, async: true
-  import Readability, only: [parse: 1]
-  alias Readability.Candidate.Builder
-
-  doctest Readability
-
-  @sample """
-  <div id="1" class="candidate">
-    <div id="2" class="candidate">
-      <p id="3" class="candidate">
-        Elixir is a dynamic, functional language designed for building scalable and maintainable applications.
-      </p>
-    </div>
-    <td>
-      <a>too short content</a>
-    </td>
-    <div id="4">
-      <div id="5" class="candidate">
-        <div id="6" class="candidate">
-          <p id="7" class="candidate">
-            Elixir leverages the Erlang VM, known for running low-latency, distributed and fault-tolerant systems, while also being successfully used in web development and the embedded software domain.
-          </p>
-        </div>
-      </div>
-    </div>
-    <div>
-      <span>
-        not p, td node
-      </span>
-    </div>
-  </div>
-  """
-
-  test "build candidate" do
-    candidates = Builder.build(parse(@sample))
-    expected = parse(@sample) |> Floki.find(".candidate") |> length
-    assert length(candidates) == expected
-
-    result =  candidates
-              |> Enum.all?(fn(cand) ->
-                   attrs = elem(cand.html_tree, 1)
-                   "candidate" == attrs
-                                  |> List.keyfind("class", 0, {"", ""})
-                                  |> elem(1)
-                 end)
-    assert result == true
-  end
-
-  test "sample" do
-    candidates = Builder.build(parse(@sample))
-  end
-end
--- a/test/readability/candidate/_finder.ex
+++ b/test/readability/candidate/_finder.ex
@ -1,77 +0,0 @@
-defmodule Readability.Candidate.FinderTest.A do
-  use ExUnit.Case, async: true
-
-  doctest Readability.Candidate.Finder
-
-  alias Readability.Candidate.Finder
-  alias Readability.Candidate.MisusedTrasformer
-  alias Readability.Candidate.UnlikelyCandidatesRemover
-
-  @unlikey_sample """
-  <html>
-    <body>
-      <header>HEADER</header>
-      <nav>NAV</nav>
-      <article class="community">ARTICLE</article>
-      <div class="disqus">SOCIAL</div>
-    </body>
-  </html>
-  """
-
-  test "remove unlikely tag nodes" do
-    expected = {"html", [], [ {"body", [], [ {"article", [{"class", "community"}], ["ARTICLE"]} ]} ]}
-    result = @unlikey_sample
-             |> Readability.parse
-             |> UnlikelyCandidatesRemover.remove
-    assert expected == result
-  end
-
-  @misused_sample """
-  <html>
-    <body>
-      <div>
-        <span>here</span>
-      </div>
-      <div>
-        <p>not here</p>
-      </div>
-    </body>
-  </html>
-  """
-
-  test "transform misused div tag" do
-    expected = {"html",
-                  [],
-                  [{"body",
-                    [],
-                    [{"p",
-                      [],
-                      [{"span", [], ["here"]}]
-                    }, {"div",
-                      [],
-                      [{"p", [], ["not here"]}]
-                    }]
-                  }]
-                }
-
-    result = @misused_sample
-             |> Readability.parse
-             |> MisusedTrasformer.transform
-    assert expected == result
-  end
-
-  @candidate_sample [{"div",
-                      [],
-                      [{"p", [], ["12345678901234567890123456"]},
-                       {"p", [], ["12345678901234567890123456"]}
-                      ]
-                    },{"div"
-
-                      }]
-
-
-  def read_html(name) do
-    {:ok, body} = File.read("./test/fixtures/#{name}.html")
-    body
-  end
-end
--- a/test/readability/candidate/cleaner_test.exs
+++ b/test/readability/candidate/cleaner_test.exs
@ -49,11 +49,6 @@ defmodule Readability.Candidate.CleanerTest do

  test "not remove body tags", %{html_tree: html_tree} do
    html_tree = Cleaner.remove_unlikely_tree(html_tree) # rebinds the tree taken from the test context
-    Floki.find(html_tree, "body") == []
-  end
-
-  test "not remove body tags", %{html_tree: html_tree} do
-    html_tree = Cleaner.remove_unlikely_tree(html_tree)
-    assert Floki.text(html_tree) =~ ~r/real content/
+    refute Floki.find(html_tree, "body") == [] # an empty match list would mean <body> was removed
  end
 end
--- a/test/readability_test.exs
+++ b/test/readability_test.exs
@ -1,12 +1,10 @@
 defmodule ReadabilityTest do
  use ExUnit.Case, async: true

-  @fixtures_path "./test/fixtures/"
-
  test "readability for NY Times" do
-    {:ok, nytimes} = File.read(@fixtures_path <> "nytimes.html")
+    html = TestHelper.read_fixture("nytimes.html")
    opts = [clean_conditionally: false]
-    nytimes = Readability.content(nytimes, opts)
+    nytimes = Readability.content(html, opts)

    nytimes_html = Readability.raw_html(nytimes)
    assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
@ -18,17 +16,47 @@ defmodule ReadabilityTest do
  end

  test "readability for BBC" do
-    %{status_code: 200, body: body} = HTTPoison.get!("http://www.bbc.com/news/business-36108166")
-    Readability.content(body) |> Readability.readabl_text
+    # Uses the bbc.html fixture file rather than a live HTTP request.
+    html = TestHelper.read_fixture("bbc.html")
+    bbc = Readability.content(html)
+
+    bbc_html = Readability.raw_html(bbc)
+
+    assert bbc_html =~ ~r/^<div><div class=\"story-body__inner\" property=\"articleBody\">/
+    # `\.` pins the literal full stop; a bare `.` would accept any character in that position.
+    assert bbc_html =~ ~r/connected computing devices\"\.<\/p><\/div><\/div>$/
+
+    bbc_text = Readability.readabl_text(bbc)
+    # TODO: Remove image captions when extracting only text
+    # assert bbc_text =~ ~r/^Microsoft\'s quarterly profit has missed analysts/
+    assert bbc_text =~ ~r/connected computing devices\"\.$/
  end

  test "readability for medium" do
-    %{status_code: 200, body: body} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58#.d0xmzfd15")
-    Readability.content(body) |> Readability.readabl_text
+    # Extract the article from the medium.html fixture; no HTTP request is made.
+    article =
+      "medium.html"
+      |> TestHelper.read_fixture()
+      |> Readability.content()
+
+    # The serialized HTML must start and end with the expected markup.
+    markup = Readability.raw_html(article)
+    assert markup =~ ~r/^<div><div class=\"section-inner layoutSingleColumn\">/
+    assert markup =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/
+
+    # The text-only rendering must start and end with the expected sentences.
+    plain_text = Readability.readabl_text(article)
+    assert plain_text =~ ~r/^Background: I’ve spent the past 6/
+    assert plain_text =~ ~r/a lot to me if you hit the recommend button!$/
  end

  test "readability for buzzfeed" do
-    %{status_code: 200, body: body} = HTTPoison.get!("http://www.buzzfeed.com/salvadorhernandez/fbi-obtains-passcode-to-iphone-in-new-york-drops-case-agains#.koMMa21lj8")
-    Readability.content(body) |> Readability.readabl_text
+    # Uses the buzzfeed.html fixture file rather than a live HTTP request.
+    html = TestHelper.read_fixture("buzzfeed.html")
+    buzzfeed = Readability.content(html)
+
+    buzzfeed_html = Readability.raw_html(buzzfeed)
+
+    assert buzzfeed_html =~ ~r/^<div><div class=\"buzz_superlist_item_text\"><p>/
+    # `\.` pins the literal full stop; a bare `.` would accept any character in that position.
+    assert buzzfeed_html =~ ~r/encrypted devices\.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/
+
+    buzzfeed_text = Readability.readabl_text(buzzfeed)
+
+    assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
+    assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices\.$/
  end
 end
--- a/test/test_helper.exs
+++ b/test/test_helper.exs
@ -1 +1,10 @@
+defmodule TestHelper do
+  @moduledoc false
+
+  # Directory holding the HTML fixtures; the path is relative, so it is resolved against the
+  # directory the tests are run from.
+  @fixtures_path "./test/fixtures/"
+
+  @doc """
+  Returns the contents of the fixture `file_name` (for example `"bbc.html"`) as a binary.
+
+  Raises `File.Error`, which names the offending path, when the fixture cannot be read.
+  """
+  @spec read_fixture(String.t()) :: binary
+  def read_fixture(file_name) do
+    @fixtures_path
+    |> Path.join(file_name)
+    |> File.read!()
+  end
+end
+
 ExUnit.start()