add test

2016-04-24 14:32:43 +09:00 · 2016-04-24 14:32:43 +09:00 · d8677a599c
parent f69365c4a4
commit d8677a599c
9 changed files with 7272 additions and 2114 deletions
--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -73,11 +73,13 @@ defmodule Readability do
  """
  @spec readabl_text(html_tree) :: binary
  def readabl_text(html_tree) do
    # The spec above describes this function; it previously named `raw_html`.
    # TODO: Remove image captions when extracting only text
    # Case-insensitive match on the start of a closing </p>, </div>,
    # </article> or </hN> tag.
    block_closers = ~r/<\/(p|div|article|h\d)/i
    # Prefix every such closing tag with a newline (presumably so block
    # boundaries remain visible once the markup is flattened to text).
    newline_before = fn closing_tag -> "\n#{closing_tag}" end
    markup = raw_html(html_tree)

    block_closers
    |> Regex.replace(markup, newline_before)
    |> Floki.parse
    |> Floki.text
    |> String.strip
  end
  @doc """
  Returns the value stored in the `@regexes` module attribute.
  """
  def regexes do
    @regexes
  end
--- a/test/fixtures/bbc.html
+++ b/test/fixtures/bbc.html
--- a/test/fixtures/buzzfeed.html
+++ b/test/fixtures/buzzfeed.html
--- a/test/fixtures/medium.html
+++ b/test/fixtures/medium.html
--- a/test/readability/candidate/_builder.exs
+++ b/test/readability/candidate/_builder.exs
@ -1,53 +0,0 @@
 defmodule Readability.Candidate.BuilderTest.A do
  use ExUnit.Case, async: true
  import Readability, only: [parse: 1]
  alias Readability.Candidate.Builder
  doctest Readability
  # Fixture markup: every node the builder is expected to return carries
  # class="candidate"; the remaining nodes (td > a, span, the wrapper div
  # id="4") carry no such class.
  @sample """
  <div id="1" class="candidate">
    <div id="2" class="candidate">
      <p id="3" class="candidate">
        Elixir is a dynamic, functional language designed for building scalable and maintainable applications.
      </p>
    </div>
    <td>
      <a>too short content</a>
    </td>
    <div id="4">
      <div id="5" class="candidate">
        <div id="6" class="candidate">
          <p id="7" class="candidate">
            Elixir leverages the Erlang VM, known for running low-latency, distributed and fault-tolerant systems, while also being successfully used in web development and the embedded software domain.
          </p>
        </div>
      </div>
    </div>
    <div>
      <span>
        not p, td node
      </span>
    </div>
  </div>
  """
  test "build candidate" do
    # Parse once and reuse the tree for both the builder and the expectation.
    html_tree = parse(@sample)
    candidates = Builder.build(html_tree)
    expected = html_tree |> Floki.find(".candidate") |> length

    # The builder must return exactly as many candidates as there are marked nodes...
    assert length(candidates) == expected

    # ...and each returned candidate must be one of the marked nodes.
    assert Enum.all?(candidates, fn cand ->
             # Attributes are the second element of the node tuple; a node
             # without a class attribute falls back to {"", ""} and fails.
             attrs = elem(cand.html_tree, 1)
             {_key, class} = List.keyfind(attrs, "class", 0, {"", ""})
             class == "candidate"
           end)
  end
  test "sample" do
    # Smoke test: building from the fixture must succeed and yield a list
    # (previously the result was bound but never checked).
    candidates = Builder.build(parse(@sample))
    assert is_list(candidates)
  end
 end
--- a/test/readability/candidate/_finder.ex
+++ b/test/readability/candidate/_finder.ex
@ -1,77 +0,0 @@
 defmodule Readability.Candidate.FinderTest.A do
  use ExUnit.Case, async: true
  doctest Readability.Candidate.Finder
  alias Readability.Candidate.Finder
  alias Readability.Candidate.MisusedTrasformer
  alias Readability.Candidate.UnlikelyCandidatesRemover
  # Page with header, nav and a "disqus" div next to one article node.
  @unlikely_sample """
  <html>
    <body>
      <header>HEADER</header>
      <nav>NAV</nav>
      <article class="community">ARTICLE</article>
      <div class="disqus">SOCIAL</div>
    </body>
  </html>
  """
  test "remove unlikely tag nodes" do
    # Only the article node is expected to remain inside <body>.
    article = {"article", [{"class", "community"}], ["ARTICLE"]}
    expected = {"html", [], [{"body", [], [article]}]}

    html_tree = Readability.parse(@unlikely_sample)
    result = UnlikelyCandidatesRemover.remove(html_tree)

    assert expected == result
  end
  # Two divs: the first wraps only a span, the second already wraps a <p>.
  @misused_sample """
  <html>
    <body>
      <div>
        <span>here</span>
      </div>
      <div>
        <p>not here</p>
      </div>
    </body>
  </html>
  """
  test "transform misused div tag" do
    # The span-only div is expected to come back as a <p>; the div that
    # already contains a <p> is expected to be returned unchanged.
    converted_div = {"p", [], [{"span", [], ["here"]}]}
    untouched_div = {"div", [], [{"p", [], ["not here"]}]}
    expected = {"html", [], [{"body", [], [converted_div, untouched_div]}]}

    html_tree = Readability.parse(@misused_sample)
    result = MisusedTrasformer.transform(html_tree)

    assert expected == result
  end
  # NOTE(review): this attribute is not referenced anywhere in this module and
  # its second element is a one-element tuple - looks unfinished; confirm intent.
  @candidate_sample [
    {"div", [],
     [
       {"p", [], ["12345678901234567890123456"]},
       {"p", [], ["12345678901234567890123456"]}
     ]},
    {"div"}
  ]
  # Reads ./test/fixtures/<name>.html and returns its contents; a failed read
  # surfaces as a MatchError on the {:ok, _} pattern.
  def read_html(name) do
    path = "./test/fixtures/#{name}.html"
    {:ok, contents} = File.read(path)
    contents
  end
 end
--- a/test/readability/candidate/cleaner_test.exs
+++ b/test/readability/candidate/cleaner_test.exs
@ -49,11 +49,6 @@ defmodule Readability.Candidate.CleanerTest do
  test "not remove body tags", %{html_tree: html_tree} do
    html_tree = Cleaner.remove_unlikely_tree(html_tree)
    # The bare comparison discarded its result, so the test could never fail;
    # `refute` makes it fail when no <body> node is found after cleaning.
-    Floki.find(html_tree, "body") == []
+    refute Floki.find(html_tree, "body") == []
  end
  # The description must differ from the body-tag test above: ExUnit derives
  # the test's function name from it, so identical descriptions collide.
  test "keep real content", %{html_tree: html_tree} do
    html_tree = Cleaner.remove_unlikely_tree(html_tree)
    # The text of the cleaned tree must still contain the article body.
    assert Floki.text(html_tree) =~ ~r/real content/
  end
 end
--- a/test/readability_test.exs
+++ b/test/readability_test.exs
@ -1,12 +1,10 @@
 defmodule ReadabilityTest do
  use ExUnit.Case, async: true
  @fixtures_path "./test/fixtures/"
  test "readability for NY Times" do
-    {:ok, nytimes} = File.read(@fixtures_path <> "nytimes.html")
+    html = TestHelper.read_fixture("nytimes.html")
    opts = [clean_conditionally: false]
-    nytimes = Readability.content(nytimes, opts)
+    nytimes = Readability.content(html, opts)
    nytimes_html = Readability.raw_html(nytimes)
    assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
@ -18,17 +16,47 @@ defmodule ReadabilityTest do
  end
  test "readability for BBC" do
    # The page is loaded from the saved fixture via TestHelper.read_fixture
    # instead of being fetched over HTTP.
-    %{status_code: 200, body: body} = HTTPoison.get!("http://www.bbc.com/news/business-36108166")
+    html = TestHelper.read_fixture("bbc.html")
-    Readability.content(body) |> Readability.readabl_text
+    bbc = Readability.content(html)
    # The extracted HTML must start at the story-body wrapper and end with
    # the closing paragraph.
    bbc_html = Readability.raw_html(bbc)
    assert bbc_html =~ ~r/^<div><div class=\"story-body__inner\" property=\"articleBody\">/
    assert bbc_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
    # The text rendering must end with the same closing phrase as the HTML.
    bbc_text = Readability.readabl_text(bbc)
    # TODO: Remove image caption when extract only text
    # assert bbc_text =~ ~r/^Microsoft\'s quarterly profit has missed analysts/
    assert bbc_text =~ ~r/connected computing devices\".$/
  end
  test "readability for medium" do
    # The page is loaded from the saved fixture via TestHelper.read_fixture
    # instead of being fetched over HTTP.
-    %{status_code: 200, body: body} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58#.d0xmzfd15")
+    html = TestHelper.read_fixture("medium.html")
-    Readability.content(body) |> Readability.readabl_text
+    medium = Readability.content(html)
    # The extracted HTML must start at the section-inner wrapper and end with
    # the closing heading.
    medium_html = Readability.raw_html(medium)
    assert medium_html =~ ~r/^<div><div class=\"section-inner layoutSingleColumn\">/
    assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/
    # The text rendering is anchored at both its first and last words.
    medium_text = Readability.readabl_text(medium)
    assert medium_text =~ ~r/^Background: I’ve spent the past 6/
    assert medium_text =~ ~r/a lot to me if you hit the recommend button!$/
  end
  test "readability for buzzfeed" do
    # The page is loaded from the saved fixture via TestHelper.read_fixture
    # instead of being fetched over HTTP.
-    %{status_code: 200, body: body} = HTTPoison.get!("http://www.buzzfeed.com/salvadorhernandez/fbi-obtains-passcode-to-iphone-in-new-york-drops-case-agains#.koMMa21lj8")
+    html = TestHelper.read_fixture("buzzfeed.html")
-    Readability.content(body) |> Readability.readabl_text
+    buzzfeed = Readability.content(html)
    # The extracted HTML must start at the superlist text wrapper and end with
    # the last paragraph followed by four <hr/> elements.
    buzzfeed_html = Readability.raw_html(buzzfeed)
    assert buzzfeed_html =~ ~r/^<div><div class=\"buzz_superlist_item_text\"><p>/
    assert buzzfeed_html =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/
    # The text rendering is anchored at both its first and last words.
    buzzfeed_text = Readability.readabl_text(buzzfeed)
    assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
    assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
  end
 end
--- a/test/test_helper.exs
+++ b/test/test_helper.exs
@ -1 +1,10 @@
 defmodule TestHelper do
  @moduledoc """
  Helpers shared by the test suite.
  """

  # Directory holding the saved HTML fixtures, relative to the current
  # working directory of the test run.
  @fixtures_path "./test/fixtures/"

  @doc """
  Returns the contents of the fixture `file_name` as a binary.

  `file_name` is appended to the fixtures directory as-is, so it must include
  the extension (e.g. `"bbc.html"`).

  Raises `File.Error` (which names the offending path) when the file cannot
  be read.
  """
  @spec read_fixture(String.t) :: binary
  def read_fixture(file_name) do
    File.read!(@fixtures_path <> file_name)
  end
 end
 # Boot ExUnit for this test run.
 ExUnit.start()