add test
This commit is contained in:
parent
f69365c4a4
commit
d8677a599c
@ -73,11 +73,13 @@ defmodule Readability do
|
||||
"""
|
||||
@spec readabl_text(html_tree) :: binary
def readabl_text(html_tree) do
  # TODO: Remove image captions when extracting only text
  # Matches the closing tags of block-level elements (p, div, article, h1..h9),
  # case-insensitively.
  tags_to_br = ~r/<\/(p|div|article|h\d)/i
  html_str = html_tree |> raw_html

  # Prefix every matched closing tag with a newline so the boundaries between
  # blocks are still visible once the markup is reduced to text; the result is
  # then re-parsed, flattened to text and stripped of surrounding whitespace.
  Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
  |> Floki.parse
  |> Floki.text
  |> String.strip
end
|
||||
|
||||
def regexes, do: @regexes
|
||||
|
2612
test/fixtures/bbc.html
vendored
2612
test/fixtures/bbc.html
vendored
File diff suppressed because one or more lines are too long
6570
test/fixtures/buzzfeed.html
vendored
Normal file
6570
test/fixtures/buzzfeed.html
vendored
Normal file
File diff suppressed because one or more lines are too long
8
test/fixtures/medium.html
vendored
Normal file
8
test/fixtures/medium.html
vendored
Normal file
File diff suppressed because one or more lines are too long
@ -1,53 +0,0 @@
|
||||
defmodule Readability.Candidate.BuilderTest.A do
|
||||
use ExUnit.Case, async: true
|
||||
import Readability, only: [parse: 1]
|
||||
alias Readability.Candidate.Builder
|
||||
|
||||
doctest Readability
|
||||
|
||||
@sample """
|
||||
<div id="1" class="candidate">
|
||||
<div id="2" class="candidate">
|
||||
<p id="3" class="candidate">
|
||||
Elixir is a dynamic, functional language designed for building scalable and maintainable applications.
|
||||
</p>
|
||||
</div>
|
||||
<td>
|
||||
<a>too short content</a>
|
||||
</td>
|
||||
<div id="4">
|
||||
<div id="5" class="candidate">
|
||||
<div id="6" class="candidate">
|
||||
<p id="7" class="candidate">
|
||||
Elixir leverages the Erlang VM, known for running low-latency, distributed and fault-tolerant systems, while also being successfully used in web development and the embedded software domain.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div>
|
||||
<span>
|
||||
not p, td node
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
test "build candidate" do
|
||||
candidates = Builder.build(parse(@sample))
|
||||
expected = parse(@sample) |> Floki.find(".candidate") |> length
|
||||
assert length(candidates) == expected
|
||||
|
||||
result = candidates
|
||||
|> Enum.all?(fn(cand) ->
|
||||
attrs = elem(cand.html_tree, 1)
|
||||
"candidate" == attrs
|
||||
|> List.keyfind("class", 0, {"", ""})
|
||||
|> elem(1)
|
||||
end)
|
||||
assert result == true
|
||||
end
|
||||
|
||||
test "sample" do
|
||||
candidates = Builder.build(parse(@sample))
|
||||
end
|
||||
end
|
@ -1,77 +0,0 @@
|
||||
defmodule Readability.Candidate.FinderTest.A do
|
||||
use ExUnit.Case, async: true
|
||||
|
||||
doctest Readability.Candidate.Finder
|
||||
|
||||
alias Readability.Candidate.Finder
|
||||
alias Readability.Candidate.MisusedTrasformer
|
||||
alias Readability.Candidate.UnlikelyCandidatesRemover
|
||||
|
||||
@unlikey_sample """
|
||||
<html>
|
||||
<body>
|
||||
<header>HEADER</header>
|
||||
<nav>NAV</nav>
|
||||
<article class="community">ARTICLE</article>
|
||||
<div class="disqus">SOCIAL</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
test "remove unlikely tag nodes" do
|
||||
expected = {"html", [], [ {"body", [], [ {"article", [{"class", "community"}], ["ARTICLE"]} ]} ]}
|
||||
result = @unlikey_sample
|
||||
|> Readability.parse
|
||||
|> UnlikelyCandidatesRemover.remove
|
||||
assert expected == result
|
||||
end
|
||||
|
||||
@misused_sample """
|
||||
<html>
|
||||
<body>
|
||||
<div>
|
||||
<span>here</span>
|
||||
</div>
|
||||
<div>
|
||||
<p>not here</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
test "transform misused div tag" do
|
||||
expected = {"html",
|
||||
[],
|
||||
[{"body",
|
||||
[],
|
||||
[{"p",
|
||||
[],
|
||||
[{"span", [], ["here"]}]
|
||||
}, {"div",
|
||||
[],
|
||||
[{"p", [], ["not here"]}]
|
||||
}]
|
||||
}]
|
||||
}
|
||||
|
||||
result = @misused_sample
|
||||
|> Readability.parse
|
||||
|> MisusedTrasformer.transform
|
||||
assert expected == result
|
||||
end
|
||||
|
||||
@candidate_sample [{"div",
|
||||
[],
|
||||
[{"p", [], ["12345678901234567890123456"]},
|
||||
{"p", [], ["12345678901234567890123456"]}
|
||||
]
|
||||
},{"div"
|
||||
|
||||
}]
|
||||
|
||||
|
||||
def read_html(name) do
|
||||
{:ok, body} = File.read("./test/fixtures/#{name}.html")
|
||||
body
|
||||
end
|
||||
end
|
@ -49,11 +49,6 @@ defmodule Readability.Candidate.CleanerTest do
|
||||
|
||||
test "not remove body tags", %{html_tree: html_tree} do
|
||||
html_tree = Cleaner.remove_unlikely_tree(html_tree)
|
||||
Floki.find(html_tree, "body") == []
|
||||
end
|
||||
|
||||
test "not remove body tags", %{html_tree: html_tree} do
|
||||
html_tree = Cleaner.remove_unlikely_tree(html_tree)
|
||||
assert Floki.text(html_tree) =~ ~r/real content/
|
||||
refute Floki.find(html_tree, "body") == []
|
||||
end
|
||||
end
|
||||
|
@ -1,12 +1,10 @@
|
||||
defmodule ReadabilityTest do
|
||||
use ExUnit.Case, async: true
|
||||
|
||||
@fixtures_path "./test/fixtures/"
|
||||
|
||||
test "readability for NY Times" do
|
||||
{:ok, nytimes} = File.read(@fixtures_path <> "nytimes.html")
|
||||
html = TestHelper.read_fixture("nytimes.html")
|
||||
opts = [clean_conditionally: false]
|
||||
nytimes = Readability.content(nytimes, opts)
|
||||
nytimes = Readability.content(html, opts)
|
||||
|
||||
nytimes_html = Readability.raw_html(nytimes)
|
||||
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
|
||||
@ -18,17 +16,47 @@ defmodule ReadabilityTest do
|
||||
end
|
||||
|
||||
test "readability for BBC" do
|
||||
%{status_code: 200, body: body} = HTTPoison.get!("http://www.bbc.com/news/business-36108166")
|
||||
Readability.content(body) |> Readability.readabl_text
|
||||
html = TestHelper.read_fixture("bbc.html")
|
||||
bbc = Readability.content(html)
|
||||
|
||||
bbc_html = Readability.raw_html(bbc)
|
||||
|
||||
assert bbc_html =~ ~r/^<div><div class=\"story-body__inner\" property=\"articleBody\">/
|
||||
assert bbc_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
||||
|
||||
bbc_text = Readability.readabl_text(bbc)
|
||||
# TODO: Remove image captions when extracting only text
|
||||
# assert bbc_text =~ ~r/^Microsoft\'s quarterly profit has missed analysts/
|
||||
assert bbc_text =~ ~r/connected computing devices\".$/
|
||||
end
|
||||
|
||||
test "readability for medium" do
|
||||
%{status_code: 200, body: body} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58#.d0xmzfd15")
|
||||
Readability.content(body) |> Readability.readabl_text
|
||||
html = TestHelper.read_fixture("medium.html")
|
||||
medium = Readability.content(html)
|
||||
|
||||
medium_html = Readability.raw_html(medium)
|
||||
|
||||
assert medium_html =~ ~r/^<div><div class=\"section-inner layoutSingleColumn\">/
|
||||
assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/
|
||||
|
||||
medium_text = Readability.readabl_text(medium)
|
||||
|
||||
assert medium_text =~ ~r/^Background: I’ve spent the past 6/
|
||||
assert medium_text =~ ~r/a lot to me if you hit the recommend button!$/
|
||||
end
|
||||
|
||||
test "readability for buzzfeed" do
|
||||
%{status_code: 200, body: body} = HTTPoison.get!("http://www.buzzfeed.com/salvadorhernandez/fbi-obtains-passcode-to-iphone-in-new-york-drops-case-agains#.koMMa21lj8")
|
||||
Readability.content(body) |> Readability.readabl_text
|
||||
html = TestHelper.read_fixture("buzzfeed.html")
|
||||
buzzfeed = Readability.content(html)
|
||||
|
||||
buzzfeed_html = Readability.raw_html(buzzfeed)
|
||||
|
||||
assert buzzfeed_html =~ ~r/^<div><div class=\"buzz_superlist_item_text\"><p>/
|
||||
assert buzzfeed_html =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/
|
||||
|
||||
buzzfeed_text = Readability.readabl_text(buzzfeed)
|
||||
|
||||
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
|
||||
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
|
||||
end
|
||||
end
|
||||
|
@ -1 +1,10 @@
|
||||
defmodule TestHelper do
  # Directory holding the HTML fixtures, relative to the current working directory.
  @fixtures_path "./test/fixtures/"

  # Returns the contents of the fixture file named `file_name`.
  # The strict `{:ok, _}` match raises a MatchError when the file cannot be read.
  def read_fixture(file_name) do
    fixture = @fixtures_path <> file_name
    {:ok, contents} = File.read(fixture)
    contents
  end
end
|
||||
|
||||
ExUnit.start()
|
||||
|
Loading…
x
Reference in New Issue
Block a user