add test
This commit is contained in:
parent
f69365c4a4
commit
d8677a599c
|
@ -73,11 +73,13 @@ defmodule Readability do
|
||||||
"""
|
"""
|
||||||
@spec raw_html(html_tree) :: binary
|
@spec raw_html(html_tree) :: binary
|
||||||
def readabl_text(html_tree) do
|
def readabl_text(html_tree) do
|
||||||
|
# TODO: Remove image caption when extract only text
|
||||||
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
tags_to_br = ~r/<\/(p|div|article|h\d)/i
|
||||||
html_str = html_tree |> raw_html
|
html_str = html_tree |> raw_html
|
||||||
Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
|
Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
|
||||||
|> Floki.parse
|
|> Floki.parse
|
||||||
|> Floki.text
|
|> Floki.text
|
||||||
|
|> String.strip
|
||||||
end
|
end
|
||||||
|
|
||||||
def regexes, do: @regexes
|
def regexes, do: @regexes
|
||||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,53 +0,0 @@
|
||||||
defmodule Readability.Candidate.BuilderTest.A do
|
|
||||||
use ExUnit.Case, async: true
|
|
||||||
import Readability, only: [parse: 1]
|
|
||||||
alias Readability.Candidate.Builder
|
|
||||||
|
|
||||||
doctest Readability
|
|
||||||
|
|
||||||
@sample """
|
|
||||||
<div id="1" class="candidate">
|
|
||||||
<div id="2" class="candidate">
|
|
||||||
<p id="3" class="candidate">
|
|
||||||
Elixir is a dynamic, functional language designed for building scalable and maintainable applications.
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
<td>
|
|
||||||
<a>too short content</a>
|
|
||||||
</td>
|
|
||||||
<div id="4">
|
|
||||||
<div id="5" class="candidate">
|
|
||||||
<div id="6" class="candidate">
|
|
||||||
<p id="7" class="candidate">
|
|
||||||
Elixir leverages the Erlang VM, known for running low-latency, distributed and fault-tolerant systems, while also being successfully used in web development and the embedded software domain.
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<div>
|
|
||||||
<span>
|
|
||||||
not p, td node
|
|
||||||
</span>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
"""
|
|
||||||
|
|
||||||
test "build candidate" do
|
|
||||||
candidates = Builder.build(parse(@sample))
|
|
||||||
expected = parse(@sample) |> Floki.find(".candidate") |> length
|
|
||||||
assert length(candidates) == expected
|
|
||||||
|
|
||||||
result = candidates
|
|
||||||
|> Enum.all?(fn(cand) ->
|
|
||||||
attrs = elem(cand.html_tree, 1)
|
|
||||||
"candidate" == attrs
|
|
||||||
|> List.keyfind("class", 0, {"", ""})
|
|
||||||
|> elem(1)
|
|
||||||
end)
|
|
||||||
assert result == true
|
|
||||||
end
|
|
||||||
|
|
||||||
test "sample" do
|
|
||||||
candidates = Builder.build(parse(@sample))
|
|
||||||
end
|
|
||||||
end
|
|
|
@ -1,77 +0,0 @@
|
||||||
defmodule Readability.Candidate.FinderTest.A do
|
|
||||||
use ExUnit.Case, async: true
|
|
||||||
|
|
||||||
doctest Readability.Candidate.Finder
|
|
||||||
|
|
||||||
alias Readability.Candidate.Finder
|
|
||||||
alias Readability.Candidate.MisusedTrasformer
|
|
||||||
alias Readability.Candidate.UnlikelyCandidatesRemover
|
|
||||||
|
|
||||||
@unlikey_sample """
|
|
||||||
<html>
|
|
||||||
<body>
|
|
||||||
<header>HEADER</header>
|
|
||||||
<nav>NAV</nav>
|
|
||||||
<article class="community">ARTICLE</article>
|
|
||||||
<div class="disqus">SOCIAL</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
"""
|
|
||||||
|
|
||||||
test "remove unlikely tag nodes" do
|
|
||||||
expected = {"html", [], [ {"body", [], [ {"article", [{"class", "community"}], ["ARTICLE"]} ]} ]}
|
|
||||||
result = @unlikey_sample
|
|
||||||
|> Readability.parse
|
|
||||||
|> UnlikelyCandidatesRemover.remove
|
|
||||||
assert expected == result
|
|
||||||
end
|
|
||||||
|
|
||||||
@misused_sample """
|
|
||||||
<html>
|
|
||||||
<body>
|
|
||||||
<div>
|
|
||||||
<span>here</span>
|
|
||||||
</div>
|
|
||||||
<div>
|
|
||||||
<p>not here</p>
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
"""
|
|
||||||
|
|
||||||
test "transform misused div tag" do
|
|
||||||
expected = {"html",
|
|
||||||
[],
|
|
||||||
[{"body",
|
|
||||||
[],
|
|
||||||
[{"p",
|
|
||||||
[],
|
|
||||||
[{"span", [], ["here"]}]
|
|
||||||
}, {"div",
|
|
||||||
[],
|
|
||||||
[{"p", [], ["not here"]}]
|
|
||||||
}]
|
|
||||||
}]
|
|
||||||
}
|
|
||||||
|
|
||||||
result = @misused_sample
|
|
||||||
|> Readability.parse
|
|
||||||
|> MisusedTrasformer.transform
|
|
||||||
assert expected == result
|
|
||||||
end
|
|
||||||
|
|
||||||
@candidate_sample [{"div",
|
|
||||||
[],
|
|
||||||
[{"p", [], ["12345678901234567890123456"]},
|
|
||||||
{"p", [], ["12345678901234567890123456"]}
|
|
||||||
]
|
|
||||||
},{"div"
|
|
||||||
|
|
||||||
}]
|
|
||||||
|
|
||||||
|
|
||||||
def read_html(name) do
|
|
||||||
{:ok, body} = File.read("./test/fixtures/#{name}.html")
|
|
||||||
body
|
|
||||||
end
|
|
||||||
end
|
|
|
@ -49,11 +49,6 @@ defmodule Readability.Candidate.CleanerTest do
|
||||||
|
|
||||||
test "not remove body tags", %{html_tree: html_tree} do
|
test "not remove body tags", %{html_tree: html_tree} do
|
||||||
html_tree = Cleaner.remove_unlikely_tree(html_tree)
|
html_tree = Cleaner.remove_unlikely_tree(html_tree)
|
||||||
Floki.find(html_tree, "body") == []
|
refute Floki.find(html_tree, "body") == []
|
||||||
end
|
|
||||||
|
|
||||||
test "not remove body tags", %{html_tree: html_tree} do
|
|
||||||
html_tree = Cleaner.remove_unlikely_tree(html_tree)
|
|
||||||
assert Floki.text(html_tree) =~ ~r/real content/
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,12 +1,10 @@
|
||||||
defmodule ReadabilityTest do
|
defmodule ReadabilityTest do
|
||||||
use ExUnit.Case, async: true
|
use ExUnit.Case, async: true
|
||||||
|
|
||||||
@fixtures_path "./test/fixtures/"
|
|
||||||
|
|
||||||
test "readability for NY Times" do
|
test "readability for NY Times" do
|
||||||
{:ok, nytimes} = File.read(@fixtures_path <> "nytimes.html")
|
html = TestHelper.read_fixture("nytimes.html")
|
||||||
opts = [clean_conditionally: false]
|
opts = [clean_conditionally: false]
|
||||||
nytimes = Readability.content(nytimes, opts)
|
nytimes = Readability.content(html, opts)
|
||||||
|
|
||||||
nytimes_html = Readability.raw_html(nytimes)
|
nytimes_html = Readability.raw_html(nytimes)
|
||||||
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
|
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
|
||||||
|
@ -18,17 +16,47 @@ defmodule ReadabilityTest do
|
||||||
end
|
end
|
||||||
|
|
||||||
test "readability for BBC" do
|
test "readability for BBC" do
|
||||||
%{status_code: 200, body: body} = HTTPoison.get!("http://www.bbc.com/news/business-36108166")
|
html = TestHelper.read_fixture("bbc.html")
|
||||||
Readability.content(body) |> Readability.readabl_text
|
bbc = Readability.content(html)
|
||||||
|
|
||||||
|
bbc_html = Readability.raw_html(bbc)
|
||||||
|
|
||||||
|
assert bbc_html =~ ~r/^<div><div class=\"story-body__inner\" property=\"articleBody\">/
|
||||||
|
assert bbc_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
|
||||||
|
|
||||||
|
bbc_text = Readability.readabl_text(bbc)
|
||||||
|
# TODO: Remove image caption when extract only text
|
||||||
|
# assert bbc_text =~ ~r/^Microsoft\'s quarterly profit has missed analysts/
|
||||||
|
assert bbc_text =~ ~r/connected computing devices\".$/
|
||||||
end
|
end
|
||||||
|
|
||||||
test "readability for medium" do
|
test "readability for medium" do
|
||||||
%{status_code: 200, body: body} = HTTPoison.get!("https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58#.d0xmzfd15")
|
html = TestHelper.read_fixture("medium.html")
|
||||||
Readability.content(body) |> Readability.readabl_text
|
medium = Readability.content(html)
|
||||||
|
|
||||||
|
medium_html = Readability.raw_html(medium)
|
||||||
|
|
||||||
|
assert medium_html =~ ~r/^<div><div class=\"section-inner layoutSingleColumn\">/
|
||||||
|
assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/
|
||||||
|
|
||||||
|
medium_text = Readability.readabl_text(medium)
|
||||||
|
|
||||||
|
assert medium_text =~ ~r/^Background: I’ve spent the past 6/
|
||||||
|
assert medium_text =~ ~r/a lot to me if you hit the recommend button!$/
|
||||||
end
|
end
|
||||||
|
|
||||||
test "readability for buzzfeed" do
|
test "readability for buzzfeed" do
|
||||||
%{status_code: 200, body: body} = HTTPoison.get!("http://www.buzzfeed.com/salvadorhernandez/fbi-obtains-passcode-to-iphone-in-new-york-drops-case-agains#.koMMa21lj8")
|
html = TestHelper.read_fixture("buzzfeed.html")
|
||||||
Readability.content(body) |> Readability.readabl_text
|
buzzfeed = Readability.content(html)
|
||||||
|
|
||||||
|
buzzfeed_html = Readability.raw_html(buzzfeed)
|
||||||
|
|
||||||
|
assert buzzfeed_html =~ ~r/^<div><div class=\"buzz_superlist_item_text\"><p>/
|
||||||
|
assert buzzfeed_html =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/
|
||||||
|
|
||||||
|
buzzfeed_text = Readability.readabl_text(buzzfeed)
|
||||||
|
|
||||||
|
assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
|
||||||
|
assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -1 +1,10 @@
|
||||||
|
defmodule TestHelper do
|
||||||
|
@fixtures_path "./test/fixtures/"
|
||||||
|
|
||||||
|
def read_fixture(file_name) do
|
||||||
|
{:ok, html} = File.read(@fixtures_path <> file_name)
|
||||||
|
html
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
ExUnit.start()
|
ExUnit.start()
|
||||||
|
|
Loading…
Reference in New Issue