This commit is contained in:
keepcosmos 2016-04-24 14:32:43 +09:00
parent f69365c4a4
commit d8677a599c
9 changed files with 7272 additions and 2114 deletions

View File

@ -73,11 +73,13 @@ defmodule Readability do
""" """
@spec readabl_text(html_tree) :: binary
def readabl_text(html_tree) do
  # TODO: Remove image captions when extracting text only.

  # Matches the closing tags of block-level elements. A newline is inserted
  # in front of each one so that the text of adjacent blocks stays on
  # separate lines once the markup is dropped.
  tags_to_br = ~r/<\/(p|div|article|h\d)/i
  html_str = raw_html(html_tree)

  tags_to_br
  |> Regex.replace(html_str, &("\n#{&1}"))
  |> Floki.parse()
  |> Floki.text()
  |> String.strip()
end
def regexes, do: @regexes def regexes, do: @regexes

2420
test/fixtures/bbc.html vendored

File diff suppressed because one or more lines are too long

6570
test/fixtures/buzzfeed.html vendored Normal file

File diff suppressed because one or more lines are too long

8
test/fixtures/medium.html vendored Normal file

File diff suppressed because one or more lines are too long

View File

@ -1,53 +0,0 @@
defmodule Readability.Candidate.BuilderTest.A do
  # Tests for Readability.Candidate.Builder.build/1 against a small
  # hand-written document.
  use ExUnit.Case, async: true
  import Readability, only: [parse: 1]
  alias Readability.Candidate.Builder
  doctest Readability

  # Every node the builder is expected to pick up is marked with
  # class="candidate"; the remaining nodes (the <td> with short content,
  # div#4, and the <div>/<span> pair) must not become candidates.
  @sample """
  <div id="1" class="candidate">
  <div id="2" class="candidate">
  <p id="3" class="candidate">
  Elixir is a dynamic, functional language designed for building scalable and maintainable applications.
  </p>
  </div>
  <td>
  <a>too short content</a>
  </td>
  <div id="4">
  <div id="5" class="candidate">
  <div id="6" class="candidate">
  <p id="7" class="candidate">
  Elixir leverages the Erlang VM, known for running low-latency, distributed and fault-tolerant systems, while also being successfully used in web development and the embedded software domain.
  </p>
  </div>
  </div>
  </div>
  <div>
  <span>
  not p, td node
  </span>
  </div>
  </div>
  """

  test "build candidate" do
    html_tree = parse(@sample)
    candidates = Builder.build(html_tree)

    # The builder must return exactly as many candidates as there are
    # nodes marked with the "candidate" class.
    expected = html_tree |> Floki.find(".candidate") |> length
    assert length(candidates) == expected

    # ...and each returned candidate must be one of those marked nodes.
    assert Enum.all?(candidates, fn cand ->
             # The second element of the node tuple is its attribute list.
             attrs = elem(cand.html_tree, 1)
             {_key, class} = List.keyfind(attrs, "class", 0, {"", ""})
             class == "candidate"
           end)
  end

  test "sample" do
    # Smoke test: building candidates from the sample must succeed and
    # yield a list.
    assert is_list(Builder.build(parse(@sample)))
  end
end

View File

@ -1,77 +0,0 @@
defmodule Readability.Candidate.FinderTest.A do
  # Tests for the tree clean-up steps: removal of unlikely candidates and
  # rewriting of misused <div> tags.
  use ExUnit.Case, async: true
  doctest Readability.Candidate.Finder
  alias Readability.Candidate.Finder
  alias Readability.Candidate.MisusedTrasformer
  alias Readability.Candidate.UnlikelyCandidatesRemover

  # Document in which only the <article> is expected to survive removal.
  @unlikely_sample """
  <html>
  <body>
  <header>HEADER</header>
  <nav>NAV</nav>
  <article class="community">ARTICLE</article>
  <div class="disqus">SOCIAL</div>
  </body>
  </html>
  """

  test "remove unlikely tag nodes" do
    article = {"article", [{"class", "community"}], ["ARTICLE"]}
    expected = {"html", [], [{"body", [], [article]}]}

    parsed = Readability.parse(@unlikely_sample)
    result = UnlikelyCandidatesRemover.remove(parsed)

    assert expected == result
  end

  # Two <div>s: the first wraps only inline content, the second already
  # contains a block-level <p>.
  @misused_sample """
  <html>
  <body>
  <div>
  <span>here</span>
  </div>
  <div>
  <p>not here</p>
  </div>
  </body>
  </html>
  """

  test "transform misused div tag" do
    # The first <div> is expected to become a <p>; the second is untouched.
    rewritten = {"p", [], [{"span", [], ["here"]}]}
    untouched = {"div", [], [{"p", [], ["not here"]}]}
    expected = {"html", [], [{"body", [], [rewritten, untouched]}]}

    parsed = Readability.parse(@misused_sample)
    result = MisusedTrasformer.transform(parsed)

    assert expected == result
  end

  # NOTE: not referenced by any test in this module.
  @candidate_sample [
    {"div", [],
     [
       {"p", [], ["12345678901234567890123456"]},
       {"p", [], ["12345678901234567890123456"]}
     ]},
    {"div"}
  ]

  # Reads the fixture "./test/fixtures/<name>.html" and returns its
  # contents; fails with a match error when the file cannot be read.
  def read_html(name) do
    path = "./test/fixtures/#{name}.html"
    {:ok, contents} = File.read(path)
    contents
  end
end

View File

@ -49,11 +49,6 @@ defmodule Readability.Candidate.CleanerTest do
test "not remove body tags", %{html_tree: html_tree} do
  # Removing unlikely nodes must leave the <body> element in place.
  cleaned = Cleaner.remove_unlikely_tree(html_tree)
  body_nodes = Floki.find(cleaned, "body")

  assert body_nodes != []
end
end end

View File

@ -1,12 +1,10 @@
defmodule ReadabilityTest do defmodule ReadabilityTest do
use ExUnit.Case, async: true use ExUnit.Case, async: true
@fixtures_path "./test/fixtures/"
test "readability for NY Times" do test "readability for NY Times" do
{:ok, nytimes} = File.read(@fixtures_path <> "nytimes.html") html = TestHelper.read_fixture("nytimes.html")
opts = [clean_conditionally: false] opts = [clean_conditionally: false]
nytimes = Readability.content(nytimes, opts) nytimes = Readability.content(html, opts)
nytimes_html = Readability.raw_html(nytimes) nytimes_html = Readability.raw_html(nytimes)
assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/ assert nytimes_html =~ ~r/^<div><div class=\"story-body\">/
@ -18,17 +16,47 @@ defmodule ReadabilityTest do
end end
test "readability for BBC" do
  # Extract the article from the saved BBC page.
  article = "bbc.html" |> TestHelper.read_fixture() |> Readability.content()

  # The serialized markup must begin and end at the article body.
  markup = Readability.raw_html(article)
  assert markup =~ ~r/^<div><div class=\"story-body__inner\" property=\"articleBody\">/
  assert markup =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/

  text = Readability.readabl_text(article)
  # TODO: Remove image captions when extracting text only.
  # assert text =~ ~r/^Microsoft\'s quarterly profit has missed analysts/
  assert text =~ ~r/connected computing devices\".$/
end
test "readability for medium" do
  # Extract the article from the saved Medium page.
  article = "medium.html" |> TestHelper.read_fixture() |> Readability.content()

  # The serialized markup must begin and end at the article section.
  markup = Readability.raw_html(article)
  assert markup =~ ~r/^<div><div class=\"section-inner layoutSingleColumn\">/
  assert markup =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/

  # The plain text must begin and end with the article's own sentences.
  text = Readability.readabl_text(article)
  assert text =~ ~r/^Background: Ive spent the past 6/
  assert text =~ ~r/a lot to me if you hit the recommend button!$/
end
test "readability for buzzfeed" do
  # Extract the article from the saved BuzzFeed page.
  article = "buzzfeed.html" |> TestHelper.read_fixture() |> Readability.content()

  # The serialized markup must begin and end at the article text container.
  markup = Readability.raw_html(article)
  assert markup =~ ~r/^<div><div class=\"buzz_superlist_item_text\"><p>/
  assert markup =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/

  # The plain text must begin and end with the article's own sentences.
  text = Readability.readabl_text(article)
  assert text =~ ~r/^The FBI no longer needs Apples help/
  assert text =~ ~r/issue of court orders and encrypted devices.$/
end
end end

View File

@ -1 +1,10 @@
defmodule TestHelper do
  @moduledoc """
  Helpers shared by the test suite.
  """

  # Directory holding the saved HTML pages, relative to the directory the
  # tests are run from.
  @fixtures_path "./test/fixtures/"

  @doc """
  Returns the contents of the fixture file `file_name`.

  Raises `File.Error`, which names the offending path, when the file
  cannot be read.
  """
  def read_fixture(file_name) do
    @fixtures_path
    |> Path.join(file_name)
    |> File.read!()
  end
end
ExUnit.start() ExUnit.start()