defmodule ReadabilityTest do
  # End-to-end checks for the Readability pipeline.
  #
  # Every test follows the same shape:
  #   1. load an HTML fixture through TestHelper.read_fixture/1,
  #   2. run it through Readability.article/1 (or article/2 with options),
  #   3. render the result with Readability.readable_html/1 and
  #      Readability.readable_text/1,
  #   4. assert on how each rendered string begins and ends.
  use ExUnit.Case, async: true

  test "readability for NY Times" do
    html = TestHelper.read_fixture("nytimes.html")
    # This is the only fixture that passes options to Readability.article/2.
    opts = [clean_conditionally: false]
    nytimes = Readability.article(html, opts)

    nytimes_html = Readability.readable_html(nytimes)

    assert nytimes_html =~
             ~r/^<div><div><figure id=\"media-100000004245260\"><div><img src=\"https/

    assert nytimes_html =~ ~r/major priorities.<\/p><\/div><\/div>$/

    nytimes_text = Readability.readable_text(nytimes)
    assert nytimes_text =~ ~r/^Buddhist monks performing as part of/
    assert nytimes_text =~ ~r/one of her major priorities.$/
  end

  test "readability for BBC" do
    html = TestHelper.read_fixture("bbc.html")
    bbc = Readability.article(html)

    bbc_html = Readability.readable_html(bbc)

    assert bbc_html =~ ~r/^<div><div><figure><span><img alt=\"A Microsoft logo/
    assert bbc_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/

    bbc_text = Readability.readable_text(bbc)
    # TODO: Remove image captions when extracting text only.
    # NOTE(review): the leading-text assertion below is disabled, presumably
    # because the text output currently begins with the figure caption rather
    # than the article body -- confirm before re-enabling.
    # assert bbc_text =~ ~r/^Microsoft\'s quarterly profit has missed analysts/
    assert bbc_text =~ ~r/connected computing devices\".$/
  end

  test "readability for medium" do
    html = TestHelper.read_fixture("medium.html")
    medium = Readability.article(html)

    medium_html = Readability.readable_html(medium)

    assert medium_html =~ ~r/^<div><div><p id=\"3476\"><strong><em>Background:/
    assert medium_html =~ ~r/recommend button!<\/em><\/h3><\/div><\/div>$/

    medium_text = Readability.readable_text(medium)

    assert medium_text =~ ~r/^Background: I’ve spent the past 6/
    assert medium_text =~ ~r/a lot to me if you hit the recommend button!$/
  end

  test "readability for buzzfeed" do
    html = TestHelper.read_fixture("buzzfeed.html")
    buzzfeed = Readability.article(html)

    buzzfeed_html = Readability.readable_html(buzzfeed)

    assert buzzfeed_html =~ ~r/^<div><div><p>The FBI no longer needs Apple’s help/
    assert buzzfeed_html =~ ~r/encrypted devices.<\/p><hr\/><hr\/><hr\/><hr\/><\/div><\/div>$/

    buzzfeed_text = Readability.readable_text(buzzfeed)

    assert buzzfeed_text =~ ~r/^The FBI no longer needs Apple’s help/
    assert buzzfeed_text =~ ~r/issue of court orders and encrypted devices.$/
  end

  test "readability for pubmed" do
    html = TestHelper.read_fixture("pubmed.html")
    pubmed = Readability.article(html)

    pubmed_html = Readability.readable_html(pubmed)

    assert pubmed_html =~
             ~r/^<div><div><h4>BACKGROUND AND OBJECTIVES: <\/h4><p><abstracttext>Although strict blood pressure/

    assert pubmed_html =~
             ~r/different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.<\/abstracttext><\/p><\/div><\/div>$/

    pubmed_text = Readability.readable_text(pubmed)

    assert pubmed_text =~ ~r/^BACKGROUND AND OBJECTIVES: \nAlthough strict blood pressure/

    assert pubmed_text =~
             ~r/with different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.$/
  end
end