Compare commits

..

No commits in common. "75404b197d67e118a6575ee9b39a9ae2ac3c2dcc" and "1538ca2a8c3c3b6b041c9e86b560f368c4547896" have entirely different histories.

4 changed files with 92 additions and 7 deletions

View File

@ -29,7 +29,6 @@ defmodule Readability.Helper do
""" """
@spec remove_attrs(html_tree, String.t() | [String.t()] | Regex.t()) :: html_tree @spec remove_attrs(html_tree, String.t() | [String.t()] | Regex.t()) :: html_tree
def remove_attrs(content, _) when is_binary(content), do: content def remove_attrs(content, _) when is_binary(content), do: content
def remove_attrs({:comment, _} = comment, _), do: comment
def remove_attrs([], _), do: [] def remove_attrs([], _), do: []
def remove_attrs([h | t], t_attrs) do def remove_attrs([h | t], t_attrs) do

View File

@ -8,6 +8,7 @@
"floki": {:hex, :floki, "0.20.3", "dfb3a71eb99938e330b4156433d55c6d0b188d936c9683d115a8540bac56e019", [:mix], [{:html_entities, "~> 0.4.0", [hex: :html_entities, repo: "hexpm", optional: false]}, {:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm", "77032ea4d961b7e0895e6b84ca4dae45671ae3aaec706db8614077a19bb62d6e"}, "floki": {:hex, :floki, "0.20.3", "dfb3a71eb99938e330b4156433d55c6d0b188d936c9683d115a8540bac56e019", [:mix], [{:html_entities, "~> 0.4.0", [hex: :html_entities, repo: "hexpm", optional: false]}, {:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm", "77032ea4d961b7e0895e6b84ca4dae45671ae3aaec706db8614077a19bb62d6e"},
"hackney": {:hex, :hackney, "1.9.0", "51c506afc0a365868469dcfc79a9d0b94d896ec741cfd5bd338f49a5ec515bfe", [:rebar3], [{:certifi, "2.0.0", [hex: :certifi, optional: false]}, {:idna, "5.1.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]}, "hackney": {:hex, :hackney, "1.9.0", "51c506afc0a365868469dcfc79a9d0b94d896ec741cfd5bd338f49a5ec515bfe", [:rebar3], [{:certifi, "2.0.0", [hex: :certifi, optional: false]}, {:idna, "5.1.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]},
"html_entities": {:hex, :html_entities, "0.4.0", "f2fee876858cf6aaa9db608820a3209e45a087c5177332799592142b50e89a6b", [:mix], [], "hexpm", "3e3d7156a272950373ce5a4018b1490bea26676f8d6a7d409f6fac8568b8cb9a"}, "html_entities": {:hex, :html_entities, "0.4.0", "f2fee876858cf6aaa9db608820a3209e45a087c5177332799592142b50e89a6b", [:mix], [], "hexpm", "3e3d7156a272950373ce5a4018b1490bea26676f8d6a7d409f6fac8568b8cb9a"},
"httpoison": {:hex, :httpoison, "0.13.0", "bfaf44d9f133a6599886720f3937a7699466d23bb0cd7a88b6ba011f53c6f562", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, optional: false]}]},
"idna": {:hex, :idna, "5.1.0", "d72b4effeb324ad5da3cab1767cb16b17939004e789d8c0ad5b70f3cea20c89a", [:rebar3], [{:unicode_util_compat, "0.3.1", [hex: :unicode_util_compat, optional: false]}]}, "idna": {:hex, :idna, "5.1.0", "d72b4effeb324ad5da3cab1767cb16b17939004e789d8c0ad5b70f3cea20c89a", [:rebar3], [{:unicode_util_compat, "0.3.1", [hex: :unicode_util_compat, optional: false]}]},
"meck": {:hex, :meck, "0.8.7", "ebad16ca23f685b07aed3bc011efff65fbaf28881a8adf925428ef5472d390ee", [:rebar3], [], "hexpm", "51274d4b536dc7958eb4df3aefa5245f4a6df1d6198cb8f8b97d6747033597ca"}, "meck": {:hex, :meck, "0.8.7", "ebad16ca23f685b07aed3bc011efff65fbaf28881a8adf925428ef5472d390ee", [:rebar3], [], "hexpm", "51274d4b536dc7958eb4df3aefa5245f4a6df1d6198cb8f8b97d6747033597ca"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []},

View File

@ -57,6 +57,7 @@ defmodule Readability.HelperTest do
assert result == expected assert result == expected
end end
test "inner text length", %{html_tree: html_tree} do test "inner text length", %{html_tree: html_tree} do
result = html_tree |> Helper.text_length() result = html_tree |> Helper.text_length()
assert result == 5 assert result == 5
@ -92,10 +93,4 @@ defmodule Readability.HelperTest do
assert result_with_scheme =~ foo_url assert result_with_scheme =~ foo_url
assert result_with_scheme =~ bar_url_https assert result_with_scheme =~ bar_url_https
end end
# An HTML comment nested inside an element must come through
# Helper.remove_attrs/2 untouched while the `class` attribute on the
# surrounding element is stripped.
test "remove attrs with comments" do
  stripped =
    "<div class=\"foo\">hello <span><!-- world --></span></div>"
    |> Floki.parse()
    |> Helper.remove_attrs(~w[class])

  assert stripped == Floki.parse("<div>hello <span><!-- world --></span></div>")
end
end end

View File

@ -0,0 +1,90 @@
defmodule ReadabilityHttpTest do
  # Checks how Readability.summarize/1 treats HTTP responses depending on
  # their Content-Type header. HTTPoison.get!/3 is stubbed with Mock so each
  # test feeds a canned %HTTPoison.Response{} built from a local fixture.
  #
  # The case is deliberately left non-async (the ExUnit default), as in the
  # original: Mock swaps out the mocked module for the duration of the block
  # (see the Mock documentation), which is not safe to run concurrently.
  use ExUnit.Case
  import Mock

  @rfc_url "https://tools.ietf.org/rfc/rfc2616.txt"
  @bbc_url "https://news.bbc.co.uk/test.html"

  test "blank response is parsed as plain text" do
    summary = summarize_stubbed(@rfc_url, [], TestHelper.read_fixture("rfc2616.txt"))

    assert summary.article_text =~ ~r/3 Protocol Parameters/
  end

  test "text/plain response is parsed as plain text" do
    headers = [{"Content-Type", "text/plain"}]
    summary = summarize_stubbed(@rfc_url, headers, TestHelper.read_fixture("rfc2616.txt"))

    assert summary.article_text =~ ~r/3 Protocol Parameters/
  end

  test "*ml responses are parsed as markup" do
    content = TestHelper.read_fixture("bbc.html")

    # Every markup-like MIME type must yield the same extracted article HTML.
    for mime <- ["text/html", "application/xml", "application/xhtml+xml"] do
      summary = summarize_stubbed(@bbc_url, [{"Content-Type", mime}], content)

      assert summary.article_html =~ bbc_article_tail()
    end
  end

  test "response with charset is parsed correctly" do
    headers = [{"Content-Type", "text/html; charset=UTF-8"}]
    summary = summarize_stubbed(@bbc_url, headers, TestHelper.read_fixture("bbc.html"))

    assert summary.article_html =~ bbc_article_tail()
  end

  test "response with content-type in different case is parsed correctly" do
    # HTTP header keys are case insensitive (RFC2616 - Section 4.2)
    headers = [{"content-Type", "text/html; charset=UTF-8"}]
    summary = summarize_stubbed(@bbc_url, headers, TestHelper.read_fixture("bbc.html"))

    assert summary.article_html =~ bbc_article_tail()
  end

  # Runs Readability.summarize/1 for `url` while HTTPoison.get!/3 is stubbed
  # to return a 200 response carrying `headers` and `body`. Returns the
  # resulting summary; the match inside the block fails the test if the
  # result is not a %Readability.Summary{}.
  defp summarize_stubbed(url, headers, body) do
    response = %HTTPoison.Response{status_code: 200, headers: headers, body: body}

    with_mock HTTPoison, get!: fn _url, _headers, _opts -> response end do
      %Readability.Summary{} = summary = Readability.summarize(url)
      summary
    end
  end

  # Pattern the extracted HTML of the bbc.html fixture is expected to end with.
  # Built in a function rather than a module attribute so the compiled regex
  # is never stored at compile time.
  defp bbc_article_tail, do: ~r/connected computing devices\".<\/p><\/div><\/div>$/
end