This commit is contained in:
keepcosmos 2017-08-23 14:10:11 +09:00
commit aca14e3aef
6 changed files with 9994 additions and 17 deletions

View File

@ -61,10 +61,13 @@ defmodule Readability do
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
] ]
# Content-Type values treated as markup: any `application/*ml` or `text/*ml`
# media type, optionally followed by parameters. Whitespace around the `;`
# separator is optional, so both `text/html; charset=UTF-8` and
# `text/html;charset=utf-8` are recognised.
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(\s*;.*)?$/i
@type html_tree :: tuple | list @type html_tree :: tuple | list
@type raw_html :: binary @type raw_html :: binary
@type url :: binary @type url :: binary
@type options :: list @type options :: list
# Response headers as a list of tuples. Written as `[tuple]` because
# `list[tuple]` is bracket-access syntax rather than a list type.
@type headers :: [tuple]
@doc """ @doc """
summarize the primary readable content of a webpage. summarize the primary readable content of a webpage.
@ -73,16 +76,59 @@ defmodule Readability do
def summarize(url, opts \\ []) do def summarize(url, opts \\ []) do
opts = Keyword.merge(opts, [page_url: url]) opts = Keyword.merge(opts, [page_url: url])
httpoison_options = Application.get_env :readability, :httpoison_options, [] httpoison_options = Application.get_env :readability, :httpoison_options, []
%{status_code: _, body: raw_html} = HTTPoison.get!(url, [], httpoison_options) %{status_code: _, body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options)
html_tree = Helper.normalize(raw_html)
article_tree = html_tree
|> ArticleBuilder.build(opts)
%Summary{title: title(html_tree), case is_response_markup(headers) do
authors: authors(html_tree), true ->
article_html: readable_html(article_tree), html_tree = Helper.normalize(raw)
article_text: readable_text(article_tree) article_tree = html_tree
} |> ArticleBuilder.build(opts)
%Summary{title: title(html_tree),
authors: authors(html_tree),
article_html: readable_html(article_tree),
article_text: readable_text(article_tree)
}
_ ->
%Summary{title: nil,
authors: nil,
article_html: nil,
article_text: raw
}
end
end
@doc """
Extracts the `Content-Type` value from a list of `{name, value}` header tuples.

The header name is compared case-insensitively, because HTTP field names are
case-insensitive and servers may send `content-type` in any casing. The value
is returned as-is, including any parameters such as `; charset=UTF-8`.
When no `Content-Type` header is present, `"text/plain"` is returned.

## Example

    iex> Readability.mime([{"Content-Type", "text/html"}])
    "text/html"

    iex> Readability.mime([{"content-type", "text/html; charset=UTF-8"}])
    "text/html; charset=UTF-8"

    iex> Readability.mime([])
    "text/plain"
"""
@spec mime(headers) :: String.t()
def mime(headers \\ []) do
  headers
  |> Enum.find(
    # Default used when the response carries no Content-Type header.
    {"Content-Type", "text/plain"},
    # Non-binary keys simply do not match, as with the original equality test.
    fn {key, _} -> is_binary(key) and String.downcase(key) == "content-type" end
  )
  |> elem(1)
end
@doc """
Tells whether the `Content-Type` found in the given headers list is a markup
type, as decided by matching the value returned by `mime/1` against
`@markup_mimes`.

Returns `true` on a match and `false` otherwise.

## Example

    iex> Readability.is_response_markup([{"Content-Type", "text/html"}])
    true
"""
@spec is_response_markup(headers) :: boolean
def is_response_markup(headers) do
  content_type = mime(headers)
  Regex.match?(@markup_mimes, content_type)
end
@doc """ @doc """

View File

@ -16,11 +16,12 @@ defmodule Readability.TitleFinder do
case og_title(html_tree) do case og_title(html_tree) do
"" -> "" ->
title = tag_title(html_tree) title = tag_title(html_tree)
h_title = h_tag_title(html_tree)
if good_title?(title) do if good_title?(title) || h_title == "" do
title title
else else
h_tag_title(html_tree) h_title
end end
title when is_binary(title) -> title when is_binary(title) ->
title title

View File

@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
@moduledoc """ @moduledoc """
""" """
@version "0.8.0" @version "0.9.0"
@description """ @description """
Readability library for extracting and curating articles. Readability library for extracting and curating articles.
""" """
@ -44,7 +44,8 @@ defmodule Readability.Mixfile do
{:httpoison, "~> 0.13.0"}, {:httpoison, "~> 0.13.0"},
{:ex_doc, "~> 0.14", only: :dev}, {:ex_doc, "~> 0.14", only: :dev},
{:credo, "~> 0.6.1", only: [:dev, :test]}, {:credo, "~> 0.6.1", only: [:dev, :test]},
{:dialyxir, "~> 0.3", only: [:dev]} {:dialyxir, "~> 0.3", only: [:dev]},
{:mock, "~> 0.2.0", only: :test},
] ]
end end

View File

@ -1,16 +1,17 @@
%{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []}, %{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []},
"certifi": {:hex, :certifi, "2.0.0", "a0c0e475107135f76b8c1d5bc7efb33cd3815cb3cf3dea7aefdd174dabead064", [:rebar3], []}, "certifi": {:hex, :certifi, "2.0.0", "a0c0e475107135f76b8c1d5bc7efb33cd3815cb3cf3dea7aefdd174dabead064", [:rebar3], []},
"credo": {:hex, :credo, "0.6.1", "a941e2591bd2bd2055dc92b810c174650b40b8290459c89a835af9d59ac4a5f8", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]}, "credo": {:hex, :credo, "0.6.1", "a941e2591bd2bd2055dc92b810c174650b40b8290459c89a835af9d59ac4a5f8", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]},
"dialyxir": {:hex, :dialyxir, "0.3.5", "eaba092549e044c76f83165978979f60110dc58dd5b92fd952bf2312f64e9b14", [:mix], []}, "dialyxir": {:hex, :dialyxir, "0.5.1", "b331b091720fd93e878137add264bac4f644e1ddae07a70bf7062c7862c4b952", [:mix], []},
"earmark": {:hex, :earmark, "1.1.1", "433136b7f2e99cde88b745b3a0cfc3fbc81fe58b918a09b40fce7f00db4d8187", [:mix], []}, "earmark": {:hex, :earmark, "1.2.3", "206eb2e2ac1a794aa5256f3982de7a76bf4579ff91cb28d0e17ea2c9491e46a4", [:mix], []},
"ex_doc": {:hex, :ex_doc, "0.14.5", "c0433c8117e948404d93ca69411dd575ec6be39b47802e81ca8d91017a0cf83c", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]}, "ex_doc": {:hex, :ex_doc, "0.16.3", "cd2a4cfe5d26e37502d3ec776702c72efa1adfa24ed9ce723bb565f4c30bd31a", [:mix], [{:earmark, "~> 1.1", [hex: :earmark, optional: false]}]},
"floki": {:hex, :floki, "0.18.0", "643d5e4bb325905328d250760ea622faebac4f7e1521f770d35fbb43d8dd4f5f", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, optional: false]}]}, "floki": {:hex, :floki, "0.18.0", "643d5e4bb325905328d250760ea622faebac4f7e1521f770d35fbb43d8dd4f5f", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, optional: false]}]},
"hackney": {:hex, :hackney, "1.9.0", "51c506afc0a365868469dcfc79a9d0b94d896ec741cfd5bd338f49a5ec515bfe", [:rebar3], [{:certifi, "2.0.0", [hex: :certifi, optional: false]}, {:idna, "5.1.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]}, "hackney": {:hex, :hackney, "1.9.0", "51c506afc0a365868469dcfc79a9d0b94d896ec741cfd5bd338f49a5ec515bfe", [:rebar3], [{:certifi, "2.0.0", [hex: :certifi, optional: false]}, {:idna, "5.1.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]},
"httpoison": {:hex, :httpoison, "0.13.0", "bfaf44d9f133a6599886720f3937a7699466d23bb0cd7a88b6ba011f53c6f562", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, optional: false]}]}, "httpoison": {:hex, :httpoison, "0.13.0", "bfaf44d9f133a6599886720f3937a7699466d23bb0cd7a88b6ba011f53c6f562", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, optional: false]}]},
"idna": {:hex, :idna, "5.1.0", "d72b4effeb324ad5da3cab1767cb16b17939004e789d8c0ad5b70f3cea20c89a", [:rebar3], [{:unicode_util_compat, "0.3.1", [hex: :unicode_util_compat, optional: false]}]}, "idna": {:hex, :idna, "5.1.0", "d72b4effeb324ad5da3cab1767cb16b17939004e789d8c0ad5b70f3cea20c89a", [:rebar3], [{:unicode_util_compat, "0.3.1", [hex: :unicode_util_compat, optional: false]}]},
"meck": {:hex, :meck, "0.8.7", "ebad16ca23f685b07aed3bc011efff65fbaf28881a8adf925428ef5472d390ee", [:rebar3], []},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []},
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []}, "mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []},
"mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []}, "mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []},
"mochiweb_html": {:hex, :mochiweb_html, "2.15.0", "d7402e967d7f9f2912f8befa813c37be62d5eeeddbbcb6fe986c44e01460d497", [:rebar3], []}, "mock": {:hex, :mock, "0.2.1", "bfdba786903e77f9c18772dee472d020ceb8ef000783e737725a4c8f54ad28ec", [:mix], [{:meck, "~> 0.8.2", [hex: :meck, optional: false]}]},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []}, "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.3.1", "a1f612a7b512638634a603c8f401892afbf99b8ce93a45041f8aaca99cadb85e", [:rebar3], []}} "unicode_util_compat": {:hex, :unicode_util_compat, "0.3.1", "a1f612a7b512638634a603c8f401892afbf99b8ce93a45041f8aaca99cadb85e", [:rebar3], []}}

9859
test/fixtures/rfc2616.txt vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,69 @@
defmodule ReadabilityHttpTest do
  # Exercises `Readability.summarize/2` against mocked HTTP responses to check
  # that the response Content-Type decides whether the body is parsed as markup
  # or passed through as plain text.
  #
  # `HTTPoison.get!/3` is replaced with `Mock`, so no network access happens.
  use ExUnit.Case
  import Mock

  test "blank response is parsed as plain text" do
    # No headers at all: no Content-Type is available to summarize/2.
    %Readability.Summary{article_text: result_text} =
      summarize_fixture("https://tools.ietf.org/rfc/rfc2616.txt", "rfc2616.txt", [])

    assert result_text =~ ~r/3 Protocol Parameters/
  end

  test "text/plain response is parsed as plain text" do
    %Readability.Summary{article_text: result_text} =
      summarize_fixture(
        "https://tools.ietf.org/rfc/rfc2616.txt",
        "rfc2616.txt",
        [{"Content-Type", "text/plain"}]
      )

    assert result_text =~ ~r/3 Protocol Parameters/
  end

  test "*ml responses are parsed as markup" do
    for mime <- ["text/html", "application/xml", "application/xhtml+xml"] do
      %Readability.Summary{article_html: result_html} =
        summarize_fixture(
          "https://news.bbc.co.uk/test.html",
          "bbc.html",
          [{"Content-Type", mime}]
        )

      assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
    end
  end

  test "response with charset is parsed correctly" do
    %Readability.Summary{article_html: result_html} =
      summarize_fixture(
        "https://news.bbc.co.uk/test.html",
        "bbc.html",
        [{"Content-Type", "text/html; charset=UTF-8"}]
      )

    assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
  end

  # Runs Readability.summarize/1 for `url` while HTTPoison.get!/3 is mocked to
  # return a 200 response whose body is the named fixture and whose headers
  # are `headers`. Returns whatever summarize/1 returns.
  defp summarize_fixture(url, fixture, headers) do
    response = %HTTPoison.Response{
      status_code: 200,
      headers: headers,
      body: TestHelper.read_fixture(fixture)
    }

    with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
      Readability.summarize(url)
    end
  end
end