Merge branch 'master' of https://github.com/keepcosmos/readability
This commit is contained in:
commit
aca14e3aef
|
@ -61,10 +61,13 @@ defmodule Readability do
|
|||
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
||||
]
|
||||
|
||||
# Content-Type values treated as markup: any `application/*ml` or `text/*ml`
# type (text/html, application/xml, application/xhtml+xml, ...), optionally
# followed by a charset parameter. Whitespace after the `;` is optional, so
# both "text/html; charset=UTF-8" and "text/html;charset=UTF-8" match.
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i

@type html_tree :: tuple | list
@type raw_html :: binary
@type url :: binary
@type options :: list
# Response headers as a list of tuples, e.g. [{"Content-Type", "text/html"}].
@type headers :: [tuple]
|
||||
|
||||
@doc """
|
||||
Summarize the primary readable content of a webpage.
|
||||
|
@ -73,8 +76,11 @@ defmodule Readability do
|
|||
def summarize(url, opts \\ []) do
|
||||
opts = Keyword.merge(opts, [page_url: url])
|
||||
httpoison_options = Application.get_env :readability, :httpoison_options, []
|
||||
%{status_code: _, body: raw_html} = HTTPoison.get!(url, [], httpoison_options)
|
||||
html_tree = Helper.normalize(raw_html)
|
||||
%{status_code: _, body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options)
|
||||
|
||||
case is_response_markup(headers) do
|
||||
true ->
|
||||
html_tree = Helper.normalize(raw)
|
||||
article_tree = html_tree
|
||||
|> ArticleBuilder.build(opts)
|
||||
|
||||
|
@ -83,6 +89,46 @@ defmodule Readability do
|
|||
article_html: readable_html(article_tree),
|
||||
article_text: readable_text(article_tree)
|
||||
}
|
||||
|
||||
_ ->
|
||||
%Summary{title: nil,
|
||||
authors: nil,
|
||||
article_html: nil,
|
||||
article_text: raw
|
||||
}
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
@doc """
Extract the MIME type (the raw `Content-Type` header value) from a list of
response headers.

The header name is matched case-insensitively, so `"Content-Type"` and
`"content-type"` are both recognised. When no such header is present the
default `"text/plain"` is returned.

## Example

    iex> Readability.mime([{"Content-Type", "text/html"}])
    "text/html"

    iex> Readability.mime([{"content-type", "application/xml"}])
    "application/xml"

    iex> Readability.mime([])
    "text/plain"
"""
@spec mime([tuple]) :: String.t()
def mime(headers \\ []) do
  Enum.find_value(headers, "text/plain", fn
    # HTTP field names are case-insensitive, so normalise before comparing.
    {name, value} when is_binary(name) ->
      if String.downcase(name) == "content-type", do: value

    # Non-binary keys can never be the Content-Type header; keep scanning.
    {_name, _value} ->
      nil
  end)
end
|
||||
|
||||
@doc """
Return true if the Content-Type in the provided headers list is a markup
type (matches `@markup_mimes`), else false.

The Content-Type value is obtained with `mime/1`.

## Example

    iex> Readability.is_response_markup([{"Content-Type", "text/html"}])
    true

    iex> Readability.is_response_markup([{"Content-Type", "text/plain"}])
    false
"""
@spec is_response_markup(headers) :: boolean
def is_response_markup(headers) do
  mime(headers) =~ @markup_mimes
end
|
||||
|
||||
@doc """
|
||||
|
|
|
@ -16,11 +16,12 @@ defmodule Readability.TitleFinder do
|
|||
case og_title(html_tree) do
|
||||
"" ->
|
||||
title = tag_title(html_tree)
|
||||
h_title = h_tag_title(html_tree)
|
||||
|
||||
if good_title?(title) do
|
||||
if good_title?(title) || h_title == "" do
|
||||
title
|
||||
else
|
||||
h_tag_title(html_tree)
|
||||
h_title
|
||||
end
|
||||
title when is_binary(title) ->
|
||||
title
|
||||
|
|
5
mix.exs
5
mix.exs
|
@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
|
|||
@moduledoc """
|
||||
"""
|
||||
|
||||
@version "0.8.0"
|
||||
@version "0.9.0"
|
||||
@description """
|
||||
Readability library for extracting and curating articles.
|
||||
"""
|
||||
|
@ -44,7 +44,8 @@ defmodule Readability.Mixfile do
|
|||
{:httpoison, "~> 0.13.0"},
|
||||
{:ex_doc, "~> 0.14", only: :dev},
|
||||
{:credo, "~> 0.6.1", only: [:dev, :test]},
|
||||
{:dialyxir, "~> 0.3", only: [:dev]}
|
||||
{:dialyxir, "~> 0.3", only: [:dev]},
|
||||
{:mock, "~> 0.2.0", only: :test},
|
||||
]
|
||||
end
|
||||
|
||||
|
|
9
mix.lock
9
mix.lock
|
@ -1,16 +1,17 @@
|
|||
%{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []},
|
||||
"certifi": {:hex, :certifi, "2.0.0", "a0c0e475107135f76b8c1d5bc7efb33cd3815cb3cf3dea7aefdd174dabead064", [:rebar3], []},
|
||||
"credo": {:hex, :credo, "0.6.1", "a941e2591bd2bd2055dc92b810c174650b40b8290459c89a835af9d59ac4a5f8", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]},
|
||||
"dialyxir": {:hex, :dialyxir, "0.3.5", "eaba092549e044c76f83165978979f60110dc58dd5b92fd952bf2312f64e9b14", [:mix], []},
|
||||
"earmark": {:hex, :earmark, "1.1.1", "433136b7f2e99cde88b745b3a0cfc3fbc81fe58b918a09b40fce7f00db4d8187", [:mix], []},
|
||||
"ex_doc": {:hex, :ex_doc, "0.14.5", "c0433c8117e948404d93ca69411dd575ec6be39b47802e81ca8d91017a0cf83c", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]},
|
||||
"dialyxir": {:hex, :dialyxir, "0.5.1", "b331b091720fd93e878137add264bac4f644e1ddae07a70bf7062c7862c4b952", [:mix], []},
|
||||
"earmark": {:hex, :earmark, "1.2.3", "206eb2e2ac1a794aa5256f3982de7a76bf4579ff91cb28d0e17ea2c9491e46a4", [:mix], []},
|
||||
"ex_doc": {:hex, :ex_doc, "0.16.3", "cd2a4cfe5d26e37502d3ec776702c72efa1adfa24ed9ce723bb565f4c30bd31a", [:mix], [{:earmark, "~> 1.1", [hex: :earmark, optional: false]}]},
|
||||
"floki": {:hex, :floki, "0.18.0", "643d5e4bb325905328d250760ea622faebac4f7e1521f770d35fbb43d8dd4f5f", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, optional: false]}]},
|
||||
"hackney": {:hex, :hackney, "1.9.0", "51c506afc0a365868469dcfc79a9d0b94d896ec741cfd5bd338f49a5ec515bfe", [:rebar3], [{:certifi, "2.0.0", [hex: :certifi, optional: false]}, {:idna, "5.1.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]},
|
||||
"httpoison": {:hex, :httpoison, "0.13.0", "bfaf44d9f133a6599886720f3937a7699466d23bb0cd7a88b6ba011f53c6f562", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, optional: false]}]},
|
||||
"idna": {:hex, :idna, "5.1.0", "d72b4effeb324ad5da3cab1767cb16b17939004e789d8c0ad5b70f3cea20c89a", [:rebar3], [{:unicode_util_compat, "0.3.1", [hex: :unicode_util_compat, optional: false]}]},
|
||||
"meck": {:hex, :meck, "0.8.7", "ebad16ca23f685b07aed3bc011efff65fbaf28881a8adf925428ef5472d390ee", [:rebar3], []},
|
||||
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []},
|
||||
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []},
|
||||
"mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []},
|
||||
"mochiweb_html": {:hex, :mochiweb_html, "2.15.0", "d7402e967d7f9f2912f8befa813c37be62d5eeeddbbcb6fe986c44e01460d497", [:rebar3], []},
|
||||
"mock": {:hex, :mock, "0.2.1", "bfdba786903e77f9c18772dee472d020ceb8ef000783e737725a4c8f54ad28ec", [:mix], [{:meck, "~> 0.8.2", [hex: :meck, optional: false]}]},
|
||||
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []},
|
||||
"unicode_util_compat": {:hex, :unicode_util_compat, "0.3.1", "a1f612a7b512638634a603c8f401892afbf99b8ce93a45041f8aaca99cadb85e", [:rebar3], []}}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,69 @@
|
|||
defmodule ReadabilityHttpTest do
  # Exercises Readability.summarize/2 against mocked HTTPoison responses to
  # check how the response Content-Type header selects between markup parsing
  # and plain-text passthrough.
  use ExUnit.Case
  import Mock

  test "blank response is parsed as plain text" do
    url = "https://tools.ietf.org/rfc/rfc2616.txt"
    # No headers at all: summarize/2 should still return the body as text.
    response = fixture_response("rfc2616.txt", [])

    with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
      %Readability.Summary{article_text: result_text} = Readability.summarize(url)

      assert result_text =~ ~r/3 Protocol Parameters/
    end
  end

  test "text/plain response is parsed as plain text" do
    url = "https://tools.ietf.org/rfc/rfc2616.txt"
    response = fixture_response("rfc2616.txt", [{"Content-Type", "text/plain"}])

    with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
      %Readability.Summary{article_text: result_text} = Readability.summarize(url)

      assert result_text =~ ~r/3 Protocol Parameters/
    end
  end

  test "*ml responses are parsed as markup" do
    url = "https://news.bbc.co.uk/test.html"

    # Each of these Content-Type values must be treated as markup.
    for mime <- ["text/html", "application/xml", "application/xhtml+xml"] do
      response = fixture_response("bbc.html", [{"Content-Type", mime}])

      with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
        %Readability.Summary{article_html: result_html} = Readability.summarize(url)

        assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
      end
    end
  end

  test "response with charset is parsed correctly" do
    url = "https://news.bbc.co.uk/test.html"
    response = fixture_response("bbc.html", [{"Content-Type", "text/html; charset=UTF-8"}])

    with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
      %Readability.Summary{article_html: result_html} = Readability.summarize(url)

      assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
    end
  end

  # Builds a 200 response whose body is the named fixture file and whose
  # headers are exactly the given list.
  defp fixture_response(fixture, headers) do
    %HTTPoison.Response{
      status_code: 200,
      headers: headers,
      body: TestHelper.read_fixture(fixture)
    }
  end
end
|
Loading…
Reference in New Issue