added ability to handle text-based responses

added fix for content-type with charset

updated function names to match elixir naming conventions (is_ vs ?)

minor version bump

added default content-type of text/plain when header is missing
This commit is contained in:
Phillip Oldham 2017-04-14 14:11:44 +01:00
parent 93955d36d2
commit 2b53a90f3d
5 changed files with 9988 additions and 11 deletions

View File

@ -61,10 +61,13 @@ defmodule Readability do
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
] ]
# Matches markup media types in the `application/*` and `text/*` trees whose
# subtype ends in "ml" (e.g. text/html, application/xml, application/xhtml+xml).
# Any media-type parameters after the subtype are accepted, with or without
# whitespace around the ";" (e.g. "text/html;charset=utf-8").
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(\s*;.*)?$/i
@type html_tree :: tuple | list @type html_tree :: tuple | list
@type raw_html :: binary @type raw_html :: binary
@type url :: binary @type url :: binary
@type options :: list @type options :: list
# Response headers as a list of `{name, value}` pairs.
@type headers :: [{binary, binary}]
@doc """ @doc """
summarize the primary readable content of a webpage. summarize the primary readable content of a webpage.
@ -73,8 +76,11 @@ defmodule Readability do
def summarize(url, opts \\ []) do def summarize(url, opts \\ []) do
opts = Keyword.merge(opts, [page_url: url]) opts = Keyword.merge(opts, [page_url: url])
httpoison_options = Application.get_env :readability, :httpoison_options, [] httpoison_options = Application.get_env :readability, :httpoison_options, []
%{status_code: _, body: raw_html} = HTTPoison.get!(url, [], httpoison_options) %{status_code: _, body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options)
html_tree = Helper.normalize(raw_html)
case is_response_markup(headers) do
true ->
html_tree = Helper.normalize(raw)
article_tree = html_tree article_tree = html_tree
|> ArticleBuilder.build(opts) |> ArticleBuilder.build(opts)
@ -83,6 +89,46 @@ defmodule Readability do
article_html: readable_html(article_tree), article_html: readable_html(article_tree),
article_text: readable_text(article_tree) article_text: readable_text(article_tree)
} }
_ ->
%Summary{title: nil,
authors: nil,
article_html: nil,
article_text: raw
}
end
end
@doc """
Extract the Content-Type value from a list of response headers.

The header name is compared case-insensitively, because HTTP field names are
case-insensitive (servers may send `content-type`). The value is returned
unchanged, including any parameters such as `; charset=UTF-8`.

Returns `"text/plain"` when no Content-Type header is present.

## Example

    iex> Readability.mime([{"Content-Type", "text/html"}])
    "text/html"

"""
@spec mime(headers) :: String.t()
def mime(headers \\ []) do
  headers
  |> Enum.find(
    # default used when the response carries no Content-Type header
    {"Content-Type", "text/plain"},
    fn {key, _} -> is_binary(key) and String.downcase(key) == "content-type" end
  )
  |> elem(1)
end
@doc """
Return true if the Content-Type in the provided headers list is a markup type,
else false.

The Content-Type value is obtained with `mime/1` and matched against the
module's markup MIME pattern.

## Example

    iex> Readability.is_response_markup([{"Content-Type", "text/html"}])
    true

"""
@spec is_response_markup(headers) :: boolean
def is_response_markup(headers) do
  mime(headers) =~ @markup_mimes
end
@doc """ @doc """

View File

@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
@moduledoc """ @moduledoc """
""" """
@version "0.8.0" @version "0.9.0"
@description """ @description """
Readability library for extracting and curating articles. Readability library for extracting and curating articles.
""" """
@ -44,7 +44,8 @@ defmodule Readability.Mixfile do
{:httpoison, "~> 0.11.0"}, {:httpoison, "~> 0.11.0"},
{:ex_doc, "~> 0.14", only: :dev}, {:ex_doc, "~> 0.14", only: :dev},
{:credo, "~> 0.6.1", only: [:dev, :test]}, {:credo, "~> 0.6.1", only: [:dev, :test]},
{:dialyxir, "~> 0.3", only: [:dev]} {:dialyxir, "~> 0.3", only: [:dev]},
{:mock, "~> 0.2.0", only: :test},
] ]
end end

View File

@ -8,8 +8,10 @@
"hackney": {:hex, :hackney, "1.6.5", "8c025ee397ac94a184b0743c73b33b96465e85f90a02e210e86df6cbafaa5065", [:rebar3], [{:certifi, "0.7.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]}, "hackney": {:hex, :hackney, "1.6.5", "8c025ee397ac94a184b0743c73b33b96465e85f90a02e210e86df6cbafaa5065", [:rebar3], [{:certifi, "0.7.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]},
"httpoison": {:hex, :httpoison, "0.11.0", "b9240a9c44fc46fcd8618d17898859ba09a3c1b47210b74316c0ffef10735e76", [:mix], [{:hackney, "~> 1.6.3", [hex: :hackney, optional: false]}]}, "httpoison": {:hex, :httpoison, "0.11.0", "b9240a9c44fc46fcd8618d17898859ba09a3c1b47210b74316c0ffef10735e76", [:mix], [{:hackney, "~> 1.6.3", [hex: :hackney, optional: false]}]},
"idna": {:hex, :idna, "1.2.0", "ac62ee99da068f43c50dc69acf700e03a62a348360126260e87f2b54eced86b2", [:rebar3], []}, "idna": {:hex, :idna, "1.2.0", "ac62ee99da068f43c50dc69acf700e03a62a348360126260e87f2b54eced86b2", [:rebar3], []},
"meck": {:hex, :meck, "0.8.4", "59ca1cd971372aa223138efcf9b29475bde299e1953046a0c727184790ab1520", [:make, :rebar], []},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []},
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []}, "mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []},
"mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []}, "mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []},
"mochiweb_html": {:hex, :mochiweb_html, "2.15.0", "d7402e967d7f9f2912f8befa813c37be62d5eeeddbbcb6fe986c44e01460d497", [:rebar3], []}, "mochiweb_html": {:hex, :mochiweb_html, "2.15.0", "d7402e967d7f9f2912f8befa813c37be62d5eeeddbbcb6fe986c44e01460d497", [:rebar3], []},
"mock": {:hex, :mock, "0.2.1", "bfdba786903e77f9c18772dee472d020ceb8ef000783e737725a4c8f54ad28ec", [:mix], [{:meck, "~> 0.8.2", [hex: :meck, optional: false]}]},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []}} "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []}}

9859
test/fixtures/rfc2616.txt vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,69 @@
defmodule ReadabilityHttpTest do
  @moduledoc """
  Checks how `Readability.summarize/1` treats responses depending on their
  Content-Type header. `HTTPoison.get!/3` is mocked in every test, so fixture
  files stand in for the response bodies.
  """

  use ExUnit.Case
  import Mock

  @rfc_url "https://tools.ietf.org/rfc/rfc2616.txt"
  @bbc_url "https://news.bbc.co.uk/test.html"

  test "blank response is parsed as plain text" do
    # "blank" here means the response has no headers at all
    response = fixture_response("rfc2616.txt", [])

    with_mock HTTPoison, [get!: fn _url, _headers, _opts -> response end] do
      %Readability.Summary{article_text: result_text} = Readability.summarize(@rfc_url)

      assert result_text =~ ~r/3 Protocol Parameters/
    end
  end

  test "text/plain response is parsed as plain text" do
    response = fixture_response("rfc2616.txt", [{"Content-Type", "text/plain"}])

    with_mock HTTPoison, [get!: fn _url, _headers, _opts -> response end] do
      %Readability.Summary{article_text: result_text} = Readability.summarize(@rfc_url)

      assert result_text =~ ~r/3 Protocol Parameters/
    end
  end

  test "*ml responses are parsed as markup" do
    for mime <- ["text/html", "application/xml", "application/xhtml+xml"] do
      response = fixture_response("bbc.html", [{"Content-Type", mime}])

      with_mock HTTPoison, [get!: fn _url, _headers, _opts -> response end] do
        %Readability.Summary{article_html: result_html} = Readability.summarize(@bbc_url)

        assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
      end
    end
  end

  test "response with charset is parsed correctly" do
    response = fixture_response("bbc.html", [{"Content-Type", "text/html; charset=UTF-8"}])

    with_mock HTTPoison, [get!: fn _url, _headers, _opts -> response end] do
      %Readability.Summary{article_html: result_html} = Readability.summarize(@bbc_url)

      assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
    end
  end

  # Builds a 200 response whose body is the named fixture file and whose
  # headers are the given list.
  defp fixture_response(fixture, headers) do
    %HTTPoison.Response{
      status_code: 200,
      headers: headers,
      body: TestHelper.read_fixture(fixture)
    }
  end
end