Merge pull request #25 from OldhamMade/master
Handle text-based responses
This commit is contained in:
commit
89d3958fd7
|
@ -61,10 +61,13 @@ defmodule Readability do
|
|||
protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i
|
||||
]
|
||||
|
||||
# Matches markup media types such as text/html, application/xml or
# application/xhtml+xml, optionally followed by media-type parameters.
# Whitespace around ";" is optional in a Content-Type value, so both
# "text/html; charset=UTF-8" and "text/html;charset=UTF-8" must match.
@markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(\s*;.*)?$/i
|
||||
|
||||
@type html_tree :: tuple | list
|
||||
@type raw_html :: binary
|
||||
@type url :: binary
|
||||
@type options :: list
|
||||
@type headers :: list[tuple]
|
||||
|
||||
@doc """
|
||||
Summarize the primary readable content of a webpage.
|
||||
|
@ -73,8 +76,11 @@ defmodule Readability do
|
|||
def summarize(url, opts \\ []) do
|
||||
opts = Keyword.merge(opts, [page_url: url])
|
||||
httpoison_options = Application.get_env :readability, :httpoison_options, []
|
||||
%{status_code: _, body: raw_html} = HTTPoison.get!(url, [], httpoison_options)
|
||||
html_tree = Helper.normalize(raw_html)
|
||||
%{status_code: _, body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options)
|
||||
|
||||
case is_response_markup(headers) do
|
||||
true ->
|
||||
html_tree = Helper.normalize(raw)
|
||||
article_tree = html_tree
|
||||
|> ArticleBuilder.build(opts)
|
||||
|
||||
|
@ -83,6 +89,46 @@ defmodule Readability do
|
|||
article_html: readable_html(article_tree),
|
||||
article_text: readable_text(article_tree)
|
||||
}
|
||||
|
||||
_ ->
|
||||
%Summary{title: nil,
|
||||
authors: nil,
|
||||
article_html: nil,
|
||||
article_text: raw
|
||||
}
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
@doc """
|
||||
Extract MIME Type from headers
|
||||
|
||||
## Example
|
||||
|
||||
iex> mime = Readability.mime(headers_list)
|
||||
"text/html"
|
||||
"""
|
||||
@spec mime(headers) :: String.t()
|
||||
# Returns the value of the Content-Type header found in `headers` (a list of
# `{name, value}` tuples), or "text/plain" when no such header is present.
# HTTP header names are case-insensitive, so the name is compared in lower
# case; entries that are not `{binary, value}` pairs are ignored.
def mime(headers \\ []) do
  Enum.find_value(headers, "text/plain", fn
    {name, value} when is_binary(name) ->
      if String.downcase(name) == "content-type", do: value

    _other ->
      nil
  end)
end
|
||||
|
||||
@doc """
|
||||
Return true if Content-Type in provided headers list is a markup type,
|
||||
else false
|
||||
|
||||
## Example
|
||||
|
||||
iex> Readability.is_response_markup([{"Content-Type", "text/html"}])
|
||||
true
|
||||
"""
|
||||
@spec is_response_markup(headers) :: boolean
|
||||
def is_response_markup(headers) do
  # Resolve the Content-Type via mime/1, then test it against the
  # module's markup media-type pattern.
  content_type = mime(headers)
  Regex.match?(@markup_mimes, content_type)
end
|
||||
|
||||
@doc """
|
||||
|
|
5
mix.exs
5
mix.exs
|
@ -2,7 +2,7 @@ defmodule Readability.Mixfile do
|
|||
@moduledoc """
|
||||
"""
|
||||
|
||||
@version "0.8.0"
|
||||
@version "0.9.0"
|
||||
@description """
|
||||
Readability library for extracting and curating articles.
|
||||
"""
|
||||
|
@ -44,7 +44,8 @@ defmodule Readability.Mixfile do
|
|||
{:httpoison, "~> 0.11.0"},
|
||||
{:ex_doc, "~> 0.14", only: :dev},
|
||||
{:credo, "~> 0.6.1", only: [:dev, :test]},
|
||||
{:dialyxir, "~> 0.3", only: [:dev]}
|
||||
{:dialyxir, "~> 0.3", only: [:dev]},
|
||||
{:mock, "~> 0.2.0", only: :test},
|
||||
]
|
||||
end
|
||||
|
||||
|
|
2
mix.lock
2
mix.lock
|
@ -8,8 +8,10 @@
|
|||
"hackney": {:hex, :hackney, "1.6.5", "8c025ee397ac94a184b0743c73b33b96465e85f90a02e210e86df6cbafaa5065", [:rebar3], [{:certifi, "0.7.0", [hex: :certifi, optional: false]}, {:idna, "1.2.0", [hex: :idna, optional: false]}, {:metrics, "1.0.1", [hex: :metrics, optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, optional: false]}]},
|
||||
"httpoison": {:hex, :httpoison, "0.11.0", "b9240a9c44fc46fcd8618d17898859ba09a3c1b47210b74316c0ffef10735e76", [:mix], [{:hackney, "~> 1.6.3", [hex: :hackney, optional: false]}]},
|
||||
"idna": {:hex, :idna, "1.2.0", "ac62ee99da068f43c50dc69acf700e03a62a348360126260e87f2b54eced86b2", [:rebar3], []},
|
||||
"meck": {:hex, :meck, "0.8.4", "59ca1cd971372aa223138efcf9b29475bde299e1953046a0c727184790ab1520", [:make, :rebar], []},
|
||||
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], []},
|
||||
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], []},
|
||||
"mochiweb": {:hex, :mochiweb, "2.15.0", "e1daac474df07651e5d17cc1e642c4069c7850dc4508d3db7263a0651330aacc", [:rebar3], []},
|
||||
"mochiweb_html": {:hex, :mochiweb_html, "2.15.0", "d7402e967d7f9f2912f8befa813c37be62d5eeeddbbcb6fe986c44e01460d497", [:rebar3], []},
|
||||
"mock": {:hex, :mock, "0.2.1", "bfdba786903e77f9c18772dee472d020ceb8ef000783e737725a4c8f54ad28ec", [:mix], [{:meck, "~> 0.8.2", [hex: :meck, optional: false]}]},
|
||||
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], []}}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,69 @@
|
|||
defmodule ReadabilityHttpTest do
  # Exercises Readability.summarize/1 against stubbed HTTPoison responses so
  # that no network access is needed. Each test replaces HTTPoison.get!/3 with
  # a function returning a canned response and checks how the body is handled
  # for a given Content-Type header.
  use ExUnit.Case
  import Mock

  # Builds a 200 response carrying the given headers and body.
  defp ok_response(headers, body) do
    %HTTPoison.Response{status_code: 200, headers: headers, body: body}
  end

  test "blank response is parsed as plain text" do
    url = "https://tools.ietf.org/rfc/rfc2616.txt"
    # No headers at all: the body is expected to come back as article_text.
    response = ok_response([], TestHelper.read_fixture("rfc2616.txt"))

    with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
      %Readability.Summary{article_text: result_text} = Readability.summarize(url)

      assert result_text =~ ~r/3 Protocol Parameters/
    end
  end

  test "text/plain response is parsed as plain text" do
    url = "https://tools.ietf.org/rfc/rfc2616.txt"

    response =
      ok_response(
        [{"Content-Type", "text/plain"}],
        TestHelper.read_fixture("rfc2616.txt")
      )

    with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
      %Readability.Summary{article_text: result_text} = Readability.summarize(url)

      assert result_text =~ ~r/3 Protocol Parameters/
    end
  end

  test "*ml responses are parsed as markup" do
    url = "https://news.bbc.co.uk/test.html"
    content = TestHelper.read_fixture("bbc.html")
    mimes = ["text/html", "application/xml", "application/xhtml+xml"]

    # Every markup media type must take the article-extraction path.
    Enum.each(mimes, fn(mime) ->
      response = ok_response([{"Content-Type", mime}], content)

      with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
        %Readability.Summary{article_html: result_html} = Readability.summarize(url)

        assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
      end
    end)
  end

  test "response with charset is parsed correctly" do
    url = "https://news.bbc.co.uk/test.html"

    response =
      ok_response(
        [{"Content-Type", "text/html; charset=UTF-8"}],
        TestHelper.read_fixture("bbc.html")
      )

    with_mock HTTPoison, [get!: fn(_url, _headers, _opts) -> response end] do
      %Readability.Summary{article_html: result_html} = Readability.summarize(url)

      assert result_html =~ ~r/connected computing devices\".<\/p><\/div><\/div>$/
    end
  end
end
|
Loading…
Reference in New Issue