Add Daring Fireball scraper
This commit is contained in:
parent
17310911ce
commit
e55a694194
|
@ -10,7 +10,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
{:ok, %{item_params | content: content}}
|
{:ok, %{item_params | content: content}}
|
||||||
|
|
||||||
{:error, reason} ->
|
{:error, reason} ->
|
||||||
Logger.warn("Unable to get article content: #{reason}")
|
Logger.warn("Unable to get article content for #{url}: #{reason}")
|
||||||
item_params
|
item_params
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -47,8 +47,16 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
when status_code in [301, 302] do
|
when status_code in [301, 302] do
|
||||||
{"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end)
|
{"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end)
|
||||||
|
|
||||||
|
headers
|
||||||
|
|> Enum.find(fn {name, _value} -> name == "Location" end)
|
||||||
|
|> case do
|
||||||
|
{"Location", new_url} ->
|
||||||
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
|
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
|
||||||
get_article_content(new_url)
|
get_article_content(new_url)
|
||||||
|
|
||||||
|
_ ->
|
||||||
|
{:error, "Missing Location header for redirect"}
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
|
defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
|
||||||
|
|
|
@ -0,0 +1,102 @@
|
||||||
|
defmodule Frenzy.Pipeline.Site.DaringFireballScrapeStage do
|
||||||
|
require Logger
|
||||||
|
alias Frenzy.Pipeline.Stage
|
||||||
|
@behaviour Stage
|
||||||
|
|
||||||
|
@impl Stage
|
||||||
|
@doc """
Fetches the page at `item_params.url` and stores the extracted article HTML
in the `:content` key of `item_params`.

Returns `{:ok, updated_item_params}` when content could be extracted. When
extraction fails, a warning is logged and the unmodified `item_params` map
is returned as-is.
"""
def apply(_opts, %{url: url} = item_params) do
  with {:ok, content} <- get_article_content(url) do
    {:ok, %{item_params | content: content}}
  else
    {:error, reason} ->
      Logger.warn("Unable to get Daring Fireball article content for #{url}: #{reason}")
      # NOTE(review): this branch returns the bare map while the success branch
      # returns `{:ok, map}` — confirm against the Stage behaviour's contract
      # whether this should be `{:ok, item_params}`.
      item_params
  end
end
|
||||||
|
|
||||||
|
@impl Stage
|
||||||
|
@doc """
Accepts any options unchanged; this stage takes no configuration.

Always returns `{:ok, opts}`.
"""
def validate_opts(opts) do
  {:ok, opts}
end
|
||||||
|
|
||||||
|
# Requests `url` with HTTPoison and hands the response to `handle_response/2`.
#
# Returns `{:ok, html}` or `{:error, message}` where `message` is a string.
# Anything that is not a non-empty binary is rejected by the fallback clause
# without making a request.
defp get_article_content(url) when is_binary(url) and url != "" do
  Logger.debug("Get Daring Fireball article from #{url}")

  case HTTPoison.get(url) do
    {:ok, response} ->
      handle_response(url, response)

    {:error, %HTTPoison.Error{reason: reason}} ->
      # `reason` may be any term (e.g. a tuple for TLS/connection failures);
      # plain interpolation would raise Protocol.UndefinedError for terms
      # without a String.Chars implementation, so render it with inspect/1.
      {:error, "HTTPoison error: #{inspect(reason)}"}
  end
end

defp get_article_content(_url), do: {:error, "URL must be a non-empty string"}
|
||||||
|
|
||||||
|
# Handles a successful (200) response: parses the body and extracts the
# readable HTML of the first element found by `get_article_element/1`, or,
# failing that, by `get_link_element/1`.
#
# Returns `{:ok, readable_html}` or `{:error, "no matching element"}` when
# neither lookup finds anything. The URL is not needed here, hence `_url`.
defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}) do
  html_tree = Floki.parse(body)

  case get_article_element(html_tree) || get_link_element(html_tree) do
    nil ->
      {:error, "no matching element"}

    elem ->
      # Strip HTML comments before handing the element to Readability.
      readable_html =
        elem
        |> Floki.filter_out(:comment)
        |> Readability.readable_html()

      {:ok, readable_html}
  end
end
|
||||||
|
|
||||||
|
# A 404 response is reported as an error tuple; the URL is not used.
defp handle_response(_url, %HTTPoison.Response{status_code: 404}),
  do: {:error, "404 not found"}
|
||||||
|
|
||||||
|
# Handles 301/302 redirects by re-requesting the URL given in the response's
# Location header.
#
# Returns whatever `get_article_content/1` returns for the redirect target, or
# `{:error, "Missing Location header for redirect"}` when the response carries
# no Location header.
defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers})
     when status_code in [301, 302] do
  headers
  # HTTP header field names are case-insensitive, so compare in lower case.
  |> Enum.find(fn {name, _value} -> String.downcase(name) == "location" end)
  |> case do
    {_name, new_url} ->
      Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
      get_article_content(new_url)

    nil ->
      {:error, "Missing Location header for redirect"}
  end
end
|
||||||
|
|
||||||
|
# A 403 response is reported as an error tuple; the URL is not used.
defp handle_response(_url, %HTTPoison.Response{status_code: 403}),
  do: {:error, "403 Forbidden"}
|
||||||
|
|
||||||
|
# Fallback for any response not matched by an earlier `handle_response/2`
# clause: returns an error tuple whose message embeds the inspected response.
defp handle_response(_url, %HTTPoison.Response{} = response) do
  {:error, "No handler for response #{inspect(response)}"}
end
|
||||||
|
|
||||||
|
# Returns the first `div.article` element of `html_tree` with its `h1`,
# `.dateline` and `#PreviousNext` descendants filtered out, or `nil` when no
# such element is found.
defp get_article_element(html_tree) do
  with [article | _rest] <- Floki.find(html_tree, "div.article") do
    # The article div carries extra, non-content information alongside the
    # text, so strip those parts before returning it.
    Floki.filter_out(article, "h1, .dateline, #PreviousNext")
  else
    _ -> nil
  end
end
|
||||||
|
|
||||||
|
# Returns the first element matching `dl.linkedlist dd` in `html_tree`, or
# `nil` when nothing matches.
defp get_link_element(html_tree) do
  html_tree
  |> Floki.find("dl.linkedlist dd")
  |> case do
    [first_dd | _rest] -> first_dd
    _ -> nil
  end
end
|
||||||
|
end
|
Loading…
Reference in New Issue