frenzy/lib/frenzy/pipeline/site/daring_fireball_scrape_stag...

106 lines
2.9 KiB
Elixir

defmodule Frenzy.Pipeline.Site.DaringFireballScrapeStage do
  @moduledoc """
  Pipeline stage that fetches an item's URL and replaces the item's `:content`
  with readable HTML extracted from the fetched page.

  The page is searched for a `div.article` element first and, failing that, for
  the first `dl.linkedlist dd` element. When the page cannot be fetched or
  neither element is present, a warning is logged and the item is passed
  through unchanged.
  """

  require Logger
  alias Frenzy.Pipeline.Stage

  @behaviour Stage

  # Upper bound on consecutive redirects followed for a single item, so that a
  # redirect cycle cannot make the stage recurse forever.
  @max_redirects 5

  # Replaces `item_params.content` with the scraped article content. Always
  # returns `{:ok, item_params}`: scraping is best-effort, so on failure the
  # item is returned untouched after logging a warning.
  @impl Stage
  def apply(_opts, %{url: url} = item_params) do
    case get_article_content(url, @max_redirects) do
      {:ok, content} ->
        {:ok, %{item_params | content: content}}

      {:error, reason} ->
        Logger.warn("Unable to get Daring Fireball article content for #{url}: #{reason}")
        {:ok, item_params}
    end
  end

  # This stage takes no options, so anything is accepted as-is.
  @impl Stage
  def validate_opts(opts), do: {:ok, opts}

  # Fetches `url` and extracts the readable article HTML from the response.
  # `redirects_left` is the number of further redirects that may be followed.
  # Returns `{:ok, html}` or `{:error, message}` with a human-readable message.
  @spec get_article_content(term(), integer()) :: {:ok, String.t()} | {:error, String.t()}
  defp get_article_content(url, redirects_left) when is_binary(url) and url != "" do
    Logger.debug("Get Daring Fireball article from #{url}")

    case HTTPoison.get(url) do
      {:ok, response} ->
        handle_response(url, response, redirects_left)

      {:error, %HTTPoison.Error{reason: reason}} ->
        # `reason` can be any term (e.g. a tuple for TLS failures), which string
        # interpolation cannot render; `inspect/1` handles every term.
        {:error, "HTTPoison error: #{inspect(reason)}"}
    end
  end

  defp get_article_content(_url, _redirects_left) do
    {:error, "URL must be a non-empty string"}
  end

  # Turns an HTTP response into article content or an error message, following
  # redirects while `redirects_left` allows it.
  @spec handle_response(String.t(), HTTPoison.Response.t(), integer()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, _redirects_left) do
    html_tree = Floki.parse(body)

    # Prefer the full article element; fall back to the linked-list entry.
    case get_article_element(html_tree) || get_link_element(html_tree) do
      nil ->
        {:error, "no matching element"}

      elem ->
        readable_html =
          elem
          |> Floki.filter_out(:comment)
          |> Readability.readable_html()

        {:ok, readable_html}
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _redirects_left) do
    {:error, "404 not found"}
  end

  defp handle_response(
         url,
         %HTTPoison.Response{status_code: status_code, headers: headers},
         redirects_left
       )
       when status_code in [301, 302, 303, 307, 308] do
    case location_header(headers) do
      nil ->
        {:error, "Missing Location header for redirect"}

      _new_url when redirects_left <= 0 ->
        {:error, "Too many redirects"}

      new_url ->
        Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
        get_article_content(new_url, redirects_left - 1)
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _redirects_left) do
    {:error, "403 Forbidden"}
  end

  defp handle_response(_url, %HTTPoison.Response{} = response, _redirects_left) do
    {:error, "No handler for response #{inspect(response)}"}
  end

  # Returns the value of the first `Location` header, or `nil` when there is
  # none. HTTP header names are case-insensitive, so the name is compared in
  # lower case.
  defp location_header(headers) do
    Enum.find_value(headers, fn {name, value} ->
      if is_binary(name) and String.downcase(name) == "location", do: value
    end)
  end

  # Returns the first `div.article` element with its heading, dateline and
  # previous/next navigation removed, or `nil` when the page has none.
  defp get_article_element(html_tree) do
    case Floki.find(html_tree, "div.article") do
      [article_elem | _] ->
        # articles include extra information in the div.article element
        Floki.filter_out(article_elem, "h1, .dateline, #PreviousNext")

      _ ->
        nil
    end
  end

  # Returns the first `dd` element inside `dl.linkedlist`, or `nil` when the
  # page has none.
  defp get_link_element(html_tree) do
    case Floki.find(html_tree, "dl.linkedlist dd") do
      [dd_elem | _] ->
        dd_elem

      _ ->
        nil
    end
  end
end