defmodule Frenzy.Pipeline.Site.DaringFireballScrapeStage do
  @moduledoc """
  Pipeline stage that replaces an item's content with the article body scraped
  from the item's URL.

  The page is fetched with `HTTPoison`, parsed with `Floki`, and the first
  `div.article` element (or, failing that, the first `dl.linkedlist dd`
  element) is run through `Readability.readable_html/1`.
  """

  require Logger
  alias Frenzy.Pipeline.Stage
  @behaviour Stage

  @doc """
  Fetches the article at `item_params.url` and stores the readable HTML in
  `item_params.content`.

  Scraping is best-effort: when the article cannot be fetched or parsed, a
  warning is logged and the item is passed through unchanged. Both outcomes
  return `{:ok, item_params}`.
  """
  @impl Stage
  def apply(_opts, %{url: url} = item_params) do
    case get_article_content(url) do
      {:ok, content} ->
        {:ok, %{item_params | content: content}}

      {:error, reason} ->
        Logger.warn("Unable to get Daring Fireball article content for #{url}: #{reason}")
        # Keep the same tagged-tuple shape as the success branch so callers
        # can match on `{:ok, _}` regardless of whether scraping worked.
        {:ok, item_params}
    end
  end

  @doc """
  Accepts any options unchanged; `apply/2` does not read them.
  """
  @impl Stage
  def validate_opts(opts), do: {:ok, opts}

  # Fetches `url` and returns `{:ok, readable_html}` or `{:error, message}`,
  # where `message` is always a string.
  defp get_article_content(url) when is_binary(url) and url != "" do
    Logger.debug("Get Daring Fireball article from #{url}")

    case HTTPoison.get(url) do
      {:ok, response} ->
        handle_response(url, response)

      {:error, %HTTPoison.Error{reason: reason}} ->
        # `reason` can be any term (e.g. a tuple), so it must be inspected
        # rather than interpolated directly.
        {:error, "HTTPoison error: #{inspect(reason)}"}
    end
  end

  defp get_article_content(_url), do: {:error, "URL must be a non-empty string"}

  # Turns an HTTP response into `{:ok, readable_html}` or `{:error, message}`.
  # `url` is the address that produced the response; it is only used for
  # logging when following a redirect.
  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}) do
    html_tree = Floki.parse(body)

    case get_article_element(html_tree) || get_link_element(html_tree) do
      nil ->
        {:error, "no matching element"}

      elem ->
        readable_html =
          elem
          |> Floki.filter_out(:comment)
          |> Readability.readable_html()

        {:ok, readable_html}
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 404}) do
    {:error, "404 not found"}
  end

  defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers})
       when status_code in [301, 302] do
    case find_location(headers) do
      nil ->
        {:error, "Missing Location header for redirect"}

      new_url ->
        Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
        get_article_content(new_url)
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
    {:error, "403 Forbidden"}
  end

  defp handle_response(_url, %HTTPoison.Response{} = response) do
    {:error, "No handler for response #{inspect(response)}"}
  end

  # Returns the value of the first Location header, or nil when there is none.
  # Header names are compared case-insensitively because HTTP does not
  # guarantee a particular capitalisation.
  defp find_location(headers) do
    Enum.find_value(headers, fn {name, value} ->
      if String.downcase(name) == "location", do: value
    end)
  end

  # Returns the first `div.article` element with its non-content children
  # stripped, or nil when the page has no such element.
  defp get_article_element(html_tree) do
    case Floki.find(html_tree, "div.article") do
      [article_elem | _] ->
        # articles include extra information in the div.article element
        Floki.filter_out(article_elem, "h1, .dateline, #PreviousNext")

      _ ->
        nil
    end
  end

  # Returns the first `dl.linkedlist dd` element, or nil when the page has no
  # such element.
  defp get_link_element(html_tree) do
    case Floki.find(html_tree, "dl.linkedlist dd") do
      [dd_elem | _] -> dd_elem
      _ -> nil
    end
  end
end