defmodule Frenzy.Pipeline.ScrapeStage do
  @moduledoc """
  Pipeline stage that replaces an item's `content` with the readable article
  HTML extracted from the page at the item's `url`.

  The page is fetched with `HTTPoison` and run through `Readability`. If the
  page cannot be fetched or handled, a warning is logged and the item params
  are returned unchanged, so a scrape failure never fails the pipeline.
  """

  require Logger
  alias Frenzy.Pipeline.Stage
  @behaviour Stage

  # Upper bound on consecutive redirects followed for a single item, so that a
  # redirect loop cannot make the stage issue requests forever.
  @max_redirects 5

  @impl Stage
  def apply(_opts, %{url: url} = item_params) do
    case get_article_content(url, @max_redirects) do
      {:ok, content} ->
        {:ok, %{item_params | content: content}}

      {:error, reason} ->
        # Best effort: keep the original item when scraping fails.
        Logger.warn("Unable to get article content for #{url}: #{reason}")
        {:ok, item_params}
    end
  end

  @impl Stage
  def validate_opts(opts), do: {:ok, opts}

  # Fetches `url` and returns the readable article HTML, or `{:error, message}`
  # with a human-readable message. `redirects_left` is the number of redirects
  # that may still be followed.
  @spec get_article_content(String.t(), non_neg_integer()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp get_article_content(url, redirects_left) when is_binary(url) and url != "" do
    Logger.debug("Getting article from #{url}")

    case HTTPoison.get(url) do
      {:ok, response} ->
        handle_response(url, response, redirects_left)

      {:error, %HTTPoison.Error{reason: reason}} ->
        # `reason` can be any term (e.g. a tuple), which cannot be interpolated
        # directly; `inspect/1` renders every term safely.
        {:error, "HTTPoison error: #{inspect(reason)}"}
    end
  end

  defp get_article_content(_url, _redirects_left) do
    {:error, "URL must be a non-empty string"}
  end

  # Turns an HTTP response into article content or an error message.
  # Redirects (301/302) are followed while `redirects_left` is positive.
  @spec handle_response(String.t(), HTTPoison.Response.t(), non_neg_integer()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, _redirects_left) do
    article = Readability.article(body)
    {:ok, Readability.readable_html(article)}
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _redirects_left) do
    {:error, "404 not found"}
  end

  defp handle_response(
         url,
         %HTTPoison.Response{status_code: status_code, headers: headers},
         redirects_left
       )
       when status_code in [301, 302] do
    case find_location(headers) do
      nil ->
        {:error, "Missing Location header for redirect"}

      _new_url when redirects_left <= 0 ->
        {:error, "Too many redirects"}

      new_url ->
        Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
        get_article_content(new_url, redirects_left - 1)
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _redirects_left) do
    {:error, "403 Forbidden"}
  end

  defp handle_response(_url, %HTTPoison.Response{} = response, _redirects_left) do
    {:error, "No handler for response #{inspect(response)}"}
  end

  # Returns the value of the Location header, or nil when it is absent.
  # HTTP header names are case-insensitive, so the name is compared lowercased.
  defp find_location(headers) do
    Enum.find_value(headers, fn {name, value} ->
      if String.downcase(to_string(name)) == "location", do: value
    end)
  end
end