defmodule Frenzy.Pipeline.ScrapeStage do
  @moduledoc """
  Pipeline stage that replaces an item's `:content` with the readable article
  found at the item's `:url`.

  The page is fetched with `HTTPoison.get/1` and the article markup is produced
  by `Readability.article/1` followed by `Readability.readable_html/1`.

  Fetching is best-effort: when the article cannot be retrieved, a warning is
  logged and the item is passed through unchanged.
  """

  require Logger
  alias Frenzy.Pipeline.Stage
  @behaviour Stage

  # Upper bound on the redirects followed for a single item, so that a redirect
  # loop (A -> B -> A ...) cannot recurse forever.
  @max_redirects 5

  # Every request made here is a GET, so all of these are followed with a GET.
  @redirect_statuses [301, 302, 303, 307, 308]

  @doc """
  Fetches the page at `item_params.url` and stores its readable HTML in
  `item_params.content`.

  The first argument (stage options) is ignored.

  Returns `{:ok, item_params}` in every case: with updated `:content` on
  success, or unchanged (after logging a warning) when the article could not be
  fetched. `item_params` must contain a `:url` key, and must already contain a
  `:content` key for the success path, because the map update syntax raises
  `KeyError` on a missing key.
  """
  @impl Stage
  def apply(_opts, %{url: url} = item_params) do
    case get_article_content(url) do
      {:ok, content} ->
        {:ok, %{item_params | content: content}}

      {:error, reason} ->
        Logger.warn("Unable to get article content: #{reason}")
        # Keep the same tagged shape as the success branch so callers can match
        # on `{:ok, _}` regardless of whether scraping worked.
        {:ok, item_params}
    end
  end

  # Entry point: fetch `url`, following at most @max_redirects redirects.
  defp get_article_content(url), do: get_article_content(url, @max_redirects)

  # Fetches `url` and hands the response to handle_response/3.
  # Returns `{:ok, html}` or `{:error, message}` where `message` is a string.
  defp get_article_content(url, redirects_left) when is_binary(url) and url != "" do
    Logger.debug("Getting article from #{url}")

    case HTTPoison.get(url) do
      {:ok, response} ->
        handle_response(url, response, redirects_left)

      {:error, %HTTPoison.Error{reason: reason}} ->
        # `reason` is not guaranteed to be a string or atom; `inspect/1` avoids a
        # Protocol.UndefinedError when it is a tuple or another term.
        {:error, "HTTPoison error: #{inspect(reason)}"}
    end
  end

  defp get_article_content(_url, _redirects_left) do
    {:error, "URL must be a non-empty string"}
  end

  # Maps an HTTP response to `{:ok, html}` or `{:error, message}`.
  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, _redirects_left) do
    article = Readability.article(body)
    {:ok, Readability.readable_html(article)}
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _redirects_left) do
    {:error, "404 not found"}
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: status_code}, redirects_left)
       when status_code in @redirect_statuses and redirects_left <= 0 do
    {:error, "Too many redirects (limit is #{@max_redirects})"}
  end

  defp handle_response(
         url,
         %HTTPoison.Response{status_code: status_code, headers: headers},
         redirects_left
       )
       when status_code in @redirect_statuses do
    case find_location(headers) do
      nil ->
        {:error, "#{status_code} redirect without a Location header"}

      location ->
        new_url = resolve_location(url, location)
        Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
        get_article_content(new_url, redirects_left - 1)
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _redirects_left) do
    {:error, "403 Forbidden"}
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: status_code}, _redirects_left) do
    # Only the status code is reported: the full response (body and headers)
    # would otherwise end up in the warning logged by apply/2.
    {:error, "No handler for response with status code #{status_code}"}
  end

  # Returns the value of the Location header, or nil when it is absent or empty.
  # Header names are compared case-insensitively.
  defp find_location(headers) do
    Enum.find_value(headers, fn {name, value} ->
      if String.downcase(name) == "location" and value != "", do: value
    end)
  end

  # Resolves a possibly relative Location value against the URL that produced
  # the redirect. When the base URL has no scheme/host to resolve against
  # (URI.merge/2 would raise), the Location value is used as-is.
  defp resolve_location(base_url, location) do
    case URI.parse(base_url) do
      %URI{scheme: scheme, host: host} = base when is_binary(scheme) and is_binary(host) ->
        base
        |> URI.merge(location)
        |> URI.to_string()

      _ ->
        location
    end
  end
end