defmodule Frenzy.Pipeline.ScrapeStage do
  @moduledoc """
  Pipeline stage that fetches the full article content for an item's URL and
  replaces the item's content with the extracted HTML. Extraction uses either the
  built-in Readability-based extractor or a custom extractor module named in the
  "extractor" option.
  """

  require Logger
  alias Frenzy.Pipeline.Stage
  @behaviour Stage

  @impl Stage
  def apply(opts, %{url: url} = item_params) do
    case get_article_content(url, opts["extractor"]) do
      {:ok, content} ->
        {:ok, %{item_params | content: content}}

      {:error, reason} ->
        # Extraction failures are non-fatal: log and pass the item through unchanged.
        Logger.warn("Unable to get article content for #{url}: #{reason}")
        {:ok, item_params}
    end
  end

  @impl Stage
  def validate_opts(opts) when is_map(opts) do
    case opts["extractor"] do
      nil ->
        # Default to the built-in extractor. Map.put/3 is used here because the
        # %{map | ...} update syntax raises KeyError when the key is absent
        # (e.g. when an empty map is provided).
        {:ok, Map.put(opts, "extractor", "builtin")}

      extractor when not is_binary(extractor) ->
        {:error, "extractor must be a string"}

      "builtin" ->
        {:ok, opts}

      extractor ->
        # String.to_existing_atom/1 is used only to verify that the named module
        # has already been loaded; the resulting atom is discarded.
        try do
          String.to_existing_atom("Elixir." <> extractor)
          {:ok, opts}
        rescue
          ArgumentError ->
            {:error, "extractor must be \"builtin\" or a module that exists"}
        end
    end
  end

  @impl Stage
  def validate_opts(_), do: {:error, "options must be a map"}

  @spec get_article_content(String.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()}
  defp get_article_content(url, extractor) when is_binary(url) and url != "" do
    Logger.debug("Getting article from #{url}")

    url
    |> HTTPoison.get()
    |> case do
      {:ok, response} ->
        handle_response(url, response, extractor)

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:error, "HTTPoison error: #{inspect(reason)}"}
    end
  end

  defp get_article_content(_url, _extractor), do: {:error, "URL must be a non-empty string"}

  @spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
    case extractor do
      "builtin" ->
        article = Readability.article(body)
        {:ok, Readability.readable_html(article)}

      module_name ->
        # Custom extractor modules receive the parsed Floki tree and return
        # {:ok, html_tree} or {:error, reason}.
        html_tree = Floki.parse(body)

        case apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree]) do
          {:error, _} = err ->
            err

          {:ok, html_tree} ->
            {:ok, Floki.raw_html(html_tree)}
        end
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor) do
    {:error, "404 Not Found"}
  end

  defp handle_response(
         url,
         %HTTPoison.Response{status_code: status_code, headers: headers},
         extractor
       )
       when status_code in [301, 302] do
    # Follow the redirect by fetching the URL given in the Location header.
    headers
    |> Enum.find(fn {name, _value} -> name == "Location" end)
    |> case do
      {"Location", new_url} ->
        Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
        get_article_content(new_url, extractor)

      _ ->
        {:error, "Missing Location header for redirect"}
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor) do
    {:error, "403 Forbidden"}
  end

  defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
    {:error, "No handler for response #{inspect(response)}"}
  end
end
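
# A minimal sketch of a custom extractor that this stage can dispatch to through the
# "extractor" option. The module name (Frenzy.Pipeline.Extractor.Example) and the
# "article" selector are assumptions for illustration only; the contract implied by
# handle_response/3 above is just an extract/1 function that takes a Floki HTML tree
# and returns {:ok, html_tree} or {:error, reason}.
defmodule Frenzy.Pipeline.Extractor.Example do
  @doc """
  Extracts the article body from a parsed Floki HTML tree.
  """
  def extract(html_tree) do
    case Floki.find(html_tree, "article") do
      [] -> {:error, "no <article> element found"}
      [article | _rest] -> {:ok, article}
    end
  end
end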