defmodule Frenzy.Pipeline.ScrapeStage do
  @moduledoc """
  Pipeline stage that fetches the full article content for an item's URL and
  stores it on the item. If scraping fails, the item passes through unchanged.
  """

  require Logger
  alias Frenzy.Pipeline.Stage
  @behaviour Stage

  @impl Stage
  def apply(opts, %{url: url} = item_params) do
    case get_article_content(url, opts["extractor"]) do
      {:ok, content} ->
        {:ok, %{item_params | content: content}}

      {:error, reason} ->
        Logger.warn("Unable to get article content for #{url}: #{reason}")
        {:ok, item_params}
    end
  end

  @impl Stage
  def validate_opts(opts) when is_map(opts) do
    case opts["extractor"] do
      nil ->
        # Map.put/3 rather than %{opts | ...}: the update syntax raises a
        # KeyError when the key is absent, which is exactly the case here.
        {:ok, Map.put(opts, "extractor", "builtin")}

      extractor when not is_binary(extractor) ->
        {:error, "extractor must be a string"}

      "builtin" ->
        {:ok, opts}

      extractor ->
        try do
          String.to_existing_atom("Elixir." <> extractor)
          {:ok, opts}
        rescue
          ArgumentError ->
            {:error, "extractor must be \"builtin\" or a module that exists"}
        end
    end
  end

  def validate_opts(_), do: {:error, "options must be a map"}

  @spec get_article_content(String.t(), String.t()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp get_article_content(url, extractor) when is_binary(url) and url != "" do
    Logger.debug("Getting article from #{url}")

    url
    |> HTTPoison.get()
    |> case do
      {:ok, response} ->
        handle_response(url, response, extractor)

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:error, "HTTPoison error: #{reason}"}
    end
  end

  defp get_article_content(_url, _extractor), do: {:error, "URL must be a non-empty string"}

  @spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
    case extractor do
      "builtin" ->
        {:ok, Readability.article(body)}

      module_name ->
        html_tree = Floki.parse(body)
        apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
    end
    |> case do
      {:ok, html} ->
        html = Floki.map(html, rewrite_image_urls(URI.parse(url)))

        case extractor do
          "builtin" -> {:ok, Readability.readable_html(html)}
          _ -> {:ok, Floki.raw_html(html)}
        end

      res ->
        res
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor) do
    {:error, "404 Not Found"}
  end

  defp handle_response(
         url,
         %HTTPoison.Response{status_code: status_code, headers: headers},
         extractor
       )
       when status_code in [301, 302] do
    headers
    |> Enum.find(fn {name, _value} -> name == "Location" end)
    |> case do
      {"Location", new_url} ->
        Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
        get_article_content(new_url, extractor)

      _ ->
        {:error, "Missing Location header for redirect"}
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor) do
    {:error, "403 Forbidden"}
  end

  defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
    {:error, "No handler for response #{inspect(response)}"}
  end

  # Generates a helper function for the article at the given URI that takes an HTML
  # element (a {tag, attributes} tuple, as passed by Floki.map/2) and, if it's an
  # <img> element whose src attribute has no hostname, adds the article's hostname
  # and scheme to the src. Note: the clause only matches when src is the element's
  # first attribute.
  defp rewrite_image_urls(%URI{host: host, scheme: scheme}) do
    fn
      {"img", [{"src", src} | attrs]} = elem ->
        case URI.parse(src) do
          %URI{host: nil, path: path} ->
            new_src = URI.to_string(%URI{path: path, host: host, scheme: scheme})
            {"img", [{"src", new_src} | attrs]}

          _ ->
            elem
        end

      elem ->
        elem
    end
  end
end
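
# A minimal usage sketch, assuming a caller that validates options before running
# the stage (the pipeline runner itself is not shown in this module). The item map
# must already contain a :content key, since apply/2 uses the %{map | key: value}
# update syntax, which raises for absent keys.
#
#     opts = %{"extractor" => "builtin"}
#     {:ok, opts} = Frenzy.Pipeline.ScrapeStage.validate_opts(opts)
#
#     {:ok, item} =
#       Frenzy.Pipeline.ScrapeStage.apply(opts, %{
#         url: "https://example.com/post",
#         content: nil
#       })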
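
# A sketch of a custom extractor, following the convention handle_response/3
# relies on: a module exporting extract/1 that receives a Floki HTML tree and
# returns {:ok, tree} or {:error, reason}. MyApp.ArticleExtractor is a
# hypothetical name used for illustration; it would be selected with
# %{"extractor" => "MyApp.ArticleExtractor"}.
#
#     defmodule MyApp.ArticleExtractor do
#       def extract(html_tree) do
#         case Floki.find(html_tree, "article") do
#           [article | _] -> {:ok, article}
#           [] -> {:error, "no <article> element found"}
#         end
#       end
#     end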