defmodule Frenzy.Pipeline.ScrapeStage do
  @moduledoc """
  Pipeline stage that fetches the page at an item's URL and replaces the item's content with
  the article HTML extracted from it.

  Options (a map with string keys):

    * `"extractor"` - `"builtin"` (the default) or the name of a module, without the `Elixir.`
      prefix, that is called as `extract(html_tree)` and returns `{:ok, html_tree}` or
      `{:error, reason}`.
    * `"convert_to_data_uris"` - boolean, defaults to `true`. When `true`, every `<img>` source
      is downloaded and inlined as a `data:` URI; when `false`, sources are only rewritten to
      absolute URLs.

  Scraping is best-effort: when the article cannot be retrieved, a warning is logged and the
  item is passed through unchanged.
  """

  require Logger
  alias Frenzy.Network
  alias Frenzy.BuiltinExtractor
  alias Frenzy.Pipeline.Stage
  @behaviour Stage

  # Image media types that may be inlined as data URIs.
  @content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"]

  # Replaces the item's content with the scraped article. Always returns `{:ok, item_params}`;
  # a scrape failure is logged and leaves the item untouched.
  @impl Stage
  def apply(opts, %{url: url} = item_params) do
    case get_article_content(url, opts) do
      {:ok, content} ->
        {:ok, %{item_params | content: content}}

      {:error, reason} ->
        Logger.warning("Unable to get article content for #{url}: #{format_reason(reason)}")
        {:ok, item_params}
    end
  end

  # Validates the options map and fills in defaults for missing keys.
  # Returns `{:ok, opts}` or `{:error, message}`.
  @impl Stage
  def validate_opts(opts) when is_map(opts) do
    with {:ok, opts} <- validate_extractor(opts) do
      validate_convert_to_data_uris(opts)
    end
  end

  def validate_opts(_), do: {:error, "options must be a map"}

  @impl Stage
  def default_opts(), do: %{}

  # Checks the "extractor" option, defaulting it to "builtin" when absent.
  defp validate_extractor(opts) do
    case opts["extractor"] do
      nil ->
        {:ok, Map.put(opts, "extractor", "builtin")}

      "builtin" ->
        {:ok, opts}

      extractor when is_binary(extractor) ->
        # to_existing_atom never creates atoms from user input; it raises when the atom
        # (and therefore the module name) has never been seen by the VM.
        try do
          String.to_existing_atom("Elixir." <> extractor)
          {:ok, opts}
        rescue
          ArgumentError -> {:error, "extractor must be \"builtin\" or a module that exists"}
        end

      _ ->
        {:error, "extractor must be a string"}
    end
  end

  # Checks the "convert_to_data_uris" option, defaulting it to true when absent.
  defp validate_convert_to_data_uris(opts) do
    case opts["convert_to_data_uris"] do
      nil -> {:ok, Map.put(opts, "convert_to_data_uris", true)}
      value when is_boolean(value) -> {:ok, opts}
      _ -> {:error, "convert_to_data_uris must be a boolean"}
    end
  end

  # Downloads `url` and, on a 2xx response, extracts the article HTML from the body.
  @spec get_article_content(String.t(), map()) :: {:ok, String.t()} | {:error, String.t()}
  defp get_article_content(url, opts) when is_binary(url) and url != "" do
    Logger.debug("Getting article from #{url}")

    case Network.http_get(url) do
      {:ok, %Tesla.Env{status: code} = response} when code in 200..299 ->
        handle_response(url, response, opts)

      {:ok, %Tesla.Env{status: code}} ->
        {:error, "Unexpected HTTP code #{code}"}

      {:error, reason} ->
        {:error, "Couldn't scrape article: #{format_reason(reason)}"}
    end
  end

  defp get_article_content(_url, _opts), do: {:error, "URL must be a non-empty string"}

  # Extracts the article from the response body with the configured extractor, rewrites the
  # image sources and serialises the resulting tree to an HTML string.
  @spec handle_response(String.t(), Tesla.Env.t(), map()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp handle_response(url, %Tesla.Env{body: body}, opts) do
    with {:ok, html} <- extract_article(opts["extractor"], url, body) do
      convert_to_data_uris =
        case opts["convert_to_data_uris"] do
          nil -> true
          value -> value
        end

      html =
        Floki.find_and_update(
          html,
          "img",
          rewrite_image_urls(convert_to_data_uris, URI.parse(url))
        )

      {:ok, Floki.raw_html(html)}
    end
  end

  # Runs the selected extractor over the page body and returns `{:ok, html_tree}` or
  # `{:error, reason}`. A missing extractor option is treated as "builtin", mirroring the
  # default applied by validate_opts/1.
  defp extract_article(extractor, url, body) when extractor in [nil, "builtin"] do
    {:ok, BuiltinExtractor.article(url, body)}
  end

  defp extract_article(module_name, url, body) do
    {:ok, html_tree} = Floki.parse_document(body)

    try do
      module = String.to_existing_atom("Elixir." <> module_name)

      case module.extract(html_tree) do
        {:ok, content} ->
          # Output of non-builtin extractors is untrusted, so strip attributes the same way
          # Readability's readable_html does, but keep the result as a tree instead of
          # turning it back into a string.
          content = Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs))
          {:ok, content}

        {:error, _reason} = err ->
          err

        other ->
          # Normalise so callers only ever see ok/error tuples.
          {:error, "Extractor #{module_name} returned unexpected result: #{inspect(other)}"}
      end
    rescue
      e ->
        Logger.error(
          "Encountered error extracting article content from '#{url}' with #{module_name}, falling back to default"
        )

        Logger.error(Exception.format(:error, e, __STACKTRACE__))

        if Frenzy.sentry_enabled?() do
          Sentry.capture_exception(e,
            stacktrace: __STACKTRACE__,
            extra: %{extractor: module_name, item_url: url}
          )
        end

        {:ok, BuiltinExtractor.article(url, body)}
    end
  end

  # Generates a helper function for the article with the given URI that takes an HTML element
  # and, if it is an <img>, rewrites its src attribute: relative sources are resolved against
  # the article URI, and when `convert_to_data_uris` is true the image is inlined as a data URI.
  defp rewrite_image_urls(convert_to_data_uris, site_uri) do
    fn
      {"img", attrs} ->
        new_attrs =
          Enum.map(attrs, fn
            {"src", src} -> {"src", image_to_data_uri(src, site_uri, convert_to_data_uris)}
            attr -> attr
          end)

        has_src = Enum.any?(new_attrs, fn {name, _} -> name == "src" end)

        # Remove srcsets because our transformation only applies to the src attribute, so
        # that should always be used.
        new_attrs =
          if has_src do
            Enum.reject(new_attrs, fn {name, _} -> name == "srcset" end)
          else
            new_attrs
          end

        {"img", new_attrs}

      elem ->
        elem
    end
  end

  # Convert images to data URIs so that they're stored by clients as part of the body.
  # Sources that already are data URIs are returned untouched.
  defp image_to_data_uri("data:" <> _ = src, _site_uri, _convert), do: src

  defp image_to_data_uri(src, site_uri, true) do
    absolute_url = site_uri |> URI.merge(src) |> to_string()

    with {:ok, %Tesla.Env{status: status, body: body, headers: headers}}
         when status in 200..299 <- Network.http_get(absolute_url),
         content_type when content_type in @content_type_allowlist <-
           image_content_type(headers) do
      "data:#{content_type};base64,#{Base.encode64(body)}"
    else
      # Could not inline the image (request failed, non-2xx status or disallowed type):
      # fall back to the absolute URL so the image still resolves outside the original page.
      _ -> absolute_url
    end
  end

  defp image_to_data_uri(src, site_uri, false), do: to_string(URI.merge(site_uri, src))

  # Returns the bare, lower-cased media type of the first Content-Type header (parameters such
  # as "; charset=..." are dropped), or nil when the header is absent.
  defp image_content_type(headers) do
    Enum.find_value(headers, fn {name, value} ->
      if String.downcase(name) == "content-type" do
        value
        |> String.split(";", parts: 2)
        |> hd()
        |> String.trim()
        |> String.downcase()
      end
    end)
  end

  # Renders an error reason for log and error messages without raising on terms that do not
  # implement String.Chars (tuples, maps, ...).
  defp format_reason(reason) when is_binary(reason), do: reason
  defp format_reason(reason), do: inspect(reason)
end