frenzy/lib/frenzy/pipeline/scrape_stage.ex

defmodule Frenzy.Pipeline.ScrapeStage do
  require Logger
  alias Frenzy.Network
  alias Frenzy.Pipeline.Stage
  @behaviour Stage

  # Pipeline entry point. Scrapes the article at the item's `url` and stores the
  # result under the item's `content` key. When scraping fails the reason is
  # logged and the item is returned unchanged, so the stage never returns an
  # error itself.
  @impl Stage
  def apply(opts, %{url: url} = item_params) do
    url
    |> get_article_content(opts)
    |> case do
      {:error, why} ->
        Logger.warn("Unable to get article content for #{url}: #{why}")
        {:ok, item_params}

      {:ok, scraped} ->
        {:ok, %{item_params | content: scraped}}
    end
  end

  @unknown_extractor_error "extractor must be \"builtin\" or a module that exists"

  # Validates the stage options and fills in defaults.
  #
  # Recognised keys:
  #   * "extractor" - "builtin" (the default when unset) or the name of a
  #     module, written without the "Elixir." prefix, that exports `extract/1`.
  #   * "convert_to_data_uris" - a boolean; defaults to `true` when unset.
  #
  # Returns `{:ok, opts}` with the defaults applied, or `{:error, message}` for
  # the first invalid option. Anything that is not a map is rejected outright.
  @impl Stage
  def validate_opts(opts) when is_map(opts) do
    with {:ok, opts} <- validate_extractor(opts) do
      validate_convert_to_data_uris(opts)
    end
  end

  @impl Stage
  def validate_opts(_), do: {:error, "options must be a map"}

  # Checks the "extractor" option, defaulting it to "builtin" when unset.
  defp validate_extractor(opts) do
    case opts["extractor"] do
      nil ->
        {:ok, Map.put(opts, "extractor", "builtin")}

      "builtin" ->
        {:ok, opts}

      extractor when is_binary(extractor) ->
        with :ok <- check_extractor_module(extractor), do: {:ok, opts}

      _ ->
        {:error, "extractor must be a string"}
    end
  end

  # Confirms that `name` refers to a loadable module exporting `extract/1`, the
  # function this stage calls on custom extractors. Merely checking that the atom
  # exists would accept any known module (or plain atom), which would then fail
  # on every scrape.
  defp check_extractor_module(name) do
    # to_existing_atom keeps user-supplied names from creating new atoms.
    module = String.to_existing_atom("Elixir." <> name)

    cond do
      not Code.ensure_loaded?(module) -> {:error, @unknown_extractor_error}
      not function_exported?(module, :extract, 1) -> {:error, "extractor module must export extract/1"}
      true -> :ok
    end
  rescue
    ArgumentError -> {:error, @unknown_extractor_error}
  end

  # Checks the "convert_to_data_uris" option, defaulting it to true when unset.
  defp validate_convert_to_data_uris(opts) do
    case opts["convert_to_data_uris"] do
      nil -> {:ok, Map.put(opts, "convert_to_data_uris", true)}
      value when is_boolean(value) -> {:ok, opts}
      _ -> {:error, "convert_to_data_uris must be a boolean"}
    end
  end

  # Initial options for this stage: an empty map, i.e. every option unset.
  @impl Stage
  def default_opts do
    %{}
  end

  # Downloads the page at `url` and extracts its article content according to
  # `opts`.
  #
  # Returns whatever `handle_response/3` produces for a 2xx response. Returns
  # `{:error, message}` when `url` is not a non-empty string, when the request
  # fails, or when the response status is outside 200..299. The message is
  # always a string, so callers can interpolate it safely.
  @spec get_article_content(String.t(), map()) :: {:ok, String.t()} | {:error, String.t()}
  defp get_article_content(url, opts) when is_binary(url) and url != "" do
    Logger.debug("Getting article from #{url}")

    case Network.http_get(url) do
      {:ok, %Tesla.Env{status: code} = response} when code in 200..299 ->
        handle_response(url, response, opts)

      # Without this clause a response with any other status raised
      # CaseClauseError instead of producing the error tuple the spec promises.
      {:ok, %Tesla.Env{status: code}} ->
        {:error, "Couldn't scrape article: unexpected HTTP status #{inspect(code)}"}

      {:error, reason} ->
        {:error, "Couldn't scrape article: #{format_reason(reason)}"}
    end
  end

  defp get_article_content(_url, _opts), do: {:error, "URL must be a non-empty string"}

  # Renders an error reason as text. Strings, atoms and numbers are printed as
  # before; other terms (tuples, maps, structs) may not implement String.Chars
  # and would raise when interpolated, so they are inspected instead.
  defp format_reason(reason) when is_binary(reason) or is_atom(reason) or is_number(reason),
    do: to_string(reason)

  defp format_reason(reason), do: inspect(reason)

  # Extracts the article from a fetched response and post-processes its images.
  #
  # The extractor is selected by opts["extractor"]:
  #   * "builtin" or unset - `Readability.article/1` is applied to the body.
  #   * anything else - treated as a module name (without the "Elixir." prefix)
  #     whose `extract/1` is called with the parsed HTML tree. If that call
  #     raises, the failure is logged and `Readability.article/1` is used instead.
  #
  # When extraction yields `{:ok, html}`, every element is run through the
  # callback from `rewrite_image_urls/2` (conversion to data URIs defaults to on
  # when opts["convert_to_data_uris"] is unset) and the result is passed to
  # `Readability.readable_html/1`. Any other extractor result is returned as-is.
  @spec handle_response(String.t(), Tesla.Env.t(), map()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp handle_response(url, %Tesla.Env{body: body}, opts) do
    case opts["extractor"] do
      # An unset extractor means "builtin". Previously nil fell into the
      # custom-module branch, raised on the string concatenation, and was logged
      # as an extractor failure before falling back to the same result.
      extractor when extractor in [nil, "builtin"] ->
        {:ok, Readability.article(body)}

      module_name ->
        html_tree = Floki.parse(body)

        try do
          apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
        rescue
          # Deliberately broad: a faulty custom extractor must not prevent the
          # article from being scraped with the default extractor.
          e ->
            Logger.error(
              "Encountered error extracting article content from '#{url}' with #{module_name}, falling back to default"
            )

            Logger.error(Exception.format(:error, e, __STACKTRACE__))

            {:ok, Readability.article(body)}
        end
    end
    |> case do
      {:ok, html} ->
        convert_to_data_uris =
          case opts["convert_to_data_uris"] do
            nil -> true
            value -> value
          end

        html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url)))

        {:ok, Readability.readable_html(html)}

      res ->
        res
    end
  end

  # Builds the per-element callback used to post-process an article located at
  # `site_uri`. For an <img> element the callback replaces the value of each src
  # attribute with the result of image_to_data_uri/3; every other attribute and
  # every other element is returned untouched.
  # NOTE(review): image_to_data_uri/3 hands back the original src whenever it
  # does not produce a data URI, so relative image URLs are left relative.
  defp rewrite_image_urls(convert_to_data_uris, site_uri) do
    rewrite_attr = fn
      {"src", src} -> {"src", image_to_data_uri(src, site_uri, convert_to_data_uris)}
      other -> other
    end

    fn
      {"img", attrs} -> {"img", Enum.map(attrs, rewrite_attr)}
      node -> node
    end
  end

  @content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"]

  # Convert images to data URIs so that they're stored by clients as part of the body.
  #
  # With the flag `true`, `src` is resolved against `site_uri` and fetched. If the
  # response carries a Content-Type on the allowlist, the body is returned as a
  # base64 data URI. In every other case (request failed, header missing, type
  # not allowed) the original `src` is returned unchanged.
  # With the flag `false`, nothing is fetched and `src` is returned unchanged.
  defp image_to_data_uri(src, site_uri, true) do
    absolute_url = site_uri |> URI.merge(src) |> to_string()

    with {:ok, %Tesla.Env{body: body, headers: headers}} <- Network.http_get(absolute_url),
         {:ok, media_type} <- allowed_media_type(headers) do
      "data:#{media_type};base64,#{Base.encode64(body)}"
    else
      _ -> src
    end
  end

  defp image_to_data_uri(src, _site_uri, false), do: src

  # Looks up the first Content-Type header and returns `{:ok, media_type}` when
  # its media type is on the allowlist, `:error` otherwise. Parameters such as
  # "; charset=binary" are dropped and the type is lower-cased before the check,
  # because media types are case-insensitive and may carry parameters; comparing
  # the raw header value rejected such responses.
  defp allowed_media_type(headers) do
    headers
    |> Enum.find_value(fn {name, value} ->
      if String.downcase(name) == "content-type", do: value
    end)
    |> case do
      nil ->
        :error

      value ->
        media_type =
          value
          |> String.split(";", parts: 2)
          |> hd()
          |> String.trim()
          |> String.downcase()

        if media_type in @content_type_allowlist, do: {:ok, media_type}, else: :error
    end
  end
end
Start pipeline system 2019-07-09 02:41:18 +00:00			`defmodule Frenzy.Pipeline.ScrapeStage do`
			`require Logger`
Add gemini protocol feed fetching 2020-07-18 23:27:53 +00:00			`alias Frenzy.Network`
Start pipeline system 2019-07-09 02:41:18 +00:00			`alias Frenzy.Pipeline.Stage`
			`@behaviour Stage`

			`@impl Stage`
Replace site-specific pipeline stages with new extractor architecture 2019-10-31 20:42:24 +00:00			`def apply(opts, %{url: url} = item_params) do`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`case get_article_content(url, opts) do`
Start pipeline system 2019-07-09 02:41:18 +00:00			`{:ok, content} ->`
			`{:ok, %{item_params \| content: content}}`

			`{:error, reason} ->`
Add Daring Fireball scraper 2019-07-21 23:04:43 +00:00			`Logger.warn("Unable to get article content for #{url}: #{reason}")`
Add types, Dialyzer, fix Dialyzer warnings 2019-08-30 23:31:38 +00:00			`{:ok, item_params}`
Start pipeline system 2019-07-09 02:41:18 +00:00			`end`
			`end`

Add pipeline stage option validation/error reporting 2019-07-21 16:21:28 +00:00			`@impl Stage`
Replace site-specific pipeline stages with new extractor architecture 2019-10-31 20:42:24 +00:00			`def validate_opts(opts) when is_map(opts) do`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`opts =`
			`case opts["extractor"] do`
			`nil ->`
Fix pipeline validation not working 2020-05-31 19:56:27 +00:00			`{:ok, Map.put(opts, "extractor", "builtin")}`
Add pipeline stage option validation/error reporting 2019-07-21 16:21:28 +00:00
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`extractor when not is_binary(extractor) ->`
			`{:error, "extractor must be a string"}`
Replace site-specific pipeline stages with new extractor architecture 2019-10-31 20:42:24 +00:00
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`"builtin" ->`
			`{:ok, opts}`

			`extractor ->`
			`try do`
			`String.to_existing_atom("Elixir." <> extractor)`
			`{:ok, opts}`
			`rescue`
			`ArgumentError ->`
			`{:error, "extractor must be \"builtin\" or a module that exists"}`
			`end`
			`end`

Fix error while validating scrape stage options 2019-11-01 22:27:08 +00:00			`case opts do`
			`{:ok, opts} ->`
			`case opts["convert_to_data_uris"] do`
			`nil ->`
Fix pipeline validation not working 2020-05-31 19:56:27 +00:00			`{:ok, Map.put(opts, "convert_to_data_uris", true)}`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00
Fix error while validating scrape stage options 2019-11-01 22:27:08 +00:00			`value when is_boolean(value) ->`
			`{:ok, opts}`

			`_ ->`
			`{:error, "convert_to_data_uris must be a boolean"}`
			`end`
Replace site-specific pipeline stages with new extractor architecture 2019-10-31 20:42:24 +00:00
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`_ ->`
Fix error while validating scrape stage options 2019-11-01 22:27:08 +00:00			`opts`
Replace site-specific pipeline stages with new extractor architecture 2019-10-31 20:42:24 +00:00			`end`
			`end`

			`@impl Stage`
			`def validate_opts(_), do: {:error, "options must be a map"}`

Add basic LiveView pipeline editor, scrape stage config editing 2020-06-09 02:49:45 +00:00			`@impl Stage`
			`def default_opts(), do: %{}`

Remove old code 2020-06-01 22:30:59 +00:00			`@spec get_article_content(String.t(), map()) :: {:ok, String.t()} \| {:error, String.t()}`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`defp get_article_content(url, opts) when is_binary(url) and url != "" do`
Start pipeline system 2019-07-09 02:41:18 +00:00			`Logger.debug("Getting article from #{url}")`

			`url`
Add gemini protocol feed fetching 2020-07-18 23:27:53 +00:00			`\|> Network.http_get()`
Start pipeline system 2019-07-09 02:41:18 +00:00			`\|> case do`
Switch to hackney via Tesla 2021-03-31 23:28:25 +00:00			`{:ok, %Tesla.Env{status: code} = response} when code in 200..299 ->`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`handle_response(url, response, opts)`
Start pipeline system 2019-07-09 02:41:18 +00:00
Implement basic favicon scraping 2019-11-10 19:04:00 +00:00			`{:error, reason} ->`
			`{:error, "Couldn't scrape article: #{reason}"}`
Start pipeline system 2019-07-09 02:41:18 +00:00			`end`
			`end`

Remove old code 2020-06-01 22:30:59 +00:00			`defp get_article_content(_url, _opts), do: {:error, "URL must be a non-empty string"}`
Start pipeline system 2019-07-09 02:41:18 +00:00
Switch to hackney via Tesla 2021-03-31 23:28:25 +00:00			`@spec handle_response(String.t(), Tesla.Env.t(), map()) ::`
Add types, Dialyzer, fix Dialyzer warnings 2019-08-30 23:31:38 +00:00			`{:ok, String.t()} \| {:error, String.t()}`
Switch to hackney via Tesla 2021-03-31 23:28:25 +00:00			`defp handle_response(url, %Tesla.Env{body: body}, opts) do`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`case opts["extractor"] do`
Replace site-specific pipeline stages with new extractor architecture 2019-10-31 20:42:24 +00:00			`"builtin" ->`
Rewrite image URLs without hosts to use the host of the article URL 2019-10-31 21:38:16 +00:00			`{:ok, Readability.article(body)}`
Replace site-specific pipeline stages with new extractor architecture 2019-10-31 20:42:24 +00:00
			`module_name ->`
Change extractors to accept/return html trees 2019-10-31 21:12:02 +00:00			`html_tree = Floki.parse(body)`
Recover from errors in custom extractors 2021-03-31 19:30:17 +00:00
			`try do`
			`apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])`
			`rescue`
			`e ->`
			`Logger.error(`
			`"Encountered error extracting article content from '#{url}' with #{module_name}, falling back to default"`
			`)`

			`Logger.error(Exception.format(:error, e, __STACKTRACE__))`

			`{:ok, Readability.article(body)}`
			`end`
Rewrite image URLs without hosts to use the host of the article URL 2019-10-31 21:38:16 +00:00			`end`
			`\|> case do`
			`{:ok, html} ->`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`convert_to_data_uris =`
			`case opts["convert_to_data_uris"] do`
			`nil -> true`
			`value -> value`
			`end`

			`html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url)))`
Change extractors to accept/return html trees 2019-10-31 21:12:02 +00:00
Add Slate extractor 2021-09-03 20:24:35 +00:00			`{:ok, Readability.readable_html(html)}`
Rewrite image URLs without hosts to use the host of the article URL 2019-10-31 21:38:16 +00:00
			`res ->`
			`res`
Replace site-specific pipeline stages with new extractor architecture 2019-10-31 20:42:24 +00:00			`end`
Start pipeline system 2019-07-09 02:41:18 +00:00			`end`

Rewrite image URLs without hosts to use the host of the article URL 2019-10-31 21:38:16 +00:00			`# Generates a helper function for the article with the given URI that takes an HTML element and,`
			`# if it's an <img> element whose src attribute does not have a hostname, adds the hostname and`
			`# scheme to the element.`
Remove old code 2020-06-01 22:30:59 +00:00			`defp rewrite_image_urls(convert_to_data_uris, site_uri) do`
Rewrite image URLs without hosts to use the host of the article URL 2019-10-31 21:38:16 +00:00			`fn`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`{"img", attrs} ->`
			`new_attrs =`
			`Enum.map(attrs, fn`
			`{"src", src} ->`
Fix scraping images w/ URLs w/o schemes 2020-02-17 17:09:03 +00:00			`{"src", image_to_data_uri(src, site_uri, convert_to_data_uris)}`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00
			`attr ->`
			`attr`
			`end)`

			`{"img", new_attrs}`
Rewrite image URLs without hosts to use the host of the article URL 2019-10-31 21:38:16 +00:00
			`elem ->`
			`elem`
			`end`
			`end`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00
			`@content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"]`

			`# convert images to data URIs so that they're stored by clients as part of the body`
Fix scraping images w/ URLs w/o schemes 2020-02-17 17:09:03 +00:00			`defp image_to_data_uri(src, site_uri, true) do`
Remove old code 2020-06-01 22:30:59 +00:00			`absolute_url = URI.merge(site_uri, src) \|> to_string()`
Fix scraping images w/ URLs w/o schemes 2020-02-17 17:09:03 +00:00
Add gemini protocol feed fetching 2020-07-18 23:27:53 +00:00			`case Network.http_get(absolute_url) do`
Switch to hackney via Tesla 2021-03-31 23:28:25 +00:00			`{:ok, %Tesla.Env{body: body, headers: headers}} ->`
			`Enum.find(headers, fn {header, _value} -> String.downcase(header) == "content-type" end)`
Fix error when attempting to convert image w/o Content-Type header to data URI 2020-10-24 17:37:06 +00:00			`\|> case do`
Switch to hackney via Tesla 2021-03-31 23:28:25 +00:00			`{_, content_type} when content_type in @content_type_allowlist ->`
Fix error when attempting to convert image w/o Content-Type header to data URI 2020-10-24 17:37:06 +00:00			`"data:#{content_type};base64,#{Base.encode64(body)}"`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00
Fix error when attempting to convert image w/o Content-Type header to data URI 2020-10-24 17:37:06 +00:00			`_ ->`
			`src`
Add option to convert images in article content to data URIs 2019-11-01 01:59:55 +00:00			`end`

			`_ ->`
			`src`
			`end`
			`end`

Fix scraping images w/ URLs w/o schemes 2020-02-17 17:09:03 +00:00			`defp image_to_data_uri(src, _site_uri, false), do: src`
Start pipeline system 2019-07-09 02:41:18 +00:00			`end`