From 3bc37952d12be5f61f012f968245322588a1786d Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Thu, 31 Oct 2019 21:59:55 -0400 Subject: [PATCH] Add option to convert images in article content to data URIs --- lib/frenzy/pipeline/scrape_stage.ex | 111 ++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 30 deletions(-) diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex index c6d2363..8593738 100644 --- a/lib/frenzy/pipeline/scrape_stage.ex +++ b/lib/frenzy/pipeline/scrape_stage.ex @@ -5,7 +5,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do @impl Stage def apply(opts, %{url: url} = item_params) do - case get_article_content(url, opts["extractor"]) do + case get_article_content(url, opts) do {:ok, content} -> {:ok, %{item_params | content: content}} @@ -18,24 +18,36 @@ defmodule Frenzy.Pipeline.ScrapeStage do @impl Stage def validate_opts(opts) when is_map(opts) do # todo: figure out why this errors when an empty map is provided - case opts["extractor"] do + opts = + case opts["extractor"] do + nil -> + {:ok, %{opts | extractor: "builtin"}} + + extractor when not is_binary(extractor) -> + {:error, "extractor must be a string"} + + "builtin" -> + {:ok, opts} + + extractor -> + try do + String.to_existing_atom("Elixir." <> extractor) + {:ok, opts} + rescue + ArgumentError -> + {:error, "extractor must be \"builtin\" or a module that exists"} + end + end + + case opts["convert_to_data_uris"] do nil -> - {:ok, %{opts | extractor: "builtin"}} + {:ok, %{opts | convert_to_data_uris: true}} - extractor when not is_binary(extractor) -> - {:error, "extractor must be a string"} - - "builtin" -> + value when is_boolean(value) -> {:ok, opts} - extractor -> - try do - String.to_existing_atom("Elixir." <> extractor) - {:ok, opts} - rescue - ArgumentError -> - {:error, "extractor must be \"builtin\" or a module that exists"} - end + _ -> + {:error, "convert_to_data_uris must be a boolean"} end end @@ -43,14 +55,14 @@ defmodule Frenzy.Pipeline.ScrapeStage do def validate_opts(_), do: {:error, "options must be a map"} @spec get_article_content(String.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()} - defp get_article_content(url, extractor) when is_binary(url) and url != "" do + defp get_article_content(url, opts) when is_binary(url) and url != "" do Logger.debug("Getting article from #{url}") url |> HTTPoison.get() |> case do {:ok, response} -> - handle_response(url, response, extractor) + handle_response(url, response, opts) {:error, %HTTPoison.Error{reason: reason}} -> {:error, "HTTPoison error: #{reason}"} @@ -61,8 +73,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do @spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()} - defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do - case extractor do + defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, opts) do + case opts["extractor"] do "builtin" -> {:ok, Readability.article(body)} @@ -72,9 +84,15 @@ defmodule Frenzy.Pipeline.ScrapeStage do end |> case do {:ok, html} -> - html = Floki.map(html, rewrite_image_urls(URI.parse(url))) + convert_to_data_uris = + case opts["convert_to_data_uris"] do + nil -> true + value -> value + end - case extractor do + html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url))) + + case opts["extractor"] do "builtin" -> {:ok, Readability.readable_html(html)} @@ -120,21 +138,54 @@ defmodule Frenzy.Pipeline.ScrapeStage do # Generates a helper function for the article with the given URI that takes an HTML element and, # if it's an element whose src attribute does not have a hostname, adds the hostname and # scheme to the element. - defp rewrite_image_urls(%URI{host: host, scheme: scheme}) do + defp rewrite_image_urls(convert_to_data_uris, %URI{host: host, scheme: scheme}) do fn - {"img", [{"src", src} | attrs]} = elem -> - case URI.parse(src) do - %URI{host: nil, path: path} -> - new_src = URI.to_string(%URI{path: path, host: host, scheme: scheme}) + {"img", attrs} -> + new_attrs = + Enum.map(attrs, fn + {"src", src} -> + case URI.parse(src) do + %URI{host: nil, path: path} -> + new_src = + URI.to_string(%URI{path: path, host: host, scheme: scheme}) + |> image_to_data_uri(convert_to_data_uris) - {"img", [{"src", new_src} | attrs]} + {"src", new_src} - _ -> - elem - end + _ -> + {"src", image_to_data_uri(convert_to_data_uris, src)} + end + + attr -> + attr + end) + + {"img", new_attrs} elem -> elem end end + + @content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"] + + # convert images to data URIs so that they're stored by clients as part of the body + defp image_to_data_uri(true, src) do + case HTTPoison.get(src) do + {:ok, %HTTPoison.Response{status_code: 200, body: body, headers: headers}} -> + {"Content-Type", content_type} = + Enum.find(headers, fn {header, _value} -> header == "Content-Type" end) + + if content_type in @content_type_allowlist do + "data:#{content_type};base64,#{Base.encode64(body)}" + else + src + end + + _ -> + src + end + end + + defp image_to_data_uri(false, src), do: src end