Add option to convert images in article content to data URIs

This commit is contained in:
Shadowfacts 2019-10-31 21:59:55 -04:00
parent 98a182986c
commit 3bc37952d1
Signed by: shadowfacts
GPG Key ID: 94A5AB95422746E5
1 changed files with 81 additions and 30 deletions

View File

@ -5,7 +5,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
@impl Stage
def apply(opts, %{url: url} = item_params) do
case get_article_content(url, opts["extractor"]) do
case get_article_content(url, opts) do
{:ok, content} ->
{:ok, %{item_params | content: content}}
@ -18,24 +18,36 @@ defmodule Frenzy.Pipeline.ScrapeStage do
@impl Stage
# Validates the stage options and fills in defaults for missing keys.
#
# Recognised options (string keys):
#   * "extractor" - either "builtin" or the name of an already-loaded module
#     (without the "Elixir." prefix). Defaults to "builtin".
#   * "convert_to_data_uris" - boolean. Defaults to true.
#
# Returns {:ok, opts} with the defaults applied, or {:error, message} for the
# first option that fails validation.
def validate_opts(opts) when is_map(opts) do
  extractor_result =
    case opts["extractor"] do
      nil ->
        # Map.put/3 rather than the %{map | key: value} update syntax: the
        # update syntax raises KeyError when the key is absent, which is
        # always the case in this branch (and is why an empty map used to
        # crash here). The key is a string, matching how it is read above.
        {:ok, Map.put(opts, "extractor", "builtin")}

      extractor when not is_binary(extractor) ->
        {:error, "extractor must be a string"}

      "builtin" ->
        {:ok, opts}

      extractor ->
        try do
          # to_existing_atom/1 so that arbitrary option values can never
          # create new atoms; it raises ArgumentError for unknown names.
          String.to_existing_atom("Elixir." <> extractor)
          {:ok, opts}
        rescue
          ArgumentError ->
            {:error, "extractor must be \"builtin\" or a module that exists"}
        end
    end

  # Only look at the second option once the first one is valid; an
  # {:error, _} from the extractor check falls through `with` unchanged.
  with {:ok, opts} <- extractor_result do
    case opts["convert_to_data_uris"] do
      nil ->
        {:ok, Map.put(opts, "convert_to_data_uris", true)}

      value when is_boolean(value) ->
        {:ok, opts}

      _ ->
        {:error, "convert_to_data_uris must be a boolean"}
    end
  end
end
@ -43,14 +55,14 @@ defmodule Frenzy.Pipeline.ScrapeStage do
def validate_opts(_), do: {:error, "options must be a map"}
@spec get_article_content(String.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()}
defp get_article_content(url, extractor) when is_binary(url) and url != "" do
defp get_article_content(url, opts) when is_binary(url) and url != "" do
Logger.debug("Getting article from #{url}")
url
|> HTTPoison.get()
|> case do
{:ok, response} ->
handle_response(url, response, extractor)
handle_response(url, response, opts)
{:error, %HTTPoison.Error{reason: reason}} ->
{:error, "HTTPoison error: #{reason}"}
@ -61,8 +73,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do
@spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
{:ok, String.t()} | {:error, String.t()}
defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
case extractor do
defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, opts) do
case opts["extractor"] do
"builtin" ->
{:ok, Readability.article(body)}
@ -72,9 +84,15 @@ defmodule Frenzy.Pipeline.ScrapeStage do
end
|> case do
{:ok, html} ->
html = Floki.map(html, rewrite_image_urls(URI.parse(url)))
convert_to_data_uris =
case opts["convert_to_data_uris"] do
nil -> true
value -> value
end
case extractor do
html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url)))
case opts["extractor"] do
"builtin" ->
{:ok, Readability.readable_html(html)}
@ -120,21 +138,54 @@ defmodule Frenzy.Pipeline.ScrapeStage do
# Generates a helper function for the article with the given URI, suitable for
# passing to Floki.map/2. The helper takes an HTML element and, if it is an
# <img> element, rewrites its src attribute:
#
#   * a src without a hostname is made absolute using the article's scheme and
#     host (path and query string are kept);
#   * the resulting URL is then passed through image_to_data_uri/2, which
#     either inlines the image as a data URI or returns the URL unchanged,
#     depending on convert_to_data_uris.
#
# All other attributes, and all non-<img> elements, are returned untouched.
defp rewrite_image_urls(convert_to_data_uris, %URI{host: host, scheme: scheme}) do
  fn
    {"img", attrs} ->
      new_attrs =
        Enum.map(attrs, fn
          {"src", src} ->
            absolute_src =
              case URI.parse(src) do
                %URI{host: nil, path: path, query: query} ->
                  URI.to_string(%URI{path: path, query: query, host: host, scheme: scheme})

                _ ->
                  src
              end

            # The flag is the first argument of image_to_data_uri/2, so the
            # URL must not be piped in as the first argument.
            {"src", image_to_data_uri(convert_to_data_uris, absolute_src)}

          attr ->
            attr
        end)

      {"img", new_attrs}

    elem ->
      elem
  end
end
# Media types that may be inlined as data URIs; anything else keeps its URL.
@content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"]

# Converts images to data URIs so that they're stored by clients as part of
# the body.
#
# With `true`, fetches `src` over HTTP and, when the response is a 200 whose
# Content-Type is in the allowlist, returns a base64 `data:` URI for the body.
# In every other case (request error, non-200 status, missing or disallowed
# Content-Type) the original `src` is returned unchanged.
#
# With `false`, returns `src` unchanged without making a request.
defp image_to_data_uri(true, src) do
  case HTTPoison.get(src) do
    {:ok, %HTTPoison.Response{status_code: 200, body: body, headers: headers}} ->
      # Header names are case-insensitive and the header may be absent, so
      # look it up leniently instead of asserting on an exact tuple. Any
      # parameters after ";" (e.g. "; charset=binary") are dropped before the
      # allowlist check. Yields nil when there is no Content-Type header.
      content_type =
        Enum.find_value(headers, fn {header, value} ->
          if String.downcase(header) == "content-type" do
            value
            |> String.split(";", parts: 2)
            |> hd()
            |> String.trim()
            |> String.downcase()
          end
        end)

      if content_type in @content_type_allowlist do
        "data:#{content_type};base64,#{Base.encode64(body)}"
      else
        src
      end

    _ ->
      src
  end
end

defp image_to_data_uri(false, src), do: src
end