frenzy/lib/frenzy/pipeline/scrape_stage.ex

180 lines
4.8 KiB
Elixir
Raw Normal View History

2019-07-09 02:41:18 +00:00
defmodule Frenzy.Pipeline.ScrapeStage do
require Logger
2020-07-18 23:27:53 +00:00
alias Frenzy.Network
2019-07-09 02:41:18 +00:00
alias Frenzy.Pipeline.Stage
@behaviour Stage
# Stage callback: tries to replace the item's `content` with the article scraped from its URL.
#
# Scraping is best-effort. On success the existing `:content` key of `item_params` is
# overwritten (the map-update syntax requires that key to be present already). On failure a
# warning is logged and the item is returned unchanged, still tagged `:ok`.
@impl Stage
def apply(opts, %{url: url} = item_params) do
  outcome = get_article_content(url, opts)

  case outcome do
    {:error, why} ->
      Logger.warn("Unable to get article content for #{url}: #{why}")
      {:ok, item_params}

    {:ok, scraped} ->
      {:ok, %{item_params | content: scraped}}
  end
end
# Stage callback: validates the stage options and fills in defaults.
#
# Recognised keys (both string keys):
#   * "extractor" - "builtin" (the default when absent) or the name of a module, written
#     without the "Elixir." prefix, whose atom already exists in the VM.
#   * "convert_to_data_uris" - a boolean, defaulting to true when absent.
#
# Returns {:ok, opts_with_defaults} or {:error, message}. The extractor is checked first, so
# its error wins when both options are invalid.
@impl Stage
def validate_opts(opts) when is_map(opts) do
  with {:ok, with_extractor} <- check_extractor(opts) do
    check_convert_to_data_uris(with_extractor)
  end
end

def validate_opts(_), do: {:error, "options must be a map"}

# Validates (or defaults) the "extractor" option.
defp check_extractor(opts) do
  case opts["extractor"] do
    nil ->
      {:ok, Map.put(opts, "extractor", "builtin")}

    "builtin" ->
      {:ok, opts}

    name when is_binary(name) ->
      if existing_module_name?(name) do
        {:ok, opts}
      else
        {:error, "extractor must be \"builtin\" or a module that exists"}
      end

    _not_a_string ->
      {:error, "extractor must be a string"}
  end
end

# Validates (or defaults) the "convert_to_data_uris" option.
defp check_convert_to_data_uris(opts) do
  case opts["convert_to_data_uris"] do
    nil -> {:ok, Map.put(opts, "convert_to_data_uris", true)}
    flag when is_boolean(flag) -> {:ok, opts}
    _ -> {:error, "convert_to_data_uris must be a boolean"}
  end
end

# True when the atom for the fully-qualified module name already exists. No new atom is ever
# created from the user-supplied string.
defp existing_module_name?(name) do
  _ = String.to_existing_atom("Elixir." <> name)
  true
rescue
  ArgumentError -> false
end
# Stage callback: the options a freshly configured stage starts with (an empty map).
@impl Stage
def default_opts do
  %{}
end
2020-06-01 22:30:59 +00:00
# Downloads `url` and hands a successful response to handle_response/3.
#
# Returns {:ok, html} with the extracted article, or {:error, message} when:
#   * `url` is not a non-empty binary,
#   * the request fails,
#   * the server answers with a status outside 200..299.
#
# Every failure is returned as a tagged tuple; this function does not raise for an unexpected
# status code or for an error reason that cannot be interpolated into a string.
@spec get_article_content(String.t(), map()) :: {:ok, String.t()} | {:error, String.t()}
defp get_article_content(url, opts) when is_binary(url) and url != "" do
  Logger.debug("Getting article from #{url}")

  case Network.http_get(url) do
    {:ok, %Tesla.Env{status: code} = response} when code in 200..299 ->
      handle_response(url, response, opts)

    # Without this clause a non-2xx response would raise CaseClauseError.
    {:ok, %Tesla.Env{status: code}} ->
      {:error, "Couldn't scrape article: unexpected HTTP status #{code}"}

    {:error, reason} ->
      {:error, "Couldn't scrape article: #{describe_error(reason)}"}
  end
end

defp get_article_content(_url, _opts), do: {:error, "URL must be a non-empty string"}

# Renders an error reason for a message. Binaries and atoms are printed as before; any other
# term (tuples, structs, ...) is inspected, because interpolating it directly would raise
# Protocol.UndefinedError.
defp describe_error(reason) when is_binary(reason) or is_atom(reason), do: to_string(reason)
defp describe_error(reason), do: inspect(reason)
2019-07-09 02:41:18 +00:00
2021-03-31 23:28:25 +00:00
# Turns a fetched page into readable article HTML.
#
# Steps:
#   1. Extract the article with the configured extractor. "builtin", or no extractor at all,
#      uses Readability.article/1. Any other value names a module whose extract/1 is called
#      with the parsed document.
#   2. If extraction yields {:ok, html}, pass every element through rewrite_image_urls/2 and
#      render the result with Readability.readable_html/1.
#   3. Any other extraction result is returned to the caller untouched.
#
# `url` is the page URL; it is used in log/Sentry context and as the base for image URLs.
@spec handle_response(String.t(), Tesla.Env.t(), map()) ::
        {:ok, String.t()} | {:error, String.t()}
defp handle_response(url, %Tesla.Env{body: body}, opts) do
  extracted =
    case opts["extractor"] do
      # A missing extractor means the built-in one, the same default validate_opts/1 fills
      # in. Treating nil as a module name would fail and log a spurious error.
      extractor when extractor in [nil, "builtin"] ->
        {:ok, Readability.article(body)}

      module_name ->
        extract_with_module(module_name, body, url)
    end

  case extracted do
    {:ok, html} ->
      # Image embedding is on unless the option is explicitly set.
      convert_to_data_uris =
        case opts["convert_to_data_uris"] do
          nil -> true
          value -> value
        end

      html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url)))
      {:ok, Readability.readable_html(html)}

    other ->
      other
  end
end

# Runs a custom extractor module against the parsed document. If resolving or calling the
# module raises, the failure is logged (and reported to Sentry when enabled) and the built-in
# Readability extraction is used instead. Parsing happens outside the `try`, so a parse
# failure is not swallowed by the fallback.
defp extract_with_module(module_name, body, url) do
  html_tree = Floki.parse(body)

  try do
    apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
  rescue
    e ->
      Logger.error(
        "Encountered error extracting article content from '#{url}' with #{module_name}, falling back to default"
      )

      Logger.error(Exception.format(:error, e, __STACKTRACE__))

      if Frenzy.sentry_enabled?() do
        Sentry.capture_exception(e,
          stacktrace: __STACKTRACE__,
          extra: %{extractor: module_name, item_url: url}
        )
      end

      {:ok, Readability.article(body)}
  end
end
# Builds the element callback used with Floki.map/2 for an article located at `site_uri`.
#
# The returned function receives a {tag, attributes} tuple. For <img> elements every "src"
# attribute is replaced by the result of image_to_data_uri/3 (called with the original value,
# `site_uri` and the `convert_to_data_uris` flag); all other attributes, and all other
# elements, are returned unchanged.
defp rewrite_image_urls(convert_to_data_uris, site_uri) do
  rewrite_attr = fn
    {"src", src} -> {"src", image_to_data_uri(src, site_uri, convert_to_data_uris)}
    other_attr -> other_attr
  end

  fn
    {"img", attrs} -> {"img", Enum.map(attrs, rewrite_attr)}
    other_element -> other_element
  end
end
# Media types that may be embedded as data URIs.
@content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"]

# Converts an image `src` into a data URI so the image travels inside the article body.
#
#   * A `src` that already is a data URI is returned as-is.
#   * With conversion enabled, `src` is resolved against `site_uri` and downloaded. If the
#     response's Content-Type media type is in the allowlist, a base64 data URI is returned.
#   * If the download fails, the type is not allowed, or conversion is disabled, the
#     original `src` is returned unchanged.
defp image_to_data_uri("data:" <> _ = src, _site_uri, _convert), do: src

defp image_to_data_uri(src, site_uri, true) do
  absolute_url = site_uri |> URI.merge(src) |> to_string()

  with {:ok, %Tesla.Env{body: body, headers: headers}} <- Network.http_get(absolute_url),
       media_type when media_type in @content_type_allowlist <- response_media_type(headers) do
    "data:#{media_type};base64,#{Base.encode64(body)}"
  else
    _ -> src
  end
end

defp image_to_data_uri(src, _site_uri, false), do: src

# Returns the normalized media type of the first Content-Type header, or nil when there is
# none. Parameters (e.g. "; charset=binary") are dropped and the type is lower-cased, since
# media types are case-insensitive and an exact string comparison would wrongly reject them.
defp response_media_type(headers) do
  Enum.find_value(headers, fn {name, value} ->
    if String.downcase(name) == "content-type" do
      value
      |> String.split(";", parts: 2)
      |> hd()
      |> String.trim()
      |> String.downcase()
    end
  end)
end
2019-07-09 02:41:18 +00:00
end