2019-07-09 02:41:18 +00:00
|
|
|
defmodule Frenzy.Pipeline.ScrapeStage do
|
|
|
|
require Logger
|
2020-07-18 23:27:53 +00:00
|
|
|
alias Frenzy.Network
|
2019-07-09 02:41:18 +00:00
|
|
|
alias Frenzy.Pipeline.Stage
|
|
|
|
@behaviour Stage
|
|
|
|
|
|
|
|
@doc """
Scrapes the page at the item's `url` and stores the extracted article in `:content`.

`opts` is the stage's option map (string keys such as `"extractor"` and
`"convert_to_data_uris"`). Scraping is best-effort: when the content cannot be
retrieved a warning is logged and the item is passed through unchanged, so the
result is `{:ok, item_params}` either way.
"""
@impl Stage
def apply(opts, %{url: url} = item_params) do
  case get_article_content(url, opts) do
    {:ok, content} ->
      # Map update syntax: raises KeyError if item_params has no :content key.
      {:ok, %{item_params | content: content}}

    {:error, reason} ->
      # The reason is not guaranteed to be a string (an extractor's error term is
      # passed through untouched), and interpolating e.g. a tuple would raise
      # Protocol.UndefinedError, so anything that is not a binary is inspected.
      message = if is_binary(reason), do: reason, else: inspect(reason)
      Logger.warn("Unable to get article content for #{url}: #{message}")
      {:ok, item_params}
  end
end
|
|
|
|
|
2019-07-21 16:21:28 +00:00
|
|
|
@doc """
Validates the stage options and fills in defaults for missing keys.

Accepted keys (all strings):

  * `"extractor"` - `"builtin"` (the default when absent) or the name of a
    module, without the `Elixir.` prefix, that exports `extract/1`.
  * `"convert_to_data_uris"` - a boolean, defaulting to `true` when absent.

Returns `{:ok, opts}` with the defaults applied, or `{:error, message}` describing
the first invalid option. Anything that is not a map is rejected.
"""
@impl Stage
def validate_opts(opts) when is_map(opts) do
  with {:ok, opts} <- validate_extractor(opts) do
    validate_convert_to_data_uris(opts)
  end
end

def validate_opts(_), do: {:error, "options must be a map"}

# Checks the "extractor" option, defaulting it to "builtin" when absent.
defp validate_extractor(opts) do
  case opts["extractor"] do
    nil ->
      {:ok, Map.put(opts, "extractor", "builtin")}

    "builtin" ->
      {:ok, opts}

    extractor when is_binary(extractor) ->
      validate_extractor_module(opts, extractor)

    _ ->
      {:error, "extractor must be a string"}
  end
end

# Resolves a custom extractor name to a module and makes sure it can actually be
# called. String.to_existing_atom/1 is used so arbitrary input never creates atoms;
# an existing atom alone is not enough (e.g. "String" is an atom but not an
# extractor), so the module must also load and export extract/1.
defp validate_extractor_module(opts, extractor) do
  module = String.to_existing_atom("Elixir." <> extractor)

  if Code.ensure_loaded?(module) and function_exported?(module, :extract, 1) do
    {:ok, opts}
  else
    {:error, "extractor module must export extract/1"}
  end
rescue
  ArgumentError ->
    {:error, "extractor must be \"builtin\" or a module that exists"}
end

# Checks the "convert_to_data_uris" option, defaulting it to true when absent.
defp validate_convert_to_data_uris(opts) do
  case opts["convert_to_data_uris"] do
    nil ->
      {:ok, Map.put(opts, "convert_to_data_uris", true)}

    value when is_boolean(value) ->
      {:ok, opts}

    _ ->
      {:error, "convert_to_data_uris must be a boolean"}
  end
end
|
|
|
|
|
2020-06-09 02:49:45 +00:00
|
|
|
@doc """
Returns the default options for this stage, which is an empty map.
"""
@impl Stage
def default_opts do
  %{}
end
|
|
|
|
|
2020-06-01 22:30:59 +00:00
|
|
|
# Fetches `url` and hands a successful (2xx) response to handle_response/3.
#
# Returns whatever handle_response/3 returns on success, or `{:error, message}`
# when the URL is not a non-empty string, the request fails, or the server
# answers with a non-2xx status.
@spec get_article_content(String.t(), map()) :: {:ok, String.t()} | {:error, String.t()}
defp get_article_content(url, opts) when is_binary(url) and url != "" do
  Logger.debug("Getting article from #{url}")

  case Network.http_get(url) do
    {:ok, %Tesla.Env{status: code} = response} when code in 200..299 ->
      handle_response(url, response, opts)

    # Without this clause any other status would raise CaseClauseError instead of
    # being reported as a scrape failure.
    {:ok, %Tesla.Env{status: code}} ->
      {:error, "Couldn't scrape article: unexpected HTTP status #{code}"}

    {:error, reason} when is_binary(reason) or is_atom(reason) ->
      {:error, "Couldn't scrape article: #{reason}"}

    # Reasons such as tuples or structs do not implement String.Chars, so they
    # are inspected rather than interpolated directly.
    {:error, reason} ->
      {:error, "Couldn't scrape article: #{inspect(reason)}"}
  end
end

defp get_article_content(_url, _opts), do: {:error, "URL must be a non-empty string"}
|
2019-07-09 02:41:18 +00:00
|
|
|
|
2021-03-31 23:28:25 +00:00
|
|
|
# Turns a fetched page into readable article HTML.
#
# The article is extracted with the extractor named in opts["extractor"], then
# every <img> src is passed through rewrite_image_urls/2 and the result is
# rendered with Readability.readable_html/1. If the extractor returns anything
# other than `{:ok, html}`, that value is returned unchanged.
@spec handle_response(String.t(), Tesla.Env.t(), map()) ::
        {:ok, String.t()} | {:error, String.t()}
defp handle_response(url, %Tesla.Env{body: body}, opts) do
  with {:ok, html} <- extract_article(url, body, opts["extractor"]) do
    # A missing option means "convert", matching the default in validate_opts/1.
    convert_to_data_uris =
      case opts["convert_to_data_uris"] do
        nil -> true
        value -> value
      end

    html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url)))

    {:ok, Readability.readable_html(html)}
  end
end

# Extracts the article from `body` using the configured extractor.
#
# A missing extractor is treated like "builtin": unvalidated options (such as
# the empty map from default_opts/0) would otherwise fall into the custom-module
# branch, raise on `"Elixir." <> nil`, and log an error for every scrape.
defp extract_article(_url, body, extractor) when extractor in [nil, "builtin"] do
  {:ok, Readability.article(body)}
end

defp extract_article(url, body, module_name) do
  html_tree = Floki.parse(body)

  try do
    # to_existing_atom avoids creating atoms from configuration input; an unknown
    # module name raises ArgumentError and is handled by the rescue below.
    apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
  rescue
    e ->
      # A failing custom extractor must not lose the article: log the failure and
      # fall back to the builtin extraction.
      Logger.error(
        "Encountered error extracting article content from '#{url}' with #{module_name}, falling back to default"
      )

      Logger.error(Exception.format(:error, e, __STACKTRACE__))

      {:ok, Readability.article(body)}
  end
end
|
|
|
|
|
2019-10-31 21:38:16 +00:00
|
|
|
# Builds the element-mapping function applied to an article's HTML tree.
#
# The returned function takes a `{tag, attrs}` element. For "img" elements every
# "src" attribute value is replaced with the result of image_to_data_uri/3 (called
# with the article's URI and the convert_to_data_uris flag); all other attributes
# and all other elements are returned untouched.
defp rewrite_image_urls(convert_to_data_uris, site_uri) do
  rewrite_attr = fn
    {"src", src} -> {"src", image_to_data_uri(src, site_uri, convert_to_data_uris)}
    other -> other
  end

  fn
    {"img", attrs} -> {"img", Enum.map(attrs, rewrite_attr)}
    node -> node
  end
end
|
2019-11-01 01:59:55 +00:00
|
|
|
|
|
|
|
# Media types that may be inlined as data URIs.
@content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"]

# Converts an image URL to a data URI so that clients store the image as part of
# the item body.
#
# With the flag set to true, `src` is resolved against `site_uri`, downloaded, and
# returned as "data:<type>;base64,<payload>" when the response's Content-Type is in
# the allowlist. In every other case (flag false, download failure, missing or
# disallowed Content-Type) the original `src` is returned unchanged.

# A source that already is a data URI has nothing to download.
defp image_to_data_uri("data:" <> _ = src, _site_uri, true), do: src

defp image_to_data_uri(src, site_uri, true) do
  absolute_url = site_uri |> URI.merge(src) |> to_string()

  with {:ok, %Tesla.Env{body: body, headers: headers}} <- Network.http_get(absolute_url),
       {_, raw_content_type} <-
         Enum.find(headers, fn {header, _value} -> String.downcase(header) == "content-type" end),
       # Header values may carry parameters ("image/jpeg; charset=binary") and are
       # case-insensitive, so compare only the normalized media type.
       content_type =
         raw_content_type
         |> String.split(";", parts: 2)
         |> hd()
         |> String.trim()
         |> String.downcase(),
       true <- content_type in @content_type_allowlist do
    "data:#{content_type};base64,#{Base.encode64(body)}"
  else
    _ -> src
  end
end

defp image_to_data_uri(src, _site_uri, false), do: src
|
2019-07-09 02:41:18 +00:00
|
|
|
end
|