frenzy/lib/frenzy/pipeline/scrape_stage.ex

141 lines
3.9 KiB
Elixir

defmodule Frenzy.Pipeline.ScrapeStage do
  @moduledoc """
  Pipeline stage that fetches the page an item links to and stores the extracted article as the
  item's content.

  ## Options

    * `"extractor"` - either `"builtin"` (the default), which extracts the article with
      `Readability`, or the name of a module (written without the `Elixir.` prefix) that exports
      `extract/1`. A custom extractor is called with the parsed HTML tree of the page and is
      expected to return `{:ok, html_tree}` or `{:error, reason}`.
  """

  require Logger
  alias Frenzy.Pipeline.Stage
  @behaviour Stage

  # Upper bound on consecutive redirects followed for a single item, so that a redirect cycle
  # cannot keep the stage issuing requests forever.
  @max_redirects 10

  @doc """
  Replaces the `:content` of `item_params` with the article scraped from its `:url`.

  Scraping is best-effort: when the article cannot be retrieved a warning is logged and the
  item is passed through unchanged, so this function always returns `{:ok, item_params}`.
  """
  @impl Stage
  def apply(opts, %{url: url} = item_params) do
    # Options that were never validated may lack the key; fall back to the same default that
    # validate_opts/1 fills in.
    extractor = opts["extractor"] || "builtin"

    case get_article_content(url, extractor, @max_redirects) do
      {:ok, content} ->
        {:ok, %{item_params | content: content}}

      {:error, reason} ->
        Logger.warn("Unable to get article content for #{url}: #{format_reason(reason)}")
        {:ok, item_params}
    end
  end

  @doc """
  Validates the stage options.

  Returns `{:ok, opts}` (with `"extractor"` defaulted to `"builtin"` when absent) or
  `{:error, message}` when the options are not a map or name an unusable extractor.
  """
  @impl Stage
  def validate_opts(opts) when is_map(opts) do
    case opts["extractor"] do
      nil ->
        # Map.put/3 rather than the %{map | key: value} update syntax: the update syntax raises
        # KeyError for a key that is not already present, and the options use string keys.
        {:ok, Map.put(opts, "extractor", "builtin")}

      extractor when not is_binary(extractor) ->
        {:error, "extractor must be a string"}

      "builtin" ->
        {:ok, opts}

      extractor ->
        case resolve_extractor(extractor) do
          {:ok, _module} -> {:ok, opts}
          :error -> {:error, "extractor must be \"builtin\" or a module that exists"}
        end
    end
  end

  def validate_opts(_), do: {:error, "options must be a map"}

  # Fetches `url` and extracts the article from the response. `redirects_left` is the number of
  # redirects that may still be followed before giving up.
  @spec get_article_content(term(), String.t(), non_neg_integer()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp get_article_content(url, extractor, redirects_left) when is_binary(url) and url != "" do
    Logger.debug("Getting article from #{url}")

    case HTTPoison.get(url) do
      {:ok, response} ->
        handle_response(url, response, extractor, redirects_left)

      {:error, %HTTPoison.Error{reason: reason}} ->
        # The reason is not always an atom (it can be a tuple), so it must not be interpolated
        # directly.
        {:error, "HTTPoison error: #{format_reason(reason)}"}
    end
  end

  defp get_article_content(_url, _extractor, _redirects_left) do
    {:error, "URL must be a non-empty string"}
  end

  # Turns an HTTP response into the scrape result: extracts the article on 200, follows 301/302
  # redirects, and reports every other status as an error.
  @spec handle_response(String.t(), HTTPoison.Response.t(), String.t(), non_neg_integer()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp handle_response(
         url,
         %HTTPoison.Response{status_code: 200, body: body},
         extractor,
         _redirects_left
       ) do
    # Anything other than {:ok, html} coming out of the extractor is returned to the caller as-is.
    with {:ok, html} <- extract_article(body, extractor) do
      html = Floki.map(html, rewrite_image_urls(URI.parse(url)))
      {:ok, render_html(html, extractor)}
    end
  end

  defp handle_response(
         url,
         %HTTPoison.Response{status_code: status_code, headers: headers},
         extractor,
         redirects_left
       )
       when status_code in [301, 302] do
    case find_header(headers, "location") do
      nil ->
        {:error, "Missing Location header for redirect"}

      _location when redirects_left <= 0 ->
        {:error, "Too many redirects"}

      location ->
        # The Location header may be relative to the URL that was just requested.
        new_url = merge_url(URI.parse(url), location)
        Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
        get_article_content(new_url, extractor, redirects_left - 1)
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor, _redirects_left) do
    {:error, "404 not found"}
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor, _redirects_left) do
    {:error, "403 Forbidden"}
  end

  defp handle_response(_url, %HTTPoison.Response{} = response, _extractor, _redirects_left) do
    {:error, "No handler for response #{inspect(response)}"}
  end

  # Runs the configured extractor over the raw response body.
  defp extract_article(body, "builtin"), do: {:ok, Readability.article(body)}

  defp extract_article(body, extractor) do
    with {:ok, module} <- resolve_extractor(extractor),
         true <- function_exported?(module, :extract, 1) do
      module.extract(Floki.parse(body))
    else
      _ -> {:error, "extractor #{inspect(extractor)} is not a module exporting extract/1"}
    end
  end

  # Serializes the extracted HTML tree with the renderer matching the extractor that produced it.
  defp render_html(html, "builtin"), do: Readability.readable_html(html)
  defp render_html(html, _extractor), do: Floki.raw_html(html)

  # Resolves an extractor name to a loadable module without ever creating a new atom.
  @spec resolve_extractor(term()) :: {:ok, module()} | :error
  defp resolve_extractor(name) when is_binary(name) do
    module = String.to_existing_atom("Elixir." <> name)
    if Code.ensure_loaded?(module), do: {:ok, module}, else: :error
  rescue
    # Raised by String.to_existing_atom/1 when no such atom exists.
    ArgumentError -> :error
  end

  defp resolve_extractor(_name), do: :error

  # Returns the value of the first header whose name equals `name` (already lowercase) when
  # compared case-insensitively, or nil when there is none.
  defp find_header(headers, name) do
    Enum.find_value(headers, fn {key, value} ->
      if is_binary(key) and String.downcase(key) == name, do: value
    end)
  end

  # Resolves `ref` against `base`. When `base` is not an absolute URL there is nothing to
  # resolve against (and URI.merge/2 would raise), so `ref` is returned unchanged.
  defp merge_url(%URI{scheme: scheme, host: host} = base, ref)
       when is_binary(scheme) and is_binary(host) do
    base |> URI.merge(ref) |> URI.to_string()
  end

  defp merge_url(_base, ref), do: ref

  # Renders an error reason for a log/error message. Strings and atoms keep their plain text
  # form; everything else is inspected because it does not implement String.Chars.
  defp format_reason(reason) when is_binary(reason), do: reason
  defp format_reason(reason) when is_atom(reason), do: Atom.to_string(reason)
  defp format_reason(reason), do: inspect(reason)

  # Generates a helper function for the article with the given URI that takes an HTML element
  # (as the `{tag, attributes}` pair passed by Floki.map/2) and, if it's an <img> element whose
  # src attribute has neither a scheme nor a hostname, resolves the src against the article URI.
  # Sources that already carry a scheme (including data: URIs) or a host are left untouched.
  defp rewrite_image_urls(%URI{} = page_uri) do
    fn
      {"img", attrs} = elem ->
        case List.keyfind(attrs, "src", 0) do
          {"src", src} when is_binary(src) ->
            case URI.parse(src) do
              %URI{scheme: nil, host: nil} ->
                new_src = merge_url(page_uri, src)
                {"img", List.keyreplace(attrs, "src", 0, {"src", new_src})}

              _ ->
                elem
            end

          _ ->
            elem
        end

      elem ->
        elem
    end
  end
end