Replace site-specific pipeline stages with new extractor architecture

This commit is contained in:
Shadowfacts 2019-10-31 16:42:24 -04:00
parent dbef262779
commit 3192969889
Signed by: shadowfacts
GPG Key ID: 94A5AB95422746E5
6 changed files with 94 additions and 134 deletions

View File

@ -0,0 +1,3 @@
defmodule Frenzy.Pipeline.Extractor do
  @moduledoc """
  Behaviour implemented by modules that extract content from a page body.
  """

  @doc """
  Extracts content from `body`.

  Returns `{:ok, content}` with the extracted string on success, or
  `{:error, message}` with a description of why nothing could be extracted.
  """
  @callback extract(body :: String.t()) :: {:ok, String.t()} | {:error, String.t()}
end

View File

@ -0,0 +1,38 @@
defmodule Frenzy.Pipeline.Extractor.DaringFireball do
  @moduledoc """
  Extractor for Daring Fireball pages.

  Looks for the first `div.article` element (with its heading, dateline and
  previous/next navigation removed) and, if there is none, falls back to the
  first `dd` inside `dl.linkedlist`.
  """

  alias Frenzy.Pipeline.Extractor

  @behaviour Extractor

  @impl Extractor
  def extract(body) do
    document = Floki.parse(body)
    selected = article_content(document) || linked_item_content(document)

    to_result(selected)
  end

  # Wraps the selected node (if any) in the tagged tuple required by the behaviour.
  defp to_result(nil), do: {:error, "no matching elements"}
  defp to_result(node), do: {:ok, Floki.raw_html(node)}

  # First `div.article` element with the extra non-content children stripped, or nil.
  defp article_content(document) do
    document
    |> Floki.find("div.article")
    |> strip_article_extras()
  end

  # articles include extra information in the div.article element
  defp strip_article_extras([article | _]) do
    Floki.filter_out(article, "h1, .dateline, #PreviousNext")
  end

  defp strip_article_extras(_), do: nil

  # First `dd` of the linked list, or nil when the page has none.
  defp linked_item_content(document) do
    document
    |> Floki.find("dl.linkedlist dd")
    |> first_or_nil()
  end

  defp first_or_nil([first | _]), do: first
  defp first_or_nil(_), do: nil
end

View File

@ -4,8 +4,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do
@behaviour Stage @behaviour Stage
@impl Stage @impl Stage
def apply(_opts, %{url: url} = item_params) do def apply(opts, %{url: url} = item_params) do
case get_article_content(url) do case get_article_content(url, opts["extractor"]) do
{:ok, content} -> {:ok, content} ->
{:ok, %{item_params | content: content}} {:ok, %{item_params | content: content}}
@ -16,17 +16,41 @@ defmodule Frenzy.Pipeline.ScrapeStage do
end end
@impl Stage @impl Stage
def validate_opts(opts), do: {:ok, opts} def validate_opts(opts) when is_map(opts) do
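# todo: this raises KeyError whenever "extractor" is absent (including for an empty map), because %{opts | extractor: ...} only updates an existing :extractor atom key; use Map.put(opts, "extractor", "builtin") instead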
case opts["extractor"] do
nil ->
{:ok, %{opts | extractor: "builtin"}}
@spec get_article_content(String.t()) :: {:ok, String.t()} | {:error, String.t()} extractor when not is_binary(extractor) ->
defp get_article_content(url) when is_binary(url) and url != "" do {:error, "extractor must be a string"}
"builtin" ->
{:ok, opts}
extractor ->
try do
String.to_existing_atom("Elixir." <> extractor)
{:ok, opts}
rescue
ArgumentError ->
{:error, "extractor must be \"builtin\" or a module that exists"}
end
end
end
@impl Stage
def validate_opts(_), do: {:error, "options must be a map"}
@spec get_article_content(String.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()}
defp get_article_content(url, extractor) when is_binary(url) and url != "" do
Logger.debug("Getting article from #{url}") Logger.debug("Getting article from #{url}")
url url
|> HTTPoison.get() |> HTTPoison.get()
|> case do |> case do
{:ok, response} -> {:ok, response} ->
handle_response(url, response) handle_response(url, response, extractor)
{:error, %HTTPoison.Error{reason: reason}} -> {:error, %HTTPoison.Error{reason: reason}} ->
{:error, "HTTPoison error: #{reason}"} {:error, "HTTPoison error: #{reason}"}
@ -35,38 +59,46 @@ defmodule Frenzy.Pipeline.ScrapeStage do
defp get_article_content(_url), do: {:error, "URL must be a non-empty string"} defp get_article_content(_url), do: {:error, "URL must be a non-empty string"}
@spec handle_response(String.t(), HTTPoison.Response.t()) :: @spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
{:ok, String.t()} | {:error, String.t()} {:ok, String.t()} | {:error, String.t()}
defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}) do defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
article = Readability.article(body) case extractor do
{:ok, Readability.readable_html(article)} "builtin" ->
article = Readability.article(body)
{:ok, Readability.readable_html(article)}
module_name ->
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [body])
end
end end
defp handle_response(_url, %HTTPoison.Response{status_code: 404}) do defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor) do
{:error, "404 not found"} {:error, "404 not found"}
end end
defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers}) defp handle_response(
url,
%HTTPoison.Response{status_code: status_code, headers: headers},
extractor
)
when status_code in [301, 302] do when status_code in [301, 302] do
{"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end)
headers headers
|> Enum.find(fn {name, _value} -> name == "Location" end) |> Enum.find(fn {name, _value} -> name == "Location" end)
|> case do |> case do
{"Location", new_url} -> {"Location", new_url} ->
Logger.debug("Got 301 redirect from #{url} to #{new_url}") Logger.debug("Got 301 redirect from #{url} to #{new_url}")
get_article_content(new_url) get_article_content(new_url, extractor)
_ -> _ ->
{:error, "Missing Location header for redirect"} {:error, "Missing Location header for redirect"}
end end
end end
defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor) do
{:error, "403 Forbidden"} {:error, "403 Forbidden"}
end end
defp handle_response(_url, %HTTPoison.Response{status_code: status_code} = response) do defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
{:error, "No handler for response #{inspect(response)}"} {:error, "No handler for response #{inspect(response)}"}
end end
end end

View File

@ -1,105 +0,0 @@
defmodule Frenzy.Pipeline.Site.DaringFireballScrapeStage do
  @moduledoc """
  Pipeline stage that fetches the page at an item's URL and replaces the item's
  content with the Daring Fireball article (or linked-list entry) found in it.

  When the content cannot be retrieved or no matching element is found, a
  warning is logged and the item is passed through unchanged.
  """

  require Logger
  alias Frenzy.Pipeline.Stage

  @behaviour Stage

  @doc """
  Fetches the article for `item_params.url` and stores it under `:content`.

  Always returns `{:ok, item_params}`; on failure the params are returned
  unmodified after logging a warning.
  """
  @impl Stage
  def apply(_opts, %{url: url} = item_params) do
    case get_article_content(url) do
      {:ok, content} ->
        {:ok, %{item_params | content: content}}

      {:error, reason} ->
        # Scraping is best-effort: keep the original item rather than failing the pipeline.
        Logger.warn("Unable to get Daring Fireball article content for #{url}: #{reason}")
        {:ok, item_params}
    end
  end

  @doc """
  Accepts any options; this stage does not use them.
  """
  @impl Stage
  def validate_opts(opts), do: {:ok, opts}

  # Downloads `url` and hands the response to `handle_response/2`.
  @spec get_article_content(String.t()) :: {:ok, String.t()} | {:error, String.t()}
  defp get_article_content(url) when is_binary(url) and url != "" do
    Logger.debug("Get Daring Fireball article from #{url}")

    case HTTPoison.get(url) do
      {:ok, response} ->
        handle_response(url, response)

      {:error, %HTTPoison.Error{reason: reason}} ->
        # `reason` is not guaranteed to implement String.Chars (it may be a tuple),
        # so inspect it instead of interpolating it directly.
        {:error, "HTTPoison error: #{inspect(reason)}"}
    end
  end

  defp get_article_content(_url), do: {:error, "URL must be a non-empty string"}

  # Turns an HTTP response into the extracted article HTML or an error message.
  @spec handle_response(String.t(), HTTPoison.Response.t()) ::
          {:ok, String.t()} | {:error, String.t()}
  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}) do
    html_tree = Floki.parse(body)

    case get_article_element(html_tree) || get_link_element(html_tree) do
      nil ->
        {:error, "no matching element"}

      elem ->
        readable_html =
          elem
          |> Floki.filter_out(:comment)
          |> Readability.readable_html()

        {:ok, readable_html}
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 404}) do
    {:error, "404 not found"}
  end

  # NOTE: redirects are followed recursively without a hop limit.
  defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers})
       when status_code in [301, 302] do
    # Header names are case-insensitive, so do not rely on the server's capitalisation.
    # No assertive match here: a missing header must reach the error branch below
    # instead of raising a MatchError.
    location =
      Enum.find_value(headers, fn {name, value} ->
        if String.downcase(name) == "location", do: value
      end)

    case location do
      nil ->
        {:error, "Missing Location header for redirect"}

      new_url ->
        Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
        get_article_content(new_url)
    end
  end

  defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
    {:error, "403 Forbidden"}
  end

  defp handle_response(_url, %HTTPoison.Response{} = response) do
    {:error, "No handler for response #{inspect(response)}"}
  end

  # Returns the first `div.article` element with non-content children removed, or nil.
  defp get_article_element(html_tree) do
    case Floki.find(html_tree, "div.article") do
      [article_elem | _] ->
        # articles include extra information in the div.article element
        Floki.filter_out(article_elem, "h1, .dateline, #PreviousNext")

      _ ->
        nil
    end
  end

  # Returns the first `dd` inside `dl.linkedlist`, or nil.
  defp get_link_element(html_tree) do
    case Floki.find(html_tree, "dl.linkedlist dd") do
      [dd_elem | _] ->
        dd_elem

      _ ->
        nil
    end
  end
end

View File

@ -1,6 +1,6 @@
defmodule FrenzyWeb.AccountController do defmodule FrenzyWeb.AccountController do
use FrenzyWeb, :controller use FrenzyWeb, :controller
alias Frenzy.{Repo, User, FervorClient, Filter} alias Frenzy.{Repo, User, FervorClient}
alias FrenzyWeb.Router.Helpers, as: Routes alias FrenzyWeb.Router.Helpers, as: Routes
alias FrenzyWeb.Endpoint alias FrenzyWeb.Endpoint
@ -125,11 +125,7 @@ defmodule FrenzyWeb.AccountController do
Enum.each(feeds, fn feed_url -> Enum.each(feeds, fn feed_url ->
feed_changeset = feed_changeset =
Ecto.build_assoc(group, :feeds, %{ Ecto.build_assoc(group, :feeds, %{
feed_url: feed_url, feed_url: feed_url
filter: %Filter{
mode: "reject",
score: 0
}
}) })
{:ok, _feed} = Repo.insert(feed_changeset) {:ok, _feed} = Repo.insert(feed_changeset)

View File

@ -1,6 +1,6 @@
defmodule FrenzyWeb.Fervor.FeedsController do defmodule FrenzyWeb.Fervor.FeedsController do
use FrenzyWeb, :controller use FrenzyWeb, :controller
alias Frenzy.{Repo, Feed, Filter, Item} alias Frenzy.{Repo, Feed, Item}
import Ecto.Query import Ecto.Query
alias FrenzyWeb.Fervor.Paginator alias FrenzyWeb.Fervor.Paginator
@ -85,11 +85,7 @@ defmodule FrenzyWeb.Fervor.FeedsController do
group -> group ->
changeset = changeset =
Ecto.build_assoc(group, :feeds, %{ Ecto.build_assoc(group, :feeds, %{
feed_url: feed_url, feed_url: feed_url
filter: %Filter{
mode: "reject",
score: 0
}
}) })
{:ok, feed} = Repo.insert(changeset) {:ok, feed} = Repo.insert(changeset)