From 31929698897515cfcb27bc29ef84972bf1f9107e Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Thu, 31 Oct 2019 16:42:24 -0400 Subject: [PATCH] Replace site-specific pipeline stages with new extractor architecture --- lib/frenzy/pipeline/extractor.ex | 3 + .../pipeline/extractor/daring_fireball.ex | 38 +++++++ lib/frenzy/pipeline/scrape_stage.ex | 66 ++++++++--- .../site/daring_fireball_scrape_stage.ex | 105 ------------------ .../controllers/account_controller.ex | 8 +- .../controllers/fervor/feeds_controller.ex | 8 +- 6 files changed, 94 insertions(+), 134 deletions(-) create mode 100644 lib/frenzy/pipeline/extractor.ex create mode 100644 lib/frenzy/pipeline/extractor/daring_fireball.ex delete mode 100644 lib/frenzy/pipeline/site/daring_fireball_scrape_stage.ex diff --git a/lib/frenzy/pipeline/extractor.ex b/lib/frenzy/pipeline/extractor.ex new file mode 100644 index 0000000..c160c8b --- /dev/null +++ b/lib/frenzy/pipeline/extractor.ex @@ -0,0 +1,3 @@ +defmodule Frenzy.Pipeline.Extractor do + @callback extract(String.t()) :: {:ok, String.t()} | {:error, String.t()} +end diff --git a/lib/frenzy/pipeline/extractor/daring_fireball.ex b/lib/frenzy/pipeline/extractor/daring_fireball.ex new file mode 100644 index 0000000..d668f6d --- /dev/null +++ b/lib/frenzy/pipeline/extractor/daring_fireball.ex @@ -0,0 +1,38 @@ +defmodule Frenzy.Pipeline.Extractor.DaringFireball do + alias Frenzy.Pipeline.Extractor + @behaviour Extractor + + @impl Extractor + def extract(body) do + html_tree = Floki.parse(body) + + case get_article_element(html_tree) || get_link_element(html_tree) do + nil -> + {:error, "no matching elements"} + + elem -> + {:ok, Floki.raw_html(elem)} + end + end + + defp get_article_element(html_tree) do + case Floki.find(html_tree, "div.article") do + [article_elem | _] -> + # articles include extra information in the div.article element + Floki.filter_out(article_elem, "h1, .dateline, #PreviousNext") + + _ -> + nil + end + end + + defp get_link_element(html_tree) 
do
+    case Floki.find(html_tree, "dl.linkedlist dd") do
+      [dd_elem | _] ->
+        dd_elem
+
+      _ ->
+        nil
+    end
+  end
+end
diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex
index 6295080..61834e0 100644
--- a/lib/frenzy/pipeline/scrape_stage.ex
+++ b/lib/frenzy/pipeline/scrape_stage.ex
@@ -4,8 +4,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do
   @behaviour Stage
 
   @impl Stage
-  def apply(_opts, %{url: url} = item_params) do
-    case get_article_content(url) do
+  def apply(opts, %{url: url} = item_params) do
+    case get_article_content(url, opts["extractor"]) do
       {:ok, content} ->
         {:ok, %{item_params | content: content}}
 
@@ -16,17 +16,41 @@ defmodule Frenzy.Pipeline.ScrapeStage do
   end
 
   @impl Stage
-  def validate_opts(opts), do: {:ok, opts}
+  def validate_opts(opts) when is_map(opts) do
+    # Map.put, not %{opts | ...}: update syntax raises KeyError when the key is absent
+    case opts["extractor"] do
+      nil ->
+        {:ok, Map.put(opts, "extractor", "builtin")}
 
-  @spec get_article_content(String.t()) :: {:ok, String.t()} | {:error, String.t()}
-  defp get_article_content(url) when is_binary(url) and url != "" do
+      extractor when not is_binary(extractor) ->
+        {:error, "extractor must be a string"}
+
+      "builtin" ->
+        {:ok, opts}
+
+      extractor ->
+        try do
+          String.to_existing_atom("Elixir." 
<> extractor)
+          {:ok, opts}
+        rescue
+          ArgumentError ->
+            {:error, "extractor must be \"builtin\" or a module that exists"}
+        end
+    end
+  end
+
+  @impl Stage
+  def validate_opts(_), do: {:error, "options must be a map"}
+
+  @spec get_article_content(String.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()}
+  defp get_article_content(url, extractor) when is_binary(url) and url != "" do
     Logger.debug("Getting article from #{url}")
 
     url
     |> HTTPoison.get()
     |> case do
       {:ok, response} ->
-        handle_response(url, response)
+        handle_response(url, response, extractor)
 
       {:error, %HTTPoison.Error{reason: reason}} ->
         {:error, "HTTPoison error: #{reason}"}
@@ -35,38 +59,46 @@
 
-  defp get_article_content(_url), do: {:error, "URL must be a non-empty string"}
+  defp get_article_content(_url, _extractor), do: {:error, "URL must be a non-empty string"}
 
-  @spec handle_response(String.t(), HTTPoison.Response.t()) ::
+  @spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
           {:ok, String.t()} | {:error, String.t()}
-  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}) do
-    article = Readability.article(body)
-    {:ok, Readability.readable_html(article)}
+  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
+    case extractor do
+      "builtin" ->
+        article = Readability.article(body)
+        {:ok, Readability.readable_html(article)}
+
+      module_name ->
+        apply(String.to_existing_atom("Elixir." 
<> module_name), :extract, [body]) + end end - defp handle_response(_url, %HTTPoison.Response{status_code: 404}) do + defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor) do {:error, "404 not found"} end - defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers}) + defp handle_response( + url, + %HTTPoison.Response{status_code: status_code, headers: headers}, + extractor + ) when status_code in [301, 302] do - {"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end) - headers |> Enum.find(fn {name, _value} -> name == "Location" end) |> case do {"Location", new_url} -> Logger.debug("Got 301 redirect from #{url} to #{new_url}") - get_article_content(new_url) + get_article_content(new_url, extractor) _ -> {:error, "Missing Location header for redirect"} end end - defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do + defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor) do {:error, "403 Forbidden"} end - defp handle_response(_url, %HTTPoison.Response{status_code: status_code} = response) do + defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do {:error, "No handler for response #{inspect(response)}"} end end diff --git a/lib/frenzy/pipeline/site/daring_fireball_scrape_stage.ex b/lib/frenzy/pipeline/site/daring_fireball_scrape_stage.ex deleted file mode 100644 index 11d511e..0000000 --- a/lib/frenzy/pipeline/site/daring_fireball_scrape_stage.ex +++ /dev/null @@ -1,105 +0,0 @@ -defmodule Frenzy.Pipeline.Site.DaringFireballScrapeStage do - require Logger - alias Frenzy.Pipeline.Stage - @behaviour Stage - - @impl Stage - def apply(_opts, %{url: url} = item_params) do - case get_article_content(url) do - {:ok, content} -> - {:ok, %{item_params | content: content}} - - {:error, reason} -> - Logger.warn("Unable to get Daring Fireball article content for #{url}: #{reason}") - {:ok, item_params} - end - end - - @impl Stage - 
def validate_opts(opts), do: {:ok, opts} - - @spec get_article_content(String.t()) :: {:ok, String.t()} | {:error, String.t()} - defp get_article_content(url) when is_binary(url) and url != "" do - Logger.debug("Get Daring Fireball article from #{url}") - - url - |> HTTPoison.get() - |> case do - {:ok, response} -> - handle_response(url, response) - - {:error, %HTTPoison.Error{reason: reason}} -> - {:error, "HTTPoison error: #{reason}"} - end - end - - defp get_article_content(_url), do: {:error, "URL must be a non-empty string"} - - @spec handle_response(String.t(), HTTPoison.Response.t()) :: - {:ok, String.t()} | {:error, String.t()} - defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}) do - html_tree = Floki.parse(body) - - case get_article_element(html_tree) || get_link_element(html_tree) do - nil -> - {:error, "no matching element"} - - elem -> - readable_html = - elem - |> Floki.filter_out(:comment) - |> Readability.readable_html() - - {:ok, readable_html} - end - end - - defp handle_response(_url, %HTTPoison.Response{status_code: 404}) do - {:error, "404 not found"} - end - - defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers}) - when status_code in [301, 302] do - {"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end) - - headers - |> Enum.find(fn {name, _value} -> name == "Location" end) - |> case do - {"Location", new_url} -> - Logger.debug("Got 301 redirect from #{url} to #{new_url}") - get_article_content(new_url) - - _ -> - {:error, "Missing Location header for redirect"} - end - end - - defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do - {:error, "403 Forbidden"} - end - - defp handle_response(_url, %HTTPoison.Response{status_code: status_code} = response) do - {:error, "No handler for response #{inspect(response)}"} - end - - defp get_article_element(html_tree) do - case Floki.find(html_tree, "div.article") do - [article_elem | _] -> - # 
articles include extra information in the div.article element - Floki.filter_out(article_elem, "h1, .dateline, #PreviousNext") - - _ -> - nil - end - end - - defp get_link_element(html_tree) do - case Floki.find(html_tree, "dl.linkedlist dd") do - [dd_elem | _] -> - dd_elem - - _ -> - nil - end - end -end diff --git a/lib/frenzy_web/controllers/account_controller.ex b/lib/frenzy_web/controllers/account_controller.ex index 2343d39..7925040 100644 --- a/lib/frenzy_web/controllers/account_controller.ex +++ b/lib/frenzy_web/controllers/account_controller.ex @@ -1,6 +1,6 @@ defmodule FrenzyWeb.AccountController do use FrenzyWeb, :controller - alias Frenzy.{Repo, User, FervorClient, Filter} + alias Frenzy.{Repo, User, FervorClient} alias FrenzyWeb.Router.Helpers, as: Routes alias FrenzyWeb.Endpoint @@ -125,11 +125,7 @@ defmodule FrenzyWeb.AccountController do Enum.each(feeds, fn feed_url -> feed_changeset = Ecto.build_assoc(group, :feeds, %{ - feed_url: feed_url, - filter: %Filter{ - mode: "reject", - score: 0 - } + feed_url: feed_url }) {:ok, _feed} = Repo.insert(feed_changeset) diff --git a/lib/frenzy_web/controllers/fervor/feeds_controller.ex b/lib/frenzy_web/controllers/fervor/feeds_controller.ex index 4c0df73..9af3b32 100644 --- a/lib/frenzy_web/controllers/fervor/feeds_controller.ex +++ b/lib/frenzy_web/controllers/fervor/feeds_controller.ex @@ -1,6 +1,6 @@ defmodule FrenzyWeb.Fervor.FeedsController do use FrenzyWeb, :controller - alias Frenzy.{Repo, Feed, Filter, Item} + alias Frenzy.{Repo, Feed, Item} import Ecto.Query alias FrenzyWeb.Fervor.Paginator @@ -85,11 +85,7 @@ defmodule FrenzyWeb.Fervor.FeedsController do group -> changeset = Ecto.build_assoc(group, :feeds, %{ - feed_url: feed_url, - filter: %Filter{ - mode: "reject", - score: 0 - } + feed_url: feed_url }) {:ok, feed} = Repo.insert(changeset)