From e84ebc473aeed31927ae49a7a77832dfff25d3b1 Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Sat, 6 Nov 2021 10:10:49 -0400 Subject: [PATCH] Add support for external readability implementation --- config/config.exs | 1 + lib/frenzy/application.ex | 3 +- lib/frenzy/builtin_extractor.ex | 76 +++++++++++++++++++++++++++++ lib/frenzy/network.ex | 9 ++++ lib/frenzy/pipeline/scrape_stage.ex | 11 +++-- 5 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 lib/frenzy/builtin_extractor.ex diff --git a/config/config.exs b/config/config.exs index f5b8b26..5dc4544 100644 --- a/config/config.exs +++ b/config/config.exs @@ -31,6 +31,7 @@ config :phoenix, :json_library, Jason config :logger, truncate: :infinity config :frenzy, sentry_enabled: false +config :frenzy, external_readability: false # Import environment specific config. This must remain at the bottom # of this file so it overrides the configuration defined above. diff --git a/lib/frenzy/application.ex b/lib/frenzy/application.ex index e8900f5..74f9dd2 100644 --- a/lib/frenzy/application.ex +++ b/lib/frenzy/application.ex @@ -16,7 +16,8 @@ defmodule Frenzy.Application do FrenzyWeb.Endpoint, # Starts a worker by calling: Frenzy.Worker.start_link(arg) # {Frenzy.Worker, arg}, - {Frenzy.UpdateFeeds, name: Frenzy.UpdateFeeds} + {Frenzy.UpdateFeeds, name: Frenzy.UpdateFeeds}, + {Frenzy.BuiltinExtractor, name: Frenzy.BuiltinExtractor} ] # See https://hexdocs.pm/elixir/Supervisor.html diff --git a/lib/frenzy/builtin_extractor.ex b/lib/frenzy/builtin_extractor.ex new file mode 100644 index 0000000..b77b5f8 --- /dev/null +++ b/lib/frenzy/builtin_extractor.ex @@ -0,0 +1,76 @@ +defmodule Frenzy.BuiltinExtractor do + use GenServer + alias Frenzy.Network + require Logger + + @external_url Application.get_env(:frenzy, :external_readability_url) + + def start_link(state) do + GenServer.start_link(__MODULE__, :ok, state) + end + + @spec article(String.t(), String.t()) :: Floki.html_tree() + def article(url, html) do + GenServer.call(__MODULE__, {:article, url, html}) + end + + def init(_state) do + use_external = Application.get_env(:frenzy, :external_readability) + + use_external = + if use_external do + uri = URI.parse(@external_url) + uri = %URI{uri | path: "/status"} + uri = URI.to_string(uri) + + case Network.http_get(uri) do + {:ok, %Tesla.Env{status: 200}} -> + true + + _ -> + Logger.warn("Could not reach external readability for healthcheck, disabling") + false + end + else + false + end + + {:ok, use_external} + end + + def handle_call({:article, url, html}, _from, state) do + # the genserver state is a boolean telling us whether to use the external readability + if state do + uri = URI.parse(@external_url) + uri = %URI{uri | path: "/readability", query: URI.encode_query(url: url)} + uri = URI.to_string(uri) + + Logger.debug("Sending external readability request: #{uri}") + + case Network.http_post(uri, html, headers: [{"content-type", "text/html"}]) do + {:ok, %Tesla.Env{status: 200, body: body}} -> + {:reply, Floki.parse(body), state} + + {:ok, %Tesla.Env{status: status}} -> + Logger.error("External readability failed, got HTTP #{status}") + + if Frenzy.sentry_enabled?() do + Sentry.capture_message("External readability failed, got HTTP #{status}") + end + + {:reply, Readability.article(html), state} + + {:error, reason} -> + Logger.error("External readability failed: #{inspect(reason)}") + + if Frenzy.sentry_enabled?() do + Sentry.capture_message("External readability failed: #{inspect(reason)}") + end + + {:reply, Readability.article(html), state} + end + else + {:reply, Readability.article(html), state} + end + end +end diff --git a/lib/frenzy/network.ex b/lib/frenzy/network.ex index 80b271d..0ea6780 100644 --- a/lib/frenzy/network.ex +++ b/lib/frenzy/network.ex @@ -30,6 +30,15 @@ defmodule Frenzy.Network do HTTP.get(url) end + @spec http_post(String.t(), Tesla.Env.body(), [Tesla.option()]) :: Tesla.Env.result() + def http_post(url, body, options \\ []) do + if Frenzy.sentry_enabled?() do + Sentry.Context.add_breadcrumb(%{category: "http_get", message: url}) + end + + HTTP.post(url, body, options) + end + # @http_redirect_codes [301, 302] # @spec http_get(String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, term()} diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex index f7d3bc3..b72cb97 100644 --- a/lib/frenzy/pipeline/scrape_stage.ex +++ b/lib/frenzy/pipeline/scrape_stage.ex @@ -1,6 +1,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do require Logger alias Frenzy.Network + alias Frenzy.BuiltinExtractor alias Frenzy.Pipeline.Stage @behaviour Stage @@ -88,7 +89,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do defp handle_response(url, %Tesla.Env{body: body}, opts) do case opts["extractor"] do "builtin" -> - {:ok, Readability.article(body)} + {:ok, BuiltinExtractor.article(url, body)} module_name -> html_tree = Floki.parse(body) @@ -110,11 +111,15 @@ defmodule Frenzy.Pipeline.ScrapeStage do ) end - {:ok, Readability.article(body)} + {:ok, BuiltinExtractor.article(url, body)} end end |> case do {:ok, html} -> + # todo: probably don't need to go through readable_html if it used the builtin extractor + # this is what Floki.readable_html without turning back into a string + html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs)) + convert_to_data_uris = case opts["convert_to_data_uris"] do nil -> true @@ -123,7 +128,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url))) - {:ok, Readability.readable_html(html)} + {:ok, Floki.raw_html(html)} res -> res