Add support for external readability implementation

This commit is contained in:
Shadowfacts 2021-11-06 10:10:49 -04:00
parent f1435611ef
commit e84ebc473a
5 changed files with 96 additions and 4 deletions

View File

@ -31,6 +31,7 @@ config :phoenix, :json_library, Jason
config :logger, truncate: :infinity config :logger, truncate: :infinity
config :frenzy, sentry_enabled: false config :frenzy, sentry_enabled: false
config :frenzy, external_readability: false
# Import environment specific config. This must remain at the bottom # Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above. # of this file so it overrides the configuration defined above.

View File

@ -16,7 +16,8 @@ defmodule Frenzy.Application do
FrenzyWeb.Endpoint, FrenzyWeb.Endpoint,
# Starts a worker by calling: Frenzy.Worker.start_link(arg) # Starts a worker by calling: Frenzy.Worker.start_link(arg)
# {Frenzy.Worker, arg}, # {Frenzy.Worker, arg},
{Frenzy.UpdateFeeds, name: Frenzy.UpdateFeeds} {Frenzy.UpdateFeeds, name: Frenzy.UpdateFeeds},
{Frenzy.BuiltinExtractor, name: Frenzy.BuiltinExtractor}
] ]
# See https://hexdocs.pm/elixir/Supervisor.html # See https://hexdocs.pm/elixir/Supervisor.html

View File

@ -0,0 +1,76 @@
defmodule Frenzy.BuiltinExtractor do
  @moduledoc """
  Extracts the readable article content from an HTML page.

  When `config :frenzy, external_readability: true` is set and the service at
  `config :frenzy, external_readability_url: "..."` answers its `/status`
  healthcheck with HTTP 200 at startup, extraction is delegated to that
  external service. In every other case (disabled, URL missing, healthcheck
  failed, or a request to the service failing later on) the in-process
  `Readability` library is used instead.

  Every extraction goes through `GenServer.call/2` on this process.
  """

  use GenServer
  alias Frenzy.Network
  require Logger

  @doc """
  Starts the extractor.

  `opts` are passed straight to `GenServer.start_link/3` as its options;
  `article/2` expects the process to be registered as `#{inspect(__MODULE__)}`
  (i.e. `name: #{inspect(__MODULE__)}`).
  """
  def start_link(opts) do
    GenServer.start_link(__MODULE__, :ok, opts)
  end

  @doc """
  Extracts the article from `html`, which was fetched from `url`.

  Returns a Floki HTML tree. `url` is only used when the external service is
  active, where it is forwarded as the `url` query parameter.
  """
  @spec article(String.t(), String.t()) :: Floki.html_tree()
  def article(url, html) do
    GenServer.call(__MODULE__, {:article, url, html})
  end

  # The server state is either `:builtin` or `{:external, base_uri}`, where
  # `base_uri` is the parsed `:external_readability_url`.
  @impl true
  def init(_arg) do
    {:ok, initial_mode()}
  end

  @impl true
  def handle_call({:article, _url, html}, _from, :builtin = state) do
    {:reply, Readability.article(html), state}
  end

  def handle_call({:article, url, html}, _from, {:external, %URI{} = base} = state) do
    request_uri = %URI{base | path: "/readability", query: URI.encode_query(url: url)}
    request_url = URI.to_string(request_uri)

    Logger.debug("Sending external readability request: #{request_url}")

    case Network.http_post(request_url, html, headers: [{"content-type", "text/html"}]) do
      {:ok, %Tesla.Env{status: 200, body: body}} ->
        {:reply, Floki.parse(body), state}

      {:ok, %Tesla.Env{status: status}} ->
        report_failure("External readability failed, got HTTP #{status}")
        # Fall back to the in-process extractor for this request only; the
        # external service stays enabled for subsequent requests.
        {:reply, Readability.article(html), state}

      {:error, reason} ->
        report_failure("External readability failed: #{inspect(reason)}")
        {:reply, Readability.article(html), state}
    end
  end

  # Decides, once at startup, whether the external service should be used.
  # Configuration is read at runtime so that values set after compilation
  # (e.g. in a release's runtime config) are honoured.
  defp initial_mode do
    use_external = Application.get_env(:frenzy, :external_readability)
    base_url = Application.get_env(:frenzy, :external_readability_url)

    cond do
      !use_external ->
        :builtin

      not is_binary(base_url) ->
        Logger.warn(
          "External readability is enabled but :external_readability_url is not set, disabling"
        )

        :builtin

      true ->
        base = URI.parse(base_url)

        if healthy?(base) do
          {:external, base}
        else
          Logger.warn("Could not reach external readability for healthcheck, disabling")
          :builtin
        end
    end
  end

  # True when GET <base>/status responds with HTTP 200.
  defp healthy?(%URI{} = base) do
    status_url = URI.to_string(%URI{base | path: "/status"})
    match?({:ok, %Tesla.Env{status: 200}}, Network.http_get(status_url))
  end

  # Logs an extraction failure and, when Sentry is enabled, reports it there too.
  defp report_failure(message) do
    Logger.error(message)

    if Frenzy.sentry_enabled?() do
      Sentry.capture_message(message)
    end

    :ok
  end
end

View File

@ -30,6 +30,15 @@ defmodule Frenzy.Network do
HTTP.get(url) HTTP.get(url)
end end
@doc """
Sends an HTTP POST request with `body` to `url`.

`options` are passed through unchanged to `HTTP.post/3`. When Sentry is
enabled, a breadcrumb recording the requested URL is added first.
"""
@spec http_post(String.t(), Tesla.Env.body(), [Tesla.option()]) :: Tesla.Env.result()
def http_post(url, body, options \\ []) do
  if Frenzy.sentry_enabled?() do
    # Use a POST-specific category so these breadcrumbs are distinguishable
    # from the ones recorded for GET requests.
    Sentry.Context.add_breadcrumb(%{category: "http_post", message: url})
  end

  HTTP.post(url, body, options)
end
# @http_redirect_codes [301, 302] # @http_redirect_codes [301, 302]
# @spec http_get(String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, term()} # @spec http_get(String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, term()}

View File

@ -1,6 +1,7 @@
defmodule Frenzy.Pipeline.ScrapeStage do defmodule Frenzy.Pipeline.ScrapeStage do
require Logger require Logger
alias Frenzy.Network alias Frenzy.Network
alias Frenzy.BuiltinExtractor
alias Frenzy.Pipeline.Stage alias Frenzy.Pipeline.Stage
@behaviour Stage @behaviour Stage
@ -88,7 +89,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
defp handle_response(url, %Tesla.Env{body: body}, opts) do defp handle_response(url, %Tesla.Env{body: body}, opts) do
case opts["extractor"] do case opts["extractor"] do
"builtin" -> "builtin" ->
{:ok, Readability.article(body)} {:ok, BuiltinExtractor.article(url, body)}
module_name -> module_name ->
html_tree = Floki.parse(body) html_tree = Floki.parse(body)
@ -110,11 +111,15 @@ defmodule Frenzy.Pipeline.ScrapeStage do
) )
end end
{:ok, Readability.article(body)} {:ok, BuiltinExtractor.article(url, body)}
end end
end end
|> case do |> case do
{:ok, html} -> {:ok, html} ->
# todo: probably don't need to go through readable_html if it used the builtin extractor
# this is what Readability.readable_html does, minus turning the tree back into a string
html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs))
convert_to_data_uris = convert_to_data_uris =
case opts["convert_to_data_uris"] do case opts["convert_to_data_uris"] do
nil -> true nil -> true
@ -123,7 +128,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url))) html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url)))
{:ok, Readability.readable_html(html)} {:ok, Floki.raw_html(html)}
res -> res ->
res res