Don't put content from builtin extractor through readable_html twice

2021-11-06 11:09:05 -04:00 · 2021-11-06 11:09:05 -04:00 · 7df1c5a6ba
parent fd4b3db6c0
commit 7df1c5a6ba
1 changed file with 14 additions and 6 deletions
--- a/lib/frenzy/pipeline/scrape_stage.ex
+++ b/lib/frenzy/pipeline/scrape_stage.ex
@@ -89,13 +89,25 @@ defmodule Frenzy.Pipeline.ScrapeStage do
  defp handle_response(url, %Tesla.Env{body: body}, opts) do
    case opts["extractor"] do
      "builtin" ->
-        {:ok, BuiltinExtractor.article(url, body)}
+        {:ok, BuiltinExtractor.article(url, body), true}

      module_name ->
        html_tree = Floki.parse(body)

        try do
          apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
+          |> case do
+            {:ok, content} ->
+              # non-builtin extractors go through readable_html to clean up any bad/untrusted html
+              # this does what Floki.readable_html does, without turning the tree back into a string
+              content =
+                Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs))
+
+              {:ok, content}
+
+            err ->
+              err
+          end
        rescue
          e ->
            Logger.error(
@@ -111,15 +123,11 @@ defmodule Frenzy.Pipeline.ScrapeStage do
              )
            end

-            {:ok, BuiltinExtractor.article(url, body)}
+            {:ok, BuiltinExtractor.article(url, body), true}
        end
    end
    |> case do
      {:ok, html} ->
-        # todo: probably don't need to go through readable_html if it used the builtin extractor
-        # this is what Floki.readable_html without turning back into a string
-        html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs))
-
        convert_to_data_uris =
          case opts["convert_to_data_uris"] do
            nil -> true