Don't put content from builtin extractor through readable_html twice

2021-11-06 11:09:05 -04:00 · 2021-11-06 11:09:05 -04:00 · 37a802b7a8
commit 37a802b7a8
parent d2d4651f1d
1 changed file with 12 additions and 4 deletions
--- a/lib/frenzy/pipeline/scrape_stage.ex
+++ b/lib/frenzy/pipeline/scrape_stage.ex
@@ -96,6 +96,18 @@ defmodule Frenzy.Pipeline.ScrapeStage do

        try do
          apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
+          |> case do
+            {:ok, content} ->
+              # non-builtin extractors go through readable_html to clean up any bad/untrusted html
+              # this is what Floki.readable_html does, without turning the result back into a string
+              content =
+                Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs))
+
+              {:ok, content}
+
+            err ->
+              err
+          end
        rescue
          e ->
            Logger.error(
@@ -116,10 +128,6 @@ defmodule Frenzy.Pipeline.ScrapeStage do
    end
    |> case do
      {:ok, html} ->
-        # todo: probably don't need to go through readable_html if it used the builtin extractor
-        # this is what Floki.readable_html without turning back into a string
-        html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs))
-
        convert_to_data_uris =
          case opts["convert_to_data_uris"] do
            nil -> true