From 37a802b7a88bacc9e3437ff4b3e9e85cdd22040f Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Sat, 6 Nov 2021 11:09:05 -0400 Subject: [PATCH] Don't put content from builtin extractor through readable_html twice --- lib/frenzy/pipeline/scrape_stage.ex | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex index b72cb97..c0d7768 100644 --- a/lib/frenzy/pipeline/scrape_stage.ex +++ b/lib/frenzy/pipeline/scrape_stage.ex @@ -96,6 +96,18 @@ defmodule Frenzy.Pipeline.ScrapeStage do try do apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree]) + |> case do + {:ok, content} -> + # non-builtin extractors go through readable_html to clean up any bad/untrusted html + # this is what Readability.readable_html does, without turning the tree back into a string + content = + Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs)) + + {:ok, content} + + err -> + err + end rescue e -> Logger.error( @@ -116,10 +128,6 @@ defmodule Frenzy.Pipeline.ScrapeStage do end |> case do {:ok, html} -> - # todo: probably don't need to go through readable_html if it used the builtin extractor - # this is what Floki.readable_html without turning back into a string - html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs)) - convert_to_data_uris = case opts["convert_to_data_uris"] do nil -> true