diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex index b72cb97..703c0c6 100644 --- a/lib/frenzy/pipeline/scrape_stage.ex +++ b/lib/frenzy/pipeline/scrape_stage.ex @@ -89,13 +89,25 @@ defmodule Frenzy.Pipeline.ScrapeStage do defp handle_response(url, %Tesla.Env{body: body}, opts) do case opts["extractor"] do "builtin" -> - {:ok, BuiltinExtractor.article(url, body)} + {:ok, BuiltinExtractor.article(url, body), true} module_name -> html_tree = Floki.parse(body) try do apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree]) + |> case do + {:ok, content} -> + # non-builtin extractors go through readable_html to clean up any bad/untrusted html + # this is what Floki.readable_html does, without turning the tree back into a string + content = + Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs)) + + {:ok, content} + + err -> + err + end rescue e -> Logger.error( @@ -111,15 +123,11 @@ defmodule Frenzy.Pipeline.ScrapeStage do ) end - {:ok, BuiltinExtractor.article(url, body)} + {:ok, BuiltinExtractor.article(url, body), true} end end |> case do {:ok, html} -> - # todo: probably don't need to go through readable_html if it used the builtin extractor - # this is what Floki.readable_html without turning back into a string - html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs)) - convert_to_data_uris = case opts["convert_to_data_uris"] do nil -> true