Don't put content from builtin extractor through readable_html twice

This commit is contained in:
Shadowfacts 2021-11-06 11:09:05 -04:00
parent d2d4651f1d
commit 37a802b7a8
1 changed files with 12 additions and 4 deletions

View File

@ -96,6 +96,18 @@ defmodule Frenzy.Pipeline.ScrapeStage do
try do
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|> case do
{:ok, content} ->
# non-builtin extractors go through readable_html to clean up any bad/untrusted html
# this is what Floki.readable_html does, without turning the result back into a string
content =
Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs))
{:ok, content}
err ->
err
end
rescue
e ->
Logger.error(
@ -116,10 +128,6 @@ defmodule Frenzy.Pipeline.ScrapeStage do
end
|> case do
{:ok, html} ->
# todo: probably don't need to go through readable_html if it used the builtin extractor
# this is what Floki.readable_html does, without turning the result back into a string
html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs))
convert_to_data_uris =
case opts["convert_to_data_uris"] do
nil -> true