Don't put content from builtin extractor through readable_html twice

This commit is contained in:
Shadowfacts 2021-11-06 11:09:05 -04:00
parent fd4b3db6c0
commit 7df1c5a6ba
1 changed file with 14 additions and 6 deletions

View File

@ -89,13 +89,25 @@ defmodule Frenzy.Pipeline.ScrapeStage do
defp handle_response(url, %Tesla.Env{body: body}, opts) do
case opts["extractor"] do
"builtin" ->
{:ok, BuiltinExtractor.article(url, body)}
{:ok, BuiltinExtractor.article(url, body), true}
module_name ->
html_tree = Floki.parse(body)
try do
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|> case do
{:ok, content} ->
# non-builtin extractors go through readable_html to cleanup any bad/untrusted html
# this is what Floki.readable_html without turning back into a string
content =
Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs))
{:ok, content}
err ->
err
end
rescue
e ->
Logger.error(
@ -111,15 +123,11 @@ defmodule Frenzy.Pipeline.ScrapeStage do
)
end
{:ok, BuiltinExtractor.article(url, body)}
{:ok, BuiltinExtractor.article(url, body), true}
end
end
|> case do
{:ok, html} ->
# todo: probably don't need to go through readable_html if it used the builtin extractor
# this is what Floki.readable_html without turning back into a string
html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs))
convert_to_data_uris =
case opts["convert_to_data_uris"] do
nil -> true