Don't put content from builtin extractor through readable_html twice
This commit is contained in:
parent
d2d4651f1d
commit
37a802b7a8
|
@ -96,6 +96,18 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
|||
|
||||
try do
|
||||
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|
||||
|> case do
|
||||
{:ok, content} ->
|
||||
# non-builtin extractors go through readable_html to clean up any bad/untrusted HTML
|
||||
# this is what Floki.readable_html does, without turning the result back into a string
|
||||
content =
|
||||
Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs))
|
||||
|
||||
{:ok, content}
|
||||
|
||||
err ->
|
||||
err
|
||||
end
|
||||
rescue
|
||||
e ->
|
||||
Logger.error(
|
||||
|
@ -116,10 +128,6 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
|||
end
|
||||
|> case do
|
||||
{:ok, html} ->
|
||||
# todo: probably don't need to go through readable_html if it used the builtin extractor
|
||||
# this is what Floki.readable_html does, without turning the result back into a string
|
||||
html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs))
|
||||
|
||||
convert_to_data_uris =
|
||||
case opts["convert_to_data_uris"] do
|
||||
nil -> true
|
||||
|
|
Loading…
Reference in New Issue