Don't put content from builtin extractor through readable_html twice
This commit is contained in:
parent
d2d4651f1d
commit
37a802b7a8
|
@ -96,6 +96,18 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
|
|
||||||
try do
|
try do
|
||||||
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|
||||||
|
|> case do
|
||||||
|
{:ok, content} ->
|
||||||
|
# non-builtin extractors go through readable_html to clean up any bad/untrusted html
|
||||||
|
# this is what Floki.readable_html does, without turning it back into a string
|
||||||
|
content =
|
||||||
|
Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs))
|
||||||
|
|
||||||
|
{:ok, content}
|
||||||
|
|
||||||
|
err ->
|
||||||
|
err
|
||||||
|
end
|
||||||
rescue
|
rescue
|
||||||
e ->
|
e ->
|
||||||
Logger.error(
|
Logger.error(
|
||||||
|
@ -116,10 +128,6 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
end
|
end
|
||||||
|> case do
|
|> case do
|
||||||
{:ok, html} ->
|
{:ok, html} ->
|
||||||
# todo: probably don't need to go through readable_html if it used the builtin extractor
|
|
||||||
# this is what Floki.readable_html does, without turning it back into a string
|
|
||||||
html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs))
|
|
||||||
|
|
||||||
convert_to_data_uris =
|
convert_to_data_uris =
|
||||||
case opts["convert_to_data_uris"] do
|
case opts["convert_to_data_uris"] do
|
||||||
nil -> true
|
nil -> true
|
||||||
|
|
Loading…
Reference in New Issue