Don't put content from builtin extractor through readable_html twice
This commit is contained in:
parent
fd4b3db6c0
commit
7df1c5a6ba
|
@ -89,13 +89,25 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
|||
defp handle_response(url, %Tesla.Env{body: body}, opts) do
|
||||
case opts["extractor"] do
|
||||
"builtin" ->
|
||||
{:ok, BuiltinExtractor.article(url, body)}
|
||||
{:ok, BuiltinExtractor.article(url, body), true}
|
||||
|
||||
module_name ->
|
||||
html_tree = Floki.parse(body)
|
||||
|
||||
try do
|
||||
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|
||||
|> case do
|
||||
{:ok, content} ->
|
||||
# non-builtin extractors go through readable_html to cleanup any bad/untrusted html
|
||||
# this is what Floki.readable_html does, without turning the result back into a string
|
||||
content =
|
||||
Readability.Helper.remove_attrs(content, Readability.regexes(:protect_attrs))
|
||||
|
||||
{:ok, content}
|
||||
|
||||
err ->
|
||||
err
|
||||
end
|
||||
rescue
|
||||
e ->
|
||||
Logger.error(
|
||||
|
@ -111,15 +123,11 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
|||
)
|
||||
end
|
||||
|
||||
{:ok, BuiltinExtractor.article(url, body)}
|
||||
{:ok, BuiltinExtractor.article(url, body), true}
|
||||
end
|
||||
end
|
||||
|> case do
|
||||
{:ok, html} ->
|
||||
# todo: probably don't need to go through readable_html if it used the builtin extractor
|
||||
# this is what Floki.readable_html does, without turning the result back into a string
|
||||
html = Readability.Helper.remove_attrs(html, Readability.regexes(:protect_attrs))
|
||||
|
||||
convert_to_data_uris =
|
||||
case opts["convert_to_data_uris"] do
|
||||
nil -> true
|
||||
|
|
Loading…
Reference in New Issue