Rewrite image URLs without hosts to use the host of the article URL
This commit is contained in:
parent
eec0b918e7
commit
cfd9f7505a
@ -61,23 +61,29 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||
|
||||
# Success clause: handles an HTTP 200 response by running the configured extractor over the
# response body and returning `{:ok, html}` or the extractor's error tuple.
# NOTE(review): this span is a rendered diff whose +/- markers and indentation were lost, so
# removed and added lines are interleaved (both the `_url` and the `url` clause heads appear
# below). Do not read it as a single compilable clause; confirm against the real file.
@spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
|
||||
{:ok, String.t()} | {:error, String.t()}
|
||||
defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
|
||||
defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
|
||||
case extractor do
|
||||
"builtin" ->
|
||||
article = Readability.article(body)
|
||||
{:ok, Readability.readable_html(article)}
|
||||
{:ok, Readability.article(body)}
|
||||
|
||||
module_name ->
|
||||
html_tree = Floki.parse(body)
|
||||
# Any extractor name other than "builtin" is treated as a module name; to_existing_atom/1
# raises ArgumentError unless that module atom already exists.
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|
||||
end
|
||||
|> case do
|
||||
{:ok, html} ->
|
||||
# Rewrite host-less <img> src values against the article URL before serializing the tree.
html = Floki.map(html, rewrite_image_urls(URI.parse(url)))
|
||||
|
||||
case apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree]) do
|
||||
{:error, _} = err ->
|
||||
err
|
||||
case extractor do
|
||||
"builtin" ->
|
||||
{:ok, Readability.readable_html(html)}
|
||||
|
||||
{:ok, html_tree} ->
|
||||
html_tree
|
||||
|> Floki.raw_html()
|
||||
_ ->
|
||||
{:ok, Floki.raw_html(html)}
|
||||
end
|
||||
|
||||
# Anything that is not {:ok, html} is passed through to the caller unchanged.
res ->
|
||||
res
|
||||
end
|
||||
end
|
||||
|
||||
@ -110,4 +116,25 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||
# Fallback clause: takes any response struct that reaches it and reports it as unhandled,
# embedding the inspected response in the error message.
defp handle_response(_url, %HTTPoison.Response{} = unhandled, _extractor) do
  message = "No handler for response #{inspect(unhandled)}"
  {:error, message}
end
|
||||
|
||||
# Builds a mapper (for use with `Floki.map/2`) for the article with the given URI. The mapper
# takes an `{tag, attrs}` element and, if it is an <img> whose `src` has neither a scheme nor a
# host, resolves that `src` against the article URI so the image gets an absolute URL.
#
# Compared with simply pasting the article's scheme and host in front of the path, resolving
# with `URI.merge/2`:
#   * keeps the query string and fragment of the image reference,
#   * keeps the port (and userinfo) of the article URL,
#   * handles relative paths such as "img/a.png" or "../a.png" per RFC 3986.
#
# Sources that already carry a scheme (e.g. "data:", "http:") or a host (including
# protocol-relative "//cdn.example/x.png") are returned untouched, as are empty sources and
# every non-<img> element. The `src` attribute may appear anywhere in the attribute list.
defp rewrite_image_urls(%URI{host: host, scheme: scheme} = base)
     when is_binary(host) and is_binary(scheme) do
  fn
    {"img", attrs} = elem when is_list(attrs) ->
      case List.keyfind(attrs, "src", 0) do
        {"src", src} when is_binary(src) and src != "" ->
          new_src = absolutize_image_src(base, src)
          {"img", List.keyreplace(attrs, "src", 0, {"src", new_src})}

        _ ->
          elem
      end

    elem ->
      elem
  end
end

# Without both a scheme and a host the article URI cannot anchor relative references
# (`URI.merge/2` raises for a base without a scheme), so elements pass through unchanged.
defp rewrite_image_urls(%URI{}) do
  fn elem -> elem end
end

# Resolves `src` against `base` only when `src` is a pure path reference; anything with its own
# scheme or host is returned as-is so data URIs and absolute URLs are never mangled.
defp absolutize_image_src(%URI{} = base, src) do
  case URI.parse(src) do
    %URI{scheme: nil, host: nil} ->
      base
      |> URI.merge(src)
      |> URI.to_string()

    _ ->
      src
  end
end
|
||||
end
|
||||
|
Loading…
x
Reference in New Issue
Block a user