Rewrite image URLs without hosts to use the host of the article URL
This commit is contained in:
parent
eec0b918e7
commit
cfd9f7505a
|
@ -61,23 +61,29 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
|
|
||||||
@spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
|
@spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
|
||||||
{:ok, String.t()} | {:error, String.t()}
|
{:ok, String.t()} | {:error, String.t()}
|
||||||
defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
|
defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
|
||||||
case extractor do
|
case extractor do
|
||||||
"builtin" ->
|
"builtin" ->
|
||||||
article = Readability.article(body)
|
{:ok, Readability.article(body)}
|
||||||
{:ok, Readability.readable_html(article)}
|
|
||||||
|
|
||||||
module_name ->
|
module_name ->
|
||||||
html_tree = Floki.parse(body)
|
html_tree = Floki.parse(body)
|
||||||
|
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|
||||||
case apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree]) do
|
|
||||||
{:error, _} = err ->
|
|
||||||
err
|
|
||||||
|
|
||||||
{:ok, html_tree} ->
|
|
||||||
html_tree
|
|
||||||
|> Floki.raw_html()
|
|
||||||
end
|
end
|
||||||
|
|> case do
|
||||||
|
{:ok, html} ->
|
||||||
|
html = Floki.map(html, rewrite_image_urls(URI.parse(url)))
|
||||||
|
|
||||||
|
case extractor do
|
||||||
|
"builtin" ->
|
||||||
|
{:ok, Readability.readable_html(html)}
|
||||||
|
|
||||||
|
_ ->
|
||||||
|
{:ok, Floki.raw_html(html)}
|
||||||
|
end
|
||||||
|
|
||||||
|
res ->
|
||||||
|
res
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -110,4 +116,25 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
|
defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
|
||||||
{:error, "No handler for response #{inspect(response)}"}
|
{:error, "No handler for response #{inspect(response)}"}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Builds the callback passed to Floki.map/2 for an article fetched from the given URI.
#
# The returned function receives an HTML element as a `{tag, attributes}` tuple. If the element
# is an <img> whose `src` attribute is a relative reference (no scheme and no host), the `src`
# is resolved against the article URI so that it becomes an absolute URL. Every other element
# is returned untouched.
#
# Compared to naively gluing the article's scheme and host in front of the path, resolving the
# reference with URI.merge/2:
#   * keeps the query string and fragment of the original `src`,
#   * keeps a non-default port of the article URL,
#   * handles document-relative paths ("images/a.png", "../a.png") instead of producing an
#     invalid URL (or raising inside URI.to_string/1 on recent Elixir versions).
#
# When the article URI itself has no host or no scheme there is nothing to resolve against
# (and URI.merge/2 would raise), so the returned callback leaves every element as it is.
defp rewrite_image_urls(%URI{host: host, scheme: scheme}) when is_nil(host) or is_nil(scheme) do
  fn element -> element end
end

defp rewrite_image_urls(%URI{} = article_uri) do
  fn
    {"img", attrs} = element when is_list(attrs) ->
      # The src attribute may appear anywhere in the attribute list, not only first.
      case List.keyfind(attrs, "src", 0) do
        {"src", src} when is_binary(src) and src != "" ->
          case absolutize_image_src(article_uri, src) do
            ^src -> element
            new_src -> {"img", List.keyreplace(attrs, "src", 0, {"src", new_src})}
          end

        _ ->
          element
      end

    element ->
      element
  end
end

# Resolves `src` against `article_uri` when `src` is a relative reference; returns `src`
# unchanged when it already carries a scheme (http:, https:, data:, ...) or a host
# (including protocol-relative "//host/path" URLs).
defp absolutize_image_src(%URI{} = article_uri, src) do
  case URI.parse(src) do
    %URI{scheme: nil, host: nil} = relative ->
      article_uri
      |> URI.merge(relative)
      |> URI.to_string()

    _ ->
      src
  end
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue