Rewrite image URLs without hosts to use the host of the article URL

This commit is contained in:
Shadowfacts 2019-10-31 17:38:16 -04:00
parent eec0b918e7
commit cfd9f7505a
Signed by: shadowfacts
GPG Key ID: 94A5AB95422746E5
1 changed file with 36 additions and 9 deletions

View File

@ -61,23 +61,29 @@ defmodule Frenzy.Pipeline.ScrapeStage do
@spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) :: @spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
{:ok, String.t()} | {:error, String.t()} {:ok, String.t()} | {:error, String.t()}
defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
case extractor do case extractor do
"builtin" -> "builtin" ->
article = Readability.article(body) {:ok, Readability.article(body)}
{:ok, Readability.readable_html(article)}
module_name -> module_name ->
html_tree = Floki.parse(body) html_tree = Floki.parse(body)
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
case apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree]) do
{:error, _} = err ->
err
{:ok, html_tree} ->
html_tree
|> Floki.raw_html()
end end
|> case do
{:ok, html} ->
html = Floki.map(html, rewrite_image_urls(URI.parse(url)))
case extractor do
"builtin" ->
{:ok, Readability.readable_html(html)}
_ ->
{:ok, Floki.raw_html(html)}
end
res ->
res
end end
end end
@ -110,4 +116,25 @@ defmodule Frenzy.Pipeline.ScrapeStage do
defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
{:error, "No handler for response #{inspect(response)}"} {:error, "No handler for response #{inspect(response)}"}
end end
# Generates a helper function for the article with the given URI. The helper takes an HTML
# element as a {tag, attributes} tuple and, if it is an <img> element whose src attribute has a
# path but neither a scheme nor a hostname, resolves that src against the article URI so it
# becomes an absolute URL.
#
# Resolution goes through URI.merge/2, so root-relative ("/a.png") and document-relative
# ("img/a.png") sources are both handled, and the src's query string and fragment as well as the
# article's port are kept. The src attribute is found wherever it sits in the attribute list.
#
# Returned unchanged: non-<img> elements, src values that already carry a scheme (including
# "data:" URIs) or a host (protocol-relative "//cdn/..."), and empty or path-less src values.
# If the article URI itself has no scheme or host there is nothing to resolve against, so every
# element passes through untouched.
defp rewrite_image_urls(%URI{host: host, scheme: scheme} = base) do
  # URI.merge/2 raises when the base is not absolute, so decide once whether it is usable.
  resolvable? = is_binary(scheme) and is_binary(host)

  rewrite_attr = fn
    {"src", src} = attr when is_binary(src) and src != "" ->
      case URI.parse(src) do
        # Only a reference with an actual path and no scheme/host is relative to the article.
        %URI{scheme: nil, host: nil, path: path} = relative
        when is_binary(path) and path != "" ->
          {"src", base |> URI.merge(relative) |> URI.to_string()}

        _ ->
          attr
      end

    attr ->
      attr
  end

  fn
    {"img", attrs} = elem when is_list(attrs) ->
      if resolvable? do
        {"img", Enum.map(attrs, rewrite_attr)}
      else
        elem
      end

    elem ->
      elem
  end
end
end end