Fix scraping images w/ URLs w/o schemes
This commit is contained in:
parent
ecaf4c78b7
commit
4a09ce1cb0
|
@ -43,4 +43,26 @@ defmodule Frenzy.HTTP do
|
||||||
{:error, reason}
|
{:error, reason}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@doc """
Resolves a possibly host-less or scheme-less `uri` against the site it was found on.

`uri` is a `%URI{}`. `site_uri` is either a `%URI{}` or a binary; a binary is
parsed with `URI.parse/1` first.

  * A `uri` with neither scheme nor host (e.g. `/img/a.png?v=2`) takes everything
    except path, query and fragment from `site_uri`; those three stay its own.
  * A `uri` with a host but no scheme (e.g. `//cdn.example.com/a.png`) takes the
    scheme of `site_uri`, falling back to `"https"` when the site has none.
  * A `uri` that already has a scheme, or a `site_uri` that has no host to lend,
    leaves `uri` unchanged.

Returns a `%URI{}`.

NOTE(review): a path without a leading slash (e.g. `img/a.png`) is placed on the
site's host verbatim; it is not resolved against the directory of the site's
path. Confirm whether callers need full relative-reference resolution.
"""
def resolve_uri(uri, site_uri) when is_binary(site_uri) do
  resolve_uri(uri, URI.parse(site_uri))
end

# A reference that carries its own scheme but no host (`data:`, `mailto:`, ...)
# is already complete; grafting the site's host onto it would corrupt it.
def resolve_uri(%URI{scheme: scheme, host: nil} = uri, _site_uri) when is_binary(scheme) do
  uri
end

# The site has no host to lend. Return the input untouched: rebuilding it from
# the site would yield another host-less URI and recurse forever.
def resolve_uri(%URI{host: nil} = uri, %URI{host: nil}), do: uri

def resolve_uri(%URI{host: nil} = uri, %URI{} = site_uri) do
  # Path, query and fragment belong to the reference; inheriting the site's
  # query would turn `/a.png?v=2` into `/a.png?<site query>`.
  %URI{site_uri | path: uri.path, query: uri.query, fragment: uri.fragment}
  # Recurse once more so a site without a scheme still gets the "https" default.
  |> resolve_uri(site_uri)
end

def resolve_uri(%URI{scheme: nil} = uri, %URI{scheme: scheme}) do
  # The host is known to be present here (host-less inputs matched above), so
  # filling in the scheme completes the URI.
  %URI{uri | scheme: scheme || "https"}
end

def resolve_uri(uri, _site_uri), do: uri
|
||||||
end
|
end
|
||||||
|
|
|
@ -112,27 +112,16 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
#
|
|
||||||
# Generates a helper function for the article with the given URI that takes an HTML element and,
|
# Generates a helper function for the article with the given URI that takes an HTML element and,
|
||||||
# if it's an <img> element whose src attribute does not have a hostname, adds the hostname and
|
# if it's an <img> element whose src attribute does not have a hostname, adds the hostname and
|
||||||
# scheme to the element.
|
# scheme to the element.
|
||||||
defp rewrite_image_urls(convert_to_data_uris, %URI{host: host, scheme: scheme}) do
|
defp rewrite_image_urls(convert_to_data_uris, %URI{host: host, scheme: scheme} = site_uri) do
|
||||||
fn
|
fn
|
||||||
{"img", attrs} ->
|
{"img", attrs} ->
|
||||||
new_attrs =
|
new_attrs =
|
||||||
Enum.map(attrs, fn
|
Enum.map(attrs, fn
|
||||||
{"src", src} ->
|
{"src", src} ->
|
||||||
case URI.parse(src) do
|
{"src", image_to_data_uri(src, site_uri, convert_to_data_uris)}
|
||||||
%URI{host: nil, path: path} ->
|
|
||||||
new_src =
|
|
||||||
URI.to_string(%URI{path: path, host: host, scheme: scheme})
|
|
||||||
|> image_to_data_uri(convert_to_data_uris)
|
|
||||||
|
|
||||||
{"src", new_src}
|
|
||||||
|
|
||||||
_ ->
|
|
||||||
{"src", image_to_data_uri(src, convert_to_data_uris)}
|
|
||||||
end
|
|
||||||
|
|
||||||
attr ->
|
attr ->
|
||||||
attr
|
attr
|
||||||
|
@ -148,8 +137,13 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
@content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"]
|
@content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"]
|
||||||
|
|
||||||
# convert images to data URIs so that they're stored by clients as part of the body
|
# convert images to data URIs so that they're stored by clients as part of the body
|
||||||
defp image_to_data_uri(src, true) do
|
defp image_to_data_uri(src, site_uri, true) do
|
||||||
case HTTP.get(src) do
|
absolute_url =
|
||||||
|
src
|
||||||
|
|> URI.parse()
|
||||||
|
|> HTTP.resolve_uri(site_uri)
|
||||||
|
|
||||||
|
case HTTP.get(absolute_url) do
|
||||||
{:ok, %HTTPoison.Response{body: body, headers: headers}} ->
|
{:ok, %HTTPoison.Response{body: body, headers: headers}} ->
|
||||||
{"Content-Type", content_type} =
|
{"Content-Type", content_type} =
|
||||||
Enum.find(headers, fn {header, _value} -> header == "Content-Type" end)
|
Enum.find(headers, fn {header, _value} -> header == "Content-Type" end)
|
||||||
|
@ -165,5 +159,5 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
defp image_to_data_uri(src, false), do: src
|
defp image_to_data_uri(src, _site_uri, false), do: src
|
||||||
end
|
end
|
||||||
|
|
|
@ -105,7 +105,7 @@ defmodule Frenzy.Task.FetchFavicon do
|
||||||
absolute_url =
|
absolute_url =
|
||||||
favicon_url
|
favicon_url
|
||||||
|> URI.parse()
|
|> URI.parse()
|
||||||
|> resolve_uri(site_url)
|
|> HTTP.resolve_uri(URI.parse(site_url))
|
||||||
|
|
||||||
case HTTP.get(absolute_url) do
|
case HTTP.get(absolute_url) do
|
||||||
{:ok, %HTTPoison.Response{body: body}} ->
|
{:ok, %HTTPoison.Response{body: body}} ->
|
||||||
|
@ -118,27 +118,6 @@ defmodule Frenzy.Task.FetchFavicon do
|
||||||
|
|
||||||
defp fetch_favicon_data(_, _), do: {:error, "No or invalid href for link"}
|
defp fetch_favicon_data(_, _), do: {:error, "No or invalid href for link"}
|
||||||
|
|
||||||
defp resolve_uri(%URI{host: nil, path: path}, site_url) do
|
|
||||||
%URI{URI.parse(site_url) | path: path}
|
|
||||||
|> resolve_uri(site_url)
|
|
||||||
end
|
|
||||||
|
|
||||||
defp resolve_uri(%URI{scheme: nil} = uri, site_url) do
|
|
||||||
scheme =
|
|
||||||
case URI.parse(site_url) do
|
|
||||||
%URI{scheme: scheme} when scheme != "" ->
|
|
||||||
scheme
|
|
||||||
|
|
||||||
_ ->
|
|
||||||
"https"
|
|
||||||
end
|
|
||||||
|
|
||||||
%URI{uri | scheme: scheme}
|
|
||||||
|> resolve_uri(site_url)
|
|
||||||
end
|
|
||||||
|
|
||||||
defp resolve_uri(uri, _), do: uri
|
|
||||||
|
|
||||||
# from https://github.com/elixir-plug/plug/blob/v1.8.3/lib/plug/request_id.ex#L60
|
# from https://github.com/elixir-plug/plug/blob/v1.8.3/lib/plug/request_id.ex#L60
|
||||||
defp generate_task_id() do
|
defp generate_task_id() do
|
||||||
binary = <<
|
binary = <<
|
||||||
|
|
Loading…
Reference in New Issue