From 4a09ce1cb05bcd4d6f40014da1ff26b26cd5809a Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Mon, 17 Feb 2020 12:09:03 -0500 Subject: [PATCH] Fix scraping images w/ URLs w/o schemes --- lib/frenzy/http.ex | 22 ++++++++++++++++++++++ lib/frenzy/pipeline/scrape_stage.ex | 26 ++++++++++---------------- lib/frenzy/task/fetch_favicon.ex | 23 +---------------------- 3 files changed, 33 insertions(+), 38 deletions(-) diff --git a/lib/frenzy/http.ex b/lib/frenzy/http.ex index 615be51..1e2dc5c 100644 --- a/lib/frenzy/http.ex +++ b/lib/frenzy/http.ex @@ -43,4 +43,26 @@ defmodule Frenzy.HTTP do {:error, reason} end end + + def resolve_uri(uri, site_uri) when is_binary(site_uri) do + resolve_uri(uri, URI.parse(site_uri)) + end + + def resolve_uri(%URI{host: nil, path: path}, %URI{} = site_uri) do + %URI{site_uri | path: path} + |> resolve_uri(site_uri) + end + + def resolve_uri(%URI{scheme: nil} = uri, %URI{scheme: scheme} = site_uri) do + scheme = + case scheme do + nil -> "https" + _ -> scheme + end + + %URI{uri | scheme: scheme} + |> resolve_uri(site_uri) + end + + def resolve_uri(uri, _), do: uri end diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex index 674ce90..823ae4c 100644 --- a/lib/frenzy/pipeline/scrape_stage.ex +++ b/lib/frenzy/pipeline/scrape_stage.ex @@ -112,27 +112,16 @@ defmodule Frenzy.Pipeline.ScrapeStage do end end - # # Generates a helper function for the article with the given URI that takes an HTML element and, # if it's an element whose src attribute does not have a hostname, adds the hostname and # scheme to the element. - defp rewrite_image_urls(convert_to_data_uris, %URI{host: host, scheme: scheme}) do + defp rewrite_image_urls(convert_to_data_uris, %URI{host: host, scheme: scheme} = site_uri) do fn {"img", attrs} -> new_attrs = Enum.map(attrs, fn {"src", src} -> - case URI.parse(src) do - %URI{host: nil, path: path} -> - new_src = - URI.to_string(%URI{path: path, host: host, scheme: scheme}) - |> image_to_data_uri(convert_to_data_uris) - - {"src", new_src} - - _ -> - {"src", image_to_data_uri(src, convert_to_data_uris)} - end + {"src", image_to_data_uri(src, site_uri, convert_to_data_uris)} attr -> attr @@ -148,8 +137,13 @@ defmodule Frenzy.Pipeline.ScrapeStage do @content_type_allowlist ["image/jpeg", "image/png", "image/heic", "image/heif", "image/tiff"] # convert images to data URIs so that they're stored by clients as part of the body - defp image_to_data_uri(src, true) do - case HTTP.get(src) do + defp image_to_data_uri(src, site_uri, true) do + absolute_url = + src + |> URI.parse() + |> HTTP.resolve_uri(site_uri) + + case HTTP.get(absolute_url) do {:ok, %HTTPoison.Response{body: body, headers: headers}} -> {"Content-Type", content_type} = Enum.find(headers, fn {header, _value} -> header == "Content-Type" end) @@ -165,5 +159,5 @@ defmodule Frenzy.Pipeline.ScrapeStage do end end - defp image_to_data_uri(src, false), do: src + defp image_to_data_uri(src, _site_uri, false), do: src end diff --git a/lib/frenzy/task/fetch_favicon.ex b/lib/frenzy/task/fetch_favicon.ex index 0a9cbaf..3c31563 100644 --- a/lib/frenzy/task/fetch_favicon.ex +++ b/lib/frenzy/task/fetch_favicon.ex @@ -105,7 +105,7 @@ defmodule Frenzy.Task.FetchFavicon do absolute_url = favicon_url |> URI.parse() - |> resolve_uri(site_url) + |> HTTP.resolve_uri(URI.parse(site_url)) case HTTP.get(absolute_url) do {:ok, %HTTPoison.Response{body: body}} -> @@ -118,27 +118,6 @@ defmodule Frenzy.Task.FetchFavicon do defp fetch_favicon_data(_, _), do: {:error, "No or invalid href for link"} - defp resolve_uri(%URI{host: nil, path: path}, site_url) do - %URI{URI.parse(site_url) | path: path} - |> resolve_uri(site_url) - end - - defp resolve_uri(%URI{scheme: nil} = uri, site_url) do - scheme = - case URI.parse(site_url) do - %URI{scheme: scheme} when scheme != "" -> - scheme - - _ -> - "https" - end - - %URI{uri | scheme: scheme} - |> resolve_uri(site_url) - end - - defp resolve_uri(uri, _), do: uri - # from https://github.com/elixir-plug/plug/blob/v1.8.3/lib/plug/request_id.ex#L60 defp generate_task_id() do binary = <<