From b0d9189399a95ab8bc12c012154fa86dd50d5d3d Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Fri, 29 May 2020 19:47:14 -0400 Subject: [PATCH] Prevent unnecessary refetching of favicons --- lib/frenzy/feed.ex | 5 +- lib/frenzy/task/fetch_favicon.ex | 94 +++++++++---------- .../20200529233101_feeds_add_favicon_url.exs | 9 ++ 3 files changed, 60 insertions(+), 48 deletions(-) create mode 100644 priv/repo/migrations/20200529233101_feeds_add_favicon_url.exs diff --git a/lib/frenzy/feed.ex b/lib/frenzy/feed.ex index bf9efb8..a869431 100644 --- a/lib/frenzy/feed.ex +++ b/lib/frenzy/feed.ex @@ -35,6 +35,7 @@ defmodule Frenzy.Feed do field :site_url, :string field :title, :string field :favicon, :string + field :favicon_url, :string belongs_to :group, Frenzy.Group belongs_to :pipeline, Frenzy.Pipeline @@ -52,6 +53,7 @@ defmodule Frenzy.Feed do site_url: String.t() | nil, title: String.t() | nil, favicon: String.t() | nil, + favicon_url: String.t() | nil, group: Frenzy.Group.t() | Ecto.Association.NotLoaded.t(), pipeline: Frenzy.Pipeline.t() | nil | Ecto.Association.NotLoaded.t(), items: [Frenzy.Item.t()] | Ecto.Association.NotLoaded.t(), @@ -68,7 +70,8 @@ defmodule Frenzy.Feed do :site_url, :last_updated, :pipeline_id, - :favicon + :favicon, + :favicon_url ]) |> validate_required([:feed_url]) end diff --git a/lib/frenzy/task/fetch_favicon.ex b/lib/frenzy/task/fetch_favicon.ex index 3c31563..e3278ad 100644 --- a/lib/frenzy/task/fetch_favicon.ex +++ b/lib/frenzy/task/fetch_favicon.ex @@ -21,44 +21,40 @@ defmodule Frenzy.Task.FetchFavicon do Logger.debug("Fetching favicon for #{site_url}") - case fetch_favicon_from_webpage(site_url) do - {:ok, favicon_data} -> - changeset = Feed.changeset(feed, %{favicon: favicon_data}) - {:ok, _feed} = Repo.update(changeset) + favicon_url = fetch_favicon_url_from_webpage(site_url) || URI.merge(site_url, "/favicon.ico") + + with %Feed{favicon_url: old_url} when old_url != favicon_url <- feed, + {:ok, favicon_data} <- fetch_favicon_data(favicon_url) do + IO.inspect(favicon_url) + changeset = Feed.changeset(feed, %{favicon: favicon_data, favicon_url: favicon_url}) + {:ok, _feed} = Repo.update(changeset) + else + _ -> + :ok + end + end + + @spec fetch_favicon_url_from_webpage(url :: String.t()) :: String.t() + + defp fetch_favicon_url_from_webpage(url) when is_binary(url) do + case HTTP.get(url) do + {:ok, %HTTPoison.Response{body: body, status_code: code}} when code in 200..299 -> + extract_favicon_url(url, body) + + {:ok, %HTTPoison.Response{status_code: code}} -> + Logger.debug("Unhandled HTTP code #{code} for '#{url}'") + nil {:error, reason} -> - Logger.info("Couldn't fetch favicon for #{site_url}: #{reason}") - - favicon_uri = - %{URI.parse(site_url) | path: "/favicon.ico", query: nil, fragment: nil} - |> URI.to_string() - - Logger.info("Trying default path: #{favicon_uri}") - - case fetch_favicon_data(favicon_uri, site_url) do - {:ok, favicon_data} -> - changeset = Feed.changeset(feed, %{favicon: favicon_data}) - {:ok, _feed} = Repo.update(changeset) - - {:error, reason} -> - Logger.info("Couldn't fetch default /favicon.ico for #{site_url}: #{reason}") - end + Logger.debug("Error fetching webpage for favicon: #{inspect(reason)}") + nil end end - defp fetch_favicon_from_webpage(url) when is_binary(url) do - case HTTP.get(url) do - {:ok, %HTTPoison.Response{body: body}} -> - extract_favicon(url, body) + defp fetch_favicon_url_from_webpage(_), do: {:error, "URL must be a string"} - {:error, _reason} = err -> - err - end - end - - defp fetch_favicon_from_webpage(_), do: {:error, "URL must be a string"} - - defp extract_favicon(page_url, body) do + @spec extract_favicon_url(page_url :: String.t(), body :: term()) :: String.t() + defp extract_favicon_url(page_url, body) do html_tree = Floki.parse(body) case Floki.find(html_tree, "link[rel=icon]") do @@ -89,35 +85,39 @@ defmodule Frenzy.Task.FetchFavicon do nil -> {:error, "No link[rel=icon] with type of image/png"} - # todo: try requesting /favicon.ico - link -> link |> Floki.attribute("href") |> List.first() - |> fetch_favicon_data(page_url) + |> case do + href when is_binary(href) -> + URI.merge(page_url, href) |> to_string() + + _ -> + nil + end end end end - defp fetch_favicon_data(favicon_url, site_url) when is_binary(favicon_url) do - # handle relative URIs, set default scheme if not provided - absolute_url = - favicon_url - |> URI.parse() - |> HTTP.resolve_uri(URI.parse(site_url)) + @spec fetch_favicon_data(favicon_url :: String.t()) :: {:ok, String.t()} | :error + defp fetch_favicon_data(favicon_url) do + Logger.debug("Fetching favicon from: '#{favicon_url}'") - case HTTP.get(absolute_url) do - {:ok, %HTTPoison.Response{body: body}} -> + case HTTP.get(favicon_url) do + {:ok, %HTTPoison.Response{body: body, status_code: code}} when code in 200..299 -> {:ok, "data:image/png;base64,#{Base.encode64(body)}"} - {:error, _reason} = err -> - err + {:ok, %HTTPoison.Response{status_code: code}} -> + Logger.debug("Unhandled HTTP code #{code} for '#{favicon_url}'") + :error + + {:error, reason} -> + Logger.debug("Error fetching favicon: #{inspect(reason)}") + :error end end - defp fetch_favicon_data(_, _), do: {:error, "No or invalid href for link"} - # from https://github.com/elixir-plug/plug/blob/v1.8.3/lib/plug/request_id.ex#L60 defp generate_task_id() do binary = << diff --git a/priv/repo/migrations/20200529233101_feeds_add_favicon_url.exs b/priv/repo/migrations/20200529233101_feeds_add_favicon_url.exs new file mode 100644 index 0000000..811703a --- /dev/null +++ b/priv/repo/migrations/20200529233101_feeds_add_favicon_url.exs @@ -0,0 +1,9 @@ +defmodule Frenzy.Repo.Migrations.FeedsAddFaviconUrl do + use Ecto.Migration + + def change do + alter table(:feeds) do + add :favicon_url, :string, default: nil + end + end +end