diff --git a/config/config.exs b/config/config.exs
index 9e2662b..637c247 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -20,7 +20,7 @@ config :frenzy, FrenzyWeb.Endpoint,
 # Configures Elixir's Logger
 config :logger, :console,
   format: "$time $metadata[$level] $message\n",
-  metadata: [:request_id, :item_task_id]
+  metadata: [:request_id, :item_task_id, :favicon_task_id]

 # Use Jason for JSON parsing in Phoenix
 config :phoenix, :json_library, Jason
diff --git a/lib/frenzy/feed.ex b/lib/frenzy/feed.ex
index 98a667c..bf9efb8 100644
--- a/lib/frenzy/feed.ex
+++ b/lib/frenzy/feed.ex
@@ -34,6 +34,7 @@ defmodule Frenzy.Feed do
     field :last_updated, :utc_datetime
     field :site_url, :string
     field :title, :string
+    field :favicon, :string

     belongs_to :group, Frenzy.Group
     belongs_to :pipeline, Frenzy.Pipeline
@@ -50,6 +51,7 @@ defmodule Frenzy.Feed do
           last_updated: DateTime.t() | nil,
           site_url: String.t() | nil,
           title: String.t() | nil,
+          favicon: String.t() | nil,
           group: Frenzy.Group.t() | Ecto.Association.NotLoaded.t(),
           pipeline: Frenzy.Pipeline.t() | nil | Ecto.Association.NotLoaded.t(),
           items: [Frenzy.Item.t()] | Ecto.Association.NotLoaded.t(),
@@ -65,7 +67,8 @@ defmodule Frenzy.Feed do
       :feed_url,
       :site_url,
       :last_updated,
-      :pipeline_id
+      :pipeline_id,
+      :favicon
     ])
     |> validate_required([:feed_url])
   end
diff --git a/lib/frenzy/http.ex b/lib/frenzy/http.ex
new file mode 100644
index 0000000..1468b3d
--- /dev/null
+++ b/lib/frenzy/http.ex
@@ -0,0 +1,34 @@
+defmodule Frenzy.HTTP do
+  require Logger
+  @redirect_codes [301, 302]
+
+  # @spec get(url :: String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, String.t()}
+  def get(url) do
+    case HTTPoison.get(url) do
+      {:ok, %HTTPoison.Response{status_code: 200} = response} ->
+        {:ok, response}
+
+      {:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
+      when status_code in @redirect_codes ->
+        headers
+        |> Enum.find(fn {name, _value} -> name == "Location" end)
+        |> case do
+          {"Location", new_url} ->
+            Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
+            get(new_url)
+
+          _ ->
+            {:error, "Missing Location header for redirect"}
+        end
+
+      {:ok, %HTTPoison.Response{status_code: 403}} ->
+        {:error, "403 Forbidden"}
+
+      {:ok, %HTTPoison.Response{status_code: 404}} ->
+        {:error, "404 Not Found"}
+
+      {:error, %HTTPoison.Error{reason: reason}} ->
+        {:error, reason}
+    end
+  end
+end
diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex
index 199680d..674ce90 100644
--- a/lib/frenzy/pipeline/scrape_stage.ex
+++ b/lib/frenzy/pipeline/scrape_stage.ex
@@ -1,5 +1,6 @@
 defmodule Frenzy.Pipeline.ScrapeStage do
   require Logger
+  alias Frenzy.HTTP
   alias Frenzy.Pipeline.Stage
   @behaviour Stage

@@ -65,13 +66,13 @@ defmodule Frenzy.Pipeline.ScrapeStage do
     Logger.debug("Getting article from #{url}")

     url
-    |> HTTPoison.get()
+    |> HTTP.get()
     |> case do
       {:ok, response} ->
         handle_response(url, response, opts)

-      {:error, %HTTPoison.Error{reason: reason}} ->
-        {:error, "HTTPoison error: #{reason}"}
+      {:error, reason} ->
+        {:error, "Couldn't scrape article: #{reason}"}
     end
   end

@@ -79,7 +80,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do

   @spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
           {:ok, String.t()} | {:error, String.t()}
-  defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, opts) do
+  defp handle_response(url, %HTTPoison.Response{body: body}, opts) do
     case opts["extractor"] do
       "builtin" ->
         {:ok, Readability.article(body)}
@@ -111,36 +112,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
     end
   end

-  defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor) do
-    {:error, "404 not found"}
-  end
-
-  defp handle_response(
-         url,
-         %HTTPoison.Response{status_code: status_code, headers: headers},
-         extractor
-       )
-       when status_code in [301, 302] do
-    headers
-    |> Enum.find(fn {name, _value} -> name == "Location" end)
-    |> case do
-      {"Location", new_url} ->
-        Logger.debug("Got 301 redirect from #{url} to #{new_url}")
-        get_article_content(new_url, extractor)
-
-      _ ->
-        {:error, "Missing Location header for redirect"}
-    end
-  end
-
-  defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor) do
-    {:error, "403 Forbidden"}
-  end
-
-  defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
-    {:error, "No handler for response #{inspect(response)}"}
-  end
-
+  #
   # Generates a helper function for the article with the given URI that takes an HTML element and,
   # if it's an element whose src attribute does not have a hostname, adds the hostname and
   # scheme to the element.
@@ -177,8 +149,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do

   # convert images to data URIs so that they're stored by clients as part of the body
   defp image_to_data_uri(src, true) do
-    case HTTPoison.get(src) do
-      {:ok, %HTTPoison.Response{status_code: 200, body: body, headers: headers}} ->
+    case HTTP.get(src) do
+      {:ok, %HTTPoison.Response{body: body, headers: headers}} ->
         {"Content-Type", content_type} =
           Enum.find(headers, fn {header, _value} -> header == "Content-Type" end)

diff --git a/lib/frenzy/create_item_task.ex b/lib/frenzy/task/create_item.ex
similarity index 98%
rename from lib/frenzy/create_item_task.ex
rename to lib/frenzy/task/create_item.ex
index 2bd993a..5874d32 100644
--- a/lib/frenzy/create_item_task.ex
+++ b/lib/frenzy/task/create_item.ex
@@ -1,4 +1,4 @@
-defmodule Frenzy.CreateItemTask do
+defmodule Frenzy.Task.CreateItem do
   require Logger
   use Task
   alias Frenzy.Repo
diff --git a/lib/frenzy/task/fetch_favicon.ex b/lib/frenzy/task/fetch_favicon.ex
new file mode 100644
index 0000000..fd79aa4
--- /dev/null
+++ b/lib/frenzy/task/fetch_favicon.ex
@@ -0,0 +1,112 @@
+defmodule Frenzy.Task.FetchFavicon do
+  require Logger
+  use Task
+  alias Frenzy.{HTTP, Repo, Feed}
+
+  def start_link(feed) do
+    Task.start_link(__MODULE__, :run, [feed])
+  end
+
+  def run(feed) do
+    Logger.metadata(favicon_task_id: generate_task_id())
+
+    case fetch_favicon_from_webpage(feed.site_url) do
+      {:ok, favicon_data} ->
+        changeset = Feed.changeset(feed, %{favicon: favicon_data})
+        {:ok, _feed} = Repo.update(changeset)
+
+      {:error, reason} ->
+        Logger.info("Couldn't fetch favicon for #{feed.site_url}: #{reason}")
+
+        favicon_uri =
+          %{URI.parse(feed.site_url) | path: "/favicon.ico", query: nil, fragment: nil}
+          |> URI.to_string()
+
+        Logger.info("Trying default path: #{favicon_uri}")
+
+        case fetch_favicon_data(favicon_uri) do
+          {:ok, favicon_data} ->
+            changeset = Feed.changeset(feed, %{favicon: favicon_data})
+            {:ok, _feed} = Repo.update(changeset)
+
+          {:error, reason} ->
+            Logger.info("Couldn't fetch default /favicon.ico for #{feed.site_url}: #{reason}")
+        end
+    end
+  end
+
+  defp fetch_favicon_from_webpage(url) do
+    case HTTP.get(url) do
+      {:ok, %HTTPoison.Response{body: body}} ->
+        extract_favicon(body)
+
+      {:error, _reason} = err ->
+        err
+    end
+  end
+
+  defp extract_favicon(body) do
+    html_tree = Floki.parse(body)
+
+    case Floki.find(html_tree, "link[rel=icon]") do
+      [] ->
+        {:error, "No element matching link[rel=icon]"}
+
+      links ->
+        links
+        |> Enum.find(fn link ->
+          link
+          |> Floki.attribute("type")
+          |> Enum.map(&String.downcase/1)
+          |> Enum.any?(&(&1 == "image/png"))
+          |> case do
+            false ->
+              link
+              |> Floki.attribute("href")
+              # bad hack for missing type attr
+              |> Enum.any?(&String.contains?(&1, ".png"))
+
+            true ->
+              true
+          end
+
+          # todo: support more image types
+        end)
+        |> case do
+          nil ->
+            {:error, "No link[rel=icon] with type of image/png"}
+
+          # todo: try requesting /favicon.ico
+
+          link ->
+            link
+            |> Floki.attribute("href")
+            |> List.first()
+            |> fetch_favicon_data()
+        end
+    end
+  end
+
+  defp fetch_favicon_data(nil), do: {:error, "No href for link"}
+
+  defp fetch_favicon_data(url) do
+    case HTTP.get(url) do
+      {:ok, %HTTPoison.Response{body: body}} ->
+        {:ok, "data:image/png;base64,#{Base.encode64(body)}"}
+
+      {:error, _reason} = err ->
+        err
+    end
+  end
+
+  # from https://github.com/elixir-plug/plug/blob/v1.8.3/lib/plug/request_id.ex#L60
+  defp generate_task_id() do
+    binary = <<
+      System.system_time(:nanosecond)::64,
+      :erlang.phash2({node(), self()}, 16_777_216)::24,
+      :erlang.unique_integer()::32
+    >>
+
+    Base.url_encode64(binary)
+  end
+end
diff --git a/lib/frenzy/update_feeds.ex b/lib/frenzy/update_feeds.ex
index dbd738e..ed38e9c 100644
--- a/lib/frenzy/update_feeds.ex
+++ b/lib/frenzy/update_feeds.ex
@@ -1,6 +1,7 @@
 defmodule Frenzy.UpdateFeeds do
   use GenServer
-  alias Frenzy.{Repo, Feed, Item, CreateItemTask}
+  alias Frenzy.{HTTP, Repo, Feed, Item}
+  alias Frenzy.Task.{CreateItem, FetchFavicon}
   import Ecto.Query
   require Logger

@@ -121,16 +122,18 @@ defmodule Frenzy.UpdateFeeds do
         last_updated: (rss.last_updated || DateTime.utc_now()) |> Timex.Timezone.convert(:utc)
       })

-    Repo.update(changeset)
+    {:ok, feed} = Repo.update(changeset)
+
+    if is_nil(feed.favicon) do
+      FetchFavicon.run(feed)
+    end

     feed = Repo.preload(feed, [:items])

     Enum.each(rss.items, fn entry ->
       # todo: use Repo.exists for this
       if !Enum.any?(feed.items, fn item -> item.guid == entry.guid end) do
-        CreateItemTask.start_link(feed, entry)
-        # Task.start_link(__MODULE__, :create_item, [feed, entry])
-        # spawn(__MODULE__, :create_item, [feed, entry])
+        CreateItem.start_link(feed, entry)
       end
     end)
   end
diff --git a/priv/repo/migrations/20191110165504_feeds_add_favicon.exs b/priv/repo/migrations/20191110165504_feeds_add_favicon.exs
new file mode 100644
index 0000000..66df18c
--- /dev/null
+++ b/priv/repo/migrations/20191110165504_feeds_add_favicon.exs
@@ -0,0 +1,9 @@
+defmodule Frenzy.Repo.Migrations.FeedsAddFavicon do
+  use Ecto.Migration
+
+  def change do
+    alter table(:feeds) do
+      add :favicon, :text, default: nil
+    end
+  end
+end