Implement basic favicon scraping
This commit is contained in:
parent
2d88b0b4e1
commit
e684737fcd
|
@ -20,7 +20,7 @@ config :frenzy, FrenzyWeb.Endpoint,
|
|||
# Configures Elixir's Logger
|
||||
config :logger, :console,
|
||||
format: "$time $metadata[$level] $message\n",
|
||||
metadata: [:request_id, :item_task_id]
|
||||
metadata: [:request_id, :item_task_id, :favicon_task_id]
|
||||
|
||||
# Use Jason for JSON parsing in Phoenix
|
||||
config :phoenix, :json_library, Jason
|
||||
|
|
|
@ -34,6 +34,7 @@ defmodule Frenzy.Feed do
|
|||
field :last_updated, :utc_datetime
|
||||
field :site_url, :string
|
||||
field :title, :string
|
||||
field :favicon, :string
|
||||
|
||||
belongs_to :group, Frenzy.Group
|
||||
belongs_to :pipeline, Frenzy.Pipeline
|
||||
|
@ -50,6 +51,7 @@ defmodule Frenzy.Feed do
|
|||
last_updated: DateTime.t() | nil,
|
||||
site_url: String.t() | nil,
|
||||
title: String.t() | nil,
|
||||
favicon: String.t() | nil,
|
||||
group: Frenzy.Group.t() | Ecto.Association.NotLoaded.t(),
|
||||
pipeline: Frenzy.Pipeline.t() | nil | Ecto.Association.NotLoaded.t(),
|
||||
items: [Frenzy.Item.t()] | Ecto.Association.NotLoaded.t(),
|
||||
|
@ -65,7 +67,8 @@ defmodule Frenzy.Feed do
|
|||
:feed_url,
|
||||
:site_url,
|
||||
:last_updated,
|
||||
:pipeline_id
|
||||
:pipeline_id,
|
||||
:favicon
|
||||
])
|
||||
|> validate_required([:feed_url])
|
||||
end
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
defmodule Frenzy.HTTP do
  @moduledoc """
  Thin wrapper around HTTPoison that follows 301/302 redirects (up to a fixed
  limit) and normalizes non-success responses into `{:error, reason}` tuples.
  """

  require Logger

  # Status codes that trigger redirect-following via the Location header.
  @redirect_codes [301, 302]

  # Guard against redirect loops: give up after this many hops.
  @max_redirects 5

  @doc """
  Performs a GET request against `url`.

  Returns `{:ok, %HTTPoison.Response{}}` on a 200 response, transparently
  following up to #{@max_redirects} redirects. Any other outcome is
  normalized to `{:error, reason}`.
  """
  @spec get(url :: String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, term()}
  def get(url), do: get(url, @max_redirects)

  # Internal clause carrying the remaining redirect budget.
  defp get(_url, 0), do: {:error, "Too many redirects"}

  defp get(url, redirects_left) do
    case HTTPoison.get(url) do
      {:ok, %HTTPoison.Response{status_code: 200} = response} ->
        {:ok, response}

      {:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
      when status_code in @redirect_codes ->
        # Headers are a list of {name, value} tuples; find the redirect target.
        case List.keyfind(headers, "Location", 0) do
          {"Location", new_url} ->
            Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
            get(new_url, redirects_left - 1)

          nil ->
            {:error, "Missing Location header for redirect"}
        end

      {:ok, %HTTPoison.Response{status_code: 403}} ->
        {:error, "403 Forbidden"}

      {:ok, %HTTPoison.Response{status_code: 404}} ->
        {:error, "404 Not Found"}

      # Catch-all so unexpected statuses (e.g. 5xx) return an error tuple
      # instead of raising CaseClauseError.
      {:ok, %HTTPoison.Response{status_code: status_code}} ->
        {:error, "Unhandled HTTP status #{status_code}"}

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:error, reason}
    end
  end
end
|
|
@ -1,5 +1,6 @@
|
|||
defmodule Frenzy.Pipeline.ScrapeStage do
|
||||
require Logger
|
||||
alias Frenzy.HTTP
|
||||
alias Frenzy.Pipeline.Stage
|
||||
@behaviour Stage
|
||||
|
||||
|
@ -65,13 +66,13 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
|||
Logger.debug("Getting article from #{url}")
|
||||
|
||||
url
|
||||
|> HTTPoison.get()
|
||||
|> HTTP.get()
|
||||
|> case do
|
||||
{:ok, response} ->
|
||||
handle_response(url, response, opts)
|
||||
|
||||
{:error, %HTTPoison.Error{reason: reason}} ->
|
||||
{:error, "HTTPoison error: #{reason}"}
|
||||
{:error, reason} ->
|
||||
{:error, "Couldn't scrape article: #{reason}"}
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -79,7 +80,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
|||
|
||||
@spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
|
||||
{:ok, String.t()} | {:error, String.t()}
|
||||
defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, opts) do
|
||||
defp handle_response(url, %HTTPoison.Response{body: body}, opts) do
|
||||
case opts["extractor"] do
|
||||
"builtin" ->
|
||||
{:ok, Readability.article(body)}
|
||||
|
@ -111,36 +112,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
|||
end
|
||||
end
|
||||
|
||||
defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor) do
|
||||
{:error, "404 not found"}
|
||||
end
|
||||
|
||||
defp handle_response(
|
||||
url,
|
||||
%HTTPoison.Response{status_code: status_code, headers: headers},
|
||||
extractor
|
||||
)
|
||||
when status_code in [301, 302] do
|
||||
headers
|
||||
|> Enum.find(fn {name, _value} -> name == "Location" end)
|
||||
|> case do
|
||||
{"Location", new_url} ->
|
||||
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
|
||||
get_article_content(new_url, extractor)
|
||||
|
||||
_ ->
|
||||
{:error, "Missing Location header for redirect"}
|
||||
end
|
||||
end
|
||||
|
||||
defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor) do
|
||||
{:error, "403 Forbidden"}
|
||||
end
|
||||
|
||||
defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
|
||||
{:error, "No handler for response #{inspect(response)}"}
|
||||
end
|
||||
|
||||
#
|
||||
# Generates a helper function for the article with the given URI that takes an HTML element and,
|
||||
# if it's an <img> element whose src attribute does not have a hostname, adds the hostname and
|
||||
# scheme to the element.
|
||||
|
@ -177,8 +149,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
|||
|
||||
# convert images to data URIs so that they're stored by clients as part of the body
|
||||
defp image_to_data_uri(src, true) do
|
||||
case HTTPoison.get(src) do
|
||||
{:ok, %HTTPoison.Response{status_code: 200, body: body, headers: headers}} ->
|
||||
case HTTP.get(src) do
|
||||
{:ok, %HTTPoison.Response{body: body, headers: headers}} ->
|
||||
{"Content-Type", content_type} =
|
||||
Enum.find(headers, fn {header, _value} -> header == "Content-Type" end)
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
defmodule Frenzy.CreateItemTask do
|
||||
defmodule Frenzy.Task.CreateItem do
|
||||
require Logger
|
||||
use Task
|
||||
alias Frenzy.Repo
|
|
@ -0,0 +1,112 @@
|
|||
defmodule Frenzy.Task.FetchFavicon do
  @moduledoc """
  Task that locates a feed's favicon, encodes it as a base64 `data:` URI, and
  stores it on the feed record.

  Strategy: first scrape the feed's site page for a `<link rel="icon">`
  pointing at a PNG; if that fails, fall back to requesting `/favicon.ico`
  at the site root.
  """

  require Logger
  use Task
  alias Frenzy.{HTTP, Repo, Feed}

  @doc "Starts the favicon-fetching task for `feed` under the caller's supervision."
  def start_link(feed) do
    Task.start_link(__MODULE__, :run, [feed])
  end

  @doc """
  Fetches and persists the favicon for `feed`. Logs (rather than raises) when
  no favicon can be found.
  """
  def run(feed) do
    # Tag all log output from this task with a unique id (see :favicon_task_id
    # in the Logger metadata config).
    Logger.metadata(favicon_task_id: generate_task_id())

    case fetch_favicon_from_webpage(feed.site_url) do
      {:ok, favicon_data} ->
        save_favicon(feed, favicon_data)

      {:error, reason} ->
        Logger.info("Couldn't fetch favicon for #{feed.site_url}: #{reason}")

        # Fall back to the conventional /favicon.ico at the site root,
        # stripping any path/query/fragment from the site URL.
        favicon_uri =
          %{URI.parse(feed.site_url) | path: "/favicon.ico", query: nil, fragment: nil}
          |> URI.to_string()

        Logger.info("Trying default path: #{favicon_uri}")

        case fetch_favicon_data(favicon_uri) do
          {:ok, favicon_data} ->
            save_favicon(feed, favicon_data)

          {:error, reason} ->
            Logger.info("Couldn't fetch default /favicon.ico for #{feed.site_url}: #{reason}")
        end
    end
  end

  # Persists the favicon data URI on the feed. Crashes the task (match on
  # {:ok, _}) if the update fails — let-it-crash, the supervisor restarts us.
  defp save_favicon(feed, favicon_data) do
    changeset = Feed.changeset(feed, %{favicon: favicon_data})
    {:ok, _feed} = Repo.update(changeset)
  end

  # Downloads the page at `url` and scans its HTML for a favicon link.
  defp fetch_favicon_from_webpage(url) do
    case HTTP.get(url) do
      {:ok, %HTTPoison.Response{body: body}} ->
        extract_favicon(body)

      {:error, _reason} = err ->
        err
    end
  end

  # Finds the first <link rel="icon"> that looks like a PNG and fetches its data.
  defp extract_favicon(body) do
    html_tree = Floki.parse(body)

    case Floki.find(html_tree, "link[rel=icon]") do
      [] ->
        {:error, "No element matching link[rel=icon]"}

      links ->
        # todo: support more image types
        case Enum.find(links, &png_link?/1) do
          nil ->
            {:error, "No link[rel=icon] with type of image/png"}

          link ->
            link
            |> Floki.attribute("href")
            |> List.first()
            |> fetch_favicon_data()
        end
    end
  end

  # True when the <link> declares type="image/png", or — bad hack for a
  # missing type attribute — when its href contains ".png".
  defp png_link?(link) do
    declared_png? =
      link
      |> Floki.attribute("type")
      |> Enum.map(&String.downcase/1)
      |> Enum.any?(&(&1 == "image/png"))

    declared_png? or
      link
      |> Floki.attribute("href")
      |> Enum.any?(&String.contains?(&1, ".png"))
  end

  # A link element without an href can't be fetched.
  defp fetch_favicon_data(nil), do: {:error, "No href for link"}

  # Downloads the image at `url` and wraps it in a base64 data URI.
  # NOTE(review): always labels the data as image/png, even for the
  # /favicon.ico fallback — consider deriving the MIME type from the
  # response's Content-Type header.
  defp fetch_favicon_data(url) do
    case HTTP.get(url) do
      {:ok, %HTTPoison.Response{body: body}} ->
        {:ok, "data:image/png;base64,#{Base.encode64(body)}"}

      {:error, _reason} = err ->
        err
    end
  end

  # Generates a short unique id for log correlation.
  # from https://github.com/elixir-plug/plug/blob/v1.8.3/lib/plug/request_id.ex#L60
  defp generate_task_id() do
    binary = <<
      System.system_time(:nanosecond)::64,
      :erlang.phash2({node(), self()}, 16_777_216)::24,
      :erlang.unique_integer()::32
    >>

    Base.url_encode64(binary)
  end
end
|
|
@ -1,6 +1,7 @@
|
|||
defmodule Frenzy.UpdateFeeds do
|
||||
use GenServer
|
||||
alias Frenzy.{Repo, Feed, Item, CreateItemTask}
|
||||
alias Frenzy.{HTTP, Repo, Feed, Item}
|
||||
alias Frenzy.Task.{CreateItem, FetchFavicon}
|
||||
import Ecto.Query
|
||||
require Logger
|
||||
|
||||
|
@ -121,16 +122,18 @@ defmodule Frenzy.UpdateFeeds do
|
|||
last_updated: (rss.last_updated || DateTime.utc_now()) |> Timex.Timezone.convert(:utc)
|
||||
})
|
||||
|
||||
Repo.update(changeset)
|
||||
{:ok, feed} = Repo.update(changeset)
|
||||
|
||||
if is_nil(feed.favicon) do
|
||||
FetchFavicon.run(feed)
|
||||
end
|
||||
|
||||
feed = Repo.preload(feed, [:items])
|
||||
|
||||
Enum.each(rss.items, fn entry ->
|
||||
# todo: use Repo.exists for this
|
||||
if !Enum.any?(feed.items, fn item -> item.guid == entry.guid end) do
|
||||
CreateItemTask.start_link(feed, entry)
|
||||
# Task.start_link(__MODULE__, :create_item, [feed, entry])
|
||||
# spawn(__MODULE__, :create_item, [feed, entry])
|
||||
CreateItem.start_link(feed, entry)
|
||||
end
|
||||
end)
|
||||
end
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
defmodule Frenzy.Repo.Migrations.FeedsAddFavicon do
  @moduledoc false
  use Ecto.Migration

  # Adds a nullable text column to feeds for storing the favicon as a data URI.
  def change do
    alter table(:feeds) do
      add(:favicon, :text, default: nil)
    end
  end
end
|
Loading…
Reference in New Issue