Implement basic favicon scraping
This commit is contained in:
parent
2d88b0b4e1
commit
e684737fcd
|
@ -20,7 +20,7 @@ config :frenzy, FrenzyWeb.Endpoint,
|
||||||
# Configures Elixir's Logger
|
# Configures Elixir's Logger
|
||||||
config :logger, :console,
|
config :logger, :console,
|
||||||
format: "$time $metadata[$level] $message\n",
|
format: "$time $metadata[$level] $message\n",
|
||||||
metadata: [:request_id, :item_task_id]
|
metadata: [:request_id, :item_task_id, :favicon_task_id]
|
||||||
|
|
||||||
# Use Jason for JSON parsing in Phoenix
|
# Use Jason for JSON parsing in Phoenix
|
||||||
config :phoenix, :json_library, Jason
|
config :phoenix, :json_library, Jason
|
||||||
|
|
|
@ -34,6 +34,7 @@ defmodule Frenzy.Feed do
|
||||||
field :last_updated, :utc_datetime
|
field :last_updated, :utc_datetime
|
||||||
field :site_url, :string
|
field :site_url, :string
|
||||||
field :title, :string
|
field :title, :string
|
||||||
|
field :favicon, :string
|
||||||
|
|
||||||
belongs_to :group, Frenzy.Group
|
belongs_to :group, Frenzy.Group
|
||||||
belongs_to :pipeline, Frenzy.Pipeline
|
belongs_to :pipeline, Frenzy.Pipeline
|
||||||
|
@ -50,6 +51,7 @@ defmodule Frenzy.Feed do
|
||||||
last_updated: DateTime.t() | nil,
|
last_updated: DateTime.t() | nil,
|
||||||
site_url: String.t() | nil,
|
site_url: String.t() | nil,
|
||||||
title: String.t() | nil,
|
title: String.t() | nil,
|
||||||
|
favicon: String.t() | nil,
|
||||||
group: Frenzy.Group.t() | Ecto.Association.NotLoaded.t(),
|
group: Frenzy.Group.t() | Ecto.Association.NotLoaded.t(),
|
||||||
pipeline: Frenzy.Pipeline.t() | nil | Ecto.Association.NotLoaded.t(),
|
pipeline: Frenzy.Pipeline.t() | nil | Ecto.Association.NotLoaded.t(),
|
||||||
items: [Frenzy.Item.t()] | Ecto.Association.NotLoaded.t(),
|
items: [Frenzy.Item.t()] | Ecto.Association.NotLoaded.t(),
|
||||||
|
@ -65,7 +67,8 @@ defmodule Frenzy.Feed do
|
||||||
:feed_url,
|
:feed_url,
|
||||||
:site_url,
|
:site_url,
|
||||||
:last_updated,
|
:last_updated,
|
||||||
:pipeline_id
|
:pipeline_id,
|
||||||
|
:favicon
|
||||||
])
|
])
|
||||||
|> validate_required([:feed_url])
|
|> validate_required([:feed_url])
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
defmodule Frenzy.HTTP do
  @moduledoc """
  Thin wrapper around HTTPoison that follows HTTP redirects and normalizes
  common failure responses into `{:error, reason}` tuples.
  """

  require Logger

  # Status codes that are treated as redirects and followed automatically.
  @redirect_codes [301, 302]

  # Cap on redirect hops so a redirect loop cannot recurse forever.
  @max_redirects 5

  @doc """
  Performs a GET request, transparently following up to #{@max_redirects}
  301/302 redirects.

  Returns `{:ok, %HTTPoison.Response{}}` for a 200 response, or
  `{:error, reason}` for redirect loops, missing Location headers,
  error statuses, or transport failures.
  """
  @spec get(url :: String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, term()}
  def get(url), do: get(url, @max_redirects)

  defp get(_url, 0), do: {:error, "Too many redirects"}

  defp get(url, redirects_left) do
    case HTTPoison.get(url) do
      {:ok, %HTTPoison.Response{status_code: 200} = response} ->
        {:ok, response}

      {:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
      when status_code in @redirect_codes ->
        headers
        |> Enum.find(fn {name, _value} -> name == "Location" end)
        |> case do
          {"Location", new_url} ->
            # Log the actual status code (301 or 302), not a hardcoded 301.
            Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
            get(new_url, redirects_left - 1)

          _ ->
            {:error, "Missing Location header for redirect"}
        end

      {:ok, %HTTPoison.Response{status_code: 403}} ->
        {:error, "403 Forbidden"}

      {:ok, %HTTPoison.Response{status_code: 404}} ->
        {:error, "404 Not Found"}

      # Catch-all so unexpected statuses (5xx, other 4xx) return an error
      # tuple instead of raising CaseClauseError.
      {:ok, %HTTPoison.Response{status_code: status_code}} ->
        {:error, "Unexpected HTTP status #{status_code}"}

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:error, reason}
    end
  end
end
|
|
@ -1,5 +1,6 @@
|
||||||
defmodule Frenzy.Pipeline.ScrapeStage do
|
defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
require Logger
|
require Logger
|
||||||
|
alias Frenzy.HTTP
|
||||||
alias Frenzy.Pipeline.Stage
|
alias Frenzy.Pipeline.Stage
|
||||||
@behaviour Stage
|
@behaviour Stage
|
||||||
|
|
||||||
|
@ -65,13 +66,13 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
Logger.debug("Getting article from #{url}")
|
Logger.debug("Getting article from #{url}")
|
||||||
|
|
||||||
url
|
url
|
||||||
|> HTTPoison.get()
|
|> HTTP.get()
|
||||||
|> case do
|
|> case do
|
||||||
{:ok, response} ->
|
{:ok, response} ->
|
||||||
handle_response(url, response, opts)
|
handle_response(url, response, opts)
|
||||||
|
|
||||||
{:error, %HTTPoison.Error{reason: reason}} ->
|
{:error, reason} ->
|
||||||
{:error, "HTTPoison error: #{reason}"}
|
{:error, "Couldn't scrape article: #{reason}"}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -79,7 +80,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
|
|
||||||
@spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
|
@spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
|
||||||
{:ok, String.t()} | {:error, String.t()}
|
{:ok, String.t()} | {:error, String.t()}
|
||||||
defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, opts) do
|
defp handle_response(url, %HTTPoison.Response{body: body}, opts) do
|
||||||
case opts["extractor"] do
|
case opts["extractor"] do
|
||||||
"builtin" ->
|
"builtin" ->
|
||||||
{:ok, Readability.article(body)}
|
{:ok, Readability.article(body)}
|
||||||
|
@ -111,36 +112,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor) do
|
#
|
||||||
{:error, "404 not found"}
|
|
||||||
end
|
|
||||||
|
|
||||||
defp handle_response(
|
|
||||||
url,
|
|
||||||
%HTTPoison.Response{status_code: status_code, headers: headers},
|
|
||||||
extractor
|
|
||||||
)
|
|
||||||
when status_code in [301, 302] do
|
|
||||||
headers
|
|
||||||
|> Enum.find(fn {name, _value} -> name == "Location" end)
|
|
||||||
|> case do
|
|
||||||
{"Location", new_url} ->
|
|
||||||
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
|
|
||||||
get_article_content(new_url, extractor)
|
|
||||||
|
|
||||||
_ ->
|
|
||||||
{:error, "Missing Location header for redirect"}
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor) do
|
|
||||||
{:error, "403 Forbidden"}
|
|
||||||
end
|
|
||||||
|
|
||||||
defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
|
|
||||||
{:error, "No handler for response #{inspect(response)}"}
|
|
||||||
end
|
|
||||||
|
|
||||||
# Generates a helper function for the article with the given URI that takes an HTML element and,
|
# Generates a helper function for the article with the given URI that takes an HTML element and,
|
||||||
# if it's an <img> element whose src attribute does not have a hostname, adds the hostname and
|
# if it's an <img> element whose src attribute does not have a hostname, adds the hostname and
|
||||||
# scheme to the element.
|
# scheme to the element.
|
||||||
|
@ -177,8 +149,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
|
|
||||||
# convert images to data URIs so that they're stored by clients as part of the body
|
# convert images to data URIs so that they're stored by clients as part of the body
|
||||||
defp image_to_data_uri(src, true) do
|
defp image_to_data_uri(src, true) do
|
||||||
case HTTPoison.get(src) do
|
case HTTP.get(src) do
|
||||||
{:ok, %HTTPoison.Response{status_code: 200, body: body, headers: headers}} ->
|
{:ok, %HTTPoison.Response{body: body, headers: headers}} ->
|
||||||
{"Content-Type", content_type} =
|
{"Content-Type", content_type} =
|
||||||
Enum.find(headers, fn {header, _value} -> header == "Content-Type" end)
|
Enum.find(headers, fn {header, _value} -> header == "Content-Type" end)
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
defmodule Frenzy.CreateItemTask do
|
defmodule Frenzy.Task.CreateItem do
|
||||||
require Logger
|
require Logger
|
||||||
use Task
|
use Task
|
||||||
alias Frenzy.Repo
|
alias Frenzy.Repo
|
|
@ -0,0 +1,112 @@
|
||||||
|
defmodule Frenzy.Task.FetchFavicon do
  @moduledoc """
  Fire-and-forget task that fetches a favicon for a feed and stores it on the
  feed record as a base64 `data:` URI.

  Strategy: scrape the feed's site HTML for a `<link rel="icon">` that points
  to a PNG; if that fails, fall back to requesting the conventional
  `/favicon.ico` path on the site's host.
  """

  require Logger
  use Task
  alias Frenzy.{HTTP, Repo, Feed}

  def start_link(feed) do
    Task.start_link(__MODULE__, :run, [feed])
  end

  @doc """
  Fetches and persists the favicon for `feed`. Failures are logged rather than
  raised; the feed is left unchanged when no favicon can be retrieved.
  """
  def run(feed) do
    Logger.metadata(favicon_task_id: generate_task_id())

    case fetch_favicon_from_webpage(feed.site_url) do
      {:ok, favicon_data} ->
        save_favicon(feed, favicon_data)

      {:error, reason} ->
        Logger.info("Couldn't fetch favicon for #{feed.site_url}: #{reason}")

        # Fall back to the conventional /favicon.ico at the site root,
        # stripping any path/query/fragment from the site URL.
        favicon_uri =
          %{URI.parse(feed.site_url) | path: "/favicon.ico", query: nil, fragment: nil}
          |> URI.to_string()

        Logger.info("Trying default path: #{favicon_uri}")

        case fetch_favicon_data(favicon_uri) do
          {:ok, favicon_data} ->
            save_favicon(feed, favicon_data)

          {:error, reason} ->
            Logger.info("Couldn't fetch default /favicon.ico for #{feed.site_url}: #{reason}")
        end
    end
  end

  # Persists the favicon data URI on the feed. A failed update crashes the
  # task, which is acceptable for a fire-and-forget background job.
  defp save_favicon(feed, favicon_data) do
    changeset = Feed.changeset(feed, %{favicon: favicon_data})
    {:ok, _feed} = Repo.update(changeset)
  end

  # Downloads the site's HTML and tries to extract a favicon link from it.
  defp fetch_favicon_from_webpage(url) do
    case HTTP.get(url) do
      {:ok, %HTTPoison.Response{body: body}} ->
        extract_favicon(url, body)

      {:error, _reason} = err ->
        err
    end
  end

  # Parses the page and fetches the first <link rel="icon"> that appears to be
  # a PNG. `base_url` is the page the HTML came from, used to resolve relative
  # href values.
  defp extract_favicon(base_url, body) do
    html_tree = Floki.parse(body)

    case Floki.find(html_tree, "link[rel=icon]") do
      [] ->
        {:error, "No element matching link[rel=icon]"}

      links ->
        links
        |> Enum.find(&png_link?/1)
        |> case do
          nil ->
            {:error, "No link[rel=icon] with type of image/png"}

          link ->
            link
            |> Floki.attribute("href")
            |> List.first()
            |> resolve_url(base_url)
            |> fetch_favicon_data()
        end
    end
  end

  # True when the <link> declares type="image/png" (case-insensitive), or —
  # as a fallback hack for links missing the type attribute — when its href
  # contains ".png".
  # todo: support more image types
  defp png_link?(link) do
    link
    |> Floki.attribute("type")
    |> Enum.map(&String.downcase/1)
    |> Enum.any?(&(&1 == "image/png"))
    |> case do
      true ->
        true

      false ->
        # bad hack for missing type attr
        link
        |> Floki.attribute("href")
        |> Enum.any?(&String.contains?(&1, ".png"))
    end
  end

  # Resolves a possibly-relative favicon href (e.g. "/icon.png") against the
  # page it was found on, so HTTP.get receives an absolute URL.
  defp resolve_url(nil, _base_url), do: nil
  defp resolve_url(href, base_url), do: base_url |> URI.merge(href) |> URI.to_string()

  defp fetch_favicon_data(nil), do: {:error, "No href for link"}

  # Downloads the icon and encodes it as a data URI.
  defp fetch_favicon_data(url) do
    case HTTP.get(url) do
      {:ok, %HTTPoison.Response{body: body}} ->
        # NOTE(review): assumes the body is PNG data; png_link?/1 filters for
        # image/png, but the ".png in href" fallback does not guarantee it.
        {:ok, "data:image/png;base64,#{Base.encode64(body)}"}

      {:error, _reason} = err ->
        err
    end
  end

  # Short unique id for correlating this task's log lines via Logger metadata.
  # from https://github.com/elixir-plug/plug/blob/v1.8.3/lib/plug/request_id.ex#L60
  defp generate_task_id() do
    binary = <<
      System.system_time(:nanosecond)::64,
      :erlang.phash2({node(), self()}, 16_777_216)::24,
      :erlang.unique_integer()::32
    >>

    Base.url_encode64(binary)
  end
end
|
|
@ -1,6 +1,7 @@
|
||||||
defmodule Frenzy.UpdateFeeds do
|
defmodule Frenzy.UpdateFeeds do
|
||||||
use GenServer
|
use GenServer
|
||||||
alias Frenzy.{Repo, Feed, Item, CreateItemTask}
|
alias Frenzy.{HTTP, Repo, Feed, Item}
|
||||||
|
alias Frenzy.Task.{CreateItem, FetchFavicon}
|
||||||
import Ecto.Query
|
import Ecto.Query
|
||||||
require Logger
|
require Logger
|
||||||
|
|
||||||
|
@ -121,16 +122,18 @@ defmodule Frenzy.UpdateFeeds do
|
||||||
last_updated: (rss.last_updated || DateTime.utc_now()) |> Timex.Timezone.convert(:utc)
|
last_updated: (rss.last_updated || DateTime.utc_now()) |> Timex.Timezone.convert(:utc)
|
||||||
})
|
})
|
||||||
|
|
||||||
Repo.update(changeset)
|
{:ok, feed} = Repo.update(changeset)
|
||||||
|
|
||||||
|
if is_nil(feed.favicon) do
|
||||||
|
FetchFavicon.run(feed)
|
||||||
|
end
|
||||||
|
|
||||||
feed = Repo.preload(feed, [:items])
|
feed = Repo.preload(feed, [:items])
|
||||||
|
|
||||||
Enum.each(rss.items, fn entry ->
|
Enum.each(rss.items, fn entry ->
|
||||||
# todo: use Repo.exists for this
|
# todo: use Repo.exists for this
|
||||||
if !Enum.any?(feed.items, fn item -> item.guid == entry.guid end) do
|
if !Enum.any?(feed.items, fn item -> item.guid == entry.guid end) do
|
||||||
CreateItemTask.start_link(feed, entry)
|
CreateItem.start_link(feed, entry)
|
||||||
# Task.start_link(__MODULE__, :create_item, [feed, entry])
|
|
||||||
# spawn(__MODULE__, :create_item, [feed, entry])
|
|
||||||
end
|
end
|
||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
defmodule Frenzy.Repo.Migrations.FeedsAddFavicon do
  use Ecto.Migration

  # Adds a nullable column to feeds for storing the favicon.
  # `:text` rather than `:string` — presumably because the stored value is a
  # base64 data URI that can exceed a varchar's length limit; confirm against
  # Frenzy.Task.FetchFavicon.
  def change do
    alter table(:feeds) do
      add :favicon, :text, default: nil
    end
  end
end
|
Loading…
Reference in New Issue