Implement basic favicon scraping

This commit is contained in:
Shadowfacts 2019-11-10 14:04:00 -05:00
parent 2d88b0b4e1
commit e684737fcd
Signed by: shadowfacts
GPG Key ID: 94A5AB95422746E5
8 changed files with 177 additions and 44 deletions

View File

@@ -20,7 +20,7 @@ config :frenzy, FrenzyWeb.Endpoint,
# Configures Elixir's Logger
config :logger, :console,
format: "$time $metadata[$level] $message\n",
metadata: [:request_id, :item_task_id]
metadata: [:request_id, :item_task_id, :favicon_task_id]
# Use Jason for JSON parsing in Phoenix
config :phoenix, :json_library, Jason

View File

@ -34,6 +34,7 @@ defmodule Frenzy.Feed do
field :last_updated, :utc_datetime
field :site_url, :string
field :title, :string
field :favicon, :string
belongs_to :group, Frenzy.Group
belongs_to :pipeline, Frenzy.Pipeline
@ -50,6 +51,7 @@ defmodule Frenzy.Feed do
last_updated: DateTime.t() | nil,
site_url: String.t() | nil,
title: String.t() | nil,
favicon: String.t() | nil,
group: Frenzy.Group.t() | Ecto.Association.NotLoaded.t(),
pipeline: Frenzy.Pipeline.t() | nil | Ecto.Association.NotLoaded.t(),
items: [Frenzy.Item.t()] | Ecto.Association.NotLoaded.t(),
@ -65,7 +67,8 @@ defmodule Frenzy.Feed do
:feed_url,
:site_url,
:last_updated,
:pipeline_id
:pipeline_id,
:favicon
])
|> validate_required([:feed_url])
end

34
lib/frenzy/http.ex Normal file
View File

@@ -0,0 +1,34 @@
defmodule Frenzy.HTTP do
  @moduledoc """
  Thin wrapper around HTTPoison that transparently follows redirects and
  normalizes common failure statuses into `{:error, reason}` tuples.
  """

  require Logger

  # Follow permanent (301) and temporary (302) redirects.
  @redirect_codes [301, 302]
  # Cap redirect chains so two pages that redirect to each other can't loop forever.
  @max_redirects 5

  @doc """
  GETs `url`, following up to #{@max_redirects} redirects.

  Returns `{:ok, %HTTPoison.Response{}}` for a 200 response, otherwise
  `{:error, reason}` where reason is a string or an HTTPoison error reason.
  """
  @spec get(String.t(), non_neg_integer()) ::
          {:ok, HTTPoison.Response.t()} | {:error, term()}
  def get(url, redirect_count \\ 0)

  def get(_url, redirect_count) when redirect_count >= @max_redirects do
    {:error, "Too many redirects"}
  end

  def get(url, redirect_count) do
    case HTTPoison.get(url) do
      {:ok, %HTTPoison.Response{status_code: 200} = response} ->
        {:ok, response}

      {:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
      when status_code in @redirect_codes ->
        # Redirect responses carry their target in the Location header.
        headers
        |> Enum.find(fn {name, _value} -> name == "Location" end)
        |> case do
          {"Location", new_url} ->
            Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
            get(new_url, redirect_count + 1)

          _ ->
            {:error, "Missing Location header for redirect"}
        end

      {:ok, %HTTPoison.Response{status_code: 403}} ->
        {:error, "403 Forbidden"}

      {:ok, %HTTPoison.Response{status_code: 404}} ->
        {:error, "404 Not Found"}

      # Previously any other status (e.g. 500) raised a CaseClauseError;
      # surface it as an error tuple instead.
      {:ok, %HTTPoison.Response{status_code: status_code}} ->
        {:error, "Unhandled HTTP status #{status_code}"}

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:error, reason}
    end
  end
end

View File

@ -1,5 +1,6 @@
defmodule Frenzy.Pipeline.ScrapeStage do
require Logger
alias Frenzy.HTTP
alias Frenzy.Pipeline.Stage
@behaviour Stage
@ -65,13 +66,13 @@ defmodule Frenzy.Pipeline.ScrapeStage do
Logger.debug("Getting article from #{url}")
url
|> HTTPoison.get()
|> HTTP.get()
|> case do
{:ok, response} ->
handle_response(url, response, opts)
{:error, %HTTPoison.Error{reason: reason}} ->
{:error, "HTTPoison error: #{reason}"}
{:error, reason} ->
{:error, "Couldn't scrape article: #{reason}"}
end
end
@ -79,7 +80,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
@spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
{:ok, String.t()} | {:error, String.t()}
defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, opts) do
defp handle_response(url, %HTTPoison.Response{body: body}, opts) do
case opts["extractor"] do
"builtin" ->
{:ok, Readability.article(body)}
@ -111,36 +112,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
end
end
defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor) do
{:error, "404 not found"}
end
defp handle_response(
url,
%HTTPoison.Response{status_code: status_code, headers: headers},
extractor
)
when status_code in [301, 302] do
headers
|> Enum.find(fn {name, _value} -> name == "Location" end)
|> case do
{"Location", new_url} ->
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
get_article_content(new_url, extractor)
_ ->
{:error, "Missing Location header for redirect"}
end
end
defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor) do
{:error, "403 Forbidden"}
end
defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
{:error, "No handler for response #{inspect(response)}"}
end
#
# Generates a helper function for the article with the given URI that takes an HTML element and,
# if it's an <img> element whose src attribute does not have a hostname, adds the hostname and
# scheme to the element.
@ -177,8 +149,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do
# convert images to data URIs so that they're stored by clients as part of the body
defp image_to_data_uri(src, true) do
case HTTPoison.get(src) do
{:ok, %HTTPoison.Response{status_code: 200, body: body, headers: headers}} ->
case HTTP.get(src) do
{:ok, %HTTPoison.Response{body: body, headers: headers}} ->
{"Content-Type", content_type} =
Enum.find(headers, fn {header, _value} -> header == "Content-Type" end)

View File

@ -1,4 +1,4 @@
defmodule Frenzy.CreateItemTask do
defmodule Frenzy.Task.CreateItem do
require Logger
use Task
alias Frenzy.Repo

View File

@@ -0,0 +1,112 @@
defmodule Frenzy.Task.FetchFavicon do
  @moduledoc """
  One-off task that locates a feed's favicon (preferring a PNG
  `<link rel="icon">` in the site's markup, falling back to the conventional
  `/favicon.ico` path), inlines it as a base64 data URI, and stores it on the
  feed record.
  """

  require Logger
  use Task
  alias Frenzy.{HTTP, Repo, Feed}

  def start_link(feed) do
    Task.start_link(__MODULE__, :run, [feed])
  end

  @doc """
  Fetches and persists the favicon for `feed`.

  Tries the feed's homepage markup first; on failure, falls back to
  `/favicon.ico` at the site root. Failures are logged, not raised.
  """
  def run(feed) do
    Logger.metadata(favicon_task_id: generate_task_id())

    case fetch_favicon_from_webpage(feed.site_url) do
      {:ok, favicon_data} ->
        save_favicon(feed, favicon_data)

      {:error, reason} ->
        Logger.info("Couldn't fetch favicon for #{feed.site_url}: #{reason}")

        favicon_uri =
          %{URI.parse(feed.site_url) | path: "/favicon.ico", query: nil, fragment: nil}
          |> URI.to_string()

        Logger.info("Trying default path: #{favicon_uri}")

        case fetch_favicon_data(favicon_uri) do
          {:ok, favicon_data} ->
            save_favicon(feed, favicon_data)

          {:error, reason} ->
            Logger.info("Couldn't fetch default /favicon.ico for #{feed.site_url}: #{reason}")
        end
    end
  end

  # Persists the favicon data URI on the feed; a DB failure crashes the task
  # (it is supervised as a one-off Task, so this is let-it-crash territory).
  defp save_favicon(feed, favicon_data) do
    changeset = Feed.changeset(feed, %{favicon: favicon_data})
    {:ok, _feed} = Repo.update(changeset)
  end

  # Downloads the page at `url` and scans its markup for a usable icon link.
  defp fetch_favicon_from_webpage(url) do
    case HTTP.get(url) do
      {:ok, %HTTPoison.Response{body: body}} ->
        extract_favicon(url, body)

      {:error, _reason} = err ->
        err
    end
  end

  # Finds a PNG link[rel=icon] in `body` and fetches its data.
  # `base_url` is the page the markup came from, used to resolve relative hrefs.
  defp extract_favicon(base_url, body) do
    html_tree = Floki.parse(body)

    case Floki.find(html_tree, "link[rel=icon]") do
      [] ->
        {:error, "No element matching link[rel=icon]"}

      links ->
        case Enum.find(links, &png_link?/1) do
          nil ->
            {:error, "No link[rel=icon] with type of image/png"}

          link ->
            link
            |> Floki.attribute("href")
            |> List.first()
            |> resolve_href(base_url)
            |> fetch_favicon_data()
        end
    end
  end

  # True when the link element advertises a PNG icon.
  # todo: support more image types
  defp png_link?(link) do
    link
    |> Floki.attribute("type")
    |> Enum.map(&String.downcase/1)
    |> Enum.any?(&(&1 == "image/png"))
    |> case do
      true ->
        true

      false ->
        # bad hack for links that are missing the type attribute
        link
        |> Floki.attribute("href")
        |> Enum.any?(&String.contains?(&1, ".png"))
    end
  end

  # Relative hrefs such as "/favicon.png" previously went straight to
  # HTTP.get/1 and failed; resolve them against the page they were found on.
  defp resolve_href(nil, _base_url), do: nil
  defp resolve_href(href, base_url), do: base_url |> URI.merge(href) |> URI.to_string()

  defp fetch_favicon_data(nil), do: {:error, "No href for link"}

  defp fetch_favicon_data(url) do
    case HTTP.get(url) do
      {:ok, %HTTPoison.Response{body: body}} ->
        # Inline the icon as a data URI so clients store it with the feed
        # and need no extra request.
        {:ok, "data:image/png;base64,#{Base.encode64(body)}"}

      {:error, _reason} = err ->
        err
    end
  end

  # from https://github.com/elixir-plug/plug/blob/v1.8.3/lib/plug/request_id.ex#L60
  defp generate_task_id() do
    binary = <<
      System.system_time(:nanosecond)::64,
      :erlang.phash2({node(), self()}, 16_777_216)::24,
      :erlang.unique_integer()::32
    >>

    Base.url_encode64(binary)
  end
end

View File

@ -1,6 +1,7 @@
defmodule Frenzy.UpdateFeeds do
use GenServer
alias Frenzy.{Repo, Feed, Item, CreateItemTask}
alias Frenzy.{HTTP, Repo, Feed, Item}
alias Frenzy.Task.{CreateItem, FetchFavicon}
import Ecto.Query
require Logger
@ -121,16 +122,18 @@ defmodule Frenzy.UpdateFeeds do
last_updated: (rss.last_updated || DateTime.utc_now()) |> Timex.Timezone.convert(:utc)
})
Repo.update(changeset)
{:ok, feed} = Repo.update(changeset)
if is_nil(feed.favicon) do
FetchFavicon.run(feed)
end
feed = Repo.preload(feed, [:items])
Enum.each(rss.items, fn entry ->
# todo: use Repo.exists for this
if !Enum.any?(feed.items, fn item -> item.guid == entry.guid end) do
CreateItemTask.start_link(feed, entry)
# Task.start_link(__MODULE__, :create_item, [feed, entry])
# spawn(__MODULE__, :create_item, [feed, entry])
CreateItem.start_link(feed, entry)
end
end)
end

View File

@@ -0,0 +1,9 @@
defmodule Frenzy.Repo.Migrations.FeedsAddFavicon do
  @moduledoc false
  use Ecto.Migration

  # Adds a nullable text column to feeds that stores the favicon as a
  # base64 data URI; nil means no favicon has been fetched yet.
  def change do
    alter table(:feeds) do
      add(:favicon, :text, default: nil)
    end
  end
end