defmodule Frenzy.UpdateFeeds do
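  @moduledoc """
  GenServer that periodically refreshes every feed, fetches readable article
  content for new items, and prunes old read items down to tombstones.

  A feed can also be refreshed on demand. A minimal sketch, assuming the
  server was started with `name: Frenzy.UpdateFeeds` and where `feed_id` is a
  placeholder for a real feed ID:

      feed = Frenzy.Repo.get!(Frenzy.Feed, feed_id)
      Frenzy.UpdateFeeds.refresh(Frenzy.UpdateFeeds, feed)
  """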

  use GenServer

  alias Frenzy.{Repo, Feed, Item, FilterEngine}
  import Ecto.Query
  require Logger

  def start_link(state) do
    GenServer.start_link(__MODULE__, :ok, state)
  end
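
  @doc """
  Synchronously refreshes the given feed, returning the reloaded feed with its
  items preloaded.
  """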
  def refresh(pid, feed) do
    GenServer.call(pid, {:refresh, feed})
  end
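
  # GenServer callback: do an initial refresh of every feed, then schedule the
  # recurring update.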
  def init(state) do
    update_feeds()
    schedule_update()
    {:ok, state}
  end
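
  # Refreshes a single feed on demand and replies with the updated record.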
  def handle_call({:refresh, feed}, _from, state) do
    update_feed(feed)
    new_feed = Feed |> Repo.get(feed.id) |> Repo.preload(:items)
    {:reply, new_feed, state}
  end
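
  # Periodic tick scheduled by schedule_update/0.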
  def handle_info(:update_feeds, state) do
    update_feeds()
    schedule_update()
    {:noreply, state}
  end
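
  # Schedules the next :update_feeds tick. The one-minute interval below is
  # kept commented out, presumably for testing.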
  defp schedule_update() do
    # 15 minutes
    Process.send_after(self(), :update_feeds, 15 * 60 * 1000)
    # 1 minute
    # Process.send_after(self(), :update_feeds, 60 * 1000)
  end
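
  # Refreshes every feed (with its filter preloaded), then prunes old items.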
  defp update_feeds() do
    Logger.info("Updating all feeds")

    Repo.all(from Feed, preload: [:filter])
    |> Enum.map(&update_feed/1)

    prune_old_items()
  end
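
  # Turns old read items into tombstones: the row and its GUID are kept (so the
  # item isn't re-created on the next refresh) but its content is cleared. Note
  # the aggressive one-minute cutoff; the one-week cutoff is the commented-out
  # alternative.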
  defp prune_old_items() do
    {count, _} =
      from(i in Item,
        where: i.read and not i.tombstone,
        # where: i.read_date <= from_now(-1, "week"),
        where: i.read_date <= from_now(-1, "minute"),
        update: [
          set: [tombstone: true, content: nil, creator: nil, date: nil, url: nil, title: nil]
        ]
      )
      |> Repo.update_all([])

    Logger.info("Converted #{count} read items to tombstones")
  end
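
  # Fetches and parses a single feed. Redirects update the stored feed URL and
  # retry; parse and HTTP errors are logged.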
  defp update_feed(feed) do
    Logger.debug("Updating #{feed.feed_url}")

    case HTTPoison.get(feed.feed_url) do
      {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
        case Fiet.parse(body) do
          {:ok, rss} ->
            update_feed_from_rss(feed, rss)

          {:error, reason} ->
            Logger.error("Unable to parse feed #{feed.feed_url}: #{inspect(reason)}")
        end

      {:ok, %HTTPoison.Response{status_code: 404}} ->
        Logger.warn("RSS feed #{feed.feed_url} not found")

      {:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
      when status_code in [301, 302] ->
        {"Location", new_url} =
          Enum.find(headers, fn {name, _value} ->
            name == "Location"
          end)

        Logger.debug(
          "Got #{status_code} redirect from #{feed.feed_url} to #{new_url}, updating feed URL"
        )

        changeset = Feed.changeset(feed, %{feed_url: new_url})
        {:ok, feed} = Repo.update(changeset)
        update_feed(feed)

      {:error, %HTTPoison.Error{reason: reason}} ->
        Logger.error("Couldn't load RSS feed: #{reason}")
    end
  end
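
  # Applies feed-level metadata from the parsed feed, then creates items for
  # any entries whose GUIDs aren't already present.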
  defp update_feed_from_rss(feed, rss) do
    last_updated =
      if rss.updated_at do
        parse_date(rss.updated_at)
      else
        DateTime.utc_now()
      end

    changeset =
      Feed.changeset(feed, %{
        title: rss.title,
        site_url: rss.link.href,
        last_updated: last_updated
      })

    Repo.update(changeset)

    feed = Repo.preload(feed, items: [], filter: [:rules])

    Enum.map(rss.items, fn entry ->
      # todo: use Repo.exists? for this
      if !Enum.any?(feed.items, fn item -> item.guid == entry.id end) do
        create_item(feed, entry)
      end
    end)
  end
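
  # Builds and inserts a new item for the given entry: fetches the article
  # content (falling back to the entry's description), then either stores the
  # full item or, when the feed's filter rejects it, a bare tombstone.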
  defp create_item(feed, entry) do
    url = get_real_url(entry)

    Logger.debug("Creating item for #{url}")

    content =
      case get_article_content(url) do
        {:ok, content} ->
          content

        {:err, reason} ->
          Logger.warn("Unable to fetch article for #{url}: #{reason}")
          entry.description
      end

    item_params = %{
      guid: entry.id,
      title: entry.title,
      url: url,
      date: parse_date(entry.published_at),
      creator: "",
      content: content
    }

    result =
      if feed.filter_enabled do
        case {feed.filter.mode, FilterEngine.matches?(item_params, feed.filter)} do
          {"accept", true} ->
            :store

          {"reject", false} ->
            :store

          _ ->
            Logger.debug("Skipping item #{url} due to feed filter")
            :tombstone
        end
      else
        :store
      end

    changeset =
      case result do
        :store ->
          Ecto.build_assoc(feed, :items, item_params)

        :tombstone ->
          Ecto.build_assoc(feed, :items, %{
            guid: entry.id,
            tombstone: true
          })
      end

    Repo.insert(changeset)
  end
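
  # Parses an RFC 1123 date string (common in RSS), falling back to ISO 8601
  # (used by Atom); raises if neither format matches.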
  defp parse_date(str) do
    case Timex.parse(str, "{RFC1123}") do
      {:ok, date} ->
        Timex.Timezone.convert(date, :utc)

      _ ->
        {:ok, date, _} = DateTime.from_iso8601(str)
        Timex.Timezone.convert(date, :utc)
    end
  end
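
  # Picks the best permalink for an entry: prefer rel="related", then
  # rel="alternate", then the first remaining link, ignoring short URLs.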
  defp get_real_url(entry) do
    links = Enum.reject(entry.links, fn l -> l.rel == "shorturl" end)

    case Enum.find(links, fn l -> l.rel == "related" end) do
      nil ->
        case Enum.find(links, fn l -> l.rel == "alternate" end) do
          nil -> Enum.fetch!(links, 0).href
          link -> link.href
        end

      link ->
        link.href
    end
  end
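
  # Fetches the article at `url` and extracts its readable content, following
  # redirects as needed.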
  defp get_article_content(url) do
    Logger.debug("Getting article from #{url}")

    case HTTPoison.get(url) do
      {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
        article = Readability.article(body)
        {:ok, Readability.readable_html(article)}

      {:ok, %HTTPoison.Response{status_code: 404}} ->
        {:err, "404 not found"}

      {:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
      when status_code in [301, 302] ->
        {"Location", new_url} =
          Enum.find(headers, fn {name, _value} ->
            name == "Location"
          end)

        Logger.debug("Got #{status_code} redirect from #{url} to #{new_url}")
        get_article_content(new_url)

      {:ok, %HTTPoison.Response{status_code: 403}} ->
        {:err, "403 Forbidden"}

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:err, reason}
    end
  end
end