Make parsing articles more resilient
This commit is contained in:
parent 026b0ea2ae
commit eaa463f2c0
@@ -31,7 +31,8 @@ defmodule Frenzy.UpdateFeeds do
   end
 
   defp schedule_update() do
-    Process.send_after(self(), :update_feeds, 15 * 60 * 1000) # 15 minutes
+    # 15 minutes
+    Process.send_after(self(), :update_feeds, 15 * 60 * 1000)
     # Process.send_after(self(), :update_feeds, 60 * 1000) # 1 minutes
   end
 
@@ -43,7 +44,9 @@ defmodule Frenzy.UpdateFeeds do
   end
 
   defp prune_old_items() do
-    {count, _} = Repo.delete_all(from i in Item, where: i.read, where: i.read_date <= from_now(-1, "week"))
+    {count, _} =
+      Repo.delete_all(from i in Item, where: i.read, where: i.read_date <= from_now(-1, "week"))
+
     Logger.info("Removed #{count} read items")
   end
 
@@ -56,19 +59,23 @@ defmodule Frenzy.UpdateFeeds do
           {:ok, rss} ->
             update_feed_from_rss(feed, rss)
         end
+
       {:ok, %HTTPoison.Response{status_code: 404}} ->
         Logger.warn("RSS feed #{feed.feed_url} not found")
+
       {:error, %HTTPoison.Error{reason: reason}} ->
         Logger.error("Couldn't load RSS feed: #{reason}")
     end
   end
 
   defp update_feed_from_rss(feed, rss) do
-    changeset = Feed.changeset(feed, %{
-      title: rss.title,
-      site_url: rss.link.href,
-      last_updated: parse_date(rss.updated_at)
-    })
+    changeset =
+      Feed.changeset(feed, %{
+        title: rss.title,
+        site_url: rss.link.href,
+        last_updated: parse_date(rss.updated_at)
+      })
 
     Repo.update(changeset)
+
     feed = Repo.preload(feed, :items)
@@ -86,22 +93,30 @@ defmodule Frenzy.UpdateFeeds do
 
     url = get_real_url(entry)
 
-    changeset = Ecto.build_assoc(feed, :items, %{
-      guid: entry.id,
-      title: entry.title,
-      url: url,
-      date: parse_date(entry.published_at),
-      creator: "",
-      content: get_article_content(url)
-    })
-
-    Repo.insert(changeset)
+    case get_article_content(url) do
+      {:ok, content} ->
+        changeset =
+          Ecto.build_assoc(feed, :items, %{
+            guid: entry.id,
+            title: entry.title,
+            url: url,
+            date: parse_date(entry.published_at),
+            creator: "",
+            content: content
+          })
+
+        Repo.insert(changeset)
+
+      {:err, reason} ->
+        Logger.error("Unable to create item for #{url}: #{reason}")
+    end
   end
 
   defp parse_date(str) do
     case Timex.parse(str, "{RFC1123}") do
       {:ok, date} ->
         Timex.Timezone.convert(date, :utc)
+
       _ ->
         {:ok, date, _} = DateTime.from_iso8601(str)
         Timex.Timezone.convert(date, :utc)
@@ -110,13 +125,16 @@ defmodule Frenzy.UpdateFeeds do
 
   defp get_real_url(entry) do
     links = Enum.reject(entry.links, fn l -> l.rel == "shorturl" end)
+
     case Enum.find(links, fn l -> l.rel == "related" end) do
       nil ->
         case Enum.find(links, fn l -> l.rel == "alternate" end) do
          nil -> Enum.fetch!(links, 0).href
          link -> link.href
        end
-      link -> link.href
+
+      link ->
+        link.href
     end
   end
 
@@ -126,11 +144,22 @@ defmodule Frenzy.UpdateFeeds do
     case HTTPoison.get(url) do
       {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
         article = Readability.article(body)
-        Readability.readable_html(article)
+        {:ok, Readability.readable_html(article)}
+
       {:ok, %HTTPoison.Response{status_code: 404}} ->
-        Logger.warn("Article #{url} not found")
+        {:err, "404 not found"}
+
+      {:ok, %HTTPoison.Response{status_code: 301, headers: headers}} ->
+        {"Location", new_url} =
+          Enum.find(headers, fn {name, value} ->
+            name == "Location"
+          end)
+
+        Logger.debug("Got 301 redirect from #{url} to #{new_url}")
+        get_article_content(new_url)
+
       {:error, %HTTPoison.Error{reason: reason}} ->
-        Logger.error("Couldn't load article: #{reason}")
+        {:err, reason}
     end
   end
 end
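The resilience here comes from get_article_content/1 returning {:ok, html} or {:err, reason} instead of bare HTML or a logger call's return value, so create_item only inserts an item when extraction succeeds. A minimal standalone sketch of that tagged-tuple pattern (illustrative only; the FetchSketch module and fetch/1 and demo/1 names are hypothetical, not part of this commit):

defmodule FetchSketch do
  require Logger

  # Hypothetical illustration of the {:ok, _} / {:err, _} contract
  # that get_article_content/1 adopts in this commit.
  def fetch(url) do
    case HTTPoison.get(url) do
      {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
        {:ok, body}

      {:ok, %HTTPoison.Response{status_code: code}} ->
        {:err, "unexpected status #{code}"}

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:err, inspect(reason)}
    end
  end

  # Callers pattern-match on the result rather than storing whatever
  # the happy path happened to return.
  def demo(url) do
    case fetch(url) do
      {:ok, body} -> Logger.info("Fetched #{byte_size(body)} bytes")
      {:err, reason} -> Logger.error("Fetch failed: #{reason}")
    end
  end
end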