From eaa463f2c08c8e5fe73a1d6a6f53f51578a089ad Mon Sep 17 00:00:00 2001
From: Shadowfacts
Date: Sat, 9 Mar 2019 10:59:08 -0500
Subject: [PATCH] Make parsing articles more resilient

---
 lib/frenzy/update_feeds.ex | 69 +++++++++++++++++++++++++++-----------
 1 file changed, 49 insertions(+), 20 deletions(-)

diff --git a/lib/frenzy/update_feeds.ex b/lib/frenzy/update_feeds.ex
index deb3b28..91d3049 100644
--- a/lib/frenzy/update_feeds.ex
+++ b/lib/frenzy/update_feeds.ex
@@ -31,7 +31,8 @@ defmodule Frenzy.UpdateFeeds do
   end
 
   defp schedule_update() do
-    Process.send_after(self(), :update_feeds, 15 * 60 * 1000) # 15 minutes
+    # 15 minutes
+    Process.send_after(self(), :update_feeds, 15 * 60 * 1000)
     # Process.send_after(self(), :update_feeds, 60 * 1000) # 1 minutes
   end
 
@@ -43,7 +44,9 @@ defmodule Frenzy.UpdateFeeds do
   end
 
   defp prune_old_items() do
-    {count, _} = Repo.delete_all(from i in Item, where: i.read, where: i.read_date <= from_now(-1, "week"))
+    {count, _} =
+      Repo.delete_all(from i in Item, where: i.read, where: i.read_date <= from_now(-1, "week"))
+
     Logger.info("Removed #{count} read items")
   end
 
@@ -56,19 +59,23 @@ defmodule Frenzy.UpdateFeeds do
           {:ok, rss} ->
             update_feed_from_rss(feed, rss)
         end
+
       {:ok, %HTTPoison.Response{status_code: 404}} ->
         Logger.warn("RSS feed #{feed.feed_url} not found")
+
       {:error, %HTTPoison.Error{reason: reason}} ->
         Logger.error("Couldn't load RSS feed: #{reason}")
     end
   end
 
   defp update_feed_from_rss(feed, rss) do
-    changeset = Feed.changeset(feed, %{
-      title: rss.title,
-      site_url: rss.link.href,
-      last_updated: parse_date(rss.updated_at)
+    changeset =
+      Feed.changeset(feed, %{
+        title: rss.title,
+        site_url: rss.link.href,
+        last_updated: parse_date(rss.updated_at)
     })
+
     Repo.update(changeset)
 
     feed = Repo.preload(feed, :items)
@@ -86,22 +93,30 @@ defmodule Frenzy.UpdateFeeds do
 
     url = get_real_url(entry)
 
-    changeset = Ecto.build_assoc(feed, :items, %{
-      guid: entry.id,
-      title: entry.title,
-      url: url,
-      date: parse_date(entry.published_at),
-      creator: "",
-      content: get_article_content(url)
-    })
+    case get_article_content(url) do
+      {:ok, content} ->
+        changeset =
+          Ecto.build_assoc(feed, :items, %{
+            guid: entry.id,
+            title: entry.title,
+            url: url,
+            date: parse_date(entry.published_at),
+            creator: "",
+            content: content
+          })
 
-    Repo.insert(changeset)
+        Repo.insert(changeset)
+
+      {:err, reason} ->
+        Logger.error("Unable to create item for #{url}: #{reason}")
+    end
   end
 
   defp parse_date(str) do
     case Timex.parse(str, "{RFC1123}") do
       {:ok, date} ->
         Timex.Timezone.convert(date, :utc)
+
       _ ->
         {:ok, date, _} = DateTime.from_iso8601(str)
         Timex.Timezone.convert(date, :utc)
@@ -110,13 +125,16 @@ defmodule Frenzy.UpdateFeeds do
 
   defp get_real_url(entry) do
     links = Enum.reject(entry.links, fn l -> l.rel == "shorturl" end)
+
     case Enum.find(links, fn l -> l.rel == "related" end) do
       nil ->
         case Enum.find(links, fn l -> l.rel == "alternate" end) do
           nil -> Enum.fetch!(links, 0).href
           link -> link.href
         end
-      link -> link.href
+
+      link ->
+        link.href
     end
   end
 
@@ -126,11 +144,22 @@ defmodule Frenzy.UpdateFeeds do
     case HTTPoison.get(url) do
       {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
         article = Readability.article(body)
-        Readability.readable_html(article)
+        {:ok, Readability.readable_html(article)}
+
       {:ok, %HTTPoison.Response{status_code: 404}} ->
-        Logger.warn("Article #{url} not found")
+        {:err, "404 not found"}
+
+      {:ok, %HTTPoison.Response{status_code: 301, headers: headers}} ->
+        {"Location", new_url} =
+          Enum.find(headers, fn {name, value} ->
+            name == "Location"
+          end)
+
+        Logger.debug("Got 301 redirect from #{url} to #{new_url}")
+        get_article_content(new_url)
+
       {:error, %HTTPoison.Error{reason: reason}} ->
-        Logger.error("Couldn't load article: #{reason}")
+        {:err, reason}
     end
   end
-end
\ No newline at end of file
+end