2019-09-01 03:25:10 +00:00
|
|
|
defmodule FeedParser.Parser.RSS2 do
  @moduledoc """
  A `FeedParser.Parser` that handles [RSS 2.0 feeds](https://cyber.harvard.edu/rss/rss.html).
  """

  @behaviour FeedParser.Parser

  alias FeedParser.XML
  require XML

  # Content types that are specific to RSS: a successfully parsed document is accepted as-is.
  @rss_content_types ["application/rss+xml", "text/rss+xml"]

  # Generic XML content types: the document is only accepted if its root element is `rss`.
  @xml_content_types ["text/xml", "application/xml"]

  # Timex format string used for both `lastBuildDate` and `pubDate`.
  @date_format "{RFC1123}"

  @doc """
  Decides whether this parser can handle `data` served with `content_type`.

  Returns `{true, doc}` (where `doc` is the document returned by `FeedParser.XML.parse/1`)
  when the feed is accepted, and `false` otherwise:

    * for `application/rss+xml` and `text/rss+xml`, any document that parses is accepted;
    * for `text/xml` and `application/xml`, the document must parse and its root element
      must be named `rss`;
    * any other content type is rejected without parsing `data`.
  """
  @impl FeedParser.Parser
  def accepts(data, content_type) when content_type in @rss_content_types do
    case XML.parse(data) do
      {:ok, doc} -> {true, doc}
      {:error, _} -> false
    end
  end

  def accepts(data, content_type) when content_type in @xml_content_types do
    case XML.parse(data) do
      {:ok, doc} ->
        # Generic XML could be any kind of document, so check the root element name.
        if XML.xmlElement(doc, :name) == :rss, do: {true, doc}, else: false

      {:error, _} ->
        false
    end
  end

  def accepts(_data, _content_type), do: false

  @doc """
  Builds a `FeedParser.Feed` from a parsed RSS document.

  `rss` is the parsed document (as returned in `{true, doc}` by `accepts/2`). The feed's
  title, link, image URL and `lastBuildDate` are read from the channel, and every
  `<item>` becomes a `FeedParser.Item`. Elements that are missing produce `nil` fields,
  as do dates that cannot be parsed as `{RFC1123}`.

  Returns `{:ok, feed}`. Raises a `MatchError` if the document does not contain exactly
  one `/rss/channel` element.
  """
  @impl FeedParser.Parser
  def parse_feed(rss) do
    [channel] = :xmerl_xpath.string(~c"/rss/channel", rss)

    items =
      ~c"/channel/item"
      |> :xmerl_xpath.string(channel)
      |> Enum.map(&parse_item/1)

    {:ok,
     %FeedParser.Feed{
       site_url: text(~c"/channel/link/text()", channel),
       title: text(~c"/channel/title/text()", channel),
       image_url: text(~c"/channel/image/url/text()", channel),
       last_updated: parse_date(text(~c"/channel/lastBuildDate/text()", channel)),
       items: items
     }}
  end

  # Converts a single `<item>` element into a `FeedParser.Item`.
  defp parse_item(item) do
    link = text(~c"/item/link/text()", item)

    %FeedParser.Item{
      guid: text(~c"/item/guid/text()", item),
      title: text(~c"/item/title/text()", item),
      url: link,
      links: [{link, nil}],
      content: text(~c"/item/description/text()", item),
      date: parse_date(text(~c"/item/pubDate/text()", item)),
      # from Dublin Core extension: https://www.rssboard.org/rss-profile#namespace-elements-dublin
      # todo: should this only be attempted if the xmlns:dc is defined?
      creator: text(~c"/item/dc:creator/text()", item)
    }
  end

  # Parses an RFC 1123 date string, returning the parsed date or `nil`.
  #
  # `lastBuildDate` and `pubDate` are optional, so `text/2` may hand us `nil`; that case
  # is answered here directly so Timex is only ever called with a binary.
  defp parse_date(nil), do: nil

  defp parse_date(value) do
    case Timex.parse(value, @date_format) do
      {:ok, date} -> date
      _ -> nil
    end
  end

  # Evaluates `xpath` (a charlist) against `element` and returns the matched text as a
  # trimmed string, or `nil` when nothing matches.
  #
  # All matched text nodes are concatenated rather than requiring exactly one: xmerl can
  # report an element's text as several nodes (for example around entity references such
  # as `&amp;` or CDATA sections), and those values must not be dropped.
  defp text(xpath, element) do
    case :xmerl_xpath.string(xpath, element) do
      [_ | _] = nodes ->
        nodes
        |> Enum.map_join("", fn node -> node |> XML.xmlText(:value) |> List.to_string() end)
        |> String.trim()

      _ ->
        nil
    end
  end
end
|