From 13394e38f6cf378e0e5789ea5b471d63d5b3794b Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Fri, 3 Sep 2021 17:06:08 -0400 Subject: [PATCH] Add creator to Item --- lib/item.ex | 5 +-- lib/parser/atom.ex | 23 +++++++++--- lib/parser/jsonfeed.ex | 40 ++++++++++++++++++--- lib/parser/rss2.ex | 7 +++- test/fixtures/atom/multi_author.xml | 23 ++++++++++++ test/fixtures/jsonfeed/v1_1.json | 18 ++++++++++ test/fixtures/rss2/dc_creator.xml | 25 +++++++++++++ test/parser/atom_test.exs | 8 +++++ test/parser/jsonfeed_test.exs | 54 +++++++++++++++++++++++++++++ test/parser/rss2_test.exs | 8 +++++ 10 files changed, 199 insertions(+), 12 deletions(-) create mode 100644 test/fixtures/atom/multi_author.xml create mode 100644 test/fixtures/jsonfeed/v1_1.json create mode 100644 test/fixtures/rss2/dc_creator.xml diff --git a/lib/item.ex b/lib/item.ex index 9506873..ce3e9da 100644 --- a/lib/item.ex +++ b/lib/item.ex @@ -3,7 +3,7 @@ defmodule FeedParser.Item do A item in a feed. Has metadata and content from the item. """ - defstruct [:guid, :url, :links, :title, :content, :date] + defstruct [:guid, :url, :links, :title, :content, :date, :creator] @type t() :: %__MODULE__{ guid: String.t(), @@ -11,6 +11,7 @@ defmodule FeedParser.Item do links: [{href :: String.t(), rel :: String.t() | nil}], title: String.t() | nil, content: String.t(), - date: DateTime.t() + date: DateTime.t(), + creator: String.t() | nil } end diff --git a/lib/parser/atom.ex b/lib/parser/atom.ex index 457c2a4..a734420 100644 --- a/lib/parser/atom.ex +++ b/lib/parser/atom.ex @@ -33,6 +33,7 @@ defmodule FeedParser.Parser.Atom do title = text('/feed/title/text()', feed) link = attr('/feed/link/@href', feed) icon = text('/feed/icon/text()', feed) + feed_author = texts('/feed/author/name/text()', feed) updated = text('/feed/updated/text()', feed) @@ -71,6 +72,8 @@ defmodule FeedParser.Parser.Atom do _ -> nil end + author = texts('/entry/author/name/text()', entry) || feed_author + content = text('/entry/content/text()', entry) || text('/entry/summary/text()', entry) %FeedParser.Item{ @@ -79,7 +82,8 @@ defmodule FeedParser.Parser.Atom do url: url, links: links, content: content, - date: updated + date: updated, + creator: author |> Enum.join(", ") } end) @@ -94,12 +98,21 @@ defmodule FeedParser.Parser.Atom do end defp text(xpath, element) do - case :xmerl_xpath.string(xpath, element) do - [el] -> - XML.xmlText(el, :value) |> List.to_string() |> String.trim() + case texts(xpath, element) do + [text] -> text + _ -> nil + end + end - _ -> + defp texts(xpath, element) do + case :xmerl_xpath.string(xpath, element) do + [] -> nil + + els -> + Enum.map(els, fn el -> + XML.xmlText(el, :value) |> List.to_string() |> String.trim() + end) end end diff --git a/lib/parser/jsonfeed.ex b/lib/parser/jsonfeed.ex index 8f031f7..7a18256 100644 --- a/lib/parser/jsonfeed.ex +++ b/lib/parser/jsonfeed.ex @@ -5,11 +5,21 @@ defmodule FeedParser.Parser.JSONFeed do @behaviour FeedParser.Parser + @mime_types [ + "application/json", + "application/feed+json" + ] + + @versions [ + "https://jsonfeed.org/version/1", + "https://jsonfeed.org/version/1.1" + ] + @impl FeedParser.Parser def accepts(data, content_type) do - with "application/json" <- content_type, + with true <- content_type in @mime_types, {:ok, json} <- Poison.decode(data), - %{"version" => "https://jsonfeed.org/version/1"} <- json do + %{"version" => v} when v in @versions <- json do {true, json} else _ -> @@ -23,13 +33,16 @@ defmodule FeedParser.Parser.JSONFeed do home_page_url = Map.get(json, "home_page_url") icon = Map.get(json, "icon") || Map.get(json, "favicon") + feed_author = authors_string(json) + items = Map.get(json, "items", []) |> Enum.map(fn item -> id = item["id"] url = - Map.get(item, "url") || if String.starts_with?(id, ~r/https?:\/\//), do: id, else: nil + Map.get(item, "url") || + if String.starts_with?(id, ["http://", "https://"]), do: id, else: nil title = Map.get(item, "title") @@ -45,13 +58,16 @@ defmodule FeedParser.Parser.JSONFeed do _ -> nil end + author = authors_string(item) || feed_author + %FeedParser.Item{ guid: id, url: url, links: [{url, nil}], title: title, content: content, - date: date + date: date, + creator: author } end) @@ -64,4 +80,20 @@ defmodule FeedParser.Parser.JSONFeed do items: items }} end + + defp authors_string(%{"author" => author}) do + author_name(author) + end + + defp authors_string(%{"authors" => authors}) do + authors + |> Enum.map(&author_name/1) + |> Enum.reject(&is_nil/1) + |> Enum.join(", ") + end + + defp authors_string(_), do: nil + + defp author_name(%{"name" => name}), do: name + defp author_name(_), do: nil end diff --git a/lib/parser/rss2.ex b/lib/parser/rss2.ex index 41c3358..8f3cbc6 100644 --- a/lib/parser/rss2.ex +++ b/lib/parser/rss2.ex @@ -59,13 +59,18 @@ defmodule FeedParser.Parser.RSS2 do _ -> nil end + # from Dublin Core extension: https://www.rssboard.org/rss-profile#namespace-elements-dublin + # todo: should this only be attempted if the xmlns:dc is defined? + creator = text('/item/dc:creator/text()', item) + %FeedParser.Item{ guid: guid, title: title, url: link, links: [{link, nil}], content: description, - date: pubDate + date: pubDate, + creator: creator } end) diff --git a/test/fixtures/atom/multi_author.xml b/test/fixtures/atom/multi_author.xml new file mode 100644 index 0000000..189448f --- /dev/null +++ b/test/fixtures/atom/multi_author.xml @@ -0,0 +1,23 @@ + + + + Example Feed + + 2003-12-13T18:30:02Z + + John Doe + + + Jane Doe + + urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 + + + Atom-Powered Robots Run Amok + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + Some text. + + + diff --git a/test/fixtures/jsonfeed/v1_1.json b/test/fixtures/jsonfeed/v1_1.json new file mode 100644 index 0000000..7d03d4c --- /dev/null +++ b/test/fixtures/jsonfeed/v1_1.json @@ -0,0 +1,18 @@ +{ + "version": "https://jsonfeed.org/version/1.1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "id": "2", + "content_text": "This is a second item.", + "url": "https://example.org/second-item" + }, + { + "id": "1", + "content_html": "

Hello, world!

", + "url": "https://example.org/initial-post" + } + ] +} diff --git a/test/fixtures/rss2/dc_creator.xml b/test/fixtures/rss2/dc_creator.xml new file mode 100644 index 0000000..2b87d3b --- /dev/null +++ b/test/fixtures/rss2/dc_creator.xml @@ -0,0 +1,25 @@ + + + + News and Politics - Slate Magazine + Slate RSS - News and Politics section + https://slate.com/news-and-politics + Fri, 03 Sep 2021 20:29:29 +0000 + http://blogs.law.harvard.edu/tech/rss + 2021 + Feed delivered by Clay + + ckt4s2oap0036b8kzx6zfh6r6 + <![CDATA[What Joe Manchin’s Op-Ed Lamenting the National Debt Is Really About]]> + https://slate.com/news-and-politics/2021/09/manchin-debt-oped-what-it-means.html?via=rss + Fri, 03 Sep 2021 20:08:14 GMT + https://slate.com/news-and-politics/2021/09/manchin-debt-oped-what-it-means.html + + Jim Newell + + + + + + + diff --git a/test/parser/atom_test.exs b/test/parser/atom_test.exs index b98f5e0..00dae3c 100644 --- a/test/parser/atom_test.exs +++ b/test/parser/atom_test.exs @@ -21,6 +21,7 @@ defmodule FeedParser.Parser.AtomTest do assert item.title == "Atom-Powered Robots Run Amok" assert item.url == "http://example.org/2003/12/13/atom03" assert item.date == ~U[2003-12-13 18:30:02Z] + assert item.creator == "John Doe" assert item.content == "Some text." end @@ -40,4 +41,11 @@ defmodule FeedParser.Parser.AtomTest do {"https://daringfireball.net/linked/2019/08/30/dorsey-twitter-account", "related"} ] end + + test "parses atom entry with multiple authors" do + data = File.read!("test/fixtures/atom/multi_author.xml") + {true, parsed_data} = Atom.accepts(data, "application/atom+xml") + assert {:ok, %FeedParser.Feed{items: items}} = Atom.parse_feed(parsed_data) + assert [%FeedParser.Item{creator: "John Doe, Jane Doe"}] = items + end end diff --git a/test/parser/jsonfeed_test.exs b/test/parser/jsonfeed_test.exs index ee5e5c9..d0c61b1 100644 --- a/test/parser/jsonfeed_test.exs +++ b/test/parser/jsonfeed_test.exs @@ -8,6 +8,11 @@ defmodule FeedParser.Parser.JSONFeedTest do assert {true, _} = JSONFeed.accepts(data, "application/json") end + test "matches v1.1 feed" do + data = File.read!("test/fixtures/jsonfeed/v1_1.json") + assert {true, _} = JSONFeed.accepts(data, "application/feed+json") + end + test "parses json feed" do data = File.read!("test/fixtures/jsonfeed/feed.json") {true, parsed_data} = JSONFeed.accepts(data, "application/json") @@ -23,4 +28,53 @@ defmodule FeedParser.Parser.JSONFeedTest do assert item1.content == "

Hello, world!

" assert item1.url == "https://example.org/initial-post" end + + test "parses item author" do + assert {:ok, %FeedParser.Feed{items: items}} = + JSONFeed.parse_feed(%{ + "title" => "test", + "items" => [ + %{ + "id" => "1", + "author" => %{ + "name" => "foo" + } + } + ] + }) + + assert [%FeedParser.Item{creator: "foo"}] = items + end + + test "falls back to feed author" do + assert {:ok, %FeedParser.Feed{items: items}} = + JSONFeed.parse_feed(%{ + "title" => "test", + "author" => %{ + "name" => "foo" + }, + "items" => [ + %{ + "id" => "1" + } + ] + }) + + assert [%FeedParser.Item{creator: "foo"}] = items + end + + test "handles multiple authors" do + assert {:ok, %FeedParser.Feed{items: items}} = + JSONFeed.parse_feed(%{ + "title" => "test", + "items" => [ + %{ + "id" => "1", + "authors" => [%{"name" => "foo"}, %{"name" => "bar"}] + } + ] + }) + + assert [%FeedParser.Item{creator: "foo, bar"}] = items + end end diff --git a/test/parser/rss2_test.exs b/test/parser/rss2_test.exs index 68b0409..6390818 100644 --- a/test/parser/rss2_test.exs +++ b/test/parser/rss2_test.exs @@ -28,4 +28,12 @@ defmodule FeedParser.Parser.RSS2Test do assert item.date == ~U[2003-06-03 09:39:21Z] assert item.guid == "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573" end + + test "parses rss2 item with dc:creator" do + data = File.read!("test/fixtures/rss2/dc_creator.xml") + {true, parsed_data} = RSS2.accepts(data, "application/rss+xml") + assert {:ok, %FeedParser.Feed{} = feed} = RSS2.parse_feed(parsed_data) + assert [%FeedParser.Item{} = item] = feed.items + assert item.creator == "Jim Newell" + end end