Add creator to Item

This commit is contained in:
Shadowfacts 2021-09-03 17:06:08 -04:00
parent b8de34c436
commit 13394e38f6
Signed by: shadowfacts
GPG Key ID: 94A5AB95422746E5
10 changed files with 199 additions and 12 deletions

View File

@ -3,7 +3,7 @@ defmodule FeedParser.Item do
An item in a feed. Has metadata and content from the item.
"""
defstruct [:guid, :url, :links, :title, :content, :date]
defstruct [:guid, :url, :links, :title, :content, :date, :creator]
@type t() :: %__MODULE__{
guid: String.t(),
@ -11,6 +11,7 @@ defmodule FeedParser.Item do
links: [{href :: String.t(), rel :: String.t() | nil}],
title: String.t() | nil,
content: String.t(),
date: DateTime.t()
date: DateTime.t(),
creator: String.t() | nil
}
end

View File

@ -33,6 +33,7 @@ defmodule FeedParser.Parser.Atom do
title = text('/feed/title/text()', feed)
link = attr('/feed/link/@href', feed)
icon = text('/feed/icon/text()', feed)
feed_author = texts('/feed/author/name/text()', feed)
updated =
text('/feed/updated/text()', feed)
@ -71,6 +72,8 @@ defmodule FeedParser.Parser.Atom do
_ -> nil
end
author = texts('/entry/author/name/text()', entry) || feed_author
content = text('/entry/content/text()', entry) || text('/entry/summary/text()', entry)
%FeedParser.Item{
@ -79,7 +82,8 @@ defmodule FeedParser.Parser.Atom do
url: url,
links: links,
content: content,
date: updated
date: updated,
creator: author |> Enum.join(", ")
}
end)
@ -94,12 +98,21 @@ defmodule FeedParser.Parser.Atom do
end
defp text(xpath, element) do
case :xmerl_xpath.string(xpath, element) do
[el] ->
XML.xmlText(el, :value) |> List.to_string() |> String.trim()
case texts(xpath, element) do
[text] -> text
_ -> nil
end
end
_ ->
# Evaluates `xpath` against `element` and returns the value of every matched
# text node as a trimmed string. Returns `nil` (not an empty list) when the
# XPath matches nothing.
defp texts(xpath, element) do
  nodes = :xmerl_xpath.string(xpath, element)

  if nodes == [] do
    nil
  else
    for node <- nodes do
      # xmerl stores text values as charlists; convert before trimming.
      chars = XML.xmlText(node, :value)
      String.trim(List.to_string(chars))
    end
  end
end

View File

@ -5,11 +5,21 @@ defmodule FeedParser.Parser.JSONFeed do
@behaviour FeedParser.Parser
# Content types that `accepts/2` treats as a possible JSON Feed document.
@mime_types [
"application/json",
"application/feed+json"
]
# Values of the top-level "version" field that `accepts/2` recognises.
@versions [
"https://jsonfeed.org/version/1",
"https://jsonfeed.org/version/1.1"
]
@impl FeedParser.Parser
def accepts(data, content_type) do
with "application/json" <- content_type,
with true <- content_type in @mime_types,
{:ok, json} <- Poison.decode(data),
%{"version" => "https://jsonfeed.org/version/1"} <- json do
%{"version" => v} when v in @versions <- json do
{true, json}
else
_ ->
@ -23,13 +33,16 @@ defmodule FeedParser.Parser.JSONFeed do
home_page_url = Map.get(json, "home_page_url")
icon = Map.get(json, "icon") || Map.get(json, "favicon")
feed_author = authors_string(json)
items =
Map.get(json, "items", [])
|> Enum.map(fn item ->
id = item["id"]
url =
Map.get(item, "url") || if String.starts_with?(id, ~r/https?:\/\//), do: id, else: nil
Map.get(item, "url") ||
if String.starts_with?(id, ["http://", "https://"]), do: id, else: nil
title = Map.get(item, "title")
@ -45,13 +58,16 @@ defmodule FeedParser.Parser.JSONFeed do
_ -> nil
end
author = authors_string(item) || feed_author
%FeedParser.Item{
guid: id,
url: url,
links: [{url, nil}],
title: title,
content: content,
date: date
date: date,
creator: author
}
end)
@ -64,4 +80,20 @@ defmodule FeedParser.Parser.JSONFeed do
items: items
}}
end
# Builds a display string for the author(s) of a feed or item map.
#
# A single "author" object is preferred when it carries a name; otherwise the
# "authors" list is consulted and the names found are joined with ", ".
#
# Returns `nil` when no name is available. It never returns "", because
# callers chain this with `||` to fall back to the feed-level author and an
# empty string would be truthy.
defp authors_string(%{} = json) do
  author_name(Map.get(json, "author")) || author_names(Map.get(json, "authors"))
end

defp authors_string(_), do: nil

# Joins the names found in a list of author objects, skipping entries without
# a usable name. Returns `nil` for a non-list or when no entry has a name.
defp author_names(authors) when is_list(authors) do
  names =
    authors
    |> Enum.map(&author_name/1)
    |> Enum.reject(&is_nil/1)

  case names do
    [] -> nil
    _ -> Enum.join(names, ", ")
  end
end

defp author_names(_), do: nil

# Extracts the name from one author object; `nil` when the name is absent,
# empty, or not a string.
defp author_name(%{"name" => name}) when is_binary(name) and name != "", do: name
defp author_name(_), do: nil
end

View File

@ -59,13 +59,18 @@ defmodule FeedParser.Parser.RSS2 do
_ -> nil
end
# from Dublin Core extension: https://www.rssboard.org/rss-profile#namespace-elements-dublin
# todo: should this only be attempted if the xmlns:dc is defined?
creator = text('/item/dc:creator/text()', item)
%FeedParser.Item{
guid: guid,
title: title,
url: link,
links: [{link, nil}],
content: description,
date: pubDate
date: pubDate,
creator: creator
}
end)

23
test/fixtures/atom/multi_author.xml vendored Normal file
View File

@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
</author>
<author>
<name>Jane Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
<entry>
<title>Atom-Powered Robots Run Amok</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<summary>Some text.</summary>
</entry>
</feed>

18
test/fixtures/jsonfeed/v1_1.json vendored Normal file
View File

@ -0,0 +1,18 @@
{
"version": "https://jsonfeed.org/version/1.1",
"title": "My Example Feed",
"home_page_url": "https://example.org/",
"feed_url": "https://example.org/feed.json",
"items": [
{
"id": "2",
"content_text": "This is a second item.",
"url": "https://example.org/second-item"
},
{
"id": "1",
"content_html": "<p>Hello, world!</p>",
"url": "https://example.org/initial-post"
}
]
}

25
test/fixtures/rss2/dc_creator.xml vendored Normal file
View File

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:mi="http://schemas.ingestion.microsoft.com/common/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/" xmlns:slate="https://slate.com">
<channel>
<title>News and Politics - Slate Magazine</title>
<description>Slate RSS - News and Politics section</description>
<link>https://slate.com/news-and-politics</link>
<lastBuildDate>Fri, 03 Sep 2021 20:29:29 +0000</lastBuildDate>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
<copyright>2021</copyright>
<generator>Feed delivered by Clay</generator>
<item>
<slate:id>ckt4s2oap0036b8kzx6zfh6r6</slate:id>
<title><![CDATA[What Joe Manchins Op-Ed Lamenting the National Debt Is Really About]]></title>
<link>https://slate.com/news-and-politics/2021/09/manchin-debt-oped-what-it-means.html?via=rss</link>
<pubDate>Fri, 03 Sep 2021 20:08:14 GMT</pubDate>
<guid isPermaLink="true">https://slate.com/news-and-politics/2021/09/manchin-debt-oped-what-it-means.html</guid>
<description><![CDATA[The Democrats are still stuck negotiating with themselves.]]></description>
<dc:creator>Jim Newell</dc:creator>
<media:content url="https://compote.slate.com/images/139b39c5-ef93-4c41-9404-288d067dbf73.jpeg?width=780&amp;height=520&amp;rect=7018x4679&amp;offset=0x0" medium="image">
<media:credit><![CDATA[Kevin Dietsch/Getty Images]]></media:credit>
<media:title type="html"><![CDATA[Sen. Joe Manchin leaves the U.S. Capitol following a vote on August 3, 2021 in Washington, DC.]]></media:title>
</media:content>
</item>
</channel>
</rss>

View File

@ -21,6 +21,7 @@ defmodule FeedParser.Parser.AtomTest do
assert item.title == "Atom-Powered Robots Run Amok"
assert item.url == "http://example.org/2003/12/13/atom03"
assert item.date == ~U[2003-12-13 18:30:02Z]
assert item.creator == "John Doe"
assert item.content == "Some text."
end
@ -40,4 +41,11 @@ defmodule FeedParser.Parser.AtomTest do
{"https://daringfireball.net/linked/2019/08/30/dorsey-twitter-account", "related"}
]
end
test "parses atom entry with multiple authors" do
  # The fixture declares two feed-level authors and no entry-level author, so
  # the entry's creator is expected to be both names joined with ", ".
  {true, document} =
    "test/fixtures/atom/multi_author.xml"
    |> File.read!()
    |> Atom.accepts("application/atom+xml")

  assert {:ok, %FeedParser.Feed{items: entries}} = Atom.parse_feed(document)
  assert [%FeedParser.Item{creator: "John Doe, Jane Doe"}] = entries
end
end

View File

@ -8,6 +8,11 @@ defmodule FeedParser.Parser.JSONFeedTest do
assert {true, _} = JSONFeed.accepts(data, "application/json")
end
test "matches v1.1 feed" do
  # A version 1.1 document served as application/feed+json must be accepted.
  result =
    "test/fixtures/jsonfeed/v1_1.json"
    |> File.read!()
    |> JSONFeed.accepts("application/feed+json")

  assert {true, _} = result
end
test "parses json feed" do
data = File.read!("test/fixtures/jsonfeed/feed.json")
{true, parsed_data} = JSONFeed.accepts(data, "application/json")
@ -23,4 +28,53 @@ defmodule FeedParser.Parser.JSONFeedTest do
assert item1.content == "<p>Hello, world!</p>"
assert item1.url == "https://example.org/initial-post"
end
test "parses item author" do
  # An item-level "author" object supplies the creator directly.
  feed = %{
    "title" => "test",
    "items" => [
      %{"id" => "1", "author" => %{"name" => "foo"}}
    ]
  }

  assert {:ok, %FeedParser.Feed{items: items}} = JSONFeed.parse_feed(feed)
  assert [%FeedParser.Item{creator: "foo"}] = items
end
test "falls back to feed author" do
  # The item has no author of its own, so the feed-level author is used.
  feed = %{
    "title" => "test",
    "author" => %{"name" => "foo"},
    "items" => [
      %{"id" => "1"}
    ]
  }

  assert {:ok, %FeedParser.Feed{items: items}} = JSONFeed.parse_feed(feed)
  assert [%FeedParser.Item{creator: "foo"}] = items
end
test "handles multiple authors" do
  # An "authors" list is flattened into one comma-separated creator string.
  item = %{
    "id" => "1",
    "authors" => [%{"name" => "foo"}, %{"name" => "bar"}]
  }

  feed = %{"title" => "test", "items" => [item]}

  assert {:ok, %FeedParser.Feed{items: items}} = JSONFeed.parse_feed(feed)
  assert [%FeedParser.Item{creator: "foo, bar"}] = items
end
end

View File

@ -28,4 +28,12 @@ defmodule FeedParser.Parser.RSS2Test do
assert item.date == ~U[2003-06-03 09:39:21Z]
assert item.guid == "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573"
end
test "parses rss2 item with dc:creator" do
  # The fixture's single item carries a Dublin Core <dc:creator> element.
  {true, document} =
    "test/fixtures/rss2/dc_creator.xml"
    |> File.read!()
    |> RSS2.accepts("application/rss+xml")

  assert {:ok, %FeedParser.Feed{} = parsed} = RSS2.parse_feed(document)
  assert [%FeedParser.Item{} = entry] = parsed.items
  assert entry.creator == "Jim Newell"
end
end