commit 42a3b5344c62615b0895c79e29a4d10d7349e25d
Author: Shadowfacts
Date: Sat Aug 31 19:02:41 2019 -0400

    Initial commit

diff --git a/.formatter.exs b/.formatter.exs
new file mode 100644
index 0000000..d2cda26
--- /dev/null
+++ b/.formatter.exs
@@ -0,0 +1,4 @@
+# Used by "mix format"
+[
+  inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
+]
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..176f1b7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,31 @@
+# The directory Mix will write compiled artifacts to.
+/_build/
+
+# If you run "mix test --cover", coverage assets end up here.
+/cover/
+
+# The directory Mix downloads your dependencies sources to.
+/deps/
+
+# Where 3rd-party dependencies like ExDoc output generated docs.
+/doc/
+
+# Ignore .fetch files in case you like to edit your project deps locally.
+/.fetch
+
+# If the VM crashes, it generates a dump, let's ignore it too.
+erl_crash.dump
+
+# Also ignore archive artifacts (built via "mix archive.build").
+*.ez
+
+# Ignore package tarball (built via "mix hex.build").
+feed_parser-*.tar
+
+# Files matching config/*.secret.exs pattern contain sensitive
+# data and you should not commit them into version control.
+#
+# Alternatively, you may comment the line below and commit the
+# secrets files as long as you replace their contents by environment
+# variables.
+/config/*.secret.exs
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..eacca45
--- /dev/null
+++ b/README.md
@@ -0,0 +1,21 @@
+# FeedParser
+
+An Elixir library for parsing RSS 2.0, Atom, and JSON Feed documents into a common feed structure.
+
+## Installation
+
+If [available in Hex](https://hex.pm/docs/publish), the package can be installed
+by adding `feed_parser` to your list of dependencies in `mix.exs`:
+
+```elixir
+def deps do
+  [
+    {:feed_parser, "~> 0.1.0"}
+  ]
+end
+```
+
+Documentation can be generated with [ExDoc](https://github.com/elixir-lang/ex_doc)
+and published on [HexDocs](https://hexdocs.pm).
Once published, the docs can
+be found at [https://hexdocs.pm/feed_parser](https://hexdocs.pm/feed_parser).
+
diff --git a/lib/atom/parser.ex b/lib/atom/parser.ex
new file mode 100644
index 0000000..fd7bb2e
--- /dev/null
+++ b/lib/atom/parser.ex
@@ -0,0 +1,114 @@
+defmodule FeedParser.Atom.Parser do
+  @moduledoc """
+  `FeedParser.Parser` implementation for Atom feeds.
+  """
+
+  alias FeedParser.XML
+  require XML
+
+  @behaviour FeedParser.Parser
+
+  @doc """
+  Accepts `data` when `content_type` is `"application/atom+xml"`, or when it
+  is a generic XML type and the root element is `feed`.
+
+  Returns `{true, doc}` with the parsed XML document, or `false`.
+  """
+  @impl FeedParser.Parser
+  def accepts(data, content_type) do
+    case content_type do
+      "application/atom+xml" ->
+        # `FeedParser.parse_feed/3` only recognises `{true, result}`; the bare
+        # `true` returned here before made it skip this parser entirely.
+        {true, XML.parse(data)}
+
+      _ when content_type in ["text/xml", "application/xml"] ->
+        doc = XML.parse(data)
+
+        if XML.xmlElement(doc, :name) == :feed do
+          {true, doc}
+        else
+          false
+        end
+
+      _ ->
+        false
+    end
+  end
+
+  @doc """
+  Builds a `FeedParser.Feed` from a parsed Atom document.
+
+  Each `/feed/entry` becomes a `FeedParser.Item` whose content is taken from
+  `content`, falling back to `summary`. Link URLs prefer the `rel="alternate"`
+  link. Other values whose query does not match exactly one node, and
+  `updated` values that do not parse as extended ISO 8601, are `nil`.
+  """
+  @impl FeedParser.Parser
+  def parse_feed(feed) do
+    items =
+      '/feed/entry'
+      |> :xmerl_xpath.string(feed)
+      |> Enum.map(&parse_entry/1)
+
+    {:ok,
+     %FeedParser.Feed{
+       site_url: link('/feed/link', feed),
+       title: text('/feed/title/text()', feed),
+       image_url: text('/feed/icon/text()', feed),
+       items: items
+     }}
+  end
+
+  # Converts one `entry` element into a `FeedParser.Item`.
+  defp parse_entry(entry) do
+    %FeedParser.Item{
+      guid: text('/entry/id/text()', entry),
+      title: text('/entry/title/text()', entry),
+      url: link('/entry/link', entry),
+      content: text('/entry/content/text()', entry) || text('/entry/summary/text()', entry),
+      date: parse_date(text('/entry/updated/text()', entry))
+    }
+  end
+
+  # Parses an extended ISO 8601 timestamp; `nil` for missing or invalid input.
+  defp parse_date(nil), do: nil
+
+  defp parse_date(value) do
+    case Timex.parse(value, "{ISO:Extended}") do
+      {:ok, date} -> date
+      _ -> nil
+    end
+  end
+
+  # Returns the `href` of the `rel="alternate"` link matched by `xpath`, falling
+  # back to the `href` of the only link, so an additional link with another
+  # `rel` (such as `self`) does not leave the URL empty.
+  defp link(xpath, element) do
+    attr(xpath ++ '[@rel="alternate"]/@href', element) || attr(xpath ++ '/@href', element)
+  end
+
+  # Returns the trimmed value of the single text node matched by `xpath`, or
+  # `nil` when the query matches zero or several nodes.
+  defp text(xpath, element) do
+    case :xmerl_xpath.string(xpath, element) do
+      [el] ->
+        XML.xmlText(el, :value) |> List.to_string() |> String.trim()
+
+      _ ->
+        nil
+    end
+  end
+
+  # Returns the trimmed value of the single attribute matched by `xpath`, or
+  # `nil` when the query matches zero or several attributes.
+  defp attr(xpath, element) do
+    case :xmerl_xpath.string(xpath, element) do
+      [attr] ->
+        XML.xmlAttribute(attr, :value) |> List.to_string() |> String.trim()
+
+      _ ->
+        nil
+    end
+  end
+end
diff --git a/lib/feed.ex b/lib/feed.ex
new file mode 100644
index 0000000..c9db358
--- /dev/null
+++
b/lib/feed.ex
@@ -0,0 +1,14 @@
+defmodule FeedParser.Feed do
+  @moduledoc """
+  A parsed feed: its site URL, title, optional image URL and items.
+  """
+
+  defstruct [:site_url, :title, :image_url, :items]
+
+  @type t() :: %__MODULE__{
+          site_url: String.t() | nil,
+          title: String.t() | nil,
+          image_url: String.t() | nil,
+          items: [FeedParser.Item.t()]
+        }
+end
diff --git a/lib/feed_parser.ex b/lib/feed_parser.ex
new file mode 100644
index 0000000..87005b3
--- /dev/null
+++ b/lib/feed_parser.ex
@@ -0,0 +1,36 @@
+defmodule FeedParser do
+  @moduledoc """
+  Parses feed documents by delegating to the first parser that accepts them.
+  """
+
+  @default_parsers [FeedParser.RSS2.Parser, FeedParser.Atom.Parser, FeedParser.JSONFeed.Parser]
+
+  @doc """
+  Parses `data` with the first of `parsers` whose `accepts/2` callback returns
+  `{true, result}` for `data` and `content_type`.
+
+  Returns that parser's `parse_feed/1` result, or `{:error, reason}` when no
+  parser accepts the input.
+  """
+  @spec parse_feed(data :: String.t(), content_type :: String.t(), parsers :: [module()]) ::
+          {:ok, feed :: FeedParser.Feed.t()} | {:error, reason :: String.t()}
+  def parse_feed(data, content_type, parsers \\ @default_parsers) when is_binary(data) do
+    parsers
+    |> Enum.reduce_while(false, fn parser, acc ->
+      case parser.accepts(data, content_type) do
+        {true, result} ->
+          {:halt, {parser, result}}
+
+        _ ->
+          {:cont, acc}
+      end
+    end)
+    |> case do
+      {parser, result} ->
+        parser.parse_feed(result)
+
+      false ->
+        {:error, "no parser matched the given content type and data"}
+    end
+  end
+end
diff --git a/lib/item.ex b/lib/item.ex
new file mode 100644
index 0000000..0a14fb1
--- /dev/null
+++ b/lib/item.ex
@@ -0,0 +1,15 @@
+defmodule FeedParser.Item do
+  @moduledoc """
+  A single entry of a `FeedParser.Feed`.
+  """
+
+  defstruct [:guid, :url, :title, :content, :date]
+
+  @type t() :: %__MODULE__{
+          guid: String.t() | nil,
+          url: String.t() | nil,
+          title: String.t() | nil,
+          content: String.t() | nil,
+          date: DateTime.t() | nil
+        }
+end
diff --git a/lib/jsonfeed/parser.ex b/lib/jsonfeed/parser.ex
new file mode 100644
index 0000000..d96488a
--- /dev/null
+++ b/lib/jsonfeed/parser.ex
@@ -0,0 +1,90 @@
+defmodule FeedParser.JSONFeed.Parser do
+  @moduledoc """
+  `FeedParser.Parser` implementation for JSON Feed version 1 documents.
+  """
+
+  @behaviour FeedParser.Parser
+
+  @doc """
+  Accepts `data` when `content_type` is `"application/json"` and `data`
+  decodes to a JSON object whose `"version"` is the JSON Feed version 1 URL.
+
+  Returns `{true, json}` with the decoded document, or `false`.
+  """
+  @impl FeedParser.Parser
+  def accepts(data, content_type) do
+    with "application/json" <- content_type,
+         {:ok, json} <- Poison.decode(data),
+         %{"version" => "https://jsonfeed.org/version/1"} <- json do
+      {true, json}
+    else
+      _ ->
+        false
+    end
+  end
+
+  @doc """
+  Builds a `FeedParser.Feed` from a decoded JSON Feed document.
+
+  For each item, the URL falls back to the `"id"` when that is an HTTP(S) URL,
+  the content is the first of `"content_html"`, `"content_text"` and
+  `"summary"`, and the date is the first of `"date_published"`,
+  `"date_modified"` and `"date_updated"` (`nil` when none is a valid RFC 3339
+  string).
+  """
+  @impl FeedParser.Parser
+  def parse_feed(json) do
+    items =
+      json
+      |> Map.get("items", [])
+      |> Enum.map(&parse_item/1)
+
+    {:ok,
+     %FeedParser.Feed{
+       site_url: Map.get(json, "home_page_url"),
+       title: json["title"],
+       image_url: Map.get(json, "icon") || Map.get(json, "favicon"),
+       items: items
+     }}
+  end
+
+  # Converts one decoded item object into a `FeedParser.Item`.
+  defp parse_item(item) do
+    id = item["id"]
+
+    # The JSON Feed spec names the second date field "date_modified";
+    # "date_updated" is still read because this parser originally used it.
+    date =
+      Map.get(item, "date_published") || Map.get(item, "date_modified") ||
+        Map.get(item, "date_updated")
+
+    %FeedParser.Item{
+      guid: id,
+      url: Map.get(item, "url") || url_from_id(id),
+      title: Map.get(item, "title"),
+      content:
+        Map.get(item, "content_html") || Map.get(item, "content_text") ||
+          Map.get(item, "summary"),
+      date: parse_date(date)
+    }
+  end
+
+  # Uses the item id as its URL only when it is a string with an HTTP(S)
+  # scheme. `String.starts_with?/2` takes string prefixes, not a regex, so the
+  # regex passed before raised for every item without a `"url"`.
+  defp url_from_id(id) when is_binary(id) do
+    if String.starts_with?(id, ["http://", "https://"]), do: id, else: nil
+  end
+
+  defp url_from_id(_id), do: nil
+
+  # Parses an RFC 3339 timestamp; `nil` for missing or invalid input.
+  defp parse_date(value) when is_binary(value) do
+    case Timex.parse(value, "{RFC3339}") do
+      {:ok, date} -> date
+      _ -> nil
+    end
+  end
+
+  defp parse_date(_value), do: nil
+end
diff --git a/lib/parser.ex b/lib/parser.ex
new file mode 100644
index 0000000..c718910
--- /dev/null
+++ b/lib/parser.ex
@@ -0,0 +1,17 @@
+defmodule FeedParser.Parser do
+  @moduledoc """
+  Behaviour implemented by the format-specific feed parsers.
+  """
+
+  @doc """
+  Returns `{true, result}` when the parser can handle `data`, where `result`
+  is what will be passed to `parse_feed/1`; returns `false` otherwise.
+  """
+  @callback accepts(data :: String.t(), content_type :: String.t()) :: {true, any()} | false
+
+  @doc """
+  Turns the `result` of a successful `accepts/2` into a feed.
+  """
+  @callback parse_feed(data :: any()) ::
+              {:ok, feed :: FeedParser.Feed.t()} | {:error, reason :: String.t()}
+end
diff --git a/lib/rss2/parser.ex b/lib/rss2/parser.ex
new file mode 100644
index 0000000..0047754
--- /dev/null
+++ b/lib/rss2/parser.ex
@@ -0,0 +1,100 @@
+defmodule FeedParser.RSS2.Parser do
+  @moduledoc """
+  `FeedParser.Parser` implementation for RSS 2.0 feeds.
+  """
+
+  alias FeedParser.XML
+  require XML
+
+  @behaviour FeedParser.Parser
+
+  @doc """
+  Accepts `data` when `content_type` is `"application/rss+xml"`, or when it is
+  a generic XML type and the root element is `rss`.
+
+  Returns `{true, doc}` with the parsed XML document, or `false`.
+  """
+  @impl FeedParser.Parser
+  def accepts(data, content_type) do
+    case content_type do
+      "application/rss+xml" ->
+        {true, XML.parse(data)}
+
+      _ when content_type in ["text/xml", "application/xml"] ->
+        doc = XML.parse(data)
+
+        if XML.xmlElement(doc, :name) == :rss do
+          {true, doc}
+        else
+          false
+        end
+
+      _ ->
+        false
+    end
+  end
+
+  @doc """
+  Builds a `FeedParser.Feed` from a parsed RSS document.
+
+  Returns `{:error, reason}` when the document does not contain exactly one
+  `/rss/channel` element. Values whose query does not match exactly one text
+  node, and `pubDate` values that do not parse as RFC 1123, are `nil`.
+  """
+  @impl FeedParser.Parser
+  def parse_feed(rss) do
+    case :xmerl_xpath.string('/rss/channel', rss) do
+      [channel] ->
+        items =
+          '/channel/item'
+          |> :xmerl_xpath.string(channel)
+          |> Enum.map(&parse_item/1)
+
+        {:ok,
+         %FeedParser.Feed{
+           site_url: text('/channel/link/text()', channel),
+           title: text('/channel/title/text()', channel),
+           image_url: text('/channel/image/url/text()', channel),
+           items: items
+         }}
+
+      _ ->
+        # Report a malformed document through the callback's error tuple
+        # rather than raising a MatchError.
+        {:error, "expected exactly one channel element in the RSS document"}
+    end
+  end
+
+  # Converts one `item` element into a `FeedParser.Item`.
+  defp parse_item(item) do
+    %FeedParser.Item{
+      guid: text('/item/guid/text()', item),
+      title: text('/item/title/text()', item),
+      url: text('/item/link/text()', item),
+      content: text('/item/description/text()', item),
+      date: parse_date(text('/item/pubDate/text()', item))
+    }
+  end
+
+  # Parses an RFC 1123 timestamp; `nil` for missing or invalid input.
+  defp parse_date(nil), do: nil
+
+  defp parse_date(value) do
+    case Timex.parse(value, "{RFC1123}") do
+      {:ok, date} -> date
+      _ -> nil
+    end
+  end
+
+  # Returns the trimmed value of the single text node matched by `xpath`, or
+  # `nil` when the query matches zero or several nodes.
+  defp text(xpath, element) do
+    case :xmerl_xpath.string(xpath, element) do
+      [el] ->
+        XML.xmlText(el, :value) |> List.to_string() |> String.trim()
+
+      _ ->
+        nil
+    end
+  end
+end
diff --git a/lib/xml.ex b/lib/xml.ex
new file mode 100644
index 0000000..aacc3fd
--- /dev/null
+++ b/lib/xml.ex
@@ -0,0 +1,24 @@
+defmodule FeedParser.XML do
+  @moduledoc """
+  Record definitions and a parse helper for working with `:xmerl` documents.
+  """
+
+  import Record
+
+  defrecord :xmlElement, extract(:xmlElement, from_lib: "xmerl/include/xmerl.hrl")
+  defrecord :xmlAttribute, extract(:xmlAttribute, from_lib: "xmerl/include/xmerl.hrl")
+  defrecord :xmlText, extract(:xmlText, from_lib: "xmerl/include/xmerl.hrl")
+
+  @doc """
+  Parses `data` with `:xmerl_scan.string/1` and returns the root element record.
+  """
+  @spec parse(data :: String.t()) :: tuple()
+  def parse(data) do
+    {doc, _} =
+      data
+      |> :binary.bin_to_list()
+      |> :xmerl_scan.string()
+
+    doc
+  end
+end
diff --git a/mix.exs b/mix.exs
new file mode 100644
index 0000000..9c9dd43
--- /dev/null
+++ b/mix.exs
@@ -0,0 +1,30 @@
+defmodule FeedParser.MixProject do
+  use Mix.Project
+
+  def project do
+    [
+      app: :feed_parser,
+      version: "0.1.0",
+      elixir: "~> 1.9",
+      start_permanent: Mix.env() == :prod,
+      deps: deps()
+    ]
+  end
+
+  # Run "mix help compile.app" to learn about applications.
+  def application do
+    [
+      # :xmerl is an OTP application the XML parsers call; list it so it is
+      # included as a runtime dependency (e.g. in releases).
+      extra_applications: [:logger, :xmerl]
+    ]
+  end
+
+  # Run "mix help deps" to learn about dependencies.
+  defp deps do
+    [
+      {:timex, "~> 3.6.1"},
+      {:poison, "~> 4.0.1"}
+    ]
+  end
+end
diff --git a/mix.lock b/mix.lock
new file mode 100644
index 0000000..71ca143
--- /dev/null
+++ b/mix.lock
@@ -0,0 +1,16 @@
+%{
+  "certifi": {:hex, :certifi, "2.5.1", "867ce347f7c7d78563450a18a6a28a8090331e77fa02380b4a21962a65d36ee5", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
+  "combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm"},
+  "gettext": {:hex, :gettext, "0.17.0", "abe21542c831887a2b16f4c94556db9c421ab301aee417b7c4fbde7fbdbe01ec", [:mix], [], "hexpm"},
+  "hackney": {:hex, :hackney, "1.15.1", "9f8f471c844b8ce395f7b6d8398139e26ddca9ebc171a8b91342ee15a19963f4", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
+  "idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
+  "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
+  "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm"},
+  "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1",
[:rebar3], [], "hexpm"},
+  "poison": {:hex, :poison, "4.0.1", "bcb755a16fac91cad79bfe9fc3585bb07b9331e50cfe3420a24bcc2d735709ae", [:mix], [], "hexpm"},
+  "saxy": {:hex, :saxy, "0.10.0", "38879f46a595862c22114792c71379355ecfcfa0f713b1cfcc59e1d4127f1f55", [:mix], [], "hexpm"},
+  "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm"},
+  "timex": {:hex, :timex, "3.6.1", "efdf56d0e67a6b956cc57774353b0329c8ab7726766a11547e529357ffdc1d56", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5 or ~> 1.0.0", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"},
+  "tzdata": {:hex, :tzdata, "1.0.1", "f6027a331af7d837471248e62733c6ebee86a72e57c613aa071ebb1f750fc71a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
+  "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm"},
+}
diff --git a/test/feed_parser_test.exs b/test/feed_parser_test.exs
new file mode 100644
index 0000000..37e34c7
--- /dev/null
+++ b/test/feed_parser_test.exs
@@ -0,0 +1,34 @@
+defmodule FeedParserTest do
+  use ExUnit.Case, async: true
+  doctest FeedParser
+
+  # Minimal JSON Feed document; the item carries its own URL and date.
+  @json_feed """
+  {
+    "version": "https://jsonfeed.org/version/1",
+    "title": "Example",
+    "home_page_url": "https://example.com/",
+    "items": [
+      {
+        "id": "1",
+        "url": "https://example.com/1",
+        "content_text": "Hello",
+        "date_published": "2019-08-31T19:02:41-04:00"
+      }
+    ]
+  }
+  """
+
+  describe "parse_feed/3" do
+    test "parses a JSON Feed document" do
+      assert {:ok, %FeedParser.Feed{title: "Example", items: [item]}} =
+               FeedParser.parse_feed(@json_feed, "application/json")
+
+      assert %FeedParser.Item{guid: "1", url: "https://example.com/1", content: "Hello"} = item
+    end
+
+    test "returns an error when no parser accepts the input" do
+      assert {:error, _reason} = FeedParser.parse_feed("hello", "text/plain")
+    end
+  end
+end
diff --git a/test/test_helper.exs b/test/test_helper.exs
new file mode 100644
index 0000000..869559e
--- /dev/null
+++ b/test/test_helper.exs
@@ -0,0 +1 @@
+ExUnit.start()