diff --git a/lib/mercury/cleaners/author.ex b/lib/mercury/cleaners/author.ex
new file mode 100644
--- /dev/null
+++ b/lib/mercury/cleaners/author.ex
@@ -0,0 +1,22 @@
+defmodule Mercury.Cleaners.Author do
+  import Mercury.Utils.Text, only: [normalize_spaces: 1]
+
+  @clean_author ~r/^\s*(posted |written )?by\s*:?\s*(.*)/i
+
+  @doc """
+  Take an author string (like 'by David Smith ') and clean it to just the name.
+
+  ## Examples
+
+      iex> Mercury.Cleaners.Author.clean_author("by David Smith")
+      "David Smith"
+  """
+  def clean_author(author) do
+    # Collapse whitespace first so the by-line prefix can match across
+    # original line breaks, then drop the prefix and trim the remainder.
+    author
+    |> normalize_spaces()
+    |> String.replace(@clean_author, "\\2")
+    |> String.trim()
+  end
+end
diff --git a/lib/mercury/utils/text.ex b/lib/mercury/utils/text.ex
new file mode 100644
--- /dev/null
+++ b/lib/mercury/utils/text.ex
@@ -0,0 +1,134 @@
+defmodule Mercury.Utils.Text do
+  @page_in_href ~r"(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})"i
+  @has_alpha ~r/[a-z]/i
+  @is_alpha ~r/^[a-z]+$/i
+  @is_digit ~r/^[0-9]+$/i
+  @encoding ~r/charset=([\w-]+)\b/
+  @default_encoding "utf-8"
+
+  @doc """
+  Take a URL and return the article base of said URL. That is, no
+  pagination data exists in it. Useful for comparing to other links
+  that might have pagination data within them.
+  """
+  def article_base_url(url), do: article_base_url(url, URI.parse(url))
+
+  def article_base_url(url, %URI{scheme: protocol, authority: host, path: path})
+      when is_binary(url) do
+    # A URL with no path (or a bare "/") has no segments to clean.
+    reversed_segments =
+      (path || "")
+      |> String.split("/", trim: true)
+      |> Enum.reverse()
+
+    # The first reversed segment is the last bit of the URL; whether it
+    # contains letters helps spot bare page-number segments like "/2/".
+    first_segment_has_letters =
+      Regex.match?(@has_alpha, List.first(reversed_segments) || "")
+
+    cleaned_segments =
+      reversed_segments
+      |> Enum.with_index()
+      |> Enum.reduce([], fn {segment, index}, acc ->
+        segment =
+          segment
+          |> strip_file_extension()
+          |> strip_page_markers(index)
+
+        # Prepending while walking the reversed list restores original order.
+        if good_segment?(segment, index, first_segment_has_letters) do
+          [segment | acc]
+        else
+          acc
+        end
+      end)
+      |> Enum.reject(&(&1 == ""))
+
+    URI.to_string(%URI{
+      authority: host,
+      scheme: protocol,
+      path: "/" <> Enum.join(cleaned_segments, "/")
+    })
+  end
+
+  # Split off anything that looks like a purely alphabetic file type
+  # ("story.html" -> "story"); other dotted segments are left alone.
+  defp strip_file_extension(segment) do
+    case String.split(segment, ".") do
+      [possible_segment, file_ext] ->
+        if Regex.match?(@is_alpha, file_ext), do: possible_segment, else: segment
+
+      _ ->
+        segment
+    end
+  end
+
+  # Remove anything that looks like a page number from the last two URL segments.
+  defp strip_page_markers(segment, index) when index < 2 do
+    Regex.replace(@page_in_href, segment, "")
+  end
+
+  defp strip_page_markers(segment, _index), do: segment
+
+  # Decide whether a URL segment survives into the article base URL.
+  defp good_segment?(segment, index, first_segment_has_letters) do
+    cond do
+      # A short first/second segment, when the final segment has no letters,
+      # is likely pagination noise.
+      index < 2 and String.length(segment) < 3 and !first_segment_has_letters ->
+        false
+
+      # A trailing "index" segment adds nothing.
+      index === 0 and String.downcase(segment) == "index" ->
+        false
+
+      # A short, purely numeric first/second segment is probably a page number.
+      index < 2 and Regex.match?(@is_digit, segment) and String.length(segment) < 3 ->
+        false
+
+      true ->
+        true
+    end
+  end
+
+  @doc """
+  Given a string, return true if it appears to have an ending sentence
+  within it, false otherwise.
+
+  ## Examples
+
+      iex> Mercury.Utils.Text.has_sentence_end("This is a sentence.")
+      true
+      iex> Mercury.Utils.Text.has_sentence_end("This isn't ")
+      false
+  """
+  def has_sentence_end(text) do
+    Regex.match?(~r/\.( |$)/, text)
+  end
+
+  @doc """
+  Normalizes multiple spaces to a single space and trims the resulting string.
+
+  ## Examples
+
+      iex> Mercury.Utils.Text.normalize_spaces(" hello world ")
+      "hello world"
+  """
+  def normalize_spaces(text) do
+    text
+    |> String.replace(~r/\s{2,}/, " ")
+    |> String.trim()
+  end
+
+  @doc """
+  Removes the anchor component of a URL.
+
+  ## Examples
+
+      iex> Mercury.Utils.Text.remove_anchor("https://example.com/foo/bar#baz")
+      "https://example.com/foo/bar"
+  """
+  def remove_anchor(url) do
+    URI.to_string(%URI{URI.parse(url) | fragment: nil})
+  end
+end
diff --git a/mix.exs b/mix.exs
index 8d6f25d..9c1212b 100644
--- a/mix.exs
+++ b/mix.exs
@@ -23,6 +23,7 @@ defmodule Mercury.MixProject do
     [
       # {:dep_from_hexpm, "~> 0.3.0"},
       # {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
+      {:timex, "~> 3.0"}
     ]
   end
 end
diff --git a/mix.lock b/mix.lock
new file mode 100644
--- /dev/null
+++ b/mix.lock
@@ -0,0 +1,14 @@
+%{
+  "certifi": {:hex, :certifi, "2.4.2", "75424ff0f3baaccfd34b1214184b6ef616d89e420b258bb0a5ea7d7bc628f7f0", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
+  "combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm"},
+  "gettext": {:hex, :gettext, "0.16.1", "e2130b25eebcbe02bb343b119a07ae2c7e28bd4b146c4a154da2ffb2b3507af2", [:mix], [], "hexpm"},
+  "hackney": {:hex, :hackney, "1.15.0", "287a5d2304d516f63e56c469511c42b016423bcb167e61b611f6bad47e3ca60e", [:rebar3], [{:certifi, "2.4.2", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
+  "idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
+  "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
+  "mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], [], "hexpm"},
+  "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
+  "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm"},
+  "timex": {:hex, :timex, "3.5.0", "b0a23167da02d0fe4f1a4e104d1f929a00d348502b52432c05de875d0b9cffa5", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"},
+  "tzdata": {:hex, :tzdata, "0.5.19", "7962a3997bf06303b7d1772988ede22260f3dae1bf897408ebdac2b4435f4e6a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
+  "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm"},
+}
diff --git a/test/mercury/cleaners/author_test.exs b/test/mercury/cleaners/author_test.exs
new file mode 100644
--- /dev/null
+++ b/test/mercury/cleaners/author_test.exs
@@ -0,0 +1,13 @@
+defmodule Mercury.Cleaners.AuthorTest do
+  use ExUnit.Case, async: true
+  doctest Mercury.Cleaners.Author
+  import Mercury.Cleaners.Author
+
+  test "removes the By from an author string" do
+    assert clean_author("by Bob Dylan") == "Bob Dylan"
+  end
+
+  test "trims trailing whitespace and line breaks" do
+    assert clean_author("\twritten by\n\tBob Dylan\n\t") == "Bob Dylan"
+  end
+end
diff --git a/test/mercury/utils/text_test.exs b/test/mercury/utils/text_test.exs
new file mode 100644
--- /dev/null
+++ b/test/mercury/utils/text_test.exs
@@ -0,0 +1,22 @@
+defmodule Mercury.Utils.TextTest do
+  use ExUnit.Case, async: true
+  doctest Mercury.Utils.Text
+  import Mercury.Utils.Text
+
+  test "returns the base url of a paginated url" do
+    url = "http://example.com/foo/bar/wow-cool/page=10"
+    cleaned = "http://example.com/foo/bar/wow-cool"
+    assert article_base_url(url) == cleaned
+  end
+
+  test "returns the same url if url has no pagination info" do
+    url = "http://example.com/foo/bar/wow-cool/"
+    cleaned = "http://example.com/foo/bar/wow-cool"
+    assert article_base_url(url) == cleaned
+  end
+
+  test "normalize spaces" do
+    assert normalize_spaces(" hello world ") == "hello world"
+    assert normalize_spaces("\thello\n\tworld\n") == "hello world"
+  end
+end