Some stuff

2019-02-11 22:16:32 -05:00 · 2019-02-11 22:16:32 -05:00 · 8663faff3e
commit 8663faff3e
parent 81375ede91
6 changed files with 189 additions and 0 deletions
--- a/lib/mercury/cleaners/author.ex
+++ b/lib/mercury/cleaners/author.ex
@ -0,0 +1,18 @@
+defmodule Mercury.Cleaners.Author do
+  @moduledoc """
+  Cleans up raw author/byline strings.
+  """
+
+  import Mercury.Utils.Text, only: [normalize_spaces: 1]
+
+  # Case-insensitive. Anchored at the start of the string: optional leading
+  # whitespace, an optional "posted " or "written ", the word "by", an
+  # optional colon, and then (capture group 2) whatever follows.
+  @clean_author ~r/^\s*(posted |written )?by\s*:?\s*(.*)/i
+
+  @doc """
+  Strips a leading byline marker (such as "by", "posted by" or "written by:")
+  from an author string and returns what is left, trimmed.
+
+  The input is first passed through `normalize_spaces/1`.
+
+  ## Examples
+
+      iex> Mercury.Cleaners.Author.clean_author("by David Smith")
+      "David Smith"
+  """
+  def clean_author(author) do
+    normalized = normalize_spaces(author)
+
+    # Keep only capture group 2, i.e. the text after the byline marker.
+    without_byline = Regex.replace(@clean_author, normalized, "\\2")
+
+    String.trim(without_byline)
+  end
+end
--- a/lib/mercury/utils/text.ex
+++ b/lib/mercury/utils/text.ex
@ -0,0 +1,122 @@
+defmodule Mercury.Utils.Text do
+  @moduledoc """
+  Text and URL helpers: pagination-free article base URLs, sentence-end
+  detection, whitespace normalization and URL anchor removal.
+  """
+
+  # An optional pagination keyword followed by "=" or "/" and a number of one
+  # to three digits, e.g. "page=10".
+  @page_in_href ~r"(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})"i
+  @has_alpha ~r/[a-z]/i
+  @is_alpha ~r/^[a-z]+$/i
+  @is_digit ~r/^[0-9]+$/i
+  # Not referenced anywhere in this module yet.
+  @encoding ~r/charset=([\w-]+)\b/
+  @default_encoding "utf-8"
+
+  @doc """
+  Parses `url` and returns its article base URL.
+
+  See `article_base_url/2` for the details of what is stripped.
+  """
+  @spec article_base_url(String.t()) :: String.t()
+  def article_base_url(url), do: article_base_url(url, URI.parse(url))
+
+  @doc """
+  Take a URL and return the article base of said URL. That is, no
+  pagination data exists in it. Useful for comparing to other links
+  that might have pagination data within them.
+
+  `parsed_url` is the `URI` struct for `url`; only its scheme, authority and
+  path are used, so any query string or fragment is dropped from the result.
+
+  A URL without any path segments (for example `"http://example.com"` or
+  `"http://example.com/"`) yields the scheme and authority followed by `"/"`.
+  """
+  @spec article_base_url(String.t(), URI.t()) :: String.t()
+  def article_base_url(url, %URI{scheme: protocol, authority: host, path: path})
+      when is_binary(url) do
+    # `path` is nil when the URL has no path component at all.
+    reversed_segments =
+      (path || "")
+      |> String.split("/", trim: true)
+      |> Enum.reverse()
+
+    # The first reversed segment is actually the last bit of the URL. Knowing
+    # whether it contains letters helps to recognise trailing segments that
+    # look like "/2/". With no segments at all there is nothing to inspect.
+    first_segment_has_letters =
+      case reversed_segments do
+        [last_url_segment | _] -> Regex.match?(@has_alpha, last_url_segment)
+        [] -> false
+      end
+
+    # Prepending while walking the reversed list restores the original order.
+    cleaned_segments =
+      reversed_segments
+      |> Enum.with_index()
+      |> Enum.reduce([], fn {segment, index}, acc ->
+        segment =
+          segment
+          |> strip_file_extension()
+          |> strip_page_marker(index)
+
+        if good_segment?(segment, index, first_segment_has_letters) do
+          [segment | acc]
+        else
+          acc
+        end
+      end)
+      # Stripping a page marker can leave an empty segment behind.
+      |> Enum.reject(&(&1 == ""))
+
+    URI.to_string(%URI{
+      authority: host,
+      scheme: protocol,
+      path: "/" <> Enum.join(cleaned_segments, "/")
+    })
+  end
+
+  # Splits off anything that looks like a file type ("story.html" -> "story").
+  # Only applies when the segment has exactly one "." and a purely alphabetic
+  # extension.
+  defp strip_file_extension(segment) do
+    case String.split(segment, ".") do
+      [possible_segment, file_ext] ->
+        if Regex.match?(@is_alpha, file_ext), do: possible_segment, else: segment
+
+      _ ->
+        segment
+    end
+  end
+
+  # Removes anything looking like a page number, but only from the last two
+  # segments of the URL (reversed index 0 or 1).
+  defp strip_page_marker(segment, index) when index < 2 do
+    Regex.replace(@page_in_href, segment, "")
+  end
+
+  defp strip_page_marker(segment, _index), do: segment
+
+  # Returns false for segments that should be dropped from the base URL.
+  defp good_segment?(segment, index, first_segment_has_letters) do
+    cond do
+      # If our first or second url_segment is smaller than 3 characters,
+      # and the first url_segment had no alphas, remove it.
+      index < 2 and String.length(segment) < 3 and not first_segment_has_letters -> false
+      # If this is the first url_segment and it's just index,
+      # remove it.
+      index === 0 and String.downcase(segment) == "index" -> false
+      # If this is purely a number, and it's the first or second url_segment,
+      # it's probably a page number. Remove it.
+      index < 2 and Regex.match?(@is_digit, segment) and String.length(segment) < 3 -> false
+      true -> true
+    end
+  end
+
+  @doc """
+  Given a string, return true if it appears to have an ending sentence within it, false otherwise.
+
+  A sentence end is a period followed by a space or by the end of the string.
+
+  ## Examples
+
+      iex> Mercury.Utils.Text.has_sentence_end("This is a sentence.")
+      true
+      iex> Mercury.Utils.Text.has_sentence_end("This isn't ")
+      false
+  """
+  @spec has_sentence_end(String.t()) :: boolean()
+  def has_sentence_end(text) do
+    Regex.match?(~r/\.( |$)/, text)
+  end
+
+  @doc """
+  Collapses every run of two or more whitespace characters into a single
+  space and trims the resulting string.
+
+  A lone whitespace character inside the text (for example a single tab or
+  newline between two words) is left as it is.
+
+  ## Examples
+
+      iex> Mercury.Utils.Text.normalize_spaces(" hello  world ")
+      "hello world"
+  """
+  @spec normalize_spaces(String.t()) :: String.t()
+  def normalize_spaces(text) do
+    text
+    |> String.replace(~r/\s{2,}/, " ")
+    |> String.trim()
+  end
+
+  @doc """
+  Removes the anchor component of a URL.
+
+  ## Examples
+
+      iex> Mercury.Utils.Text.remove_anchor("https://example.com/foo/bar#baz")
+      "https://example.com/foo/bar"
+  """
+  @spec remove_anchor(String.t()) :: String.t()
+  def remove_anchor(url) do
+    parsed = URI.parse(url)
+    URI.to_string(%URI{parsed | fragment: nil})
+  end
+end
--- a/mix.exs
+++ b/mix.exs
@ -23,6 +23,7 @@ defmodule Mercury.MixProject do
    [
      # {:dep_from_hexpm, "~> 0.3.0"},
      # {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
+      {:timex, "~> 3.0"}
    ]
  end
 end
--- a/mix.lock
+++ b/mix.lock
@ -0,0 +1,14 @@
+%{
+  "certifi": {:hex, :certifi, "2.4.2", "75424ff0f3baaccfd34b1214184b6ef616d89e420b258bb0a5ea7d7bc628f7f0", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
+  "combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm"},
+  "gettext": {:hex, :gettext, "0.16.1", "e2130b25eebcbe02bb343b119a07ae2c7e28bd4b146c4a154da2ffb2b3507af2", [:mix], [], "hexpm"},
+  "hackney": {:hex, :hackney, "1.15.0", "287a5d2304d516f63e56c469511c42b016423bcb167e61b611f6bad47e3ca60e", [:rebar3], [{:certifi, "2.4.2", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
+  "idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
+  "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
+  "mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], [], "hexpm"},
+  "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
+  "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm"},
+  "timex": {:hex, :timex, "3.5.0", "b0a23167da02d0fe4f1a4e104d1f929a00d348502b52432c05de875d0b9cffa5", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"},
+  "tzdata": {:hex, :tzdata, "0.5.19", "7962a3997bf06303b7d1772988ede22260f3dae1bf897408ebdac2b4435f4e6a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
+  "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm"},
+}
--- a/test/mercury/cleaners/author_tests.ex
+++ b/test/mercury/cleaners/author_tests.ex
@ -0,0 +1,13 @@
+defmodule AuthorTests do
+  use ExUnit.Case
+  doctest Mercury.Cleaners.Author
+  import Mercury.Cleaners.Author
+
+  # A plain "by " prefix is dropped and only the name is returned.
+  test "removes the By from an author string" do
+    raw = "by Bob Dylan"
+    expected = "Bob Dylan"
+
+    assert clean_author(raw) == expected
+  end
+
+  # Tabs and line breaks around the byline and the name must not survive.
+  test "trims trailing whitespace and line breaks" do
+    raw = "\twritten by\n\tBob Dylan\n\t"
+    expected = "Bob Dylan"
+
+    assert clean_author(raw) == expected
+  end
+end
--- a/test/mercury/utils/text_tests.ex
+++ b/test/mercury/utils/text_tests.ex
@ -0,0 +1,21 @@
+defmodule Mercury.Utils.TextTest do
+  use ExUnit.Case
+  doctest Mercury.Utils.Text
+
+  # The tests below call these functions unqualified, so they have to be
+  # imported explicitly; without this the module does not compile.
+  import Mercury.Utils.Text, only: [article_base_url: 1, normalize_spaces: 1]
+
+  # A trailing "page=N" segment is pagination data and must be stripped.
+  test "returns the base url of a paginated url" do
+    url = "http://example.com/foo/bar/wow-cool/page=10"
+    cleaned = "http://example.com/foo/bar/wow-cool"
+    assert article_base_url(url) == cleaned
+  end
+
+  # Without pagination data only the trailing slash is dropped.
+  test "returns the same url if url has no pagination info" do
+    url = "http://example.com/foo/bar/wow-cool/"
+    cleaned = "http://example.com/foo/bar/wow-cool"
+    assert article_base_url(url) == cleaned
+  end
+
+  # Runs of whitespace collapse to one space and the ends are trimmed.
+  test "normalize spaces" do
+    assert normalize_spaces(" hello  world ") == "hello world"
+    assert normalize_spaces("\thello\n\tworld\n") == "hello world"
+  end
+end