# mercury_ex/lib/mercury/utils/text.ex

defmodule Mercury.Utils.Text do
  @moduledoc """
  Small text and URL helpers: computing the pagination-free "article base"
  of a URL, detecting sentence endings, collapsing whitespace runs and
  stripping URL fragments.
  """

  # Matches page markers such as "page=2", "p=3", "pg/4" or a bare "=12".
  @page_in_href ~r"(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})"i
  @has_alpha ~r/[a-z]/i
  @is_alpha ~r/^[a-z]+$/i
  @is_digit ~r/^[0-9]+$/i

  @doc """
  Takes a URL and returns its article base, i.e. the URL with anything that
  looks like pagination data, a file extension or a trailing `index`
  removed. Query string and fragment are dropped.

  Useful for comparing against other links that might carry pagination data.
  URLs without a path (or with only `/`) resolve to the site root.

  ## Examples

      iex> Mercury.Utils.Text.article_base_url("http://example.com/foo/bar/wow-cool/page=10")
      "http://example.com/foo/bar/wow-cool"
      iex> Mercury.Utils.Text.article_base_url("https://example.com")
      "https://example.com/"
  """
  @spec article_base_url(String.t()) :: String.t()
  def article_base_url(url), do: article_base_url(url, URI.parse(url))

  @doc """
  Same as `article_base_url/1`, but reuses an already parsed `URI` for `url`
  instead of parsing it again. Only the scheme, authority and path of the
  parsed URI are used.
  """
  @spec article_base_url(String.t(), URI.t()) :: String.t()
  def article_base_url(url, %URI{scheme: protocol, authority: host, path: path})
      when is_binary(url) do
    # `URI.parse/1` yields a nil path for URLs such as "https://example.com";
    # treat that the same as an empty path instead of crashing.
    reversed_segments =
      (path || "")
      |> String.split("/", trim: true)
      |> Enum.reverse()

    # The first reversed segment is actually the last bit of the URL. Knowing
    # whether it contains letters helps to spot URLs that end in something
    # like "/2/". With no segments at all there is nothing to inspect.
    first_segment_has_letters =
      case reversed_segments do
        [last_url_segment | _] -> Regex.match?(@has_alpha, last_url_segment)
        [] -> false
      end

    cleaned_segments =
      reversed_segments
      |> Enum.with_index()
      |> Enum.flat_map(fn {raw_segment, index} ->
        segment =
          raw_segment
          |> strip_file_extension()
          |> strip_page_number(index)

        # Segments emptied by the cleanup above, or judged to be pagination
        # noise, are left out of the result.
        if segment != "" and good_segment?(segment, index, first_segment_has_letters) do
          [segment]
        else
          []
        end
      end)
      # Restore the original left-to-right order of the URL.
      |> Enum.reverse()

    URI.to_string(%URI{
      authority: host,
      scheme: protocol,
      path: "/" <> Enum.join(cleaned_segments, "/")
    })
  end

  # Drops anything that looks like a file type ("story.html" -> "story").
  # Only segments with exactly one dot and a purely alphabetic suffix qualify.
  defp strip_file_extension(segment) do
    case String.split(segment, ".") do
      [name, file_ext] ->
        if Regex.match?(@is_alpha, file_ext), do: name, else: segment

      _ ->
        segment
    end
  end

  # Removes anything looking like a page number, but only from the last two
  # URL segments (indexes 0 and 1 of the reversed list).
  defp strip_page_number(segment, index) when index < 2 do
    Regex.replace(@page_in_href, segment, "")
  end

  defp strip_page_number(segment, _index), do: segment

  # Decides whether a cleaned segment should be kept in the base URL.
  defp good_segment?(segment, index, first_segment_has_letters) do
    cond do
      # If our first or second url_segment is smaller than 3 characters,
      # and the first url_segment had no alphas, remove it.
      index < 2 and String.length(segment) < 3 and not first_segment_has_letters -> false
      # If this is the first url_segment and it's just index,
      # remove it.
      index === 0 and String.downcase(segment) == "index" -> false
      # If this is purely a number, and it's the first or second url_segment,
      # it's probably a page number. Remove it.
      index < 2 and Regex.match?(@is_digit, segment) and String.length(segment) < 3 -> false
      true -> true
    end
  end

  @doc """
  Given a string, return true if it appears to have an ending sentence within it, false otherwise.

  A sentence end is a period followed by a space or by the end of the string.

  ## Examples

      iex> Mercury.Utils.Text.has_sentence_end("This is a sentence.")
      true
      iex> Mercury.Utils.Text.has_sentence_end("This isn't ")
      false
  """
  @spec has_sentence_end(String.t()) :: boolean()
  def has_sentence_end(text) do
    Regex.match?(~r/\.( |$)/, text)
  end

  @doc """
  Collapses every run of two or more whitespace characters into a single
  space and trims the resulting string.

  ## Examples

      iex> Mercury.Utils.Text.normalize_spaces(" hello  world ")
      "hello world"
  """
  @spec normalize_spaces(String.t()) :: String.t()
  def normalize_spaces(text) do
    text
    |> String.replace(~r/\s{2,}/, " ")
    |> String.trim()
  end

  @doc """
  Removes the anchor (fragment) component of a URL.

  ## Examples

      iex> Mercury.Utils.Text.remove_anchor("https://example.com/foo/bar#baz")
      "https://example.com/foo/bar"
  """
  @spec remove_anchor(String.t()) :: String.t()
  def remove_anchor(url) do
    URI.to_string(%URI{URI.parse(url) | fragment: nil})
  end
end