defmodule Mercury.Utils.Text do
  @moduledoc """
  Text and URL helpers: pagination-free article base URLs, sentence-end
  detection, whitespace normalization and anchor removal.
  """

  @page_in_href ~r"(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})"i
  @has_alpha ~r/[a-z]/i
  @is_alpha ~r/^[a-z]+$/i
  @is_digit ~r/^[0-9]+$/i
  # The two attributes below are not referenced by any function in this module.
  @encoding ~r/charset=([\w-]+)\b/
  @default_encoding "utf-8"

  @doc """
  Take a URL and return the article base of said URL. That is, no pagination
  data exists in it. Useful for comparing to other links that might have
  pagination data within them.

  The URL is parsed with `URI.parse/1` and handed to `article_base_url/2`.

  ## Examples

      iex> Mercury.Utils.Text.article_base_url("http://example.com/foo/bar/2")
      "http://example.com/foo/bar"

      iex> Mercury.Utils.Text.article_base_url("http://example.com/foo/index.html")
      "http://example.com/foo"

      iex> Mercury.Utils.Text.article_base_url("http://example.com")
      "http://example.com/"

  """
  @spec article_base_url(String.t()) :: String.t()
  def article_base_url(url), do: article_base_url(url, URI.parse(url))

  @doc """
  Same as `article_base_url/1`, but takes the already-parsed `URI` struct.

  `url` must be a binary. Only the `scheme`, `authority` and `path` fields of
  the struct are used to build the result. A missing or empty path yields
  `scheme://authority/`.
  """
  @spec article_base_url(String.t(), URI.t()) :: String.t()
  def article_base_url(url, %URI{scheme: protocol, authority: host, path: path})
      when is_binary(url) do
    # `URI.parse/1` leaves `path` as nil when the URL has no path component,
    # so fall back to an empty path instead of crashing in `String.split/3`.
    reversed_segments =
      (path || "")
      |> String.split("/", trim: true)
      |> Enum.reverse()

    # The first reversed segment is actually the last bit of the URL. Knowing
    # whether it contains letters helps to spot URLs ending in something like
    # "/2/". With no segments at all there is nothing to inspect.
    first_segment_has_letters =
      case reversed_segments do
        [first | _] -> Regex.match?(@has_alpha, first)
        [] -> false
      end

    cleaned_segments =
      reversed_segments
      |> Enum.with_index()
      |> Enum.reduce([], fn {raw_segment, index}, acc ->
        segment = clean_segment(raw_segment, index)

        # Prepending while walking the reversed list restores URL order.
        if good_segment?(segment, index, first_segment_has_letters) do
          [segment | acc]
        else
          acc
        end
      end)
      |> Enum.reject(&(&1 == ""))

    URI.to_string(%URI{
      authority: host,
      scheme: protocol,
      path: "/" <> Enum.join(cleaned_segments, "/")
    })
  end

  # Strips a file extension and, for the first two segments, page markers.
  defp clean_segment(segment, index) do
    segment
    |> strip_file_extension()
    |> strip_page_number(index)
  end

  # Splits off anything that looks like a file type ("name.ext" with a purely
  # alphabetic extension). Segments with zero or several dots are left alone.
  defp strip_file_extension(segment) do
    case String.split(segment, ".") do
      [base, file_ext] ->
        if Regex.match?(@is_alpha, file_ext), do: base, else: segment

      _ ->
        segment
    end
  end

  # If our first or second segment has anything looking like a page number,
  # remove it. `Regex.replace/3` returns the input unchanged when nothing matches.
  defp strip_page_number(segment, index) when index < 2 do
    Regex.replace(@page_in_href, segment, "")
  end

  defp strip_page_number(segment, _index), do: segment

  # Decides whether a cleaned segment belongs in the article base URL.
  defp good_segment?(segment, index, first_segment_has_letters) do
    cond do
      # If our first or second url_segment is smaller than 3 characters,
      # and the first url_segment had no alphas, remove it.
      index < 2 and String.length(segment) < 3 and not first_segment_has_letters ->
        false

      # If this is the first url_segment and it's just "index", remove it.
      index === 0 and String.downcase(segment) == "index" ->
        false

      # If this is purely a number, and it's the first or second url_segment,
      # it's probably a page number. Remove it.
      index < 2 and Regex.match?(@is_digit, segment) and String.length(segment) < 3 ->
        false

      true ->
        true
    end
  end

  @doc """
  Given a string, return true if it appears to have an ending sentence within
  it, false otherwise.

  A sentence end is a period followed by a space or by the end of the string.

  ## Examples

      iex> Mercury.Utils.Text.has_sentence_end("This is a sentence.")
      true

      iex> Mercury.Utils.Text.has_sentence_end("This isn't ")
      false

  """
  @spec has_sentence_end(String.t()) :: boolean()
  def has_sentence_end(text) do
    Regex.match?(~r/\.( |$)/, text)
  end

  @doc """
  Normalizes multiple spaces to a single space and trims the resulting string.

  Any run of two or more whitespace characters is replaced by one space.

  ## Examples

      iex> Mercury.Utils.Text.normalize_spaces("  hello   world  ")
      "hello world"

  """
  @spec normalize_spaces(String.t()) :: String.t()
  def normalize_spaces(text) do
    text
    |> String.replace(~r/\s{2,}/, " ")
    |> String.trim()
  end

  @doc """
  Removes the anchor component of a URL.

  ## Examples

      iex> Mercury.Utils.Text.remove_anchor("https://example.com/foo/bar#baz")
      "https://example.com/foo/bar"

  """
  @spec remove_anchor(String.t()) :: String.t()
  def remove_anchor(url) do
    parsed = URI.parse(url)
    URI.to_string(%URI{parsed | fragment: nil})
  end
end