123 lines
3.8 KiB
Elixir
123 lines
3.8 KiB
Elixir
defmodule Mercury.Utils.Text do
  @moduledoc """
  Small text and URL helpers: stripping pagination markers from article URLs,
  detecting sentence endings, collapsing whitespace and removing URL fragments.
  """

  # Matches a pagination marker such as "page=2", "p=3", "paging/4" or a bare "=12".
  @page_in_href ~r"(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})"i
  # Matches when the string contains at least one ASCII letter.
  @has_alpha ~r/[a-z]/i
  # Matches when the string consists solely of ASCII letters.
  @is_alpha ~r/^[a-z]+$/i
  # Matches when the string consists solely of ASCII digits.
  @is_digit ~r/^[0-9]+$/i
  # NOTE(review): @encoding and @default_encoding are not read by any function
  # in this module; confirm whether they can be dropped or belong elsewhere.
  @encoding ~r/charset=([\w-]+)\b/
  @default_encoding "utf-8"

  @doc """
  Takes a URL and returns its article base, i.e. the URL with anything that
  looks like pagination data removed. Useful for comparing against other links
  that might carry pagination data.

  Only the scheme, authority and cleaned path are kept; the query string and
  fragment are not part of the result. A URL without a path (or with only `/`)
  yields the root path.

  ## Examples

      iex> Mercury.Utils.Text.article_base_url("http://example.com/foo/bar/2")
      "http://example.com/foo/bar"
      iex> Mercury.Utils.Text.article_base_url("http://example.com")
      "http://example.com/"
  """
  @spec article_base_url(String.t()) :: String.t()
  def article_base_url(url), do: article_base_url(url, URI.parse(url))

  @doc """
  Same as `article_base_url/1`, but works on an already parsed `URI` for `url`.

  Only the `:scheme`, `:authority` and `:path` fields of the struct are read.
  """
  @spec article_base_url(String.t(), URI.t()) :: String.t()
  def article_base_url(url, %URI{scheme: protocol, authority: host, path: path})
      when is_binary(url) do
    # `path` is nil for URLs such as "http://example.com"; treat it as empty so
    # the split below cannot raise.
    reversed_segments =
      (path || "")
      |> String.split("/", trim: true)
      |> Enum.reverse()

    # The head of the reversed list is the last bit of the URL. Knowing whether
    # it contains letters helps decide if a short trailing segment such as
    # "/2/" is a page number. With no segments at all there is nothing to
    # inspect, so default to false instead of matching against nil.
    first_segment_has_letters =
      case reversed_segments do
        [last_segment | _] -> Regex.match?(@has_alpha, last_segment)
        [] -> false
      end

    # Walking the reversed list while prepending restores the original order.
    cleaned_segments =
      reversed_segments
      |> Enum.with_index()
      |> Enum.reduce([], fn {segment, index}, acc ->
        cleaned =
          segment
          |> strip_file_extension()
          |> strip_page_marker(index)

        # Segments emptied by the clean-up above are dropped as well.
        if cleaned != "" and good_segment?(cleaned, index, first_segment_has_letters) do
          [cleaned | acc]
        else
          acc
        end
      end)

    URI.to_string(%URI{
      authority: host,
      scheme: protocol,
      path: "/" <> Enum.join(cleaned_segments, "/")
    })
  end

  # Drops a trailing alphabetic file extension ("index.html" -> "index").
  # Segments with zero or several dots, or a non-alphabetic suffix, are kept as is.
  defp strip_file_extension(segment) do
    case String.split(segment, ".") do
      [name, extension] ->
        if Regex.match?(@is_alpha, extension), do: name, else: segment

      _ ->
        segment
    end
  end

  # Removes anything looking like a page number from the last two URL segments
  # (index 0 and 1 of the reversed list). `Regex.replace/3` returns the segment
  # unchanged when nothing matches, so no separate match check is needed.
  defp strip_page_marker(segment, index) when index < 2 do
    Regex.replace(@page_in_href, segment, "")
  end

  defp strip_page_marker(segment, _index), do: segment

  # Returns false for segments that should be removed from the base URL.
  # `index` counts from the end of the URL (0 is the last segment).
  defp good_segment?(segment, index, first_segment_has_letters) do
    cond do
      # One of the last two segments is shorter than 3 characters and the last
      # segment of the URL had no letters: treat it as pagination.
      index < 2 and String.length(segment) < 3 and not first_segment_has_letters -> false
      # The last segment is just "index".
      index == 0 and String.downcase(segment) == "index" -> false
      # A purely numeric, short segment near the end is probably a page number.
      index < 2 and Regex.match?(@is_digit, segment) and String.length(segment) < 3 -> false
      true -> true
    end
  end

  @doc """
  Given a string, returns true if it appears to have an ending sentence within
  it (a period followed by a space or the end of the string), false otherwise.

  ## Examples

      iex> Mercury.Utils.Text.has_sentence_end("This is a sentence.")
      true
      iex> Mercury.Utils.Text.has_sentence_end("This isn't ")
      false
  """
  @spec has_sentence_end(String.t()) :: boolean()
  def has_sentence_end(text) do
    Regex.match?(~r/\.( |$)/, text)
  end

  @doc """
  Replaces every run of two or more whitespace characters with a single space
  and trims the resulting string. A lone whitespace character inside the text
  is left untouched.

  ## Examples

      iex> Mercury.Utils.Text.normalize_spaces("  hello   world  ")
      "hello world"
  """
  @spec normalize_spaces(String.t()) :: String.t()
  def normalize_spaces(text) do
    text
    |> String.replace(~r/\s{2,}/, " ")
    |> String.trim()
  end

  @doc """
  Removes the anchor (fragment) component of a URL.

  ## Examples

      iex> Mercury.Utils.Text.remove_anchor("https://example.com/foo/bar#baz")
      "https://example.com/foo/bar"
  """
  @spec remove_anchor(String.t()) :: String.t()
  def remove_anchor(url) do
    parsed = URI.parse(url)
    URI.to_string(%{parsed | fragment: nil})
  end
end
|