mercury_ex/lib/mercury/utils/text.ex

123 lines
3.8 KiB
Elixir

defmodule Mercury.Utils.Text do
  @moduledoc """
  Small text and URL helpers: computing the pagination-free "article base" of a
  URL, detecting sentence endings, normalizing whitespace, and stripping URL
  anchors.
  """

  # Matches pagination markers such as "page=2", "pg=3", "paging=12" or a bare "=4".
  @page_in_href ~r"(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})"i
  @has_alpha ~r/[a-z]/i
  @is_alpha ~r/^[a-z]+$/i
  @is_digit ~r/^[0-9]+$/i
  # The two attributes below are not referenced by any function in this module.
  @encoding ~r/charset=([\w-]+)\b/
  @default_encoding "utf-8"

  @doc """
  Takes a URL and returns the article base of said URL, i.e. the URL with
  anything that looks like pagination data removed. Useful for comparing to
  other links that might have pagination data within them.

  The result is rebuilt from the scheme, authority and cleaned path only, so
  any query string or fragment is dropped.

  ## Examples

      iex> Mercury.Utils.Text.article_base_url("http://example.com/foo/bar/2")
      "http://example.com/foo/bar"

      iex> Mercury.Utils.Text.article_base_url("http://example.com/articles/index.html")
      "http://example.com/articles"

      iex> Mercury.Utils.Text.article_base_url("http://example.com")
      "http://example.com/"

  """
  @spec article_base_url(String.t()) :: String.t()
  def article_base_url(url), do: article_base_url(url, URI.parse(url))

  @doc """
  Same as `article_base_url/1`, but takes the already parsed `URI` of `url`.

  Only the `scheme`, `authority` and `path` fields of the parsed URI are used.
  A missing (`nil`) or empty path yields the root path `"/"`.
  """
  @spec article_base_url(String.t(), URI.t()) :: String.t()
  def article_base_url(url, %URI{scheme: protocol, authority: host, path: path})
      when is_binary(url) do
    # `path` is nil for URLs such as "http://example.com"; treat that as empty
    # instead of crashing in String.split/3.
    reversed_segments =
      (path || "")
      |> String.split("/", trim: true)
      |> Enum.reverse()

    # The first reversed segment is actually the last bit of the URL. Knowing
    # whether it contains letters helps decide if a segment like "/2/" is a
    # page number. With no segments at all there is nothing to inspect.
    first_segment_has_letters =
      case reversed_segments do
        [last_url_segment | _] -> Regex.match?(@has_alpha, last_url_segment)
        [] -> false
      end

    cleaned_segments =
      reversed_segments
      |> Enum.with_index()
      |> Enum.reduce([], fn {segment, index}, acc ->
        segment =
          segment
          |> strip_file_extension()
          |> strip_page_marker(index)

        # Prepending while walking the reversed list restores the URL order.
        if good_segment?(segment, index, first_segment_has_letters) do
          [segment | acc]
        else
          acc
        end
      end)
      # Segments emptied by the page-marker removal are dropped here.
      |> Enum.reject(&(&1 == ""))

    URI.to_string(%URI{
      scheme: protocol,
      authority: host,
      path: "/" <> Enum.join(cleaned_segments, "/")
    })
  end

  # Splits off anything that looks like a file type ("story.html" -> "story").
  # Only applies when there is exactly one dot and the extension is purely
  # alphabetic.
  defp strip_file_extension(segment) do
    case String.split(segment, ".") do
      [name, file_ext] ->
        if Regex.match?(@is_alpha, file_ext), do: name, else: segment

      _ ->
        segment
    end
  end

  # Removes anything that looks like a page number, but only from the first or
  # second segment counted from the end of the URL.
  defp strip_page_marker(segment, index) when index < 2 do
    Regex.replace(@page_in_href, segment, "")
  end

  defp strip_page_marker(segment, _index), do: segment

  # Returns false when the segment should be removed from the article base URL.
  defp good_segment?(segment, index, first_segment_has_letters) do
    cond do
      # If our first or second url_segment is smaller than 3 characters,
      # and the first url_segment had no alphas, remove it.
      index < 2 and String.length(segment) < 3 and not first_segment_has_letters ->
        false

      # If this is the first url_segment and it's just "index", remove it.
      index === 0 and String.downcase(segment) == "index" ->
        false

      # If this is purely a number, and it's the first or second url_segment,
      # it's probably a page number. Remove it.
      index < 2 and Regex.match?(@is_digit, segment) and String.length(segment) < 3 ->
        false

      true ->
        true
    end
  end

  @doc """
  Given a string, returns true if it appears to have an ending sentence within
  it (a period followed by a space or by the end of the string), false
  otherwise.

  ## Examples

      iex> Mercury.Utils.Text.has_sentence_end("This is a sentence.")
      true

      iex> Mercury.Utils.Text.has_sentence_end("This isn't ")
      false

  """
  @spec has_sentence_end(String.t()) :: boolean()
  def has_sentence_end(text) do
    Regex.match?(~r/\.( |$)/, text)
  end

  @doc """
  Collapses every run of two or more whitespace characters into a single space
  and trims the resulting string.

  A lone whitespace character inside the text (for example a single tab) is
  left as it is.

  ## Examples

      iex> Mercury.Utils.Text.normalize_spaces("  hello   world  ")
      "hello world"

  """
  @spec normalize_spaces(String.t()) :: String.t()
  def normalize_spaces(text) do
    text
    |> String.replace(~r/\s{2,}/, " ")
    |> String.trim()
  end

  @doc """
  Removes the anchor (fragment) component of a URL.

  ## Examples

      iex> Mercury.Utils.Text.remove_anchor("https://example.com/foo/bar#baz")
      "https://example.com/foo/bar"

  """
  @spec remove_anchor(String.t()) :: String.t()
  def remove_anchor(url) do
    URI.to_string(%URI{URI.parse(url) | fragment: nil})
  end
end