Some stuff
This commit is contained in:
parent
81375ede91
commit
8663faff3e
|
@ -0,0 +1,18 @@
|
||||||
|
defmodule Mercury.Cleaners.Author do
  import Mercury.Utils.Text, only: [normalize_spaces: 1]

  # Captures an optional "posted "/"written " prefix, the word "by", an
  # optional colon, and the remainder (group 2) which is the author name.
  @clean_author ~r/^\s*(posted |written )?by\s*:?\s*(.*)/i

  @doc """
  Take an author string (like 'by David Smith ') and clean it to just the name.

  ## Examples

      iex> Mercury.Cleaners.Author.clean_author("by David Smith")
      "David Smith"
  """
  def clean_author(author) do
    # Collapse whitespace first so the by-line regex sees a single-spaced
    # string, then keep only the captured name and trim any leftover edges.
    normalized = normalize_spaces(author)
    stripped = Regex.replace(@clean_author, normalized, "\\2")
    String.trim(stripped)
  end
end
|
|
@ -0,0 +1,122 @@
|
||||||
|
defmodule Mercury.Utils.Text do
  @moduledoc """
  Text and URL helper functions shared by the Mercury cleaners.
  """

  # Matches pagination markers embedded in a URL segment, e.g. "page=2",
  # "paging/3", "pg=12", or a bare "=2"/"/2" (the name group is optional).
  @page_in_href ~r"(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})"i
  @has_alpha ~r/[a-z]/i
  @is_alpha ~r/^[a-z]+$/i
  @is_digit ~r/^[0-9]+$/i
  # NOTE(review): @encoding and @default_encoding are not referenced anywhere
  # in this module — presumably intended for charset detection elsewhere;
  # confirm before removing.
  @encoding ~r/charset=([\w-]+)\b/
  @default_encoding "utf-8"

  @doc """
  Take a URL and return the article base of said URL. That is, no
  pagination data exists in it. Useful for comparing to other links
  that might have pagination data within them.
  """
  def article_base_url(url), do: article_base_url(url, URI.parse(url))

  def article_base_url(url, %URI{scheme: protocol, authority: host, path: path})
      when is_binary(url) do
    # URI.parse/1 yields a nil path for URLs like "http://example.com";
    # treat that the same as the root path instead of crashing.
    reversed_segments =
      (path || "/")
      |> String.split("/", trim: true)
      |> Enum.reverse()

    # The first reversed segment is the last piece of the URL. Knowing whether
    # it contains any letters helps spot bare page-number tails like "/2/".
    # An empty path (root URL) has no letters by definition.
    first_segment_has_letters =
      case reversed_segments do
        [] -> false
        [first | _rest] -> Regex.match?(@has_alpha, first)
      end

    cleaned_segments =
      reversed_segments
      |> Enum.with_index()
      |> Enum.reduce([], fn {segment, index}, kept ->
        segment =
          segment
          |> strip_file_extension()
          |> strip_page_marker(index)

        # Prepending while walking the reversed list restores original order.
        if good_segment?(segment, index, first_segment_has_letters) do
          [segment | kept]
        else
          kept
        end
      end)
      |> Enum.reject(&(&1 == ""))

    URI.to_string(%URI{
      authority: host,
      scheme: protocol,
      path: "/" <> Enum.join(cleaned_segments, "/")
    })
  end

  # Split off anything that looks like a file type: drop a trailing ".ext"
  # only when the extension is purely alphabetic, so "article.html" becomes
  # "article" but "1.2" is left intact.
  defp strip_file_extension(segment) do
    case String.split(segment, ".") do
      [base, ext] ->
        if Regex.match?(@is_alpha, ext), do: base, else: segment

      _ ->
        segment
    end
  end

  # Remove anything looking like a page marker, but only from the last two
  # path segments (index 0 and 1 of the reversed list).
  defp strip_page_marker(segment, index) when index < 2 do
    if Regex.match?(@page_in_href, segment) do
      Regex.replace(@page_in_href, segment, "")
    else
      segment
    end
  end

  defp strip_page_marker(segment, _index), do: segment

  # Decide whether a (possibly cleaned) segment belongs in the base URL.
  defp good_segment?(segment, index, first_segment_has_letters) do
    cond do
      # A short first/second segment where the URL's tail had no letters at
      # all is likely a bare page number — drop it.
      index < 2 and String.length(segment) < 3 and not first_segment_has_letters -> false
      # A trailing "index" segment adds nothing — drop it.
      index === 0 and String.downcase(segment) == "index" -> false
      # A short, purely numeric first/second segment is probably a page
      # number — drop it.
      index < 2 and Regex.match?(@is_digit, segment) and String.length(segment) < 3 -> false
      true -> true
    end
  end

  @doc """
  Given a string, return true if it appears to have an ending sentence within it, false otherwise.

  ## Examples

      iex> Mercury.Utils.Text.has_sentence_end("This is a sentence.")
      true
      iex> Mercury.Utils.Text.has_sentence_end("This isn't ")
      false
  """
  def has_sentence_end(text) do
    # A period followed by a space or end-of-string marks a sentence end.
    Regex.match?(~r/\.( |$)/, text)
  end

  @doc """
  Normalizes multiple spaces to a single space and trims the resulting string.

  ## Examples

      iex> Mercury.Utils.Text.normalize_spaces("  hello      world  ")
      "hello world"
  """
  def normalize_spaces(text) do
    # Collapse runs of two or more whitespace characters (including tabs and
    # newlines) into a single space, then trim the edges.
    collapsed = Regex.replace(~r/\s{2,}/, text, " ")
    String.trim(collapsed)
  end

  @doc """
  Removes the anchor component of a URL.

  ## Examples

      iex> Mercury.Utils.Text.remove_anchor("https://example.com/foo/bar#baz")
      "https://example.com/foo/bar"
  """
  def remove_anchor(url) do
    URI.to_string(%URI{URI.parse(url) | fragment: nil})
  end
end
|
1
mix.exs
1
mix.exs
|
@ -23,6 +23,7 @@ defmodule Mercury.MixProject do
|
||||||
[
|
[
|
||||||
# {:dep_from_hexpm, "~> 0.3.0"},
|
# {:dep_from_hexpm, "~> 0.3.0"},
|
||||||
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
|
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
|
||||||
|
{:timex, "~> 3.0"}
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
# mix.lock — machine-generated dependency lockfile (written by `mix deps.get`).
# Pins exact package versions and checksums; do not edit by hand.
%{
  "certifi": {:hex, :certifi, "2.4.2", "75424ff0f3baaccfd34b1214184b6ef616d89e420b258bb0a5ea7d7bc628f7f0", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
  "combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm"},
  "gettext": {:hex, :gettext, "0.16.1", "e2130b25eebcbe02bb343b119a07ae2c7e28bd4b146c4a154da2ffb2b3507af2", [:mix], [], "hexpm"},
  "hackney": {:hex, :hackney, "1.15.0", "287a5d2304d516f63e56c469511c42b016423bcb167e61b611f6bad47e3ca60e", [:rebar3], [{:certifi, "2.4.2", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
  "idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
  "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
  "mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], [], "hexpm"},
  "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
  "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm"},
  "timex": {:hex, :timex, "3.5.0", "b0a23167da02d0fe4f1a4e104d1f929a00d348502b52432c05de875d0b9cffa5", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"},
  "tzdata": {:hex, :tzdata, "0.5.19", "7962a3997bf06303b7d1772988ede22260f3dae1bf897408ebdac2b4435f4e6a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
  "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm"},
}
|
|
@ -0,0 +1,13 @@
|
||||||
|
defmodule AuthorTests do
  # Unit tests for Mercury.Cleaners.Author.
  # async: true is safe here: clean_author/1 is a pure function and these
  # tests touch no shared state.
  use ExUnit.Case, async: true
  doctest Mercury.Cleaners.Author
  import Mercury.Cleaners.Author

  test "removes the By from an author string" do
    assert clean_author("by Bob Dylan") == "Bob Dylan"
  end

  test "trims trailing whitespace and line breaks" do
    assert clean_author("\twritten by\n\tBob Dylan\n\t") == "Bob Dylan"
  end
end
|
|
@ -0,0 +1,21 @@
|
||||||
|
defmodule Mercury.Utils.TextTest do
  # Unit tests for Mercury.Utils.Text.
  # async: true is safe: all functions under test are pure.
  use ExUnit.Case, async: true
  doctest Mercury.Utils.Text
  # Bug fix: the bare calls below (article_base_url/1, normalize_spaces/1)
  # do not compile without this import.
  import Mercury.Utils.Text

  test "returns the base url of a paginated url" do
    url = "http://example.com/foo/bar/wow-cool/page=10"
    cleaned = "http://example.com/foo/bar/wow-cool"
    assert article_base_url(url) == cleaned
  end

  test "returns the same url if url has no pagination info" do
    url = "http://example.com/foo/bar/wow-cool/"
    cleaned = "http://example.com/foo/bar/wow-cool"
    assert article_base_url(url) == cleaned
  end

  test "normalize spaces" do
    assert normalize_spaces("  hello      world  ") == "hello world"
    assert normalize_spaces("\thello\n\tworld\n") == "hello world"
  end
end
|
Loading…
Reference in New Issue