Some stuff

This commit is contained in:
Shadowfacts 2019-02-11 22:16:32 -05:00
parent 81375ede91
commit 8663faff3e
Signed by: shadowfacts
GPG Key ID: 94A5AB95422746E5
6 changed files with 189 additions and 0 deletions

View File

@ -0,0 +1,18 @@
defmodule Mercury.Cleaners.Author do
  @moduledoc """
  Cleans up author bylines extracted from articles.
  """

  import Mercury.Utils.Text, only: [normalize_spaces: 1]

  # Matches a leading "by", "posted by" or "written by", optionally followed by
  # a colon, and captures the remainder of the string in group 2.
  # The `\b` after "by" keeps names that merely start with those letters
  # (e.g. "Byron Smith") from being truncated.
  @clean_author ~r/^\s*(posted |written )?by\b\s*:?\s*(.*)/i

  @doc """
  Take an author string (like 'by David Smith ') and clean it to just the name.

  Whitespace is normalized first, then a leading "by" / "posted by" /
  "written by" prefix (case-insensitive) is dropped. Strings without such a
  prefix are returned with only the whitespace cleanup applied.

  ## Examples

      iex> Mercury.Cleaners.Author.clean_author("by David Smith")
      "David Smith"

      iex> Mercury.Cleaners.Author.clean_author("Byron Smith")
      "Byron Smith"

  """
  @spec clean_author(String.t()) :: String.t()
  def clean_author(author) do
    normalized = normalize_spaces(author)

    @clean_author
    |> Regex.replace(normalized, "\\2")
    |> String.trim()
  end
end

122
lib/mercury/utils/text.ex Normal file
View File

@ -0,0 +1,122 @@
defmodule Mercury.Utils.Text do
  @moduledoc """
  Text and URL helpers: pagination-free article URLs, whitespace
  normalization, sentence detection and anchor removal.
  """

  # Matches a pagination marker inside a URL segment, e.g. "page=10".
  @page_in_href ~r"(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})"i
  @has_alpha ~r/[a-z]/i
  @is_alpha ~r/^[a-z]+$/i
  @is_digit ~r/^[0-9]+$/i
  # The two attributes below are not referenced by any function in this module.
  @encoding ~r/charset=([\w-]+)\b/
  @default_encoding "utf-8"

  @doc """
  Take a URL string and return its article base URL.

  Parses `url` and delegates to `article_base_url/2`.

  ## Examples

      iex> Mercury.Utils.Text.article_base_url("http://example.com/foo/bar/wow-cool/page=10")
      "http://example.com/foo/bar/wow-cool"

      iex> Mercury.Utils.Text.article_base_url("http://example.com/")
      "http://example.com/"

  """
  @spec article_base_url(String.t()) :: String.t()
  def article_base_url(url), do: article_base_url(url, URI.parse(url))

  @doc """
  Take a URL and return the article base of said URL. That is, no
  pagination data exists in it. Useful for comparing to other links
  that might have pagination data within them.

  The second argument is the already-parsed form of `url`; only its scheme,
  authority and path are used. Query string and fragment are dropped.
  A URL with a missing or empty path yields the site root ("/").
  """
  @spec article_base_url(String.t(), URI.t()) :: String.t()
  def article_base_url(url, %URI{scheme: protocol, authority: host, path: path})
      when is_binary(url) do
    # `path` is nil for URLs such as "http://example.com"; treat it as empty
    # instead of crashing in String.split/3.
    reversed_segments =
      (path || "")
      |> String.split("/", trim: true)
      |> Enum.reverse()

    # The first reversed segment is actually the last bit of the URL. Knowing
    # whether it contains letters helps spot URLs ending in something like "/2/".
    # With no segments at all there is nothing to inspect.
    first_segment_has_letters =
      case reversed_segments do
        [first | _] -> Regex.match?(@has_alpha, first)
        [] -> false
      end

    cleaned_segments =
      reversed_segments
      |> Enum.with_index()
      |> Enum.reduce([], fn {segment, index}, acc ->
        segment =
          segment
          |> strip_file_extension()
          |> strip_page_number(index)

        # Prepending while walking the reversed list restores the original order.
        if good_segment?(segment, index, first_segment_has_letters) do
          [segment | acc]
        else
          acc
        end
      end)
      |> Enum.reject(&(&1 == ""))

    URI.to_string(%URI{
      scheme: protocol,
      authority: host,
      path: "/" <> Enum.join(cleaned_segments, "/")
    })
  end

  # Drops anything that looks like a file type ("article.html" -> "article").
  # Only segments with exactly one dot and a purely alphabetic suffix qualify.
  defp strip_file_extension(segment) do
    case String.split(segment, ".") do
      [name, extension] ->
        if Regex.match?(@is_alpha, extension), do: name, else: segment

      _ ->
        segment
    end
  end

  # Removes anything looking like a page number, but only from the last two
  # segments of the URL (index 0 and 1 of the reversed list).
  defp strip_page_number(segment, index) when index < 2 do
    Regex.replace(@page_in_href, segment, "")
  end

  defp strip_page_number(segment, _index), do: segment

  # Decides whether a (cleaned) segment should be kept in the base URL.
  defp good_segment?(segment, index, first_segment_has_letters) do
    cond do
      # If our first or second url_segment is smaller than 3 characters,
      # and the first url_segment had no alphas, remove it.
      index < 2 and String.length(segment) < 3 and !first_segment_has_letters -> false
      # If this is the first url_segment and it's just index, remove it.
      index === 0 and String.downcase(segment) == "index" -> false
      # If this is purely a number, and it's the first or second url_segment,
      # it's probably a page number. Remove it.
      index < 2 and Regex.match?(@is_digit, segment) and String.length(segment) < 3 -> false
      true -> true
    end
  end

  @doc """
  Given a string, return true if it appears to have an ending sentence within it, false otherwise.

  A sentence end is a period followed by a space or by the end of the string.

  ## Examples

      iex> Mercury.Utils.Text.has_sentence_end("This is a sentence.")
      true

      iex> Mercury.Utils.Text.has_sentence_end("This isn't ")
      false

  """
  @spec has_sentence_end(String.t()) :: boolean()
  def has_sentence_end(text) do
    Regex.match?(~r/\.( |$)/, text)
  end

  @doc """
  Normalizes multiple spaces to a single space and trims the resulting string.

  Any run of two or more whitespace characters (including tabs and newlines)
  is collapsed into one space.

  ## Examples

      iex> Mercury.Utils.Text.normalize_spaces("  hello   world  ")
      "hello world"

  """
  @spec normalize_spaces(String.t()) :: String.t()
  def normalize_spaces(text) do
    ~r/\s{2,}/
    |> Regex.replace(text, " ")
    |> String.trim()
  end

  @doc """
  Removes the anchor component of a URL.

  ## Examples

      iex> Mercury.Utils.Text.remove_anchor("https://example.com/foo/bar#baz")
      "https://example.com/foo/bar"

  """
  @spec remove_anchor(String.t()) :: String.t()
  def remove_anchor(url) do
    parsed = URI.parse(url)
    URI.to_string(%{parsed | fragment: nil})
  end
end

View File

@ -23,6 +23,7 @@ defmodule Mercury.MixProject do
[ [
# {:dep_from_hexpm, "~> 0.3.0"}, # {:dep_from_hexpm, "~> 0.3.0"},
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"}, # {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
{:timex, "~> 3.0"}
] ]
end end
end end

14
mix.lock Normal file
View File

@ -0,0 +1,14 @@
%{
"certifi": {:hex, :certifi, "2.4.2", "75424ff0f3baaccfd34b1214184b6ef616d89e420b258bb0a5ea7d7bc628f7f0", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
"combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm"},
"gettext": {:hex, :gettext, "0.16.1", "e2130b25eebcbe02bb343b119a07ae2c7e28bd4b146c4a154da2ffb2b3507af2", [:mix], [], "hexpm"},
"hackney": {:hex, :hackney, "1.15.0", "287a5d2304d516f63e56c469511c42b016423bcb167e61b611f6bad47e3ca60e", [:rebar3], [{:certifi, "2.4.2", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
"idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], [], "hexpm"},
"parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm"},
"timex": {:hex, :timex, "3.5.0", "b0a23167da02d0fe4f1a4e104d1f929a00d348502b52432c05de875d0b9cffa5", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"},
"tzdata": {:hex, :tzdata, "0.5.19", "7962a3997bf06303b7d1772988ede22260f3dae1bf897408ebdac2b4435f4e6a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm"},
}

View File

@ -0,0 +1,13 @@
defmodule AuthorTests do
  use ExUnit.Case
  doctest Mercury.Cleaners.Author

  alias Mercury.Cleaners.Author

  # A plain "by" prefix is dropped, leaving only the name.
  test "removes the By from an author string" do
    cleaned = Author.clean_author("by Bob Dylan")

    assert cleaned == "Bob Dylan"
  end

  # Tabs and line breaks around and inside the byline are cleaned up too.
  test "trims trailing whitespace and line breaks" do
    cleaned = Author.clean_author("\twritten by\n\tBob Dylan\n\t")

    assert cleaned == "Bob Dylan"
  end
end

View File

@ -0,0 +1,21 @@
defmodule Mercury.Utils.TextTest do
  use ExUnit.Case
  doctest Mercury.Utils.Text

  # The tests below call these helpers unqualified, so they must be imported;
  # without this the module fails to compile with undefined-function errors.
  import Mercury.Utils.Text, only: [article_base_url: 1, normalize_spaces: 1]

  test "returns the base url of a paginated url" do
    url = "http://example.com/foo/bar/wow-cool/page=10"
    cleaned = "http://example.com/foo/bar/wow-cool"
    assert article_base_url(url) == cleaned
  end

  test "returns the same url if url has no pagination info" do
    url = "http://example.com/foo/bar/wow-cool/"
    cleaned = "http://example.com/foo/bar/wow-cool"
    assert article_base_url(url) == cleaned
  end

  test "normalize spaces" do
    assert normalize_spaces(" hello world ") == "hello world"
    assert normalize_spaces("\thello\n\tworld\n") == "hello world"
  end
end