Browse Source

Some stuff

master
Shadowfacts 3 years ago
parent
commit
8663faff3e
Signed by: shadowfacts GPG Key ID: 94A5AB95422746E5
  1. 18
      lib/mercury/cleaners/author.ex
  2. 122
      lib/mercury/utils/text.ex
  3. 1
      mix.exs
  4. 14
      mix.lock
  5. 13
      test/mercury/cleaners/author_tests.ex
  6. 21
      test/mercury/utils/text_tests.ex

18
lib/mercury/cleaners/author.ex

@ -0,0 +1,18 @@
defmodule Mercury.Cleaners.Author do
  @moduledoc """
  Cleans up raw author strings.
  """

  alias Mercury.Utils.Text

  # Optional leading whitespace, an optional "posted "/"written " prefix, the
  # word "by", an optional colon, and then the remainder as capture group 2.
  @clean_author ~r/^\s*(posted |written )?by\s*:?\s*(.*)/i

  @doc """
  Reduces a byline such as 'by David Smith ' to just the name.

  The input is first passed through `Mercury.Utils.Text.normalize_spaces/1`,
  then a leading "by" prefix (optionally preceded by "posted" or "written")
  is dropped, and the result is trimmed. A string without such a prefix is
  returned normalized and trimmed.

  ## Examples

      iex> Mercury.Cleaners.Author.clean_author("by David Smith")
      "David Smith"

  """
  def clean_author(author) do
    normalized = Text.normalize_spaces(author)

    # "\\2" keeps only what the second capture group matched (the name).
    normalized
    |> String.replace(@clean_author, "\\2")
    |> String.trim()
  end
end

122
lib/mercury/utils/text.ex

@ -0,0 +1,122 @@
defmodule Mercury.Utils.Text do
  @moduledoc """
  String and URL helpers: pagination-free article base URLs, sentence-end
  detection, whitespace normalization and URL fragment removal.
  """

  # An optional "page"-like word, then "=" or "/", then one to three digits.
  @page_in_href ~r"(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})"i
  # At least one ASCII letter anywhere in the string.
  @has_alpha ~r/[a-z]/i
  # ASCII letters only.
  @is_alpha ~r/^[a-z]+$/i
  # ASCII digits only.
  @is_digit ~r/^[0-9]+$/i
  # NOTE(review): @encoding and @default_encoding are not referenced anywhere
  # in this module.
  @encoding ~r/charset=([\w-]+)\b/
  @default_encoding "utf-8"

  @doc """
  Take a URL and return the article base of said URL. That is, no
  pagination data exists in it. Useful for comparing to other links
  that might have pagination data within them.

  Parses `url` with `URI.parse/1` and delegates to `article_base_url/2`.
  The query string and fragment are not part of the result. A URL without
  any path segments yields the root path.

  ## Examples

      iex> Mercury.Utils.Text.article_base_url("http://example.com/foo/bar/wow-cool/page=10")
      "http://example.com/foo/bar/wow-cool"

      iex> Mercury.Utils.Text.article_base_url("http://example.com")
      "http://example.com/"

  """
  def article_base_url(url), do: article_base_url(url, URI.parse(url))

  @doc """
  Same as `article_base_url/1`, for a URL that has already been parsed.

  `url` must be a binary. The result is rebuilt from the `scheme`,
  `authority` and `path` fields of the given `URI` struct; a `nil` path is
  treated as an empty one.
  """
  def article_base_url(url, %URI{scheme: protocol, authority: host, path: path})
      when is_binary(url) do
    # `path` is nil for URLs such as "http://example.com"; treat it as empty
    # instead of letting String.split/3 raise.
    reversed_segments =
      (path || "")
      |> String.split("/", trim: true)
      |> Enum.reverse()

    # The first reversed segment is the last bit of the URL. Knowing whether
    # it contains letters helps to recognise URLs ending in something like
    # "/2/". With no segments at all there is nothing to inspect (and nothing
    # to filter below), so the flag is simply false.
    first_segment_has_letters =
      case reversed_segments do
        [last_segment | _] -> Regex.match?(@has_alpha, last_segment)
        [] -> false
      end

    # Walking the reversed list while prepending restores the original order.
    cleaned_segments =
      reversed_segments
      |> Enum.with_index()
      |> Enum.reduce([], fn {segment, index}, acc ->
        segment =
          segment
          |> strip_file_extension()
          |> strip_page_marker(index)

        if good_segment?(segment, index, first_segment_has_letters) do
          [segment | acc]
        else
          acc
        end
      end)
      # Segments emptied by the stripping steps must not leave "//" behind.
      |> Enum.reject(&(&1 == ""))

    URI.to_string(%URI{
      authority: host,
      scheme: protocol,
      path: "/" <> Enum.join(cleaned_segments, "/")
    })
  end

  # Drops anything that looks like a file type: "name.ext" becomes "name" when
  # the part after the single dot is purely alphabetic.
  defp strip_file_extension(segment) do
    case String.split(segment, ".") do
      [name, file_ext] -> if Regex.match?(@is_alpha, file_ext), do: name, else: segment
      _ -> segment
    end
  end

  # Removes anything looking like a page number, but only from the last two
  # segments of the URL (reversed index 0 and 1).
  defp strip_page_marker(segment, index) when index < 2 do
    Regex.replace(@page_in_href, segment, "")
  end

  defp strip_page_marker(segment, _index), do: segment

  # Decides whether a (reversed-index) segment is kept in the base URL.
  defp good_segment?(segment, index, first_segment_has_letters) do
    cond do
      # If our first or second url_segment is smaller than 3 characters,
      # and the first url_segment had no alphas, remove it.
      index < 2 and String.length(segment) < 3 and !first_segment_has_letters -> false
      # If this is the first url_segment and it's just index, remove it.
      index === 0 and String.downcase(segment) == "index" -> false
      # If this is purely a number, and it's the first or second url_segment,
      # it's probably a page number. Remove it.
      index < 2 and Regex.match?(@is_digit, segment) and String.length(segment) < 3 -> false
      true -> true
    end
  end

  @doc """
  Given a string, return true if it appears to have an ending sentence within it, false otherwise.

  A sentence end is a period followed by a space or by the end of the text.

  ## Examples

      iex> Mercury.Utils.Text.has_sentence_end("This is a sentence.")
      true

      iex> Mercury.Utils.Text.has_sentence_end("This isn't ")
      false

  """
  def has_sentence_end(text) do
    Regex.match?(~r/\.( |$)/, text)
  end

  @doc """
  Normalizes multiple spaces to a single space and trims the resulting string.

  Every run of two or more whitespace characters is replaced by one space;
  a single whitespace character inside the text is left as it is.

  ## Examples

      iex> Mercury.Utils.Text.normalize_spaces("  hello   world  ")
      "hello world"

  """
  def normalize_spaces(text) do
    text
    |> String.replace(~r/\s{2,}/, " ")
    |> String.trim()
  end

  @doc """
  Removes the anchor component of a URL.

  ## Examples

      iex> Mercury.Utils.Text.remove_anchor("https://example.com/foo/bar#baz")
      "https://example.com/foo/bar"

  """
  def remove_anchor(url) do
    parsed = URI.parse(url)
    URI.to_string(%URI{parsed | fragment: nil})
  end
end

1
mix.exs

@ -23,6 +23,7 @@ defmodule Mercury.MixProject do
[
# {:dep_from_hexpm, "~> 0.3.0"},
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
{:timex, "~> 3.0"}
]
end
end

14
mix.lock

@ -0,0 +1,14 @@
%{
"certifi": {:hex, :certifi, "2.4.2", "75424ff0f3baaccfd34b1214184b6ef616d89e420b258bb0a5ea7d7bc628f7f0", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
"combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm"},
"gettext": {:hex, :gettext, "0.16.1", "e2130b25eebcbe02bb343b119a07ae2c7e28bd4b146c4a154da2ffb2b3507af2", [:mix], [], "hexpm"},
"hackney": {:hex, :hackney, "1.15.0", "287a5d2304d516f63e56c469511c42b016423bcb167e61b611f6bad47e3ca60e", [:rebar3], [{:certifi, "2.4.2", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.4", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
"idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], [], "hexpm"},
"parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm"},
"timex": {:hex, :timex, "3.5.0", "b0a23167da02d0fe4f1a4e104d1f929a00d348502b52432c05de875d0b9cffa5", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.10", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 0.1.8 or ~> 0.5", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm"},
"tzdata": {:hex, :tzdata, "0.5.19", "7962a3997bf06303b7d1772988ede22260f3dae1bf897408ebdac2b4435f4e6a", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm"},
}

13
test/mercury/cleaners/author_tests.ex

@ -0,0 +1,13 @@
defmodule AuthorTests do
  use ExUnit.Case

  alias Mercury.Cleaners.Author

  # Also run the examples embedded in the module's documentation.
  doctest Mercury.Cleaners.Author

  test "removes the By from an author string" do
    cleaned = Author.clean_author("by Bob Dylan")

    assert cleaned == "Bob Dylan"
  end

  test "trims trailing whitespace and line breaks" do
    # Tabs and newlines surround both the "written by" prefix and the name.
    raw_byline = "\twritten by\n\tBob Dylan\n\t"

    assert Author.clean_author(raw_byline) == "Bob Dylan"
  end
end

21
test/mercury/utils/text_tests.ex

@ -0,0 +1,21 @@
defmodule Mercury.Utils.TextTest do
  use ExUnit.Case

  # The tests below call these functions unqualified, so they have to be
  # imported; without this line the module does not compile.
  import Mercury.Utils.Text, only: [article_base_url: 1, normalize_spaces: 1]

  # Also run the examples embedded in the module's documentation.
  doctest Mercury.Utils.Text

  test "returns the base url of a paginated url" do
    url = "http://example.com/foo/bar/wow-cool/page=10"
    cleaned = "http://example.com/foo/bar/wow-cool"
    assert article_base_url(url) == cleaned
  end

  test "returns the same url if url has no pagination info" do
    # The trailing slash is not part of the base URL.
    url = "http://example.com/foo/bar/wow-cool/"
    cleaned = "http://example.com/foo/bar/wow-cool"
    assert article_base_url(url) == cleaned
  end

  test "normalize spaces" do
    assert normalize_spaces(" hello world ") == "hello world"
    assert normalize_spaces("\thello\n\tworld\n") == "hello world"
  end
end
Loading…
Cancel
Save