v6/src/generator/util/slugify.rs

255 lines
8.5 KiB
Rust

use once_cell::sync::Lazy;
use regex::Regex;
use std::char::ToLowercase;
use std::str::Chars;
use unicode_normalization::UnicodeNormalization;
/// Builds a slug from `s`.
///
/// The input is first normalized to Unicode NFC and the resulting char stream
/// is handed to [`slugify_iter`].
pub fn slugify(s: &str) -> String {
    let normalized = s.nfc();
    slugify_iter(normalized)
}
/// Builds a slug from an arbitrary stream of chars.
///
/// No normalization is applied here; the chars are fed as-is into the
/// [`Slugify`] state machine and its output is collected into a `String`.
pub fn slugify_iter<I: Iterator<Item = char>>(mut iter: I) -> String {
    // Prime the one-char look-ahead before the iterator is moved into the machine.
    let lookahead = iter.next();
    let machine = Slugify {
        iter,
        prev_char: None,
        next_char: lookahead,
        state: State::TakeChar,
        prev: None,
        substitution: '-',
    };
    machine.collect::<String>()
}
/// Char-by-char slug generator: wraps a source char iterator and is itself an
/// iterator over the chars of the slug.
struct Slugify<I>
where
    I: Iterator<Item = char>,
{
    /// Source of input chars.
    iter: I,
    /// The most recently consumed input char (untransformed), if any.
    prev_char: Option<char>,
    /// One-char look-ahead: the next input char to be consumed, if any.
    next_char: Option<char>,
    /// What the next call to `next()` should do.
    state: State,
    /// The action recorded for the most recently handled char, if any.
    prev: Option<Action>,
    /// Separator char written in place of substituted chars.
    substitution: char,
}
/// Drives the slug state machine, yielding one output char per call.
impl<I: Iterator<Item = char>> Iterator for Slugify<I> {
    type Item = char;

    /// Returns the next char of the slug, or `None` once the input is exhausted.
    fn next(&mut self) -> Option<char> {
        match self.state {
            State::TakeChar => {
                // Snapshot the previously consumed char: `next_char()` overwrites it.
                let prev_char = self.prev_char;
                let Some(c) = self.next_char() else {
                    self.state = State::Done;
                    return None;
                };
                let action = handle_char(c);
                // A lower -> upper case transition ("fooBar") gets a separator
                // before the lowercased char. Skip it when the previous char was
                // itself substituted: a separator has already been emitted (or
                // deliberately suppressed) for it, and adding another would
                // produce a doubled separator.
                let case_boundary = matches!(action, Action::Lowercase)
                    && prev_char.is_some_and(|p| p.is_lowercase())
                    && self.prev != Some(Action::Substitute);
                if case_boundary {
                    // Record the action so `prev` does not go stale on this path.
                    self.prev = Some(Action::Lowercase);
                    self.state = State::Lowercase(c.to_lowercase());
                    Some(self.substitution)
                } else {
                    self.handle_action(c, action)
                }
            }
            State::Lowercase(ref mut lowercase) => {
                if let Some(c) = lowercase.next() {
                    return Some(c);
                }
                // Lowercase expansion finished; go back to consuming input.
                self.state = State::TakeChar;
                self.next()
            }
            State::Multi(ref mut chars, wrap_word) => {
                if let Some(c) = chars.next() {
                    return Some(c);
                }
                self.state = State::TakeChar;
                // Close a wrapped word with a separator, but only if more input
                // follows; otherwise the slug would end with a dangling separator.
                if wrap_word && self.next_char.is_some() {
                    self.prev = Some(Action::Substitute);
                    Some(self.substitution)
                } else {
                    self.next()
                }
            }
            State::Done => None,
        }
    }
}
impl<I: Iterator<Item = char>> Slugify<I> {
fn next_char(&mut self) -> Option<char> {
let next = self.next_char;
if next.is_some() {
self.prev_char = next;
self.next_char = self.iter.next();
}
next
}
fn handle_action(&mut self, c: char, action: Action) -> Option<char> {
let prev = self.prev;
self.prev = Some(action);
match action {
Action::Emit(c) => Some(c),
Action::EmitMulti(s, wrap_word) => {
self.state = State::Multi(s.chars(), wrap_word);
if wrap_word && prev != Some(Action::Substitute) {
Some(self.substitution)
} else {
self.next()
}
}
Action::Substitute => {
if prev == Some(Action::Substitute) || self.next_char.is_none() {
self.next()
} else {
Some(self.substitution)
}
}
Action::Lowercase => {
self.state = State::Lowercase(c.to_lowercase());
self.next()
}
}
}
}
/// Classifies a single input char into the [`Action`] the slug machine should
/// take for it.
///
/// The checks run in a fixed order: ASCII lowercase letters and digits, ASCII
/// uppercase letters, emoji, accented Latin letters, `&`, and finally
/// everything else.
fn handle_char(c: char) -> Action {
    // Already slug-safe: pass through untouched.
    if c.is_ascii_lowercase() || c.is_ascii_digit() {
        return Action::Emit(c);
    }
    if c.is_ascii_uppercase() {
        return Action::Lowercase;
    }
    if is_emoji(c) {
        return Action::Emit(c);
    }
    // Latin-1 Supplement and Latin Extended-A letters; U+00D7 and U+00F7
    // fall between the ranges and are deliberately not included.
    let accented_latin = matches!(
        c,
        '\u{00c0}'..='\u{00d6}'
            | '\u{00d8}'..='\u{00f6}'
            | '\u{00f8}'..='\u{00ff}'
            | '\u{0100}'..='\u{017f}'
    );
    if accented_latin {
        return Action::EmitMulti(latin_to_ascii_lowercase(c), false);
    }
    if c == '&' {
        // Spelled out as its own word, hence the wrap flag.
        Action::EmitMulti("and", true)
    } else {
        Action::Substitute
    }
}
/// Matches any code point that carries the Unicode `Emoji` property.
static EMOJI_REGEX: Lazy<Regex> =
    Lazy::new(|| Regex::new("\\p{Emoji}").expect("static emoji pattern is valid"));

/// Returns `true` if `c` is a non-ASCII code point with the Unicode `Emoji`
/// property.
fn is_emoji(c: char) -> bool {
    // The Unicode `Emoji` property also covers the ASCII chars `#`, `*` and
    // `0`-`9` (they are keycap-sequence bases). Those are not emoji on their
    // own, and letting `#` or `*` through would put them verbatim into the
    // slug, so ASCII is rejected before the regex is consulted.
    if c.is_ascii() {
        return false;
    }
    let mut buf = [0u8; 4];
    EMOJI_REGEX.is_match(c.encode_utf8(&mut buf))
}
/// Maps an accented Latin letter to its lowercase ASCII transliteration.
///
/// Covers the Latin-1 Supplement letters (U+00C0..=U+00FF) and the Latin
/// Extended-A block (U+0100..=U+017F); upper- and lowercase forms map to the
/// same lowercase string. Based on lodash's `deburr`.
///
/// # Panics
///
/// Panics if `c` lies outside the ranges listed above.
fn latin_to_ascii_lowercase(c: char) -> &'static str {
    match c {
        // Latin-1 Supplement: uppercase and lowercase forms side by side.
        '\u{00c0}'..='\u{00c5}' | '\u{00e0}'..='\u{00e5}' => "a",
        '\u{00c6}' | '\u{00e6}' => "ae",
        '\u{00c7}' | '\u{00e7}' => "c",
        '\u{00c8}'..='\u{00cb}' | '\u{00e8}'..='\u{00eb}' => "e",
        '\u{00cc}'..='\u{00cf}' | '\u{00ec}'..='\u{00ef}' => "i",
        '\u{00d0}' | '\u{00f0}' => "d",
        '\u{00d1}' | '\u{00f1}' => "n",
        // These two ranges also span U+00D7 and U+00F7.
        '\u{00d2}'..='\u{00d8}' | '\u{00f2}'..='\u{00f8}' => "o",
        '\u{00d9}'..='\u{00dc}' | '\u{00f9}'..='\u{00fc}' => "u",
        '\u{00dd}' | '\u{00fd}' | '\u{00ff}' => "y",
        '\u{00de}' | '\u{00fe}' => "th",
        '\u{00df}' => "ss",

        // Latin Extended-A: each base letter occupies one contiguous run.
        '\u{0100}'..='\u{0105}' => "a",
        '\u{0106}'..='\u{010d}' => "c",
        '\u{010e}'..='\u{0111}' => "d",
        '\u{0112}'..='\u{011b}' => "e",
        '\u{011c}'..='\u{0123}' => "g",
        '\u{0124}'..='\u{0127}' => "h",
        '\u{0128}'..='\u{0131}' => "i",
        '\u{0132}'..='\u{0133}' => "ij",
        '\u{0134}'..='\u{0135}' => "j",
        '\u{0136}'..='\u{0138}' => "k",
        '\u{0139}'..='\u{0142}' => "l",
        '\u{0143}'..='\u{014b}' => "n",
        '\u{014c}'..='\u{0151}' => "o",
        '\u{0152}'..='\u{0153}' => "oe",
        '\u{0154}'..='\u{0159}' => "r",
        '\u{015a}'..='\u{0161}' => "s",
        '\u{0162}'..='\u{0167}' => "t",
        '\u{0168}'..='\u{0173}' => "u",
        '\u{0174}'..='\u{0175}' => "w",
        '\u{0176}'..='\u{0178}' => "y",
        '\u{0179}'..='\u{017e}' => "z",
        '\u{017f}' => "s",

        // Anything else is outside the supported ranges.
        _ => panic!(),
    }
}
/// What the slug iterator does on its next call to `next()`.
enum State {
    /// Consume the next input char and act on it.
    TakeChar,

    /// Drain the lowercase expansion of an uppercase input char.
    Lowercase(ToLowercase),

    /// Drain a fixed replacement string; the flag says whether a separator is
    /// emitted once the string is exhausted.
    Multi(Chars<'static>, bool),

    /// Input exhausted; only `None` is returned from here on.
    Done,
}
/// How a single input char is turned into slug output.
#[derive(Clone, Copy, PartialEq)]
enum Action {
    /// Emit the given char unchanged.
    Emit(char),

    /// Emit a fixed replacement string. When the flag is set the replacement
    /// is treated as a word of its own and set off with separators.
    EmitMulti(&'static str, bool),

    /// Emit the lowercase form of the input char.
    Lowercase,

    /// Replace the input char with the separator (the iterator decides
    /// whether one is actually written).
    Substitute,
}
#[cfg(test)]
mod tests {
    use super::slugify;

    /// Table of `(input, expected slug)` pairs covering the basic behaviors.
    #[test]
    fn test_simple() {
        let cases = [
            ("foo", "foo"),
            ("Bar", "bar"),
            ("Foo: bar 2 baz", "foo-bar-2-baz"),
            ("🖖", "🖖"),
            ("Ì", "i"),
            ("café", "cafe"),
            // combining diacritical mark
            ("cafe\u{0301}", "cafe"),
            ("1.9.4 Porting Spree", "1-9-4-porting-spree"),
            ("Hello, World!", "hello-world"),
            ("blah!", "blah"),
            ("foo&bar", "foo-and-bar"),
            ("foo & bar", "foo-and-bar"),
            ("FooBar", "foo-bar"),
            ("FOO", "foo"),
        ];
        for (input, expected) in cases {
            assert_eq!(slugify(input), expected, "slugify({input:?})");
        }
    }
}