255 lines
8.5 KiB
Rust
255 lines
8.5 KiB
Rust
|
use once_cell::sync::Lazy;
|
||
|
use regex::Regex;
|
||
|
use std::char::ToLowercase;
|
||
|
use std::str::Chars;
|
||
|
use unicode_normalization::UnicodeNormalization;
|
||
|
|
||
|
pub fn slugify(s: &str) -> String {
|
||
|
slugify_iter(s.nfc())
|
||
|
}
|
||
|
|
||
|
pub fn slugify_iter<I: Iterator<Item = char>>(mut iter: I) -> String {
|
||
|
let next_char = iter.next();
|
||
|
Slugify {
|
||
|
iter,
|
||
|
prev_char: None,
|
||
|
next_char,
|
||
|
state: State::TakeChar,
|
||
|
prev: None,
|
||
|
substitution: '-',
|
||
|
}
|
||
|
.collect()
|
||
|
}
|
||
|
|
||
|
struct Slugify<I: Iterator<Item = char>> {
|
||
|
iter: I,
|
||
|
prev_char: Option<char>,
|
||
|
next_char: Option<char>,
|
||
|
state: State,
|
||
|
prev: Option<Action>,
|
||
|
substitution: char,
|
||
|
}
|
||
|
|
||
|
impl<I: Iterator<Item = char>> Iterator for Slugify<I> {
|
||
|
type Item = char;
|
||
|
|
||
|
fn next(&mut self) -> Option<char> {
|
||
|
match self.state {
|
||
|
State::TakeChar => {
|
||
|
let prev_char = self.prev_char;
|
||
|
if let Some(c) = self.next_char() {
|
||
|
let action = handle_char(c);
|
||
|
// if we're about to start lowercasing something, and the previous
|
||
|
// (untransformed) char was lowercase (i.e., at a transition from lower ->
|
||
|
// upper case), emit a substitution
|
||
|
if let Action::Lowercase = action && let Some(prev_char) = prev_char && prev_char.is_lowercase() {
|
||
|
self.state = State::Lowercase(c.to_lowercase());
|
||
|
Some(self.substitution)
|
||
|
} else {
|
||
|
self.handle_action(c, action)
|
||
|
}
|
||
|
} else {
|
||
|
self.state = State::Done;
|
||
|
None
|
||
|
}
|
||
|
}
|
||
|
State::Lowercase(ref mut lowercase) => {
|
||
|
if let Some(c) = lowercase.next() {
|
||
|
Some(c)
|
||
|
} else {
|
||
|
self.state = State::TakeChar;
|
||
|
self.next()
|
||
|
}
|
||
|
}
|
||
|
State::Multi(ref mut chars, wrap_word) => {
|
||
|
if let Some(c) = chars.next() {
|
||
|
Some(c)
|
||
|
} else {
|
||
|
self.state = State::TakeChar;
|
||
|
if wrap_word {
|
||
|
self.prev = Some(Action::Substitute);
|
||
|
Some(self.substitution)
|
||
|
} else {
|
||
|
self.next()
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
State::Done => None,
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
impl<I: Iterator<Item = char>> Slugify<I> {
|
||
|
fn next_char(&mut self) -> Option<char> {
|
||
|
let next = self.next_char;
|
||
|
if next.is_some() {
|
||
|
self.prev_char = next;
|
||
|
self.next_char = self.iter.next();
|
||
|
}
|
||
|
next
|
||
|
}
|
||
|
|
||
|
fn handle_action(&mut self, c: char, action: Action) -> Option<char> {
|
||
|
let prev = self.prev;
|
||
|
self.prev = Some(action);
|
||
|
match action {
|
||
|
Action::Emit(c) => Some(c),
|
||
|
Action::EmitMulti(s, wrap_word) => {
|
||
|
self.state = State::Multi(s.chars(), wrap_word);
|
||
|
if wrap_word && prev != Some(Action::Substitute) {
|
||
|
Some(self.substitution)
|
||
|
} else {
|
||
|
self.next()
|
||
|
}
|
||
|
}
|
||
|
Action::Substitute => {
|
||
|
if prev == Some(Action::Substitute) || self.next_char.is_none() {
|
||
|
self.next()
|
||
|
} else {
|
||
|
Some(self.substitution)
|
||
|
}
|
||
|
}
|
||
|
Action::Lowercase => {
|
||
|
self.state = State::Lowercase(c.to_lowercase());
|
||
|
self.next()
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fn handle_char(c: char) -> Action {
|
||
|
match c {
|
||
|
'a'..='z' | '0'..='9' => Action::Emit(c),
|
||
|
'A'..='Z' => Action::Lowercase,
|
||
|
_ if is_emoji(c) => Action::Emit(c),
|
||
|
'\u{00c0}'..='\u{00d6}'
|
||
|
| '\u{00d8}'..='\u{00f6}'
|
||
|
| '\u{00f8}'..='\u{00ff}'
|
||
|
| '\u{0100}'..='\u{017f}' => Action::EmitMulti(latin_to_ascii_lowercase(c), false),
|
||
|
'&' => Action::EmitMulti("and", true),
|
||
|
_ => Action::Substitute,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Pattern matching a code point with the Unicode `Emoji` property, compiled
/// on first use.
///
/// The pattern is a fixed literal, so the `unwrap` can only fail on a
/// programming error.
///
/// NOTE(review): the `Emoji` property is broader than pictographs — per the
/// Unicode emoji data it also appears to cover ASCII `#`, `*` and the digits;
/// confirm that callers account for this.
static EMOJI_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new("\\p{Emoji}").unwrap());
|
||
|
|
||
|
fn is_emoji(c: char) -> bool {
|
||
|
let mut buf = [0u8; 4];
|
||
|
let s = c.encode_utf8(&mut buf);
|
||
|
EMOJI_REGEX.is_match(s)
|
||
|
}
|
||
|
|
||
|
// Based on lodash's deburr.
/// Returns the lowercase ASCII spelling of an accented Latin letter.
///
/// Upper- and lowercase forms map to the same lowercase output. Ligatures and
/// special letters expand to two characters (`æ` -> "ae", `þ` -> "th",
/// `ß` -> "ss", `ĳ` -> "ij", `œ` -> "oe").
///
/// The "o" ranges of the Latin-1 block also span U+00D7 and U+00F7; the
/// classifier in `handle_char` excludes those two code points before calling
/// this function.
///
/// # Panics
///
/// Panics if `c` lies outside U+00C0..=U+00FF and U+0100..=U+017F.
fn latin_to_ascii_lowercase(c: char) -> &'static str {
    match c {
        // Latin-1 Supplement block: each arm lists the uppercase range first,
        // then the matching lowercase range.
        '\u{00c0}'..='\u{00c5}' | '\u{00e0}'..='\u{00e5}' => "a",
        '\u{00c6}' | '\u{00e6}' => "ae",
        '\u{00c7}' | '\u{00e7}' => "c",
        '\u{00c8}'..='\u{00cb}' | '\u{00e8}'..='\u{00eb}' => "e",
        '\u{00cc}'..='\u{00cf}' | '\u{00ec}'..='\u{00ef}' => "i",
        '\u{00d0}' | '\u{00f0}' => "d",
        '\u{00d1}' | '\u{00f1}' => "n",
        '\u{00d2}'..='\u{00d8}' | '\u{00f2}'..='\u{00f8}' => "o",
        '\u{00d9}'..='\u{00dc}' | '\u{00f9}'..='\u{00fc}' => "u",
        '\u{00dd}' | '\u{00fd}' | '\u{00ff}' => "y",
        '\u{00de}' | '\u{00fe}' => "th",
        '\u{00df}' => "ss",
        // Latin Extended-A block: upper- and lowercase code points alternate,
        // so every letter occupies one contiguous range.
        '\u{0100}'..='\u{0105}' => "a",
        '\u{0106}'..='\u{010d}' => "c",
        '\u{010e}'..='\u{0111}' => "d",
        '\u{0112}'..='\u{011b}' => "e",
        '\u{011c}'..='\u{0123}' => "g",
        '\u{0124}'..='\u{0127}' => "h",
        '\u{0128}'..='\u{0131}' => "i",
        '\u{0132}' | '\u{0133}' => "ij",
        '\u{0134}' | '\u{0135}' => "j",
        '\u{0136}'..='\u{0138}' => "k",
        '\u{0139}'..='\u{0142}' => "l",
        // Includes U+0149 between the accented and the eng forms.
        '\u{0143}'..='\u{014b}' => "n",
        '\u{014c}'..='\u{0151}' => "o",
        '\u{0152}' | '\u{0153}' => "oe",
        '\u{0154}'..='\u{0159}' => "r",
        '\u{015a}'..='\u{0161}' | '\u{017f}' => "s",
        '\u{0162}'..='\u{0167}' => "t",
        '\u{0168}'..='\u{0173}' => "u",
        '\u{0174}' | '\u{0175}' => "w",
        '\u{0176}'..='\u{0178}' => "y",
        '\u{0179}'..='\u{017e}' => "z",
        _ => panic!(),
    }
}
|
||
|
|
||
|
/// What the slug state machine is currently doing.
///
/// `Debug` is derived so the machine can be inspected with `dbg!`/`{:?}`
/// while diagnosing slug output.
#[derive(Debug)]
enum State {
    /// Ready to consume and classify the next input character.
    TakeChar,
    /// Emitting the lowercase expansion of an input character.
    Lowercase(ToLowercase),
    /// Emitting a multi-character replacement; the flag records whether the
    /// replacement is a word that must be followed by a separator.
    Multi(Chars<'static>, bool),
    /// The input is exhausted.
    Done,
}
|
||
|
|
||
|
/// How a single input character is turned into slug output.
///
/// `Debug` and `Eq` are derived in addition to the original traits so values
/// can be used with `assert_eq!` and printed while debugging.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Action {
    /// Emit the given character unchanged.
    Emit(char),
    /// Emit the given string; when the flag is `true` the string is a word
    /// that is set off from its neighbours by separators.
    EmitMulti(&'static str, bool),
    /// Emit the lowercase form of the input character.
    Lowercase,
    /// Replace the input character with the separator.
    Substitute,
}
|
||
|
|
||
|
#[cfg(test)]
mod tests {
    use super::slugify;

    /// Table-driven check of `slugify`: each entry is `(input, expected slug)`.
    #[test]
    fn test_simple() {
        let cases = [
            ("foo", "foo"),
            ("Bar", "bar"),
            ("Foo: bar 2 baz", "foo-bar-2-baz"),
            ("🖖", "🖖"),
            ("Ì", "i"),
            ("café", "cafe"),
            // combining diacritical mark
            ("cafe\u{0301}", "cafe"),
            ("1.9.4 Porting Spree", "1-9-4-porting-spree"),
            ("Hello, World!", "hello-world"),
            ("blah!", "blah"),
            ("foo&bar", "foo-and-bar"),
            ("foo & bar", "foo-and-bar"),
            ("FooBar", "foo-bar"),
            ("FOO", "foo"),
        ];

        for (input, expected) in cases {
            assert_eq!(slugify(input), expected, "slugify({input:?})");
        }
    }
}
|