use once_cell::sync::Lazy; use regex::Regex; use std::char::ToLowercase; use std::str::Chars; use unicode_normalization::UnicodeNormalization; pub fn slugify(s: &str) -> String { slugify_iter(s.nfc()) } pub fn slugify_iter>(mut iter: I) -> String { let next_char = iter.next(); Slugify { iter, prev_char: None, next_char, state: State::TakeChar, prev: None, substitution: '-', } .collect() } struct Slugify> { iter: I, prev_char: Option, next_char: Option, state: State, prev: Option, substitution: char, } impl> Iterator for Slugify { type Item = char; fn next(&mut self) -> Option { match self.state { State::TakeChar => { let prev_char = self.prev_char; if let Some(c) = self.next_char() { let action = handle_char(c); // if we're about to start lowercasing something, and the previous // (untransformed) char was lowercase (i.e., at a transition from lower -> // upper case), emit a substitution if let Action::Lowercase = action && let Some(prev_char) = prev_char && prev_char.is_lowercase() { self.state = State::Lowercase(c.to_lowercase()); Some(self.substitution) } else { self.handle_action(c, action) } } else { self.state = State::Done; None } } State::Lowercase(ref mut lowercase) => { if let Some(c) = lowercase.next() { Some(c) } else { self.state = State::TakeChar; self.next() } } State::Multi(ref mut chars, wrap_word) => { if let Some(c) = chars.next() { Some(c) } else { self.state = State::TakeChar; if wrap_word { self.prev = Some(Action::Substitute); Some(self.substitution) } else { self.next() } } } State::Done => None, } } } impl> Slugify { fn next_char(&mut self) -> Option { let next = self.next_char; if next.is_some() { self.prev_char = next; self.next_char = self.iter.next(); } next } fn handle_action(&mut self, c: char, action: Action) -> Option { let prev = self.prev; self.prev = Some(action); match action { Action::Emit(c) => Some(c), Action::EmitMulti(s, wrap_word) => { self.state = State::Multi(s.chars(), wrap_word); if wrap_word && prev != Some(Action::Substitute) { Some(self.substitution) } else { self.next() } } Action::Substitute => { if prev == Some(Action::Substitute) || self.next_char.is_none() { self.next() } else { Some(self.substitution) } } Action::Lowercase => { self.state = State::Lowercase(c.to_lowercase()); self.next() } } } } fn handle_char(c: char) -> Action { match c { 'a'..='z' | '0'..='9' => Action::Emit(c), 'A'..='Z' => Action::Lowercase, _ if is_emoji(c) => Action::Emit(c), '\u{00c0}'..='\u{00d6}' | '\u{00d8}'..='\u{00f6}' | '\u{00f8}'..='\u{00ff}' | '\u{0100}'..='\u{017f}' => Action::EmitMulti(latin_to_ascii_lowercase(c), false), '&' => Action::EmitMulti("and", true), _ => Action::Substitute, } } static EMOJI_REGEX: Lazy = Lazy::new(|| Regex::new("\\p{Emoji}").unwrap()); fn is_emoji(c: char) -> bool { let mut buf = [0u8; 4]; let s = c.encode_utf8(&mut buf); EMOJI_REGEX.is_match(s) } // based on lodash's deburr fn latin_to_ascii_lowercase(c: char) -> &'static str { match c { // Latin-1 Supplement Block '\u{00c0}'..='\u{00c5}' => "a", '\u{00e0}'..='\u{00e5}' => "a", '\u{00c7}' => "c", '\u{00e7}' => "c", '\u{00d0}' => "d", '\u{00f0}' => "d", '\u{00c8}'..='\u{00cb}' => "e", '\u{00e8}'..='\u{00eb}' => "e", '\u{00cc}'..='\u{00cf}' => "i", '\u{00ec}'..='\u{00ef}' => "i", '\u{00d1}' => "n", '\u{00f1}' => "n", '\u{00d2}'..='\u{00d8}' => "o", '\u{00f2}'..='\u{00f8}' => "o", '\u{00d9}'..='\u{00dc}' => "u", '\u{00f9}'..='\u{00fc}' => "u", '\u{00dd}' => "y", '\u{00fd}' | '\u{00ff}' => "y", '\u{00c6}' => "ae", '\u{00e6}' => "ae", '\u{00de}' => "th", '\u{00fe}' => "th", '\u{00df}' => "ss", // Latin Extended-A block '\u{0100}' | '\u{0102}' | '\u{0104}' => "a", '\u{0101}' | '\u{0103}' | '\u{0105}' => "a", '\u{0106}' | '\u{0108}' | '\u{010a}' | '\u{010c}' => "c", '\u{0107}' | '\u{0109}' | '\u{010b}' | '\u{010d}' => "c", '\u{010e}' | '\u{0110}' => "d", '\u{010f}' | '\u{0111}' => "d", '\u{0112}' | '\u{0114}' | '\u{0116}' | '\u{0118}' | '\u{011a}' => "e", '\u{0113}' | '\u{0115}' | '\u{0117}' | '\u{0119}' | '\u{011b}' => "e", '\u{011c}' | '\u{011e}' | '\u{0120}' | '\u{0122}' => "g", '\u{011d}' | '\u{011f}' | '\u{0121}' | '\u{0123}' => "g", '\u{0124}' | '\u{0126}' => "h", '\u{0125}' | '\u{0127}' => "h", '\u{0128}' | '\u{012a}' | '\u{012c}' | '\u{012e}' | '\u{0130}' => "i", '\u{0129}' | '\u{012b}' | '\u{012d}' | '\u{012f}' | '\u{0131}' => "i", '\u{0134}' => "j", '\u{0135}' => "j", '\u{0136}' => "k", '\u{0137}' | '\u{0138}' => "k", '\u{0139}' | '\u{013b}' | '\u{013d}' | '\u{013f}' | '\u{0141}' => "l", '\u{013a}' | '\u{013c}' | '\u{013e}' | '\u{0140}' | '\u{0142}' => "l", '\u{0143}' | '\u{0145}' | '\u{0147}' | '\u{014a}' => "n", '\u{0144}' | '\u{0146}' | '\u{0148}' | '\u{014b}' => "n", '\u{014c}' | '\u{014e}' | '\u{0150}' => "o", '\u{014d}' | '\u{014f}' | '\u{0151}' => "o", '\u{0154}' | '\u{0156}' | '\u{0158}' => "r", '\u{0155}' | '\u{0157}' | '\u{0159}' => "r", '\u{015a}' | '\u{015c}' | '\u{015e}' | '\u{0160}' => "s", '\u{015b}' | '\u{015d}' | '\u{015f}' | '\u{0161}' => "s", '\u{0162}' | '\u{0164}' | '\u{0166}' => "t", '\u{0163}' | '\u{0165}' | '\u{0167}' => "t", '\u{0168}' | '\u{016a}' | '\u{016c}' | '\u{016e}' | '\u{0170}' | '\u{0172}' => "u", '\u{0169}' | '\u{016b}' | '\u{016d}' | '\u{016f}' | '\u{0171}' | '\u{0173}' => "u", '\u{0174}' => "w", '\u{0175}' => "w", '\u{0176}' | '\u{0178}' => "y", '\u{0177}' => "y", '\u{0179}' | '\u{017b}' | '\u{017d}' => "z", '\u{017a}' | '\u{017c}' | '\u{017e}' => "z", '\u{0132}' => "ij", '\u{0133}' => "ij", '\u{0152}' => "oe", '\u{0153}' => "oe", '\u{0149}' => "n", '\u{017f}' => "s", _ => panic!(), } } enum State { TakeChar, Lowercase(ToLowercase), Multi(Chars<'static>, bool), Done, } #[derive(Clone, Copy, PartialEq)] enum Action { Emit(char), EmitMulti(&'static str, bool), Lowercase, Substitute, } #[cfg(test)] mod tests { use super::slugify; #[test] fn test_simple() { assert_eq!(slugify("foo"), "foo"); assert_eq!(slugify("Bar"), "bar"); assert_eq!(slugify("Foo: bar 2 baz"), "foo-bar-2-baz"); assert_eq!(slugify("🖖"), "🖖"); assert_eq!(slugify("Ì"), "i"); assert_eq!(slugify("café"), "cafe"); assert_eq!(slugify("cafe\u{0301}"), "cafe"); // combining diacritical mark assert_eq!(slugify("1.9.4 Porting Spree"), "1-9-4-porting-spree"); assert_eq!(slugify("Hello, World!"), "hello-world"); assert_eq!(slugify("blah!"), "blah"); assert_eq!(slugify("foo&bar"), "foo-and-bar"); assert_eq!(slugify("foo & bar"), "foo-and-bar"); assert_eq!(slugify("FooBar"), "foo-bar"); assert_eq!(slugify("FOO"), "foo"); } }