// v6/src/generator/util/slugify.rs

use once_cell::sync::Lazy;
use regex::Regex;
use std::char::ToLowercase;
use std::str::Chars;
use unicode_normalization::UnicodeNormalization;

/// Converts `s` into a slug.
///
/// The input is first normalized to Unicode NFC, so that a base letter followed by a combining
/// mark (e.g. `e` + U+0301) becomes a single precomposed character, and the normalized
/// characters are then handed to [`slugify_iter`].
pub fn slugify(s: &str) -> String {
    let composed = s.nfc();
    slugify_iter(composed)
}

/// Builds a slug from a stream of characters.
///
/// ASCII letters are lowercased, ASCII digits are kept, characters accepted by `handle_char`
/// are kept or transliterated, and everything else becomes the separator `-`. No Unicode
/// normalization is performed here; [`slugify`] does that before calling this function.
///
/// Trailing separators are removed from the result, so `"blah!"`, `"blah!!"` and `"blah?! "`
/// all produce `"blah"`. Leading separators are left untouched.
pub fn slugify_iter<I: Iterator<Item = char>>(mut iter: I) -> String {
    const SUBSTITUTION: char = '-';

    // Prime the one-character lookahead that `Slugify` keeps in `next_char`.
    let next_char = iter.next();
    let mut slug: String = Slugify {
        iter,
        prev_char: None,
        next_char,
        state: State::TakeChar,
        prev: None,
        substitution: SUBSTITUTION,
    }
    .collect();

    // The iterator only looks one character ahead, so it can suppress a separator produced by
    // the very last input character but not one followed by further non-slug characters (or
    // the closing separator of a trailing `&`). The separator is never emitted as a regular
    // character, so trimming it here cannot remove real content.
    let kept = slug.trim_end_matches(SUBSTITUTION).len();
    slug.truncate(kept);
    slug
}

/// Iterator adapter that turns a stream of input characters into the characters of a slug.
struct Slugify<I: Iterator<Item = char>> {
    /// Remaining input, positioned just after `next_char`.
    iter: I,
    /// Most recently consumed input character, untransformed; `None` before the first one.
    prev_char: Option<char>,
    /// One-character lookahead: the next input character to consume, or `None` once the input
    /// is exhausted.
    next_char: Option<char>,
    /// What the next call to `next()` has to do.
    state: State,
    /// Action recorded for the previously handled character; compared against
    /// `Action::Substitute` to avoid emitting two separators in a row.
    prev: Option<Action>,
    /// Separator emitted in place of characters that are not kept in the slug.
    substitution: char,
}

/// Drives the slug state machine, yielding one output character per call.
impl<I: Iterator<Item = char>> Iterator for Slugify<I> {
    type Item = char;

    /// Returns the next character of the slug, or `None` once the input is exhausted.
    fn next(&mut self) -> Option<char> {
        match self.state {
            State::TakeChar => {
                // Remember the character *before* the one we are about to consume;
                // `next_char()` overwrites `self.prev_char`.
                let prev_char = self.prev_char;
                let Some(c) = self.next_char() else {
                    self.state = State::Done;
                    return None;
                };
                let action = handle_char(c);
                // If we're about to start lowercasing something, and the previous
                // (untransformed) char was lowercase (i.e., at a transition from lower ->
                // upper case), emit a substitution first. Skip that when the previous char
                // was itself replaced by a separator (a lowercase letter that is not kept in
                // the slug), otherwise two separators would be emitted back to back.
                let at_case_boundary = action == Action::Lowercase
                    && prev_char.is_some_and(char::is_lowercase)
                    && self.prev != Some(Action::Substitute);
                if at_case_boundary {
                    // Record the action just like `handle_action` would have done.
                    self.prev = Some(action);
                    self.state = State::Lowercase(c.to_lowercase());
                    Some(self.substitution)
                } else {
                    self.handle_action(c, action)
                }
            }
            State::Lowercase(ref mut lowercase) => {
                // Drain the lowercase expansion, then go back to consuming input.
                if let Some(c) = lowercase.next() {
                    Some(c)
                } else {
                    self.state = State::TakeChar;
                    self.next()
                }
            }
            State::Multi(ref mut chars, wrap_word) => {
                // Drain the replacement text; a wrapped word is followed by a separator.
                if let Some(c) = chars.next() {
                    Some(c)
                } else {
                    self.state = State::TakeChar;
                    if wrap_word {
                        // Mark the separator as emitted so a following substitute is collapsed.
                        self.prev = Some(Action::Substitute);
                        Some(self.substitution)
                    } else {
                        self.next()
                    }
                }
            }
            State::Done => None,
        }
    }
}

impl<I: Iterator<Item = char>> Slugify<I> {
    /// Consumes and returns the lookahead character, refilling the lookahead from the
    /// underlying iterator and remembering the consumed character in `prev_char`.
    ///
    /// Returns `None` (and changes nothing) once the input is exhausted.
    fn next_char(&mut self) -> Option<char> {
        let current = self.next_char?;
        self.prev_char = Some(current);
        self.next_char = self.iter.next();
        Some(current)
    }

    /// Applies `action` for the input character `c` and returns the next output character.
    ///
    /// Records `action` as the previous action, updates `state` where the action needs several
    /// output characters, and recurses into `next()` whenever this character produces no
    /// output of its own.
    fn handle_action(&mut self, c: char, action: Action) -> Option<char> {
        // Whether the previously handled character already ended in a separator.
        let after_separator = self.prev == Some(Action::Substitute);
        self.prev = Some(action);

        match action {
            Action::Emit(out) => Some(out),
            Action::Lowercase => {
                self.state = State::Lowercase(c.to_lowercase());
                self.next()
            }
            // Collapse consecutive separators and drop one caused by the last input character.
            Action::Substitute if after_separator || self.next_char.is_none() => self.next(),
            Action::Substitute => Some(self.substitution),
            Action::EmitMulti(text, wrap_word) => {
                self.state = State::Multi(text.chars(), wrap_word);
                // A wrapped word gets a leading separator unless one was just emitted.
                if wrap_word && !after_separator {
                    Some(self.substitution)
                } else {
                    self.next()
                }
            }
        }
    }
}

/// Decides how a single input character contributes to the slug.
///
/// * ASCII lowercase letters and digits are emitted unchanged.
/// * ASCII uppercase letters are lowercased.
/// * `&` is spelled out as the separate word `and`.
/// * Every other ASCII character is replaced by the separator.
/// * Non-ASCII emoji are emitted unchanged.
/// * Latin-1 Supplement and Latin Extended-A letters are transliterated to ASCII.
/// * Anything else is replaced by the separator.
fn handle_char(c: char) -> Action {
    match c {
        'a'..='z' | '0'..='9' => Action::Emit(c),
        'A'..='Z' => Action::Lowercase,
        '&' => Action::EmitMulti("and", true),
        // `#` and `*` carry the Unicode `Emoji` property (they are keycap bases), so they must
        // be classified before the emoji check or they would be copied into the slug verbatim.
        // Handling all remaining ASCII here also avoids running the regex for it.
        _ if c.is_ascii() => Action::Substitute,
        _ if is_emoji(c) => Action::Emit(c),
        '\u{00c0}'..='\u{00d6}'
        | '\u{00d8}'..='\u{00f6}'
        | '\u{00f8}'..='\u{00ff}'
        | '\u{0100}'..='\u{017f}' => Action::EmitMulti(latin_to_ascii_lowercase(c), false),
        _ => Action::Substitute,
    }
}

/// Matches a character that has the Unicode `Emoji` property; compiled on first use.
static EMOJI_REGEX: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"\p{Emoji}";
    Regex::new(pattern).unwrap()
});

/// Returns whether `c` is matched by `EMOJI_REGEX`.
fn is_emoji(c: char) -> bool {
    // A `char` encodes to at most four UTF-8 bytes, so a stack buffer avoids allocating.
    let mut utf8 = [0u8; 4];
    let encoded: &str = c.encode_utf8(&mut utf8);
    EMOJI_REGEX.is_match(encoded)
}

/// Transliterates a Latin-1 Supplement or Latin Extended-A character to lowercase ASCII.
///
/// Based on lodash's deburr. Arms are grouped by replacement; each arm lists the Latin-1
/// Supplement code points first, followed by the Latin Extended-A ones.
///
/// # Panics
///
/// Panics for any character that has no entry in the table below.
fn latin_to_ascii_lowercase(c: char) -> &'static str {
    match c {
        '\u{00c0}'..='\u{00c5}' | '\u{00e0}'..='\u{00e5}' | '\u{0100}'..='\u{0105}' => "a",
        '\u{00c6}' | '\u{00e6}' => "ae",
        '\u{00c7}' | '\u{00e7}' | '\u{0106}'..='\u{010d}' => "c",
        '\u{00d0}' | '\u{00f0}' | '\u{010e}'..='\u{0111}' => "d",
        '\u{00c8}'..='\u{00cb}' | '\u{00e8}'..='\u{00eb}' | '\u{0112}'..='\u{011b}' => "e",
        '\u{011c}'..='\u{0123}' => "g",
        '\u{0124}'..='\u{0127}' => "h",
        '\u{00cc}'..='\u{00cf}' | '\u{00ec}'..='\u{00ef}' | '\u{0128}'..='\u{0131}' => "i",
        '\u{0132}'..='\u{0133}' => "ij",
        '\u{0134}'..='\u{0135}' => "j",
        '\u{0136}'..='\u{0138}' => "k",
        '\u{0139}'..='\u{0142}' => "l",
        '\u{00d1}' | '\u{00f1}' | '\u{0143}'..='\u{014b}' => "n",
        '\u{00d2}'..='\u{00d8}' | '\u{00f2}'..='\u{00f8}' | '\u{014c}'..='\u{0151}' => "o",
        '\u{0152}'..='\u{0153}' => "oe",
        '\u{0154}'..='\u{0159}' => "r",
        '\u{015a}'..='\u{0161}' | '\u{017f}' => "s",
        '\u{00df}' => "ss",
        '\u{0162}'..='\u{0167}' => "t",
        '\u{00de}' | '\u{00fe}' => "th",
        '\u{00d9}'..='\u{00dc}' | '\u{00f9}'..='\u{00fc}' | '\u{0168}'..='\u{0173}' => "u",
        '\u{0174}'..='\u{0175}' => "w",
        '\u{00dd}' | '\u{00fd}' | '\u{00ff}' | '\u{0176}'..='\u{0178}' => "y",
        '\u{0179}'..='\u{017e}' => "z",
        _ => panic!(),
    }
}

/// What the `Slugify` iterator has to do on its next call to `next()`.
#[derive(Debug)]
enum State {
    /// Consume the next input character and act on it.
    TakeChar,
    /// Drain the lowercase expansion of an uppercase input character.
    Lowercase(ToLowercase),
    /// Drain a multi-character replacement; the flag records whether the replacement is a
    /// separate word that must be followed by a separator.
    Multi(Chars<'static>, bool),
    /// The input is exhausted; nothing more is produced.
    Done,
}

/// How a single input character contributes to the slug.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Action {
    /// Emit the given character as is.
    Emit(char),
    /// Emit the given replacement text; when the flag is `true` the text is a word of its own
    /// and is wrapped in separators.
    EmitMulti(&'static str, bool),
    /// Emit the lowercase form of the input character.
    Lowercase,
    /// Replace the input character with the separator.
    Substitute,
}

#[cfg(test)]
mod tests {
    use super::slugify;

    /// Table of `(input, expected slug)` pairs covering the basic transformations.
    #[test]
    fn test_simple() {
        let cases = [
            ("foo", "foo"),
            ("Bar", "bar"),
            ("Foo: bar 2 baz", "foo-bar-2-baz"),
            ("🖖", "🖖"),
            ("Ì", "i"),
            ("café", "cafe"),
            // combining diacritical mark
            ("cafe\u{0301}", "cafe"),
            ("1.9.4 Porting Spree", "1-9-4-porting-spree"),
            ("Hello, World!", "hello-world"),
            ("blah!", "blah"),
            ("foo&bar", "foo-and-bar"),
            ("foo & bar", "foo-and-bar"),
            ("FooBar", "foo-bar"),
            ("FOO", "foo"),
        ];

        for (input, expected) in cases {
            assert_eq!(slugify(input), expected, "input: {input:?}");
        }
    }
}