From f9730b9ec50ade847c18bb0f5a407ea7be0d022b Mon Sep 17 00:00:00 2001 From: ConnorSkees <39542938+ConnorSkees@users.noreply.github.com> Date: Sun, 22 Mar 2020 13:45:41 -0400 Subject: [PATCH] HACK: somewhat handle unicode escapes --- src/lexer.rs | 76 +++++++++++++++++++++++++++++++++++++++++++--- src/utils.rs | 13 ++++++++ src/value/parse.rs | 17 +++++++++++ tests/misc.rs | 16 ++++++++++ 4 files changed, 117 insertions(+), 5 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index 57e9db1..f21c3c2 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -17,6 +17,7 @@ pub(crate) struct Lexer<'a> { tokens: Vec, buf: Peekable>, pos: Pos, + should_emit_backslash: usize, } impl<'a> Iterator for Lexer<'a> { @@ -36,6 +37,13 @@ impl<'a> Iterator for Lexer<'a> { TokenKind::Whitespace(Whitespace::$whitespace) }}; } + if self.should_emit_backslash > 0 { + self.should_emit_backslash -= 1; + return Some(Token { + kind: TokenKind::Symbol(Symbol::BackSlash), + pos: self.pos, + }); + } let kind: TokenKind = match self.buf.peek().unwrap_or(&'\0') { 'a'..='z' | 'A'..='Z' | '_' => self.lex_ident(), '-' => { @@ -107,7 +115,7 @@ impl<'a> Iterator for Lexer<'a> { } } '?' => symbol!(self, QuestionMark), - '\\' => symbol!(self, BackSlash), + '\\' => self.lex_back_slash().0, '~' => symbol!(self, Tilde), '\'' => symbol!(self, SingleQuote), '"' => symbol!(self, DoubleQuote), @@ -158,6 +166,7 @@ impl<'a> Lexer<'a> { tokens: Vec::with_capacity(buf.len()), buf: buf.chars().peekable(), pos: Pos::new(), + should_emit_backslash: 0, } } @@ -209,6 +218,51 @@ impl<'a> Lexer<'a> { } } + fn lex_back_slash(&mut self) -> (TokenKind, bool) { + self.buf.next(); + self.pos.next_char(); + if self.buf.peek() == Some(&'\\') { + self.buf.next(); + self.pos.next_char(); + self.should_emit_backslash = 1; + (TokenKind::Symbol(Symbol::BackSlash), true) + } else { + let mut n = String::new(); + while let Some(c) = self.buf.peek() { + if !c.is_ascii_hexdigit() || n.len() > 6 { + break; + } + n.push(*c); + self.buf.next(); + self.pos.next_char(); + } + + if n.is_empty() { + return (TokenKind::Symbol(Symbol::BackSlash), false); + } + + let mut string = std::char::from_u32(u32::from_str_radix(&n, 16).unwrap()) + .unwrap() + .to_string(); + self.devour_whitespace(); + if let TokenKind::Ident(s) = self.lex_ident() { + string.push_str(&s); + } + (TokenKind::Ident(string), false) + } + } + + fn devour_whitespace(&mut self) { + while let Some(c) = self.buf.peek() { + if c.is_ascii_whitespace() { + self.buf.next(); + self.pos.next_char(); + continue; + } + break; + } + } + fn lex_forward_slash(&mut self) -> TokenKind { self.buf.next(); self.pos.next_char(); @@ -331,18 +385,30 @@ impl<'a> Lexer<'a> { } } - // TODO: handle weird characters that *are* ascii - // e.g. how do we handle `color: ;` fn lex_ident(&mut self) -> TokenKind { let mut string = String::with_capacity(99); while let Some(c) = self.buf.peek() { - // we know that the first char is alphabetic from peeking - if !c.is_alphanumeric() && c != &'-' && c != &'_' && c.is_ascii() { + if !c.is_alphanumeric() && c != &'-' && c != &'_' && c != &'\\' && c.is_ascii() { break; } if !c.is_ascii() { IS_UTF8.store(true, Ordering::Relaxed); } + if c == &'\\' { + match self.lex_back_slash() { + (TokenKind::Ident(s), _) => string.push_str(&s), + (TokenKind::Symbol(..), true) => { + self.should_emit_backslash = 2; + break; + } + (TokenKind::Symbol(..), false) => { + self.should_emit_backslash = 1; + break; + } + _ => unreachable!(), + } + continue; + } let tok = self .buf .next() diff --git a/src/utils.rs b/src/utils.rs index c142a44..3a9f740 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -160,6 +160,19 @@ pub(crate) fn flatten_ident>( toks.next(); s.push_str(n) } + TokenKind::Symbol(Symbol::BackSlash) => { + s.push('\\'); + toks.next(); + if let Some(tok) = toks.next() { + match tok.kind { + TokenKind::Symbol(Symbol::Plus) => s.push('+'), + TokenKind::Symbol(Symbol::BackSlash) => s.push('\\'), + _ => todo!("value after \\"), + } + } else { + todo!() + } + } _ => break, } } diff --git a/src/value/parse.rs b/src/value/parse.rs index 6abef4c..08cdfd4 100644 --- a/src/value/parse.rs +++ b/src/value/parse.rs @@ -294,6 +294,23 @@ impl Value { TokenKind::Keyword(Keyword::To(s)) => Ok(Value::Ident(s, QuoteKind::None)), TokenKind::AtRule(_) => Err("expected \";\".".into()), TokenKind::Error(e) => return Err(e), + TokenKind::Symbol(Symbol::BackSlash) => { + if let Some(tok) = toks.next() { + match tok.kind { + TokenKind::Symbol(Symbol::Plus) => Ok(Value::Ident( + "\\+".to_string() + &flatten_ident(toks, scope, super_selector)?, + QuoteKind::None, + )), + TokenKind::Symbol(Symbol::BackSlash) => Ok(Value::Ident( + "\\\\".to_string() + &flatten_ident(toks, scope, super_selector)?, + QuoteKind::None, + )), + _ => todo!("value after \\"), + } + } else { + todo!() + } + } TokenKind::Op(Op::Plus) | TokenKind::Symbol(Symbol::Plus) => { devour_whitespace_or_comment(toks); let v = Self::_from_tokens(toks, scope, super_selector)?; diff --git a/tests/misc.rs b/tests/misc.rs index 745a119..4330efa 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -66,3 +66,19 @@ test!( "a {\n color: red😁\n}\n", "@charset \"UTF-8\";\na {\n color: red😁;\n}\n" ); +test!( + escape_recognized_as_at_rule, + "@\\69 f true {\n a {\n b: c;\n }\n}\n", + "a {\n b: c;\n}\n" +); +test!( + escape_in_middle, + "a {\n color: b\\6cue;\n}\n", + "a {\n color: blue;\n}\n" +); +test!( + escape_at_end, + "a {\n color: blu\\65;\n}\n", + "a {\n color: blue;\n}\n" +); +test!(double_escape_is_preserved, "a {\n color: r\\\\65;\n}\n");