Process runs of unmodified characters as characterSequence tokens

This commit is contained in:
Shadowfacts 2023-11-28 20:58:01 -05:00
parent f7f35e09f7
commit 2f18ad3cf4
3 changed files with 45 additions and 18 deletions

View File

@ -49,6 +49,8 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
switch token {
case .character(let c):
currentRun.unicodeScalars.append(c)
case .characterSequence(let s):
currentRun.append(s)
case .comment:
// ignored
continue

View File

@ -212,6 +212,7 @@ struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
enum Token: Equatable {
case character(Unicode.Scalar)
case characterSequence(String)
case comment(String)
case startTag(String, selfClosing: Bool, attributes: [Attribute])
case endTag(String)
@ -326,20 +327,44 @@ private enum DoctypeIdentifierQuotation {
private extension Tokenizer {
mutating func tokenizeData() -> Token? {
switch nextChar() {
case "&":
returnState = .data
state = .characterReference
return tokenizeCharacterReference()
case "<":
state = .tagOpen
return tokenizeTagOpen()
case "\0":
return .character("\0")
case nil:
return nil // end of file
case .some(let c):
return .character(c)
// Optimization: It's common to have runs of characters that are tokenized as-is,
// so try to return them as a single token so the downstream consumer
// can avoid repeated work.
var buf = ""
while true {
switch nextChar() {
case "&":
returnState = .data
state = .characterReference
if buf.isEmpty {
return tokenizeCharacterReference()
} else {
return .characterSequence(buf)
}
case "<":
state = .tagOpen
if buf.isEmpty {
return tokenizeTagOpen()
} else {
return .characterSequence(buf)
}
case "\0":
if buf.isEmpty {
return .character("\0")
} else {
reconsume("\0")
return .characterSequence(buf)
}
case nil:
if buf.isEmpty {
return nil // end of file
} else {
return .characterSequence(buf)
}
case .some(let c):
buf.unicodeScalars.append(c)
continue
}
}
}

View File

@ -19,10 +19,10 @@ final class TokenizerTests: XCTestCase {
func testNamedCharacterReferences() {
XCTAssertEqual(tokenize("&amp;"), [.character("&")])
// missing-semicolon-after-character-reference:
XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .character("i"), .character("n")])
XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .characterSequence("in")])
XCTAssertEqual(tokenize("&notin"), [.character("¬"), .characterSequence("in")])
// unknown-named-character-reference:
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .characterSequence("it;")])
XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
@ -71,7 +71,7 @@ final class TokenizerTests: XCTestCase {
}
func testMultiScalar() {
XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")])
XCTAssertEqual(tokenize("🇺🇸"), [.characterSequence("\u{1F1FA}\u{1F1F8}")])
}
}