Process runs of unmodified characters as characterSequence tokens

This commit is contained in:
Shadowfacts 2023-11-28 20:58:01 -05:00
parent f7f35e09f7
commit 2f18ad3cf4
3 changed files with 45 additions and 18 deletions

View File

@@ -49,6 +49,8 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
switch token { switch token {
case .character(let c): case .character(let c):
currentRun.unicodeScalars.append(c) currentRun.unicodeScalars.append(c)
case .characterSequence(let s):
currentRun.append(s)
case .comment: case .comment:
// ignored // ignored
continue continue

View File

@@ -212,6 +212,7 @@ struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
enum Token: Equatable { enum Token: Equatable {
case character(Unicode.Scalar) case character(Unicode.Scalar)
case characterSequence(String)
case comment(String) case comment(String)
case startTag(String, selfClosing: Bool, attributes: [Attribute]) case startTag(String, selfClosing: Bool, attributes: [Attribute])
case endTag(String) case endTag(String)
@@ -326,20 +327,44 @@ private enum DoctypeIdentifierQuotation {
private extension Tokenizer { private extension Tokenizer {
mutating func tokenizeData() -> Token? { mutating func tokenizeData() -> Token? {
// Optimization: It's common to have runs of characters that are tokenized as-is,
// so try to return them as a single token so the downstream consumer
// can avoid repeated work.
var buf = ""
while true {
switch nextChar() { switch nextChar() {
case "&": case "&":
returnState = .data returnState = .data
state = .characterReference state = .characterReference
if buf.isEmpty {
return tokenizeCharacterReference() return tokenizeCharacterReference()
} else {
return .characterSequence(buf)
}
case "<": case "<":
state = .tagOpen state = .tagOpen
if buf.isEmpty {
return tokenizeTagOpen() return tokenizeTagOpen()
} else {
return .characterSequence(buf)
}
case "\0": case "\0":
if buf.isEmpty {
return .character("\0") return .character("\0")
} else {
reconsume("\0")
return .characterSequence(buf)
}
case nil: case nil:
return nil // end of fil if buf.isEmpty {
return nil // end of file
} else {
return .characterSequence(buf)
}
case .some(let c): case .some(let c):
return .character(c) buf.unicodeScalars.append(c)
continue
}
} }
} }

View File

@@ -19,10 +19,10 @@ final class TokenizerTests: XCTestCase {
func testNamedCharacterReferences() { func testNamedCharacterReferences() {
XCTAssertEqual(tokenize("&amp;"), [.character("&")]) XCTAssertEqual(tokenize("&amp;"), [.character("&")])
// missing-semicolon-after-character-reference: // missing-semicolon-after-character-reference:
XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .character("i"), .character("n")]) XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .characterSequence("in")])
XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")]) XCTAssertEqual(tokenize("&notin"), [.character("¬"), .characterSequence("in")])
// unknown-named-character-reference: // unknown-named-character-reference:
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")]) XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .characterSequence("it;")])
XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) }) XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) }) XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
@@ -71,7 +71,7 @@ final class TokenizerTests: XCTestCase {
} }
func testMultiScalar() { func testMultiScalar() {
XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")]) XCTAssertEqual(tokenize("🇺🇸"), [.characterSequence("\u{1F1FA}\u{1F1F8}")])
} }
} }