From 2f18ad3cf4c8218e44c22961f82a8585b7725227 Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Tue, 28 Nov 2023 20:58:01 -0500 Subject: [PATCH] Process runs of unmodified characters as characterSequence tokens --- .../AttributedStringConverter.swift | 2 + Sources/HTMLStreamer/Tokenizer.swift | 53 ++++++++++++++----- Tests/HTMLStreamerTests/TokenizerTests.swift | 8 +-- 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/Sources/HTMLStreamer/AttributedStringConverter.swift b/Sources/HTMLStreamer/AttributedStringConverter.swift index e243110..eb3c7e1 100644 --- a/Sources/HTMLStreamer/AttributedStringConverter.swift +++ b/Sources/HTMLStreamer/AttributedStringConverter.swift @@ -49,6 +49,8 @@ public struct AttributedStringConverter { switch token { case .character(let c): currentRun.unicodeScalars.append(c) + case .characterSequence(let s): + currentRun.append(s) case .comment: // ignored continue diff --git a/Sources/HTMLStreamer/Tokenizer.swift b/Sources/HTMLStreamer/Tokenizer.swift index 22bcd7d..81d7a36 100644 --- a/Sources/HTMLStreamer/Tokenizer.swift +++ b/Sources/HTMLStreamer/Tokenizer.swift @@ -212,6 +212,7 @@ struct Tokenizer>: IteratorProtocol { enum Token: Equatable { case character(Unicode.Scalar) + case characterSequence(String) case comment(String) case startTag(String, selfClosing: Bool, attributes: [Attribute]) case endTag(String) @@ -326,20 +327,44 @@ private enum DoctypeIdentifierQuotation { private extension Tokenizer { mutating func tokenizeData() -> Token? 
{ - switch nextChar() { - case "&": - returnState = .data - state = .characterReference - return tokenizeCharacterReference() - case "<": - state = .tagOpen - return tokenizeTagOpen() - case "\0": - return .character("\0") - case nil: - return nil // end of fil - case .some(let c): - return .character(c) + // Optimization: It's common to have runs of characters that are tokenized as-is, + // so try to return them as a single token so the downstream consumer + // can avoid repeated work. + var buf = "" + while true { + switch nextChar() { + case "&": + returnState = .data + state = .characterReference + if buf.isEmpty { + return tokenizeCharacterReference() + } else { + return .characterSequence(buf) + } + case "<": + state = .tagOpen + if buf.isEmpty { + return tokenizeTagOpen() + } else { + return .characterSequence(buf) + } + case "\0": + if buf.isEmpty { + return .character("\0") + } else { + reconsume("\0") + return .characterSequence(buf) + } + case nil: + if buf.isEmpty { + return nil // end of file + } else { + return .characterSequence(buf) + } + case .some(let c): + buf.unicodeScalars.append(c) + continue + } } } diff --git a/Tests/HTMLStreamerTests/TokenizerTests.swift b/Tests/HTMLStreamerTests/TokenizerTests.swift index b1f3aa7..d110e77 100644 --- a/Tests/HTMLStreamerTests/TokenizerTests.swift +++ b/Tests/HTMLStreamerTests/TokenizerTests.swift @@ -19,10 +19,10 @@ final class TokenizerTests: XCTestCase { func testNamedCharacterReferences() { XCTAssertEqual(tokenize("&"), [.character("&")]) // missing-semicolon-after-character-reference: - XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .character("i"), .character("n")]) - XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")]) + XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .characterSequence("in")]) + XCTAssertEqual(tokenize("&notin"), [.character("¬"), .characterSequence("in")]) // unknown-named-character-reference: - XCTAssertEqual(tokenize("&notit;"), [.character("¬"), 
.character("i"), .character("t"), .character(";")]) + XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .characterSequence("it;")]) XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) }) XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) }) @@ -71,7 +71,7 @@ final class TokenizerTests: XCTestCase { } func testMultiScalar() { - XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")]) + XCTAssertEqual(tokenize("🇺🇸"), [.characterSequence("\u{1F1FA}\u{1F1F8}")]) } }