diff --git a/Sources/HTMLStreamer/AttributedStringConverter.swift b/Sources/HTMLStreamer/AttributedStringConverter.swift
index e243110..eb3c7e1 100644
--- a/Sources/HTMLStreamer/AttributedStringConverter.swift
+++ b/Sources/HTMLStreamer/AttributedStringConverter.swift
@@ -49,6 +49,8 @@ public struct AttributedStringConverter {
switch token {
case .character(let c):
currentRun.unicodeScalars.append(c)
+ case .characterSequence(let s):
+ currentRun.append(s)
case .comment:
// ignored
continue
diff --git a/Sources/HTMLStreamer/Tokenizer.swift b/Sources/HTMLStreamer/Tokenizer.swift
index 22bcd7d..81d7a36 100644
--- a/Sources/HTMLStreamer/Tokenizer.swift
+++ b/Sources/HTMLStreamer/Tokenizer.swift
@@ -212,6 +212,7 @@ struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
enum Token: Equatable {
case character(Unicode.Scalar)
+ case characterSequence(String)
case comment(String)
case startTag(String, selfClosing: Bool, attributes: [Attribute])
case endTag(String)
@@ -326,20 +327,44 @@ private enum DoctypeIdentifierQuotation {
private extension Tokenizer {
mutating func tokenizeData() -> Token? {
- switch nextChar() {
- case "&":
- returnState = .data
- state = .characterReference
- return tokenizeCharacterReference()
- case "<":
- state = .tagOpen
- return tokenizeTagOpen()
- case "\0":
- return .character("\0")
- case nil:
- return nil // end of fil
- case .some(let c):
- return .character(c)
+ // Optimization: It's common to have runs of characters that are tokenized as-is,
+ // so try to return them as a single token so the downstream consumer
+ // can avoid repeated work.
+ var buf = ""
+ while true {
+ switch nextChar() {
+ case "&":
+ returnState = .data
+ state = .characterReference
+ if buf.isEmpty {
+ return tokenizeCharacterReference()
+ } else {
+ return .characterSequence(buf)
+ }
+ case "<":
+ state = .tagOpen
+ if buf.isEmpty {
+ return tokenizeTagOpen()
+ } else {
+ return .characterSequence(buf)
+ }
+ case "\0":
+ if buf.isEmpty {
+ return .character("\0")
+ } else {
+ reconsume("\0")
+ return .characterSequence(buf)
+ }
+ case nil:
+ if buf.isEmpty {
+ return nil // end of file
+ } else {
+ return .characterSequence(buf)
+ }
+ case .some(let c):
+ buf.unicodeScalars.append(c)
+ continue
+ }
}
}
diff --git a/Tests/HTMLStreamerTests/TokenizerTests.swift b/Tests/HTMLStreamerTests/TokenizerTests.swift
index b1f3aa7..d110e77 100644
--- a/Tests/HTMLStreamerTests/TokenizerTests.swift
+++ b/Tests/HTMLStreamerTests/TokenizerTests.swift
@@ -19,10 +19,10 @@ final class TokenizerTests: XCTestCase {
func testNamedCharacterReferences() {
XCTAssertEqual(tokenize("&amp;"), [.character("&")])
// missing-semicolon-after-character-reference:
- XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
- XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .character("i"), .character("n")])
+ XCTAssertEqual(tokenize("&notin"), [.character("¬"), .characterSequence("in")])
+ XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .characterSequence("in")])
// unknown-named-character-reference:
- XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
+ XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .characterSequence("it;")])
XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
@@ -71,7 +71,7 @@ final class TokenizerTests: XCTestCase {
}
func testMultiScalar() {
- XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")])
+ XCTAssertEqual(tokenize("🇺🇸"), [.characterSequence("\u{1F1FA}\u{1F1F8}")])
}
}