Process runs of unmodified characters as characterSequence tokens
This commit is contained in:
parent
f7f35e09f7
commit
2f18ad3cf4
|
@ -49,6 +49,8 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
|
|||
switch token {
|
||||
case .character(let c):
|
||||
currentRun.unicodeScalars.append(c)
|
||||
case .characterSequence(let s):
|
||||
currentRun.append(s)
|
||||
case .comment:
|
||||
// ignored
|
||||
continue
|
||||
|
|
|
@ -212,6 +212,7 @@ struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
|
|||
|
||||
enum Token: Equatable {
|
||||
case character(Unicode.Scalar)
|
||||
case characterSequence(String)
|
||||
case comment(String)
|
||||
case startTag(String, selfClosing: Bool, attributes: [Attribute])
|
||||
case endTag(String)
|
||||
|
@ -326,20 +327,44 @@ private enum DoctypeIdentifierQuotation {
|
|||
|
||||
private extension Tokenizer {
|
||||
mutating func tokenizeData() -> Token? {
|
||||
// Optimization: It's common to have runs of characters that are tokenized as-is,
|
||||
// so try to return them as a single token so the downstream consumer
|
||||
// can avoid repeated work.
|
||||
var buf = ""
|
||||
while true {
|
||||
switch nextChar() {
|
||||
case "&":
|
||||
returnState = .data
|
||||
state = .characterReference
|
||||
if buf.isEmpty {
|
||||
return tokenizeCharacterReference()
|
||||
} else {
|
||||
return .characterSequence(buf)
|
||||
}
|
||||
case "<":
|
||||
state = .tagOpen
|
||||
if buf.isEmpty {
|
||||
return tokenizeTagOpen()
|
||||
} else {
|
||||
return .characterSequence(buf)
|
||||
}
|
||||
case "\0":
|
||||
if buf.isEmpty {
|
||||
return .character("\0")
|
||||
} else {
|
||||
reconsume("\0")
|
||||
return .characterSequence(buf)
|
||||
}
|
||||
case nil:
|
||||
return nil // end of file
|
||||
if buf.isEmpty {
|
||||
return nil // end of file
|
||||
} else {
|
||||
return .characterSequence(buf)
|
||||
}
|
||||
case .some(let c):
|
||||
return .character(c)
|
||||
buf.unicodeScalars.append(c)
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,10 +19,10 @@ final class TokenizerTests: XCTestCase {
|
|||
func testNamedCharacterReferences() {
|
||||
XCTAssertEqual(tokenize("&amp;"), [.character("&")])
|
||||
// missing-semicolon-after-character-reference:
|
||||
XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .character("i"), .character("n")])
|
||||
XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
|
||||
XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .characterSequence("in")])
|
||||
XCTAssertEqual(tokenize("&notin"), [.character("¬"), .characterSequence("in")])
|
||||
// unknown-named-character-reference:
|
||||
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
|
||||
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .characterSequence("it;")])
|
||||
XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
|
||||
XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
|
||||
|
||||
|
@ -71,7 +71,7 @@ final class TokenizerTests: XCTestCase {
|
|||
}
|
||||
|
||||
func testMultiScalar() {
|
||||
XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")])
|
||||
XCTAssertEqual(tokenize("🇺🇸"), [.characterSequence("\u{1F1FA}\u{1F1F8}")])
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue