Process runs of unmodified characters as characterSequence tokens
This commit is contained in:
parent
f7f35e09f7
commit
2f18ad3cf4
|
@ -49,6 +49,8 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
|
||||||
switch token {
|
switch token {
|
||||||
case .character(let c):
|
case .character(let c):
|
||||||
currentRun.unicodeScalars.append(c)
|
currentRun.unicodeScalars.append(c)
|
||||||
|
case .characterSequence(let s):
|
||||||
|
currentRun.append(s)
|
||||||
case .comment:
|
case .comment:
|
||||||
// ignored
|
// ignored
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -212,6 +212,7 @@ struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
|
||||||
|
|
||||||
enum Token: Equatable {
|
enum Token: Equatable {
|
||||||
case character(Unicode.Scalar)
|
case character(Unicode.Scalar)
|
||||||
|
case characterSequence(String)
|
||||||
case comment(String)
|
case comment(String)
|
||||||
case startTag(String, selfClosing: Bool, attributes: [Attribute])
|
case startTag(String, selfClosing: Bool, attributes: [Attribute])
|
||||||
case endTag(String)
|
case endTag(String)
|
||||||
|
@ -326,20 +327,44 @@ private enum DoctypeIdentifierQuotation {
|
||||||
|
|
||||||
private extension Tokenizer {
|
private extension Tokenizer {
|
||||||
mutating func tokenizeData() -> Token? {
|
mutating func tokenizeData() -> Token? {
|
||||||
|
// Optimization: It's common to have runs of characters that are tokenized as-is,
|
||||||
|
// so try to return them as a single token so the downstream consumer
|
||||||
|
// can avoid repeated work.
|
||||||
|
var buf = ""
|
||||||
|
while true {
|
||||||
switch nextChar() {
|
switch nextChar() {
|
||||||
case "&":
|
case "&":
|
||||||
returnState = .data
|
returnState = .data
|
||||||
state = .characterReference
|
state = .characterReference
|
||||||
|
if buf.isEmpty {
|
||||||
return tokenizeCharacterReference()
|
return tokenizeCharacterReference()
|
||||||
|
} else {
|
||||||
|
return .characterSequence(buf)
|
||||||
|
}
|
||||||
case "<":
|
case "<":
|
||||||
state = .tagOpen
|
state = .tagOpen
|
||||||
|
if buf.isEmpty {
|
||||||
return tokenizeTagOpen()
|
return tokenizeTagOpen()
|
||||||
|
} else {
|
||||||
|
return .characterSequence(buf)
|
||||||
|
}
|
||||||
case "\0":
|
case "\0":
|
||||||
|
if buf.isEmpty {
|
||||||
return .character("\0")
|
return .character("\0")
|
||||||
|
} else {
|
||||||
|
reconsume("\0")
|
||||||
|
return .characterSequence(buf)
|
||||||
|
}
|
||||||
case nil:
|
case nil:
|
||||||
return nil // end of file
|
if buf.isEmpty {
|
||||||
|
return nil // end of file
|
||||||
|
} else {
|
||||||
|
return .characterSequence(buf)
|
||||||
|
}
|
||||||
case .some(let c):
|
case .some(let c):
|
||||||
return .character(c)
|
buf.unicodeScalars.append(c)
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,10 +19,10 @@ final class TokenizerTests: XCTestCase {
|
||||||
func testNamedCharacterReferences() {
|
func testNamedCharacterReferences() {
|
||||||
XCTAssertEqual(tokenize("&amp;"), [.character("&")])
|
XCTAssertEqual(tokenize("&amp;"), [.character("&")])
|
||||||
// missing-semicolon-after-character-reference:
|
// missing-semicolon-after-character-reference:
|
||||||
XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .character("i"), .character("n")])
|
XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .characterSequence("in")])
|
||||||
XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
|
XCTAssertEqual(tokenize("&notin"), [.character("¬"), .characterSequence("in")])
|
||||||
// unknown-named-character-reference:
|
// unknown-named-character-reference:
|
||||||
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
|
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .characterSequence("it;")])
|
||||||
XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
|
XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
|
||||||
XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
|
XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
|
||||||
|
|
||||||
|
@ -71,7 +71,7 @@ final class TokenizerTests: XCTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
func testMultiScalar() {
|
func testMultiScalar() {
|
||||||
XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")])
|
XCTAssertEqual(tokenize("🇺🇸"), [.characterSequence("\u{1F1FA}\u{1F1F8}")])
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue