Faster tokenizing for named character references

This commit is contained in:
Shadowfacts 2023-11-26 18:26:15 -05:00
parent e22f778f8f
commit 134803b72d
2 changed files with 46 additions and 39 deletions

View File

@@ -349,54 +349,56 @@ private extension Tokenizer {
}
mutating func tokenizeNamedCharaterReference() -> Token? {
// TODO: this could definitely be faster
// maybe with a prefix tree for named characters
var everHadMatch = false
var outOfChars = false
func hasMatch() -> Bool {
// consume as many [a-zA-Z0-9] as possible, until semicolon
loop: while let c = nextChar() {
switch c {
case "a"..."z", "A"..."Z", "0"..."9":
temporaryBuffer!.append(c)
case ";":
temporaryBuffer!.append(c)
break loop
default:
reconsume(c)
break loop
}
}
var referent = namedCharactersDecodeMap[String(temporaryBuffer!.dropFirst())]
if referent == nil {
// start from the beginning and try to find a reference
var key = ";"
let buf = temporaryBuffer!
let key = buf[buf.index(after: buf.startIndex)...]
return namedCharactersDecodeMap.keys.contains(where: { $0.starts(with: key) })
}
while hasMatch() {
everHadMatch = true
guard let char = nextChar() else {
outOfChars = true
break
var index = buf.index(after: buf.startIndex)
while index < buf.endIndex {
key.replaceSubrange(key.index(before: key.endIndex)..., with: "\(buf[index]);")
buf.formIndex(after: &index)
referent = namedCharactersDecodeMap[key]
if referent != nil {
break
}
}
temporaryBuffer!.append(char)
}
if everHadMatch {
if !outOfChars {
// the last character changed us from having a match to not
reconsume(temporaryBuffer!.removeLast())
if referent != nil {
for c in buf[index...].reversed() {
reconsume(c)
}
temporaryBuffer!.removeSubrange(index...)
}
}
if let referent {
if case .attributeValue(_) = returnState,
temporaryBuffer!.last != ";",
let peeked = peekChar(),
peeked == "=" || (peeked.isASCII && (peeked.isLetter || peeked.isNumber)) {
state = .flushingTemporaryBuffer(returnState!)
let next = peekChar(),
next == "=" || ("a"..."z").contains(next) || ("A"..."Z").contains(next) || ("0"..."9").contains(next) {
flushCharacterReference()
} else {
let insertSemicolon = temporaryBuffer!.last != ";"
if insertSemicolon {
// parse error: missing-semicolon-after-character-reference
// Usually the parser behaves as if character reference is terminated by the U+003B (;) code point; however, there are some ambiguous cases in which the parser includes subsequent code points in the character reference.
temporaryBuffer!.append(";")
}
if let reference = namedCharactersDecodeMap[String(temporaryBuffer![temporaryBuffer!.index(after: temporaryBuffer!.startIndex)...])] {
temporaryBuffer = "\(reference)"
flushCharacterReference()
} else {
if insertSemicolon {
temporaryBuffer!.removeLast()
}
state = .flushingTemporaryBuffer(.ambiguousAmpersand)
}
temporaryBuffer = "\(referent)"
flushCharacterReference()
}
} else {
state = .flushingTemporaryBuffer(.ambiguousAmpersand)
}
return next()
}

View File

@@ -20,9 +20,14 @@ final class TokenizerTests: XCTestCase {
XCTAssertEqual(tokenize("&amp;"), [.character("&")])
// missing-semicolon-after-character-reference:
XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .character("i"), .character("n")])
XCTAssertEqual(tokenize("&notin"), [.character("")])
XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
// unknown-named-character-reference:
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
// attribute special case
XCTAssertEqual(tokenize("<a a='&nota' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "&nota")])])
}
func testNumericCharacterReference() {