Faster tokenizing for named character references

Shadowfacts 2023-11-26 18:26:15 -05:00
parent e22f778f8f
commit 134803b72d
2 changed files with 46 additions and 39 deletions

@@ -349,54 +349,56 @@ private extension Tokenizer {
     }
     mutating func tokenizeNamedCharaterReference() -> Token? {
-        // TODO: this could definitely be faster
-        // maybe with a prefix tree for named characters
-        var everHadMatch = false
-        var outOfChars = false
-        func hasMatch() -> Bool {
-            let buf = temporaryBuffer!
-            let key = buf[buf.index(after: buf.startIndex)...]
-            return namedCharactersDecodeMap.keys.contains(where: { $0.starts(with: key) })
-        }
-        while hasMatch() {
-            everHadMatch = true
-            guard let char = nextChar() else {
-                outOfChars = true
-                break
+        // consume as many [a-zA-Z0-9] as possible, until semicolon
+        loop: while let c = nextChar() {
+            switch c {
+            case "a"..."z", "A"..."Z", "0"..."9":
+                temporaryBuffer!.append(c)
+            case ";":
+                temporaryBuffer!.append(c)
+                break loop
+            default:
+                reconsume(c)
+                break loop
             }
-            temporaryBuffer!.append(char)
         }
-        if everHadMatch {
-            if !outOfChars {
-                // the last character changed us from having a match to not
-                reconsume(temporaryBuffer!.removeLast())
-            }
+        var referent = namedCharactersDecodeMap[String(temporaryBuffer!.dropFirst())]
+        if referent == nil {
+            // start from the beginning and try to find a reference
+            var key = ";"
+            let buf = temporaryBuffer!
+            var index = buf.index(after: buf.startIndex)
+            while index < buf.endIndex {
+                key.replaceSubrange(key.index(before: key.endIndex)..., with: "\(buf[index]);")
+                buf.formIndex(after: &index)
+                referent = namedCharactersDecodeMap[key]
+                if referent != nil {
+                    break
+                }
+            }
+            if referent != nil {
+                for c in buf[index...].reversed() {
+                    reconsume(c)
+                }
+                temporaryBuffer!.removeSubrange(index...)
+            }
+        }
+        if let referent {
             if case .attributeValue(_) = returnState,
                temporaryBuffer!.last != ";",
-               let peeked = peekChar(),
-               peeked == "=" || (peeked.isASCII && (peeked.isLetter || peeked.isNumber)) {
-                state = .flushingTemporaryBuffer(returnState!)
+               let next = peekChar(),
+               next == "=" || ("a"..."z").contains(next) || ("A"..."Z").contains(next) || ("0"..."9").contains(next) {
+                flushCharacterReference()
             } else {
-                let insertSemicolon = temporaryBuffer!.last != ";"
-                if insertSemicolon {
-                    // parse error: missing-semicolon-after-character-reference
-                    // Usually the parser behaves as if character reference is terminated by the U+003B (;) code point; however, there are some ambiguous cases in which the parser includes subsequent code points in the character reference.
-                    temporaryBuffer!.append(";")
-                }
-                if let reference = namedCharactersDecodeMap[String(temporaryBuffer![temporaryBuffer!.index(after: temporaryBuffer!.startIndex)...])] {
-                    temporaryBuffer = "\(reference)"
-                    flushCharacterReference()
-                } else {
-                    if insertSemicolon {
-                        temporaryBuffer!.removeLast()
-                    }
-                    state = .flushingTemporaryBuffer(.ambiguousAmpersand)
-                }
+                temporaryBuffer = "\(referent)"
+                flushCharacterReference()
             }
         } else {
             state = .flushingTemporaryBuffer(.ambiguousAmpersand)
        }
         return next()
     }
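
The old loop called namedCharactersDecodeMap.keys.contains(where: { $0.starts(with: key) }) for every character it consumed, scanning the whole key set each time; the new version consumes the entire [a-zA-Z0-9] run (plus a terminating ";", if any) in one pass and then resolves it with ordinary dictionary lookups. A rough standalone sketch of that consume-then-look-up idea, with a three-entry toy decodeMap standing in for the real namedCharactersDecodeMap and the tokenizer's temporaryBuffer/reconsume plumbing left out (the helper name and signature here are illustrative, not the project's API):

// Toy table; the real namedCharactersDecodeMap has far more entries.
let decodeMap: [String: Character] = ["amp;": "&", "not;": "¬", "notin;": "∉"]

// Decode a reference from the text following "&", returning the character
// and how many input characters were used.
func decodeNamedReference(_ input: String) -> (referent: Character, consumed: Int)? {
    // 1. Consume as many [a-zA-Z0-9] as possible, plus a terminating ";".
    var name = ""
    scan: for c in input {
        switch c {
        case "a"..."z", "A"..."Z", "0"..."9":
            name.append(c)
        case ";":
            name.append(c)
            break scan
        default:
            break scan
        }
    }
    // 2. One lookup for the whole run...
    if let exact = decodeMap[name] {
        return (exact, name.count)
    }
    // 3. ...then at most one lookup per ";"-terminated prefix, shortest first,
    // so unmatched trailing characters can be handed back to the tokenizer.
    var prefix = ""
    for c in name where c != ";" {
        prefix.append(c)
        if let hit = decodeMap[prefix + ";"] {
            return (hit, prefix.count)
        }
    }
    return nil
}

// "&notin" has no semicolon, so only the "not;" entry applies and "in" is
// left over to be re-tokenized as plain characters, matching the updated tests.
if let hit = decodeNamedReference("notin") {
    print(hit.referent, "notin".dropFirst(hit.consumed))  // ¬ in
}
if let hit = decodeNamedReference("notin;") {
    print(hit.referent)                                    // ∉
}
print(decodeNamedReference("asdf") == nil)                 // true: falls through to ambiguous-ampersand handling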

@@ -20,9 +20,14 @@ final class TokenizerTests: XCTestCase {
         XCTAssertEqual(tokenize("&amp;"), [.character("&")])
         // missing-semicolon-after-character-reference:
         XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .character("i"), .character("n")])
-        XCTAssertEqual(tokenize("&notin"), [.character("∉")])
+        XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
         // unknown-named-character-reference:
+        XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
         XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
+        XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
+        // attribute special case
+        XCTAssertEqual(tokenize("<a a='&nota' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "&nota")])])
     }
     func testNumericCharacterReference() {
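
The new <a a='&nota' /> test exercises the attribute-value carve-out that both versions keep: when the matched reference has no trailing ";" and the next input character is "=" or an ASCII alphanumeric, the buffered text is flushed verbatim instead of being decoded, the HTML spec's historical-compatibility rule for strings like "&not" inside query parameters. A minimal sketch of just that check, where next stands for the peeked character (illustrative helper, not part of the tokenizer):

// True when an un-terminated reference inside an attribute value must be
// left as literal text rather than decoded.
func keepsLiteralTextInAttribute(next: Character) -> Bool {
    next == "=" || ("a"..."z").contains(next) || ("A"..."Z").contains(next) || ("0"..."9").contains(next)
}

print(keepsLiteralTextInAttribute(next: "a"))  // true:  '&nota' stays "&nota"
print(keepsLiteralTextInAttribute(next: " "))  // false: '&not ' decodes to "¬ "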