Faster tokenizing for named character references
This commit is contained in:
parent
e22f778f8f
commit
134803b72d
|
@ -349,54 +349,56 @@ private extension Tokenizer {
|
|||
}
|
||||
|
||||
mutating func tokenizeNamedCharaterReference() -> Token? {
|
||||
// TODO: this could definitely be faster
|
||||
// maybe with a prefix tree for named characters
|
||||
var everHadMatch = false
|
||||
var outOfChars = false
|
||||
func hasMatch() -> Bool {
|
||||
let buf = temporaryBuffer!
|
||||
let key = buf[buf.index(after: buf.startIndex)...]
|
||||
return namedCharactersDecodeMap.keys.contains(where: { $0.starts(with: key) })
|
||||
// consume as many [a-zA-Z0-9] as possible, until semicolon
|
||||
loop: while let c = nextChar() {
|
||||
switch c {
|
||||
case "a"..."z", "A"..."Z", "0"..."9":
|
||||
temporaryBuffer!.append(c)
|
||||
case ";":
|
||||
temporaryBuffer!.append(c)
|
||||
break loop
|
||||
default:
|
||||
reconsume(c)
|
||||
break loop
|
||||
}
|
||||
while hasMatch() {
|
||||
everHadMatch = true
|
||||
guard let char = nextChar() else {
|
||||
outOfChars = true
|
||||
break
|
||||
}
|
||||
temporaryBuffer!.append(char)
|
||||
}
|
||||
if everHadMatch {
|
||||
if !outOfChars {
|
||||
// the last character changed us from having a match to not
|
||||
reconsume(temporaryBuffer!.removeLast())
|
||||
}
|
||||
|
||||
var referent = namedCharactersDecodeMap[String(temporaryBuffer!.dropFirst())]
|
||||
if referent == nil {
|
||||
// start from the beginning and try to find a reference
|
||||
var key = ";"
|
||||
let buf = temporaryBuffer!
|
||||
var index = buf.index(after: buf.startIndex)
|
||||
while index < buf.endIndex {
|
||||
key.replaceSubrange(key.index(before: key.endIndex)..., with: "\(buf[index]);")
|
||||
buf.formIndex(after: &index)
|
||||
referent = namedCharactersDecodeMap[key]
|
||||
if referent != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
if referent != nil {
|
||||
for c in buf[index...].reversed() {
|
||||
reconsume(c)
|
||||
}
|
||||
temporaryBuffer!.removeSubrange(index...)
|
||||
}
|
||||
}
|
||||
|
||||
if let referent {
|
||||
if case .attributeValue(_) = returnState,
|
||||
temporaryBuffer!.last != ";",
|
||||
let peeked = peekChar(),
|
||||
peeked == "=" || (peeked.isASCII && (peeked.isLetter || peeked.isNumber)) {
|
||||
state = .flushingTemporaryBuffer(returnState!)
|
||||
} else {
|
||||
let insertSemicolon = temporaryBuffer!.last != ";"
|
||||
if insertSemicolon {
|
||||
// parse error: missing-semicolon-after-character-reference
|
||||
// Usually the parser behaves as if character reference is terminated by the U+003B (;) code point; however, there are some ambiguous cases in which the parser includes subsequent code points in the character reference.
|
||||
temporaryBuffer!.append(";")
|
||||
}
|
||||
if let reference = namedCharactersDecodeMap[String(temporaryBuffer![temporaryBuffer!.index(after: temporaryBuffer!.startIndex)...])] {
|
||||
temporaryBuffer = "\(reference)"
|
||||
let next = peekChar(),
|
||||
next == "=" || ("a"..."z").contains(next) || ("A"..."Z").contains(next) || ("0"..."9").contains(next) {
|
||||
flushCharacterReference()
|
||||
} else {
|
||||
if insertSemicolon {
|
||||
temporaryBuffer!.removeLast()
|
||||
}
|
||||
state = .flushingTemporaryBuffer(.ambiguousAmpersand)
|
||||
}
|
||||
temporaryBuffer = "\(referent)"
|
||||
flushCharacterReference()
|
||||
}
|
||||
} else {
|
||||
state = .flushingTemporaryBuffer(.ambiguousAmpersand)
|
||||
}
|
||||
|
||||
return next()
|
||||
}
|
||||
|
||||
|
|
|
@ -20,9 +20,14 @@ final class TokenizerTests: XCTestCase {
|
|||
XCTAssertEqual(tokenize("&"), [.character("&")])
|
||||
// missing-semicolon-after-character-reference:
|
||||
XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
|
||||
XCTAssertEqual(tokenize("¬in"), [.character("∉")])
|
||||
XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
|
||||
// unknown-named-character-reference:
|
||||
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
|
||||
XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
|
||||
XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
|
||||
|
||||
// attribute special case
|
||||
XCTAssertEqual(tokenize("<a a='&nota' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "&nota")])])
|
||||
}
|
||||
|
||||
func testNumericCharacterReference() {
|
||||
|
|
Loading…
Reference in New Issue