Faster tokenizing for named character references
This commit is contained in:
parent
e22f778f8f
commit
134803b72d
|
@ -349,54 +349,56 @@ private extension Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
mutating func tokenizeNamedCharaterReference() -> Token? {
|
mutating func tokenizeNamedCharaterReference() -> Token? {
|
||||||
// TODO: this could definitely be faster
|
// consume as many [a-zA-Z0-9] as possible, until semicolon
|
||||||
// maybe with a prefix tree for named characters
|
loop: while let c = nextChar() {
|
||||||
var everHadMatch = false
|
switch c {
|
||||||
var outOfChars = false
|
case "a"..."z", "A"..."Z", "0"..."9":
|
||||||
func hasMatch() -> Bool {
|
temporaryBuffer!.append(c)
|
||||||
let buf = temporaryBuffer!
|
case ";":
|
||||||
let key = buf[buf.index(after: buf.startIndex)...]
|
temporaryBuffer!.append(c)
|
||||||
return namedCharactersDecodeMap.keys.contains(where: { $0.starts(with: key) })
|
break loop
|
||||||
|
default:
|
||||||
|
reconsume(c)
|
||||||
|
break loop
|
||||||
}
|
}
|
||||||
while hasMatch() {
|
|
||||||
everHadMatch = true
|
|
||||||
guard let char = nextChar() else {
|
|
||||||
outOfChars = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
temporaryBuffer!.append(char)
|
|
||||||
}
|
|
||||||
if everHadMatch {
|
|
||||||
if !outOfChars {
|
|
||||||
// the last character changed us from having a match to not
|
|
||||||
reconsume(temporaryBuffer!.removeLast())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var referent = namedCharactersDecodeMap[String(temporaryBuffer!.dropFirst())]
|
||||||
|
if referent == nil {
|
||||||
|
// start from the beginning and try to find a reference
|
||||||
|
var key = ";"
|
||||||
|
let buf = temporaryBuffer!
|
||||||
|
var index = buf.index(after: buf.startIndex)
|
||||||
|
while index < buf.endIndex {
|
||||||
|
key.replaceSubrange(key.index(before: key.endIndex)..., with: "\(buf[index]);")
|
||||||
|
buf.formIndex(after: &index)
|
||||||
|
referent = namedCharactersDecodeMap[key]
|
||||||
|
if referent != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if referent != nil {
|
||||||
|
for c in buf[index...].reversed() {
|
||||||
|
reconsume(c)
|
||||||
|
}
|
||||||
|
temporaryBuffer!.removeSubrange(index...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let referent {
|
||||||
if case .attributeValue(_) = returnState,
|
if case .attributeValue(_) = returnState,
|
||||||
temporaryBuffer!.last != ";",
|
temporaryBuffer!.last != ";",
|
||||||
let peeked = peekChar(),
|
let next = peekChar(),
|
||||||
peeked == "=" || (peeked.isASCII && (peeked.isLetter || peeked.isNumber)) {
|
next == "=" || ("a"..."z").contains(next) || ("A"..."Z").contains(next) || ("0"..."9").contains(next) {
|
||||||
state = .flushingTemporaryBuffer(returnState!)
|
|
||||||
} else {
|
|
||||||
let insertSemicolon = temporaryBuffer!.last != ";"
|
|
||||||
if insertSemicolon {
|
|
||||||
// parse error: missing-semicolon-after-character-reference
|
|
||||||
// Usually the parser behaves as if character reference is terminated by the U+003B (;) code point; however, there are some ambiguous cases in which the parser includes subsequent code points in the character reference.
|
|
||||||
temporaryBuffer!.append(";")
|
|
||||||
}
|
|
||||||
if let reference = namedCharactersDecodeMap[String(temporaryBuffer![temporaryBuffer!.index(after: temporaryBuffer!.startIndex)...])] {
|
|
||||||
temporaryBuffer = "\(reference)"
|
|
||||||
flushCharacterReference()
|
flushCharacterReference()
|
||||||
} else {
|
} else {
|
||||||
if insertSemicolon {
|
temporaryBuffer = "\(referent)"
|
||||||
temporaryBuffer!.removeLast()
|
flushCharacterReference()
|
||||||
}
|
|
||||||
state = .flushingTemporaryBuffer(.ambiguousAmpersand)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
state = .flushingTemporaryBuffer(.ambiguousAmpersand)
|
state = .flushingTemporaryBuffer(.ambiguousAmpersand)
|
||||||
}
|
}
|
||||||
|
|
||||||
return next()
|
return next()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,9 +20,14 @@ final class TokenizerTests: XCTestCase {
|
||||||
XCTAssertEqual(tokenize("&"), [.character("&")])
|
XCTAssertEqual(tokenize("&"), [.character("&")])
|
||||||
// missing-semicolon-after-character-reference:
|
// missing-semicolon-after-character-reference:
|
||||||
XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
|
XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
|
||||||
XCTAssertEqual(tokenize("¬in"), [.character("∉")])
|
XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
|
||||||
// unknown-named-character-reference:
|
// unknown-named-character-reference:
|
||||||
|
XCTAssertEqual(tokenize("¬it;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
|
||||||
XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
|
XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
|
||||||
|
XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
|
||||||
|
|
||||||
|
// attribute special case
|
||||||
|
XCTAssertEqual(tokenize("<a a='¬a' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "¬a")])])
|
||||||
}
|
}
|
||||||
|
|
||||||
func testNumericCharacterReference() {
|
func testNumericCharacterReference() {
|
||||||
|
|
Loading…
Reference in New Issue