diff --git a/Sources/HTMLStreamer/Tokenizer.swift b/Sources/HTMLStreamer/Tokenizer.swift
index dfc0e5b..0c02684 100644
--- a/Sources/HTMLStreamer/Tokenizer.swift
+++ b/Sources/HTMLStreamer/Tokenizer.swift
@@ -349,54 +349,56 @@ private extension Tokenizer {
}
mutating func tokenizeNamedCharaterReference() -> Token? {
- // TODO: this could definitely be faster
- // maybe with a prefix tree for named characters
- var everHadMatch = false
- var outOfChars = false
- func hasMatch() -> Bool {
+ // Buffer ASCII letters and digits greedily; a ";" is buffered too and ends the scan, any other character is reconsumed.
+ loop: while let c = nextChar() {
+ switch c {
+ case "a"..."z", "A"..."Z", "0"..."9":
+ temporaryBuffer!.append(c)
+ case ";":
+ temporaryBuffer!.append(c)
+ break loop
+ default:
+ reconsume(c)
+ break loop
+ }
+ }
+
+ var referent = namedCharactersDecodeMap[String(temporaryBuffer!.dropFirst())] // exact lookup of everything after the first buffered character (presumably the "&" — confirm against caller)
+ if referent == nil {
+ // No exact match: try each prefix of the buffered name, shortest first, with ";" appended, and stop at the first hit.
+ var key = ";" // always ends in ";"; each iteration inserts the next buffered character just before it
let buf = temporaryBuffer!
- let key = buf[buf.index(after: buf.startIndex)...]
- return namedCharactersDecodeMap.keys.contains(where: { $0.starts(with: key) })
- }
- while hasMatch() {
- everHadMatch = true
- guard let char = nextChar() else {
- outOfChars = true
- break
+ var index = buf.index(after: buf.startIndex)
+ while index < buf.endIndex { // NOTE(review): the first hit is the shortest matching prefix; the HTML spec describes consuming the maximum number of characters — confirm this is intended
+ key.replaceSubrange(key.index(before: key.endIndex)..., with: "\(buf[index]);")
+ buf.formIndex(after: &index) // advance before the lookup so that, on break, index points just past the matched prefix
+ referent = namedCharactersDecodeMap[key]
+ if referent != nil {
+ break
+ }
}
- temporaryBuffer!.append(char)
- }
- if everHadMatch {
- if !outOfChars {
- // the last character changed us from having a match to not
- reconsume(temporaryBuffer!.removeLast())
+ if referent != nil {
+ for c in buf[index...].reversed() { // hand back the unmatched tail, last character first
+ reconsume(c)
+ }
+ temporaryBuffer!.removeSubrange(index...) // keep only the first character plus the matched name
}
-
+ }
+
+ if let referent {
if case .attributeValue(_) = returnState,
temporaryBuffer!.last != ";",
- let peeked = peekChar(),
- peeked == "=" || (peeked.isASCII && (peeked.isLetter || peeked.isNumber)) {
- state = .flushingTemporaryBuffer(returnState!)
+ let next = peekChar(),
+ next == "=" || ("a"..."z").contains(next) || ("A"..."Z").contains(next) || ("0"..."9").contains(next) {
+ flushCharacterReference() // the buffer is flushed as the raw text here; it is not replaced by the referent
} else {
- let insertSemicolon = temporaryBuffer!.last != ";"
- if insertSemicolon {
- // parse error: missing-semicolon-after-character-reference
- // Usually the parser behaves as if character reference is terminated by the U+003B (;) code point; however, there are some ambiguous cases in which the parser includes subsequent code points in the character reference.
- temporaryBuffer!.append(";")
- }
- if let reference = namedCharactersDecodeMap[String(temporaryBuffer![temporaryBuffer!.index(after: temporaryBuffer!.startIndex)...])] {
- temporaryBuffer = "\(reference)"
- flushCharacterReference()
- } else {
- if insertSemicolon {
- temporaryBuffer!.removeLast()
- }
- state = .flushingTemporaryBuffer(.ambiguousAmpersand)
- }
+ temporaryBuffer = "\(referent)" // substitute the decoded value for the raw reference text before flushing
+ flushCharacterReference()
}
} else {
state = .flushingTemporaryBuffer(.ambiguousAmpersand)
}
+
return next()
}
diff --git a/Tests/HTMLStreamerTests/TokenizerTests.swift b/Tests/HTMLStreamerTests/TokenizerTests.swift
index dd6b6dd..0153735 100644
--- a/Tests/HTMLStreamerTests/TokenizerTests.swift
+++ b/Tests/HTMLStreamerTests/TokenizerTests.swift
@@ -20,9 +20,14 @@ final class TokenizerTests: XCTestCase {
XCTAssertEqual(tokenize("&"), [.character("&")])
// missing-semicolon-after-character-reference:
XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
- XCTAssertEqual(tokenize("¬in"), [.character("∉")])
+ XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
// unknown-named-character-reference:
+ XCTAssertEqual(tokenize("¬it;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
+ XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
+
+ // attribute special case
+ XCTAssertEqual(tokenize(""), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "¬a")])])
}
func testNumericCharacterReference() {