NOTE(review): this patch was re-flowed from a copy whose line breaks were lost and whose
test string literals had their HTML character references decoded. The `&not...` inputs and
the `<a a='&nota' />` markup in the TokenizerTests hunk, and the indentation on
whitespace-only lines, are reconstructions - confirm against the repository before applying.

diff --git a/Sources/HTMLStreamer/Tokenizer.swift b/Sources/HTMLStreamer/Tokenizer.swift
index dfc0e5b..0c02684 100644
--- a/Sources/HTMLStreamer/Tokenizer.swift
+++ b/Sources/HTMLStreamer/Tokenizer.swift
@@ -349,54 +349,73 @@ private extension Tokenizer {
     }
 
+    /// Tokenizes a named character reference, working on `temporaryBuffer`.
+    ///
+    /// Input characters matching `a...z`, `A...Z` or `0...9` are appended to the buffer.
+    /// A `;` is appended and ends the scan; any other character is reconsumed and ends it.
+    /// The buffer minus its first character is then looked up in `namedCharactersDecodeMap`.
+    /// If that fails, each prefix of the name with `;` appended is tried, shortest first,
+    /// and the characters after the first matching prefix are reconsumed.
+    ///
+    /// On a match the buffer is replaced by the referent before `flushCharacterReference()`
+    /// is called, unless the return state is `.attributeValue`, the buffer does not end in
+    /// `;` and the next character is `=` or in one of the ranges above; in that case the
+    /// buffer is flushed unchanged. Without a match the state becomes
+    /// `.flushingTemporaryBuffer(.ambiguousAmpersand)`.
+    ///
+    /// - Returns: The result of the `next()` call made once the above is done.
     mutating func tokenizeNamedCharaterReference() -> Token? {
-        // TODO: this could definitely be faster
-        // maybe with a prefix tree for named characters
-        var everHadMatch = false
-        var outOfChars = false
-        func hasMatch() -> Bool {
+        // consume as many [a-zA-Z0-9] as possible, until semicolon
+        loop: while let c = nextChar() {
+            switch c {
+            case "a"..."z", "A"..."Z", "0"..."9":
+                temporaryBuffer!.append(c)
+            case ";":
+                temporaryBuffer!.append(c)
+                break loop
+            default:
+                reconsume(c)
+                break loop
+            }
+        }
+        
+        var referent = namedCharactersDecodeMap[String(temporaryBuffer!.dropFirst())]
+        if referent == nil {
+            // the whole name is unknown: try each prefix plus ";", shortest first
+            var key = ";"
             let buf = temporaryBuffer!
-            let key = buf[buf.index(after: buf.startIndex)...]
-            return namedCharactersDecodeMap.keys.contains(where: { $0.starts(with: key) })
-        }
-        while hasMatch() {
-            everHadMatch = true
-            guard let char = nextChar() else {
-                outOfChars = true
-                break
+            var index = buf.index(after: buf.startIndex)
+            while index < buf.endIndex {
+                key.replaceSubrange(key.index(before: key.endIndex)..., with: "\(buf[index]);")
+                buf.formIndex(after: &index)
+                referent = namedCharactersDecodeMap[key]
+                if referent != nil {
+                    break
+                }
             }
-            temporaryBuffer!.append(char)
-        }
-        if everHadMatch {
-            if !outOfChars {
-                // the last character changed us from having a match to not
-                reconsume(temporaryBuffer!.removeLast())
+            if referent != nil {
+                // hand the characters after the matched prefix back, last one first
+                for c in buf[index...].reversed() {
+                    reconsume(c)
+                }
+                temporaryBuffer!.removeSubrange(index...)
             }
-            
+        }
+        
+        if let referent {
             if case .attributeValue(_) = returnState,
                temporaryBuffer!.last != ";",
                let peeked = peekChar(),
-               peeked == "=" || (peeked.isASCII && (peeked.isLetter || peeked.isNumber)) {
-                state = .flushingTemporaryBuffer(returnState!)
+               peeked == "=" || ("a"..."z").contains(peeked) || ("A"..."Z").contains(peeked) || ("0"..."9").contains(peeked) {
+                // the buffer is not replaced here, so the reference text stays as written
+                flushCharacterReference()
             } else {
-                let insertSemicolon = temporaryBuffer!.last != ";"
-                if insertSemicolon {
-                    // parse error: missing-semicolon-after-character-reference
-                    // Usually the parser behaves as if character reference is terminated by the U+003B (;) code point; however, there are some ambiguous cases in which the parser includes subsequent code points in the character reference.
-                    temporaryBuffer!.append(";")
-                }
-                if let reference = namedCharactersDecodeMap[String(temporaryBuffer![temporaryBuffer!.index(after: temporaryBuffer!.startIndex)...])] {
-                    temporaryBuffer = "\(reference)"
-                    flushCharacterReference()
-                } else {
-                    if insertSemicolon {
-                        temporaryBuffer!.removeLast()
-                    }
-                    state = .flushingTemporaryBuffer(.ambiguousAmpersand)
-                }
+                temporaryBuffer = "\(referent)"
+                flushCharacterReference()
             }
         } else {
             state = .flushingTemporaryBuffer(.ambiguousAmpersand)
         }
+        
         return next()
     }
 
diff --git a/Tests/HTMLStreamerTests/TokenizerTests.swift b/Tests/HTMLStreamerTests/TokenizerTests.swift
index dd6b6dd..0153735 100644
--- a/Tests/HTMLStreamerTests/TokenizerTests.swift
+++ b/Tests/HTMLStreamerTests/TokenizerTests.swift
@@ -20,9 +20,14 @@ final class TokenizerTests: XCTestCase {
         XCTAssertEqual(tokenize("&"), [.character("&")])
         // missing-semicolon-after-character-reference:
         XCTAssertEqual(tokenize("&not;in"), [.character("¬"), .character("i"), .character("n")])
-        XCTAssertEqual(tokenize("&notin"), [.character("∉")])
+        XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
         // unknown-named-character-reference:
+        XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
         XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
+        XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
+        
+        // attribute special case
+        XCTAssertEqual(tokenize("<a a='&nota' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "&nota")])])
     }
 
     func testNumericCharacterReference() {