diff --git a/Sources/HTMLStreamer/Tokenizer.swift b/Sources/HTMLStreamer/Tokenizer.swift
index 2eeb415..d91fbbf 100644
--- a/Sources/HTMLStreamer/Tokenizer.swift
+++ b/Sources/HTMLStreamer/Tokenizer.swift
@@ -636,37 +636,46 @@ private extension Tokenizer {
     }
 
+    /// Tag name state: appends each consumed character to the name of the current start
+    /// or end tag until whitespace, "/", ">", or the end of input is reached.
+    ///
+    /// ASCII uppercase letters are lowercased and NUL becomes U+FFFD. Characters are
+    /// consumed in a loop rather than by self-recursion, so long names do not deepen the stack.
+    /// - Returns: The result of the next state (or of `takeCurrentToken()` on ">"), or `nil`
+    ///   at end of input.
     mutating func tokenizeTagName() -> Token? {
-        switch nextChar() {
-        case "\t", "\n", "\u{000C}", " ":
-            state = .beforeAttributeName
-            return tokenizeBeforeAttributeName()
-        case "/":
-            state = .selfClosingStartTag
-            return tokenizeSelfClosingStartTag()
-        case ">":
-            state = .data
-            return takeCurrentToken()
-        case nil:
-            // parse error: eof-in-tag
-            state = .endOfFile
-            return nil
-        case .some(var c):
-            if c == "\0" {
-                // parse error: unexpected-null-character
-                c = "\u{FFFD}"
-            } else if ("A"..."Z").contains(c) {
-                c = c.asciiLowercase
-            }
-            if case .startTag(var s, let selfClosing, let attributes) = currentToken {
-                s.append(c)
-                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
-                return tokenizeTagName()
-            } else if case .endTag(var s) = currentToken {
-                s.append(c)
-                currentToken = .endTag(s)
-                return tokenizeTagName()
-            } else {
-                fatalError("bad current token")
+        while true {
+            switch nextChar() {
+            case "\t", "\n", "\u{000C}", " ":
+                state = .beforeAttributeName
+                return tokenizeBeforeAttributeName()
+            case "/":
+                state = .selfClosingStartTag
+                return tokenizeSelfClosingStartTag()
+            case ">":
+                state = .data
+                return takeCurrentToken()
+            case nil:
+                // parse error: eof-in-tag
+                state = .endOfFile
+                return nil
+            case .some(var c):
+                if c == "\0" {
+                    // parse error: unexpected-null-character
+                    c = "\u{FFFD}"
+                } else if ("A"..."Z").contains(c) {
+                    c = c.asciiLowercase
+                }
+                if case .startTag(var s, let selfClosing, let attributes) = currentToken {
+                    s.append(c)
+                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+                    continue
+                } else if case .endTag(var s) = currentToken {
+                    s.append(c)
+                    currentToken = .endTag(s)
+                    continue
+                } else {
+                    fatalError("bad current token")
+                }
             }
         }
     }
@@ -732,32 +741,41 @@ private extension Tokenizer {
     }
 
+    /// Attribute name state: appends each consumed character to the name of the last
+    /// attribute of the current start tag; for an end tag the character is discarded.
+    ///
+    /// ASCII uppercase letters are lowercased and NUL becomes U+FFFD. Whitespace, "/", ">",
+    /// and end of input are reconsumed in the after-attribute-name state; "=" switches to
+    /// the before-attribute-value state.
+    /// - Returns: The result of whichever state the tokenizer hands off to.
     mutating func tokenizeAttributeName() -> Token? {
-        let c = nextChar()
-        switch c {
-        case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
-            reconsume(c)
-            state = .afterAttributeName
-            return tokenizeAfterAttributeName()
-        case "=":
-            state = .beforeAttributeValue
-            return tokenizeBeforeAttributeValue()
-        case .some(var c):
-            if ("A"..."Z").contains(c) {
-                c = c.asciiLowercase
-            }
-            // if null, parse error: unexpected-null-character
-            if c == "\0" {
-                c = "\u{FFFD}"
-            }
-            // if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
-            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                attributes[attributes.count - 1].name.append(c)
-                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
-                return tokenizeAttributeName()
-            } else if case .endTag(_) = currentToken {
-                return tokenizeAttributeName()
-            } else {
-                fatalError("bad curren token")
+        while true {
+            let c = nextChar()
+            switch c {
+            case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
+                reconsume(c)
+                state = .afterAttributeName
+                return tokenizeAfterAttributeName()
+            case "=":
+                state = .beforeAttributeValue
+                return tokenizeBeforeAttributeValue()
+            case .some(var c):
+                if ("A"..."Z").contains(c) {
+                    c = c.asciiLowercase
+                }
+                // if null, parse error: unexpected-null-character
+                if c == "\0" {
+                    c = "\u{FFFD}"
+                }
+                // if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
+                if case .startTag(let s, let selfClosing, var attributes) = currentToken {
+                    attributes[attributes.count - 1].name.append(c)
+                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+                    continue
+                } else if case .endTag(_) = currentToken {
+                    continue
+                } else {
+                    fatalError("bad current token")
+                }
             }
         }
     }
@@ -817,62 +835,79 @@ private extension Tokenizer {
     }
 
+    /// Attribute value state: appends each consumed character to the value of the last
+    /// attribute of the current start tag; for an end tag the character is discarded.
+    ///
+    /// NUL becomes U+FFFD in every quotation mode, and "&" switches to the character
+    /// reference state with this state recorded as the return state.
+    /// - Parameter quotes: How the value is delimited. An unquoted value ends at whitespace
+    ///   or ">"; a quoted value ends at its matching quote character.
+    /// - Returns: The result of the next state (or of `takeCurrentToken()` when ">" ends an
+    ///   unquoted value), or `nil` at end of input.
     mutating func tokenizeAttributeValue(quotes: AttributeValueQuotation) -> Token? {
-        if quotes == .unquoted {
-            switch nextChar() {
-            case "\t", "\n", "\u{000C}", " ":
-                state = .beforeAttributeName
-                return tokenizeBeforeAttributeName()
-            case "&":
-                returnState = .attributeValue(.unquoted)
-                state = .characterReference
-                return tokenizeCharacterReference()
-            case ">":
-                state = .data
-                return takeCurrentToken()
-            case nil:
-                // parse error: eof-in-tag
-                state = .endOfFile
-                return nil
-            case .some(let c):
-                // if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
-                if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                    attributes[attributes.count - 1].value.append(c)
-                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
-                    return tokenizeAttributeValue(quotes: quotes)
-                } else {
-                    fatalError("bad current token")
+        while true {
+            if quotes == .unquoted {
+                switch nextChar() {
+                case "\t", "\n", "\u{000C}", " ":
+                    state = .beforeAttributeName
+                    return tokenizeBeforeAttributeName()
+                case "&":
+                    returnState = .attributeValue(.unquoted)
+                    state = .characterReference
+                    return tokenizeCharacterReference()
+                case ">":
+                    state = .data
+                    return takeCurrentToken()
+                case nil:
+                    // parse error: eof-in-tag
+                    state = .endOfFile
+                    return nil
+                case .some(var c):
+                    if c == "\0" {
+                        // parse error: unexpected-null-character
+                        c = "\u{FFFD}"
+                    }
+                    // if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
+                    if case .startTag(let s, let selfClosing, var attributes) = currentToken {
+                        attributes[attributes.count - 1].value.append(c)
+                        currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+                        continue
+                    } else if case .endTag(_) = currentToken {
+                        continue
+                    } else {
+                        fatalError("bad current token")
+                    }
                 }
-            }
-        } else {
-            let c = nextChar()
-            switch c {
-            case "\"" where quotes == .doubleQuoted:
-                state = .afterAttributeValueQuoted
-                return tokenizeAfterAttributeValueQuoted()
-            case "'" where quotes == .singleQuoted:
-                state = .afterAttributeValueQuoted
-                return tokenizeAfterAttributeValueQuoted()
-            case "&":
-                returnState = .attributeValue(quotes)
-                state = .characterReference
-                return tokenizeCharacterReference()
-            case nil:
-                // parse error: eof-in-tag
-                state = .endOfFile
-                return nil
-            case .some(var c):
-                if c == "\0" {
-                    // parse error: unexpected-null-character
-                    c = "\u{FFFD}"
-                }
-                if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                    attributes[attributes.count - 1].value.append(c)
-                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
-                    return tokenizeAttributeValue(quotes: quotes)
-                } else if case .endTag(_) = currentToken {
-                    return tokenizeAttributeValue(quotes: quotes)
-                } else {
-                    fatalError("bad current token")
+            } else {
+                let c = nextChar()
+                switch c {
+                case "\"" where quotes == .doubleQuoted:
+                    state = .afterAttributeValueQuoted
+                    return tokenizeAfterAttributeValueQuoted()
+                case "'" where quotes == .singleQuoted:
+                    state = .afterAttributeValueQuoted
+                    return tokenizeAfterAttributeValueQuoted()
+                case "&":
+                    returnState = .attributeValue(quotes)
+                    state = .characterReference
+                    return tokenizeCharacterReference()
+                case nil:
+                    // parse error: eof-in-tag
+                    state = .endOfFile
+                    return nil
+                case .some(var c):
+                    if c == "\0" {
+                        // parse error: unexpected-null-character
+                        c = "\u{FFFD}"
+                    }
+                    if case .startTag(let s, let selfClosing, var attributes) = currentToken {
+                        attributes[attributes.count - 1].value.append(c)
+                        currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+                        continue
+                    } else if case .endTag(_) = currentToken {
+                        continue
+                    } else {
+                        fatalError("bad current token")
+                    }
                 }
             }
         }