HTMLStreamer/Sources/HTMLStreamer/Tokenizer.swift

1693 lines
63 KiB
Swift

//
// Tokenizer.swift
// HTMLStreamer
//
// Created by Shadowfacts on 11/22/23.
//
import Foundation
/// A streaming HTML tokenizer.
///
/// Implements (a subset of) the WHATWG HTML tokenization state machine: each call to
/// `next()` consumes characters from `chars` until a token can be emitted.
/// RCDATA, RAWTEXT, script data, plaintext, and CDATA states are not supported
/// (see `State` below).
struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
    typealias Element = Token
    /// The underlying character source.
    private var chars: Chars
    /// Characters pushed back to be re-read. LIFO: `nextChar()` pops from the end,
    /// so the most recently reconsumed character is read first.
    private var reconsumeStack: InlineArray3<Character> = []
    /// Current state of the tokenization state machine.
    private var state = State.data
    /// The state to resume once a character reference has been handled.
    /// Set before entering `.characterReference`.
    private var returnState: State?
    /// Pending text accumulated while parsing a character reference
    /// (begins with the consumed "&").
    private var temporaryBuffer: String?
    /// Accumulator for the code point of a numeric character reference.
    private var characterReferenceCode: UInt32?
    /// The token currently being built (tag, comment, or doctype), if any.
    private var currentToken: Token?
    init(chars: Chars) {
        self.chars = chars
    }
    /// Returns the next token, or `nil` at end of input.
    ///
    /// The first three cases are internal bookkeeping states; all others
    /// dispatch to a per-state method defined in the extension below.
    mutating func next() -> Token? {
        switch state {
        case .flushingTemporaryBuffer(let returnState):
            // Emit the buffered text one character at a time, then resume.
            if temporaryBuffer == nil || temporaryBuffer!.isEmpty {
                state = returnState
                return next()
            } else {
                return .character(temporaryBuffer!.removeFirst())
            }
        case .endOfFile:
            return nil
        case .emitTokens(var tokens, let nextState):
            // Drain the queued tokens before moving to nextState.
            if tokens.isEmpty {
                state = nextState
                return next()
            } else {
                let tok = tokens.removeFirst()
                state = .emitTokens(tokens, nextState)
                return tok
            }
        case .data:
            return tokenizeData()
        case .characterReference:
            return tokenizeCharacterReference()
        case .namedCharacterReference:
            return tokenizeNamedCharaterReference()
        case .numericCharacterReference:
            return tokenizeNumericCharacterReference()
        case .numericCharacterReferenceEnd:
            return tokenizeNumericCharacterReferenceEnd()
        case .hexadecimalCharacterReferenceStart:
            return tokenizeHexadecimalCharacterReferenceStart()
        case .hexadecimalCharacterReference:
            return tokenizeHexadecimalCharacterReference()
        case .decimalCharacterReferenceStart:
            return tokenizeDecimalCharacterReferenceStart()
        case .decimalCharacterReference:
            return tokenizeDecimalCharacterReference()
        case .ambiguousAmpersand:
            return tokenizeAmbiguousAmpersand()
        case .tagOpen:
            return tokenizeTagOpen()
        case .endTagOpen:
            return tokenizeEndTagOpen()
        case .tagName:
            return tokenizeTagName()
        case .selfClosingStartTag:
            return tokenizeSelfClosingStartTag()
        case .beforeAttributeName:
            return tokenizeBeforeAttributeName()
        case .attributeName:
            return tokenizeAttributeName()
        case .afterAttributeName:
            return tokenizeAfterAttributeName()
        case .beforeAttributeValue:
            return tokenizeBeforeAttributeValue()
        case .attributeValue(let quotes):
            return tokenizeAttributeValue(quotes: quotes)
        case .afterAttributeValueQuoted:
            return tokenizeAfterAttributeValueQuoted()
        case .bogusComment:
            return tokenizeBogusComment()
        case .markupDeclarationOpen:
            return tokenizeMarkupDeclarationOpen()
        case .commentStart:
            return tokenizeCommentStart()
        case .commentStartDash:
            return tokenizeCommentStartDash()
        case .comment:
            return tokenizeComment()
        case .commentLessThanSign:
            return tokenizeCommentLessThanSign()
        case .commentLessThanSignBang:
            return tokenizeCommentLessThanSignBang()
        case .commentLessThanSignBangDash:
            return tokenizeCommentLessThanSignBangDash()
        case .commentLessThanSignBangDashDash:
            return tokenizeCommentLessThanSignBangDashDash()
        case .commentEndDash:
            return tokenizeCommentEndDash()
        case .commentEnd:
            return tokenizeCommentEnd()
        case .commentEndBang:
            return tokenizeCommentEndBang()
        case .doctype:
            return tokenizeDoctype()
        case .beforeDoctypeName:
            return tokenizeBeforeDoctypeName()
        case .doctypeName:
            return tokenizeDoctypeName()
        case .afterDoctypeName:
            return tokenizeAfterDoctypeName()
        case .afterDoctypePublicKeyword:
            return tokenizeAfterDoctypePublicKeyword()
        case .beforeDoctypePublicIdentifier:
            return tokenizeBeforeDoctypePublicIdentifier()
        case .doctypePublicIdentifier(let quotes):
            return tokenizeDoctypePublicIdentifier(quotes: quotes)
        case .afterDoctypePublicIdentifier:
            return tokenizeAfterDoctypePublicIdentifier()
        case .betweenDoctypePublicAndSystemIdentifiers:
            return tokenizeBetweenDoctypePublicAndSystemIdentifiers()
        case .afterDoctypeSystemKeyword:
            return tokenizeAfterDoctypeSystemKeyword()
        case .beforeDoctypeSystemIdentifier:
            return tokenizeBeforeDoctypeSystemIdentifier()
        case .doctypeSystemIdentifier(let quotes):
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        case .afterDoctypeSystemIdentifier:
            return tokenizeAfterDoctypeSystemIdentifier()
        case .bogusDoctype:
            return tokenizeBogusDoctype()
        }
    }
    /// Pushes `c` back so it is the next character read; no-op for nil.
    private mutating func reconsume(_ c: Character?) {
        if let c {
            reconsumeStack.append(c)
        }
    }
    /// Reads the next character, preferring reconsumed characters over the source.
    private mutating func nextChar() -> Character? {
        if !reconsumeStack.isEmpty {
            return reconsumeStack.removeLast()
        } else {
            return chars.next()
        }
    }
    /// Returns the next character without consuming it (pulls one character
    /// from the source and pushes it onto the reconsume stack if needed).
    private mutating func peekChar() -> Character? {
        if let nextToReconsume = reconsumeStack.last {
            return nextToReconsume
        } else {
            let c = chars.next()
            if let c {
                reconsume(c)
            }
            return c
        }
    }
    // TODO: extract this all out into a standalone type and test it separately
    /// Returns up to `count` upcoming characters (fewer near end of input)
    /// without consuming them. Characters are re-pushed in reverse so the
    /// LIFO reconsume stack yields them in original order.
    private mutating func peek(count: Int) -> String {
        precondition(count >= 0)
        var buf = ""
        for _ in 0..<count {
            if let c = nextChar() {
                buf.append(c)
            } else {
                break
            }
        }
        reconsumeStack.append(contentsOf: buf.reversed())
        return buf
    }
    /// Discards `count` characters.
    private mutating func consume(count: Int) {
        precondition(count >= 0)
        for _ in 0..<count {
            _ = nextChar()
        }
    }
    /// Returns and clears `currentToken`; traps if no token is being built.
    private mutating func takeCurrentToken() -> Token {
        defer { currentToken = nil }
        return currentToken!
    }
}
/// A single token produced by the tokenizer.
enum Token: Equatable {
    /// A literal character of text content.
    case character(Character)
    /// The text of a comment (without the `<!--`/`-->` delimiters).
    case comment(String)
    /// A start tag with its (already-lowercased) name and parsed attributes.
    case startTag(String, selfClosing: Bool, attributes: InlineArray3<Attribute>)
    /// An end tag with its (already-lowercased) name.
    case endTag(String)
    /// A `<!doctype>` declaration; `forceQuirks` is set when it is malformed.
    case doctype(String, forceQuirks: Bool, publicIdentifier: String?, systemIdentifier: String?)
}
/// A single name/value pair parsed from a start tag.
struct Attribute: Equatable {
    var name: String
    var value: String
}
/// Tokenizer states. Most correspond one-to-one with states defined by the
/// WHATWG HTML tokenization algorithm; the first three are internal helpers.
private enum State {
    // Internal states used by the tokenizer
    /// Emit the temporary buffer as character tokens, then resume the wrapped state.
    indirect case flushingTemporaryBuffer(State)
    /// Terminal state: `next()` returns nil forever.
    case endOfFile
    /// Emit the queued tokens one at a time, then move to the wrapped state.
    indirect case emitTokens([Token], State)
    // States defined by the spec
    case data
    // RCDATA not currently supported
    // case rcdata
    // RAWTEXT not currently supported
    // case rawtext
    // script tag not currently supported
    // case scriptData
    // plaintext tag not currently supported
    // case plaintext
    case tagOpen
    case endTagOpen
    case tagName
    // RCDATA not currently supported
    // case rcdataLessThanSign
    // case rcdataEndTagOpen
    // case rcdataEndTagName
    // RAWTEXT not currently supported
    // case rawtextLessThanSign
    // case rawtextEndTagOpen
    // case rawtextEndTagName
    // script not currently supported
    // case scriptDataLessThanSign
    // case scriptDataEndTagOpen
    // case scriptDataEndTagName
    // case scriptDataEscapeStart
    // case scriptDataEscapeStartDash
    // case scriptDataEscaped
    // case scriptDataEscapedDash
    // case scriptDataEscapedDashDash
    // case scriptDataEscapedLessThanSign
    // case scriptDataEscapedEndTagOpen
    // case scriptDataEscapedEndTagName
    // case scriptDataDoubleEscapeStart
    // case scriptDataDoubleEscaped
    // case scriptDataDoubleEscapedDash
    // case scriptDataDoubleEscapedDashDash
    // case scriptDataDoubleEscapedLessThanSign
    // case scriptDataDoubleEscapeEnd
    case beforeAttributeName
    case attributeName
    case afterAttributeName
    case beforeAttributeValue
    case attributeValue(AttributeValueQuotation)
    case afterAttributeValueQuoted
    case selfClosingStartTag
    case bogusComment
    case markupDeclarationOpen
    case commentStart
    case commentStartDash
    case comment
    case commentLessThanSign
    case commentLessThanSignBang
    case commentLessThanSignBangDash
    case commentLessThanSignBangDashDash
    case commentEndDash
    case commentEnd
    case commentEndBang
    case doctype
    case beforeDoctypeName
    case doctypeName
    case afterDoctypeName
    case afterDoctypePublicKeyword
    case beforeDoctypePublicIdentifier
    case doctypePublicIdentifier(DoctypeIdentifierQuotation)
    case afterDoctypePublicIdentifier
    case betweenDoctypePublicAndSystemIdentifiers
    case afterDoctypeSystemKeyword
    case beforeDoctypeSystemIdentifier
    case doctypeSystemIdentifier(DoctypeIdentifierQuotation)
    case afterDoctypeSystemIdentifier
    case bogusDoctype
    // CDATA not currently supported
    // case cdataSection
    // case cdataSectionBracket
    // case cdataSectionEndState
    case characterReference
    case namedCharacterReference
    case ambiguousAmpersand
    case numericCharacterReference
    case hexadecimalCharacterReferenceStart
    case decimalCharacterReferenceStart
    case hexadecimalCharacterReference
    case decimalCharacterReference
    case numericCharacterReferenceEnd
}
/// How the attribute value currently being parsed is quoted.
private enum AttributeValueQuotation {
    case singleQuoted, doubleQuoted, unquoted
}
/// How the doctype public/system identifier being parsed is quoted.
private enum DoctypeIdentifierQuotation {
    case singleQuoted, doubleQuoted
}
private extension Tokenizer {
/// https://html.spec.whatwg.org/multipage/parsing.html#data-state
mutating func tokenizeData() -> Token? {
    switch nextChar() {
    case "&":
        returnState = .data
        state = .characterReference
        return tokenizeCharacterReference()
    case "<":
        state = .tagOpen
        return tokenizeTagOpen()
    case "\0":
        // parse error: unexpected-null-character (emitted as-is in the data state)
        return .character("\0")
    case nil:
        return nil // end of file
    case .some(let c):
        return .character(c)
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
///
/// The temporary buffer accumulates the raw text of the reference (starting
/// with the "&") so it can be re-emitted verbatim if the reference is invalid.
mutating func tokenizeCharacterReference() -> Token? {
    temporaryBuffer = "&"
    guard let c = nextChar() else {
        // EOF: flush the lone "&" and resume the return state.
        reconsume(nil)
        state = .flushingTemporaryBuffer(returnState!)
        return next()
    }
    switch c {
    case "a"..."z", "A"..."Z", "0"..."9":
        reconsume(c)
        state = .namedCharacterReference
        return tokenizeNamedCharaterReference()
    case "#":
        temporaryBuffer!.append("#")
        state = .numericCharacterReference
        return tokenizeNumericCharacterReference()
    default:
        // Not a reference; flushing is handled lazily by the return state.
        reconsume(c)
        state = returnState!
        return next()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
///
/// Greedily consumes characters while the text after the "&" remains a prefix
/// of some named reference, backs off one character once matching fails, then
/// resolves the accumulated name via `namedCharactersDecodeMap`.
/// (NOTE(review): the "Charater" typo in the name is preserved — it is
/// referenced from `next()`'s dispatch.)
mutating func tokenizeNamedCharaterReference() -> Token? {
    // TODO: this could definitely be faster
    // maybe with a prefix tree for named characters
    var everHadMatch = false
    var outOfChars = false
    // True while the buffer (minus the leading "&") is a prefix of at least
    // one known named reference.
    func hasMatch() -> Bool {
        let buf = temporaryBuffer!
        let key = buf[buf.index(after: buf.startIndex)...]
        return namedCharactersDecodeMap.keys.contains(where: { $0.starts(with: key) })
    }
    while hasMatch() {
        everHadMatch = true
        guard let char = nextChar() else {
            outOfChars = true
            break
        }
        temporaryBuffer!.append(char)
    }
    if everHadMatch {
        if !outOfChars {
            // the last character changed us from having a match to not
            reconsume(temporaryBuffer!.removeLast())
        }
        // Historical quirk: inside an attribute value, a reference not ending
        // in ";" followed by "=" or an alphanumeric is treated as literal text.
        if case .attributeValue(_) = returnState,
           temporaryBuffer!.last != ";",
           let peeked = peekChar(),
           peeked == "=" || (peeked.isASCII && (peeked.isLetter || peeked.isNumber)) {
            state = .flushingTemporaryBuffer(returnState!)
        } else {
            let insertSemicolon = temporaryBuffer!.last != ";"
            if insertSemicolon {
                // parse error: missing-semicolon-after-character-reference
                // Usually the parser behaves as if character reference is terminated by the U+003B (;) code point; however, there are some ambiguous cases in which the parser includes subsequent code points in the character reference.
                temporaryBuffer!.append(";")
            }
            if let reference = namedCharactersDecodeMap[String(temporaryBuffer![temporaryBuffer!.index(after: temporaryBuffer!.startIndex)...])] {
                temporaryBuffer = "\(reference)"
                flushCharacterReference()
            } else {
                if insertSemicolon {
                    temporaryBuffer!.removeLast()
                }
                state = .flushingTemporaryBuffer(.ambiguousAmpersand)
            }
        }
    } else {
        state = .flushingTemporaryBuffer(.ambiguousAmpersand)
    }
    return next()
}
/// Flushes the decoded character reference in `temporaryBuffer`:
/// appended to the current attribute's value when the reference occurred
/// inside an attribute value, otherwise emitted as character tokens.
mutating func flushCharacterReference() {
    if case .attributeValue(_) = returnState {
        if case .startTag(let s, let selfClosing, var attributes) = currentToken {
            attributes[attributes.count - 1].value.append(temporaryBuffer!)
            currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
            temporaryBuffer = nil
            state = returnState!
        } else {
            fatalError("bad current tag")
        }
    } else {
        state = .flushingTemporaryBuffer(returnState!)
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
///
/// Resets the code-point accumulator and dispatches to the hexadecimal or
/// decimal *start* state. The start states verify that at least one digit
/// follows, per the spec; jumping straight to the digit-consuming states (as
/// this previously did) mis-handled digit-less input like "&#x" or "&#z",
/// emitting U+FFFD instead of flushing the literal text.
mutating func tokenizeNumericCharacterReference() -> Token? {
    characterReferenceCode = 0
    switch nextChar() {
    case .some(let c) where c == "x" || c == "X":
        // Append the actual input character (not always "x") so that flushing
        // the temporary buffer on error reproduces the source text exactly.
        temporaryBuffer!.append(c)
        state = .hexadecimalCharacterReferenceStart
        return tokenizeHexadecimalCharacterReferenceStart()
    case let c:
        reconsume(c)
        state = .decimalCharacterReferenceStart
        return tokenizeDecimalCharacterReferenceStart()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
///
/// Validates the accumulated code point, substituting U+FFFD or the Windows-1252
/// remapping table for invalid/control values, then flushes the resulting character.
mutating func tokenizeNumericCharacterReferenceEnd() -> Token? {
    switch characterReferenceCode! {
    case 0:
        // parse error: null-character-reference
        characterReferenceCode = 0xFFFD
    case let c where c > 0x10FFFF:
        // parse error: character-reference-outside-unicode-range
        characterReferenceCode = 0xFFFD
    case 0xD800...0xDBFF, 0xDC00...0xDFFF: // leading and trailing surrogate ranges
        // parse error: surrogate-character-reference
        characterReferenceCode = 0xFFFD
    case let c where Unicode.Scalar(c) == nil:
        // parse error: noncharacter-character-reference
        // "The parser resolves such character references as-is."
        // NOTE(review): Unicode.Scalar(UInt32) only fails for surrogates and
        // values > 0x10FFFF, both handled above, so this branch looks
        // unreachable; spec "noncharacters" (e.g. U+FDD0) are valid scalars
        // and fall through to `default` — confirm intended.
        characterReferenceCode = nil
        state = returnState!
        return next()
    case 0x0D, 0...0x1F /* C0 control */, 0x7F...0x9F:
        // parse error: control-character-reference
        // 0x80-0x9F are remapped per the spec's Windows-1252 table.
        characterReferenceCode = switch characterReferenceCode! {
        case 0x80: 0x20AC
        case 0x82: 0x201A
        case 0x83: 0x0192
        case 0x84: 0x201E
        case 0x85: 0x2026
        case 0x86: 0x2020
        case 0x87: 0x2021
        case 0x88: 0x02C6
        case 0x89: 0x2030
        case 0x8A: 0x0160
        case 0x8B: 0x2039
        case 0x8C: 0x0152
        case 0x8E: 0x017D
        case 0x91: 0x2018
        case 0x92: 0x2019
        case 0x93: 0x201C
        case 0x94: 0x201D
        case 0x95: 0x2022
        case 0x96: 0x2013
        case 0x97: 0x2014
        case 0x98: 0x02DC
        case 0x99: 0x2122
        case 0x9A: 0x0161
        case 0x9B: 0x203A
        case 0x9C: 0x0153
        case 0x9E: 0x017E
        case 0x9F: 0x0178
        case let c: c
        }
    default:
        break
    }
    temporaryBuffer = ""
    if let c = Unicode.Scalar(characterReferenceCode!) {
        temporaryBuffer!.append(Character(c))
    }
    flushCharacterReference()
    return next()
}
/// https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
///
/// Requires at least one ASCII hex digit; otherwise the buffered text
/// ("&#x" / "&#X") is flushed back out as literal characters.
mutating func tokenizeHexadecimalCharacterReferenceStart() -> Token? {
    let ch = nextChar()
    let isHexDigit = ch.map {
        ("0"..."9").contains($0) || ("a"..."f").contains($0) || ("A"..."F").contains($0)
    } ?? false
    if isHexDigit {
        reconsume(ch)
        state = .hexadecimalCharacterReference
        return tokenizeHexadecimalCharacterReference()
    }
    // parse error: absence-of-digits-in-numeric-character-reference
    reconsume(ch)
    state = .flushingTemporaryBuffer(returnState!)
    return next()
}
/// https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
///
/// Accumulates ASCII hex digits into `characterReferenceCode`.
mutating func tokenizeHexadecimalCharacterReference() -> Token? {
    let c = nextChar()
    switch c {
    case .some("0"..."9"), .some("a"..."f"), .some("A"..."F"):
        characterReferenceCode = (characterReferenceCode! * 16) + UInt32(c!.hexDigitValue!)
        return tokenizeHexadecimalCharacterReference()
    case ";":
        state = .numericCharacterReferenceEnd
        return tokenizeNumericCharacterReferenceEnd()
    case let c:
        // parse error: missing-semicolon-after-character-reference
        reconsume(c)
        state = .numericCharacterReferenceEnd
        return tokenizeNumericCharacterReferenceEnd()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
///
/// Requires at least one ASCII digit; otherwise the buffered text ("&#") is
/// flushed back out as literal characters before resuming the return state.
mutating func tokenizeDecimalCharacterReferenceStart() -> Token? {
    let c = nextChar()
    if let c,
       c.isASCII && c.isNumber {
        reconsume(c)
        state = .decimalCharacterReference
        return tokenizeDecimalCharacterReference()
    } else {
        // parse error: absence-of-digits-in-numeric-character-reference
        // Flush the temporary buffer instead of returning directly to the
        // return state, mirroring tokenizeHexadecimalCharacterReferenceStart;
        // previously the buffered "&#" was silently dropped from the output.
        reconsume(c)
        state = .flushingTemporaryBuffer(returnState!)
        return next()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
///
/// Accumulates ASCII decimal digits into `characterReferenceCode`.
mutating func tokenizeDecimalCharacterReference() -> Token? {
    let c = nextChar()
    switch c {
    case .some("0"..."9"):
        characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.wholeNumberValue!)
        return tokenizeDecimalCharacterReference()
    case ";":
        state = .numericCharacterReferenceEnd
        return tokenizeNumericCharacterReferenceEnd()
    default:
        // if nil, parse error: missing-semicolon-after-character-reference
        reconsume(c)
        state = .numericCharacterReferenceEnd
        return tokenizeNumericCharacterReferenceEnd()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
///
/// Reached after an "&" that did not resolve to a reference; alphanumerics are
/// passed through (into the attribute value when inside one).
mutating func tokenizeAmbiguousAmpersand() -> Token? {
    let c = nextChar()
    switch c {
    case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
        if case .attributeValue(_) = returnState {
            // Append the current input character to the current attribute's value.
            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
                attributes[attributes.count - 1].value.append(c!)
                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
            } else {
                fatalError("bad current token")
            }
            return next()
        } else {
            return .character(c!)
        }
    default:
        // if c == ";", parse error: unknown-named-character-reference
        reconsume(c)
        state = returnState!
        return next()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
mutating func tokenizeTagOpen() -> Token? {
    let c = nextChar()
    switch c {
    case "!":
        state = .markupDeclarationOpen
        return tokenizeMarkupDeclarationOpen()
    case "/":
        state = .endTagOpen
        return tokenizeEndTagOpen()
    case "?":
        // parse error: unexpected-question-mark-instead-of-tag-name
        currentToken = .comment("")
        state = .bogusComment
        return tokenizeBogusComment()
    case nil:
        // parser error: eof-before-tag-name
        // Emit the stray "<" as text, then end.
        state = .endOfFile
        return .character("<")
    case .some("a"..."z"), .some("A"..."Z"):
        currentToken = .startTag("", selfClosing: false, attributes: [])
        reconsume(c)
        state = .tagName
        return tokenizeTagName()
    case .some(_):
        // parse error: invalid-first-character-of-tag-name
        // Emit "<" as text and reprocess the character in the data state.
        reconsume(c)
        state = .data
        return .character("<")
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
mutating func tokenizeEndTagOpen() -> Token? {
    let c = nextChar()
    switch c {
    case .some("a"..."z"), .some("A"..."Z"):
        currentToken = .endTag("")
        reconsume(c)
        state = .tagName
        return tokenizeTagName()
    case ">":
        // parse error: missing-end-tag-name ("</>" is ignored entirely)
        state = .data
        return tokenizeData()
    case nil:
        // parse error: eof-before-tag-name
        // Emit "<" now and queue "/" before ending.
        state = .emitTokens([.character("/")], .endOfFile)
        return .character("<")
    case .some(let c):
        // parse error: invalid-first-character-of-tag-name
        currentToken = .comment("")
        reconsume(c)
        state = .bogusComment
        return tokenizeBogusComment()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
///
/// Accumulates the tag name (ASCII-lowercased, nulls replaced with U+FFFD)
/// into the current start or end tag token.
mutating func tokenizeTagName() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        state = .beforeAttributeName
        return tokenizeBeforeAttributeName()
    case "/":
        state = .selfClosingStartTag
        return tokenizeSelfClosingStartTag()
    case ">":
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-tag
        state = .endOfFile
        return nil
    case .some(var c):
        if c == "\0" {
            // parse error: unexpected-null-character
            c = "\u{FFFD}"
        } else if ("A"..."Z").contains(c) {
            c = c.asciiLowercase
        }
        if case .startTag(var s, let selfClosing, let attributes) = currentToken {
            s.append(c)
            currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
            return tokenizeTagName()
        } else if case .endTag(var s) = currentToken {
            s.append(c)
            currentToken = .endTag(s)
            return tokenizeTagName()
        } else {
            fatalError("bad current token")
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
mutating func tokenizeSelfClosingStartTag() -> Token? {
    switch nextChar() {
    case ">":
        // Mark the current start tag as self-closing and emit it.
        if case .startTag(let s, _, let attributes) = currentToken {
            currentToken = .startTag(s, selfClosing: true, attributes: attributes)
        } else {
            fatalError("bad current token")
        }
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-tag
        state = .endOfFile
        return nil
    case .some(let c):
        // parse error: unexpected-solidus-in-tag
        reconsume(c)
        state = .beforeAttributeName
        return tokenizeBeforeAttributeName()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
mutating func tokenizeBeforeAttributeName() -> Token? {
    let c = nextChar()
    switch c {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return next()
    case "/", ">", nil:
        reconsume(c)
        state = .afterAttributeName
        return tokenizeAfterAttributeName()
    case "=":
        // parse error: unexpected-equals-sign-before-attribute-name
        // Per spec, start a new attribute whose name is "=".
        if case .startTag(let s, let selfClosing, var attributes) = currentToken {
            attributes.append(Attribute(name: "=", value: ""))
            currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
            state = .attributeName
            return tokenizeAttributeName()
        } else {
            fatalError("bad current token")
        }
    default:
        if case .startTag(let s, let selfClosing, var attributes) = currentToken {
            // Start a new, empty attribute.
            attributes.append(Attribute(name: "", value: ""))
            currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
            reconsume(c)
            state = .attributeName
            return tokenizeAttributeName()
        } else if case .endTag(_) = currentToken {
            // End tags carry no attributes; parse but discard them.
            reconsume(c)
            state = .attributeName
            return tokenizeAttributeName()
        } else {
            fatalError("bad current token")
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
///
/// Accumulates the current attribute's name (ASCII-lowercased, nulls replaced
/// with U+FFFD). For end tags the characters are parsed but discarded.
mutating func tokenizeAttributeName() -> Token? {
    let c = nextChar()
    switch c {
    case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
        reconsume(c)
        state = .afterAttributeName
        return tokenizeAfterAttributeName()
    case "=":
        state = .beforeAttributeValue
        return tokenizeBeforeAttributeValue()
    case .some(var c):
        if ("A"..."Z").contains(c) {
            c = c.asciiLowercase
        }
        // if null, parse error: unexpected-null-character
        if c == "\0" {
            c = "\u{FFFD}"
        }
        // if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
        if case .startTag(let s, let selfClosing, var attributes) = currentToken {
            attributes[attributes.count - 1].name.append(c)
            currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
            return tokenizeAttributeName()
        } else if case .endTag(_) = currentToken {
            return tokenizeAttributeName()
        } else {
            // Fixed diagnostic typo ("bad curren token") for consistency with
            // every other unreachable-token trap in this file.
            fatalError("bad current token")
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
mutating func tokenizeAfterAttributeName() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // Ignore the character and REMAIN in the after-attribute-name state.
        // Previously this called tokenizeAttributeName(), which appended the
        // next attribute's characters onto the PREVIOUS attribute's name
        // (e.g. `<div a b>` produced a single attribute named "ab").
        return tokenizeAfterAttributeName()
    case "/":
        state = .selfClosingStartTag
        return tokenizeSelfClosingStartTag()
    case "=":
        state = .beforeAttributeValue
        return tokenizeBeforeAttributeValue()
    case nil:
        // parse error: eof-in-tag
        state = .endOfFile
        return nil
    case .some(let c):
        // Anything else starts a new attribute, reconsumed in attribute-name.
        if case .startTag(let s, let selfClosing, var attributes) = currentToken {
            attributes.append(Attribute(name: "", value: ""))
            currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
            reconsume(c)
            state = .attributeName
            return tokenizeAttributeName()
        } else if case .endTag(_) = currentToken {
            // End tags carry no attributes; parse but discard them.
            reconsume(c)
            state = .attributeName
            return tokenizeAttributeName()
        } else {
            fatalError("bad current token")
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
mutating func tokenizeBeforeAttributeValue() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeBeforeAttributeValue()
    case "\"":
        state = .attributeValue(.doubleQuoted)
        return tokenizeAttributeValue(quotes: .doubleQuoted)
    case "'":
        state = .attributeValue(.singleQuoted)
        return tokenizeAttributeValue(quotes: .singleQuoted)
    case ">":
        // parse error: missing-attribute-value (emit the tag with an empty value)
        state = .data
        return takeCurrentToken()
    case let c:
        reconsume(c)
        state = .attributeValue(.unquoted)
        return tokenizeAttributeValue(quotes: .unquoted)
    }
}
/// Covers the spec's attribute-value (double-quoted), (single-quoted), and
/// (unquoted) states, distinguished by `quotes`:
/// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
mutating func tokenizeAttributeValue(quotes: AttributeValueQuotation) -> Token? {
    if quotes == .unquoted {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // Whitespace terminates an unquoted value.
            state = .beforeAttributeName
            return tokenizeBeforeAttributeName()
        case "&":
            returnState = .attributeValue(.unquoted)
            state = .characterReference
            return tokenizeCharacterReference()
        case ">":
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-tag
            state = .endOfFile
            return nil
        case .some(let c):
            // if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
                attributes[attributes.count - 1].value.append(c)
                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
                return tokenizeAttributeValue(quotes: quotes)
            } else {
                fatalError("bad current token")
            }
        }
    } else {
        let c = nextChar()
        switch c {
        case "\"" where quotes == .doubleQuoted:
            state = .afterAttributeValueQuoted
            return tokenizeAfterAttributeValueQuoted()
        case "'" where quotes == .singleQuoted:
            state = .afterAttributeValueQuoted
            return tokenizeAfterAttributeValueQuoted()
        case "&":
            returnState = .attributeValue(quotes)
            state = .characterReference
            return tokenizeCharacterReference()
        case nil:
            // parse error: eof-in-tag
            state = .endOfFile
            return nil
        case .some(var c):
            if c == "\0" {
                // parse error: unexpected-null-character
                c = "\u{FFFD}"
            }
            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
                attributes[attributes.count - 1].value.append(c)
                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
                return tokenizeAttributeValue(quotes: quotes)
            } else if case .endTag(_) = currentToken {
                // End tags carry no attributes; parse but discard the value.
                return tokenizeAttributeValue(quotes: quotes)
            } else {
                fatalError("bad current token")
            }
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
mutating func tokenizeAfterAttributeValueQuoted() -> Token? {
    let ch = nextChar()
    switch ch {
    case nil:
        // parse error: eof-in-tag
        state = .endOfFile
        return nil
    case ">":
        state = .data
        return takeCurrentToken()
    case "/":
        state = .selfClosingStartTag
        return tokenizeSelfClosingStartTag()
    case "\t", "\n", "\u{000C}", " ":
        state = .beforeAttributeName
        return tokenizeBeforeAttributeName()
    default:
        // parse error: missing-whitespace-between-attributes
        reconsume(ch)
        state = .beforeAttributeName
        return tokenizeBeforeAttributeName()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
///
/// Swallows everything up to ">" (or EOF) into the current comment token.
mutating func tokenizeBogusComment() -> Token? {
    switch nextChar() {
    case ">":
        state = .data
        return takeCurrentToken()
    case nil:
        state = .endOfFile
        return takeCurrentToken()
    case .some(var c):
        if c == "\0" {
            // parse error: unexpected-null-character
            c = "\u{FFFD}"
        }
        if case .comment(var s) = currentToken {
            s.append(c)
            currentToken = .comment(s)
            return tokenizeBogusComment()
        } else {
            fatalError("bad current token")
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
///
/// Uses a 7-character lookahead to distinguish "--" (comment), "doctype",
/// and "[CDATA[" after a "<!".
mutating func tokenizeMarkupDeclarationOpen() -> Token? {
    let peeked = peek(count: 7)
    if peeked.starts(with: "--") {
        consume(count: 2)
        currentToken = .comment("")
        state = .commentStart
        return tokenizeCommentStart()
    } else if peeked.lowercased() == "doctype" {
        consume(count: 7)
        state = .doctype
        return tokenizeDoctype()
    } else if peeked == "[CDATA[" {
        // TODO: we don't do any of the tree construction stuff yet, so can't really handle this
        // consume(count: 7)
        // Treated as a bogus comment without consuming, so the "[CDATA[" text
        // ends up in the comment body.
        currentToken = .comment("")
        state = .bogusComment
        return tokenizeBogusComment()
    } else {
        // parse error: incorrectly-opened-comment
        currentToken = .comment("")
        state = .bogusComment
        return tokenizeBogusComment()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
mutating func tokenizeCommentStart() -> Token? {
    let ch = nextChar()
    if ch == "-" {
        state = .commentStartDash
        return tokenizeCommentStartDash()
    }
    if ch == ">" {
        // parse error: abrupt-closing-of-empty-comment ("<!-->")
        state = .data
        return takeCurrentToken()
    }
    reconsume(ch)
    state = .comment
    return tokenizeComment()
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
mutating func tokenizeCommentStartDash() -> Token? {
    switch nextChar() {
    case "-":
        state = .commentEnd
        return tokenizeCommentEnd()
    case ">":
        // parse error: abrupt-closing-of-empty-comment ("<!--->")
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-comment
        // Mark the stream finished before emitting the comment. Previously the
        // state was left unchanged, so a subsequent next() call would re-enter
        // this method, read nil again, and force-unwrap the already-cleared
        // currentToken, crashing.
        state = .endOfFile
        return takeCurrentToken()
    case .some(let c):
        // A lone "-" becomes part of the comment text.
        if case .comment(var s) = currentToken {
            s.append("-")
            currentToken = .comment(s)
            reconsume(c)
            state = .comment
            return tokenizeComment()
        } else {
            fatalError("bad current token")
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-state
mutating func tokenizeComment() -> Token? {
    switch nextChar() {
    case "<":
        // "<" is comment text, but may begin a nested "<!--" sequence.
        if case .comment(var s) = currentToken {
            s.append("<")
            currentToken = .comment(s)
            state = .commentLessThanSign
            return tokenizeCommentLessThanSign()
        } else {
            fatalError("bad current token")
        }
    case "-":
        state = .commentEndDash
        return tokenizeCommentEndDash()
    case nil:
        // parse error: eof-in-comment (emit what we have)
        state = .endOfFile
        return takeCurrentToken()
    case .some(var c):
        if c == "\0" {
            // parse error: unexpected-null-character
            c = "\u{FFFD}"
        }
        if case .comment(var s) = currentToken {
            s.append(c)
            currentToken = .comment(s)
            return tokenizeComment()
        } else {
            fatalError("bad current token")
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
mutating func tokenizeCommentLessThanSign() -> Token? {
    switch nextChar() {
    case "!":
        if case .comment(var s) = currentToken {
            s.append("!")
            currentToken = .comment(s)
            state = .commentLessThanSignBang
            return tokenizeCommentLessThanSignBang()
        } else {
            fatalError("bad current token")
        }
    case "<":
        // Stay in this state; each "<" is comment text.
        if case .comment(var s) = currentToken {
            s.append("<")
            currentToken = .comment(s)
            return tokenizeComment()
        } else {
            fatalError("bad current token")
        }
    case let c:
        reconsume(c)
        state = .comment
        return tokenizeComment()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
mutating func tokenizeCommentLessThanSignBang() -> Token? {
    let ch = nextChar()
    if ch == "-" {
        state = .commentLessThanSignBangDash
        return tokenizeCommentLessThanSignBangDash()
    }
    reconsume(ch)
    state = .comment
    return tokenizeComment()
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
mutating func tokenizeCommentLessThanSignBangDash() -> Token? {
    let ch = nextChar()
    if ch == "-" {
        state = .commentLessThanSignBangDashDash
        return tokenizeCommentLessThanSignBangDashDash()
    }
    reconsume(ch)
    state = .commentEndDash
    return tokenizeCommentEndDash()
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
mutating func tokenizeCommentLessThanSignBangDashDash() -> Token? {
    // ">" or EOF is the normal comment-end path; anything else is a
    // nested-comment parse error, but the spec's transition is identical,
    // so the branches are merged.
    let ch = nextChar()
    reconsume(ch)
    state = .commentEnd
    return tokenizeCommentEnd()
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
mutating func tokenizeCommentEndDash() -> Token? {
    switch nextChar() {
    case "-":
        state = .commentEnd
        return tokenizeCommentEnd()
    case nil:
        // parse error: eof-in-comment (emit what we have)
        state = .endOfFile
        return takeCurrentToken()
    case let c:
        // The lone "-" is comment text; reprocess c in the comment state.
        if case .comment(var s) = currentToken {
            s.append("-")
            currentToken = .comment(s)
        } else {
            fatalError("bad current token")
        }
        reconsume(c)
        state = .comment
        return next()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
mutating func tokenizeCommentEnd() -> Token? {
    switch nextChar() {
    case ">":
        state = .data
        return takeCurrentToken()
    case "!":
        state = .commentEndBang
        return tokenizeCommentEndBang()
    case "-":
        // Each extra "-" is comment text; stay in comment-end.
        if case .comment(var s) = currentToken {
            s.append("-")
            currentToken = .comment(s)
            return tokenizeCommentEnd()
        } else {
            fatalError("bad current token")
        }
    case nil:
        // parse error: eof-in-comment (emit what we have)
        state = .endOfFile
        return takeCurrentToken()
    case .some(let c):
        // The "--" was comment text after all; reprocess c in the comment state.
        if case .comment(var s) = currentToken {
            s.append("--")
            currentToken = .comment(s)
        } else {
            fatalError("bad current token")
        }
        reconsume(c)
        state = .comment
        return tokenizeComment()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
mutating func tokenizeCommentEndBang() -> Token? {
    switch nextChar() {
    case "-":
        if case .comment(var s) = currentToken {
            s.append("--!")
            currentToken = .comment(s)
            state = .commentEndDash
            return tokenizeCommentEndDash()
        } else {
            fatalError("bad current token")
        }
    case ">":
        // parse error: incorrectly-closed-comment ("--!>")
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-comment (emit what we have)
        state = .endOfFile
        return takeCurrentToken()
    case .some(let c):
        // "--!" was comment text; reprocess c in the comment state.
        if case .comment(var s) = currentToken {
            s.append("--!")
            currentToken = .comment(s)
            reconsume(c)
            state = .comment
            return tokenizeComment()
        } else {
            fatalError("bad current token")
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
mutating func tokenizeDoctype() -> Token? {
    let ch = nextChar()
    switch ch {
    case "\t", "\n", "\u{000C}", " ":
        state = .beforeDoctypeName
        return tokenizeBeforeDoctypeName()
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        return .doctype("", forceQuirks: true, publicIdentifier: nil, systemIdentifier: nil)
    case .some(let c):
        // ">" is reconsumed per spec; any other character is a
        // missing-whitespace-before-doctype-name parse error. The transition
        // is the same either way: reprocess in before-doctype-name.
        reconsume(c)
        state = .beforeDoctypeName
        return tokenizeBeforeDoctypeName()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
mutating func tokenizeBeforeDoctypeName() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeBeforeDoctypeName()
    case .some(let c) where ("A"..."Z").contains(c):
        currentToken = .doctype("\(c.asciiLowercase)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
        state = .doctypeName
        return tokenizeDoctypeName()
    case "\0":
        // parse error: unexpected-null-character
        currentToken = .doctype("\u{FFFD}", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
        state = .doctypeName
        return tokenizeDoctypeName()
    case ">":
        // parse error: missing-doctype-name
        state = .data
        return .doctype("", forceQuirks: true, publicIdentifier: nil, systemIdentifier: nil)
    case nil:
        // parse error: eof-in-doctype
        // The spec sets the force-quirks flag on; this previously emitted
        // forceQuirks: false, inconsistent with every other EOF doctype path.
        state = .endOfFile
        return .doctype("", forceQuirks: true, publicIdentifier: nil, systemIdentifier: nil)
    case .some(let c):
        currentToken = .doctype("\(c)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
        state = .doctypeName
        return tokenizeDoctypeName()
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
///
/// Accumulates the doctype name (ASCII-lowercased, nulls replaced with U+FFFD).
mutating func tokenizeDoctypeName() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        state = .afterDoctypeName
        return tokenizeAfterDoctypeName()
    case ">":
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-doctype (emit with force-quirks on)
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case .some(var c):
        if c == "\0" {
            c = "\u{FFFD}"
        } else if ("A"..."Z").contains(c) {
            c = c.asciiLowercase
        }
        // NOTE(review): the identifiers are rebuilt as nil here; while still
        // reading the name they are always nil, so nothing is lost — confirm.
        if case .doctype(var s, let forceQuirks, _, _) = currentToken {
            s.append(c)
            currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: nil, systemIdentifier: nil)
            return tokenizeDoctypeName()
        } else {
            fatalError("bad current token")
        }
    }
}
/// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
///
/// Uses a 6-character lookahead to detect the "PUBLIC"/"SYSTEM" keywords.
mutating func tokenizeAfterDoctypeName() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeAfterDoctypeName()
    case ">":
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-doctype (emit with force-quirks on)
        state = .endOfFile
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case .some(let c):
        reconsume(c)
        let peeked = peek(count: 6).lowercased()
        if peeked == "public" {
            consume(count: 6)
            state = .afterDoctypePublicKeyword
            return tokenizeAfterDoctypePublicKeyword()
        } else if peeked == "system" {
            consume(count: 6)
            state = .afterDoctypeSystemKeyword
            return tokenizeAfterDoctypeSystemKeyword()
        } else {
            // parse error: invalid-character-sequence-after-doctype-name
            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
                currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
            } else {
                fatalError("bad current token")
            }
            state = .bogusDoctype
            return tokenizeBogusDoctype()
        }
    }
}
/// After-DOCTYPE-public-keyword state: expects whitespace, then a quoted public identifier.
/// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
mutating func tokenizeAfterDoctypePublicKeyword() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        state = .beforeDoctypePublicIdentifier
        return tokenizeBeforeDoctypePublicIdentifier()
    case .some(let quote) where quote == "\"" || quote == "'":
        // parse error: missing-whitespace-after-doctype-public-keyword
        guard case .doctype(let name, let forceQuirks, _, _) = currentToken else {
            fatalError("bad current token")
        }
        // Start collecting the public identifier immediately (empty so far).
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: "", systemIdentifier: nil)
        let quotation: DoctypeIdentifierQuotation = quote == "\"" ? .doubleQuoted : .singleQuoted
        state = .doctypePublicIdentifier(quotation)
        return tokenizeDoctypePublicIdentifier(quotes: quotation)
    case ">":
        // parse error: missing-doctype-public-identifier
        state = .data
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let other):
        // parse error: missing-quote-before-doctype-public-identifier
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
        state = .bogusDoctype
        reconsume(other)
        return tokenizeBogusDoctype()
    }
}
/// Before-DOCTYPE-public-identifier state: skips whitespace until the opening quote.
/// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
mutating func tokenizeBeforeDoctypePublicIdentifier() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeBeforeDoctypePublicIdentifier()
    case .some(let quote) where quote == "\"" || quote == "'":
        guard case .doctype(let name, let forceQuirks, _, _) = currentToken else {
            fatalError("bad current token")
        }
        // Begin an empty public identifier; remember which quote opened it.
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: "", systemIdentifier: nil)
        let quotation: DoctypeIdentifierQuotation = quote == "\"" ? .doubleQuoted : .singleQuoted
        state = .doctypePublicIdentifier(quotation)
        return tokenizeDoctypePublicIdentifier(quotes: quotation)
    case ">":
        // parse error: missing-doctype-public-identifier
        state = .data
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let other):
        // parse error: missing-quote-before-doctype-public-identifier
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
        reconsume(other)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
/// DOCTYPE-public-identifier state: accumulates the public identifier until the
/// matching close quote.
/// https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
/// - Parameter quotes: Which quote character opened the identifier.
mutating func tokenizeDoctypePublicIdentifier(quotes: DoctypeIdentifierQuotation) -> Token? {
    switch nextChar() {
    case "\"" where quotes == .doubleQuoted:
        state = .afterDoctypePublicIdentifier
        return tokenizeAfterDoctypePublicIdentifier()
    case "'" where quotes == .singleQuoted:
        state = .afterDoctypePublicIdentifier
        return tokenizeAfterDoctypePublicIdentifier()
    case ">":
        // parse error: abrupt-doctype-public-identifier
        // Per the spec the ">" is consumed here, not reconsumed; the previous
        // reconsume(">") caused a spurious ">" character token to be emitted
        // from the data state after the DOCTYPE.
        state = .data
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case .some(var c):
        if c == "\0" {
            // parse error: unexpected-null-character
            c = "\u{FFFD}"
        }
        if case .doctype(let s, let forceQuirks, var publicIdentifier, _) = currentToken {
            publicIdentifier!.append(c)
            // systemIdentifier is always nil while still inside the public identifier.
            currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: nil)
            return tokenizeDoctypePublicIdentifier(quotes: quotes)
        } else {
            fatalError("bad current token")
        }
    }
}
/// After-DOCTYPE-public-identifier state: expects whitespace, ">", or the start
/// of a system identifier.
/// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
mutating func tokenizeAfterDoctypePublicIdentifier() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        state = .betweenDoctypePublicAndSystemIdentifiers
        return tokenizeBetweenDoctypePublicAndSystemIdentifiers()
    case ">":
        state = .data
        return takeCurrentToken()
    case .some(let quote) where quote == "\"" || quote == "'":
        // parse error: missing-whitespace-between-doctype-public-and-system-identifiers
        guard case .doctype(let name, let forceQuirks, let publicID, _) = currentToken else {
            fatalError("bad current token")
        }
        // Begin an empty system identifier, keeping the public one.
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: publicID, systemIdentifier: "")
        let quotation: DoctypeIdentifierQuotation = quote == "\"" ? .doubleQuoted : .singleQuoted
        state = .doctypeSystemIdentifier(quotation)
        return tokenizeDoctypeSystemIdentifier(quotes: quotation)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let other):
        // parse error: missing-quote-before-doctype-system-identifier
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
        reconsume(other)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
/// Between-DOCTYPE-public-and-system-identifiers state: skips whitespace
/// between the two identifiers.
/// https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
mutating func tokenizeBetweenDoctypePublicAndSystemIdentifiers() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeBetweenDoctypePublicAndSystemIdentifiers()
    case ">":
        state = .data
        return takeCurrentToken()
    case .some(let c) where c == "\"" || c == "'":
        if case .doctype(let s, let forceQuirks, let publicIdentifier, _) = currentToken {
            currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: "")
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypeSystemIdentifier(quotes)
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        } else {
            fatalError("bad current token")
        }
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case .some(let c):
        // parse error: missing-quote-before-doctype-system-identifier
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
        reconsume(c)
        // Per the spec this reconsumes in the bogus DOCTYPE state (not bogus
        // comment, which was used before and would mis-handle the remainder
        // of the doctype as a comment token).
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
/// After-DOCTYPE-system-keyword state: expects whitespace, then a quoted system identifier.
/// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
mutating func tokenizeAfterDoctypeSystemKeyword() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        state = .beforeDoctypeSystemIdentifier
        return tokenizeBeforeDoctypeSystemIdentifier()
    case .some(let quote) where quote == "\"" || quote == "'":
        guard case .doctype(let name, let forceQuirks, let publicID, _) = currentToken else {
            fatalError("bad current token")
        }
        // Begin an empty system identifier; remember which quote opened it.
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: publicID, systemIdentifier: "")
        let quotation: DoctypeIdentifierQuotation = quote == "\"" ? .doubleQuoted : .singleQuoted
        state = .doctypeSystemIdentifier(quotation)
        return tokenizeDoctypeSystemIdentifier(quotes: quotation)
    case ">":
        // parse error: missing-doctype-system-identifier
        state = .data
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let other):
        // parse error: missing-quote-before-doctype-system-identifier
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
        reconsume(other)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
/// Before-DOCTYPE-system-identifier state: skips whitespace until the opening quote.
/// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
mutating func tokenizeBeforeDoctypeSystemIdentifier() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeBeforeDoctypeSystemIdentifier()
    case .some(let c) where c == "\"" || c == "'":
        if case .doctype(let s, let forceQuirks, let publicIdentifier, _) = currentToken {
            // The system identifier starts out empty (this previously used a
            // single space, which prefixed every system identifier reached
            // through this state with " ").
            currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: "")
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypeSystemIdentifier(quotes)
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        } else {
            fatalError("bad current token")
        }
    case ">":
        // parse error: missing-doctype-system-identifier
        state = .data
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case .some(let c):
        // parse error: missing-quote-before-doctype-system-identifier
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
        reconsume(c)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
/// DOCTYPE-system-identifier state: accumulates the system identifier until the
/// matching close quote.
/// https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
/// - Parameter quotes: Which quote character opened the identifier.
mutating func tokenizeDoctypeSystemIdentifier(quotes: DoctypeIdentifierQuotation) -> Token? {
    switch nextChar() {
    case "\"" where quotes == .doubleQuoted:
        state = .afterDoctypeSystemIdentifier
        return tokenizeAfterDoctypeSystemIdentifier()
    case "'" where quotes == .singleQuoted:
        state = .afterDoctypeSystemIdentifier
        return tokenizeAfterDoctypeSystemIdentifier()
    case ">":
        // parse error: abrupt-doctype-system-identifier
        state = .data
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(var c):
        if c == "\0" {
            // parse error: unexpected-null-character
            c = "\u{FFFD}"
        }
        guard case .doctype(let name, let forceQuirks, let publicID, var systemID) = currentToken else {
            fatalError("bad current token")
        }
        systemID!.append(c)
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: publicID, systemIdentifier: systemID)
        return tokenizeDoctypeSystemIdentifier(quotes: quotes)
    }
}
/// After-DOCTYPE-system-identifier state: only whitespace or ">" may follow.
/// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
mutating func tokenizeAfterDoctypeSystemIdentifier() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeAfterDoctypeSystemIdentifier()
    case ">":
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let other):
        // parse error: unexpected-character-after-doctype-system-identifier
        // Note: this does NOT set the current DOCTYPE token's force-quirks flag.
        reconsume(other)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
/// Bogus-DOCTYPE state: discards input until ">" or EOF, then emits the
/// current DOCTYPE token.
/// https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
mutating func tokenizeBogusDoctype() -> Token? {
    guard let c = nextChar() else {
        state = .endOfFile
        return takeCurrentToken()
    }
    switch c {
    case ">":
        state = .data
        return takeCurrentToken()
    case "\0":
        // parse error: unexpected-null-character; the character is ignored
        return tokenizeBogusDoctype()
    default:
        // ignore the character
        return tokenizeBogusDoctype()
    }
}
}
private extension Character {
    /// The lowercase counterpart of an ASCII uppercase letter.
    /// - Precondition: `self` is in "A"..."Z" (asserted in debug builds).
    var asciiLowercase: Character {
        assert(("A"..."Z").contains(self))
        // ASCII lowercase letters sit exactly 0x20 above their uppercase forms.
        let lowered = asciiValue! + 0x20
        return Character(Unicode.Scalar(lowered))
    }
}