Don't use enum with associated values for current token

Enum cases with associated values can't be modified in place, resulting in a bunch of extra copies
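
For illustration, a minimal standalone sketch of the two approaches (hypothetical Token/attribute shapes, not the tokenizer's real declarations): with an enum, appending to an associated value means binding the payload out as a copy and rebuilding the whole case, whereas a separate optional field can be mutated in place right through the optional.

    enum Token {
        case startTag(String, selfClosing: Bool, attributes: [String])
    }

    var currentToken: Token? = .startTag("div", selfClosing: false, attributes: [])

    // Enum payload: copy the associated values out, mutate the copy, build a new case.
    if case .startTag(let name, let selfClosing, var attributes) = currentToken {
        attributes.append("class")
        currentToken = .startTag(name, selfClosing: selfClosing, attributes: attributes)
    }

    // Separate field: the array grows in place; nothing is pattern-matched or rebuilt.
    var currentStartTag: (String, selfClosing: Bool, attributes: [String])? =
        ("div", selfClosing: false, attributes: [])
    currentStartTag!.attributes.append("class")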
Shadowfacts 2023-11-28 10:36:04 -05:00
parent 31bd174a69
commit f412369cf7
1 changed file with 172 additions and 358 deletions

@@ -16,7 +16,12 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
     private var returnState: State?
     private var temporaryBuffer: String?
     private var characterReferenceCode: UInt32?
-    private var currentToken: Token?
+    // Optimization: using an enum for the current token means we can't modify the associated values in-place
+    // Separate fields for everything increases the risk of invalid states, but nets us a small perf gain.
+    private var currentStartTag: (String, selfClosing: Bool, attributes: [Attribute])?
+    private var currentEndTag: String?
+    private var currentComment: String?
+    private var currentDoctype: (String, forceQuirks: Bool, publicIdentifier: String?, systemIdentifier: String?)?
 
     init(chars: Chars) {
         self.chars = chars
@@ -187,8 +192,21 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
     }
 
     private mutating func takeCurrentToken() -> Token {
-        defer { currentToken = nil }
-        return currentToken!
+        if let currentStartTag {
+            self.currentStartTag = nil
+            return .startTag(currentStartTag.0, selfClosing: currentStartTag.selfClosing, attributes: currentStartTag.attributes)
+        } else if let currentEndTag {
+            self.currentEndTag = nil
+            return .endTag(currentEndTag)
+        } else if let currentComment {
+            self.currentComment = nil
+            return .comment(currentComment)
+        } else if let currentDoctype {
+            self.currentDoctype = nil
+            return .doctype(currentDoctype.0, forceQuirks: currentDoctype.forceQuirks, publicIdentifier: currentDoctype.publicIdentifier, systemIdentifier: currentDoctype.systemIdentifier)
+        } else {
+            preconditionFailure("takeCurrentToken called without current token")
+        }
     }
 
 }
@@ -404,14 +422,9 @@ private extension Tokenizer {
     mutating func flushCharacterReference() {
         if case .attributeValue(_) = returnState {
-            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                attributes[attributes.count - 1].value.append(temporaryBuffer!)
-                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
-                temporaryBuffer = nil
-                state = returnState!
-            } else {
-                fatalError("bad current tag")
-            }
+            currentStartTag!.attributes.uncheckedLast.value.append(temporaryBuffer!)
+            temporaryBuffer = nil
+            state = returnState!
         } else {
             state = .flushingTemporaryBuffer(returnState!)
         }
     }
@ -561,13 +574,8 @@ private extension Tokenizer {
switch c { switch c {
case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"): case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
if case .attributeValue(_) = returnState { if case .attributeValue(_) = returnState {
if case .startTag(let s, let selfClosing, var attributes) = currentToken { currentStartTag!.attributes.uncheckedLast.value.append(c!)
attributes[attributes.count - 1].value.append(c!) return tokenizeAmbiguousAmpersand()
currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
} else {
fatalError("bad current token")
}
return next()
} else { } else {
return .character(c!) return .character(c!)
} }
@@ -590,7 +598,7 @@ private extension Tokenizer {
             return tokenizeEndTagOpen()
         case "?":
             // parse error: unexpected-question-mark-instead-of-tag-name
-            currentToken = .comment("")
+            currentComment = ""
             state = .bogusComment
             return tokenizeBogusComment()
         case nil:
@@ -598,7 +606,7 @@ private extension Tokenizer {
             state = .endOfFile
             return .character("<")
         case .some("a"..."z"), .some("A"..."Z"):
-            currentToken = .startTag("", selfClosing: false, attributes: [])
+            currentStartTag = ("", selfClosing: false, attributes: [])
             reconsume(c)
             state = .tagName
             return tokenizeTagName()
@@ -614,7 +622,7 @@ private extension Tokenizer {
         let c = nextChar()
         switch c {
         case .some("a"..."z"), .some("A"..."Z"):
-            currentToken = .endTag("")
+            currentEndTag = ""
             reconsume(c)
             state = .tagName
             return tokenizeTagName()
@@ -628,7 +636,7 @@ private extension Tokenizer {
             return .character("<")
         case .some(let c):
             // parse error: invalid-first-character-of-tag-name
-            currentToken = .comment("")
+            currentComment = ""
             reconsume(c)
             state = .bogusComment
             return tokenizeBogusComment()
@@ -636,6 +644,8 @@ private extension Tokenizer {
     }
 
     mutating func tokenizeTagName() -> Token? {
+        // Optimization: this is a hot path where we stay in this state for a while before emitting a token,
+        // and the function call overhead of recursion costs a bit of perf.
         while true {
             switch nextChar() {
             case "\t", "\n", "\u{000C}", " ":
@@ -658,13 +668,11 @@ private extension Tokenizer {
                 } else if ("A"..."Z").contains(c) {
                     c = c.asciiLowercase
                 }
-                if case .startTag(var s, let selfClosing, let attributes) = currentToken {
-                    s.append(c)
-                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+                if currentStartTag != nil {
+                    currentStartTag!.0.append(c)
                     continue
-                } else if case .endTag(var s) = currentToken {
-                    s.append(c)
-                    currentToken = .endTag(s)
+                } else if currentEndTag != nil {
+                    currentEndTag!.append(c)
                     continue
                 } else {
                     fatalError("bad current token")
@@ -676,11 +684,7 @@ private extension Tokenizer {
     mutating func tokenizeSelfClosingStartTag() -> Token? {
         switch nextChar() {
         case ">":
-            if case .startTag(let s, _, let attributes) = currentToken {
-                currentToken = .startTag(s, selfClosing: true, attributes: attributes)
-            } else {
-                fatalError("bad current token")
-            }
+            currentStartTag!.selfClosing = true
             state = .data
             return takeCurrentToken()
         case nil:
@@ -707,22 +711,16 @@ private extension Tokenizer {
             return tokenizeAfterAttributeName()
         case "=":
             // parse error: unexpected-equals-sign-before-attribute-name
-            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                attributes.append(Attribute(name: "=", value: ""))
-                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
-                state = .attributeName
-                return tokenizeAttributeName()
-            } else {
-                fatalError("bad current token")
-            }
+            currentStartTag!.attributes.append(Attribute(name: "=", value: ""))
+            state = .attributeName
+            return tokenizeAttributeName()
         default:
-            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                attributes.append(Attribute(name: "", value: ""))
-                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+            if currentStartTag != nil {
+                currentStartTag!.attributes.append(Attribute(name: "", value: ""))
                 reconsume(c)
                 state = .attributeName
                 return tokenizeAttributeName()
-            } else if case .endTag(_) = currentToken {
+            } else if currentEndTag != nil {
                 // ignore
                 reconsume(c)
                 state = .attributeName
@@ -753,11 +751,10 @@ private extension Tokenizer {
                     c = "\u{FFFD}"
                 }
                 // if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
-                if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                    attributes[attributes.count - 1].name.append(c)
-                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+                if currentStartTag != nil {
+                    currentStartTag!.attributes.uncheckedLast.name.append(c)
                     continue
-                } else if case .endTag(_) = currentToken {
+                } else if currentEndTag != nil {
                     continue
                 } else {
                     fatalError("bad curren token")
@@ -782,13 +779,12 @@ private extension Tokenizer {
             state = .endOfFile
             return nil
         case .some(let c):
-            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                attributes.append(Attribute(name: "", value: ""))
-                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+            if currentStartTag != nil {
+                currentStartTag!.attributes.append(Attribute(name: "", value: ""))
                 reconsume(c)
                 state = .attributeName
                 return tokenizeAttributeName()
-            } else if case .endTag(_) = currentToken {
+            } else if currentEndTag != nil {
                 reconsume(c)
                 state = .attributeName
                 return tokenizeAttributeName()
@@ -840,11 +836,10 @@ private extension Tokenizer {
                 return nil
             case .some(let c):
                 // if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
-                if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                    attributes[attributes.count - 1].value.append(c)
-                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+                if currentStartTag != nil {
+                    currentStartTag!.attributes.uncheckedLast.value.append(c)
                     continue
-                } else if case .endTag(_) = currentToken {
+                } else if currentEndTag != nil {
                     continue
                 } else {
                     fatalError("bad current token")
@@ -872,14 +867,11 @@ private extension Tokenizer {
                     // parse error: unexpected-null-character
                     c = "\u{FFFD}"
                 }
-                if case .startTag(let s, let selfClosing, var attributes) = currentToken {
-                    attributes[attributes.count - 1].value.append(c)
-                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
+                if currentStartTag != nil {
+                    currentStartTag!.attributes.uncheckedLast.value.append(c)
                     continue
-                } else if case .endTag(_) = currentToken {
+                } else if currentEndTag != nil {
                     continue
-                } else {
-                    fatalError("bad current token")
                 }
             }
         }
@@ -922,13 +914,8 @@ private extension Tokenizer {
                 // parse error: unexpected-null-character
                 c = "\u{FFFD}"
             }
-            if case .comment(var s) = currentToken {
-                s.append(c)
-                currentToken = .comment(s)
-                return tokenizeBogusComment()
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append(c)
+            return tokenizeBogusComment()
         }
     }
 
@@ -936,7 +923,7 @@ private extension Tokenizer {
         let peeked = peek(count: 7)
         if peeked.starts(with: "--") {
             consume(count: 2)
-            currentToken = .comment("")
+            currentComment = ""
             state = .commentStart
             return tokenizeCommentStart()
         } else if peeked.lowercased() == "doctype" {
@@ -946,12 +933,12 @@ private extension Tokenizer {
         } else if peeked == "[CDATA[" {
             // TODO: we don't do any of the tree construction stuff yet, so can't really handle this
             // consume(count: 7)
-            currentToken = .comment("")
+            currentComment = ""
             state = .bogusComment
             return tokenizeBogusComment()
         } else {
             // parse error: incorrectly-opened-comment
-            currentToken = .comment("")
+            currentComment = ""
             state = .bogusComment
             return tokenizeBogusComment()
         }
@@ -986,29 +973,19 @@ private extension Tokenizer {
             // parse error: eof-in-comment
             return takeCurrentToken()
         case .some(let c):
-            if case .comment(var s) = currentToken {
-                s.append("-")
-                currentToken = .comment(s)
-                reconsume(c)
-                state = .comment
-                return tokenizeComment()
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append("-")
+            reconsume(c)
+            state = .comment
+            return tokenizeComment()
         }
     }
 
     mutating func tokenizeComment() -> Token? {
         switch nextChar() {
         case "<":
-            if case .comment(var s) = currentToken {
-                s.append("<")
-                currentToken = .comment(s)
-                state = .commentLessThanSign
-                return tokenizeCommentLessThanSign()
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append("<")
+            state = .commentLessThanSign
+            return tokenizeCommentLessThanSign()
         case "-":
             state = .commentEndDash
             return tokenizeCommentEndDash()
@@ -1021,35 +998,20 @@ private extension Tokenizer {
                 // parse error: unexpected-null-character
                 c = "\u{FFFD}"
             }
-            if case .comment(var s) = currentToken {
-                s.append(c)
-                currentToken = .comment(s)
-                return tokenizeComment()
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append(c)
+            return tokenizeComment()
         }
     }
 
     mutating func tokenizeCommentLessThanSign() -> Token? {
         switch nextChar() {
         case "!":
-            if case .comment(var s) = currentToken {
-                s.append("!")
-                currentToken = .comment(s)
-                state = .commentLessThanSignBang
-                return tokenizeCommentLessThanSignBang()
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append("!")
+            state = .commentLessThanSignBang
+            return tokenizeCommentLessThanSignBang()
         case "<":
-            if case .comment(var s) = currentToken {
-                s.append("<")
-                currentToken = .comment(s)
-                return tokenizeComment()
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append("<")
+            return tokenizeComment()
         case let c:
             reconsume(c)
             state = .comment
@@ -1106,15 +1068,10 @@ private extension Tokenizer {
             state = .endOfFile
             return takeCurrentToken()
         case let c:
-            if case .comment(var s) = currentToken {
-                s.append("-")
-                currentToken = .comment(s)
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append("-")
             reconsume(c)
             state = .comment
-            return next()
+            return tokenizeComment()
         }
     }
 
@@ -1127,24 +1084,14 @@ private extension Tokenizer {
             state = .commentEndBang
             return tokenizeCommentEndBang()
         case "-":
-            if case .comment(var s) = currentToken {
-                s.append("-")
-                currentToken = .comment(s)
-                return tokenizeCommentEnd()
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append("-")
+            return tokenizeCommentEnd()
         case nil:
             // parse error: eof-in-comment
             state = .endOfFile
             return takeCurrentToken()
         case .some(let c):
-            if case .comment(var s) = currentToken {
-                s.append("--")
-                currentToken = .comment(s)
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append("--")
             reconsume(c)
             state = .comment
             return tokenizeComment()
@@ -1154,14 +1101,9 @@ private extension Tokenizer {
     mutating func tokenizeCommentEndBang() -> Token? {
         switch nextChar() {
         case "-":
-            if case .comment(var s) = currentToken {
-                s.append("--!")
-                currentToken = .comment(s)
-                state = .commentEndDash
-                return tokenizeCommentEndDash()
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append("--!")
+            state = .commentEndDash
+            return tokenizeCommentEndDash()
         case ">":
             // parse error: incorrectly-closed-comment
             state = .data
@@ -1171,15 +1113,10 @@ private extension Tokenizer {
             state = .endOfFile
             return takeCurrentToken()
         case .some(let c):
-            if case .comment(var s) = currentToken {
-                s.append("--!")
-                currentToken = .comment(s)
-                reconsume(c)
-                state = .comment
-                return tokenizeComment()
-            } else {
-                fatalError("bad current token")
-            }
+            currentComment!.append("--!")
+            reconsume(c)
+            state = .comment
+            return tokenizeComment()
         }
     }
 
@@ -1210,12 +1147,12 @@ private extension Tokenizer {
             // ignore the character
             return tokenizeBeforeDoctypeName()
         case .some(let c) where ("A"..."Z").contains(c):
-            currentToken = .doctype("\(c.asciiLowercase)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
+            currentDoctype = ("\(c.asciiLowercase)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
             state = .doctypeName
             return tokenizeDoctypeName()
         case "\0":
             // parse error: unexpected-null-character
-            currentToken = .doctype("\u{FFFD}", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
+            currentDoctype = ("\u{FFFD}", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
             state = .doctypeName
             return tokenizeDoctypeName()
         case ">":
@@ -1227,7 +1164,7 @@ private extension Tokenizer {
             state = .endOfFile
             return .doctype("", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
         case .some(let c):
-            currentToken = .doctype("\(c)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
+            currentDoctype = ("\(c)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
             state = .doctypeName
             return tokenizeDoctypeName()
         }
@@ -1243,25 +1180,16 @@ private extension Tokenizer {
             return takeCurrentToken()
         case nil:
             // parse error: eof-in-doctype
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(var c):
             if c == "\0" {
                 c = "\u{FFFD}"
             } else if ("A"..."Z").contains(c) {
                 c = c.asciiLowercase
             }
-            if case .doctype(var s, let forceQuirks, _, _) = currentToken {
-                s.append(c)
-                currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: nil, systemIdentifier: nil)
-                return tokenizeDoctypeName()
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.0.append(c)
+            return tokenizeDoctypeName()
         }
     }
 
@@ -1276,12 +1204,8 @@ private extension Tokenizer {
         case nil:
             // parse error: eof-in-doctype
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(let c):
             reconsume(c)
             let peeked = peek(count: 6).lowercased()
@@ -1295,11 +1219,7 @@ private extension Tokenizer {
                 return tokenizeAfterDoctypeSystemKeyword()
             } else {
                 // parse error: invalid-character-sequence-after-doctype-name
-                if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                    currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-                } else {
-                    fatalError("bad current token")
-                }
+                currentDoctype!.forceQuirks = true
                 state = .bogusDoctype
                 return tokenizeBogusDoctype()
             }
@@ -1313,39 +1233,23 @@ private extension Tokenizer {
             return tokenizeBeforeDoctypePublicIdentifier()
         case .some(let c) where c == "\"" || c == "'":
             // parse error: missing-whitespace-after-doctype-public-keyword
-            if case .doctype(let s, let forceQuirks, _, _) = currentToken {
-                currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: "", systemIdentifier: nil)
-                let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
-                state = .doctypePublicIdentifier(quotes)
-                return tokenizeDoctypePublicIdentifier(quotes: quotes)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.publicIdentifier = ""
+            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
+            state = .doctypePublicIdentifier(quotes)
+            return tokenizeDoctypePublicIdentifier(quotes: quotes)
         case ">":
             // parse error: missing-doctype-public-identifier
             state = .data
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case nil:
             // parse error: eof-in-doctype
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(let c):
             // parse error: missing-quote-before-doctype-public-identifier
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
             state = .bogusDoctype
             reconsume(c)
             return tokenizeBogusDoctype()
@@ -1358,39 +1262,23 @@ private extension Tokenizer {
             // ignore the character
             return tokenizeBeforeDoctypePublicIdentifier()
         case .some(let c) where c == "\"" || c == "'":
-            if case .doctype(let s, let forceQuirks, _, _) = currentToken {
-                currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: "", systemIdentifier: nil)
-                let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
-                state = .doctypePublicIdentifier(quotes)
-                return tokenizeDoctypePublicIdentifier(quotes: quotes)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.publicIdentifier = ""
+            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
+            state = .doctypePublicIdentifier(quotes)
+            return tokenizeDoctypePublicIdentifier(quotes: quotes)
        case ">":
             // parse error: missing-doctype-public-identifier
             state = .data
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case nil:
             // parse error: eof-in-doctype
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(let c):
             // parse error: missing-quote-before-doctype-public-identifier
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
             reconsume(c)
             state = .bogusDoctype
             return tokenizeBogusDoctype()
@@ -1409,33 +1297,20 @@ private extension Tokenizer {
             // parse error: abrupt-doctype-public-identifier
             reconsume(">")
             state = .data
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case nil:
             // parse error: eof-in-doctype
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(var c):
             if c == "\0" {
                 // parse error: unexpected-null-character
                 c = "\u{FFFD}"
             }
-            if case .doctype(let s, let forceQuirks, var publicIdentifier, _) = currentToken {
-                publicIdentifier!.append(c)
-                currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: nil)
-                return tokenizeDoctypePublicIdentifier(quotes: quotes)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.publicIdentifier!.append(c)
+            return tokenizeDoctypePublicIdentifier(quotes: quotes)
         }
     }
 
@@ -1449,30 +1324,18 @@ private extension Tokenizer {
             return takeCurrentToken()
         case .some(let c) where c == "\"" || c == "'":
             // parse error: missing-whitespace-between-doctype-public-and-system-identifiers
-            if case .doctype(let s, let forceQuirks, let publicIdentifier, _) = currentToken {
-                currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: "")
-                let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
-                state = .doctypeSystemIdentifier(quotes)
-                return tokenizeDoctypeSystemIdentifier(quotes: quotes)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.systemIdentifier = ""
+            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
+            state = .doctypeSystemIdentifier(quotes)
+            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
         case nil:
             // parse error: eof-in-doctype
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(let c):
             // parse error: missing-quote-before-doctype-system-identifier
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
             reconsume(c)
             state = .bogusDoctype
             return tokenizeBogusDoctype()
@@ -1488,30 +1351,18 @@ private extension Tokenizer {
             state = .data
             return takeCurrentToken()
         case .some(let c) where c == "\"" || c == "'":
-            if case .doctype(let s, let forceQuirks, let publicIdentifier, _) = currentToken {
-                currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: "")
-                let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
-                state = .doctypeSystemIdentifier(quotes)
-                return tokenizeDoctypeSystemIdentifier(quotes: quotes)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.systemIdentifier = ""
+            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
+            state = .doctypeSystemIdentifier(quotes)
+            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
         case nil:
             // parse error: eof-in-doctype
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(let c):
             // parse error: missing-quote-before-doctype-system-identifier
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
             reconsume(c)
             state = .bogusComment
             return tokenizeBogusComment()
@@ -1524,39 +1375,23 @@ private extension Tokenizer {
             state = .beforeDoctypeSystemIdentifier
             return tokenizeBeforeDoctypeSystemIdentifier()
         case .some(let c) where c == "\"" || c == "'":
-            if case .doctype(let s, let forceQuirks, let publicIdentifier, _) = currentToken {
-                currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: "")
-                let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
-                state = .doctypeSystemIdentifier(quotes)
-                return tokenizeDoctypeSystemIdentifier(quotes: quotes)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.systemIdentifier = ""
+            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
+            state = .doctypeSystemIdentifier(quotes)
+            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
         case ">":
             // parse error: missing-doctype-system-identifier
             state = .data
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case nil:
             // parse error: eof-in-doctype:
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(let c):
             // parse error: missing-quote-before-doctype-system-identifier
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
             reconsume(c)
             state = .bogusDoctype
             return tokenizeBogusDoctype()
@@ -1569,39 +1404,23 @@ private extension Tokenizer {
             // ignore the character
             return tokenizeBeforeDoctypeSystemIdentifier()
         case .some(let c) where c == "\"" || c == "'":
-            if case .doctype(let s, let forceQuirks, let publicIdentifier, _) = currentToken {
-                currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: " ")
-                let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
-                state = .doctypeSystemIdentifier(quotes)
-                return tokenizeDoctypeSystemIdentifier(quotes: quotes)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.systemIdentifier = ""
+            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
+            state = .doctypeSystemIdentifier(quotes)
+            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
         case ">":
             // parse error: missing-doctype-system-identifier
             state = .data
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case nil:
             // parse error: eof-in-doctype:
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(let c):
             // parse error: missing-quote-before-doctype-system-identifier
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
             reconsume(c)
             state = .bogusDoctype
             return tokenizeBogusDoctype()
@@ -1619,33 +1438,20 @@ private extension Tokenizer {
         case ">":
             // parse error: abrupt-doctype-system-identifier
             state = .data
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case nil:
             // parse error: eof-in-doctype
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(var c):
             if c == "\0" {
                 // parse error: unexpected-null-character
                 c = "\u{FFFD}"
             }
-            if case .doctype(let s, let forceQuirks, let publicIdentifier, var systemIdentifier) = currentToken {
-                systemIdentifier!.append(c)
-                currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-                return tokenizeDoctypeSystemIdentifier(quotes: quotes)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.systemIdentifier!.append(c)
+            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
         }
     }
 
@@ -1660,12 +1466,8 @@ private extension Tokenizer {
         case nil:
             // parse error: eof-in-doctype
             state = .endOfFile
-            if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
-                currentToken = nil
-                return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
-            } else {
-                fatalError("bad current token")
-            }
+            currentDoctype!.forceQuirks = true
+            return takeCurrentToken()
         case .some(let c):
             // parse error: unexpected-character-after-doctype-system-identifier
             // Note: This does not set the current DOCTYPE token's force-quirks flag to on.
@@ -1699,3 +1501,15 @@ private extension Character {
         return Character(Unicode.Scalar(asciiValue! + 0x20))
     }
 }
+
+private extension Array {
+    // Optimization: allows in-place modification of the last element of the array.
+    var uncheckedLast: Element {
+        _read {
+            yield self[count - 1]
+        }
+        _modify {
+            yield &self[count - 1]
+        }
+    }
+}
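
A note on the accessors in the new uncheckedLast helper: _read and _modify are Swift's underscored coroutine accessors, so they are technically unstable language features, but unlike a get/set computed property (where get hands out a copy and set writes the whole element back), _modify yields the element's storage directly, so nested mutations land in place. A rough usage sketch, assuming a simple Attribute struct and the Array.uncheckedLast extension from the diff above being in scope:

    struct Attribute {
        var name: String
        var value: String
    }

    var attributes = [Attribute(name: "class", value: "a")]

    // Writes into the array's storage via the _modify coroutine; no element copy.
    attributes.uncheckedLast.value.append("b")

    // The direct-subscript spelling the old call sites used; also in place.
    attributes[attributes.count - 1].value.append("c")

    // The standard `last` property can't do this: it is get-only and returns an
    // Optional copy, so mutating through it doesn't compile.
    // attributes.last!.value.append("d")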