//
//  Tokenizer.swift
//  HTMLStreamer
//
//  Created by Shadowfacts on 11/22/23.
//

import Foundation

struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
    typealias Element = Token

    private var chars: Chars
    private var reconsumeStack: [Unicode.Scalar] = []
    private var state = State.data
    private var returnState: State?
    private var temporaryBuffer: String?
    private var characterReferenceCode: UInt32?
    // Optimization: using an enum for the current token means we can't modify the associated values in-place.
    // Separate fields for everything increases the risk of invalid states, but nets us a small perf gain.
    private var currentStartTag: (String, selfClosing: Bool, attributes: [Attribute])?
    private var currentEndTag: String?
    private var currentComment: String?
    private var currentDoctype: (String, forceQuirks: Bool, publicIdentifier: String?, systemIdentifier: String?)?

    init(chars: Chars) {
        self.chars = chars
    }

    mutating func next() -> Token? {
        switch state {
        case .flushingTemporaryBuffer(let returnState):
            state = returnState
            if temporaryBuffer == nil || temporaryBuffer!.isEmpty {
                return next()
            } else {
                var buffer: String? = nil
                swap(&buffer, &temporaryBuffer)
                return .characterSequence(buffer!)
            }
        case .endOfFile:
            return nil
        case .emitTokens(var tokens, let nextState):
            if tokens.isEmpty {
                state = nextState
                return next()
            } else {
                let tok = tokens.removeFirst()
                state = .emitTokens(tokens, nextState)
                return tok
            }

        case .data:
            return tokenizeData()
        case .characterReference:
            return tokenizeCharacterReference()
        case .namedCharacterReference:
            return tokenizeNamedCharacterReference()
        case .numericCharacterReference:
            return tokenizeNumericCharacterReference()
        case .numericCharacterReferenceEnd:
            return tokenizeNumericCharacterReferenceEnd()
        case .hexadecimalCharacterReferenceStart:
            return tokenizeHexadecimalCharacterReferenceStart()
        case .hexadecimalCharacterReference:
            return tokenizeHexadecimalCharacterReference()
        case .decimalCharacterReferenceStart:
            return tokenizeDecimalCharacterReferenceStart()
        case .decimalCharacterReference:
            return tokenizeDecimalCharacterReference()
        case .ambiguousAmpersand:
            return tokenizeAmbiguousAmpersand()
        case .tagOpen:
            return tokenizeTagOpen()
        case .endTagOpen:
            return tokenizeEndTagOpen()
        case .tagName:
            return tokenizeTagName()
        case .selfClosingStartTag:
            return tokenizeSelfClosingStartTag()
        case .beforeAttributeName:
            return tokenizeBeforeAttributeName()
        case .attributeName:
            return tokenizeAttributeName()
        case .afterAttributeName:
            return tokenizeAfterAttributeName()
        case .beforeAttributeValue:
            return tokenizeBeforeAttributeValue()
        case .attributeValue(let quotes):
            return tokenizeAttributeValue(quotes: quotes)
        case .afterAttributeValueQuoted:
            return tokenizeAfterAttributeValueQuoted()
        case .bogusComment:
            return tokenizeBogusComment()
        case .markupDeclarationOpen:
            return tokenizeMarkupDeclarationOpen()
        case .commentStart:
            return tokenizeCommentStart()
        case .commentStartDash:
            return tokenizeCommentStartDash()
        case .comment:
            return tokenizeComment()
        case .commentLessThanSign:
            return tokenizeCommentLessThanSign()
        case .commentLessThanSignBang:
            return tokenizeCommentLessThanSignBang()
        case .commentLessThanSignBangDash:
            return tokenizeCommentLessThanSignBangDash()
        case .commentLessThanSignBangDashDash:
            return tokenizeCommentLessThanSignBangDashDash()
        case .commentEndDash:
            return tokenizeCommentEndDash()
        case .commentEnd:
            return tokenizeCommentEnd()
        case .commentEndBang:
            return tokenizeCommentEndBang()
        case .doctype:
            return tokenizeDoctype()
        case .beforeDoctypeName:
            return tokenizeBeforeDoctypeName()
        case .doctypeName:
            return tokenizeDoctypeName()
        case .afterDoctypeName:
            return tokenizeAfterDoctypeName()
        case .afterDoctypePublicKeyword:
            return tokenizeAfterDoctypePublicKeyword()
        case .beforeDoctypePublicIdentifier:
            return tokenizeBeforeDoctypePublicIdentifier()
        case .doctypePublicIdentifier(let quotes):
            return tokenizeDoctypePublicIdentifier(quotes: quotes)
        case .afterDoctypePublicIdentifier:
            return tokenizeAfterDoctypePublicIdentifier()
        case .betweenDoctypePublicAndSystemIdentifiers:
            return tokenizeBetweenDoctypePublicAndSystemIdentifiers()
        case .afterDoctypeSystemKeyword:
            return tokenizeAfterDoctypeSystemKeyword()
        case .beforeDoctypeSystemIdentifier:
            return tokenizeBeforeDoctypeSystemIdentifier()
        case .doctypeSystemIdentifier(let quotes):
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        case .afterDoctypeSystemIdentifier:
            return tokenizeAfterDoctypeSystemIdentifier()
        case .bogusDoctype:
            return tokenizeBogusDoctype()
        }
    }

    private mutating func reconsume(_ c: Unicode.Scalar?) {
        if let c {
            reconsumeStack.append(c)
        }
    }

    private mutating func nextChar() -> Unicode.Scalar? {
        if !reconsumeStack.isEmpty {
            return reconsumeStack.removeLast()
        } else {
            return chars.next()
        }
    }

    private mutating func peekChar() -> Unicode.Scalar? {
        if let nextToReconsume = reconsumeStack.last {
            return nextToReconsume
        } else {
            let c = chars.next()
            if let c {
                reconsume(c)
            }
            return c
        }
    }

    // TODO: extract this all out into a standalone type and test it separately
    private mutating func peek(count: Int) -> String {
        precondition(count >= 0)
        var buf = String.UnicodeScalarView()
        for _ in 0..<count {
            if let c = nextChar() {
                buf.append(c)
            } else {
                break
            }
        }
        reconsumeStack.append(contentsOf: buf.reversed())
        return String(buf)
    }

    private mutating func consume(count: Int) {
        precondition(count >= 0)
        for _ in 0..<count {
            _ = nextChar()
        }
    }
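
    // Added note: peek(count:) gives non-consuming lookahead on top of the
    // reconsume stack; peeked scalars are pushed back in reverse so subsequent
    // nextChar() calls replay them in order. tokenizeMarkupDeclarationOpen()
    // relies on this, e.g.:
    //
    //     let peeked = peek(count: 7)           // input position unchanged
    //     if peeked.lowercased() == "doctype" {
    //         consume(count: 7)                 // commit to the match
    //     }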

    private mutating func takeCurrentToken() -> Token {
        if let currentStartTag {
            self.currentStartTag = nil
            return .startTag(currentStartTag.0, selfClosing: currentStartTag.selfClosing, attributes: currentStartTag.attributes)
        } else if let currentEndTag {
            self.currentEndTag = nil
            return .endTag(currentEndTag)
        } else if let currentComment {
            self.currentComment = nil
            return .comment(currentComment)
        } else if let currentDoctype {
            self.currentDoctype = nil
            return .doctype(currentDoctype.0, forceQuirks: currentDoctype.forceQuirks, publicIdentifier: currentDoctype.publicIdentifier, systemIdentifier: currentDoctype.systemIdentifier)
        } else {
            preconditionFailure("takeCurrentToken called without current token")
        }
    }
}
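
// Example usage (added illustration; any iterator of Unicode.Scalar works):
//
//     var tokenizer = Tokenizer(chars: "<p>Hi &amp; bye</p>".unicodeScalars.makeIterator())
//     while let token = tokenizer.next() {
//         print(token)
//     }
//
// which yields, roughly:
//     .startTag("p", selfClosing: false, attributes: []),
//     .characterSequence("Hi "), .characterSequence("&"), .characterSequence(" bye"),
//     .endTag("p")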

enum Token: Equatable {
    case character(Unicode.Scalar)
    case characterSequence(String)
    case comment(String)
    case startTag(String, selfClosing: Bool, attributes: [Attribute])
    case endTag(String)
    case doctype(String, forceQuirks: Bool, publicIdentifier: String?, systemIdentifier: String?)
}
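
// Added illustration: `<a href="/x">` produces
//     .startTag("a", selfClosing: false, attributes: [Attribute(name: "href", value: "/x")])
// and `<!-- hi -->` produces .comment(" hi ").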

public struct Attribute: Equatable {
    public var name: String
    public var value: String
}

private enum State {
    // Internal states used by the tokenizer
    indirect case flushingTemporaryBuffer(State)
    case endOfFile
    indirect case emitTokens([Token], State)

    // States defined by the spec
    case data
    // RCDATA not currently supported
    // case rcdata
    // RAWTEXT not currently supported
    // case rawtext
    // script tag not currently supported
    // case scriptData
    // plaintext tag not currently supported
    // case plaintext
    case tagOpen
    case endTagOpen
    case tagName
    // RCDATA not currently supported
    // case rcdataLessThanSign
    // case rcdataEndTagOpen
    // case rcdataEndTagName
    // RAWTEXT not currently supported
    // case rawtextLessThanSign
    // case rawtextEndTagOpen
    // case rawtextEndTagName
    // script not currently supported
    // case scriptDataLessThanSign
    // case scriptDataEndTagOpen
    // case scriptDataEndTagName
    // case scriptDataEscapeStart
    // case scriptDataEscapeStartDash
    // case scriptDataEscaped
    // case scriptDataEscapedDash
    // case scriptDataEscapedDashDash
    // case scriptDataEscapedLessThanSign
    // case scriptDataEscapedEndTagOpen
    // case scriptDataEscapedEndTagName
    // case scriptDataDoubleEscapeStart
    // case scriptDataDoubleEscaped
    // case scriptDataDoubleEscapedDash
    // case scriptDataDoubleEscapedDashDash
    // case scriptDataDoubleEscapedLessThanSign
    // case scriptDataDoubleEscapeEnd
    case beforeAttributeName
    case attributeName
    case afterAttributeName
    case beforeAttributeValue
    case attributeValue(AttributeValueQuotation)
    case afterAttributeValueQuoted
    case selfClosingStartTag
    case bogusComment
    case markupDeclarationOpen
    case commentStart
    case commentStartDash
    case comment
    case commentLessThanSign
    case commentLessThanSignBang
    case commentLessThanSignBangDash
    case commentLessThanSignBangDashDash
    case commentEndDash
    case commentEnd
    case commentEndBang
    case doctype
    case beforeDoctypeName
    case doctypeName
    case afterDoctypeName
    case afterDoctypePublicKeyword
    case beforeDoctypePublicIdentifier
    case doctypePublicIdentifier(DoctypeIdentifierQuotation)
    case afterDoctypePublicIdentifier
    case betweenDoctypePublicAndSystemIdentifiers
    case afterDoctypeSystemKeyword
    case beforeDoctypeSystemIdentifier
    case doctypeSystemIdentifier(DoctypeIdentifierQuotation)
    case afterDoctypeSystemIdentifier
    case bogusDoctype
    // CDATA not currently supported
    // case cdataSection
    // case cdataSectionBracket
    // case cdataSectionEndState
    case characterReference
    case namedCharacterReference
    case ambiguousAmpersand
    case numericCharacterReference
    case hexadecimalCharacterReferenceStart
    case decimalCharacterReferenceStart
    case hexadecimalCharacterReference
    case decimalCharacterReference
    case numericCharacterReferenceEnd
}

private enum AttributeValueQuotation {
    case singleQuoted, doubleQuoted, unquoted
}

private enum DoctypeIdentifierQuotation {
    case singleQuoted, doubleQuoted
}

private extension Tokenizer {
    mutating func tokenizeData() -> Token? {
        // Optimization: It's common to have runs of characters that are tokenized as-is,
        // so try to return them as a single token so the downstream consumer
        // can avoid repeated work.
        var buf = ""
        while true {
            switch nextChar() {
            case "&":
                returnState = .data
                state = .characterReference
                if buf.isEmpty {
                    return tokenizeCharacterReference()
                } else {
                    return .characterSequence(buf)
                }
            case "<":
                state = .tagOpen
                if buf.isEmpty {
                    return tokenizeTagOpen()
                } else {
                    return .characterSequence(buf)
                }
            case "\0":
                if buf.isEmpty {
                    return .character("\0")
                } else {
                    reconsume("\0")
                    return .characterSequence(buf)
                }
            case nil:
                if buf.isEmpty {
                    return nil // end of file
                } else {
                    return .characterSequence(buf)
                }
            case .some(let c):
                buf.unicodeScalars.append(c)
                continue
            }
        }
    }

    mutating func tokenizeCharacterReference() -> Token? {
        temporaryBuffer = "&"
        guard let c = nextChar() else {
            reconsume(nil)
            state = .flushingTemporaryBuffer(returnState!)
            return next()
        }
        switch c {
        case "a"..."z", "A"..."Z", "0"..."9":
            reconsume(c)
            state = .namedCharacterReference
            return tokenizeNamedCharacterReference()
        case "#":
            temporaryBuffer!.append("#")
            state = .numericCharacterReference
            return tokenizeNumericCharacterReference()
        default:
            reconsume(c)
            state = returnState!
            return next()
        }
    }

    mutating func tokenizeNamedCharacterReference() -> Token? {
        // consume as many [a-zA-Z0-9] as possible, until semicolon
        loop: while let c = nextChar() {
            switch c {
            case "a"..."z", "A"..."Z", "0"..."9":
                temporaryBuffer!.unicodeScalars.append(c)
            case ";":
                temporaryBuffer!.unicodeScalars.append(c)
                break loop
            default:
                reconsume(c)
                break loop
            }
        }

        var referent = namedCharactersDecodeMap[String(temporaryBuffer!.dropFirst())]
        if referent == nil {
            // start from the beginning and try to find a reference
            var key = ";"
            let buf = temporaryBuffer!
            var index = buf.index(after: buf.startIndex)
            while index < buf.endIndex {
                key.replaceSubrange(key.index(before: key.endIndex)..., with: "\(buf[index]);")
                buf.formIndex(after: &index)
                referent = namedCharactersDecodeMap[key]
                if referent != nil {
                    break
                }
            }
            if referent != nil {
                for c in buf[index...].unicodeScalars.reversed() {
                    reconsume(c)
                }
                temporaryBuffer!.removeSubrange(index...)
            }
        }

        if let referent {
            if case .attributeValue(_) = returnState,
               temporaryBuffer!.last != ";",
               let next = peekChar(),
               next == "=" || ("a"..."z").contains(next) || ("A"..."Z").contains(next) || ("0"..."9").contains(next) {
                flushCharacterReference()
            } else {
                temporaryBuffer = "\(referent)"
                flushCharacterReference()
            }
        } else {
            state = .flushingTemporaryBuffer(.ambiguousAmpersand)
        }

        return next()
    }
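
    // Added illustration: for "&notit;" the full name "notit;" is not in the
    // table, so the prefix scan above retries "n;", "no;", "not;" and stops
    // at the first hit (U+00AC). The unmatched tail "it;" is reconsumed and
    // re-tokenized normally, so the output is "¬it;".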

    mutating func flushCharacterReference() {
        if case .attributeValue(_) = returnState {
            currentStartTag!.attributes.uncheckedLast.value.append(temporaryBuffer!)
            temporaryBuffer = nil
            state = returnState!
        } else {
            state = .flushingTemporaryBuffer(returnState!)
        }
    }

    mutating func tokenizeNumericCharacterReference() -> Token? {
        characterReferenceCode = 0
        switch nextChar() {
        case "x", "X":
            temporaryBuffer!.append("x")
            state = .hexadecimalCharacterReference
            return tokenizeHexadecimalCharacterReference()
        case let c:
            reconsume(c)
            state = .decimalCharacterReference
            return tokenizeDecimalCharacterReference()
        }
    }

    mutating func tokenizeNumericCharacterReferenceEnd() -> Token? {
        switch characterReferenceCode! {
        case 0:
            // parse error: null-character-reference
            characterReferenceCode = 0xFFFD
        case let c where c > 0x10FFFF:
            // parse error: character-reference-outside-unicode-range
            characterReferenceCode = 0xFFFD
        case 0xD800...0xDBFF, 0xDC00...0xDFFF: // leading and trailing surrogate ranges
            // parse error: surrogate-character-reference
            characterReferenceCode = 0xFFFD
        case let c where Unicode.Scalar(c) == nil:
            // parse error: noncharacter-character-reference
            // "The parser resolves such character references as-is."
            // TODO: figure out what the spec means by resolving these as-is
            characterReferenceCode = nil
            state = returnState!
            return next()
        case 0x0D, 0...0x1F /* C0 control */, 0x7F...0x9F:
            // parse error: control-character-reference
            characterReferenceCode = switch characterReferenceCode! {
            case 0x80: 0x20AC
            case 0x82: 0x201A
            case 0x83: 0x0192
            case 0x84: 0x201E
            case 0x85: 0x2026
            case 0x86: 0x2020
            case 0x87: 0x2021
            case 0x88: 0x02C6
            case 0x89: 0x2030
            case 0x8A: 0x0160
            case 0x8B: 0x2039
            case 0x8C: 0x0152
            case 0x8E: 0x017D
            case 0x91: 0x2018
            case 0x92: 0x2019
            case 0x93: 0x201C
            case 0x94: 0x201D
            case 0x95: 0x2022
            case 0x96: 0x2013
            case 0x97: 0x2014
            case 0x98: 0x02DC
            case 0x99: 0x2122
            case 0x9A: 0x0161
            case 0x9B: 0x203A
            case 0x9C: 0x0153
            case 0x9E: 0x017E
            case 0x9F: 0x0178
            case let c: c
            }
        default:
            break
        }
        temporaryBuffer = ""
        if let c = Unicode.Scalar(characterReferenceCode!) {
            temporaryBuffer!.append(Character(c))
        }
        flushCharacterReference()
        return next()
    }
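
    // Added illustration: "&#65;" takes the decimal path to U+0041 ("A"),
    // "&#x2603;" takes the hex path to U+2603 ("☃"), and "&#0;" is a parse
    // error remapped to U+FFFD by the checks above.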

    mutating func tokenizeHexadecimalCharacterReferenceStart() -> Token? {
        let c = nextChar()
        switch c {
        case .some("0"..."9"), .some("a"..."f"), .some("A"..."F"):
            reconsume(c)
            state = .hexadecimalCharacterReference
            return tokenizeHexadecimalCharacterReference()
        default:
            // parse error: absence-of-digits-in-numeric-character-reference
            reconsume(c)
            state = .flushingTemporaryBuffer(returnState!)
            return next()
        }
    }

    mutating func tokenizeHexadecimalCharacterReference() -> Token? {
        let c = nextChar()
        switch c {
        case .some("0"..."9"), .some("a"..."f"), .some("A"..."F"):
            characterReferenceCode = (characterReferenceCode! * 16) + UInt32(c!.hexDigitValue!)
            return tokenizeHexadecimalCharacterReference()
        case ";":
            state = .numericCharacterReferenceEnd
            return tokenizeNumericCharacterReferenceEnd()
        case let c:
            // parse error: missing-semicolon-after-character-reference
            reconsume(c)
            state = .numericCharacterReferenceEnd
            return tokenizeNumericCharacterReferenceEnd()
        }
    }

    mutating func tokenizeDecimalCharacterReferenceStart() -> Token? {
        let c = nextChar()
        if let c,
           c.isASCII && c.isNumber {
            reconsume(c)
            state = .decimalCharacterReference
            return tokenizeDecimalCharacterReference()
        } else {
            // parse error: absence-of-digits-in-numeric-character-reference
            reconsume(c)
            state = returnState!
            return next()
        }
    }

    mutating func tokenizeDecimalCharacterReference() -> Token? {
        let c = nextChar()
        switch c {
        case .some("0"..."9"):
            characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.hexDigitValue!)
            return tokenizeDecimalCharacterReference()
        case ";":
            state = .numericCharacterReferenceEnd
            return tokenizeNumericCharacterReferenceEnd()
        default:
            // if nil, parse error: missing-semicolon-after-character-reference
            reconsume(c)
            state = .numericCharacterReferenceEnd
            return tokenizeNumericCharacterReferenceEnd()
        }
    }

    mutating func tokenizeAmbiguousAmpersand() -> Token? {
        let c = nextChar()
        switch c {
        case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
            if case .attributeValue(_) = returnState {
                currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c!)
                return tokenizeAmbiguousAmpersand()
            } else {
                return .character(c!)
            }
        default:
            // if c == ";", parse error: unknown-named-character-reference
            reconsume(c)
            state = returnState!
            return next()
        }
    }

    mutating func tokenizeTagOpen() -> Token? {
        let c = nextChar()
        switch c {
        case "!":
            state = .markupDeclarationOpen
            return tokenizeMarkupDeclarationOpen()
        case "/":
            state = .endTagOpen
            return tokenizeEndTagOpen()
        case "?":
            // parse error: unexpected-question-mark-instead-of-tag-name
            currentComment = ""
            reconsume(c)
            state = .bogusComment
            return tokenizeBogusComment()
        case nil:
            // parse error: eof-before-tag-name
            state = .endOfFile
            return .character("<")
        case .some("a"..."z"), .some("A"..."Z"):
            currentStartTag = ("", selfClosing: false, attributes: [])
            reconsume(c)
            state = .tagName
            return tokenizeTagName()
        case .some(_):
            // parse error: invalid-first-character-of-tag-name
            reconsume(c)
            state = .data
            return .character("<")
        }
    }

    mutating func tokenizeEndTagOpen() -> Token? {
        let c = nextChar()
        switch c {
        case .some("a"..."z"), .some("A"..."Z"):
            currentEndTag = ""
            reconsume(c)
            state = .tagName
            return tokenizeTagName()
        case ">":
            // parse error: missing-end-tag-name
            state = .data
            return tokenizeData()
        case nil:
            // parse error: eof-before-tag-name
            state = .emitTokens([.character("/")], .endOfFile)
            return .character("<")
        case .some(let c):
            // parse error: invalid-first-character-of-tag-name
            currentComment = ""
            reconsume(c)
            state = .bogusComment
            return tokenizeBogusComment()
        }
    }

    mutating func tokenizeTagName() -> Token? {
        // Optimization: this is a hot path where we stay in this state for a while before emitting a token,
        // and the function call overhead of recursion costs a bit of perf.
        while true {
            switch nextChar() {
            case "\t", "\n", "\u{000C}", " ":
                state = .beforeAttributeName
                return tokenizeBeforeAttributeName()
            case "/":
                state = .selfClosingStartTag
                return tokenizeSelfClosingStartTag()
            case ">":
                state = .data
                return takeCurrentToken()
            case nil:
                // parse error: eof-in-tag
                state = .endOfFile
                return nil
            case .some(var c):
                if c == "\0" {
                    // parse error: unexpected-null-character
                    c = "\u{FFFD}"
                } else if ("A"..."Z").contains(c) {
                    c = c.asciiLowercase
                }
                if currentStartTag != nil {
                    currentStartTag!.0.unicodeScalars.append(c)
                    continue
                } else if currentEndTag != nil {
                    currentEndTag!.unicodeScalars.append(c)
                    continue
                } else {
                    fatalError("bad current token")
                }
            }
        }
    }

    mutating func tokenizeSelfClosingStartTag() -> Token? {
        switch nextChar() {
        case ">":
            // end tags have no self-closing flag; avoid force-unwrapping on e.g. "</x/>"
            if currentStartTag != nil {
                currentStartTag!.selfClosing = true
            }
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-tag
            state = .endOfFile
            return nil
        case .some(let c):
            // parse error: unexpected-solidus-in-tag
            reconsume(c)
            state = .beforeAttributeName
            return tokenizeBeforeAttributeName()
        }
    }

    mutating func tokenizeBeforeAttributeName() -> Token? {
        let c = nextChar()
        switch c {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeBeforeAttributeName()
        case "/", ">", nil:
            reconsume(c)
            state = .afterAttributeName
            return tokenizeAfterAttributeName()
        case "=":
            // parse error: unexpected-equals-sign-before-attribute-name
            currentStartTag!.attributes.append(Attribute(name: "=", value: ""))
            state = .attributeName
            return tokenizeAttributeName()
        default:
            if currentStartTag != nil {
                currentStartTag!.attributes.append(Attribute(name: "", value: ""))
                reconsume(c)
                state = .attributeName
                return tokenizeAttributeName()
            } else if currentEndTag != nil {
                // ignore
                reconsume(c)
                state = .attributeName
                return tokenizeAttributeName()
            } else {
                fatalError("bad current token")
            }
        }
    }

    mutating func tokenizeAttributeName() -> Token? {
        while true {
            let c = nextChar()
            switch c {
            case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
                reconsume(c)
                state = .afterAttributeName
                return tokenizeAfterAttributeName()
            case "=":
                state = .beforeAttributeValue
                return tokenizeBeforeAttributeValue()
            case .some(var c):
                if ("A"..."Z").contains(c) {
                    c = c.asciiLowercase
                }
                // if null, parse error: unexpected-null-character
                if c == "\0" {
                    c = "\u{FFFD}"
                }
                // if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
                if currentStartTag != nil {
                    currentStartTag!.attributes.uncheckedLast.name.unicodeScalars.append(c)
                    continue
                } else if currentEndTag != nil {
                    continue
                } else {
                    fatalError("bad current token")
                }
            }
        }
    }

    mutating func tokenizeAfterAttributeName() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeAfterAttributeName()
        case "/":
            state = .selfClosingStartTag
            return tokenizeSelfClosingStartTag()
        case "=":
            state = .beforeAttributeValue
            return tokenizeBeforeAttributeValue()
        case ">":
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-tag
            state = .endOfFile
            return nil
        case .some(let c):
            if currentStartTag != nil {
                currentStartTag!.attributes.append(Attribute(name: "", value: ""))
                reconsume(c)
                state = .attributeName
                return tokenizeAttributeName()
            } else if currentEndTag != nil {
                reconsume(c)
                state = .attributeName
                return tokenizeAttributeName()
            } else {
                fatalError("bad current token")
            }
        }
    }

    mutating func tokenizeBeforeAttributeValue() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeBeforeAttributeValue()
        case "\"":
            state = .attributeValue(.doubleQuoted)
            return tokenizeAttributeValue(quotes: .doubleQuoted)
        case "'":
            state = .attributeValue(.singleQuoted)
            return tokenizeAttributeValue(quotes: .singleQuoted)
        case ">":
            // parse error: missing-attribute-value
            state = .data
            return takeCurrentToken()
        case let c:
            reconsume(c)
            state = .attributeValue(.unquoted)
            return tokenizeAttributeValue(quotes: .unquoted)
        }
    }

    mutating func tokenizeAttributeValue(quotes: AttributeValueQuotation) -> Token? {
        while true {
            if quotes == .unquoted {
                switch nextChar() {
                case "\t", "\n", "\u{000C}", " ":
                    state = .beforeAttributeName
                    return tokenizeBeforeAttributeName()
                case "&":
                    returnState = .attributeValue(.unquoted)
                    state = .characterReference
                    return tokenizeCharacterReference()
                case ">":
                    state = .data
                    return takeCurrentToken()
                case nil:
                    // parse error: eof-in-tag
                    state = .endOfFile
                    return nil
                case .some(let c):
                    // if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
                    if currentStartTag != nil {
                        currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
                        continue
                    } else if currentEndTag != nil {
                        continue
                    } else {
                        fatalError("bad current token")
                    }
                }
            } else {
                let c = nextChar()
                switch c {
                case "\"" where quotes == .doubleQuoted:
                    state = .afterAttributeValueQuoted
                    return tokenizeAfterAttributeValueQuoted()
                case "'" where quotes == .singleQuoted:
                    state = .afterAttributeValueQuoted
                    return tokenizeAfterAttributeValueQuoted()
                case "&":
                    returnState = .attributeValue(quotes)
                    state = .characterReference
                    return tokenizeCharacterReference()
                case nil:
                    // parse error: eof-in-tag
                    state = .endOfFile
                    return nil
                case .some(var c):
                    if c == "\0" {
                        // parse error: unexpected-null-character
                        c = "\u{FFFD}"
                    }
                    if currentStartTag != nil {
                        currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
                        continue
                    } else if currentEndTag != nil {
                        continue
                    }
                }
            }
        }
    }

    mutating func tokenizeAfterAttributeValueQuoted() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            state = .beforeAttributeName
            return tokenizeBeforeAttributeName()
        case "/":
            state = .selfClosingStartTag
            return tokenizeSelfClosingStartTag()
        case ">":
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-tag
            state = .endOfFile
            return nil
        case .some(let c):
            // parse error: missing-whitespace-between-attributes
            reconsume(c)
            state = .beforeAttributeName
            return tokenizeBeforeAttributeName()
        }
    }

    mutating func tokenizeBogusComment() -> Token? {
        switch nextChar() {
        case ">":
            state = .data
            return takeCurrentToken()
        case nil:
            state = .endOfFile
            return takeCurrentToken()
        case .some(var c):
            if c == "\0" {
                // parse error: unexpected-null-character
                c = "\u{FFFD}"
            }
            currentComment!.unicodeScalars.append(c)
            return tokenizeBogusComment()
        }
    }

    mutating func tokenizeMarkupDeclarationOpen() -> Token? {
        let peeked = peek(count: 7)
        if peeked.starts(with: "--") {
            consume(count: 2)
            currentComment = ""
            state = .commentStart
            return tokenizeCommentStart()
        } else if peeked.lowercased() == "doctype" {
            consume(count: 7)
            state = .doctype
            return tokenizeDoctype()
        } else if peeked == "[CDATA[" {
            // TODO: we don't do any of the tree construction stuff yet, so can't really handle this
            // consume(count: 7)
            currentComment = ""
            state = .bogusComment
            return tokenizeBogusComment()
        } else {
            // parse error: incorrectly-opened-comment
            currentComment = ""
            state = .bogusComment
            return tokenizeBogusComment()
        }
    }

    mutating func tokenizeCommentStart() -> Token? {
        switch nextChar() {
        case "-":
            state = .commentStartDash
            return tokenizeCommentStartDash()
        case ">":
            // parse error: abrupt-closing-of-empty-comment
            state = .data
            return takeCurrentToken()
        case let c:
            reconsume(c)
            state = .comment
            return tokenizeComment()
        }
    }

    mutating func tokenizeCommentStartDash() -> Token? {
        switch nextChar() {
        case "-":
            state = .commentEnd
            return tokenizeCommentEnd()
        case ">":
            // parse error: abrupt-closing-of-empty-comment
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-comment
            state = .endOfFile
            return takeCurrentToken()
        case .some(let c):
            currentComment!.append("-")
            reconsume(c)
            state = .comment
            return tokenizeComment()
        }
    }

    mutating func tokenizeComment() -> Token? {
        switch nextChar() {
        case "<":
            currentComment!.append("<")
            state = .commentLessThanSign
            return tokenizeCommentLessThanSign()
        case "-":
            state = .commentEndDash
            return tokenizeCommentEndDash()
        case nil:
            // parse error: eof-in-comment
            state = .endOfFile
            return takeCurrentToken()
        case .some(var c):
            if c == "\0" {
                // parse error: unexpected-null-character
                c = "\u{FFFD}"
            }
            currentComment!.unicodeScalars.append(c)
            return tokenizeComment()
        }
    }

    mutating func tokenizeCommentLessThanSign() -> Token? {
        switch nextChar() {
        case "!":
            currentComment!.append("!")
            state = .commentLessThanSignBang
            return tokenizeCommentLessThanSignBang()
        case "<":
            currentComment!.append("<")
            return tokenizeCommentLessThanSign()
        case let c:
            reconsume(c)
            state = .comment
            return tokenizeComment()
        }
    }

    mutating func tokenizeCommentLessThanSignBang() -> Token? {
        switch nextChar() {
        case "-":
            state = .commentLessThanSignBangDash
            return tokenizeCommentLessThanSignBangDash()
        case let c:
            reconsume(c)
            state = .comment
            return tokenizeComment()
        }
    }

    mutating func tokenizeCommentLessThanSignBangDash() -> Token? {
        switch nextChar() {
        case "-":
            state = .commentLessThanSignBangDashDash
            return tokenizeCommentLessThanSignBangDashDash()
        case let c:
            reconsume(c)
            state = .commentEndDash
            return tokenizeCommentEndDash()
        }
    }

    mutating func tokenizeCommentLessThanSignBangDashDash() -> Token? {
        let c = nextChar()
        switch c {
        case ">", nil:
            reconsume(c)
            state = .commentEnd
            return tokenizeCommentEnd()
        default:
            // parse error: nested-comment
            reconsume(c)
            state = .commentEnd
            return tokenizeCommentEnd()
        }
    }

    mutating func tokenizeCommentEndDash() -> Token? {
        switch nextChar() {
        case "-":
            state = .commentEnd
            return tokenizeCommentEnd()
        case nil:
            // parse error: eof-in-comment
            state = .endOfFile
            return takeCurrentToken()
        case let c:
            currentComment!.append("-")
            reconsume(c)
            state = .comment
            return tokenizeComment()
        }
    }

    mutating func tokenizeCommentEnd() -> Token? {
        switch nextChar() {
        case ">":
            state = .data
            return takeCurrentToken()
        case "!":
            state = .commentEndBang
            return tokenizeCommentEndBang()
        case "-":
            currentComment!.append("-")
            return tokenizeCommentEnd()
        case nil:
            // parse error: eof-in-comment
            state = .endOfFile
            return takeCurrentToken()
        case .some(let c):
            currentComment!.append("--")
            reconsume(c)
            state = .comment
            return tokenizeComment()
        }
    }

    mutating func tokenizeCommentEndBang() -> Token? {
        switch nextChar() {
        case "-":
            currentComment!.append("--!")
            state = .commentEndDash
            return tokenizeCommentEndDash()
        case ">":
            // parse error: incorrectly-closed-comment
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-comment
            state = .endOfFile
            return takeCurrentToken()
        case .some(let c):
            currentComment!.append("--!")
            reconsume(c)
            state = .comment
            return tokenizeComment()
        }
    }

    mutating func tokenizeDoctype() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            state = .beforeDoctypeName
            return tokenizeBeforeDoctypeName()
        case ">":
            reconsume(">")
            state = .beforeDoctypeName
            return tokenizeBeforeDoctypeName()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            return .doctype("", forceQuirks: true, publicIdentifier: nil, systemIdentifier: nil)
        case .some(let c):
            // parse error: missing-whitespace-before-doctype-name
            reconsume(c)
            state = .beforeDoctypeName
            return tokenizeBeforeDoctypeName()
        }
    }

    mutating func tokenizeBeforeDoctypeName() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeBeforeDoctypeName()
        case .some(let c) where ("A"..."Z").contains(c):
            currentDoctype = ("\(c.asciiLowercase)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
            state = .doctypeName
            return tokenizeDoctypeName()
        case "\0":
            // parse error: unexpected-null-character
            currentDoctype = ("\u{FFFD}", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
            state = .doctypeName
            return tokenizeDoctypeName()
        case ">":
            // parse error: missing-doctype-name
            state = .data
            return .doctype("", forceQuirks: true, publicIdentifier: nil, systemIdentifier: nil)
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            return .doctype("", forceQuirks: true, publicIdentifier: nil, systemIdentifier: nil)
        case .some(let c):
            currentDoctype = ("\(c)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
            state = .doctypeName
            return tokenizeDoctypeName()
        }
    }

    mutating func tokenizeDoctypeName() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            state = .afterDoctypeName
            return tokenizeAfterDoctypeName()
        case ">":
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(var c):
            if c == "\0" {
                c = "\u{FFFD}"
            } else if ("A"..."Z").contains(c) {
                c = c.asciiLowercase
            }
            currentDoctype!.0.unicodeScalars.append(c)
            return tokenizeDoctypeName()
        }
    }

    mutating func tokenizeAfterDoctypeName() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeAfterDoctypeName()
        case ">":
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(let c):
            reconsume(c)
            let peeked = peek(count: 6).lowercased()
            if peeked == "public" {
                consume(count: 6)
                state = .afterDoctypePublicKeyword
                return tokenizeAfterDoctypePublicKeyword()
            } else if peeked == "system" {
                consume(count: 6)
                state = .afterDoctypeSystemKeyword
                return tokenizeAfterDoctypeSystemKeyword()
            } else {
                // parse error: invalid-character-sequence-after-doctype-name
                currentDoctype!.forceQuirks = true
                state = .bogusDoctype
                return tokenizeBogusDoctype()
            }
        }
    }

    mutating func tokenizeAfterDoctypePublicKeyword() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            state = .beforeDoctypePublicIdentifier
            return tokenizeBeforeDoctypePublicIdentifier()
        case .some(let c) where c == "\"" || c == "'":
            // parse error: missing-whitespace-after-doctype-public-keyword
            currentDoctype!.publicIdentifier = ""
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypePublicIdentifier(quotes)
            return tokenizeDoctypePublicIdentifier(quotes: quotes)
        case ">":
            // parse error: missing-doctype-public-identifier
            state = .data
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(let c):
            // parse error: missing-quote-before-doctype-public-identifier
            currentDoctype!.forceQuirks = true
            state = .bogusDoctype
            reconsume(c)
            return tokenizeBogusDoctype()
        }
    }

    mutating func tokenizeBeforeDoctypePublicIdentifier() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeBeforeDoctypePublicIdentifier()
        case .some(let c) where c == "\"" || c == "'":
            currentDoctype!.publicIdentifier = ""
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypePublicIdentifier(quotes)
            return tokenizeDoctypePublicIdentifier(quotes: quotes)
        case ">":
            // parse error: missing-doctype-public-identifier
            state = .data
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(let c):
            // parse error: missing-quote-before-doctype-public-identifier
            currentDoctype!.forceQuirks = true
            reconsume(c)
            state = .bogusDoctype
            return tokenizeBogusDoctype()
        }
    }

    mutating func tokenizeDoctypePublicIdentifier(quotes: DoctypeIdentifierQuotation) -> Token? {
        switch nextChar() {
        case "\"" where quotes == .doubleQuoted:
            state = .afterDoctypePublicIdentifier
            return tokenizeAfterDoctypePublicIdentifier()
        case "'" where quotes == .singleQuoted:
            state = .afterDoctypePublicIdentifier
            return tokenizeAfterDoctypePublicIdentifier()
        case ">":
            // parse error: abrupt-doctype-public-identifier
            // the ">" is consumed, matching the abrupt system identifier case below
            state = .data
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(var c):
            if c == "\0" {
                // parse error: unexpected-null-character
                c = "\u{FFFD}"
            }
            currentDoctype!.publicIdentifier!.unicodeScalars.append(c)
            return tokenizeDoctypePublicIdentifier(quotes: quotes)
        }
    }

    mutating func tokenizeAfterDoctypePublicIdentifier() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            state = .betweenDoctypePublicAndSystemIdentifiers
            return tokenizeBetweenDoctypePublicAndSystemIdentifiers()
        case ">":
            state = .data
            return takeCurrentToken()
        case .some(let c) where c == "\"" || c == "'":
            // parse error: missing-whitespace-between-doctype-public-and-system-identifiers
            currentDoctype!.systemIdentifier = ""
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypeSystemIdentifier(quotes)
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(let c):
            // parse error: missing-quote-before-doctype-system-identifier
            currentDoctype!.forceQuirks = true
            reconsume(c)
            state = .bogusDoctype
            return tokenizeBogusDoctype()
        }
    }

    mutating func tokenizeBetweenDoctypePublicAndSystemIdentifiers() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeBetweenDoctypePublicAndSystemIdentifiers()
        case ">":
            state = .data
            return takeCurrentToken()
        case .some(let c) where c == "\"" || c == "'":
            currentDoctype!.systemIdentifier = ""
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypeSystemIdentifier(quotes)
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(let c):
            // parse error: missing-quote-before-doctype-system-identifier
            currentDoctype!.forceQuirks = true
            reconsume(c)
            state = .bogusDoctype
            return tokenizeBogusDoctype()
        }
    }

    mutating func tokenizeAfterDoctypeSystemKeyword() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            state = .beforeDoctypeSystemIdentifier
            return tokenizeBeforeDoctypeSystemIdentifier()
        case .some(let c) where c == "\"" || c == "'":
            currentDoctype!.systemIdentifier = ""
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypeSystemIdentifier(quotes)
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        case ">":
            // parse error: missing-doctype-system-identifier
            state = .data
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(let c):
            // parse error: missing-quote-before-doctype-system-identifier
            currentDoctype!.forceQuirks = true
            reconsume(c)
            state = .bogusDoctype
            return tokenizeBogusDoctype()
        }
    }

    mutating func tokenizeBeforeDoctypeSystemIdentifier() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeBeforeDoctypeSystemIdentifier()
        case .some(let c) where c == "\"" || c == "'":
            currentDoctype!.systemIdentifier = ""
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypeSystemIdentifier(quotes)
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        case ">":
            // parse error: missing-doctype-system-identifier
            state = .data
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(let c):
            // parse error: missing-quote-before-doctype-system-identifier
            currentDoctype!.forceQuirks = true
            reconsume(c)
            state = .bogusDoctype
            return tokenizeBogusDoctype()
        }
    }

    mutating func tokenizeDoctypeSystemIdentifier(quotes: DoctypeIdentifierQuotation) -> Token? {
        switch nextChar() {
        case "\"" where quotes == .doubleQuoted:
            state = .afterDoctypeSystemIdentifier
            return tokenizeAfterDoctypeSystemIdentifier()
        case "'" where quotes == .singleQuoted:
            state = .afterDoctypeSystemIdentifier
            return tokenizeAfterDoctypeSystemIdentifier()
        case ">":
            // parse error: abrupt-doctype-system-identifier
            state = .data
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(var c):
            if c == "\0" {
                // parse error: unexpected-null-character
                c = "\u{FFFD}"
            }
            currentDoctype!.systemIdentifier!.unicodeScalars.append(c)
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        }
    }

    mutating func tokenizeAfterDoctypeSystemIdentifier() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeAfterDoctypeSystemIdentifier()
        case ">":
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-doctype
            state = .endOfFile
            currentDoctype!.forceQuirks = true
            return takeCurrentToken()
        case .some(let c):
            // parse error: unexpected-character-after-doctype-system-identifier
            // Note: This does not set the current DOCTYPE token's force-quirks flag to on.
            reconsume(c)
            state = .bogusDoctype
            return tokenizeBogusDoctype()
        }
    }

    mutating func tokenizeBogusDoctype() -> Token? {
        switch nextChar() {
        case ">":
            state = .data
            return takeCurrentToken()
        case "\0":
            // parse error: unexpected-null-character, ignore the character
            return tokenizeBogusDoctype()
        case nil:
            state = .endOfFile
            return takeCurrentToken()
        case _:
            // ignore the character
            return tokenizeBogusDoctype()
        }
    }
}

private extension Array {
    // Optimization: allows in-place modification of the last element of the array.
    var uncheckedLast: Element {
        _read {
            yield self[count - 1]
        }
        _modify {
            yield &self[count - 1]
        }
    }
}
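
// Added note: the _modify accessor yields the element inout, so a call like
//
//     currentStartTag!.attributes.uncheckedLast.value.append("x")
//
// mutates the stored String in place rather than copying it out and writing it
// back. Array.last can't serve here: it is get-only and returns an Optional.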

private extension Unicode.Scalar {
    var asciiLowercase: Unicode.Scalar {
        assert(("A"..."Z").contains(self))
        return Unicode.Scalar(value + 0x20)!
    }

    var hexDigitValue: Int? {
        switch self {
        case "0": 0
        case "1": 1
        case "2": 2
        case "3": 3
        case "4": 4
        case "5": 5
        case "6": 6
        case "7": 7
        case "8": 8
        case "9": 9
        case "A", "a": 0xA
        case "B", "b": 0xB
        case "C", "c": 0xC
        case "D", "d": 0xD
        case "E", "e": 0xE
        case "F", "f": 0xF
        default: nil
        }
    }

    var isNumber: Bool {
        ("0"..."9").contains(self)
    }
}