1693 lines
63 KiB
Swift
1693 lines
63 KiB
Swift
//
|
|
// Tokenizer.swift
|
|
// HTMLStreamer
|
|
//
|
|
// Created by Shadowfacts on 11/22/23.
|
|
//
|
|
|
|
import Foundation
|
|
|
|
/// A streaming HTML tokenizer implementing (a supported subset of) the WHATWG
/// HTML tokenization state machine. Input characters are consumed lazily from
/// the wrapped iterator; each call to `next()` reads just enough to produce
/// one `Token`.
struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
    typealias Element = Token

    /// The underlying character source.
    private var chars: Chars
    /// Characters read from `chars` but pushed back to be read again.
    /// The last element is the next character `nextChar()` will return.
    private var reconsumeStack: InlineArray3<Character> = []
    /// Current state of the tokenization state machine.
    private var state = State.data
    /// The state to resume after a character reference finishes; set before
    /// entering `.characterReference`.
    private var returnState: State?
    /// Scratch buffer of raw characters consumed while scanning a character
    /// reference, kept so they can be re-emitted verbatim if it fails to parse.
    private var temporaryBuffer: String?
    /// Accumulated code point value of a numeric character reference.
    private var characterReferenceCode: UInt32?
    /// The multi-character token (tag/comment/doctype) currently being built.
    private var currentToken: Token?

    init(chars: Chars) {
        self.chars = chars
    }

    /// Returns the next token, or nil once the input is exhausted.
    /// Dispatches to the per-state tokenize method for the current state.
    mutating func next() -> Token? {
        switch state {
        // Internal state: emit the buffered characters one at a time, then
        // resume the wrapped state.
        case .flushingTemporaryBuffer(let returnState):
            if temporaryBuffer == nil || temporaryBuffer!.isEmpty {
                state = returnState
                return next()
            } else {
                return .character(temporaryBuffer!.removeFirst())
            }
        case .endOfFile:
            return nil
        // Internal state: drain a queue of already-built tokens, then resume.
        case .emitTokens(var tokens, let nextState):
            if tokens.isEmpty {
                state = nextState
                return next()
            } else {
                let tok = tokens.removeFirst()
                state = .emitTokens(tokens, nextState)
                return tok
            }

        // Spec-defined states, one method per state.
        case .data:
            return tokenizeData()
        case .characterReference:
            return tokenizeCharacterReference()
        case .namedCharacterReference:
            return tokenizeNamedCharaterReference()
        case .numericCharacterReference:
            return tokenizeNumericCharacterReference()
        case .numericCharacterReferenceEnd:
            return tokenizeNumericCharacterReferenceEnd()
        case .hexadecimalCharacterReferenceStart:
            return tokenizeHexadecimalCharacterReferenceStart()
        case .hexadecimalCharacterReference:
            return tokenizeHexadecimalCharacterReference()
        case .decimalCharacterReferenceStart:
            return tokenizeDecimalCharacterReferenceStart()
        case .decimalCharacterReference:
            return tokenizeDecimalCharacterReference()
        case .ambiguousAmpersand:
            return tokenizeAmbiguousAmpersand()
        case .tagOpen:
            return tokenizeTagOpen()
        case .endTagOpen:
            return tokenizeEndTagOpen()
        case .tagName:
            return tokenizeTagName()
        case .selfClosingStartTag:
            return tokenizeSelfClosingStartTag()
        case .beforeAttributeName:
            return tokenizeBeforeAttributeName()
        case .attributeName:
            return tokenizeAttributeName()
        case .afterAttributeName:
            return tokenizeAfterAttributeName()
        case .beforeAttributeValue:
            return tokenizeBeforeAttributeValue()
        case .attributeValue(let quotes):
            return tokenizeAttributeValue(quotes: quotes)
        case .afterAttributeValueQuoted:
            return tokenizeAfterAttributeValueQuoted()
        case .bogusComment:
            return tokenizeBogusComment()
        case .markupDeclarationOpen:
            return tokenizeMarkupDeclarationOpen()
        case .commentStart:
            return tokenizeCommentStart()
        case .commentStartDash:
            return tokenizeCommentStartDash()
        case .comment:
            return tokenizeComment()
        case .commentLessThanSign:
            return tokenizeCommentLessThanSign()
        case .commentLessThanSignBang:
            return tokenizeCommentLessThanSignBang()
        case .commentLessThanSignBangDash:
            return tokenizeCommentLessThanSignBangDash()
        case .commentLessThanSignBangDashDash:
            return tokenizeCommentLessThanSignBangDashDash()
        case .commentEndDash:
            return tokenizeCommentEndDash()
        case .commentEnd:
            return tokenizeCommentEnd()
        case .commentEndBang:
            return tokenizeCommentEndBang()
        case .doctype:
            return tokenizeDoctype()
        case .beforeDoctypeName:
            return tokenizeBeforeDoctypeName()
        case .doctypeName:
            return tokenizeDoctypeName()
        case .afterDoctypeName:
            return tokenizeAfterDoctypeName()
        case .afterDoctypePublicKeyword:
            return tokenizeAfterDoctypePublicKeyword()
        case .beforeDoctypePublicIdentifier:
            return tokenizeBeforeDoctypePublicIdentifier()
        case .doctypePublicIdentifier(let quotes):
            return tokenizeDoctypePublicIdentifier(quotes: quotes)
        case .afterDoctypePublicIdentifier:
            return tokenizeAfterDoctypePublicIdentifier()
        case .betweenDoctypePublicAndSystemIdentifiers:
            return tokenizeBetweenDoctypePublicAndSystemIdentifiers()
        case .afterDoctypeSystemKeyword:
            return tokenizeAfterDoctypeSystemKeyword()
        case .beforeDoctypeSystemIdentifier:
            return tokenizeBeforeDoctypeSystemIdentifier()
        case .doctypeSystemIdentifier(let quotes):
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        case .afterDoctypeSystemIdentifier:
            return tokenizeAfterDoctypeSystemIdentifier()
        case .bogusDoctype:
            return tokenizeBogusDoctype()
        }
    }

    /// Pushes `c` back so the next `nextChar()` returns it again.
    /// Passing nil is a no-op.
    private mutating func reconsume(_ c: Character?) {
        if let c {
            reconsumeStack.append(c)
        }
    }

    /// Returns the next input character, preferring pushed-back characters.
    private mutating func nextChar() -> Character? {
        if !reconsumeStack.isEmpty {
            return reconsumeStack.removeLast()
        } else {
            return chars.next()
        }
    }

    /// Returns the next character without consuming it (nil at end of input).
    private mutating func peekChar() -> Character? {
        if let nextToReconsume = reconsumeStack.last {
            return nextToReconsume
        } else {
            let c = chars.next()
            if let c {
                reconsume(c)
            }
            return c
        }
    }

    // TODO: extract this all out into a standalone type and test it separately
    /// Returns up to `count` upcoming characters without consuming them
    /// (shorter if the input ends first).
    private mutating func peek(count: Int) -> String {
        precondition(count >= 0)
        var buf = ""
        for _ in 0..<count {
            if let c = nextChar() {
                buf.append(c)
            } else {
                break
            }
        }
        // Push back in reverse so they pop off the stack in original order.
        reconsumeStack.append(contentsOf: buf.reversed())
        return buf
    }

    /// Discards the next `count` characters (fewer if the input ends first).
    private mutating func consume(count: Int) {
        precondition(count >= 0)
        for _ in 0..<count {
            _ = nextChar()
        }
    }

    /// Returns `currentToken` and clears it.
    /// Traps if no token is currently being built.
    private mutating func takeCurrentToken() -> Token {
        defer { currentToken = nil }
        return currentToken!
    }
}
|
|
|
|
/// A single unit of tokenizer output.
enum Token: Equatable {
    /// One character of text content.
    case character(Character)
    /// A comment's text, without the "<!--"/"-->" delimiters.
    case comment(String)
    /// A start tag: name (lowercased by the tokenizer), self-closing flag,
    /// and attributes in source order.
    case startTag(String, selfClosing: Bool, attributes: InlineArray3<Attribute>)
    /// An end tag with its (lowercased) name. Attributes written on end tags
    /// are dropped by the tokenizer.
    case endTag(String)
    /// A doctype declaration: name, force-quirks flag, and the optional
    /// public/system identifiers.
    case doctype(String, forceQuirks: Bool, publicIdentifier: String?, systemIdentifier: String?)
}
|
|
|
|
/// A single name/value pair from a start tag.
struct Attribute: Equatable {
    // Lowercased by the tokenizer while parsing the attribute name.
    var name: String
    // Empty when the attribute is written without a value.
    var value: String
}
|
|
|
|
/// Tokenizer states. Most correspond directly to states in the WHATWG HTML
/// tokenization spec; the first three are internal bookkeeping states used to
/// emit buffered output across multiple `next()` calls.
private enum State {
    // Internal states used by the tokenizer
    /// Emit `temporaryBuffer` one character per `next()` call, then enter the
    /// wrapped state.
    indirect case flushingTemporaryBuffer(State)
    /// Terminal state: `next()` returns nil forever.
    case endOfFile
    /// Emit the queued tokens in order, then enter the wrapped state.
    indirect case emitTokens([Token], State)

    // States defined by the spec
    case data
    // RCDATA not currently supported
    // case rcdata
    // RAWTEXT not currently supported
    // case rawtext
    // script tag not currently supported
    // case scriptData
    // plaintext tag not currently supported
    // case plaintext
    case tagOpen
    case endTagOpen
    case tagName
    // RCDATA not currently supported
    // case rcdataLessThanSign
    // case rcdataEndTagOpen
    // case rcdataEndTagName
    // RAWTEXT not currently supported
    // case rawtextLessThanSign
    // case rawtextEndTagOpen
    // case rawtextEndTagName
    // script not currently supported
    // case scriptDataLessThanSign
    // case scriptDataEndTagOpen
    // case scriptDataEndTagName
    // case scriptDataEscapeStart
    // case scriptDataEscapeStartDash
    // case scriptDataEscaped
    // case scriptDataEscapedDash
    // case scriptDataEscapedDashDash
    // case scriptDataEscapedLessThanSign
    // case scriptDataEscapedEndTagOpen
    // case scriptDataEscapedEndTagName
    // case scriptDataDoubleEscapeStart
    // case scriptDataDoubleEscaped
    // case scriptDataDoubleEscapedDash
    // case scriptDataDoubleEscapedDashDash
    // case scriptDataDoubleEscapedLessThanSign
    // case scriptDataDoubleEscapeEnd
    case beforeAttributeName
    case attributeName
    case afterAttributeName
    case beforeAttributeValue
    case attributeValue(AttributeValueQuotation)
    case afterAttributeValueQuoted
    case selfClosingStartTag
    case bogusComment
    case markupDeclarationOpen
    case commentStart
    case commentStartDash
    case comment
    case commentLessThanSign
    case commentLessThanSignBang
    case commentLessThanSignBangDash
    case commentLessThanSignBangDashDash
    case commentEndDash
    case commentEnd
    case commentEndBang
    case doctype
    case beforeDoctypeName
    case doctypeName
    case afterDoctypeName
    case afterDoctypePublicKeyword
    case beforeDoctypePublicIdentifier
    case doctypePublicIdentifier(DoctypeIdentifierQuotation)
    case afterDoctypePublicIdentifier
    case betweenDoctypePublicAndSystemIdentifiers
    case afterDoctypeSystemKeyword
    case beforeDoctypeSystemIdentifier
    case doctypeSystemIdentifier(DoctypeIdentifierQuotation)
    case afterDoctypeSystemIdentifier
    case bogusDoctype
    // CDATA not currently supported
    // case cdataSection
    // case cdataSectionBracket
    // case cdataSectionEndState
    case characterReference
    case namedCharacterReference
    case ambiguousAmpersand
    case numericCharacterReference
    case hexadecimalCharacterReferenceStart
    case decimalCharacterReferenceStart
    case hexadecimalCharacterReference
    case decimalCharacterReference
    case numericCharacterReferenceEnd
}
|
|
|
|
/// How the attribute value currently being parsed is delimited.
private enum AttributeValueQuotation {
    case singleQuoted, doubleQuoted, unquoted
}
|
|
|
|
/// Quote style of a doctype public/system identifier (no unquoted form).
private enum DoctypeIdentifierQuotation {
    case singleQuoted, doubleQuoted
}
|
|
|
|
private extension Tokenizer {
|
|
mutating func tokenizeData() -> Token? {
|
|
switch nextChar() {
|
|
case "&":
|
|
returnState = .data
|
|
state = .characterReference
|
|
return tokenizeCharacterReference()
|
|
case "<":
|
|
state = .tagOpen
|
|
return tokenizeTagOpen()
|
|
case "\0":
|
|
return .character("\0")
|
|
case nil:
|
|
return nil // end of fil
|
|
case .some(let c):
|
|
return .character(c)
|
|
}
|
|
}
|
|
|
|
mutating func tokenizeCharacterReference() -> Token? {
|
|
temporaryBuffer = "&"
|
|
guard let c = nextChar() else {
|
|
reconsume(nil)
|
|
state = .flushingTemporaryBuffer(returnState!)
|
|
return next()
|
|
}
|
|
switch c {
|
|
case "a"..."z", "A"..."Z", "0"..."9":
|
|
reconsume(c)
|
|
state = .namedCharacterReference
|
|
return tokenizeNamedCharaterReference()
|
|
case "#":
|
|
temporaryBuffer!.append("#")
|
|
state = .numericCharacterReference
|
|
return tokenizeNumericCharacterReference()
|
|
default:
|
|
reconsume(c)
|
|
state = returnState!
|
|
return next()
|
|
}
|
|
}
|
|
|
|
    /// Named character reference state: greedily match the longest entity name
    /// (e.g. "amp;", "notin;") following the "&" already in `temporaryBuffer`.
    mutating func tokenizeNamedCharaterReference() -> Token? {
        // TODO: this could definitely be faster
        // maybe with a prefix tree for named characters
        var everHadMatch = false
        var outOfChars = false
        // True while the buffer (minus its leading "&") is still a prefix of
        // at least one known entity name.
        func hasMatch() -> Bool {
            let buf = temporaryBuffer!
            let key = buf[buf.index(after: buf.startIndex)...]
            return namedCharactersDecodeMap.keys.contains(where: { $0.starts(with: key) })
        }
        // Keep consuming characters while some entity name is still reachable.
        while hasMatch() {
            everHadMatch = true
            guard let char = nextChar() else {
                outOfChars = true
                break
            }
            temporaryBuffer!.append(char)
        }
        if everHadMatch {
            if !outOfChars {
                // the last character changed us from having a match to not
                reconsume(temporaryBuffer!.removeLast())
            }

            // Spec's historical attribute quirk: inside an attribute value, a
            // reference not ending in ";" that is followed by "=" or an ASCII
            // alphanumeric is treated as literal text rather than decoded.
            if case .attributeValue(_) = returnState,
               temporaryBuffer!.last != ";",
               let peeked = peekChar(),
               peeked == "=" || (peeked.isASCII && (peeked.isLetter || peeked.isNumber)) {
                state = .flushingTemporaryBuffer(returnState!)
            } else {
                // Temporarily append ";" so lookup uses the canonical key form.
                let insertSemicolon = temporaryBuffer!.last != ";"
                if insertSemicolon {
                    // parse error: missing-semicolon-after-character-reference
                    // Usually the parser behaves as if character reference is terminated by the U+003B (;) code point; however, there are some ambiguous cases in which the parser includes subsequent code points in the character reference.
                    temporaryBuffer!.append(";")
                }
                if let reference = namedCharactersDecodeMap[String(temporaryBuffer![temporaryBuffer!.index(after: temporaryBuffer!.startIndex)...])] {
                    // Replace the raw "&name;" with the decoded character(s)
                    // and deliver them to the right destination.
                    temporaryBuffer = "\(reference)"
                    flushCharacterReference()
                } else {
                    // No such entity: undo the synthetic ";" and emit the raw
                    // text, continuing in the ambiguous-ampersand state.
                    if insertSemicolon {
                        temporaryBuffer!.removeLast()
                    }
                    state = .flushingTemporaryBuffer(.ambiguousAmpersand)
                }
            }
        } else {
            // Nothing ever matched; emit the raw text as-is.
            state = .flushingTemporaryBuffer(.ambiguousAmpersand)
        }
        return next()
    }
|
|
|
|
mutating func flushCharacterReference() {
|
|
if case .attributeValue(_) = returnState {
|
|
if case .startTag(let s, let selfClosing, var attributes) = currentToken {
|
|
attributes[attributes.count - 1].value.append(temporaryBuffer!)
|
|
currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
|
|
temporaryBuffer = nil
|
|
state = returnState!
|
|
} else {
|
|
fatalError("bad current tag")
|
|
}
|
|
} else {
|
|
state = .flushingTemporaryBuffer(returnState!)
|
|
}
|
|
}
|
|
|
|
mutating func tokenizeNumericCharacterReference() -> Token? {
|
|
characterReferenceCode = 0
|
|
switch nextChar() {
|
|
case "x", "X":
|
|
temporaryBuffer!.append("x")
|
|
state = .hexadecimalCharacterReference
|
|
return tokenizeHexadecimalCharacterReference()
|
|
case let c:
|
|
reconsume(c)
|
|
state = .decimalCharacterReference
|
|
return tokenizeDecimalCharacterReference()
|
|
}
|
|
}
|
|
|
|
    /// Numeric character reference end state: validate the accumulated
    /// `characterReferenceCode`, remap illegal values, and flush the result.
    mutating func tokenizeNumericCharacterReferenceEnd() -> Token? {
        switch characterReferenceCode! {
        case 0:
            // parse error: null-character-reference
            characterReferenceCode = 0xFFFD
        case let c where c > 0x10FFFF:
            // parse error: character-reference-outside-unicode-range
            characterReferenceCode = 0xFFFD
        case 0xD800...0xDBFF, 0xDC00...0xDFFF: // leading and trailing surrogate ranges
            // parse error: surrogate-character-reference
            characterReferenceCode = 0xFFFD
        case let c where Unicode.Scalar(c) == nil:
            // parse error: noncharacter-character-reference
            // "The parser resolves such character references as-is."
            // TODO: idfk what that means
            // NOTE(review): this arm looks unreachable — values above 0x10FFFF
            // and surrogates are handled above, and every remaining value forms
            // a valid Unicode.Scalar. Also note temporaryBuffer is not flushed
            // here; confirm that is intended.
            characterReferenceCode = nil
            state = returnState!
            return next()
        case 0x0D, 0...0x1F /* C0 control */, 0x7F...0x9F:
            // parse error: control-character-reference
            // Windows-1252 compatibility table from the spec: remap C1 control
            // code points to the characters legacy content intended.
            characterReferenceCode = switch characterReferenceCode! {
            case 0x80: 0x20AC
            case 0x82: 0x201A
            case 0x83: 0x0192
            case 0x84: 0x201E
            case 0x85: 0x2026
            case 0x86: 0x2020
            case 0x87: 0x2021
            case 0x88: 0x02C6
            case 0x89: 0x2030
            case 0x8A: 0x0160
            case 0x8B: 0x2039
            case 0x8C: 0x0152
            case 0x8E: 0x017D
            case 0x91: 0x2018
            case 0x92: 0x2019
            case 0x93: 0x201C
            case 0x94: 0x201D
            case 0x95: 0x2022
            case 0x96: 0x2013
            case 0x97: 0x2014
            case 0x98: 0x02DC
            case 0x99: 0x2122
            case 0x9A: 0x0161
            case 0x9B: 0x203A
            case 0x9C: 0x0153
            case 0x9E: 0x017E
            case 0x9F: 0x0178
            case let c: c
            }
        default:
            break
        }
        // Replace the raw "&#…" text with the decoded character and deliver it.
        temporaryBuffer = ""
        if let c = Unicode.Scalar(characterReferenceCode!) {
            temporaryBuffer!.append(Character(c))
        }
        flushCharacterReference()
        return next()
    }
|
|
|
|
mutating func tokenizeHexadecimalCharacterReferenceStart() -> Token? {
|
|
let c = nextChar()
|
|
switch c {
|
|
case .some("0"..."9"), .some("a"..."f"), .some("A"..."F"):
|
|
reconsume(c)
|
|
state = .hexadecimalCharacterReference
|
|
return tokenizeHexadecimalCharacterReference()
|
|
default:
|
|
// parse error: absence-of-digits-in-numeric-character-reference
|
|
reconsume(c)
|
|
state = .flushingTemporaryBuffer(returnState!)
|
|
return next()
|
|
}
|
|
}
|
|
|
|
mutating func tokenizeHexadecimalCharacterReference() -> Token? {
|
|
let c = nextChar()
|
|
switch c {
|
|
case .some("0"..."9"), .some("a"..."f"), .some("A"..."F"):
|
|
characterReferenceCode = (characterReferenceCode! * 16) + UInt32(c!.hexDigitValue!)
|
|
return tokenizeHexadecimalCharacterReference()
|
|
case ";":
|
|
state = .numericCharacterReferenceEnd
|
|
return tokenizeNumericCharacterReferenceEnd()
|
|
case let c:
|
|
// parse error: missing-semicolon-after-character-reference
|
|
reconsume(c)
|
|
state = .numericCharacterReferenceEnd
|
|
return tokenizeNumericCharacterReferenceEnd()
|
|
}
|
|
}
|
|
|
|
mutating func tokenizeDecimalCharacterReferenceStart() -> Token? {
|
|
let c = nextChar()
|
|
if let c,
|
|
c.isASCII && c.isNumber {
|
|
reconsume(c)
|
|
state = .decimalCharacterReference
|
|
return tokenizeDecimalCharacterReference()
|
|
} else {
|
|
// parse error: absence-of-digits-in-numeric-character-reference
|
|
reconsume(c)
|
|
state = returnState!
|
|
return next()
|
|
}
|
|
}
|
|
|
|
mutating func tokenizeDecimalCharacterReference() -> Token? {
|
|
let c = nextChar()
|
|
switch c {
|
|
case .some("0"..."9"):
|
|
characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.wholeNumberValue!)
|
|
return tokenizeDecimalCharacterReference()
|
|
case ";":
|
|
state = .numericCharacterReferenceEnd
|
|
return tokenizeNumericCharacterReferenceEnd()
|
|
default:
|
|
// if nil, parse error: missing-semicolon-after-character-reference
|
|
reconsume(c)
|
|
state = .numericCharacterReferenceEnd
|
|
return tokenizeNumericCharacterReferenceEnd()
|
|
}
|
|
}
|
|
|
|
mutating func tokenizeAmbiguousAmpersand() -> Token? {
|
|
let c = nextChar()
|
|
switch c {
|
|
case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
|
|
if case .attributeValue(_) = returnState {
|
|
// TODO: append the current input character to the current attribute's value
|
|
if case .startTag(let s, let selfClosing, var attributes) = currentToken {
|
|
attributes[attributes.count - 1].value.append(c!)
|
|
currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
|
|
} else {
|
|
fatalError("bad current token")
|
|
}
|
|
return next()
|
|
} else {
|
|
return .character(c!)
|
|
}
|
|
default:
|
|
// if c == ";", parse error: unknown-named-character-reference
|
|
reconsume(c)
|
|
state = returnState!
|
|
return next()
|
|
}
|
|
}
|
|
|
|
    /// Tag open state: a "<" was just consumed in the data state.
    mutating func tokenizeTagOpen() -> Token? {
        let c = nextChar()
        switch c {
        case "!":
            state = .markupDeclarationOpen
            return tokenizeMarkupDeclarationOpen()
        case "/":
            state = .endTagOpen
            return tokenizeEndTagOpen()
        case "?":
            // parse error: unexpected-question-mark-instead-of-tag-name
            // e.g. "<?xml …>" becomes a bogus comment.
            currentToken = .comment("")
            state = .bogusComment
            return tokenizeBogusComment()
        case nil:
            // parser error: eof-before-tag-name
            // The trailing "<" is emitted as plain text.
            state = .endOfFile
            return .character("<")
        case .some("a"..."z"), .some("A"..."Z"):
            // Begin a start tag; the letter is reprocessed as the name's
            // first character.
            currentToken = .startTag("", selfClosing: false, attributes: [])
            reconsume(c)
            state = .tagName
            return tokenizeTagName()
        case .some(_):
            // parse error: invalid-first-character-of-tag-name
            // Emit "<" as text and reprocess c in the data state.
            reconsume(c)
            state = .data
            return .character("<")
        }
    }
|
|
|
|
    /// End tag open state: "</" was just consumed.
    mutating func tokenizeEndTagOpen() -> Token? {
        let c = nextChar()
        switch c {
        case .some("a"..."z"), .some("A"..."Z"):
            currentToken = .endTag("")
            reconsume(c)
            state = .tagName
            return tokenizeTagName()
        case ">":
            // parse error: missing-end-tag-name
            // "</>" produces no token at all.
            state = .data
            return tokenizeData()
        case nil:
            // parse error: eof-before-tag-name
            // Emit "<" now and queue "/" to be emitted before ending.
            state = .emitTokens([.character("/")], .endOfFile)
            return .character("<")
        case .some(let c):
            // parse error: invalid-first-character-of-tag-name
            // Everything up to ">" becomes a bogus comment.
            currentToken = .comment("")
            reconsume(c)
            state = .bogusComment
            return tokenizeBogusComment()
        }
    }
|
|
|
|
    /// Tag name state: accumulate characters into the current tag's name.
    mutating func tokenizeTagName() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            state = .beforeAttributeName
            return tokenizeBeforeAttributeName()
        case "/":
            state = .selfClosingStartTag
            return tokenizeSelfClosingStartTag()
        case ">":
            state = .data
            return takeCurrentToken()
        case nil:
            // parse error: eof-in-tag
            state = .endOfFile
            return nil
        case .some(var c):
            if c == "\0" {
                // parse error: unexpected-null-character
                c = "\u{FFFD}"
            } else if ("A"..."Z").contains(c) {
                // Tag names are normalized to ASCII lowercase.
                c = c.asciiLowercase
            }
            if case .startTag(var s, let selfClosing, let attributes) = currentToken {
                s.append(c)
                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
                return tokenizeTagName()
            } else if case .endTag(var s) = currentToken {
                s.append(c)
                currentToken = .endTag(s)
                return tokenizeTagName()
            } else {
                fatalError("bad current token")
            }
        }
    }
|
|
|
|
mutating func tokenizeSelfClosingStartTag() -> Token? {
|
|
switch nextChar() {
|
|
case ">":
|
|
if case .startTag(let s, _, let attributes) = currentToken {
|
|
currentToken = .startTag(s, selfClosing: true, attributes: attributes)
|
|
} else {
|
|
fatalError("bad current token")
|
|
}
|
|
state = .data
|
|
return takeCurrentToken()
|
|
case nil:
|
|
// parse error: eof-in-tag
|
|
state = .endOfFile
|
|
return nil
|
|
case .some(let c):
|
|
// parse error: unexpected-solidus-in-tag
|
|
reconsume(c)
|
|
state = .beforeAttributeName
|
|
return tokenizeBeforeAttributeName()
|
|
}
|
|
}
|
|
|
|
    /// Before attribute name state: whitespace region after a tag name,
    /// attribute, or stray "/".
    mutating func tokenizeBeforeAttributeName() -> Token? {
        let c = nextChar()
        switch c {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return next()
        case "/", ">", nil:
            reconsume(c)
            state = .afterAttributeName
            return tokenizeAfterAttributeName()
        case "=":
            // parse error: unexpected-equals-sign-before-attribute-name
            // Per the spec, start a new attribute whose name begins with "=".
            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
                attributes.append(Attribute(name: "=", value: ""))
                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
                state = .attributeName
                return tokenizeAttributeName()
            } else {
                fatalError("bad current token")
            }
        default:
            if case .startTag(let s, let selfClosing, var attributes) = currentToken {
                // Start a fresh empty attribute; c becomes its name's first char.
                attributes.append(Attribute(name: "", value: ""))
                currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
                reconsume(c)
                state = .attributeName
                return tokenizeAttributeName()
            } else if case .endTag(_) = currentToken {
                // ignore
                // (attributes on end tags are parsed but dropped)
                reconsume(c)
                state = .attributeName
                return tokenizeAttributeName()
            } else {
                fatalError("bad current token")
            }
        }
    }
|
|
|
|
mutating func tokenizeAttributeName() -> Token? {
|
|
let c = nextChar()
|
|
switch c {
|
|
case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
|
|
reconsume(c)
|
|
state = .afterAttributeName
|
|
return tokenizeAfterAttributeName()
|
|
case "=":
|
|
state = .beforeAttributeValue
|
|
return tokenizeBeforeAttributeValue()
|
|
case .some(var c):
|
|
if ("A"..."Z").contains(c) {
|
|
c = c.asciiLowercase
|
|
}
|
|
// if null, parse error: unexpected-null-character
|
|
if c == "\0" {
|
|
c = "\u{FFFD}"
|
|
}
|
|
// if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
|
|
if case .startTag(let s, let selfClosing, var attributes) = currentToken {
|
|
attributes[attributes.count - 1].name.append(c)
|
|
currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
|
|
return tokenizeAttributeName()
|
|
} else if case .endTag(_) = currentToken {
|
|
return tokenizeAttributeName()
|
|
} else {
|
|
fatalError("bad curren token")
|
|
}
|
|
}
|
|
}
|
|
|
|
mutating func tokenizeAfterAttributeName() -> Token? {
|
|
switch nextChar() {
|
|
case "\t", "\n", "\u{000C}", " ":
|
|
// ignore the character
|
|
return tokenizeAttributeName()
|
|
case "/":
|
|
state = .selfClosingStartTag
|
|
return tokenizeSelfClosingStartTag()
|
|
case "=":
|
|
state = .beforeAttributeValue
|
|
return tokenizeBeforeAttributeValue()
|
|
case nil:
|
|
// parse error: eof-in-tag
|
|
state = .endOfFile
|
|
return nil
|
|
case .some(let c):
|
|
if case .startTag(let s, let selfClosing, var attributes) = currentToken {
|
|
attributes.append(Attribute(name: "", value: ""))
|
|
currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
|
|
reconsume(c)
|
|
state = .attributeName
|
|
return tokenizeAttributeName()
|
|
} else if case .endTag(_) = currentToken {
|
|
reconsume(c)
|
|
state = .attributeName
|
|
return tokenizeAttributeName()
|
|
} else {
|
|
fatalError("bad current token")
|
|
}
|
|
}
|
|
}
|
|
|
|
    /// Before attribute value state: "=" was consumed after an attribute name.
    mutating func tokenizeBeforeAttributeValue() -> Token? {
        switch nextChar() {
        case "\t", "\n", "\u{000C}", " ":
            // ignore the character
            return tokenizeBeforeAttributeValue()
        case "\"":
            state = .attributeValue(.doubleQuoted)
            return tokenizeAttributeValue(quotes: .doubleQuoted)
        case "'":
            state = .attributeValue(.singleQuoted)
            return tokenizeAttributeValue(quotes: .singleQuoted)
        case ">":
            // parse error: missing-attribute-value
            // The attribute keeps its empty value and the tag is emitted.
            state = .data
            return takeCurrentToken()
        case let c:
            // Anything else starts an unquoted value.
            reconsume(c)
            state = .attributeValue(.unquoted)
            return tokenizeAttributeValue(quotes: .unquoted)
        }
    }
|
|
|
|
    /// Attribute value state: accumulate the current attribute's value —
    /// until whitespace or ">" when unquoted, or until the matching quote
    /// character when quoted.
    mutating func tokenizeAttributeValue(quotes: AttributeValueQuotation) -> Token? {
        if quotes == .unquoted {
            switch nextChar() {
            case "\t", "\n", "\u{000C}", " ":
                // Whitespace ends an unquoted value.
                state = .beforeAttributeName
                return tokenizeBeforeAttributeName()
            case "&":
                returnState = .attributeValue(.unquoted)
                state = .characterReference
                return tokenizeCharacterReference()
            case ">":
                state = .data
                return takeCurrentToken()
            case nil:
                // parse error: eof-in-tag
                state = .endOfFile
                return nil
            case .some(let c):
                // if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
                if case .startTag(let s, let selfClosing, var attributes) = currentToken {
                    attributes[attributes.count - 1].value.append(c)
                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
                    return tokenizeAttributeValue(quotes: quotes)
                } else {
                    fatalError("bad current token")
                }
            }
        } else {
            let c = nextChar()
            switch c {
            case "\"" where quotes == .doubleQuoted:
                state = .afterAttributeValueQuoted
                return tokenizeAfterAttributeValueQuoted()
            case "'" where quotes == .singleQuoted:
                state = .afterAttributeValueQuoted
                return tokenizeAfterAttributeValueQuoted()
            case "&":
                returnState = .attributeValue(quotes)
                state = .characterReference
                return tokenizeCharacterReference()
            case nil:
                // parse error: eof-in-tag
                state = .endOfFile
                return nil
            case .some(var c):
                if c == "\0" {
                    // parse error: unexpected-null-character
                    c = "\u{FFFD}"
                }
                if case .startTag(let s, let selfClosing, var attributes) = currentToken {
                    attributes[attributes.count - 1].value.append(c)
                    currentToken = .startTag(s, selfClosing: selfClosing, attributes: attributes)
                    return tokenizeAttributeValue(quotes: quotes)
                } else if case .endTag(_) = currentToken {
                    // Attributes on end tags are parsed but dropped.
                    return tokenizeAttributeValue(quotes: quotes)
                } else {
                    fatalError("bad current token")
                }
            }
        }
    }
|
|
|
|
mutating func tokenizeAfterAttributeValueQuoted() -> Token? {
|
|
switch nextChar() {
|
|
case "\t", "\n", "\u{000C}", " ":
|
|
state = .beforeAttributeName
|
|
return tokenizeBeforeAttributeName()
|
|
case "/":
|
|
state = .selfClosingStartTag
|
|
return tokenizeSelfClosingStartTag()
|
|
case ">":
|
|
state = .data
|
|
return takeCurrentToken()
|
|
case nil:
|
|
// parse error: eof-in-tag
|
|
state = .endOfFile
|
|
return nil
|
|
case .some(let c):
|
|
// parse error: missing-whitespace-between-attributes
|
|
reconsume(c)
|
|
state = .beforeAttributeName
|
|
return tokenizeBeforeAttributeName()
|
|
}
|
|
}
|
|
|
|
    /// Bogus comment state: swallow everything up to ">" (or EOF) into the
    /// current comment token.
    mutating func tokenizeBogusComment() -> Token? {
        switch nextChar() {
        case ">":
            state = .data
            return takeCurrentToken()
        case nil:
            state = .endOfFile
            return takeCurrentToken()
        case .some(var c):
            if c == "\0" {
                // parse error: unexpected-null-character
                c = "\u{FFFD}"
            }
            if case .comment(var s) = currentToken {
                s.append(c)
                currentToken = .comment(s)
                return tokenizeBogusComment()
            } else {
                fatalError("bad current token")
            }
        }
    }
|
|
|
|
    /// Markup declaration open state: "<!" was just consumed. Distinguishes
    /// comments ("--"), doctypes ("doctype", case-insensitive), and CDATA.
    mutating func tokenizeMarkupDeclarationOpen() -> Token? {
        // 7 characters is enough to distinguish all three markers.
        let peeked = peek(count: 7)
        if peeked.starts(with: "--") {
            consume(count: 2)
            currentToken = .comment("")
            state = .commentStart
            return tokenizeCommentStart()
        } else if peeked.lowercased() == "doctype" {
            consume(count: 7)
            state = .doctype
            return tokenizeDoctype()
        } else if peeked == "[CDATA[" {
            // TODO: we don't do any of the tree construction stuff yet, so can't really handle this
            // consume(count: 7)
            // Left unconsumed, so the bogus comment below captures "[CDATA[…".
            currentToken = .comment("")
            state = .bogusComment
            return tokenizeBogusComment()
        } else {
            // parse error: incorrectly-opened-comment
            currentToken = .comment("")
            state = .bogusComment
            return tokenizeBogusComment()
        }
    }
|
|
|
|
    /// Comment start state: "<!--" was just consumed.
    mutating func tokenizeCommentStart() -> Token? {
        switch nextChar() {
        case "-":
            state = .commentStartDash
            return tokenizeCommentStartDash()
        case ">":
            // parse error: abrupt-closing-of-empty-comment ("<!-->")
            state = .data
            return takeCurrentToken()
        case let c:
            reconsume(c)
            state = .comment
            return tokenizeComment()
        }
    }
|
|
|
|
mutating func tokenizeCommentStartDash() -> Token? {
|
|
switch nextChar() {
|
|
case "-":
|
|
state = .commentEnd
|
|
return tokenizeCommentEnd()
|
|
case ">":
|
|
// parse error: abrupt-closing-of-empty-comment
|
|
state = .data
|
|
return takeCurrentToken()
|
|
case nil:
|
|
// parse error: eof-in-comment
|
|
return takeCurrentToken()
|
|
case .some(let c):
|
|
if case .comment(var s) = currentToken {
|
|
s.append("-")
|
|
currentToken = .comment(s)
|
|
reconsume(c)
|
|
state = .comment
|
|
return tokenizeComment()
|
|
} else {
|
|
fatalError("bad current token")
|
|
}
|
|
}
|
|
}
|
|
|
|
    /// Comment state: accumulate the comment's text.
    mutating func tokenizeComment() -> Token? {
        switch nextChar() {
        case "<":
            // "<" stays in the text but may begin a nested "<!--" opener.
            if case .comment(var s) = currentToken {
                s.append("<")
                currentToken = .comment(s)
                state = .commentLessThanSign
                return tokenizeCommentLessThanSign()
            } else {
                fatalError("bad current token")
            }
        case "-":
            // Possibly the start of the "-->" closer.
            state = .commentEndDash
            return tokenizeCommentEndDash()
        case nil:
            // parse error: eof-in-comment
            state = .endOfFile
            return takeCurrentToken()
        case .some(var c):
            if c == "\0" {
                // parse error: unexpected-null-character
                c = "\u{FFFD}"
            }
            if case .comment(var s) = currentToken {
                s.append(c)
                currentToken = .comment(s)
                return tokenizeComment()
            } else {
                fatalError("bad current token")
            }
        }
    }
|
|
|
|
    /// Comment less-than sign state: "<" was just appended to the comment text.
    mutating func tokenizeCommentLessThanSign() -> Token? {
        switch nextChar() {
        case "!":
            // "<!" may begin a nested comment opener.
            if case .comment(var s) = currentToken {
                s.append("!")
                currentToken = .comment(s)
                state = .commentLessThanSignBang
                return tokenizeCommentLessThanSignBang()
            } else {
                fatalError("bad current token")
            }
        case "<":
            // Additional "<"s are appended to the text.
            // NOTE(review): control flows through tokenizeComment() while
            // `state` is still .commentLessThanSign, so a "!" right after "<<"
            // is handled as plain comment text rather than by this state —
            // confirm this subtle deviation from the spec is intended.
            if case .comment(var s) = currentToken {
                s.append("<")
                currentToken = .comment(s)
                return tokenizeComment()
            } else {
                fatalError("bad current token")
            }
        case let c:
            reconsume(c)
            state = .comment
            return tokenizeComment()
        }
    }
|
|
|
|
mutating func tokenizeCommentLessThanSignBang() -> Token? {
|
|
switch nextChar() {
|
|
case "-":
|
|
state = .commentLessThanSignBangDash
|
|
return tokenizeCommentLessThanSignBangDash()
|
|
case let c:
|
|
reconsume(c)
|
|
state = .comment
|
|
return tokenizeComment()
|
|
}
|
|
}
|
|
|
|
mutating func tokenizeCommentLessThanSignBangDash() -> Token? {
|
|
switch nextChar() {
|
|
case "-":
|
|
state = .commentLessThanSignBangDashDash
|
|
return tokenizeCommentLessThanSignBangDashDash()
|
|
case let c:
|
|
reconsume(c)
|
|
state = .commentEndDash
|
|
return tokenizeCommentEndDash()
|
|
}
|
|
}
|
|
|
|
/// Comment less-than sign bang dash dash state: after "<!--" inside an
/// open comment. Every input — ">" or EOF (the valid closers) as well as
/// anything else (a nested-comment parse error) — is reconsumed in the
/// comment end state, so the two original switch arms collapse into one.
mutating func tokenizeCommentLessThanSignBangDashDash() -> Token? {
    let char = nextChar()
    // A character other than ">" or nil is additionally a
    // nested-comment parse error, but the transition is identical.
    reconsume(char)
    state = .commentEnd
    return tokenizeCommentEnd()
}
|
|
|
|
/// Comment end dash state: a single "-" was seen inside a comment.
/// A second "-" moves to the comment end state; EOF (a parse error)
/// emits the comment; anything else appends the withheld "-" and
/// reconsumes in the comment state.
mutating func tokenizeCommentEndDash() -> Token? {
    switch nextChar() {
    case "-":
        state = .commentEnd
        return tokenizeCommentEnd()
    case nil:
        // parse error: eof-in-comment
        state = .endOfFile
        return takeCurrentToken()
    case let c:
        if case .comment(var s) = currentToken {
            s.append("-")
            currentToken = .comment(s)
        } else {
            fatalError("bad current token")
        }
        reconsume(c)
        state = .comment
        // Call the comment state's handler directly rather than going back
        // through next(), for consistency with every other state transition.
        return tokenizeComment()
    }
}
|
|
|
|
/// Comment end state: "--" was seen inside a comment. ">" closes the
/// comment; "!" moves to the comment end bang state; further dashes are
/// appended one at a time; EOF (a parse error) emits the comment;
/// anything else appends the withheld "--" and reconsumes in the
/// comment state.
mutating func tokenizeCommentEnd() -> Token? {
    let char = nextChar()
    switch char {
    case ">":
        state = .data
        return takeCurrentToken()
    case "!":
        state = .commentEndBang
        return tokenizeCommentEndBang()
    case "-":
        guard case .comment(var text) = currentToken else {
            fatalError("bad current token")
        }
        text.append("-")
        currentToken = .comment(text)
        return tokenizeCommentEnd()
    case nil:
        // parse error: eof-in-comment
        state = .endOfFile
        return takeCurrentToken()
    case .some(let c):
        guard case .comment(var text) = currentToken else {
            fatalError("bad current token")
        }
        text.append("--")
        currentToken = .comment(text)
        reconsume(c)
        state = .comment
        return tokenizeComment()
    }
}
|
|
|
|
/// Comment end bang state: "--!" was seen inside a comment. A "-" appends
/// the withheld "--!" and re-enters the comment end dash state; ">" is an
/// incorrectly-closed-comment parse error that still closes the comment;
/// EOF (a parse error) emits the comment; anything else appends "--!" and
/// reconsumes in the comment state.
mutating func tokenizeCommentEndBang() -> Token? {
    let char = nextChar()
    switch char {
    case "-":
        guard case .comment(var text) = currentToken else {
            fatalError("bad current token")
        }
        text.append("--!")
        currentToken = .comment(text)
        state = .commentEndDash
        return tokenizeCommentEndDash()
    case ">":
        // parse error: incorrectly-closed-comment
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-comment
        state = .endOfFile
        return takeCurrentToken()
    case .some(let c):
        guard case .comment(var text) = currentToken else {
            fatalError("bad current token")
        }
        text.append("--!")
        currentToken = .comment(text)
        reconsume(c)
        state = .comment
        return tokenizeComment()
    }
}
|
|
|
|
/// DOCTYPE state: "<!doctype" was just consumed. Whitespace leads to the
/// before-DOCTYPE-name state; EOF (a parse error) emits a force-quirks
/// DOCTYPE; anything else (including ">", which is only additionally a
/// missing-whitespace parse error) is reconsumed in the
/// before-DOCTYPE-name state.
mutating func tokenizeDoctype() -> Token? {
    let char = nextChar()
    switch char {
    case "\t", "\n", "\u{000C}", " ":
        state = .beforeDoctypeName
        return tokenizeBeforeDoctypeName()
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        return .doctype("", forceQuirks: true, publicIdentifier: nil, systemIdentifier: nil)
    case .some(let c):
        // For any non-whitespace character this is a
        // missing-whitespace-before-doctype-name parse error (except ">",
        // which the original also routed here); the transition is the same.
        reconsume(c)
        state = .beforeDoctypeName
        return tokenizeBeforeDoctypeName()
    }
}
|
|
|
|
/// Before DOCTYPE name state. Whitespace is skipped; an ASCII uppercase
/// letter starts the name lowercased; NUL starts the name with U+FFFD;
/// ">" (missing-doctype-name) emits a force-quirks DOCTYPE; EOF
/// (eof-in-doctype) emits a force-quirks DOCTYPE; anything else starts
/// the name with the character as-is.
mutating func tokenizeBeforeDoctypeName() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeBeforeDoctypeName()
    case .some(let c) where ("A"..."Z").contains(c):
        currentToken = .doctype("\(c.asciiLowercase)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
        state = .doctypeName
        return tokenizeDoctypeName()
    case "\0":
        // parse error: unexpected-null-character
        currentToken = .doctype("\u{FFFD}", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
        state = .doctypeName
        return tokenizeDoctypeName()
    case ">":
        // parse error: missing-doctype-name
        state = .data
        return .doctype("", forceQuirks: true, publicIdentifier: nil, systemIdentifier: nil)
    case nil:
        // parse error: eof-in-doctype
        // Fix: the spec sets the force-quirks flag to ON for EOF here;
        // this previously emitted forceQuirks: false.
        state = .endOfFile
        return .doctype("", forceQuirks: true, publicIdentifier: nil, systemIdentifier: nil)
    case .some(let c):
        currentToken = .doctype("\(c)", forceQuirks: false, publicIdentifier: nil, systemIdentifier: nil)
        state = .doctypeName
        return tokenizeDoctypeName()
    }
}
|
|
|
|
/// DOCTYPE name state: accumulating the DOCTYPE name. Whitespace moves to
/// the after-DOCTYPE-name state; ">" emits the token; EOF (eof-in-doctype)
/// emits the token with force-quirks on; NUL appends U+FFFD and uppercase
/// ASCII is lowercased before appending.
mutating func tokenizeDoctypeName() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        state = .afterDoctypeName
        return tokenizeAfterDoctypeName()
    case ">":
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-doctype
        // Fix: the tokenizer must enter the end-of-file state here, like
        // every other EOF branch; previously the state was left as
        // .doctypeName, so the next call to next() would re-enter this
        // state with a nil currentToken and hit the fatalError below.
        state = .endOfFile
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case .some(var c):
        if c == "\0" {
            // parse error: unexpected-null-character
            c = "\u{FFFD}"
        } else if ("A"..."Z").contains(c) {
            c = c.asciiLowercase
        }
        if case .doctype(var s, let forceQuirks, _, _) = currentToken {
            s.append(c)
            // Identifiers are always nil while the name is being consumed.
            currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: nil, systemIdentifier: nil)
            return tokenizeDoctypeName()
        } else {
            fatalError("bad current token")
        }
    }
}
|
|
|
|
/// After DOCTYPE name state. Whitespace is skipped; ">" emits the token;
/// EOF (eof-in-doctype) emits it with force-quirks on. Otherwise the next
/// six characters are peeked: a case-insensitive "PUBLIC" or "SYSTEM"
/// keyword routes to the corresponding state; anything else is an
/// invalid-character-sequence parse error leading to the bogus DOCTYPE
/// state with force-quirks on.
mutating func tokenizeAfterDoctypeName() -> Token? {
    let char = nextChar()
    switch char {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeAfterDoctypeName()
    case ">":
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let c):
        reconsume(c)
        switch peek(count: 6).lowercased() {
        case "public":
            consume(count: 6)
            state = .afterDoctypePublicKeyword
            return tokenizeAfterDoctypePublicKeyword()
        case "system":
            consume(count: 6)
            state = .afterDoctypeSystemKeyword
            return tokenizeAfterDoctypeSystemKeyword()
        default:
            // parse error: invalid-character-sequence-after-doctype-name
            guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
                fatalError("bad current token")
            }
            currentToken = .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
            state = .bogusDoctype
            return tokenizeBogusDoctype()
        }
    }
}
|
|
|
|
/// After DOCTYPE public keyword state: "PUBLIC" was just consumed.
/// Whitespace moves to the before-public-identifier state; a quote
/// (missing-whitespace parse error) starts an empty public identifier;
/// ">" (missing-doctype-public-identifier) emits with force-quirks on;
/// EOF (eof-in-doctype) emits with force-quirks on; anything else is a
/// missing-quote parse error leading to the bogus DOCTYPE state.
mutating func tokenizeAfterDoctypePublicKeyword() -> Token? {
    let char = nextChar()
    switch char {
    case "\t", "\n", "\u{000C}", " ":
        state = .beforeDoctypePublicIdentifier
        return tokenizeBeforeDoctypePublicIdentifier()
    case .some(let c) where c == "\"" || c == "'":
        // parse error: missing-whitespace-after-doctype-public-keyword
        guard case .doctype(let name, let forceQuirks, _, _) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: "", systemIdentifier: nil)
        let quotes: DoctypeIdentifierQuotation = c == "\"" ? .doubleQuoted : .singleQuoted
        state = .doctypePublicIdentifier(quotes)
        return tokenizeDoctypePublicIdentifier(quotes: quotes)
    case ">":
        // parse error: missing-doctype-public-identifier
        state = .data
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let c):
        // parse error: missing-quote-before-doctype-public-identifier
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
        state = .bogusDoctype
        reconsume(c)
        return tokenizeBogusDoctype()
    }
}
|
|
|
|
/// Before DOCTYPE public identifier state. Whitespace is skipped; a quote
/// starts an empty public identifier; ">" (missing-doctype-public-
/// identifier) emits with force-quirks on; EOF (eof-in-doctype) emits
/// with force-quirks on; anything else is a missing-quote parse error
/// leading to the bogus DOCTYPE state.
mutating func tokenizeBeforeDoctypePublicIdentifier() -> Token? {
    let char = nextChar()
    switch char {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeBeforeDoctypePublicIdentifier()
    case .some(let c) where c == "\"" || c == "'":
        guard case .doctype(let name, let forceQuirks, _, _) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: "", systemIdentifier: nil)
        let quotes: DoctypeIdentifierQuotation = c == "\"" ? .doubleQuoted : .singleQuoted
        state = .doctypePublicIdentifier(quotes)
        return tokenizeDoctypePublicIdentifier(quotes: quotes)
    case ">":
        // parse error: missing-doctype-public-identifier
        state = .data
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let c):
        // parse error: missing-quote-before-doctype-public-identifier
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
        reconsume(c)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
|
|
|
|
/// DOCTYPE public identifier state: accumulating the quoted public
/// identifier. The matching close quote moves to the after-public-
/// identifier state; ">" (abrupt-doctype-public-identifier) emits with
/// force-quirks on; EOF (eof-in-doctype) emits with force-quirks on;
/// NUL appends U+FFFD; anything else is appended.
mutating func tokenizeDoctypePublicIdentifier(quotes: DoctypeIdentifierQuotation) -> Token? {
    switch nextChar() {
    case "\"" where quotes == .doubleQuoted:
        state = .afterDoctypePublicIdentifier
        return tokenizeAfterDoctypePublicIdentifier()
    case "'" where quotes == .singleQuoted:
        state = .afterDoctypePublicIdentifier
        return tokenizeAfterDoctypePublicIdentifier()
    case ">":
        // parse error: abrupt-doctype-public-identifier
        // Fix: the spec consumes the ">" (switch to the data state, emit).
        // Previously the ">" was reconsumed, which caused a spurious
        // .character(">") token to be emitted from the data state; the
        // sibling system-identifier state correctly does not reconsume.
        state = .data
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case .some(var c):
        if c == "\0" {
            // parse error: unexpected-null-character
            c = "\u{FFFD}"
        }
        if case .doctype(let s, let forceQuirks, var publicIdentifier, _) = currentToken {
            // The public identifier was initialized to "" before entering
            // this state, so force-unwrapping is safe here.
            publicIdentifier!.append(c)
            currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: nil)
            return tokenizeDoctypePublicIdentifier(quotes: quotes)
        } else {
            fatalError("bad current token")
        }
    }
}
|
|
|
|
/// After DOCTYPE public identifier state. Whitespace moves to the
/// between-public-and-system state; ">" emits the token; a quote
/// (missing-whitespace parse error) starts an empty system identifier;
/// EOF (eof-in-doctype) emits with force-quirks on; anything else is a
/// missing-quote parse error leading to the bogus DOCTYPE state.
mutating func tokenizeAfterDoctypePublicIdentifier() -> Token? {
    let char = nextChar()
    switch char {
    case "\t", "\n", "\u{000C}", " ":
        state = .betweenDoctypePublicAndSystemIdentifiers
        return tokenizeBetweenDoctypePublicAndSystemIdentifiers()
    case ">":
        state = .data
        return takeCurrentToken()
    case .some(let c) where c == "\"" || c == "'":
        // parse error: missing-whitespace-between-doctype-public-and-system-identifiers
        guard case .doctype(let name, let forceQuirks, let publicID, _) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: publicID, systemIdentifier: "")
        let quotes: DoctypeIdentifierQuotation = c == "\"" ? .doubleQuoted : .singleQuoted
        state = .doctypeSystemIdentifier(quotes)
        return tokenizeDoctypeSystemIdentifier(quotes: quotes)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let c):
        // parse error: missing-quote-before-doctype-system-identifier
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
        reconsume(c)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
|
|
|
|
/// Between DOCTYPE public and system identifiers state. Whitespace is
/// skipped; ">" emits the token; a quote starts an empty system
/// identifier; EOF (eof-in-doctype) emits with force-quirks on; anything
/// else is a missing-quote parse error leading to the bogus DOCTYPE state.
mutating func tokenizeBetweenDoctypePublicAndSystemIdentifiers() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeBetweenDoctypePublicAndSystemIdentifiers()
    case ">":
        state = .data
        return takeCurrentToken()
    case .some(let c) where c == "\"" || c == "'":
        if case .doctype(let s, let forceQuirks, let publicIdentifier, _) = currentToken {
            currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: "")
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypeSystemIdentifier(quotes)
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        } else {
            fatalError("bad current token")
        }
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case .some(let c):
        // parse error: missing-quote-before-doctype-system-identifier
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
        reconsume(c)
        // Fix: the spec reconsumes in the bogus DOCTYPE state here;
        // this previously transitioned to the bogus *comment* state,
        // which would crash on the .doctype current token. Every sibling
        // DOCTYPE state uses .bogusDoctype for this parse error.
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
|
|
|
|
/// After DOCTYPE system keyword state: "SYSTEM" was just consumed.
/// Whitespace moves to the before-system-identifier state; a quote
/// (missing-whitespace parse error) starts an empty system identifier;
/// ">" (missing-doctype-system-identifier) emits with force-quirks on;
/// EOF (eof-in-doctype) emits with force-quirks on; anything else is a
/// missing-quote parse error leading to the bogus DOCTYPE state.
mutating func tokenizeAfterDoctypeSystemKeyword() -> Token? {
    let char = nextChar()
    switch char {
    case "\t", "\n", "\u{000C}", " ":
        state = .beforeDoctypeSystemIdentifier
        return tokenizeBeforeDoctypeSystemIdentifier()
    case .some(let c) where c == "\"" || c == "'":
        // parse error: missing-whitespace-after-doctype-system-keyword
        guard case .doctype(let name, let forceQuirks, let publicID, _) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: publicID, systemIdentifier: "")
        let quotes: DoctypeIdentifierQuotation = c == "\"" ? .doubleQuoted : .singleQuoted
        state = .doctypeSystemIdentifier(quotes)
        return tokenizeDoctypeSystemIdentifier(quotes: quotes)
    case ">":
        // parse error: missing-doctype-system-identifier
        state = .data
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let c):
        // parse error: missing-quote-before-doctype-system-identifier
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
        reconsume(c)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
|
|
|
|
/// Before DOCTYPE system identifier state. Whitespace is skipped; a quote
/// starts an empty system identifier; ">" (missing-doctype-system-
/// identifier) emits with force-quirks on; EOF (eof-in-doctype) emits
/// with force-quirks on; anything else is a missing-quote parse error
/// leading to the bogus DOCTYPE state.
mutating func tokenizeBeforeDoctypeSystemIdentifier() -> Token? {
    switch nextChar() {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeBeforeDoctypeSystemIdentifier()
    case .some(let c) where c == "\"" || c == "'":
        if case .doctype(let s, let forceQuirks, let publicIdentifier, _) = currentToken {
            // Fix: the spec initializes the system identifier to the EMPTY
            // string; this previously used " " (a single space), which
            // prepended a spurious space to every system identifier. All
            // sibling states correctly use "".
            currentToken = .doctype(s, forceQuirks: forceQuirks, publicIdentifier: publicIdentifier, systemIdentifier: "")
            let quotes = c == "\"" ? DoctypeIdentifierQuotation.doubleQuoted : .singleQuoted
            state = .doctypeSystemIdentifier(quotes)
            return tokenizeDoctypeSystemIdentifier(quotes: quotes)
        } else {
            fatalError("bad current token")
        }
    case ">":
        // parse error: missing-doctype-system-identifier
        state = .data
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = nil
            return .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
    case .some(let c):
        // parse error: missing-quote-before-doctype-system-identifier
        if case .doctype(let s, _, let publicIdentifier, let systemIdentifier) = currentToken {
            currentToken = .doctype(s, forceQuirks: true, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier)
        } else {
            fatalError("bad current token")
        }
        reconsume(c)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
|
|
|
|
/// DOCTYPE system identifier state: accumulating the quoted system
/// identifier. The matching close quote moves to the after-system-
/// identifier state; ">" (abrupt-doctype-system-identifier) emits with
/// force-quirks on; EOF (eof-in-doctype) emits with force-quirks on;
/// NUL appends U+FFFD; anything else is appended.
mutating func tokenizeDoctypeSystemIdentifier(quotes: DoctypeIdentifierQuotation) -> Token? {
    let char = nextChar()
    switch char {
    case "\"" where quotes == .doubleQuoted,
         "'" where quotes == .singleQuoted:
        state = .afterDoctypeSystemIdentifier
        return tokenizeAfterDoctypeSystemIdentifier()
    case ">":
        // parse error: abrupt-doctype-system-identifier
        state = .data
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let c):
        // NUL is an unexpected-null-character parse error and is replaced
        // with U+FFFD before appending.
        let appended: Character = c == "\0" ? "\u{FFFD}" : c
        guard case .doctype(let name, let forceQuirks, let publicID, var systemID) = currentToken else {
            fatalError("bad current token")
        }
        // The system identifier was initialized before entering this state,
        // so force-unwrapping is safe here.
        systemID!.append(appended)
        currentToken = .doctype(name, forceQuirks: forceQuirks, publicIdentifier: publicID, systemIdentifier: systemID)
        return tokenizeDoctypeSystemIdentifier(quotes: quotes)
    }
}
|
|
|
|
/// After DOCTYPE system identifier state. Whitespace is skipped; ">"
/// emits the token; EOF (eof-in-doctype) emits with force-quirks on;
/// anything else is an unexpected-character parse error that reconsumes
/// in the bogus DOCTYPE state WITHOUT setting force-quirks.
mutating func tokenizeAfterDoctypeSystemIdentifier() -> Token? {
    let char = nextChar()
    switch char {
    case "\t", "\n", "\u{000C}", " ":
        // ignore the character
        return tokenizeAfterDoctypeSystemIdentifier()
    case ">":
        state = .data
        return takeCurrentToken()
    case nil:
        // parse error: eof-in-doctype
        state = .endOfFile
        guard case .doctype(let name, _, let publicID, let systemID) = currentToken else {
            fatalError("bad current token")
        }
        currentToken = nil
        return .doctype(name, forceQuirks: true, publicIdentifier: publicID, systemIdentifier: systemID)
    case .some(let c):
        // parse error: unexpected-character-after-doctype-system-identifier
        // Note: This does not set the current DOCTYPE token's force-quirks flag to on.
        reconsume(c)
        state = .bogusDoctype
        return tokenizeBogusDoctype()
    }
}
|
|
|
|
/// Bogus DOCTYPE state: skipping the remainder of a malformed DOCTYPE.
/// ">" or EOF emits the (already force-quirked, where applicable)
/// current token; every other character — including NUL, which is an
/// unexpected-null-character parse error — is discarded.
mutating func tokenizeBogusDoctype() -> Token? {
    let char = nextChar()
    if char == ">" {
        state = .data
        return takeCurrentToken()
    }
    if char == nil {
        state = .endOfFile
        return takeCurrentToken()
    }
    // "\0" is additionally an unexpected-null-character parse error,
    // but like every other character it is simply ignored.
    return tokenizeBogusDoctype()
}
|
|
}
|
|
|
|
private extension Character {
    /// The lowercase counterpart of an ASCII uppercase letter.
    ///
    /// Precondition (asserted in debug builds): `self` is in "A"..."Z",
    /// which also guarantees `asciiValue` is non-nil.
    var asciiLowercase: Character {
        assert(("A"..."Z").contains(self))
        // ASCII lowercase letters sit exactly 0x20 above their uppercase forms.
        let lowered = Unicode.Scalar(asciiValue! + 0x20)
        return Character(lowered)
    }
}
|