diff --git a/Sources/HTMLStreamer/AttributedStringConverter.swift b/Sources/HTMLStreamer/AttributedStringConverter.swift index da13a72..e243110 100644 --- a/Sources/HTMLStreamer/AttributedStringConverter.swift +++ b/Sources/HTMLStreamer/AttributedStringConverter.swift @@ -21,7 +21,7 @@ public struct AttributedStringConverter { private let configuration: AttributedStringConverterConfiguration private var fontCache: [FontTrait: PlatformFont] = [:] - private var tokenizer: Tokenizer! + private var tokenizer: Tokenizer! private var str: NSMutableAttributedString! private var actionStack: [ElementAction] = [] @@ -38,7 +38,7 @@ public struct AttributedStringConverter { } public mutating func convert(html: String) -> NSAttributedString { - tokenizer = Tokenizer(chars: html.makeIterator()) + tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator()) str = NSMutableAttributedString() actionStack = [] @@ -48,7 +48,7 @@ public struct AttributedStringConverter { while let token = tokenizer.next() { switch token { case .character(let c): - currentRun.append(c) + currentRun.unicodeScalars.append(c) case .comment: // ignored continue diff --git a/Sources/HTMLStreamer/Tokenizer.swift b/Sources/HTMLStreamer/Tokenizer.swift index 2dd39d0..22bcd7d 100644 --- a/Sources/HTMLStreamer/Tokenizer.swift +++ b/Sources/HTMLStreamer/Tokenizer.swift @@ -7,11 +7,11 @@ import Foundation -struct Tokenizer>: IteratorProtocol { +struct Tokenizer>: IteratorProtocol { typealias Element = Token private var chars: Chars - private var reconsumeStack: [Character] = [] + private var reconsumeStack: [Unicode.Scalar] = [] private var state = State.data private var returnState: State? private var temporaryBuffer: String? 
@@ -34,7 +34,7 @@ struct Tokenizer>: IteratorProtocol { state = returnState return next() } else { - return .character(temporaryBuffer!.removeFirst()) + return .character(temporaryBuffer!.unicodeScalars.removeFirst()) } case .endOfFile: return nil @@ -143,13 +143,13 @@ struct Tokenizer>: IteratorProtocol { } } - private mutating func reconsume(_ c: Character?) { + private mutating func reconsume(_ c: Unicode.Scalar?) { if let c { reconsumeStack.append(c) } } - private mutating func nextChar() -> Character? { + private mutating func nextChar() -> Unicode.Scalar? { if !reconsumeStack.isEmpty { return reconsumeStack.removeLast() } else { @@ -157,7 +157,7 @@ struct Tokenizer>: IteratorProtocol { } } - private mutating func peekChar() -> Character? { + private mutating func peekChar() -> Unicode.Scalar? { if let nextToReconsume = reconsumeStack.last { return nextToReconsume } else { @@ -172,7 +172,7 @@ struct Tokenizer>: IteratorProtocol { // TODO: extract this all out into a standalone type and test it separately private mutating func peek(count: Int) -> String { precondition(count >= 0) - var buf = "" + var buf = String.UnicodeScalarView() for _ in 0..>: IteratorProtocol { } } reconsumeStack.append(contentsOf: buf.reversed()) - return buf + return String(buf) } private mutating func consume(count: Int) { @@ -211,7 +211,7 @@ struct Tokenizer>: IteratorProtocol { } enum Token: Equatable { - case character(Character) + case character(Unicode.Scalar) case comment(String) case startTag(String, selfClosing: Bool, attributes: [Attribute]) case endTag(String) @@ -371,9 +371,9 @@ private extension Tokenizer { loop: while let c = nextChar() { switch c { case "a"..."z", "A"..."Z", "0"..."9": - temporaryBuffer!.append(c) + temporaryBuffer!.unicodeScalars.append(c) case ";": - temporaryBuffer!.append(c) + temporaryBuffer!.unicodeScalars.append(c) break loop default: reconsume(c) @@ -396,7 +396,7 @@ private extension Tokenizer { } } if referent != nil { - for c in 
buf[index...].reversed() { + for c in buf[index...].unicodeScalars.reversed() { reconsume(c) } temporaryBuffer!.removeSubrange(index...) @@ -556,7 +556,7 @@ private extension Tokenizer { let c = nextChar() switch c { case .some("0"..."9"): - characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.wholeNumberValue!) + characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.hexDigitValue!) return tokenizeDecimalCharacterReference() case ";": state = .numericCharacterReferenceEnd @@ -574,7 +574,7 @@ private extension Tokenizer { switch c { case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"): if case .attributeValue(_) = returnState { - currentStartTag!.attributes.uncheckedLast.value.append(c!) + currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c!) return tokenizeAmbiguousAmpersand() } else { return .character(c!) @@ -669,10 +669,10 @@ private extension Tokenizer { c = c.asciiLowercase } if currentStartTag != nil { - currentStartTag!.0.append(c) + currentStartTag!.0.unicodeScalars.append(c) continue } else if currentEndTag != nil { - currentEndTag!.append(c) + currentEndTag!.unicodeScalars.append(c) continue } else { fatalError("bad current token") @@ -752,7 +752,7 @@ private extension Tokenizer { } // if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name if currentStartTag != nil { - currentStartTag!.attributes.uncheckedLast.name.append(c) + currentStartTag!.attributes.uncheckedLast.name.unicodeScalars.append(c) continue } else if currentEndTag != nil { continue @@ -837,7 +837,7 @@ private extension Tokenizer { case .some(let c): // if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value if currentStartTag != nil { - currentStartTag!.attributes.uncheckedLast.value.append(c) + currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c) continue } else if currentEndTag != nil { continue @@ -868,7 +868,7 @@ private extension Tokenizer { c = 
/// ASCII-oriented helpers on `Unicode.Scalar` used by the tokenizer.
private extension Unicode.Scalar {
    /// The lowercase counterpart of an ASCII uppercase letter.
    ///
    /// The receiver must be in `"A"..."Z"`; this is checked with `assert`,
    /// so it is only enforced in debug builds.
    var asciiLowercase: Unicode.Scalar {
        assert(("A"..."Z").contains(self))
        // In ASCII, each lowercase letter sits exactly 0x20 above its
        // uppercase form, so the result is always a valid scalar here.
        return Unicode.Scalar(value + 0x20)!
    }

    /// The numeric value of this scalar interpreted as an ASCII hexadecimal
    /// digit, or `nil` if it is not one.
    ///
    /// Accepts `0`–`9`, `A`–`F`, and `a`–`f`. Lowercase digits must be
    /// handled because callers force-unwrap the result, and a `nil` for
    /// `a`–`f` would trap on input such as `&#x1f600;`.
    var hexDigitValue: Int? {
        switch self {
        case "0"..."9":
            return Int(value - 0x30)        // 0x30 == "0"
        case "A"..."F":
            return Int(value - 0x41) + 0xA  // 0x41 == "A"
        case "a"..."f":
            return Int(value - 0x61) + 0xA  // 0x61 == "a"
        default:
            return nil
        }
    }

    /// Whether this scalar is an ASCII decimal digit (`0`–`9`).
    var isNumber: Bool {
        ("0"..."9").contains(self)
    }
}
"&asdf".unicodeScalars.map { .character($0) }) + XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) }) // attribute special case XCTAssertEqual(tokenize(""), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "¬a")])]) @@ -70,6 +70,10 @@ final class TokenizerTests: XCTestCase { XCTAssertEqual(tokenize(#""#), [.doctype("html", forceQuirks: false, publicIdentifier: "-//W3C//DTD HTML 4.01//EN", systemIdentifier: "http://www.w3.org/TR/html4/strict.dtd")]) } + func testMultiScalar() { + XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")]) + } + } private struct PrintIterator: IteratorProtocol {