diff --git a/Sources/HTMLStreamer/AttributedStringConverter.swift b/Sources/HTMLStreamer/AttributedStringConverter.swift
index da13a72..e243110 100644
--- a/Sources/HTMLStreamer/AttributedStringConverter.swift
+++ b/Sources/HTMLStreamer/AttributedStringConverter.swift
@@ -21,7 +21,7 @@ public struct AttributedStringConverter {
private let configuration: AttributedStringConverterConfiguration
private var fontCache: [FontTrait: PlatformFont] = [:]
- private var tokenizer: Tokenizer!
+ private var tokenizer: Tokenizer!
private var str: NSMutableAttributedString!
private var actionStack: [ElementAction] = []
@@ -38,7 +38,7 @@ public struct AttributedStringConverter {
}
public mutating func convert(html: String) -> NSAttributedString {
- tokenizer = Tokenizer(chars: html.makeIterator())
+ tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
str = NSMutableAttributedString()
actionStack = []
@@ -48,7 +48,7 @@ public struct AttributedStringConverter {
while let token = tokenizer.next() {
switch token {
case .character(let c):
- currentRun.append(c)
+ currentRun.unicodeScalars.append(c)
case .comment:
// ignored
continue
diff --git a/Sources/HTMLStreamer/Tokenizer.swift b/Sources/HTMLStreamer/Tokenizer.swift
index 2dd39d0..22bcd7d 100644
--- a/Sources/HTMLStreamer/Tokenizer.swift
+++ b/Sources/HTMLStreamer/Tokenizer.swift
@@ -7,11 +7,11 @@
import Foundation
-struct Tokenizer>: IteratorProtocol {
+struct Tokenizer>: IteratorProtocol {
typealias Element = Token
private var chars: Chars
- private var reconsumeStack: [Character] = []
+ private var reconsumeStack: [Unicode.Scalar] = []
private var state = State.data
private var returnState: State?
private var temporaryBuffer: String?
@@ -34,7 +34,7 @@ struct Tokenizer>: IteratorProtocol {
state = returnState
return next()
} else {
- return .character(temporaryBuffer!.removeFirst())
+ return .character(temporaryBuffer!.unicodeScalars.removeFirst())
}
case .endOfFile:
return nil
@@ -143,13 +143,13 @@ struct Tokenizer>: IteratorProtocol {
}
}
- private mutating func reconsume(_ c: Character?) {
+ private mutating func reconsume(_ c: Unicode.Scalar?) {
if let c {
reconsumeStack.append(c)
}
}
- private mutating func nextChar() -> Character? {
+ private mutating func nextChar() -> Unicode.Scalar? {
if !reconsumeStack.isEmpty {
return reconsumeStack.removeLast()
} else {
@@ -157,7 +157,7 @@ struct Tokenizer>: IteratorProtocol {
}
}
- private mutating func peekChar() -> Character? {
+ private mutating func peekChar() -> Unicode.Scalar? {
if let nextToReconsume = reconsumeStack.last {
return nextToReconsume
} else {
@@ -172,7 +172,7 @@ struct Tokenizer>: IteratorProtocol {
// TODO: extract this all out into a standalone type and test it separately
private mutating func peek(count: Int) -> String {
precondition(count >= 0)
- var buf = ""
+ var buf = String.UnicodeScalarView()
for _ in 0..>: IteratorProtocol {
}
}
reconsumeStack.append(contentsOf: buf.reversed())
- return buf
+ return String(buf)
}
private mutating func consume(count: Int) {
@@ -211,7 +211,7 @@ struct Tokenizer>: IteratorProtocol {
}
enum Token: Equatable {
- case character(Character)
+ case character(Unicode.Scalar)
case comment(String)
case startTag(String, selfClosing: Bool, attributes: [Attribute])
case endTag(String)
@@ -371,9 +371,9 @@ private extension Tokenizer {
loop: while let c = nextChar() {
switch c {
case "a"..."z", "A"..."Z", "0"..."9":
- temporaryBuffer!.append(c)
+ temporaryBuffer!.unicodeScalars.append(c)
case ";":
- temporaryBuffer!.append(c)
+ temporaryBuffer!.unicodeScalars.append(c)
break loop
default:
reconsume(c)
@@ -396,7 +396,7 @@ private extension Tokenizer {
}
}
if referent != nil {
- for c in buf[index...].reversed() {
+ for c in buf[index...].unicodeScalars.reversed() {
reconsume(c)
}
temporaryBuffer!.removeSubrange(index...)
@@ -556,7 +556,7 @@ private extension Tokenizer {
let c = nextChar()
switch c {
case .some("0"..."9"):
- characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.wholeNumberValue!)
+ characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.hexDigitValue!)
return tokenizeDecimalCharacterReference()
case ";":
state = .numericCharacterReferenceEnd
@@ -574,7 +574,7 @@ private extension Tokenizer {
switch c {
case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
if case .attributeValue(_) = returnState {
- currentStartTag!.attributes.uncheckedLast.value.append(c!)
+ currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c!)
return tokenizeAmbiguousAmpersand()
} else {
return .character(c!)
@@ -669,10 +669,10 @@ private extension Tokenizer {
c = c.asciiLowercase
}
if currentStartTag != nil {
- currentStartTag!.0.append(c)
+ currentStartTag!.0.unicodeScalars.append(c)
continue
} else if currentEndTag != nil {
- currentEndTag!.append(c)
+ currentEndTag!.unicodeScalars.append(c)
continue
} else {
fatalError("bad current token")
@@ -752,7 +752,7 @@ private extension Tokenizer {
}
// if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
if currentStartTag != nil {
- currentStartTag!.attributes.uncheckedLast.name.append(c)
+ currentStartTag!.attributes.uncheckedLast.name.unicodeScalars.append(c)
continue
} else if currentEndTag != nil {
continue
@@ -837,7 +837,7 @@ private extension Tokenizer {
case .some(let c):
// if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
if currentStartTag != nil {
- currentStartTag!.attributes.uncheckedLast.value.append(c)
+ currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
continue
} else if currentEndTag != nil {
continue
@@ -868,7 +868,7 @@ private extension Tokenizer {
c = "\u{FFFD}"
}
if currentStartTag != nil {
- currentStartTag!.attributes.uncheckedLast.value.append(c)
+ currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
continue
} else if currentEndTag != nil {
continue
@@ -914,7 +914,7 @@ private extension Tokenizer {
// parse error: unexpected-null-character
c = "\u{FFFD}"
}
- currentComment!.append(c)
+ currentComment!.unicodeScalars.append(c)
return tokenizeBogusComment()
}
}
@@ -998,7 +998,7 @@ private extension Tokenizer {
// parse error: unexpected-null-character
c = "\u{FFFD}"
}
- currentComment!.append(c)
+ currentComment!.unicodeScalars.append(c)
return tokenizeComment()
}
}
@@ -1188,7 +1188,7 @@ private extension Tokenizer {
} else if ("A"..."Z").contains(c) {
c = c.asciiLowercase
}
- currentDoctype!.0.append(c)
+ currentDoctype!.0.unicodeScalars.append(c)
return tokenizeDoctypeName()
}
}
@@ -1309,7 +1309,7 @@ private extension Tokenizer {
// parse error: unexpected-null-character
c = "\u{FFFD}"
}
- currentDoctype!.publicIdentifier!.append(c)
+ currentDoctype!.publicIdentifier!.unicodeScalars.append(c)
return tokenizeDoctypePublicIdentifier(quotes: quotes)
}
}
@@ -1450,7 +1450,7 @@ private extension Tokenizer {
// parse error: unexpected-null-character
c = "\u{FFFD}"
}
- currentDoctype!.systemIdentifier!.append(c)
+ currentDoctype!.systemIdentifier!.unicodeScalars.append(c)
return tokenizeDoctypeSystemIdentifier(quotes: quotes)
}
}
@@ -1495,13 +1495,6 @@ private extension Tokenizer {
}
}
-private extension Character {
- var asciiLowercase: Character {
- assert(("A"..."Z").contains(self))
- return Character(Unicode.Scalar(asciiValue! + 0x20))
- }
-}
-
private extension Array {
// Optimization: allows in-place modification of the last element of the array.
var uncheckedLast: Element {
@@ -1513,3 +1506,36 @@ private extension Array {
}
}
}
+
+// ASCII-only helpers for the tokenizer, which consumes its input one `Unicode.Scalar`
+// at a time.
+private extension Unicode.Scalar {
+    /// The lowercase form of an uppercase ASCII letter.
+    ///
+    /// Only meaningful for "A"..."Z"; any other scalar trips the debug assertion.
+    var asciiLowercase: Unicode.Scalar {
+        assert(("A"..."Z").contains(self))
+        // "a"..."z" sit exactly 0x20 above "A"..."Z", so the result is always a valid scalar.
+        return Unicode.Scalar(value + 0x20)!
+    }
+
+    /// The value of this scalar as an ASCII hexadecimal digit, or `nil` if it is not one.
+    ///
+    /// Accepts "0"..."9", "A"..."F" and "a"..."f", matching what `Character.hexDigitValue`
+    /// does for ASCII input, so callers that force-unwrap do not trap on lowercase digits.
+    var hexDigitValue: Int? {
+        switch self {
+        case "0"..."9": Int(value) - 0x30 // "0" is U+0030
+        case "A"..."F": Int(value) - 0x41 + 0xA // "A" is U+0041
+        case "a"..."f": Int(value) - 0x61 + 0xA // "a" is U+0061
+        default: nil
+        }
+    }
+
+    /// Whether this scalar is an ASCII decimal digit ("0"..."9").
+    ///
+    /// Narrower than `Character.isNumber`: non-ASCII numerals are never matched.
+    var isNumber: Bool {
+        ("0"..."9").contains(self)
+    }
+}
diff --git a/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift b/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift
index 66f84a6..fda40fb 100644
--- a/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift
+++ b/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift
@@ -207,4 +207,11 @@ final class AttributedStringConverterTests: XCTestCase {
]))
}
+ func testMultiScalar() { // A flag emoji is a single grapheme made of two scalars; the converted string must still contain the whole flag.
+ XCTAssertEqual(convert("🇺🇸"), NSAttributedString(string: "🇺🇸", attributes: [
+ .font: font,
+ .paragraphStyle: NSParagraphStyle.default,
+ ]))
+ }
+
}
diff --git a/Tests/HTMLStreamerTests/TokenizerTests.swift b/Tests/HTMLStreamerTests/TokenizerTests.swift
index 0153735..b1f3aa7 100644
--- a/Tests/HTMLStreamerTests/TokenizerTests.swift
+++ b/Tests/HTMLStreamerTests/TokenizerTests.swift
@@ -11,7 +11,7 @@ import XCTest
final class TokenizerTests: XCTestCase {
private func tokenize(_ s: String) -> [Token] {
- let iterator = Tokenizer(chars: s.makeIterator())
+ let iterator = Tokenizer(chars: s.unicodeScalars.makeIterator())
// let iterator = PrintIterator(inner: Tokenizer(chars: s.makeIterator()))
return Array(AnySequence({ iterator }))
}
@@ -23,8 +23,8 @@ final class TokenizerTests: XCTestCase {
XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
// unknown-named-character-reference:
XCTAssertEqual(tokenize("¬it;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
- XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
- XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
+ XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
+ XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
// attribute special case
XCTAssertEqual(tokenize(""), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "¬a")])])
@@ -70,6 +70,10 @@ final class TokenizerTests: XCTestCase {
XCTAssertEqual(tokenize(#""#), [.doctype("html", forceQuirks: false, publicIdentifier: "-//W3C//DTD HTML 4.01//EN", systemIdentifier: "http://www.w3.org/TR/html4/strict.dtd")])
}
+ func testMultiScalar() { // The tokenizer emits one .character token per Unicode scalar, so the two-scalar flag yields two tokens.
+ XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")])
+ }
+
}
private struct PrintIterator: IteratorProtocol {