Use Unicode.Scalar instead of Character

All the chars we care about are single scalars, so this avoids spending time in the grapheme-breaking algorithm.
Author: Shadowfacts
Date: 2023-11-28 11:56:56 -05:00
Commit: f7f35e09f7 (parent: f412369cf7)
4 changed files with 74 additions and 37 deletions
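For context, the distinction the commit message relies on, as a standalone Swift sketch (not part of the diff): a flag emoji is a single Character (one grapheme cluster, found by running the grapheme-breaking algorithm) but two Unicode.Scalars, and iterating a string's unicodeScalars view never invokes that algorithm.

    let flag = "🇺🇸"
    print(flag.count)                 // 1 Character (one grapheme cluster)
    print(flag.unicodeScalars.count)  // 2 scalars: U+1F1FA, U+1F1F8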

File 1 of 4: AttributedStringConverter

@@ -21,7 +21,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
     private let configuration: AttributedStringConverterConfiguration
     private var fontCache: [FontTrait: PlatformFont] = [:]
-    private var tokenizer: Tokenizer<String.Iterator>!
+    private var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator>!
     private var str: NSMutableAttributedString!
     private var actionStack: [ElementAction] = []

@@ -38,7 +38,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
     }

     public mutating func convert(html: String) -> NSAttributedString {
-        tokenizer = Tokenizer(chars: html.makeIterator())
+        tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
         str = NSMutableAttributedString()
         actionStack = []

@@ -48,7 +48,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
         while let token = tokenizer.next() {
             switch token {
             case .character(let c):
-                currentRun.append(c)
+                currentRun.unicodeScalars.append(c)
             case .comment:
                 // ignored
                 continue
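Why the converter's output is unchanged by this switch (a standalone sketch, not from the diff): appending scalars one at a time into a String's unicodeScalars view lets Swift re-form grapheme clusters, so multi-scalar characters survive the round trip. This is what the new testMultiScalar tests below rely on.

    var run = ""
    for scalar in "🇺🇸".unicodeScalars {
        run.unicodeScalars.append(scalar)  // mirrors currentRun.unicodeScalars.append(c)
    }
    print(run)        // 🇺🇸
    print(run.count)  // 1: the two scalars coalesce back into one Character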

File 2 of 4: Tokenizer

@@ -7,11 +7,11 @@
 import Foundation

-struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
+struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
     typealias Element = Token

     private var chars: Chars
-    private var reconsumeStack: [Character] = []
+    private var reconsumeStack: [Unicode.Scalar] = []
     private var state = State.data
     private var returnState: State?
     private var temporaryBuffer: String?

@@ -34,7 +34,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
                 state = returnState
                 return next()
             } else {
-                return .character(temporaryBuffer!.removeFirst())
+                return .character(temporaryBuffer!.unicodeScalars.removeFirst())
             }
         case .endOfFile:
             return nil

@@ -143,13 +143,13 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
         }
     }

-    private mutating func reconsume(_ c: Character?) {
+    private mutating func reconsume(_ c: Unicode.Scalar?) {
         if let c {
             reconsumeStack.append(c)
         }
     }

-    private mutating func nextChar() -> Character? {
+    private mutating func nextChar() -> Unicode.Scalar? {
         if !reconsumeStack.isEmpty {
             return reconsumeStack.removeLast()
         } else {

@@ -157,7 +157,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
         }
     }

-    private mutating func peekChar() -> Character? {
+    private mutating func peekChar() -> Unicode.Scalar? {
         if let nextToReconsume = reconsumeStack.last {
             return nextToReconsume
         } else {

@@ -172,7 +172,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
     // TODO: extract this all out into a standalone type and test it separately
     private mutating func peek(count: Int) -> String {
         precondition(count >= 0)
-        var buf = ""
+        var buf = String.UnicodeScalarView()
         for _ in 0..<count {
             if let c = nextChar() {
                 buf.append(c)

@@ -181,7 +181,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
             }
         }
         reconsumeStack.append(contentsOf: buf.reversed())
-        return buf
+        return String(buf)
     }

     private mutating func consume(count: Int) {

@@ -211,7 +211,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
 }

 enum Token: Equatable {
-    case character(Character)
+    case character(Unicode.Scalar)
     case comment(String)
     case startTag(String, selfClosing: Bool, attributes: [Attribute])
     case endTag(String)
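For orientation, a plausible token stream under the revised Token type (a sketch inferred from the cases above, not output produced by this diff):

    // tokenize("<b>hi</b>") would presumably yield:
    // [.startTag("b", selfClosing: false, attributes: []),
    //  .character("h"), .character("i"),
    //  .endTag("b")]

Since .character now carries a Unicode.Scalar, a multi-scalar grapheme arrives as several consecutive character tokens.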
@@ -371,9 +371,9 @@ private extension Tokenizer {
         loop: while let c = nextChar() {
             switch c {
             case "a"..."z", "A"..."Z", "0"..."9":
-                temporaryBuffer!.append(c)
+                temporaryBuffer!.unicodeScalars.append(c)
             case ";":
-                temporaryBuffer!.append(c)
+                temporaryBuffer!.unicodeScalars.append(c)
                 break loop
             default:
                 reconsume(c)

@@ -396,7 +396,7 @@ private extension Tokenizer {
             }
         }
         if referent != nil {
-            for c in buf[index...].reversed() {
+            for c in buf[index...].unicodeScalars.reversed() {
                 reconsume(c)
             }
             temporaryBuffer!.removeSubrange(index...)
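To make the backtracking concrete: for the input "&notit;", the longest named-reference match is "&not" (¬); the unmatched tail is pushed back scalar by scalar via reconsume(_:) and re-emitted as plain character tokens. The existing test below pins down exactly this behavior:

    XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])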
@@ -556,7 +556,7 @@ private extension Tokenizer {
         let c = nextChar()
         switch c {
         case .some("0"..."9"):
-            characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.wholeNumberValue!)
+            characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.hexDigitValue!)
             return tokenizeDecimalCharacterReference()
         case ";":
             state = .numericCharacterReferenceEnd
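A worked example of the decimal accumulator: for "&#247;" the running code steps 0, 2, 24, 247, and the trailing ";" hands off to numericCharacterReferenceEnd, which per the HTML spec maps 247 to U+00F7 ("÷"). The hex-named hexDigitValue helper defined at the bottom of this file works here because the digit values of "0"..."9" coincide in decimal and hex.

    // characterReferenceCode after each digit of "&#247;":
    // "2": 0 * 10 + 2  == 2
    // "4": 2 * 10 + 4  == 24
    // "7": 24 * 10 + 7 == 247  // Unicode.Scalar(247) is "÷"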
@@ -574,7 +574,7 @@ private extension Tokenizer {
         switch c {
         case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
             if case .attributeValue(_) = returnState {
-                currentStartTag!.attributes.uncheckedLast.value.append(c!)
+                currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c!)
                 return tokenizeAmbiguousAmpersand()
             } else {
                 return .character(c!)

@@ -669,10 +669,10 @@ private extension Tokenizer {
             c = c.asciiLowercase
         }
         if currentStartTag != nil {
-            currentStartTag!.0.append(c)
+            currentStartTag!.0.unicodeScalars.append(c)
             continue
         } else if currentEndTag != nil {
-            currentEndTag!.append(c)
+            currentEndTag!.unicodeScalars.append(c)
             continue
         } else {
             fatalError("bad current token")

@@ -752,7 +752,7 @@ private extension Tokenizer {
         }
         // if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
         if currentStartTag != nil {
-            currentStartTag!.attributes.uncheckedLast.name.append(c)
+            currentStartTag!.attributes.uncheckedLast.name.unicodeScalars.append(c)
             continue
         } else if currentEndTag != nil {
             continue

@@ -837,7 +837,7 @@ private extension Tokenizer {
         case .some(let c):
             // if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
             if currentStartTag != nil {
-                currentStartTag!.attributes.uncheckedLast.value.append(c)
+                currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
                 continue
             } else if currentEndTag != nil {
                 continue

@@ -868,7 +868,7 @@ private extension Tokenizer {
             c = "\u{FFFD}"
         }
         if currentStartTag != nil {
-            currentStartTag!.attributes.uncheckedLast.value.append(c)
+            currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
             continue
         } else if currentEndTag != nil {
             continue

@@ -914,7 +914,7 @@ private extension Tokenizer {
             // parse error: unexpected-null-character
             c = "\u{FFFD}"
         }
-        currentComment!.append(c)
+        currentComment!.unicodeScalars.append(c)
         return tokenizeBogusComment()
     }
 }

@@ -998,7 +998,7 @@ private extension Tokenizer {
             // parse error: unexpected-null-character
             c = "\u{FFFD}"
         }
-        currentComment!.append(c)
+        currentComment!.unicodeScalars.append(c)
         return tokenizeComment()
     }
 }

@@ -1188,7 +1188,7 @@ private extension Tokenizer {
         } else if ("A"..."Z").contains(c) {
             c = c.asciiLowercase
         }
-        currentDoctype!.0.append(c)
+        currentDoctype!.0.unicodeScalars.append(c)
         return tokenizeDoctypeName()
     }
 }

@@ -1309,7 +1309,7 @@ private extension Tokenizer {
             // parse error: unexpected-null-character
             c = "\u{FFFD}"
         }
-        currentDoctype!.publicIdentifier!.append(c)
+        currentDoctype!.publicIdentifier!.unicodeScalars.append(c)
         return tokenizeDoctypePublicIdentifier(quotes: quotes)
     }
 }

@@ -1450,7 +1450,7 @@ private extension Tokenizer {
             // parse error: unexpected-null-character
             c = "\u{FFFD}"
         }
-        currentDoctype!.systemIdentifier!.append(c)
+        currentDoctype!.systemIdentifier!.unicodeScalars.append(c)
         return tokenizeDoctypeSystemIdentifier(quotes: quotes)
     }
 }

@@ -1495,13 +1495,6 @@ private extension Tokenizer {
     }
 }

-private extension Character {
-    var asciiLowercase: Character {
-        assert(("A"..."Z").contains(self))
-        return Character(Unicode.Scalar(asciiValue! + 0x20))
-    }
-}
-
 private extension Array {
     // Optimization: allows in-place modification of the last element of the array.
     var uncheckedLast: Element {
@@ -1513,3 +1506,36 @@ private extension Array {
         }
     }
 }
+
+private extension Unicode.Scalar {
+    var asciiLowercase: Unicode.Scalar {
+        assert(("A"..."Z").contains(self))
+        return Unicode.Scalar(value + 0x20)!
+    }
+
+    var hexDigitValue: Int? {
+        switch self {
+        case "0": 0
+        case "1": 1
+        case "2": 2
+        case "3": 3
+        case "4": 4
+        case "5": 5
+        case "6": 6
+        case "7": 7
+        case "8": 8
+        case "9": 9
+        case "A": 0xA
+        case "B": 0xB
+        case "C": 0xC
+        case "D": 0xD
+        case "E": 0xE
+        case "F": 0xF
+        default: nil
+        }
+    }
+
+    var isNumber: Bool {
+        ("0"..."9").contains(self)
+    }
+}
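Two notes on the new extension, with a standalone sketch (not part of the diff). First, the asciiLowercase arithmetic works because ASCII upper- and lowercase letters differ by exactly 0x20. Second, hexDigitValue as written accepts only uppercase A-F, while HTML hex character references are case-insensitive; if the hexadecimal reference path (not shown in this diff) ever feeds lowercase scalars through the force-unwrap, it would trap, and hypothetical extra cases for "a"..."f" would be needed.

    let upper: Unicode.Scalar = "T"             // U+0054
    print(Unicode.Scalar(upper.value + 0x20)!)  // t (U+0074)

    // Hypothetical additions, if lowercase hex digits must be handled:
    // case "a": 0xA
    // ...
    // case "f": 0xF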

File 3 of 4: AttributedStringConverterTests

@@ -207,4 +207,11 @@ final class AttributedStringConverterTests: XCTestCase {
         ]))
     }
+
+    func testMultiScalar() {
+        XCTAssertEqual(convert("🇺🇸"), NSAttributedString(string: "🇺🇸", attributes: [
+            .font: font,
+            .paragraphStyle: NSParagraphStyle.default,
+        ]))
+    }
 }

File 4 of 4: TokenizerTests

@@ -11,7 +11,7 @@ import XCTest
 final class TokenizerTests: XCTestCase {
     private func tokenize(_ s: String) -> [Token] {
-        let iterator = Tokenizer(chars: s.makeIterator())
+        let iterator = Tokenizer(chars: s.unicodeScalars.makeIterator())
         // let iterator = PrintIterator(inner: Tokenizer(chars: s.makeIterator()))
         return Array(AnySequence({ iterator }))
     }

@@ -23,8 +23,8 @@ final class TokenizerTests: XCTestCase {
         XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
         // unknown-named-character-reference:
         XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
-        XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
-        XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
+        XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
+        XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
         // attribute special case
         XCTAssertEqual(tokenize("<a a='&nota' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "&nota")])])

@@ -70,6 +70,10 @@ final class TokenizerTests: XCTestCase {
         XCTAssertEqual(tokenize(#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#), [.doctype("html", forceQuirks: false, publicIdentifier: "-//W3C//DTD HTML 4.01//EN", systemIdentifier: "http://www.w3.org/TR/html4/strict.dtd")])
     }
+
+    func testMultiScalar() {
+        XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")])
+    }
 }

 private struct PrintIterator<Inner: IteratorProtocol>: IteratorProtocol {