Use Unicode.Scalar instead of Character

All the characters we care about are single Unicode scalars, so this avoids
spending time on the grapheme-breaking algorithm.
This commit is contained in:
Shadowfacts 2023-11-28 11:56:56 -05:00
parent f412369cf7
commit f7f35e09f7
4 changed files with 74 additions and 37 deletions

View File

@ -21,7 +21,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
private let configuration: AttributedStringConverterConfiguration
private var fontCache: [FontTrait: PlatformFont] = [:]
private var tokenizer: Tokenizer<String.Iterator>!
private var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator>!
private var str: NSMutableAttributedString!
private var actionStack: [ElementAction] = []
@ -38,7 +38,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
}
public mutating func convert(html: String) -> NSAttributedString {
tokenizer = Tokenizer(chars: html.makeIterator())
tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
str = NSMutableAttributedString()
actionStack = []
@ -48,7 +48,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
while let token = tokenizer.next() {
switch token {
case .character(let c):
currentRun.append(c)
currentRun.unicodeScalars.append(c)
case .comment:
// ignored
continue

View File

@ -7,11 +7,11 @@
import Foundation
struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
typealias Element = Token
private var chars: Chars
private var reconsumeStack: [Character] = []
private var reconsumeStack: [Unicode.Scalar] = []
private var state = State.data
private var returnState: State?
private var temporaryBuffer: String?
@ -34,7 +34,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
state = returnState
return next()
} else {
return .character(temporaryBuffer!.removeFirst())
return .character(temporaryBuffer!.unicodeScalars.removeFirst())
}
case .endOfFile:
return nil
@ -143,13 +143,13 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
}
}
private mutating func reconsume(_ c: Character?) {
private mutating func reconsume(_ c: Unicode.Scalar?) {
if let c {
reconsumeStack.append(c)
}
}
private mutating func nextChar() -> Character? {
private mutating func nextChar() -> Unicode.Scalar? {
if !reconsumeStack.isEmpty {
return reconsumeStack.removeLast()
} else {
@ -157,7 +157,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
}
}
private mutating func peekChar() -> Character? {
private mutating func peekChar() -> Unicode.Scalar? {
if let nextToReconsume = reconsumeStack.last {
return nextToReconsume
} else {
@ -172,7 +172,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
// TODO: extract this all out into a standalone type and test it separately
private mutating func peek(count: Int) -> String {
precondition(count >= 0)
var buf = ""
var buf = String.UnicodeScalarView()
for _ in 0..<count {
if let c = nextChar() {
buf.append(c)
@ -181,7 +181,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
}
}
reconsumeStack.append(contentsOf: buf.reversed())
return buf
return String(buf)
}
private mutating func consume(count: Int) {
@ -211,7 +211,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
}
enum Token: Equatable {
case character(Character)
case character(Unicode.Scalar)
case comment(String)
case startTag(String, selfClosing: Bool, attributes: [Attribute])
case endTag(String)
@ -371,9 +371,9 @@ private extension Tokenizer {
loop: while let c = nextChar() {
switch c {
case "a"..."z", "A"..."Z", "0"..."9":
temporaryBuffer!.append(c)
temporaryBuffer!.unicodeScalars.append(c)
case ";":
temporaryBuffer!.append(c)
temporaryBuffer!.unicodeScalars.append(c)
break loop
default:
reconsume(c)
@ -396,7 +396,7 @@ private extension Tokenizer {
}
}
if referent != nil {
for c in buf[index...].reversed() {
for c in buf[index...].unicodeScalars.reversed() {
reconsume(c)
}
temporaryBuffer!.removeSubrange(index...)
@ -556,7 +556,7 @@ private extension Tokenizer {
let c = nextChar()
switch c {
case .some("0"..."9"):
characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.wholeNumberValue!)
characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.hexDigitValue!)
return tokenizeDecimalCharacterReference()
case ";":
state = .numericCharacterReferenceEnd
@ -574,7 +574,7 @@ private extension Tokenizer {
switch c {
case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
if case .attributeValue(_) = returnState {
currentStartTag!.attributes.uncheckedLast.value.append(c!)
currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c!)
return tokenizeAmbiguousAmpersand()
} else {
return .character(c!)
@ -669,10 +669,10 @@ private extension Tokenizer {
c = c.asciiLowercase
}
if currentStartTag != nil {
currentStartTag!.0.append(c)
currentStartTag!.0.unicodeScalars.append(c)
continue
} else if currentEndTag != nil {
currentEndTag!.append(c)
currentEndTag!.unicodeScalars.append(c)
continue
} else {
fatalError("bad current token")
@ -752,7 +752,7 @@ private extension Tokenizer {
}
// if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
if currentStartTag != nil {
currentStartTag!.attributes.uncheckedLast.name.append(c)
currentStartTag!.attributes.uncheckedLast.name.unicodeScalars.append(c)
continue
} else if currentEndTag != nil {
continue
@ -837,7 +837,7 @@ private extension Tokenizer {
case .some(let c):
// if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
if currentStartTag != nil {
currentStartTag!.attributes.uncheckedLast.value.append(c)
currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
continue
} else if currentEndTag != nil {
continue
@ -868,7 +868,7 @@ private extension Tokenizer {
c = "\u{FFFD}"
}
if currentStartTag != nil {
currentStartTag!.attributes.uncheckedLast.value.append(c)
currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
continue
} else if currentEndTag != nil {
continue
@ -914,7 +914,7 @@ private extension Tokenizer {
// parse error: unexpected-null-character
c = "\u{FFFD}"
}
currentComment!.append(c)
currentComment!.unicodeScalars.append(c)
return tokenizeBogusComment()
}
}
@ -998,7 +998,7 @@ private extension Tokenizer {
// parse error: unexpected-null-character
c = "\u{FFFD}"
}
currentComment!.append(c)
currentComment!.unicodeScalars.append(c)
return tokenizeComment()
}
}
@ -1188,7 +1188,7 @@ private extension Tokenizer {
} else if ("A"..."Z").contains(c) {
c = c.asciiLowercase
}
currentDoctype!.0.append(c)
currentDoctype!.0.unicodeScalars.append(c)
return tokenizeDoctypeName()
}
}
@ -1309,7 +1309,7 @@ private extension Tokenizer {
// parse error: unexpected-null-character
c = "\u{FFFD}"
}
currentDoctype!.publicIdentifier!.append(c)
currentDoctype!.publicIdentifier!.unicodeScalars.append(c)
return tokenizeDoctypePublicIdentifier(quotes: quotes)
}
}
@ -1450,7 +1450,7 @@ private extension Tokenizer {
// parse error: unexpected-null-character
c = "\u{FFFD}"
}
currentDoctype!.systemIdentifier!.append(c)
currentDoctype!.systemIdentifier!.unicodeScalars.append(c)
return tokenizeDoctypeSystemIdentifier(quotes: quotes)
}
}
@ -1495,13 +1495,6 @@ private extension Tokenizer {
}
}
private extension Character {
    /// The lowercase counterpart of an ASCII uppercase letter.
    /// Debug-only precondition: `self` must be in "A"..."Z".
    var asciiLowercase: Character {
        assert(("A"..."Z").contains(self))
        // ASCII lowercase letters sit exactly 0x20 above their uppercase forms.
        let lowered = asciiValue! + 0x20
        return Character(Unicode.Scalar(lowered))
    }
}
private extension Array {
// Optimization: allows in-place modification of the last element of the array.
var uncheckedLast: Element {
@ -1513,3 +1506,36 @@ private extension Array {
}
}
}
private extension Unicode.Scalar {
    /// The lowercase counterpart of an ASCII uppercase letter.
    /// Debug-only precondition: `self` must be in "A"..."Z".
    var asciiLowercase: Unicode.Scalar {
        assert(("A"..."Z").contains(self))
        // ASCII lowercase letters sit exactly 0x20 above their uppercase forms.
        return Unicode.Scalar(value + 0x20)!
    }
    /// The numeric value of an ASCII hexadecimal digit, or `nil` if this
    /// scalar is not one.
    ///
    /// Fix: also accepts lowercase "a"..."f". HTML numeric character
    /// references (e.g. `&#xab;`) permit lowercase hex digits, and callers
    /// force-unwrap this value, so returning `nil` for them would crash.
    /// This matches the stdlib `Character.hexDigitValue` this replaced.
    var hexDigitValue: Int? {
        switch self {
        case "0"..."9":
            Int(value - Unicode.Scalar("0").value)
        case "a"..."f":
            Int(value - Unicode.Scalar("a").value) + 0xA
        case "A"..."F":
            Int(value - Unicode.Scalar("A").value) + 0xA
        default:
            nil
        }
    }
    /// Whether this scalar is an ASCII decimal digit.
    var isNumber: Bool {
        ("0"..."9").contains(self)
    }
}

View File

@ -207,4 +207,11 @@ final class AttributedStringConverterTests: XCTestCase {
]))
}
func testMultiScalar() {
XCTAssertEqual(convert("🇺🇸"), NSAttributedString(string: "🇺🇸", attributes: [
.font: font,
.paragraphStyle: NSParagraphStyle.default,
]))
}
}

View File

@ -11,7 +11,7 @@ import XCTest
final class TokenizerTests: XCTestCase {
private func tokenize(_ s: String) -> [Token] {
let iterator = Tokenizer(chars: s.makeIterator())
let iterator = Tokenizer(chars: s.unicodeScalars.makeIterator())
// let iterator = PrintIterator(inner: Tokenizer(chars: s.makeIterator()))
return Array(AnySequence({ iterator }))
}
@ -23,8 +23,8 @@ final class TokenizerTests: XCTestCase {
XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
// unknown-named-character-reference:
XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
// attribute special case
XCTAssertEqual(tokenize("<a a='&nota' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "&nota")])])
@ -70,6 +70,10 @@ final class TokenizerTests: XCTestCase {
XCTAssertEqual(tokenize(#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#), [.doctype("html", forceQuirks: false, publicIdentifier: "-//W3C//DTD HTML 4.01//EN", systemIdentifier: "http://www.w3.org/TR/html4/strict.dtd")])
}
func testMultiScalar() {
XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")])
}
}
private struct PrintIterator<Inner: IteratorProtocol>: IteratorProtocol {