Use Unicode.Scalar instead of Character
All the chars we care about are a single scalar, so this avoids spending time on the grapheme breaking algorithm.
This commit is contained in:
parent
f412369cf7
commit
f7f35e09f7
|
@ -21,7 +21,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
|
||||||
private let configuration: AttributedStringConverterConfiguration
|
private let configuration: AttributedStringConverterConfiguration
|
||||||
private var fontCache: [FontTrait: PlatformFont] = [:]
|
private var fontCache: [FontTrait: PlatformFont] = [:]
|
||||||
|
|
||||||
private var tokenizer: Tokenizer<String.Iterator>!
|
private var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator>!
|
||||||
private var str: NSMutableAttributedString!
|
private var str: NSMutableAttributedString!
|
||||||
|
|
||||||
private var actionStack: [ElementAction] = []
|
private var actionStack: [ElementAction] = []
|
||||||
|
@ -38,7 +38,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public mutating func convert(html: String) -> NSAttributedString {
|
public mutating func convert(html: String) -> NSAttributedString {
|
||||||
tokenizer = Tokenizer(chars: html.makeIterator())
|
tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
|
||||||
str = NSMutableAttributedString()
|
str = NSMutableAttributedString()
|
||||||
|
|
||||||
actionStack = []
|
actionStack = []
|
||||||
|
@ -48,7 +48,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
|
||||||
while let token = tokenizer.next() {
|
while let token = tokenizer.next() {
|
||||||
switch token {
|
switch token {
|
||||||
case .character(let c):
|
case .character(let c):
|
||||||
currentRun.append(c)
|
currentRun.unicodeScalars.append(c)
|
||||||
case .comment:
|
case .comment:
|
||||||
// ignored
|
// ignored
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -7,11 +7,11 @@
|
||||||
|
|
||||||
import Foundation
|
import Foundation
|
||||||
|
|
||||||
struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
|
struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
|
||||||
typealias Element = Token
|
typealias Element = Token
|
||||||
|
|
||||||
private var chars: Chars
|
private var chars: Chars
|
||||||
private var reconsumeStack: [Character] = []
|
private var reconsumeStack: [Unicode.Scalar] = []
|
||||||
private var state = State.data
|
private var state = State.data
|
||||||
private var returnState: State?
|
private var returnState: State?
|
||||||
private var temporaryBuffer: String?
|
private var temporaryBuffer: String?
|
||||||
|
@ -34,7 +34,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
|
||||||
state = returnState
|
state = returnState
|
||||||
return next()
|
return next()
|
||||||
} else {
|
} else {
|
||||||
return .character(temporaryBuffer!.removeFirst())
|
return .character(temporaryBuffer!.unicodeScalars.removeFirst())
|
||||||
}
|
}
|
||||||
case .endOfFile:
|
case .endOfFile:
|
||||||
return nil
|
return nil
|
||||||
|
@ -143,13 +143,13 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private mutating func reconsume(_ c: Character?) {
|
private mutating func reconsume(_ c: Unicode.Scalar?) {
|
||||||
if let c {
|
if let c {
|
||||||
reconsumeStack.append(c)
|
reconsumeStack.append(c)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private mutating func nextChar() -> Character? {
|
private mutating func nextChar() -> Unicode.Scalar? {
|
||||||
if !reconsumeStack.isEmpty {
|
if !reconsumeStack.isEmpty {
|
||||||
return reconsumeStack.removeLast()
|
return reconsumeStack.removeLast()
|
||||||
} else {
|
} else {
|
||||||
|
@ -157,7 +157,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private mutating func peekChar() -> Character? {
|
private mutating func peekChar() -> Unicode.Scalar? {
|
||||||
if let nextToReconsume = reconsumeStack.last {
|
if let nextToReconsume = reconsumeStack.last {
|
||||||
return nextToReconsume
|
return nextToReconsume
|
||||||
} else {
|
} else {
|
||||||
|
@ -172,7 +172,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
|
||||||
// TODO: extract this all out into a standalone type and test it separately
|
// TODO: extract this all out into a standalone type and test it separately
|
||||||
private mutating func peek(count: Int) -> String {
|
private mutating func peek(count: Int) -> String {
|
||||||
precondition(count >= 0)
|
precondition(count >= 0)
|
||||||
var buf = ""
|
var buf = String.UnicodeScalarView()
|
||||||
for _ in 0..<count {
|
for _ in 0..<count {
|
||||||
if let c = nextChar() {
|
if let c = nextChar() {
|
||||||
buf.append(c)
|
buf.append(c)
|
||||||
|
@ -181,7 +181,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
reconsumeStack.append(contentsOf: buf.reversed())
|
reconsumeStack.append(contentsOf: buf.reversed())
|
||||||
return buf
|
return String(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
private mutating func consume(count: Int) {
|
private mutating func consume(count: Int) {
|
||||||
|
@ -211,7 +211,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
|
||||||
}
|
}
|
||||||
|
|
||||||
enum Token: Equatable {
|
enum Token: Equatable {
|
||||||
case character(Character)
|
case character(Unicode.Scalar)
|
||||||
case comment(String)
|
case comment(String)
|
||||||
case startTag(String, selfClosing: Bool, attributes: [Attribute])
|
case startTag(String, selfClosing: Bool, attributes: [Attribute])
|
||||||
case endTag(String)
|
case endTag(String)
|
||||||
|
@ -371,9 +371,9 @@ private extension Tokenizer {
|
||||||
loop: while let c = nextChar() {
|
loop: while let c = nextChar() {
|
||||||
switch c {
|
switch c {
|
||||||
case "a"..."z", "A"..."Z", "0"..."9":
|
case "a"..."z", "A"..."Z", "0"..."9":
|
||||||
temporaryBuffer!.append(c)
|
temporaryBuffer!.unicodeScalars.append(c)
|
||||||
case ";":
|
case ";":
|
||||||
temporaryBuffer!.append(c)
|
temporaryBuffer!.unicodeScalars.append(c)
|
||||||
break loop
|
break loop
|
||||||
default:
|
default:
|
||||||
reconsume(c)
|
reconsume(c)
|
||||||
|
@ -396,7 +396,7 @@ private extension Tokenizer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if referent != nil {
|
if referent != nil {
|
||||||
for c in buf[index...].reversed() {
|
for c in buf[index...].unicodeScalars.reversed() {
|
||||||
reconsume(c)
|
reconsume(c)
|
||||||
}
|
}
|
||||||
temporaryBuffer!.removeSubrange(index...)
|
temporaryBuffer!.removeSubrange(index...)
|
||||||
|
@ -556,7 +556,7 @@ private extension Tokenizer {
|
||||||
let c = nextChar()
|
let c = nextChar()
|
||||||
switch c {
|
switch c {
|
||||||
case .some("0"..."9"):
|
case .some("0"..."9"):
|
||||||
characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.wholeNumberValue!)
|
characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.hexDigitValue!)
|
||||||
return tokenizeDecimalCharacterReference()
|
return tokenizeDecimalCharacterReference()
|
||||||
case ";":
|
case ";":
|
||||||
state = .numericCharacterReferenceEnd
|
state = .numericCharacterReferenceEnd
|
||||||
|
@ -574,7 +574,7 @@ private extension Tokenizer {
|
||||||
switch c {
|
switch c {
|
||||||
case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
|
case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
|
||||||
if case .attributeValue(_) = returnState {
|
if case .attributeValue(_) = returnState {
|
||||||
currentStartTag!.attributes.uncheckedLast.value.append(c!)
|
currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c!)
|
||||||
return tokenizeAmbiguousAmpersand()
|
return tokenizeAmbiguousAmpersand()
|
||||||
} else {
|
} else {
|
||||||
return .character(c!)
|
return .character(c!)
|
||||||
|
@ -669,10 +669,10 @@ private extension Tokenizer {
|
||||||
c = c.asciiLowercase
|
c = c.asciiLowercase
|
||||||
}
|
}
|
||||||
if currentStartTag != nil {
|
if currentStartTag != nil {
|
||||||
currentStartTag!.0.append(c)
|
currentStartTag!.0.unicodeScalars.append(c)
|
||||||
continue
|
continue
|
||||||
} else if currentEndTag != nil {
|
} else if currentEndTag != nil {
|
||||||
currentEndTag!.append(c)
|
currentEndTag!.unicodeScalars.append(c)
|
||||||
continue
|
continue
|
||||||
} else {
|
} else {
|
||||||
fatalError("bad current token")
|
fatalError("bad current token")
|
||||||
|
@ -752,7 +752,7 @@ private extension Tokenizer {
|
||||||
}
|
}
|
||||||
// if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
|
// if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
|
||||||
if currentStartTag != nil {
|
if currentStartTag != nil {
|
||||||
currentStartTag!.attributes.uncheckedLast.name.append(c)
|
currentStartTag!.attributes.uncheckedLast.name.unicodeScalars.append(c)
|
||||||
continue
|
continue
|
||||||
} else if currentEndTag != nil {
|
} else if currentEndTag != nil {
|
||||||
continue
|
continue
|
||||||
|
@ -837,7 +837,7 @@ private extension Tokenizer {
|
||||||
case .some(let c):
|
case .some(let c):
|
||||||
// if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
|
// if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
|
||||||
if currentStartTag != nil {
|
if currentStartTag != nil {
|
||||||
currentStartTag!.attributes.uncheckedLast.value.append(c)
|
currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
|
||||||
continue
|
continue
|
||||||
} else if currentEndTag != nil {
|
} else if currentEndTag != nil {
|
||||||
continue
|
continue
|
||||||
|
@ -868,7 +868,7 @@ private extension Tokenizer {
|
||||||
c = "\u{FFFD}"
|
c = "\u{FFFD}"
|
||||||
}
|
}
|
||||||
if currentStartTag != nil {
|
if currentStartTag != nil {
|
||||||
currentStartTag!.attributes.uncheckedLast.value.append(c)
|
currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
|
||||||
continue
|
continue
|
||||||
} else if currentEndTag != nil {
|
} else if currentEndTag != nil {
|
||||||
continue
|
continue
|
||||||
|
@ -914,7 +914,7 @@ private extension Tokenizer {
|
||||||
// parse error: unexpected-null-character
|
// parse error: unexpected-null-character
|
||||||
c = "\u{FFFD}"
|
c = "\u{FFFD}"
|
||||||
}
|
}
|
||||||
currentComment!.append(c)
|
currentComment!.unicodeScalars.append(c)
|
||||||
return tokenizeBogusComment()
|
return tokenizeBogusComment()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -998,7 +998,7 @@ private extension Tokenizer {
|
||||||
// parse error: unexpected-null-character
|
// parse error: unexpected-null-character
|
||||||
c = "\u{FFFD}"
|
c = "\u{FFFD}"
|
||||||
}
|
}
|
||||||
currentComment!.append(c)
|
currentComment!.unicodeScalars.append(c)
|
||||||
return tokenizeComment()
|
return tokenizeComment()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1188,7 +1188,7 @@ private extension Tokenizer {
|
||||||
} else if ("A"..."Z").contains(c) {
|
} else if ("A"..."Z").contains(c) {
|
||||||
c = c.asciiLowercase
|
c = c.asciiLowercase
|
||||||
}
|
}
|
||||||
currentDoctype!.0.append(c)
|
currentDoctype!.0.unicodeScalars.append(c)
|
||||||
return tokenizeDoctypeName()
|
return tokenizeDoctypeName()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1309,7 +1309,7 @@ private extension Tokenizer {
|
||||||
// parse error: unexpected-null-character
|
// parse error: unexpected-null-character
|
||||||
c = "\u{FFFD}"
|
c = "\u{FFFD}"
|
||||||
}
|
}
|
||||||
currentDoctype!.publicIdentifier!.append(c)
|
currentDoctype!.publicIdentifier!.unicodeScalars.append(c)
|
||||||
return tokenizeDoctypePublicIdentifier(quotes: quotes)
|
return tokenizeDoctypePublicIdentifier(quotes: quotes)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1450,7 +1450,7 @@ private extension Tokenizer {
|
||||||
// parse error: unexpected-null-character
|
// parse error: unexpected-null-character
|
||||||
c = "\u{FFFD}"
|
c = "\u{FFFD}"
|
||||||
}
|
}
|
||||||
currentDoctype!.systemIdentifier!.append(c)
|
currentDoctype!.systemIdentifier!.unicodeScalars.append(c)
|
||||||
return tokenizeDoctypeSystemIdentifier(quotes: quotes)
|
return tokenizeDoctypeSystemIdentifier(quotes: quotes)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1495,13 +1495,6 @@ private extension Tokenizer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private extension Character {
|
|
||||||
var asciiLowercase: Character {
|
|
||||||
assert(("A"..."Z").contains(self))
|
|
||||||
return Character(Unicode.Scalar(asciiValue! + 0x20))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private extension Array {
|
private extension Array {
|
||||||
// Optimization: allows in-place modification of the last element of the array.
|
// Optimization: allows in-place modification of the last element of the array.
|
||||||
var uncheckedLast: Element {
|
var uncheckedLast: Element {
|
||||||
|
@ -1513,3 +1506,36 @@ private extension Array {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private extension Unicode.Scalar {
    /// The lowercase ASCII letter corresponding to this uppercase ASCII letter.
    ///
    /// Only meaningful for scalars in "A"..."Z"; debug builds assert that.
    var asciiLowercase: Unicode.Scalar {
        assert(("A"..."Z").contains(self))
        // In ASCII the lowercase letters sit exactly 0x20 above the uppercase ones,
        // so for in-contract input the result is always a valid scalar.
        return Unicode.Scalar(value + 0x20)!
    }

    /// The numeric value of this scalar interpreted as an ASCII hexadecimal digit,
    /// or `nil` if it is not one.
    ///
    /// Accepts "0"..."9" and both "A"..."F" and "a"..."f". The lowercase range
    /// matters: `Character.hexDigitValue`, which this property stands in for,
    /// accepts lowercase digits, and callers force-unwrap the result.
    var hexDigitValue: Int? {
        switch self {
        case "0"..."9":
            return Int(value - 0x30) // 0x30 is "0"
        case "A"..."F":
            return Int(value - 0x41) + 0xA // 0x41 is "A"
        case "a"..."f":
            return Int(value - 0x61) + 0xA // 0x61 is "a"
        default:
            return nil
        }
    }

    /// Whether this scalar is an ASCII decimal digit ("0"..."9").
    var isNumber: Bool {
        ("0"..."9").contains(self)
    }
}
|
||||||
|
|
|
@ -207,4 +207,11 @@ final class AttributedStringConverterTests: XCTestCase {
|
||||||
]))
|
]))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func testMultiScalar() {
|
||||||
|
XCTAssertEqual(convert("🇺🇸"), NSAttributedString(string: "🇺🇸", attributes: [
|
||||||
|
.font: font,
|
||||||
|
.paragraphStyle: NSParagraphStyle.default,
|
||||||
|
]))
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@ import XCTest
|
||||||
final class TokenizerTests: XCTestCase {
|
final class TokenizerTests: XCTestCase {
|
||||||
|
|
||||||
private func tokenize(_ s: String) -> [Token] {
|
private func tokenize(_ s: String) -> [Token] {
|
||||||
let iterator = Tokenizer(chars: s.makeIterator())
|
let iterator = Tokenizer(chars: s.unicodeScalars.makeIterator())
|
||||||
// let iterator = PrintIterator(inner: Tokenizer(chars: s.makeIterator()))
|
// let iterator = PrintIterator(inner: Tokenizer(chars: s.unicodeScalars.makeIterator()))
|
||||||
return Array(AnySequence({ iterator }))
|
return Array(AnySequence({ iterator }))
|
||||||
}
|
}
|
||||||
|
@ -23,8 +23,8 @@ final class TokenizerTests: XCTestCase {
|
||||||
XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
|
XCTAssertEqual(tokenize("¬in"), [.character("¬"), .character("i"), .character("n")])
|
||||||
// unknown-named-character-reference:
|
// unknown-named-character-reference:
|
||||||
XCTAssertEqual(tokenize("¬it;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
|
XCTAssertEqual(tokenize("¬it;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
|
||||||
XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
|
XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
|
||||||
XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
|
XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
|
||||||
|
|
||||||
// attribute special case
|
// attribute special case
|
||||||
XCTAssertEqual(tokenize("<a a='¬a' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "¬a")])])
|
XCTAssertEqual(tokenize("<a a='¬a' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "¬a")])])
|
||||||
|
@ -70,6 +70,10 @@ final class TokenizerTests: XCTestCase {
|
||||||
XCTAssertEqual(tokenize(#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#), [.doctype("html", forceQuirks: false, publicIdentifier: "-//W3C//DTD HTML 4.01//EN", systemIdentifier: "http://www.w3.org/TR/html4/strict.dtd")])
|
XCTAssertEqual(tokenize(#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#), [.doctype("html", forceQuirks: false, publicIdentifier: "-//W3C//DTD HTML 4.01//EN", systemIdentifier: "http://www.w3.org/TR/html4/strict.dtd")])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func testMultiScalar() {
|
||||||
|
XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")])
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private struct PrintIterator<Inner: IteratorProtocol>: IteratorProtocol {
|
private struct PrintIterator<Inner: IteratorProtocol>: IteratorProtocol {
|
||||||
|
|
Loading…
Reference in New Issue