Use Unicode.Scalar instead of Character

All the characters we care about are single scalars, so this avoids spending time in the grapheme-breaking algorithm.
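
For context, a minimal illustration of the distinction this commit relies on (not part of the change): a flag emoji is one grapheme cluster but two scalars, and iterating a string's scalars skips grapheme breaking entirely.

    let flag = "🇺🇸"
    print(flag.count)                 // 1: Characters, requires grapheme breaking
    print(flag.unicodeScalars.count)  // 2: scalars, no cluster analysis needed
    print(flag.unicodeScalars.map { String($0.value, radix: 16) })  // ["1f1fa", "1f1f8"]
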
This commit is contained in:
parent f412369cf7
commit f7f35e09f7
@@ -21,7 +21,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
     private let configuration: AttributedStringConverterConfiguration
     private var fontCache: [FontTrait: PlatformFont] = [:]
 
-    private var tokenizer: Tokenizer<String.Iterator>!
+    private var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator>!
     private var str: NSMutableAttributedString!
 
     private var actionStack: [ElementAction] = []
@@ -38,7 +38,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
     }
 
     public mutating func convert(html: String) -> NSAttributedString {
-        tokenizer = Tokenizer(chars: html.makeIterator())
+        tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
         str = NSMutableAttributedString()
 
         actionStack = []
@@ -48,7 +48,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
         while let token = tokenizer.next() {
             switch token {
             case .character(let c):
-                currentRun.append(c)
+                currentRun.unicodeScalars.append(c)
             case .comment:
                 // ignored
                 continue
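
A note on the new append: `String.UnicodeScalarView` is a `RangeReplaceableCollection`, so a scalar token can go straight into the accumulating string. A minimal sketch (the `currentRun` name is from the diff; the rest is illustrative):

    var currentRun = ""
    let c: Unicode.Scalar = "é"
    currentRun.unicodeScalars.append(c)   // append one scalar directly
    // with Character tokens, the old code path was currentRun.append(c)
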
@@ -7,11 +7,11 @@
 
 import Foundation
 
-struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
+struct Tokenizer<Chars: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
     typealias Element = Token
 
     private var chars: Chars
-    private var reconsumeStack: [Character] = []
+    private var reconsumeStack: [Unicode.Scalar] = []
     private var state = State.data
     private var returnState: State?
     private var temporaryBuffer: String?
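
The generic constraint uses a primary associated type (`IteratorProtocol<Unicode.Scalar>`, SE-0358), so the tokenizer accepts any iterator yielding scalars, not just `String.UnicodeScalarView.Iterator`. A hypothetical construction:

    // any iterator whose Element is Unicode.Scalar satisfies Chars
    var fromString = Tokenizer(chars: "<b>hi</b>".unicodeScalars.makeIterator())
    var fromArray  = Tokenizer(chars: (["<", "b", ">"] as [Unicode.Scalar]).makeIterator())
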
@@ -34,7 +34,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
                 state = returnState
                 return next()
             } else {
-                return .character(temporaryBuffer!.removeFirst())
+                return .character(temporaryBuffer!.unicodeScalars.removeFirst())
             }
         case .endOfFile:
             return nil
@@ -143,13 +143,13 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
         }
     }
 
-    private mutating func reconsume(_ c: Character?) {
+    private mutating func reconsume(_ c: Unicode.Scalar?) {
         if let c {
             reconsumeStack.append(c)
         }
     }
 
-    private mutating func nextChar() -> Character? {
+    private mutating func nextChar() -> Unicode.Scalar? {
         if !reconsumeStack.isEmpty {
             return reconsumeStack.removeLast()
         } else {
@@ -157,7 +157,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
         }
     }
 
-    private mutating func peekChar() -> Character? {
+    private mutating func peekChar() -> Unicode.Scalar? {
         if let nextToReconsume = reconsumeStack.last {
             return nextToReconsume
         } else {
@@ -172,7 +172,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
     // TODO: extract this all out into a standalone type and test it separately
     private mutating func peek(count: Int) -> String {
         precondition(count >= 0)
-        var buf = ""
+        var buf = String.UnicodeScalarView()
         for _ in 0..<count {
             if let c = nextChar() {
                 buf.append(c)
@@ -181,7 +181,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
             }
         }
         reconsumeStack.append(contentsOf: buf.reversed())
-        return buf
+        return String(buf)
     }
 
     private mutating func consume(count: Int) {
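
For readers unfamiliar with the pattern: `peek(count:)` consumes scalars, then pushes them back in reverse so they pop off the reconsume stack in original order. A standalone sketch of the idea (simplified, not the project's exact code):

    struct PeekableScalars<I: IteratorProtocol<Unicode.Scalar>>: IteratorProtocol {
        var inner: I
        var stack: [Unicode.Scalar] = []
        mutating func next() -> Unicode.Scalar? {
            stack.popLast() ?? inner.next()   // reconsumed scalars win
        }
        mutating func peek(count: Int) -> String {
            var buf = String.UnicodeScalarView()
            for _ in 0..<count {
                guard let c = next() else { break }
                buf.append(c)
            }
            stack.append(contentsOf: buf.reversed())  // reversed: first peeked pops first
            return String(buf)
        }
    }
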
@@ -211,7 +211,7 @@ struct Tokenizer<Chars: IteratorProtocol<Character>>: IteratorProtocol {
 }
 
 enum Token: Equatable {
-    case character(Character)
+    case character(Unicode.Scalar)
     case comment(String)
     case startTag(String, selfClosing: Bool, attributes: [Attribute])
     case endTag(String)
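
`Token.character` now carries a `Unicode.Scalar`, but since `Unicode.Scalar` is expressible by a scalar literal, pattern matches read the same as before. Illustrative only:

    let token = Token.character("<")
    if case .character("<") = token {
        // literals still match; "<" is now a Unicode.Scalar literal
    }
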
@@ -371,9 +371,9 @@ private extension Tokenizer {
         loop: while let c = nextChar() {
             switch c {
             case "a"..."z", "A"..."Z", "0"..."9":
-                temporaryBuffer!.append(c)
+                temporaryBuffer!.unicodeScalars.append(c)
             case ";":
-                temporaryBuffer!.append(c)
+                temporaryBuffer!.unicodeScalars.append(c)
                 break loop
             default:
                 reconsume(c)
@@ -396,7 +396,7 @@ private extension Tokenizer {
             }
         }
         if referent != nil {
-            for c in buf[index...].reversed() {
+            for c in buf[index...].unicodeScalars.reversed() {
                 reconsume(c)
             }
             temporaryBuffer!.removeSubrange(index...)
@@ -556,7 +556,7 @@ private extension Tokenizer {
         let c = nextChar()
         switch c {
         case .some("0"..."9"):
-            characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.wholeNumberValue!)
+            characterReferenceCode = (characterReferenceCode! * 10) + UInt32(c!.hexDigitValue!)
            return tokenizeDecimalCharacterReference()
         case ";":
             state = .numericCharacterReferenceEnd
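
Swapping `wholeNumberValue` for `hexDigitValue` in the decimal branch is not a behavior change: the case only matches "0"..."9", and for those digits the hex value equals the decimal value (`wholeNumberValue` is a `Character` API that `Unicode.Scalar` lacks). A quick check of the invariant using the stdlib's `Character` properties:

    // for ASCII digits the two stdlib properties agree,
    // which is why hexDigitValue is a safe stand-in here
    for d in "0123456789" {
        assert(d.hexDigitValue == d.wholeNumberValue)
    }
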
@@ -574,7 +574,7 @@ private extension Tokenizer {
         switch c {
         case .some("0"..."9"), .some("a"..."z"), .some("A"..."Z"):
             if case .attributeValue(_) = returnState {
-                currentStartTag!.attributes.uncheckedLast.value.append(c!)
+                currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c!)
                 return tokenizeAmbiguousAmpersand()
             } else {
                 return .character(c!)
@@ -669,10 +669,10 @@ private extension Tokenizer {
                 c = c.asciiLowercase
             }
             if currentStartTag != nil {
-                currentStartTag!.0.append(c)
+                currentStartTag!.0.unicodeScalars.append(c)
                 continue
             } else if currentEndTag != nil {
-                currentEndTag!.append(c)
+                currentEndTag!.unicodeScalars.append(c)
                 continue
             } else {
                 fatalError("bad current token")
@@ -752,7 +752,7 @@ private extension Tokenizer {
             }
             // if c in ["\"", "'", "<"], parse error: unexpected-character-in-attribute-name
             if currentStartTag != nil {
-                currentStartTag!.attributes.uncheckedLast.name.append(c)
+                currentStartTag!.attributes.uncheckedLast.name.unicodeScalars.append(c)
                 continue
             } else if currentEndTag != nil {
                 continue
@@ -837,7 +837,7 @@ private extension Tokenizer {
         case .some(let c):
             // if c in ["\"", "'", "<", "=", "`"], parse error: unexpected-character-in-unquoted-attribute-value
             if currentStartTag != nil {
-                currentStartTag!.attributes.uncheckedLast.value.append(c)
+                currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
                 continue
             } else if currentEndTag != nil {
                 continue
@@ -868,7 +868,7 @@ private extension Tokenizer {
                 c = "\u{FFFD}"
             }
             if currentStartTag != nil {
-                currentStartTag!.attributes.uncheckedLast.value.append(c)
+                currentStartTag!.attributes.uncheckedLast.value.unicodeScalars.append(c)
                 continue
             } else if currentEndTag != nil {
                 continue
@@ -914,7 +914,7 @@ private extension Tokenizer {
                 // parse error: unexpected-null-character
                 c = "\u{FFFD}"
             }
-            currentComment!.append(c)
+            currentComment!.unicodeScalars.append(c)
             return tokenizeBogusComment()
         }
     }
@@ -998,7 +998,7 @@ private extension Tokenizer {
                 // parse error: unexpected-null-character
                 c = "\u{FFFD}"
             }
-            currentComment!.append(c)
+            currentComment!.unicodeScalars.append(c)
             return tokenizeComment()
         }
     }
@@ -1188,7 +1188,7 @@ private extension Tokenizer {
             } else if ("A"..."Z").contains(c) {
                 c = c.asciiLowercase
             }
-            currentDoctype!.0.append(c)
+            currentDoctype!.0.unicodeScalars.append(c)
             return tokenizeDoctypeName()
         }
     }
@@ -1309,7 +1309,7 @@ private extension Tokenizer {
                 // parse error: unexpected-null-character
                 c = "\u{FFFD}"
             }
-            currentDoctype!.publicIdentifier!.append(c)
+            currentDoctype!.publicIdentifier!.unicodeScalars.append(c)
             return tokenizeDoctypePublicIdentifier(quotes: quotes)
         }
     }
@@ -1450,7 +1450,7 @@ private extension Tokenizer {
                 // parse error: unexpected-null-character
                 c = "\u{FFFD}"
             }
-            currentDoctype!.systemIdentifier!.append(c)
+            currentDoctype!.systemIdentifier!.unicodeScalars.append(c)
             return tokenizeDoctypeSystemIdentifier(quotes: quotes)
         }
     }
@@ -1495,13 +1495,6 @@ private extension Tokenizer {
     }
 }
 
-private extension Character {
-    var asciiLowercase: Character {
-        assert(("A"..."Z").contains(self))
-        return Character(Unicode.Scalar(asciiValue! + 0x20))
-    }
-}
-
 private extension Array {
     // Optimization: allows in-place modification of the last element of the array.
     var uncheckedLast: Element {
@@ -1513,3 +1506,36 @@ private extension Array {
         }
     }
 }
+
+private extension Unicode.Scalar {
+    var asciiLowercase: Unicode.Scalar {
+        assert(("A"..."Z").contains(self))
+        return Unicode.Scalar(value + 0x20)!
+    }
+
+    var hexDigitValue: Int? {
+        switch self {
+        case "0": 0
+        case "1": 1
+        case "2": 2
+        case "3": 3
+        case "4": 4
+        case "5": 5
+        case "6": 6
+        case "7": 7
+        case "8": 8
+        case "9": 9
+        case "A": 0xA
+        case "B": 0xB
+        case "C": 0xC
+        case "D": 0xD
+        case "E": 0xE
+        case "F": 0xF
+        default: nil
+        }
+    }
+
+    var isNumber: Bool {
+        ("0"..."9").contains(self)
+    }
+}
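
The replacement `asciiLowercase` uses the same 0x20 trick as the removed `Character` version: ASCII letter case pairs differ by exactly 0x20, and for any scalar in "A"..."Z" the sum stays in ASCII range, so the force-unwrapped failable `Unicode.Scalar(_: UInt32)` cannot fail there. For example:

    let upper: Unicode.Scalar = "Z"                 // U+005A
    let lower = Unicode.Scalar(upper.value + 0x20)! // U+007A; still <= 0x7F, so init can't fail
    print(lower)                                    // z
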
@@ -207,4 +207,11 @@ final class AttributedStringConverterTests: XCTestCase {
         ]))
     }
 
+    func testMultiScalar() {
+        XCTAssertEqual(convert("🇺🇸"), NSAttributedString(string: "🇺🇸", attributes: [
+            .font: font,
+            .paragraphStyle: NSParagraphStyle.default,
+        ]))
+    }
+
 }
@@ -11,7 +11,7 @@ import XCTest
 final class TokenizerTests: XCTestCase {
 
     private func tokenize(_ s: String) -> [Token] {
-        let iterator = Tokenizer(chars: s.makeIterator())
+        let iterator = Tokenizer(chars: s.unicodeScalars.makeIterator())
         // let iterator = PrintIterator(inner: Tokenizer(chars: s.makeIterator()))
         return Array(AnySequence({ iterator }))
     }
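
An aside on the helper: `AnySequence` can be built from a make-iterator closure, so wrapping the tokenizer lets `Array` drain every token in one pass. A minimal demonstration with a stock iterator (names illustrative):

    var it = "ab".unicodeScalars.makeIterator()
    let collected = Array(AnySequence({ it }))  // [Unicode.Scalar] holding "a", "b"
    print(collected.count)                      // 2
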
@@ -23,8 +23,8 @@ final class TokenizerTests: XCTestCase {
         XCTAssertEqual(tokenize("&notin"), [.character("¬"), .character("i"), .character("n")])
         // unknown-named-character-reference:
         XCTAssertEqual(tokenize("&notit;"), [.character("¬"), .character("i"), .character("t"), .character(";")])
-        XCTAssertEqual(tokenize("&asdf"), "&asdf".map { .character($0) })
-        XCTAssertEqual(tokenize("&a"), "&a".map { .character($0) })
+        XCTAssertEqual(tokenize("&asdf"), "&asdf".unicodeScalars.map { .character($0) })
+        XCTAssertEqual(tokenize("&a"), "&a".unicodeScalars.map { .character($0) })
 
         // attribute special case
         XCTAssertEqual(tokenize("<a a='&nota' />"), [.startTag("a", selfClosing: true, attributes: [Attribute(name: "a", value: "&nota")])])
@@ -70,6 +70,10 @@ final class TokenizerTests: XCTestCase {
         XCTAssertEqual(tokenize(#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#), [.doctype("html", forceQuirks: false, publicIdentifier: "-//W3C//DTD HTML 4.01//EN", systemIdentifier: "http://www.w3.org/TR/html4/strict.dtd")])
     }
 
+    func testMultiScalar() {
+        XCTAssertEqual(tokenize("🇺🇸"), [.character("\u{1F1FA}"), .character("\u{1F1F8}")])
+    }
+
 }
 
 private struct PrintIterator<Inner: IteratorProtocol>: IteratorProtocol {