BlockStateMachine performance improvements

This commit is contained in:
Shadowfacts 2024-02-22 15:19:53 -05:00
parent 1f26c4923c
commit a2ca8fd650
3 changed files with 112 additions and 95 deletions

View File

@ -17,7 +17,7 @@ private typealias PlatformFont = UIFont
private typealias PlatformFont = NSFont
#endif
public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
public class AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
private let configuration: AttributedStringConverterConfiguration
private var fontCache: [FontTrait: PlatformFont] = [:]
@ -26,14 +26,13 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
private var actionStack: [ElementAction] = []
private var styleStack: [Style] = []
var blockState = BlockState.start
var temporaryBuffer: String = ""
private var blockStateMachine = BlockStateMachine(blockBreak: "", lineBreak: "", listIndentForContentOutsideItem: "", append: { _ in }, removeChar: {})
private var currentElementIsEmpty = true
private var previouslyFinishedListItem = false
// The current run of text w/o styles changing
private var currentRun: String = ""
public init(configuration: AttributedStringConverterConfiguration) where Callbacks == DefaultCallbacks {
/// Creates a converter that uses `DefaultCallbacks` for its conversion callbacks.
///
/// Only available when `Callbacks == DefaultCallbacks`; forwards to
/// `init(configuration:callbacks:)` with `DefaultCallbacks.self`.
///
/// - Parameter configuration: The configuration to use for conversions.
public convenience init(configuration: AttributedStringConverterConfiguration) where Callbacks == DefaultCallbacks {
self.init(configuration: configuration, callbacks: DefaultCallbacks.self)
}
@ -41,14 +40,17 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
self.configuration = configuration
}
public mutating func convert(html: String) -> NSAttributedString {
public func convert(html: String) -> NSAttributedString {
tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
str = NSMutableAttributedString()
actionStack = []
styleStack = []
blockState = .start
temporaryBuffer = ""
blockStateMachine = BlockStateMachine(blockBreak: "\n\n", lineBreak: "\n", listIndentForContentOutsideItem: "\t\t", append: { [unowned self] in
self.append($0)
}, removeChar: { [unowned self] in
self.removeChar()
})
currentElementIsEmpty = true
previouslyFinishedListItem = false
currentRun = ""
@ -57,13 +59,13 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
switch token {
case .character(let c):
currentElementIsEmpty = false
if continueBlock(char: c) {
if blockStateMachine.continueBlock(char: c) {
currentRun.unicodeScalars.append(c)
}
case .characterSequence(let s):
currentElementIsEmpty = false
for c in s.unicodeScalars {
if continueBlock(char: c) {
if blockStateMachine.continueBlock(char: c) {
currentRun.unicodeScalars.append(c)
}
}
@ -93,15 +95,15 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
}
}
endBlocks()
blockStateMachine.endBlocks()
finishRun()
return str
}
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
private func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
if name == "br" {
breakTag()
blockStateMachine.breakTag()
return
}
// self closing tags are ignored since they have no content
@ -133,22 +135,22 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
finishRun()
styleStack.append(.monospace)
case "pre":
startOrEndBlock()
startPreformatted()
blockStateMachine.startOrEndBlock()
blockStateMachine.startPreformatted()
finishRun()
styleStack.append(.monospace)
case "blockquote":
startOrEndBlock()
blockStateMachine.startOrEndBlock()
finishRun()
styleStack.append(.blockquote)
case "p":
startOrEndBlock()
blockStateMachine.startOrEndBlock()
case "ol":
startOrEndBlock()
blockStateMachine.startOrEndBlock()
finishRun()
styleStack.append(.orderedList(nextElementOrdinal: 1))
case "ul":
startOrEndBlock()
blockStateMachine.startOrEndBlock()
finishRun()
styleStack.append(.unorderedList)
case "li":
@ -161,14 +163,14 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
} else {
break
}
startListItem()
blockStateMachine.startListItem()
currentRun.append("\t\(marker)\t")
default:
break
}
}
private mutating func handleEndTag(_ name: String) {
private func handleEndTag(_ name: String) {
switch name {
case "a":
if case .link(.some(_)) = lastStyle(.link) {
@ -190,28 +192,28 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
case "pre":
finishRun()
removeLastStyle(.monospace)
startOrEndBlock()
endPreformatted()
blockStateMachine.startOrEndBlock()
blockStateMachine.endPreformatted()
case "blockquote":
finishRun()
removeLastStyle(.blockquote)
startOrEndBlock()
blockStateMachine.startOrEndBlock()
case "p":
startOrEndBlock()
blockStateMachine.startOrEndBlock()
case "ol":
finishRun()
removeLastStyle(.orderedList)
startOrEndBlock()
blockStateMachine.startOrEndBlock()
previouslyFinishedListItem = false
case "ul":
finishRun()
removeLastStyle(.unorderedList)
startOrEndBlock()
blockStateMachine.startOrEndBlock()
previouslyFinishedListItem = false
case "li":
finishRun()
previouslyFinishedListItem = true
endListItem()
blockStateMachine.endListItem()
default:
break
}
@ -229,11 +231,11 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
"\t\t"
}
mutating func append(_ s: String) {
/// Appends `s` to the current run of text.
///
/// `convert(html:)` wires this up as the block state machine's `append` callback, so
/// separator text emitted by the state machine lands in the same run as regular content.
///
/// - Parameter s: The text to add to the end of `currentRun`.
func append(_ s: String) {
currentRun.append(s)
}
mutating func removeChar() {
func removeChar() {
if currentRun.isEmpty {
str.deleteCharacters(in: NSRange(location: str.length - 1, length: 1))
} else {
@ -243,7 +245,7 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
// Finds the last currently-open style of the given type.
// We can't just use the last one because we need to handle mis-nested tags.
private mutating func removeLastStyle(_ type: Style.StyleType) {
private func removeLastStyle(_ type: Style.StyleType) {
var i = styleStack.index(before: styleStack.endIndex)
while i >= styleStack.startIndex {
if styleStack[i].type == type {
@ -278,7 +280,7 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
return style
}()
private mutating func finishRun() {
private func finishRun() {
if actionStack.contains(.skip) {
currentRun = ""
return
@ -326,7 +328,7 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
currentRun = ""
}
private mutating func getFont(traits: FontTrait) -> PlatformFont? {
private func getFont(traits: FontTrait) -> PlatformFont? {
if let cached = fontCache[traits] {
return cached
}

View File

@ -19,17 +19,17 @@ import Foundation
*/
protocol BlockRenderer {
var blockState: BlockState { get set }
var blockBreak: String { get }
var lineBreak: String { get }
var listIndentForContentOutsideItem: String { get }
var temporaryBuffer: String { get set }
mutating func append(_ s: String)
mutating func removeChar()
/// Tracks the block-level state of an HTML conversion and reports output through closures.
///
/// The owning converter supplies the separator strings and the `append`/`removeChar`
/// callbacks when it constructs the state machine; the mutating methods in the extension
/// below read and update `blockState` as tags and characters are fed in.
struct BlockStateMachine {
// The current state; a fresh state machine begins in `.start`.
var blockState: BlockState = .start
// Separator text supplied by the owner for breaks between blocks
// (the converters in this commit pass "\n\n" or " ").
let blockBreak: String
// Separator text supplied by the owner for a single line break
// (the converters in this commit pass "\n" or " ").
let lineBreak: String
// Indent text supplied by the owner (the converters in this commit pass "\t\t" or "").
// NOTE(review): presumably used for content inside a list but outside any item, per the
// name — confirm against the state machine methods.
let listIndentForContentOutsideItem: String
// Holding area for text that is not emitted immediately: `continueBlock` stores whitespace
// and line breaks seen in preformatted states here instead of passing them to `append`.
var temporaryBuffer: String = ""
// Called with text the state machine wants added to the owner's output.
let append: (String) -> Void
// Called when the state machine wants the owner to drop the last character of its output.
let removeChar: () -> Void
}
extension BlockRenderer {
extension BlockStateMachine {
mutating func startOrEndBlock() {
switch blockState {
case .start:
@ -85,16 +85,18 @@ extension BlockRenderer {
}
mutating func continueBlock(char: UnicodeScalar) -> Bool {
let isNewline = char == "\n"
let isWhitespace = isNewline || isWhitespace(char)
switch blockState {
case .start:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
return true
}
case .emptyBlock:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
@ -102,7 +104,7 @@ extension BlockRenderer {
return true
}
case .nonEmptyBlock:
if char.properties.isWhitespace {
if isWhitespace {
blockState = .emittedSpace
append(" ")
return false
@ -110,14 +112,14 @@ extension BlockRenderer {
return true
}
case .emittedSpace:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
return true
}
case .lineBreakTag:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
@ -126,7 +128,7 @@ extension BlockRenderer {
return true
}
case .atLeastTwoLineBreakTags:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
@ -135,7 +137,7 @@ extension BlockRenderer {
return true
}
case .emptyBlockWithAtLeastTwoPreviousLineBreakTags:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
@ -144,14 +146,14 @@ extension BlockRenderer {
return true
}
case .beginListItem:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .listItemContent
return true
}
case .endListItem:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .listItemContent
@ -160,7 +162,7 @@ extension BlockRenderer {
return true
}
case .listItemContent:
if char.properties.isWhitespace {
if isWhitespace {
blockState = .emittedSpaceInListItemContent
append(" ")
return false
@ -168,14 +170,14 @@ extension BlockRenderer {
return true
}
case .emittedSpaceInListItemContent:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .listItemContent
return true
}
case .lineBreakTagInListItemContent:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .listItemContent
@ -184,7 +186,7 @@ extension BlockRenderer {
return true
}
case .atLeastTwoLineBreakTagsInListItemContent:
if char.properties.isWhitespace {
if isWhitespace {
return false
} else {
blockState = .listItemContent
@ -193,14 +195,14 @@ extension BlockRenderer {
return true
}
case .preformattedStart(let depth):
if char == "\n" {
if isNewline {
return false
} else {
blockState = .preformattedNonEmptyBlock(depth: depth)
return true
}
case .preformattedEmptyBlock(depth: let depth):
if char.properties.isWhitespace {
if isWhitespace {
blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
temporaryBuffer.unicodeScalars.append(char)
return false
@ -210,11 +212,11 @@ extension BlockRenderer {
return true
}
case .preformattedNonEmptyBlock(let depth):
if char == "\n" {
if isNewline {
blockState = .preformattedLineBreak(depth: depth)
temporaryBuffer.append(lineBreak)
return false
} else if char.properties.isWhitespace {
} else if isWhitespace {
blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth)
temporaryBuffer.unicodeScalars.append(char)
return false
@ -222,11 +224,11 @@ extension BlockRenderer {
return true
}
case .preformattedLineBreak(let depth):
if char == "\n" {
if isNewline {
blockState = .preformattedAtLeastTwoLineBreaks(depth: depth)
temporaryBuffer.append(lineBreak)
return false
} else if char.properties.isWhitespace {
} else if isWhitespace {
blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth)
temporaryBuffer.unicodeScalars.append(char)
return false
@ -237,7 +239,7 @@ extension BlockRenderer {
return true
}
case .preformattedAtLeastTwoLineBreaks(let depth):
if char.properties.isWhitespace {
if isWhitespace {
temporaryBuffer.unicodeScalars.append(char)
return false
} else {
@ -247,7 +249,7 @@ extension BlockRenderer {
return true
}
case .afterPreStartTag(let depth):
if char == "\n" {
if isNewline {
blockState = .preformattedEmptyBlock(depth: depth)
return false
} else {
@ -256,10 +258,10 @@ extension BlockRenderer {
return true
}
case .afterPreStartTagWithLeadingWhitespace(let depth):
if char == "\n" {
if isNewline {
blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
return false
} else if char.properties.isWhitespace {
} else if isWhitespace {
blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
temporaryBuffer.unicodeScalars.append(char)
return false
@ -270,11 +272,11 @@ extension BlockRenderer {
return true
}
case .preformattedNonEmptyBlockWithTrailingWhitespace(let depth):
if char == "\n" {
if isNewline {
blockState = .preformattedLineBreak(depth: depth)
temporaryBuffer.append(lineBreak)
return false
} else if char.properties.isWhitespace {
} else if isWhitespace {
temporaryBuffer.unicodeScalars.append(char)
return false
} else {
@ -284,11 +286,11 @@ extension BlockRenderer {
return true
}
case .preformattedEmptyBlockWithLeadingWhitespace(let depth):
if char == "\n" {
if isNewline {
blockState = .preformattedLineBreak(depth: depth)
temporaryBuffer.append(lineBreak)
return false
} else if char.properties.isWhitespace {
} else if isWhitespace {
temporaryBuffer.unicodeScalars.append(char)
return false
} else {
@ -566,13 +568,20 @@ enum BlockState: Equatable {
case emittedSpaceInListItemContent
case lineBreakTagInListItemContent
case atLeastTwoLineBreakTagsInListItemContent
case preformattedStart(depth: Int)
case preformattedEmptyBlock(depth: Int)
case preformattedNonEmptyBlock(depth: Int)
case preformattedLineBreak(depth: Int)
case preformattedAtLeastTwoLineBreaks(depth: Int)
case afterPreStartTag(depth: Int)
case afterPreStartTagWithLeadingWhitespace(depth: Int)
case preformattedNonEmptyBlockWithTrailingWhitespace(depth: Int)
case preformattedEmptyBlockWithLeadingWhitespace(depth: Int)
case preformattedStart(depth: Int32)
case preformattedEmptyBlock(depth: Int32)
case preformattedNonEmptyBlock(depth: Int32)
case preformattedLineBreak(depth: Int32)
case preformattedAtLeastTwoLineBreaks(depth: Int32)
case afterPreStartTag(depth: Int32)
case afterPreStartTagWithLeadingWhitespace(depth: Int32)
case preformattedNonEmptyBlockWithTrailingWhitespace(depth: Int32)
case preformattedEmptyBlockWithLeadingWhitespace(depth: Int32)
}
/// Returns whether `c` should be treated as whitespace by the block state machine.
///
/// This is a deliberately cheap approximation of `c.properties.isWhitespace`: checking the
/// actual Unicode properties is slow, and `continueBlock(char:)` calls this for every scalar
/// it is given. It recognizes the HTML ASCII whitespace characters (tab, line feed, form
/// feed, carriage return, space) plus NO-BREAK SPACE, which should cover the vast majority
/// of actual use. Any other Unicode whitespace is treated as regular content.
///
/// - Parameter c: The scalar to classify.
/// - Returns: `true` if `c` is one of the recognized whitespace scalars, `false` otherwise.
@inline(__always)
private func isWhitespace(_ c: UnicodeScalar) -> Bool {
    switch c {
    case " ", "\n", "\t", "\u{A0}" /* NO-BREAK SPACE */:
        return true
    case "\r", "\u{0C}" /* FORM FEED */:
        // Carriage return and form feed are HTML whitespace as well, and the Unicode
        // property check this function replaces matched them. Without these cases the
        // "\r" of a CRLF line ending would be classified as content rather than whitespace.
        return true
    default:
        return false
    }
}

View File

@ -7,19 +7,18 @@
import Foundation
public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
public class TextConverter<Callbacks: HTMLConversionCallbacks> {
private let configuration: TextConverterConfiguration
private var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator>!
private var str: String!
private var actionStack: [ElementAction] = []
var blockState = BlockState.start
var temporaryBuffer: String = ""
var blockStateMachine = BlockStateMachine(blockBreak: "", lineBreak: "", listIndentForContentOutsideItem: "", append: { _ in }, removeChar: {})
private var currentElementIsEmpty = true
private var currentRun = ""
public init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks {
/// Creates a converter that uses `DefaultCallbacks` for its conversion callbacks.
///
/// Only available when `Callbacks == DefaultCallbacks`; forwards to
/// `init(configuration:callbacks:)` with `DefaultCallbacks.self`.
///
/// - Parameter configuration: The configuration to use for conversions. Defaults to a
///   `TextConverterConfiguration` created with `.init()`.
public convenience init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks {
self.init(configuration: configuration, callbacks: DefaultCallbacks.self)
}
@ -27,12 +26,19 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
self.configuration = configuration
}
public mutating func convert(html: String) -> String {
public func convert(html: String) -> String {
tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
str = ""
blockState = .start
temporaryBuffer = ""
blockStateMachine = BlockStateMachine(
blockBreak: configuration.insertNewlines ? "\n\n" : " " ,
lineBreak: configuration.insertNewlines ? "\n" : " " ,
listIndentForContentOutsideItem: "",
append: { [unowned self] in
self.append($0)
}, removeChar: { [unowned self] in
self.removeChar()
})
currentElementIsEmpty = true
currentRun = ""
@ -40,13 +46,13 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
switch token {
case .character(let scalar):
currentElementIsEmpty = false
if continueBlock(char: scalar) {
if blockStateMachine.continueBlock(char: scalar) {
currentRun.unicodeScalars.append(scalar)
}
case .characterSequence(let string):
currentElementIsEmpty = false
for c in string.unicodeScalars {
if continueBlock(char: c) {
if blockStateMachine.continueBlock(char: c) {
currentRun.unicodeScalars.append(c)
}
}
@ -71,27 +77,27 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
}
}
endBlocks()
blockStateMachine.endBlocks()
finishRun()
return str
}
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
private func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
switch name {
case "br":
breakTag()
blockStateMachine.breakTag()
case "pre", "blockquote", "p", "ol", "ul":
startOrEndBlock()
blockStateMachine.startOrEndBlock()
default:
break
}
}
private mutating func handleEndTag(_ name: String) {
private func handleEndTag(_ name: String) {
switch name {
case "pre", "blockquote", "p", "ol", "ul":
startOrEndBlock()
blockStateMachine.startOrEndBlock()
finishRun()
default:
break
@ -118,11 +124,11 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
" "
}
mutating func append(_ s: String) {
/// Appends `s` to the current run of text.
///
/// `convert(html:)` wires this up as the block state machine's `append` callback, so
/// separator text emitted by the state machine lands in the same run as regular content.
///
/// - Parameter s: The text to add to the end of `currentRun`.
func append(_ s: String) {
currentRun.append(s)
}
mutating func removeChar() {
func removeChar() {
if currentRun.isEmpty {
str.removeLast()
} else {
@ -130,7 +136,7 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
}
}
private mutating func finishRun() {
private func finishRun() {
if actionStack.contains(.skip) {
currentRun = ""
return