From a2ca8fd65053872f37114d8bf769bc7fb6807ab5 Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Thu, 22 Feb 2024 15:19:53 -0500 Subject: [PATCH] BlockStateMachine performance improvements --- .../AttributedStringConverter.swift | 66 +++++++------ Sources/HTMLStreamer/BlockState.swift | 99 ++++++++++--------- Sources/HTMLStreamer/TextConverter.swift | 42 ++++---- 3 files changed, 112 insertions(+), 95 deletions(-) diff --git a/Sources/HTMLStreamer/AttributedStringConverter.swift b/Sources/HTMLStreamer/AttributedStringConverter.swift index 580df6e..f9dd781 100644 --- a/Sources/HTMLStreamer/AttributedStringConverter.swift +++ b/Sources/HTMLStreamer/AttributedStringConverter.swift @@ -17,7 +17,7 @@ private typealias PlatformFont = UIFont private typealias PlatformFont = NSFont #endif -public struct AttributedStringConverter: BlockRenderer { +public class AttributedStringConverter { private let configuration: AttributedStringConverterConfiguration private var fontCache: [FontTrait: PlatformFont] = [:] @@ -26,14 +26,13 @@ public struct AttributedStringConverter: Blo private var actionStack: [ElementAction] = [] private var styleStack: [Style] = [] - var blockState = BlockState.start - var temporaryBuffer: String = "" + private var blockStateMachine = BlockStateMachine(blockBreak: "", lineBreak: "", listIndentForContentOutsideItem: "", append: { _ in }, removeChar: {}) private var currentElementIsEmpty = true private var previouslyFinishedListItem = false // The current run of text w/o styles changing private var currentRun: String = "" - public init(configuration: AttributedStringConverterConfiguration) where Callbacks == DefaultCallbacks { + public convenience init(configuration: AttributedStringConverterConfiguration) where Callbacks == DefaultCallbacks { self.init(configuration: configuration, callbacks: DefaultCallbacks.self) } @@ -41,14 +40,17 @@ public struct AttributedStringConverter: Blo self.configuration = configuration } - public mutating func convert(html: String) -> NSAttributedString { + public func convert(html: String) -> NSAttributedString { tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator()) str = NSMutableAttributedString() actionStack = [] styleStack = [] - blockState = .start - temporaryBuffer = "" + blockStateMachine = BlockStateMachine(blockBreak: "\n\n", lineBreak: "\n", listIndentForContentOutsideItem: "\t\t", append: { [unowned self] in + self.append($0) + }, removeChar: { [unowned self] in + self.removeChar() + }) currentElementIsEmpty = true previouslyFinishedListItem = false currentRun = "" @@ -57,13 +59,13 @@ public struct AttributedStringConverter: Blo switch token { case .character(let c): currentElementIsEmpty = false - if continueBlock(char: c) { + if blockStateMachine.continueBlock(char: c) { currentRun.unicodeScalars.append(c) } case .characterSequence(let s): currentElementIsEmpty = false for c in s.unicodeScalars { - if continueBlock(char: c) { + if blockStateMachine.continueBlock(char: c) { currentRun.unicodeScalars.append(c) } } @@ -93,15 +95,15 @@ public struct AttributedStringConverter: Blo } } - endBlocks() + blockStateMachine.endBlocks() finishRun() return str } - private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) { + private func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) { if name == "br" { - breakTag() + blockStateMachine.breakTag() return } // self closing tags are ignored since they have no content @@ -133,22 +135,22 @@ public struct AttributedStringConverter: Blo finishRun() styleStack.append(.monospace) case "pre": - startOrEndBlock() - startPreformatted() + blockStateMachine.startOrEndBlock() + blockStateMachine.startPreformatted() finishRun() styleStack.append(.monospace) case "blockquote": - startOrEndBlock() + blockStateMachine.startOrEndBlock() finishRun() styleStack.append(.blockquote) case "p": - startOrEndBlock() + blockStateMachine.startOrEndBlock() case "ol": - startOrEndBlock() + blockStateMachine.startOrEndBlock() finishRun() styleStack.append(.orderedList(nextElementOrdinal: 1)) case "ul": - startOrEndBlock() + blockStateMachine.startOrEndBlock() finishRun() styleStack.append(.unorderedList) case "li": @@ -161,14 +163,14 @@ public struct AttributedStringConverter: Blo } else { break } - startListItem() + blockStateMachine.startListItem() currentRun.append("\t\(marker)\t") default: break } } - private mutating func handleEndTag(_ name: String) { + private func handleEndTag(_ name: String) { switch name { case "a": if case .link(.some(_)) = lastStyle(.link) { @@ -190,28 +192,28 @@ public struct AttributedStringConverter: Blo case "pre": finishRun() removeLastStyle(.monospace) - startOrEndBlock() - endPreformatted() + blockStateMachine.startOrEndBlock() + blockStateMachine.endPreformatted() case "blockquote": finishRun() removeLastStyle(.blockquote) - startOrEndBlock() + blockStateMachine.startOrEndBlock() case "p": - startOrEndBlock() + blockStateMachine.startOrEndBlock() case "ol": finishRun() removeLastStyle(.orderedList) - startOrEndBlock() + blockStateMachine.startOrEndBlock() previouslyFinishedListItem = false case "ul": finishRun() removeLastStyle(.unorderedList) - startOrEndBlock() + blockStateMachine.startOrEndBlock() previouslyFinishedListItem = false case "li": finishRun() previouslyFinishedListItem = true - endListItem() + blockStateMachine.endListItem() default: break } @@ -229,11 +231,11 @@ public struct AttributedStringConverter: Blo "\t\t" } - mutating func append(_ s: String) { + func append(_ s: String) { currentRun.append(s) } - mutating func removeChar() { + func removeChar() { if currentRun.isEmpty { str.deleteCharacters(in: NSRange(location: str.length - 1, length: 1)) } else { @@ -243,7 +245,7 @@ public struct AttributedStringConverter: Blo // Finds the last currently-open style of the given type. // We can't just use the last one because we need to handle mis-nested tags. - private mutating func removeLastStyle(_ type: Style.StyleType) { + private func removeLastStyle(_ type: Style.StyleType) { var i = styleStack.index(before: styleStack.endIndex) while i >= styleStack.startIndex { if styleStack[i].type == type { @@ -278,7 +280,7 @@ public struct AttributedStringConverter: Blo return style }() - private mutating func finishRun() { + private func finishRun() { if actionStack.contains(.skip) { currentRun = "" return @@ -326,7 +328,7 @@ public struct AttributedStringConverter: Blo currentRun = "" } - private mutating func getFont(traits: FontTrait) -> PlatformFont? { + private func getFont(traits: FontTrait) -> PlatformFont? { if let cached = fontCache[traits] { return cached } diff --git a/Sources/HTMLStreamer/BlockState.swift b/Sources/HTMLStreamer/BlockState.swift index 86130e5..77711ed 100644 --- a/Sources/HTMLStreamer/BlockState.swift +++ b/Sources/HTMLStreamer/BlockState.swift @@ -19,17 +19,17 @@ import Foundation */ -protocol BlockRenderer { - var blockState: BlockState { get set } - var blockBreak: String { get } - var lineBreak: String { get } - var listIndentForContentOutsideItem: String { get } - var temporaryBuffer: String { get set } - mutating func append(_ s: String) - mutating func removeChar() +struct BlockStateMachine { + var blockState: BlockState = .start + let blockBreak: String + let lineBreak: String + let listIndentForContentOutsideItem: String + var temporaryBuffer: String = "" + let append: (String) -> Void + let removeChar: () -> Void } -extension BlockRenderer { +extension BlockStateMachine { mutating func startOrEndBlock() { switch blockState { case .start: @@ -85,16 +85,18 @@ extension BlockRenderer { } mutating func continueBlock(char: UnicodeScalar) -> Bool { + let isNewline = char == "\n" + let isWhitespace = isNewline || isWhitespace(char) switch blockState { case .start: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .nonEmptyBlock return true } case .emptyBlock: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .nonEmptyBlock @@ -102,7 +104,7 @@ extension BlockRenderer { return true } case .nonEmptyBlock: - if char.properties.isWhitespace { + if isWhitespace { blockState = .emittedSpace append(" ") return false @@ -110,14 +112,14 @@ extension BlockRenderer { return true } case .emittedSpace: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .nonEmptyBlock return true } case .lineBreakTag: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .nonEmptyBlock @@ -126,7 +128,7 @@ extension BlockRenderer { return true } case .atLeastTwoLineBreakTags: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .nonEmptyBlock @@ -135,7 +137,7 @@ extension BlockRenderer { return true } case .emptyBlockWithAtLeastTwoPreviousLineBreakTags: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .nonEmptyBlock @@ -144,14 +146,14 @@ extension BlockRenderer { return true } case .beginListItem: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .listItemContent return true } case .endListItem: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .listItemContent @@ -160,7 +162,7 @@ extension BlockRenderer { return true } case .listItemContent: - if char.properties.isWhitespace { + if isWhitespace { blockState = .emittedSpaceInListItemContent append(" ") return false @@ -168,14 +170,14 @@ extension BlockRenderer { return true } case .emittedSpaceInListItemContent: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .listItemContent return true } case .lineBreakTagInListItemContent: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .listItemContent @@ -184,7 +186,7 @@ extension BlockRenderer { return true } case .atLeastTwoLineBreakTagsInListItemContent: - if char.properties.isWhitespace { + if isWhitespace { return false } else { blockState = .listItemContent @@ -193,14 +195,14 @@ extension BlockRenderer { return true } case .preformattedStart(let depth): - if char == "\n" { + if isNewline { return false } else { blockState = .preformattedNonEmptyBlock(depth: depth) return true } case .preformattedEmptyBlock(depth: let depth): - if char.properties.isWhitespace { + if isWhitespace { blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth) temporaryBuffer.unicodeScalars.append(char) return false @@ -210,11 +212,11 @@ extension BlockRenderer { return true } case .preformattedNonEmptyBlock(let depth): - if char == "\n" { + if isNewline { blockState = .preformattedLineBreak(depth: depth) temporaryBuffer.append(lineBreak) return false - } else if char.properties.isWhitespace { + } else if isWhitespace { blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth) temporaryBuffer.unicodeScalars.append(char) return false @@ -222,11 +224,11 @@ extension BlockRenderer { return true } case .preformattedLineBreak(let depth): - if char == "\n" { + if isNewline { blockState = .preformattedAtLeastTwoLineBreaks(depth: depth) temporaryBuffer.append(lineBreak) return false - } else if char.properties.isWhitespace { + } else if isWhitespace { blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth) temporaryBuffer.unicodeScalars.append(char) return false @@ -237,7 +239,7 @@ extension BlockRenderer { return true } case .preformattedAtLeastTwoLineBreaks(let depth): - if char.properties.isWhitespace { + if isWhitespace { temporaryBuffer.unicodeScalars.append(char) return false } else { @@ -247,7 +249,7 @@ extension BlockRenderer { return true } case .afterPreStartTag(let depth): - if char == "\n" { + if isNewline { blockState = .preformattedEmptyBlock(depth: depth) return false } else { @@ -256,10 +258,10 @@ extension BlockRenderer { return true } case .afterPreStartTagWithLeadingWhitespace(let depth): - if char == "\n" { + if isNewline { blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth) return false - } else if char.properties.isWhitespace { + } else if isWhitespace { blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth) temporaryBuffer.unicodeScalars.append(char) return false @@ -270,11 +272,11 @@ extension BlockRenderer { return true } case .preformattedNonEmptyBlockWithTrailingWhitespace(let depth): - if char == "\n" { + if isNewline { blockState = .preformattedLineBreak(depth: depth) temporaryBuffer.append(lineBreak) return false - } else if char.properties.isWhitespace { + } else if isWhitespace { temporaryBuffer.unicodeScalars.append(char) return false } else { @@ -284,11 +286,11 @@ extension BlockRenderer { return true } case .preformattedEmptyBlockWithLeadingWhitespace(let depth): - if char == "\n" { + if isNewline { blockState = .preformattedLineBreak(depth: depth) temporaryBuffer.append(lineBreak) return false - } else if char.properties.isWhitespace { + } else if isWhitespace { temporaryBuffer.unicodeScalars.append(char) return false } else { @@ -566,13 +568,20 @@ enum BlockState: Equatable { case emittedSpaceInListItemContent case lineBreakTagInListItemContent case atLeastTwoLineBreakTagsInListItemContent - case preformattedStart(depth: Int) - case preformattedEmptyBlock(depth: Int) - case preformattedNonEmptyBlock(depth: Int) - case preformattedLineBreak(depth: Int) - case preformattedAtLeastTwoLineBreaks(depth: Int) - case afterPreStartTag(depth: Int) - case afterPreStartTagWithLeadingWhitespace(depth: Int) - case preformattedNonEmptyBlockWithTrailingWhitespace(depth: Int) - case preformattedEmptyBlockWithLeadingWhitespace(depth: Int) + case preformattedStart(depth: Int32) + case preformattedEmptyBlock(depth: Int32) + case preformattedNonEmptyBlock(depth: Int32) + case preformattedLineBreak(depth: Int32) + case preformattedAtLeastTwoLineBreaks(depth: Int32) + case afterPreStartTag(depth: Int32) + case afterPreStartTagWithLeadingWhitespace(depth: Int32) + case preformattedNonEmptyBlockWithTrailingWhitespace(depth: Int32) + case preformattedEmptyBlockWithLeadingWhitespace(depth: Int32) +} + +@inline(__always) +private func isWhitespace(_ c: UnicodeScalar) -> Bool { + // this is not strictly correct, but checking the actual unicode properties is slow + // and this should cover the vast majority of actual use + c == " " || c == "\n" || c == "\t" || c == "\u{A0}" /* NO-BREAK SPACE */ } diff --git a/Sources/HTMLStreamer/TextConverter.swift b/Sources/HTMLStreamer/TextConverter.swift index 75acc4a..267bb1b 100644 --- a/Sources/HTMLStreamer/TextConverter.swift +++ b/Sources/HTMLStreamer/TextConverter.swift @@ -7,19 +7,18 @@ import Foundation -public struct TextConverter: BlockRenderer { +public class TextConverter { private let configuration: TextConverterConfiguration private var tokenizer: Tokenizer! private var str: String! private var actionStack: [ElementAction] = [] - var blockState = BlockState.start - var temporaryBuffer: String = "" + var blockStateMachine = BlockStateMachine(blockBreak: "", lineBreak: "", listIndentForContentOutsideItem: "", append: { _ in }, removeChar: {}) private var currentElementIsEmpty = true private var currentRun = "" - public init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks { + public convenience init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks { self.init(configuration: configuration, callbacks: DefaultCallbacks.self) } @@ -27,12 +26,19 @@ public struct TextConverter: BlockRenderer { self.configuration = configuration } - public mutating func convert(html: String) -> String { + public func convert(html: String) -> String { tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator()) str = "" - blockState = .start - temporaryBuffer = "" + blockStateMachine = BlockStateMachine( + blockBreak: configuration.insertNewlines ? "\n\n" : " " , + lineBreak: configuration.insertNewlines ? "\n" : " " , + listIndentForContentOutsideItem: "", + append: { [unowned self] in + self.append($0) + }, removeChar: { [unowned self] in + self.removeChar() + }) currentElementIsEmpty = true currentRun = "" @@ -40,13 +46,13 @@ public struct TextConverter: BlockRenderer { switch token { case .character(let scalar): currentElementIsEmpty = false - if continueBlock(char: scalar) { + if blockStateMachine.continueBlock(char: scalar) { currentRun.unicodeScalars.append(scalar) } case .characterSequence(let string): currentElementIsEmpty = false for c in string.unicodeScalars { - if continueBlock(char: c) { + if blockStateMachine.continueBlock(char: c) { currentRun.unicodeScalars.append(c) } } @@ -71,27 +77,27 @@ public struct TextConverter: BlockRenderer { } } - endBlocks() + blockStateMachine.endBlocks() finishRun() return str } - private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) { + private func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) { switch name { case "br": - breakTag() + blockStateMachine.breakTag() case "pre", "blockquote", "p", "ol", "ul": - startOrEndBlock() + blockStateMachine.startOrEndBlock() default: break } } - private mutating func handleEndTag(_ name: String) { + private func handleEndTag(_ name: String) { switch name { case "pre", "blockquote", "p", "ol", "ul": - startOrEndBlock() + blockStateMachine.startOrEndBlock() finishRun() default: break @@ -118,11 +124,11 @@ public struct TextConverter: BlockRenderer { " " } - mutating func append(_ s: String) { + func append(_ s: String) { currentRun.append(s) } - mutating func removeChar() { + func removeChar() { if currentRun.isEmpty { str.removeLast() } else { @@ -130,7 +136,7 @@ public struct TextConverter: BlockRenderer { } } - private mutating func finishRun() { + private func finishRun() { if actionStack.contains(.skip) { currentRun = "" return