diff --git a/Sources/HTMLStreamer/AttributedStringConverter.swift b/Sources/HTMLStreamer/AttributedStringConverter.swift
index 580df6e..f9dd781 100644
--- a/Sources/HTMLStreamer/AttributedStringConverter.swift
+++ b/Sources/HTMLStreamer/AttributedStringConverter.swift
@@ -17,7 +17,7 @@ private typealias PlatformFont = UIFont
private typealias PlatformFont = NSFont
#endif
-public struct AttributedStringConverter: BlockRenderer {
+public class AttributedStringConverter {
private let configuration: AttributedStringConverterConfiguration
private var fontCache: [FontTrait: PlatformFont] = [:]
@@ -26,14 +26,13 @@ public struct AttributedStringConverter: Blo
private var actionStack: [ElementAction] = []
private var styleStack: [Style] = []
- var blockState = BlockState.start
- var temporaryBuffer: String = ""
+ private var blockStateMachine = BlockStateMachine(blockBreak: "", lineBreak: "", listIndentForContentOutsideItem: "", append: { _ in }, removeChar: {})
private var currentElementIsEmpty = true
private var previouslyFinishedListItem = false
// The current run of text w/o styles changing
private var currentRun: String = ""
- public init(configuration: AttributedStringConverterConfiguration) where Callbacks == DefaultCallbacks {
+ public convenience init(configuration: AttributedStringConverterConfiguration) where Callbacks == DefaultCallbacks { // `convenience` is required now that this is a class: the body only delegates via self.init, supplying DefaultCallbacks.self.
self.init(configuration: configuration, callbacks: DefaultCallbacks.self)
}
@@ -41,14 +40,17 @@ public struct AttributedStringConverter: Blo
self.configuration = configuration
}
- public mutating func convert(html: String) -> NSAttributedString {
+ public func convert(html: String) -> NSAttributedString {
tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
str = NSMutableAttributedString()
actionStack = []
styleStack = []
- blockState = .start
- temporaryBuffer = ""
+ blockStateMachine = BlockStateMachine(blockBreak: "\n\n", lineBreak: "\n", listIndentForContentOutsideItem: "\t\t", append: { [unowned self] in
+ self.append($0)
+ }, removeChar: { [unowned self] in
+ self.removeChar()
+ })
currentElementIsEmpty = true
previouslyFinishedListItem = false
currentRun = ""
@@ -57,13 +59,13 @@ public struct AttributedStringConverter: Blo
switch token {
case .character(let c):
currentElementIsEmpty = false
- if continueBlock(char: c) {
+ if blockStateMachine.continueBlock(char: c) {
currentRun.unicodeScalars.append(c)
}
case .characterSequence(let s):
currentElementIsEmpty = false
for c in s.unicodeScalars {
- if continueBlock(char: c) {
+ if blockStateMachine.continueBlock(char: c) {
currentRun.unicodeScalars.append(c)
}
}
@@ -93,15 +95,15 @@ public struct AttributedStringConverter: Blo
}
}
- endBlocks()
+ blockStateMachine.endBlocks()
finishRun()
return str
}
- private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
+ private func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
if name == "br" {
- breakTag()
+ blockStateMachine.breakTag()
return
}
// self closing tags are ignored since they have no content
@@ -133,22 +135,22 @@ public struct AttributedStringConverter: Blo
finishRun()
styleStack.append(.monospace)
case "pre":
- startOrEndBlock()
- startPreformatted()
+ blockStateMachine.startOrEndBlock()
+ blockStateMachine.startPreformatted()
finishRun()
styleStack.append(.monospace)
case "blockquote":
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock()
finishRun()
styleStack.append(.blockquote)
case "p":
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock()
case "ol":
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock()
finishRun()
styleStack.append(.orderedList(nextElementOrdinal: 1))
case "ul":
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock()
finishRun()
styleStack.append(.unorderedList)
case "li":
@@ -161,14 +163,14 @@ public struct AttributedStringConverter: Blo
} else {
break
}
- startListItem()
+ blockStateMachine.startListItem()
currentRun.append("\t\(marker)\t")
default:
break
}
}
- private mutating func handleEndTag(_ name: String) {
+ private func handleEndTag(_ name: String) {
switch name {
case "a":
if case .link(.some(_)) = lastStyle(.link) {
@@ -190,28 +192,28 @@ public struct AttributedStringConverter: Blo
case "pre":
finishRun()
removeLastStyle(.monospace)
- startOrEndBlock()
- endPreformatted()
+ blockStateMachine.startOrEndBlock()
+ blockStateMachine.endPreformatted()
case "blockquote":
finishRun()
removeLastStyle(.blockquote)
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock()
case "p":
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock()
case "ol":
finishRun()
removeLastStyle(.orderedList)
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock()
previouslyFinishedListItem = false
case "ul":
finishRun()
removeLastStyle(.unorderedList)
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock()
previouslyFinishedListItem = false
case "li":
finishRun()
previouslyFinishedListItem = true
- endListItem()
+ blockStateMachine.endListItem()
default:
break
}
@@ -229,11 +231,11 @@ public struct AttributedStringConverter: Blo
"\t\t"
}
- mutating func append(_ s: String) {
+ func append(_ s: String) { // Extends the pending run (`currentRun`); convert(html:) passes this to BlockStateMachine as its `append` callback. No `mutating` needed on a class method.
currentRun.append(s)
}
- mutating func removeChar() {
+ func removeChar() {
if currentRun.isEmpty {
str.deleteCharacters(in: NSRange(location: str.length - 1, length: 1))
} else {
@@ -243,7 +245,7 @@ public struct AttributedStringConverter: Blo
// Finds the last currently-open style of the given type.
// We can't just use the last one because we need to handle mis-nested tags.
- private mutating func removeLastStyle(_ type: Style.StyleType) {
+ private func removeLastStyle(_ type: Style.StyleType) {
var i = styleStack.index(before: styleStack.endIndex)
while i >= styleStack.startIndex {
if styleStack[i].type == type {
@@ -278,7 +280,7 @@ public struct AttributedStringConverter: Blo
return style
}()
- private mutating func finishRun() {
+ private func finishRun() {
if actionStack.contains(.skip) {
currentRun = ""
return
@@ -326,7 +328,7 @@ public struct AttributedStringConverter: Blo
currentRun = ""
}
- private mutating func getFont(traits: FontTrait) -> PlatformFont? {
+ private func getFont(traits: FontTrait) -> PlatformFont? {
if let cached = fontCache[traits] {
return cached
}
diff --git a/Sources/HTMLStreamer/BlockState.swift b/Sources/HTMLStreamer/BlockState.swift
index 86130e5..77711ed 100644
--- a/Sources/HTMLStreamer/BlockState.swift
+++ b/Sources/HTMLStreamer/BlockState.swift
@@ -19,17 +19,17 @@ import Foundation
*/
-protocol BlockRenderer {
- var blockState: BlockState { get set }
- var blockBreak: String { get }
- var lineBreak: String { get }
- var listIndentForContentOutsideItem: String { get }
- var temporaryBuffer: String { get set }
- mutating func append(_ s: String)
- mutating func removeChar()
+struct BlockStateMachine { // Block/whitespace state machine used by both converters; it stores no output itself and reaches its owner through the `append` / `removeChar` closures.
+ var blockState: BlockState = .start // Current state; a freshly created machine begins at `.start`.
+ let blockBreak: String // Supplied by the owner ("\n\n" from AttributedStringConverter; "\n\n" or " " from TextConverter depending on `insertNewlines`). NOTE(review): where it is emitted is not visible in this diff.
+ let lineBreak: String // Supplied by the owner ("\n", or " " for TextConverter without newlines); continueBlock(char:) queues it in `temporaryBuffer` when a newline is seen in preformatted states.
+ let listIndentForContentOutsideItem: String // NOTE(review): presumably prefixes list content that sits outside an `li`; its use is not visible in this diff — confirm.
+ var temporaryBuffer: String = "" // Collects whitespace and line breaks seen in preformatted states (see continueBlock(char:)); presumably flushed or dropped later — confirm.
+ let append: (String) -> Void // Callback that hands text to the owner, e.g. the single collapsed " " emitted by continueBlock(char:).
+ let removeChar: () -> Void // Callback into the owner's removeChar(); NOTE(review): call sites are outside this diff — presumably retracts the last emitted character.
}
-extension BlockRenderer {
+extension BlockStateMachine {
mutating func startOrEndBlock() {
switch blockState {
case .start:
@@ -85,16 +85,18 @@ extension BlockRenderer {
}
mutating func continueBlock(char: UnicodeScalar) -> Bool {
+ let isNewline = char == "\n"
+ let isWhitespace = isNewline || isWhitespace(char)
switch blockState {
case .start:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
return true
}
case .emptyBlock:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
@@ -102,7 +104,7 @@ extension BlockRenderer {
return true
}
case .nonEmptyBlock:
- if char.properties.isWhitespace {
+ if isWhitespace {
blockState = .emittedSpace
append(" ")
return false
@@ -110,14 +112,14 @@ extension BlockRenderer {
return true
}
case .emittedSpace:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
return true
}
case .lineBreakTag:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
@@ -126,7 +128,7 @@ extension BlockRenderer {
return true
}
case .atLeastTwoLineBreakTags:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
@@ -135,7 +137,7 @@ extension BlockRenderer {
return true
}
case .emptyBlockWithAtLeastTwoPreviousLineBreakTags:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .nonEmptyBlock
@@ -144,14 +146,14 @@ extension BlockRenderer {
return true
}
case .beginListItem:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .listItemContent
return true
}
case .endListItem:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .listItemContent
@@ -160,7 +162,7 @@ extension BlockRenderer {
return true
}
case .listItemContent:
- if char.properties.isWhitespace {
+ if isWhitespace {
blockState = .emittedSpaceInListItemContent
append(" ")
return false
@@ -168,14 +170,14 @@ extension BlockRenderer {
return true
}
case .emittedSpaceInListItemContent:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .listItemContent
return true
}
case .lineBreakTagInListItemContent:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .listItemContent
@@ -184,7 +186,7 @@ extension BlockRenderer {
return true
}
case .atLeastTwoLineBreakTagsInListItemContent:
- if char.properties.isWhitespace {
+ if isWhitespace {
return false
} else {
blockState = .listItemContent
@@ -193,14 +195,14 @@ extension BlockRenderer {
return true
}
case .preformattedStart(let depth):
- if char == "\n" {
+ if isNewline {
return false
} else {
blockState = .preformattedNonEmptyBlock(depth: depth)
return true
}
case .preformattedEmptyBlock(depth: let depth):
- if char.properties.isWhitespace {
+ if isWhitespace {
blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
temporaryBuffer.unicodeScalars.append(char)
return false
@@ -210,11 +212,11 @@ extension BlockRenderer {
return true
}
case .preformattedNonEmptyBlock(let depth):
- if char == "\n" {
+ if isNewline {
blockState = .preformattedLineBreak(depth: depth)
temporaryBuffer.append(lineBreak)
return false
- } else if char.properties.isWhitespace {
+ } else if isWhitespace {
blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth)
temporaryBuffer.unicodeScalars.append(char)
return false
@@ -222,11 +224,11 @@ extension BlockRenderer {
return true
}
case .preformattedLineBreak(let depth):
- if char == "\n" {
+ if isNewline {
blockState = .preformattedAtLeastTwoLineBreaks(depth: depth)
temporaryBuffer.append(lineBreak)
return false
- } else if char.properties.isWhitespace {
+ } else if isWhitespace {
blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth)
temporaryBuffer.unicodeScalars.append(char)
return false
@@ -237,7 +239,7 @@ extension BlockRenderer {
return true
}
case .preformattedAtLeastTwoLineBreaks(let depth):
- if char.properties.isWhitespace {
+ if isWhitespace {
temporaryBuffer.unicodeScalars.append(char)
return false
} else {
@@ -247,7 +249,7 @@ extension BlockRenderer {
return true
}
case .afterPreStartTag(let depth):
- if char == "\n" {
+ if isNewline {
blockState = .preformattedEmptyBlock(depth: depth)
return false
} else {
@@ -256,10 +258,10 @@ extension BlockRenderer {
return true
}
case .afterPreStartTagWithLeadingWhitespace(let depth):
- if char == "\n" {
+ if isNewline {
blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
return false
- } else if char.properties.isWhitespace {
+ } else if isWhitespace {
blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
temporaryBuffer.unicodeScalars.append(char)
return false
@@ -270,11 +272,11 @@ extension BlockRenderer {
return true
}
case .preformattedNonEmptyBlockWithTrailingWhitespace(let depth):
- if char == "\n" {
+ if isNewline {
blockState = .preformattedLineBreak(depth: depth)
temporaryBuffer.append(lineBreak)
return false
- } else if char.properties.isWhitespace {
+ } else if isWhitespace {
temporaryBuffer.unicodeScalars.append(char)
return false
} else {
@@ -284,11 +286,11 @@ extension BlockRenderer {
return true
}
case .preformattedEmptyBlockWithLeadingWhitespace(let depth):
- if char == "\n" {
+ if isNewline {
blockState = .preformattedLineBreak(depth: depth)
temporaryBuffer.append(lineBreak)
return false
- } else if char.properties.isWhitespace {
+ } else if isWhitespace {
temporaryBuffer.unicodeScalars.append(char)
return false
} else {
@@ -566,13 +568,20 @@ enum BlockState: Equatable {
case emittedSpaceInListItemContent
case lineBreakTagInListItemContent
case atLeastTwoLineBreakTagsInListItemContent
- case preformattedStart(depth: Int)
- case preformattedEmptyBlock(depth: Int)
- case preformattedNonEmptyBlock(depth: Int)
- case preformattedLineBreak(depth: Int)
- case preformattedAtLeastTwoLineBreaks(depth: Int)
- case afterPreStartTag(depth: Int)
- case afterPreStartTagWithLeadingWhitespace(depth: Int)
- case preformattedNonEmptyBlockWithTrailingWhitespace(depth: Int)
- case preformattedEmptyBlockWithLeadingWhitespace(depth: Int)
+ case preformattedStart(depth: Int32)
+ case preformattedEmptyBlock(depth: Int32)
+ case preformattedNonEmptyBlock(depth: Int32)
+ case preformattedLineBreak(depth: Int32)
+ case preformattedAtLeastTwoLineBreaks(depth: Int32)
+ case afterPreStartTag(depth: Int32)
+ case afterPreStartTagWithLeadingWhitespace(depth: Int32)
+ case preformattedNonEmptyBlockWithTrailingWhitespace(depth: Int32)
+ case preformattedEmptyBlockWithLeadingWhitespace(depth: Int32)
+}
+
+@inline(__always)
+private func isWhitespace(_ c: UnicodeScalar) -> Bool {
+ // Fast path: HTML ASCII whitespace (space, LF, TAB, CR, FF) plus NO-BREAK SPACE, instead of the slow `properties.isWhitespace` lookup.
+ // Not every Unicode whitespace scalar is covered, but CR and FF are included so CRLF input still collapses as it did before.
+ c == " " || c == "\n" || c == "\t" || c == "\r" || c == "\u{0C}" /* FORM FEED */ || c == "\u{A0}" /* NO-BREAK SPACE */
}
diff --git a/Sources/HTMLStreamer/TextConverter.swift b/Sources/HTMLStreamer/TextConverter.swift
index 75acc4a..267bb1b 100644
--- a/Sources/HTMLStreamer/TextConverter.swift
+++ b/Sources/HTMLStreamer/TextConverter.swift
@@ -7,19 +7,18 @@
import Foundation
-public struct TextConverter: BlockRenderer {
+public class TextConverter {
private let configuration: TextConverterConfiguration
private var tokenizer: Tokenizer!
private var str: String!
private var actionStack: [ElementAction] = []
- var blockState = BlockState.start
- var temporaryBuffer: String = ""
+ var blockStateMachine = BlockStateMachine(blockBreak: "", lineBreak: "", listIndentForContentOutsideItem: "", append: { _ in }, removeChar: {})
private var currentElementIsEmpty = true
private var currentRun = ""
- public init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks {
+ public convenience init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks { // `convenience` is required now that this is a class: the body only delegates via self.init, supplying DefaultCallbacks.self.
self.init(configuration: configuration, callbacks: DefaultCallbacks.self)
}
@@ -27,12 +26,19 @@ public struct TextConverter: BlockRenderer {
self.configuration = configuration
}
- public mutating func convert(html: String) -> String {
+ public func convert(html: String) -> String {
tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
str = ""
- blockState = .start
- temporaryBuffer = ""
+ blockStateMachine = BlockStateMachine(
+ blockBreak: configuration.insertNewlines ? "\n\n" : " " ,
+ lineBreak: configuration.insertNewlines ? "\n" : " " ,
+ listIndentForContentOutsideItem: "",
+ append: { [unowned self] in
+ self.append($0)
+ }, removeChar: { [unowned self] in
+ self.removeChar()
+ })
currentElementIsEmpty = true
currentRun = ""
@@ -40,13 +46,13 @@ public struct TextConverter: BlockRenderer {
switch token {
case .character(let scalar):
currentElementIsEmpty = false
- if continueBlock(char: scalar) {
+ if blockStateMachine.continueBlock(char: scalar) {
currentRun.unicodeScalars.append(scalar)
}
case .characterSequence(let string):
currentElementIsEmpty = false
for c in string.unicodeScalars {
- if continueBlock(char: c) {
+ if blockStateMachine.continueBlock(char: c) {
currentRun.unicodeScalars.append(c)
}
}
@@ -71,27 +77,27 @@ public struct TextConverter: BlockRenderer {
}
}
- endBlocks()
+ blockStateMachine.endBlocks()
finishRun()
return str
}
- private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
+ private func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) { // Routes a start tag to the block state machine by name only; `selfClosing` and `attributes` are not consulted, and unlisted tags are ignored.
switch name {
case "br":
- breakTag()
+ blockStateMachine.breakTag() // <br> is reported to the state machine as a line-break tag.
case "pre", "blockquote", "p", "ol", "ul":
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock() // All five block-level tags are handled identically: a block boundary.
default:
break
}
}
- private mutating func handleEndTag(_ name: String) {
+ private func handleEndTag(_ name: String) {
switch name {
case "pre", "blockquote", "p", "ol", "ul":
- startOrEndBlock()
+ blockStateMachine.startOrEndBlock()
finishRun()
default:
break
@@ -118,11 +124,11 @@ public struct TextConverter: BlockRenderer {
" "
}
- mutating func append(_ s: String) {
+ func append(_ s: String) { // Extends the pending run (`currentRun`); convert(html:) passes this to BlockStateMachine as its `append` callback. No `mutating` needed on a class method.
currentRun.append(s)
}
- mutating func removeChar() {
+ func removeChar() {
if currentRun.isEmpty {
str.removeLast()
} else {
@@ -130,7 +136,7 @@ public struct TextConverter: BlockRenderer {
}
}
- private mutating func finishRun() {
+ private func finishRun() {
if actionStack.contains(.skip) {
currentRun = ""
return