HTMLStreamer/Sources/HTMLStreamer/TextConverter.swift

177 lines
5.2 KiB
Swift
Raw Normal View History

2023-12-23 01:30:29 +00:00
//
// TextConverter.swift
// HTMLStreamer
//
// Created by Shadowfacts on 12/19/23.
//
import Foundation
/// Converts an HTML string to plain text.
///
/// The input is tokenized with `Tokenizer`. Character tokens are accumulated into a
/// pending run, and block-level structure (`<p>`, `<br>`, lists, ...) is delegated to
/// `BlockStateMachine`, which calls back into this converter to insert or remove
/// separator text. `Callbacks.elementAction(name:attributes:)` is consulted for every
/// start tag and may skip, replace, or append text for that element.
public class TextConverter<Callbacks: HTMLConversionCallbacks> {

    private let configuration: TextConverterConfiguration

    private var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator>!

    /// Text that has been flushed so far during the current conversion.
    private var str = ""

    /// One action per currently-open element, innermost last.
    private var actionStack: [ElementAction] = [] {
        didSet {
            // Cache whether any open element suppresses its text so the per-character
            // path in `convert(html:)` does not have to scan the stack.
            hasSkipOrReplaceElementAction = actionStack.contains(where: {
                switch $0 {
                case .skip, .replace(_):
                    true
                default:
                    false
                }
            })
        }
    }

    private var hasSkipOrReplaceElementAction = false

    // Placeholder with no-op callbacks; `convert(html:)` installs the real one.
    var blockStateMachine = BlockStateMachine(blockBreak: "", lineBreak: "", listIndentForContentOutsideItem: "", append: { _ in }, removeChar: {})

    private var currentElementIsEmpty = true

    /// Text accumulated since the last flush; moved into `str` by `finishRun()`.
    private var currentRun = ""

    /// Creates a converter that uses `DefaultCallbacks`.
    public convenience init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks {
        self.init(configuration: configuration, callbacks: DefaultCallbacks.self)
    }

    /// Creates a converter.
    /// - Parameters:
    ///   - configuration: Options controlling the produced text.
    ///   - callbacks: Only used to let the caller spell out the `Callbacks` type.
    public init(configuration: TextConverterConfiguration = .init(), callbacks _: Callbacks.Type = Callbacks.self) {
        self.configuration = configuration
    }

    /// Converts `html` to plain text.
    ///
    /// All per-conversion state is reset first, so the same instance can be reused for
    /// multiple independent inputs.
    /// - Parameter html: The HTML source to convert.
    /// - Returns: The text produced for `html`.
    public func convert(html: String) -> String {
        tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
        str = ""
        // Unclosed tags in a previous input leave actions on the stack; without this
        // reset a stale `.skip`/`.replace` would suppress text in this conversion.
        actionStack = []
        // NOTE(review): the `listIndentForContentOutsideItem` property below returns a
        // different literal than the "" passed here — confirm which one is intended.
        blockStateMachine = BlockStateMachine(
            blockBreak: blockBreak,
            lineBreak: lineBreak,
            listIndentForContentOutsideItem: "",
            append: { [unowned self] in
                self.append($0)
            }, removeChar: { [unowned self] in
                self.removeChar()
            })
        currentElementIsEmpty = true
        currentRun = ""

        while let token = tokenizer.next() {
            switch token {
            case .character(let scalar):
                currentElementIsEmpty = false
                // The state machine is always consulted (it may emit pending breaks);
                // the character itself is kept only outside skipped/replaced elements.
                if blockStateMachine.continueBlock(char: scalar),
                   !hasSkipOrReplaceElementAction {
                    currentRun.unicodeScalars.append(scalar)
                }
            case .characterSequence(let string):
                currentElementIsEmpty = false
                for c in string.unicodeScalars {
                    if blockStateMachine.continueBlock(char: c),
                       !hasSkipOrReplaceElementAction {
                        currentRun.unicodeScalars.append(c)
                    }
                }
            case .startTag(let name, let selfClosing, let attributes):
                currentElementIsEmpty = true
                let action = Callbacks.elementAction(name: name, attributes: attributes)
                if action != .default {
                    // Flush before pushing, so text gathered so far is finished under
                    // the enclosing element's action.
                    finishRun()
                }
                actionStack.append(action)
                handleStartTag(name, selfClosing: selfClosing, attributes: attributes)
            case .endTag(let name):
                handleEndTag(name)
                if let action = actionStack.last {
                    // NOTE(review): for block-level end tags `handleEndTag` has already
                    // called `finishRun()`; with an `.append`/`.replace` action this
                    // second call appears to add the associated text again — confirm.
                    if action != .default {
                        finishRun()
                    }
                    actionStack.removeLast()
                }
            case .comment, .doctype:
                break
            }
        }

        blockStateMachine.endBlocks()
        finishRun()
        return str
    }

    /// Forwards start tags that affect block structure to the block state machine.
    private func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
        switch name {
        case "br":
            blockStateMachine.breakTag()
        case "pre", "blockquote", "p", "ol", "ul":
            blockStateMachine.startOrEndBlock()
        default:
            break
        }
    }

    /// Forwards block-level end tags to the block state machine and flushes the run.
    private func handleEndTag(_ name: String) {
        switch name {
        case "pre", "blockquote", "p", "ol", "ul":
            blockStateMachine.startOrEndBlock()
            finishRun()
        default:
            break
        }
    }

    /// Separator handed to the block state machine as its block break.
    var blockBreak: String {
        if configuration.insertNewlines {
            "\n\n"
        } else {
            " "
        }
    }

    /// Separator handed to the block state machine as its line break.
    var lineBreak: String {
        if configuration.insertNewlines {
            "\n"
        } else {
            " "
        }
    }

    var listIndentForContentOutsideItem: String {
        " "
    }

    /// Appends `s` to the pending run. Installed as the block state machine's `append` callback.
    func append(_ s: String) {
        currentRun.append(s)
    }

    /// Removes the most recently emitted character: from the pending run if it has any,
    /// otherwise from the already-flushed text. Installed as the block state machine's
    /// `removeChar` callback.
    func removeChar() {
        if !currentRun.isEmpty {
            currentRun.removeLast()
        } else if !str.isEmpty {
            // `removeLast()` traps on an empty string, so only remove when there is
            // something to remove.
            str.removeLast()
        }
    }

    /// Moves the pending run into `str`.
    ///
    /// If the innermost open element's action is `.append` or `.replace`, its associated
    /// string is added to the run first. Nothing is written when the run ends up empty.
    private func finishRun() {
        if case .append(let s) = actionStack.last {
            currentRun.append(s)
        } else if case .replace(let replacement) = actionStack.last {
            currentRun.append(replacement)
        }
        guard !currentRun.isEmpty else {
            return
        }
        str.append(currentRun)
        currentRun = ""
    }
}
/// Options for `TextConverter`.
public struct TextConverterConfiguration {

    /// When `true`, `TextConverter` separates blocks and line breaks with newline
    /// characters; when `false`, it uses spaces instead.
    public var insertNewlines: Bool

    /// Creates a configuration.
    /// - Parameter insertNewlines: See `insertNewlines`. Defaults to `true`.
    public init(insertNewlines: Bool = true) {
        self.insertNewlines = insertNewlines
    }
}