HTML to plain text conversion

This commit is contained in:
Shadowfacts 2023-12-22 20:30:29 -05:00
parent 601c9f2cd8
commit 5bdfda71e6
5 changed files with 247 additions and 38 deletions

View File

@ -17,7 +17,7 @@ private typealias PlatformFont = UIFont
private typealias PlatformFont = NSFont
#endif
public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
private let configuration: AttributedStringConverterConfiguration
private var fontCache: [FontTrait: PlatformFont] = [:]
@ -77,7 +77,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
return str
}
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [HTMLStreamer.Attribute]) {
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
if name == "br" {
currentRun.append("\n")
return
@ -313,37 +313,6 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
}
}
public protocol AttributedStringCallbacks {
static func makeURL(string: String) -> URL?
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction
}
public enum ElementAction: Equatable {
case `default`
case skip
case replace(String)
var isReplace: Bool {
if case .replace(_) = self {
true
} else {
false
}
}
}
public extension AttributedStringCallbacks {
static func makeURL(string: String) -> URL? {
URL(string: string)
}
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
.default
}
}
public struct DefaultCallbacks: AttributedStringCallbacks {
}
public struct AttributedStringConverterConfiguration {
#if os(iOS)
public var font: UIFont

View File

@ -0,0 +1,39 @@
//
// HTMLConversionCallbacks.swift
// HTMLStreamer
//
// Created by Shadowfacts on 12/22/23.
//
import Foundation
/// Customization points shared by the HTML converters.
///
/// All requirements are static, so a conforming type is passed as a generic
/// parameter and never instantiated by the converter.
public protocol HTMLConversionCallbacks {
/// Builds a `URL` from a string taken from the HTML being converted.
/// Returning `nil` means no URL could be produced.
/// NOTE(review): the call sites are not visible here; presumably this receives link `href` values. Confirm.
static func makeURL(string: String) -> URL?
/// Decides how an element should be treated, given its tag name and attributes.
/// `TextConverter` calls this once for every start tag it encounters.
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction
}
/// What a converter should do with an element, as decided by
/// `HTMLConversionCallbacks.elementAction(name:attributes:)`.
public enum ElementAction: Equatable {
    /// Handle the element normally.
    case `default`
    /// Drop the element's text content.
    case skip
    /// Emit the associated string in place of the element's text content.
    case replace(String)

    /// `true` only for `.replace`, regardless of the replacement string.
    var isReplace: Bool {
        switch self {
        case .replace:
            return true
        case .default, .skip:
            return false
        }
    }
}
/// Default implementations, so conforming types only override what they need.
public extension HTMLConversionCallbacks {
    /// Parses the string with Foundation's `URL(string:)` initializer.
    static func makeURL(string: String) -> URL? {
        return URL(string: string)
    }

    /// Applies no special treatment: every element gets `.default`.
    static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
        return ElementAction.default
    }
}
/// Callbacks type that overrides nothing and so uses the default
/// implementations provided for `HTMLConversionCallbacks`.
public struct DefaultCallbacks: HTMLConversionCallbacks {
}

View File

@ -0,0 +1,135 @@
//
// TextConverter.swift
// HTMLStreamer
//
// Created by Shadowfacts on 12/19/23.
//
import Foundation
/// Converts an HTML string to plain text.
///
/// Tags are stripped. `<br>` and the block elements `pre`, `blockquote`, `p`,
/// `ol` and `ul` are turned into whitespace according to the configuration.
/// `Callbacks.elementAction(name:attributes:)` is consulted for every start
/// tag and can skip or replace the text content of individual elements.
public struct TextConverter<Callbacks: HTMLConversionCallbacks> {
    private let configuration: TextConverterConfiguration

    /// Output accumulated so far by the conversion in progress.
    private var str = ""
    /// One entry per currently open element, innermost last.
    private var actionStack: [ElementAction] = []
    /// Set when a block element has closed and the separator that must precede
    /// the next emitted run has not been written yet.
    private var previouslyFinishedBlockElement = false
    /// Text collected since the last call to `finishRun()`.
    private var currentRun = ""

    public init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks {
        self.init(configuration: configuration, callbacks: DefaultCallbacks.self)
    }

    public init(configuration: TextConverterConfiguration = .init(), callbacks _: Callbacks.Type = Callbacks.self) {
        self.configuration = configuration
    }

    /// Converts `html` to plain text.
    ///
    /// The converter may be reused: every call starts from a clean state.
    ///
    /// - Parameter html: The HTML source. Unbalanced or unmatched tags are tolerated.
    /// - Returns: The text content of `html` with the configured whitespace inserted.
    public mutating func convert(html: String) -> String {
        // Reset all per-conversion state so that leftovers from a previous call
        // (a pending block separator, unclosed elements) cannot leak into this one.
        str = ""
        currentRun = ""
        actionStack.removeAll(keepingCapacity: true)
        previouslyFinishedBlockElement = false

        var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator> = Tokenizer(chars: html.unicodeScalars.makeIterator())
        while let token = tokenizer.next() {
            switch token {
            case .character(let scalar):
                currentRun.unicodeScalars.append(scalar)
            case .characterSequence(let string):
                currentRun.append(string)
            case .startTag(let name, let selfClosing, let attributes):
                let action = Callbacks.elementAction(name: name, attributes: attributes)
                if action != .default {
                    // Emit the text that precedes this element before its action
                    // takes effect; otherwise that text would be skipped/replaced too.
                    finishRun()
                }
                actionStack.append(action)
                handleStartTag(name, selfClosing: selfClosing, attributes: attributes)
                if Self.isVoidElement(name) {
                    // Void elements never get an end tag, so close them right away
                    // to keep the action stack balanced.
                    closeCurrentElement()
                }
            case .endTag(let name):
                // A stray end tag for a void element (e.g. `</br>`) has nothing to close.
                if !Self.isVoidElement(name) {
                    handleEndTag(name)
                    closeCurrentElement()
                }
            case .comment, .doctype:
                break
            }
        }

        finishRun()
        return str
    }

    /// Whether `name` is an HTML void element, i.e. one that cannot have content
    /// or an end tag.
    private static func isVoidElement(_ name: String) -> Bool {
        switch name {
        case "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "source", "track", "wbr":
            return true
        default:
            return false
        }
    }

    /// Pops the innermost element's action, flushing the current run first when
    /// that action is not `.default` so the action is applied to the element's text.
    /// Does nothing when no element is open (unmatched end tag).
    private mutating func closeCurrentElement() {
        guard let action = actionStack.last else {
            return
        }
        if action != .default {
            finishRun()
        }
        actionStack.removeLast()
    }

    /// Applies the whitespace effects of a start tag.
    private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
        switch name {
        case "br":
            if configuration.insertNewlines {
                currentRun.append("\n")
            } else {
                currentRun.append(" ")
            }
        case "pre", "blockquote", "p", "ol", "ul":
            // Flush text that precedes the block first: it must receive any
            // separator still owed by an earlier block, and `startBlockElement`
            // needs `str` to reflect it when deciding whether a separator is needed.
            finishRun()
            startBlockElement()
            finishRun()
        default:
            break
        }
    }

    /// Queues the separator that precedes a block element, unless nothing has
    /// been emitted yet.
    private mutating func startBlockElement() {
        if !str.isEmpty {
            // The separator written here also covers a directly preceding block.
            previouslyFinishedBlockElement = false
            if configuration.insertNewlines {
                currentRun.append("\n\n")
            } else {
                currentRun.append(" ")
            }
        }
    }

    /// Applies the whitespace effects of an end tag.
    private mutating func handleEndTag(_ name: String) {
        switch name {
        case "pre", "blockquote", "p", "ol", "ul":
            finishRun()
            finishBlockElement()
        default:
            break
        }
    }

    /// Records that a block element has ended, so the next emitted run is
    /// separated from it.
    private mutating func finishBlockElement() {
        if !str.isEmpty {
            previouslyFinishedBlockElement = true
        }
    }

    /// Moves the current run into the output, honoring the actions of the
    /// currently open elements.
    private mutating func finishRun() {
        guard !currentRun.isEmpty else {
            return
        }
        if actionStack.contains(.skip) {
            // A skip anywhere in the open elements wins over everything else.
            currentRun = ""
            return
        } else if case .replace(let replacement) = actionStack.first(where: \.isReplace) {
            // The outermost replacing element determines the text.
            currentRun = replacement
        }
        if previouslyFinishedBlockElement {
            previouslyFinishedBlockElement = false
            if configuration.insertNewlines {
                currentRun.insert(contentsOf: "\n\n", at: currentRun.startIndex)
            } else {
                currentRun.insert(" ", at: currentRun.startIndex)
            }
        }
        str.append(currentRun)
        currentRun = ""
    }
}
/// Options controlling the whitespace that `TextConverter` produces.
public struct TextConverterConfiguration {
    /// When `true`, `<br>` becomes a newline and block elements are separated
    /// by a blank line. When `false`, a single space is used in both cases.
    public var insertNewlines: Bool

    /// - Parameter insertWhitespace: Initial value for `insertNewlines`; defaults to `true`.
    public init(insertWhitespace: Bool = true) {
        insertNewlines = insertWhitespace
    }
}

View File

@ -41,7 +41,7 @@ final class AttributedStringConverterTests: XCTestCase {
convert(html, callbacks: DefaultCallbacks.self)
}
private func convert<Callbacks: AttributedStringCallbacks>(_ html: String, callbacks _: Callbacks.Type = Callbacks.self) -> NSAttributedString {
private func convert<Callbacks: HTMLConversionCallbacks>(_ html: String, callbacks _: Callbacks.Type = Callbacks.self) -> NSAttributedString {
let config = AttributedStringConverterConfiguration(
font: font,
monospaceFont: monospaceFont,
@ -212,7 +212,7 @@ final class AttributedStringConverterTests: XCTestCase {
}
func testMakeURLCallback() {
struct Callbacks: AttributedStringCallbacks {
struct Callbacks: HTMLConversionCallbacks {
static func makeURL(string: String) -> URL? {
URL(string: "https://apple.com")
}
@ -226,7 +226,7 @@ final class AttributedStringConverterTests: XCTestCase {
}
func testElementActionCallback() {
struct Callbacks: AttributedStringCallbacks {
struct Callbacks: HTMLConversionCallbacks {
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
let clazz = attributes.attributeValue(for: "class")
if clazz == "invisible" {
@ -240,8 +240,8 @@ final class AttributedStringConverterTests: XCTestCase {
}
let skipped = convert("<span class='invisible'>test</span>", callbacks: Callbacks.self)
XCTAssertEqual(skipped, NSAttributedString())
let skipNestped = convert("<span class='invisible'><b>test</b></span>", callbacks: Callbacks.self)
XCTAssertEqual(skipNestped, NSAttributedString())
let skipNested = convert("<span class='invisible'><b>test</b></span>", callbacks: Callbacks.self)
XCTAssertEqual(skipNested, NSAttributedString())
let skipNestped2 = convert("<b><span class='invisible'>test</span></b>", callbacks: Callbacks.self)
XCTAssertEqual(skipNestped2, NSAttributedString())
let replaced = convert("<span class='ellipsis'>test</span>", callbacks: Callbacks.self)

View File

@ -0,0 +1,66 @@
//
// TextConverterTests.swift
//
//
// Created by Shadowfacts on 12/22/23.
//
import XCTest
@testable import HTMLStreamer
/// Tests for converting HTML to plain text with `TextConverter`.
final class TextConverterTests: XCTestCase {

    /// Converts with the default callbacks.
    private func convert(_ html: String, configuration: TextConverterConfiguration = .init()) -> String {
        convert(html, configuration: configuration, callbacks: DefaultCallbacks.self)
    }

    /// Converts with a fresh converter specialized on the given callbacks type.
    private func convert<Callbacks: HTMLConversionCallbacks>(_ html: String, configuration: TextConverterConfiguration = .init(), callbacks _: Callbacks.Type = Callbacks.self) -> String {
        var converter = TextConverter<Callbacks>(configuration: configuration)
        return converter.convert(html: html)
    }

    func testConvertBR() {
        XCTAssertEqual(convert("a<br>b"), "a\nb")
        XCTAssertEqual(convert("a<br />b"), "a\nb")
        XCTAssertEqual(convert("a<br>b", configuration: .init(insertWhitespace: false)), "a b")
    }

    func testConvertA() {
        XCTAssertEqual(convert("<a href='https://example.com'>link</a>"), "link")
    }

    func testIncorrectNesting() {
        XCTAssertEqual(convert("<strong>bold <em>both</strong> italic</em>"), "bold both italic")
    }

    func testTextAfterBlockElement() {
        XCTAssertEqual(convert("<blockquote>wee</blockquote>after"), "wee\n\nafter")
        XCTAssertEqual(convert("<blockquote>wee</blockquote>after", configuration: .init(insertWhitespace: false)), "wee after")
    }

    func testMultipleBlockElements() {
        XCTAssertEqual(convert("<blockquote>a</blockquote><blockquote>b</blockquote>"), "a\n\nb")
        XCTAssertEqual(convert("<blockquote>a</blockquote><blockquote>b</blockquote>", configuration: .init(insertWhitespace: false)), "a b")
    }

    func testElementActionCallback() {
        struct Callbacks: HTMLConversionCallbacks {
            static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
                let clazz = attributes.attributeValue(for: "class")
                if clazz == "invisible" {
                    return .skip
                } else if clazz == "ellipsis" {
                    // Must be non-empty: an empty replacement would be
                    // indistinguishable from skipping the element.
                    return .replace("…")
                } else {
                    return .default
                }
            }
        }
        let skipped = convert("<span class='invisible'>test</span>", callbacks: Callbacks.self)
        XCTAssertEqual(skipped, "")
        let skipNested = convert("<span class='invisible'><b>test</b></span>", callbacks: Callbacks.self)
        XCTAssertEqual(skipNested, "")
        let skipNested2 = convert("<b><span class='invisible'>test</span></b>", callbacks: Callbacks.self)
        XCTAssertEqual(skipNested2, "")
        let replaced = convert("<span class='ellipsis'>test</span>", callbacks: Callbacks.self)
        XCTAssertEqual(replaced, "…")
    }
}