From 5bdfda71e6e8b49a90cd726bdcc992983b55fccf Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Fri, 22 Dec 2023 20:30:29 -0500 Subject: [PATCH] HTML to plain text conversion --- .../AttributedStringConverter.swift | 35 +---- .../HTMLConversionCallbacks.swift | 39 +++++ Sources/HTMLStreamer/TextConverter.swift | 135 ++++++++++++++++++ .../AttributedStringConverterTests.swift | 10 +- .../TextConverterTests.swift | 66 +++++++++ 5 files changed, 247 insertions(+), 38 deletions(-) create mode 100644 Sources/HTMLStreamer/HTMLConversionCallbacks.swift create mode 100644 Sources/HTMLStreamer/TextConverter.swift create mode 100644 Tests/HTMLStreamerTests/TextConverterTests.swift diff --git a/Sources/HTMLStreamer/AttributedStringConverter.swift b/Sources/HTMLStreamer/AttributedStringConverter.swift index 569776c..29b0626 100644 --- a/Sources/HTMLStreamer/AttributedStringConverter.swift +++ b/Sources/HTMLStreamer/AttributedStringConverter.swift @@ -17,7 +17,7 @@ private typealias PlatformFont = UIFont private typealias PlatformFont = NSFont #endif -public struct AttributedStringConverter { +public struct AttributedStringConverter { private let configuration: AttributedStringConverterConfiguration private var fontCache: [FontTrait: PlatformFont] = [:] @@ -77,7 +77,7 @@ public struct AttributedStringConverter { return str } - private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [HTMLStreamer.Attribute]) { + private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) { if name == "br" { currentRun.append("\n") return @@ -313,37 +313,6 @@ public struct AttributedStringConverter { } } -public protocol AttributedStringCallbacks { - static func makeURL(string: String) -> URL? - static func elementAction(name: String, attributes: [Attribute]) -> ElementAction -} - -public enum ElementAction: Equatable { - case `default` - case skip - case replace(String) - - var isReplace: Bool { - if case .replace(_) = self { - true - } else { - false - } - } -} - -public extension AttributedStringCallbacks { - static func makeURL(string: String) -> URL? { - URL(string: string) - } - static func elementAction(name: String, attributes: [Attribute]) -> ElementAction { - .default - } -} - -public struct DefaultCallbacks: AttributedStringCallbacks { -} - public struct AttributedStringConverterConfiguration { #if os(iOS) public var font: UIFont diff --git a/Sources/HTMLStreamer/HTMLConversionCallbacks.swift b/Sources/HTMLStreamer/HTMLConversionCallbacks.swift new file mode 100644 index 0000000..44d40c1 --- /dev/null +++ b/Sources/HTMLStreamer/HTMLConversionCallbacks.swift @@ -0,0 +1,39 @@ +// +// HTMLConversionCallbacks.swift +// HTMLStreamer +// +// Created by Shadowfacts on 12/22/23. +// + +import Foundation + +public protocol HTMLConversionCallbacks { + static func makeURL(string: String) -> URL? + static func elementAction(name: String, attributes: [Attribute]) -> ElementAction +} + +public enum ElementAction: Equatable { + case `default` + case skip + case replace(String) + + var isReplace: Bool { + if case .replace(_) = self { + true + } else { + false + } + } +} + +public extension HTMLConversionCallbacks { + static func makeURL(string: String) -> URL? { + URL(string: string) + } + static func elementAction(name: String, attributes: [Attribute]) -> ElementAction { + .default + } +} + +public struct DefaultCallbacks: HTMLConversionCallbacks { +} diff --git a/Sources/HTMLStreamer/TextConverter.swift b/Sources/HTMLStreamer/TextConverter.swift new file mode 100644 index 0000000..870ef34 --- /dev/null +++ b/Sources/HTMLStreamer/TextConverter.swift @@ -0,0 +1,135 @@ +// +// TextConverter.swift +// HTMLStreamer +// +// Created by Shadowfacts on 12/19/23. +// + +import Foundation + +public struct TextConverter { + + private let configuration: TextConverterConfiguration + + private var tokenizer: Tokenizer! + private var str: String! + + private var actionStack: [ElementAction] = [] + private var previouslyFinishedBlockElement = false + private var currentRun = "" + + public init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks { + self.init(configuration: configuration, callbacks: DefaultCallbacks.self) + } + + public init(configuration: TextConverterConfiguration = .init(), callbacks _: Callbacks.Type = Callbacks.self) { + self.configuration = configuration + } + + public mutating func convert(html: String) -> String { + tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator()) + str = "" + + while let token = tokenizer.next() { + switch token { + case .character(let scalar): + currentRun.unicodeScalars.append(scalar) + case .characterSequence(let string): + currentRun.append(string) + case .startTag(let name, let selfClosing, let attributes): + let action = Callbacks.elementAction(name: name, attributes: attributes) + actionStack.append(action) + handleStartTag(name, selfClosing: selfClosing, attributes: attributes) + case .endTag(let name): + handleEndTag(name) + if actionStack.last != .default { + finishRun() + } + actionStack.removeLast() + case .comment, .doctype: + break + } + } + + finishRun() + + return str + } + + private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) { + switch name { + case "br": + if configuration.insertNewlines { + currentRun.append("\n") + } else { + currentRun.append(" ") + } + case "pre", "blockquote", "p", "ol", "ul": + startBlockElement() + finishRun() + default: + break + } + } + + private mutating func startBlockElement() { + if !str.isEmpty { + previouslyFinishedBlockElement = false + if configuration.insertNewlines { + currentRun.append("\n\n") + } else { + currentRun.append(" ") + } + } + } + + private mutating func handleEndTag(_ name: String) { + switch name { + case "pre", "blockquote", "p", "ol", "ul": + finishRun() + finishBlockElement() + default: + break + } + } + + private mutating func finishBlockElement() { + if !str.isEmpty { + previouslyFinishedBlockElement = true + } + } + + private mutating func finishRun() { + guard !currentRun.isEmpty else { + return + } + + if actionStack.contains(.skip) { + currentRun = "" + return + } else if case .replace(let replacement) = actionStack.first(where: \.isReplace) { + currentRun = replacement + } + + if previouslyFinishedBlockElement { + previouslyFinishedBlockElement = false + if configuration.insertNewlines { + currentRun.insert(contentsOf: "\n\n", at: currentRun.startIndex) + } else { + currentRun.insert(" ", at: currentRun.startIndex) + } + } + + str.append(currentRun) + currentRun = "" + } + +} + +public struct TextConverterConfiguration { + public var insertNewlines: Bool + + public init(insertWhitespace: Bool = true) { + self.insertNewlines = insertWhitespace + } +} diff --git a/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift b/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift index 307ec85..53e00e0 100644 --- a/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift +++ b/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift @@ -41,7 +41,7 @@ final class AttributedStringConverterTests: XCTestCase { convert(html, callbacks: DefaultCallbacks.self) } - private func convert(_ html: String, callbacks _: Callbacks.Type = Callbacks.self) -> NSAttributedString { + private func convert(_ html: String, callbacks _: Callbacks.Type = Callbacks.self) -> NSAttributedString { let config = AttributedStringConverterConfiguration( font: font, monospaceFont: monospaceFont, @@ -212,7 +212,7 @@ final class AttributedStringConverterTests: XCTestCase { } func testMakeURLCallback() { - struct Callbacks: AttributedStringCallbacks { + struct Callbacks: HTMLConversionCallbacks { static func makeURL(string: String) -> URL? { URL(string: "https://apple.com") } @@ -226,7 +226,7 @@ final class AttributedStringConverterTests: XCTestCase { } func testElementActionCallback() { - struct Callbacks: AttributedStringCallbacks { + struct Callbacks: HTMLConversionCallbacks { static func elementAction(name: String, attributes: [Attribute]) -> ElementAction { let clazz = attributes.attributeValue(for: "class") if clazz == "invisible" { @@ -240,8 +240,8 @@ final class AttributedStringConverterTests: XCTestCase { } let skipped = convert("", callbacks: Callbacks.self) XCTAssertEqual(skipped, NSAttributedString()) - let skipNestped = convert("", callbacks: Callbacks.self) - XCTAssertEqual(skipNestped, NSAttributedString()) + let skipNested = convert("", callbacks: Callbacks.self) + XCTAssertEqual(skipNested, NSAttributedString()) let skipNestped2 = convert("", callbacks: Callbacks.self) XCTAssertEqual(skipNestped2, NSAttributedString()) let replaced = convert("test", callbacks: Callbacks.self) diff --git a/Tests/HTMLStreamerTests/TextConverterTests.swift b/Tests/HTMLStreamerTests/TextConverterTests.swift new file mode 100644 index 0000000..aa970c9 --- /dev/null +++ b/Tests/HTMLStreamerTests/TextConverterTests.swift @@ -0,0 +1,66 @@ +// +// TextConverterTests.swift +// +// +// Created by Shadowfacts on 12/22/23. +// + +import XCTest +@testable import HTMLStreamer + +final class TextConverterTests: XCTestCase { + + private func convert(_ html: String, configuration: TextConverterConfiguration = .init()) -> String { + convert(html, configuration: configuration, callbacks: DefaultCallbacks.self) + } + + private func convert(_ html: String, configuration: TextConverterConfiguration = .init(), callbacks _: Callbacks.Type = Callbacks.self) -> String { + var converter = TextConverter(configuration: configuration) + return converter.convert(html: html) + } + + func testConvertBR() { + XCTAssertEqual(convert("a
b"), "a\nb") + XCTAssertEqual(convert("a
b"), "a\nb") + } + + func testConvertA() { + XCTAssertEqual(convert("link"), "link") + } + + func testIncorrectNesting() { + XCTAssertEqual(convert("bold both italic"), "bold both italic") + } + + func testTextAfterBlockElement() { + XCTAssertEqual(convert("
wee
after"), "wee\n\nafter") + XCTAssertEqual(convert("
wee
after", configuration: .init(insertWhitespace: false)), "wee after") + } + + func testMultipleBlockElements() { + XCTAssertEqual(convert("
a
b
"), "a\n\nb") + XCTAssertEqual(convert("
a
b
", configuration: .init(insertWhitespace: false)), "a b") + } + + func testElementActionCallback() { + struct Callbacks: HTMLConversionCallbacks { + static func elementAction(name: String, attributes: [Attribute]) -> ElementAction { + let clazz = attributes.attributeValue(for: "class") + if clazz == "invisible" { + return .skip + } else if clazz == "ellipsis" { + return .replace("…") + } else { + return .default + } + } + } + let skipped = convert("", callbacks: Callbacks.self) + XCTAssertEqual(skipped, "") + let skipNested = convert("", callbacks: Callbacks.self) + XCTAssertEqual(skipNested, "") + let replaced = convert("test", callbacks: Callbacks.self) + XCTAssertEqual(replaced, "…") + } + +}