HTML to plain text conversion
This commit is contained in:
parent
601c9f2cd8
commit
5bdfda71e6
|
@ -17,7 +17,7 @@ private typealias PlatformFont = UIFont
|
||||||
private typealias PlatformFont = NSFont
|
private typealias PlatformFont = NSFont
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
|
public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
|
||||||
private let configuration: AttributedStringConverterConfiguration
|
private let configuration: AttributedStringConverterConfiguration
|
||||||
private var fontCache: [FontTrait: PlatformFont] = [:]
|
private var fontCache: [FontTrait: PlatformFont] = [:]
|
||||||
|
|
||||||
|
@ -77,7 +77,7 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
|
||||||
return str
|
return str
|
||||||
}
|
}
|
||||||
|
|
||||||
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [HTMLStreamer.Attribute]) {
|
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
|
||||||
if name == "br" {
|
if name == "br" {
|
||||||
currentRun.append("\n")
|
currentRun.append("\n")
|
||||||
return
|
return
|
||||||
|
@ -313,37 +313,6 @@ public struct AttributedStringConverter<Callbacks: AttributedStringCallbacks> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public protocol AttributedStringCallbacks {
|
|
||||||
static func makeURL(string: String) -> URL?
|
|
||||||
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction
|
|
||||||
}
|
|
||||||
|
|
||||||
public enum ElementAction: Equatable {
|
|
||||||
case `default`
|
|
||||||
case skip
|
|
||||||
case replace(String)
|
|
||||||
|
|
||||||
var isReplace: Bool {
|
|
||||||
if case .replace(_) = self {
|
|
||||||
true
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public extension AttributedStringCallbacks {
|
|
||||||
static func makeURL(string: String) -> URL? {
|
|
||||||
URL(string: string)
|
|
||||||
}
|
|
||||||
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
|
|
||||||
.default
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public struct DefaultCallbacks: AttributedStringCallbacks {
|
|
||||||
}
|
|
||||||
|
|
||||||
public struct AttributedStringConverterConfiguration {
|
public struct AttributedStringConverterConfiguration {
|
||||||
#if os(iOS)
|
#if os(iOS)
|
||||||
public var font: UIFont
|
public var font: UIFont
|
||||||
|
|
|
@ -0,0 +1,39 @@
|
||||||
|
//
|
||||||
|
// HTMLConversionCallbacks.swift
|
||||||
|
// HTMLStreamer
|
||||||
|
//
|
||||||
|
// Created by Shadowfacts on 12/22/23.
|
||||||
|
//
|
||||||
|
|
||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Customization points that a converter's generic `Callbacks` parameter must provide.
///
/// Both requirements are static, so conformers are passed as types (for example
/// `DefaultCallbacks.self`) rather than as instances.
public protocol HTMLConversionCallbacks {
|
||||||
|
/// Builds a `URL` from a string; may return `nil`.
/// NOTE(review): presumably applied to link targets found in the HTML — confirm against the converters that call it.
static func makeURL(string: String) -> URL?
|
||||||
|
/// Decides how the element with the given tag name and attributes is treated.
/// `TextConverter` calls this once for every start tag it encounters.
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction
|
||||||
|
}
|
||||||
|
|
||||||
|
/// How a converter should treat an element, as decided by
/// `HTMLConversionCallbacks.elementAction(name:attributes:)`.
public enum ElementAction: Equatable {
    /// Apply the converter's normal handling to the element.
    case `default`
    /// Drop the text contained in the element.
    case skip
    /// Substitute the associated string for the text contained in the element.
    case replace(String)

    /// `true` only for `.replace`, regardless of the replacement string.
    var isReplace: Bool {
        switch self {
        case .replace:
            return true
        case .default, .skip:
            return false
        }
    }
}
|
||||||
|
|
||||||
|
// Default implementations, so that conformers only override what they need.
public extension HTMLConversionCallbacks {
    /// Parses `string` with Foundation's `URL(string:)`.
    static func makeURL(string: String) -> URL? {
        return URL(string: string)
    }

    /// Treats every element with the default action.
    static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
        return ElementAction.default
    }
}
|
||||||
|
|
||||||
|
/// Callbacks type with no customizations: it declares no members of its own, so it
/// inherits every default implementation from the `HTMLConversionCallbacks` extension.
public struct DefaultCallbacks: HTMLConversionCallbacks {
|
||||||
|
}
|
|
@ -0,0 +1,135 @@
|
||||||
|
//
|
||||||
|
// TextConverter.swift
|
||||||
|
// HTMLStreamer
|
||||||
|
//
|
||||||
|
// Created by Shadowfacts on 12/19/23.
|
||||||
|
//
|
||||||
|
|
||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Converts an HTML string into plain text.
///
/// Element handling can be customized through the `Callbacks` type: for every start tag,
/// `Callbacks.elementAction(name:attributes:)` decides whether the element's text is kept,
/// dropped (`.skip`) or substituted (`.replace`).
public struct TextConverter<Callbacks: HTMLConversionCallbacks> {

    private let configuration: TextConverterConfiguration

    /// Text that has already been finalized for the conversion in progress.
    private var str = ""

    /// One entry per currently open, non-void element; the innermost element is last.
    private var actionStack: [ElementAction] = []
    /// Set when a block element has been closed after some output was produced, so that
    /// the next non-empty run is separated from it.
    private var previouslyFinishedBlockElement = false
    /// Text accumulated since the last call to `finishRun()`.
    private var currentRun = ""

    /// Creates a converter that uses `DefaultCallbacks`.
    public init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks {
        self.init(configuration: configuration, callbacks: DefaultCallbacks.self)
    }

    /// Creates a converter that uses the given callbacks type.
    ///
    /// - Parameters:
    ///   - configuration: Controls how line breaks and block boundaries are rendered.
    ///   - callbacks: Only used to select the `Callbacks` generic parameter.
    public init(configuration: TextConverterConfiguration = .init(), callbacks _: Callbacks.Type = Callbacks.self) {
        self.configuration = configuration
    }

    /// Converts `html` to plain text.
    ///
    /// - Parameter html: The HTML source. Malformed input (unbalanced or stray tags) is tolerated.
    /// - Returns: The text content, with separators inserted according to the configuration.
    public mutating func convert(html: String) -> String {
        // Start from a clean slate so that unbalanced markup in an earlier conversion
        // (e.g. an unclosed skipped element) cannot affect this one.
        str = ""
        currentRun = ""
        actionStack.removeAll(keepingCapacity: true)
        previouslyFinishedBlockElement = false

        var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator> = Tokenizer(chars: html.unicodeScalars.makeIterator())

        while let token = tokenizer.next() {
            switch token {
            case .character(let scalar):
                currentRun.unicodeScalars.append(scalar)
            case .characterSequence(let string):
                currentRun.append(string)
            case .startTag(let name, let selfClosing, let attributes):
                let action = Callbacks.elementAction(name: name, attributes: attributes)
                if Self.isVoidElement(name) {
                    // Void elements have no content and no end tag, so they must not be
                    // pushed onto the stack: nothing would ever pop them, and every later
                    // end tag would then pop the wrong entry. Apply the action right away.
                    // NOTE(review): assumes the tokenizer emits no end tag for these — confirm.
                    switch action {
                    case .default:
                        handleStartTag(name, selfClosing: selfClosing, attributes: attributes)
                    case .skip:
                        break
                    case .replace(let replacement):
                        currentRun.append(replacement)
                    }
                } else {
                    actionStack.append(action)
                    handleStartTag(name, selfClosing: selfClosing, attributes: attributes)
                }
            case .endTag(let name):
                // A stray end tag for a void element (e.g. "</br>") has nothing to close.
                guard !Self.isVoidElement(name) else {
                    break
                }
                handleEndTag(name)
                if actionStack.last != .default {
                    finishRun()
                }
                // The input may contain an end tag without a matching start tag;
                // `removeLast()` would trap on the empty stack in that case.
                _ = actionStack.popLast()
            case .comment, .doctype:
                break
            }
        }

        finishRun()

        return str
    }

    /// Whether `name` is an HTML void element, i.e. one that never has an end tag.
    private static func isVoidElement(_ name: String) -> Bool {
        switch name {
        case "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "source", "track", "wbr":
            return true
        default:
            return false
        }
    }

    /// Emits the separator for a line break, or begins a block element.
    private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
        switch name {
        case "br":
            if configuration.insertNewlines {
                currentRun.append("\n")
            } else {
                currentRun.append(" ")
            }
        case "pre", "blockquote", "p", "ol", "ul":
            startBlockElement()
            finishRun()
        default:
            break
        }
    }

    /// Separates a new block element from the output that precedes it.
    private mutating func startBlockElement() {
        // NOTE(review): only finalized output is checked here; text still pending in
        // `currentRun` (as in "a<p>b</p>") does not trigger a separator. Confirm whether
        // that is intended before changing it.
        if !str.isEmpty {
            previouslyFinishedBlockElement = false
            if configuration.insertNewlines {
                currentRun.append("\n\n")
            } else {
                currentRun.append(" ")
            }
        }
    }

    /// Flushes pending text and records the end of a block element.
    private mutating func handleEndTag(_ name: String) {
        switch name {
        case "pre", "blockquote", "p", "ol", "ul":
            finishRun()
            finishBlockElement()
        default:
            break
        }
    }

    /// Remembers that a block element ended, provided some output exists to separate from.
    private mutating func finishBlockElement() {
        if !str.isEmpty {
            previouslyFinishedBlockElement = true
        }
    }

    /// Moves `currentRun` into the output, honoring the actions of the open elements.
    private mutating func finishRun() {
        guard !currentRun.isEmpty else {
            return
        }

        if actionStack.contains(.skip) {
            // Any enclosing skipped element discards the run entirely.
            currentRun = ""
            return
        } else if case .replace(let replacement) = actionStack.first(where: \.isReplace) {
            // The outermost replacing element wins.
            currentRun = replacement
        }

        if previouslyFinishedBlockElement {
            previouslyFinishedBlockElement = false
            if configuration.insertNewlines {
                currentRun.insert(contentsOf: "\n\n", at: currentRun.startIndex)
            } else {
                currentRun.insert(" ", at: currentRun.startIndex)
            }
        }

        str.append(currentRun)
        currentRun = ""
    }

}
|
||||||
|
|
||||||
|
/// Options for `TextConverter`.
public struct TextConverterConfiguration {
    /// When `true`, line breaks become `"\n"` and block boundaries become `"\n\n"`.
    /// When `false`, a single space is used in both places.
    public var insertNewlines: Bool

    /// Creates a configuration.
    ///
    /// - Parameter insertWhitespace: Initial value of `insertNewlines`. The label differs
    ///   from the property name; it is kept as-is for source compatibility.
    public init(insertWhitespace: Bool = true) {
        self.insertNewlines = insertWhitespace
    }

    /// Creates a configuration using a label that matches the `insertNewlines` property.
    ///
    /// - Parameter insertNewlines: Initial value of `insertNewlines`.
    public init(insertNewlines: Bool) {
        self.insertNewlines = insertNewlines
    }
}
|
|
@ -41,7 +41,7 @@ final class AttributedStringConverterTests: XCTestCase {
|
||||||
convert(html, callbacks: DefaultCallbacks.self)
|
convert(html, callbacks: DefaultCallbacks.self)
|
||||||
}
|
}
|
||||||
|
|
||||||
private func convert<Callbacks: AttributedStringCallbacks>(_ html: String, callbacks _: Callbacks.Type = Callbacks.self) -> NSAttributedString {
|
private func convert<Callbacks: HTMLConversionCallbacks>(_ html: String, callbacks _: Callbacks.Type = Callbacks.self) -> NSAttributedString {
|
||||||
let config = AttributedStringConverterConfiguration(
|
let config = AttributedStringConverterConfiguration(
|
||||||
font: font,
|
font: font,
|
||||||
monospaceFont: monospaceFont,
|
monospaceFont: monospaceFont,
|
||||||
|
@ -212,7 +212,7 @@ final class AttributedStringConverterTests: XCTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
func testMakeURLCallback() {
|
func testMakeURLCallback() {
|
||||||
struct Callbacks: AttributedStringCallbacks {
|
struct Callbacks: HTMLConversionCallbacks {
|
||||||
static func makeURL(string: String) -> URL? {
|
static func makeURL(string: String) -> URL? {
|
||||||
URL(string: "https://apple.com")
|
URL(string: "https://apple.com")
|
||||||
}
|
}
|
||||||
|
@ -226,7 +226,7 @@ final class AttributedStringConverterTests: XCTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
func testElementActionCallback() {
|
func testElementActionCallback() {
|
||||||
struct Callbacks: AttributedStringCallbacks {
|
struct Callbacks: HTMLConversionCallbacks {
|
||||||
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
|
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
|
||||||
let clazz = attributes.attributeValue(for: "class")
|
let clazz = attributes.attributeValue(for: "class")
|
||||||
if clazz == "invisible" {
|
if clazz == "invisible" {
|
||||||
|
@ -240,8 +240,8 @@ final class AttributedStringConverterTests: XCTestCase {
|
||||||
}
|
}
|
||||||
let skipped = convert("<span class='invisible'>test</span>", callbacks: Callbacks.self)
|
let skipped = convert("<span class='invisible'>test</span>", callbacks: Callbacks.self)
|
||||||
XCTAssertEqual(skipped, NSAttributedString())
|
XCTAssertEqual(skipped, NSAttributedString())
|
||||||
let skipNestped = convert("<span class='invisible'><b>test</b></span>", callbacks: Callbacks.self)
|
let skipNested = convert("<span class='invisible'><b>test</b></span>", callbacks: Callbacks.self)
|
||||||
XCTAssertEqual(skipNestped, NSAttributedString())
|
XCTAssertEqual(skipNested, NSAttributedString())
|
||||||
let skipNestped2 = convert("<b><span class='invisible'>test</span></b>", callbacks: Callbacks.self)
|
let skipNestped2 = convert("<b><span class='invisible'>test</span></b>", callbacks: Callbacks.self)
|
||||||
XCTAssertEqual(skipNestped2, NSAttributedString())
|
XCTAssertEqual(skipNestped2, NSAttributedString())
|
||||||
let replaced = convert("<span class='ellipsis'>test</span>", callbacks: Callbacks.self)
|
let replaced = convert("<span class='ellipsis'>test</span>", callbacks: Callbacks.self)
|
||||||
|
|
|
@ -0,0 +1,66 @@
|
||||||
|
//
|
||||||
|
// TextConverterTests.swift
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Created by Shadowfacts on 12/22/23.
|
||||||
|
//
|
||||||
|
|
||||||
|
import XCTest
|
||||||
|
@testable import HTMLStreamer
|
||||||
|
|
||||||
|
/// Exercises `TextConverter` through small HTML snippets.
final class TextConverterTests: XCTestCase {

    /// Converts `html` using the stock `DefaultCallbacks`.
    private func convert(_ html: String, configuration: TextConverterConfiguration = .init()) -> String {
        return convert(html, configuration: configuration, callbacks: DefaultCallbacks.self)
    }

    /// Converts `html` with a fresh converter specialized for `Callbacks`.
    private func convert<Callbacks: HTMLConversionCallbacks>(_ html: String, configuration: TextConverterConfiguration = .init(), callbacks _: Callbacks.Type = Callbacks.self) -> String {
        var converter = TextConverter<Callbacks>(configuration: configuration)
        let text = converter.convert(html: html)
        return text
    }

    func testConvertBR() {
        // Both spellings of the tag must produce a single newline.
        for html in ["a<br>b", "a<br />b"] {
            XCTAssertEqual(convert(html), "a\nb")
        }
    }

    func testConvertA() {
        let html = "<a href='https://example.com'>link</a>"
        XCTAssertEqual(convert(html), "link")
    }

    func testIncorrectNesting() {
        let html = "<strong>bold <em>both</strong> italic</em>"
        XCTAssertEqual(convert(html), "bold both italic")
    }

    func testTextAfterBlockElement() {
        let html = "<blockquote>wee</blockquote>after"
        let withoutNewlines = TextConverterConfiguration(insertWhitespace: false)
        XCTAssertEqual(convert(html), "wee\n\nafter")
        XCTAssertEqual(convert(html, configuration: withoutNewlines), "wee after")
    }

    func testMultipleBlockElements() {
        let html = "<blockquote>a</blockquote><blockquote>b</blockquote>"
        let withoutNewlines = TextConverterConfiguration(insertWhitespace: false)
        XCTAssertEqual(convert(html), "a\n\nb")
        XCTAssertEqual(convert(html, configuration: withoutNewlines), "a b")
    }

    func testElementActionCallback() {
        // Hides "invisible" elements and collapses "ellipsis" elements.
        struct Callbacks: HTMLConversionCallbacks {
            static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
                let cssClass = attributes.attributeValue(for: "class")
                if cssClass == "invisible" {
                    return .skip
                }
                if cssClass == "ellipsis" {
                    return .replace("…")
                }
                return .default
            }
        }

        XCTAssertEqual(convert("<span class='invisible'>test</span>", callbacks: Callbacks.self), "")
        XCTAssertEqual(convert("<span class='invisible'><b>test</b></span>", callbacks: Callbacks.self), "")
        XCTAssertEqual(convert("<span class='ellipsis'>test</span>", callbacks: Callbacks.self), "…")
    }

}
|
Loading…
Reference in New Issue