Compare commits

...

12 Commits
0.1.2 ... main

9 changed files with 1187 additions and 184 deletions

110
BlockState.dot Normal file
View File

@ -0,0 +1,110 @@
digraph blockstate {
/* rankdir=LR; */
node [shape = circle, fontsize = 18];
edge [fontsize = 18];
init [label = "", shape=none, height = .0, width = .0];
start;
emptyBlock [label = "empty block"];
nonEmptyBlock [label = "non-empty block"];
lineBreakTag [label = "line break tag"];
atLeastTwoLineBreakTags [label = ">=2 line break tags"];
emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "empty block w/ >=2 prev line break tags"];
beginListItem [label = "begin list item"];
endListItem [label = "end list item"];
listItemContent [label = "list item content"];
lineBreakTagInListItemContent [label = "line break tag in list item content"];
atLeastTwoLineBreakTagsInListItemContent [label = ">= 2 line break tags in list item content"];
preformattedStart [label = "preformatted start"];
preformattedEmptyBlock [label = "preformatted empty block"];
preformattedNonEmptyBlock [label = "preformatted non-empty block"];
preformattedLineBreak [label = "preformatted line break"];
preformattedAtLeastTwoLineBreaks [label = "preformatted >=2 line breaks"];
afterPreStartTag [label = "after <pre> start tag"];
afterPreStartTagWithLeadingWhitespace [label = "after <pre> start tag w/ leading whitespace"];
preformattedNonEmptyBlockWithTrailingWhitespace [label = "preformatted non-empty block w/ trailing whitespace"];
preformattedEmptyBlockWithLeadingWhitespace [label = "preformatted empty block w/ leading whitespace"];
init -> start;
start -> start [label = "whitespace (skip)\n<br> (skip)\n</pre>\nstart/end block"];
start -> nonEmptyBlock [label = "non-whitespace"];
start -> preformattedStart [label = "<pre> (depth = 1)"];
start -> beginListItem [label = "<li>"];
nonEmptyBlock -> nonEmptyBlock [label = "non-newline"];
nonEmptyBlock -> emptyBlock [label = "start/end block"];
nonEmptyBlock -> lineBreakTag [label = "<br> or \\n (append to tmp)"];
nonEmptyBlock -> beginListItem [label = "<li>"];
nonEmptyBlock -> endListItem [label = "</li>"];
emptyBlock -> nonEmptyBlock [label = "non-whitespace (block break)"];
emptyBlock -> emptyBlock [label = "whitespace (skip)\n</pre>\nstart/end block"];
emptyBlock -> lineBreakTag [label = "<br> (append to tmp, block break)"];
emptyBlock -> afterPreStartTag [label = "<pre> (depth = 1)"];
emptyBlock -> beginListItem [label = "<li>"];
emptyBlock -> endListItem [label = "</li>"];
lineBreakTag -> lineBreakTag [label = "whitespace (append to tmp)"];
lineBreakTag -> atLeastTwoLineBreakTags [label = "<br> or \\n (append to tmp)"];
lineBreakTag -> emptyBlock [label = "start/end block (clear tmp)"];
lineBreakTag -> nonEmptyBlock [label = "non-whitespace (emit tmp)"];
atLeastTwoLineBreakTags -> atLeastTwoLineBreakTags [label = "whitespace or <br> (append to tmp)"];
atLeastTwoLineBreakTags -> nonEmptyBlock [label = "non-whitespace (emit tmp)"];
atLeastTwoLineBreakTags -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "start/end block"];
emptyBlockWithAtLeastTwoPreviousLineBreakTags -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "whitespace (skip)\n<br>\n</pre>\nstart/end block"];
emptyBlockWithAtLeastTwoPreviousLineBreakTags -> nonEmptyBlock [label = "non-whitespace (emit tmp)"];
emptyBlockWithAtLeastTwoPreviousLineBreakTags -> afterPreStartTagWithLeadingWhitespace [label = "<pre> (depth = 1)"];
beginListItem -> beginListItem [label = "<li>\nwhitespace (skip)\n<br>\nstart/end block"];
beginListItem -> listItemContent [label = "non-whitespace"];
beginListItem -> endListItem [label = "</li>"];
beginListItem -> afterPreStartTagWithLeadingWhitespace [label = "<pre>"];
endListItem -> endListItem [label = "whitespace (skip)\n</li>"];
endListItem -> beginListItem [label = "<li> (line break)"];
endListItem -> emptyBlock [label = "start/end block"];
endListItem -> listItemContent [label = "non-whitespace (line break, indent)"];
endListItem -> lineBreakTagInListItemContent [label = "<br> (append to tmp)"];
listItemContent -> listItemContent [label = "non-whitespace"];
listItemContent -> beginListItem [label = "<li> (line break)"];
listItemContent -> lineBreakTagInListItemContent [label = "<br> (append to tmp)"];
listItemContent -> emptyBlock [label = "start/end block"];
listItemContent -> endListItem [label = "</li>"];
lineBreakTagInListItemContent -> lineBreakTagInListItemContent [label = "whitespace (append to tmp)"];
lineBreakTagInListItemContent -> emptyBlock [label = "start/end block (clear tmp)"];
lineBreakTagInListItemContent -> beginListItem [label = "<li> (emit tmp, line break)"];
lineBreakTagInListItemContent -> listItemContent [label = "non-whitespace (emit tmp)"];
lineBreakTagInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "<br> or \\n (append to tmp)"];
lineBreakTagInListItemContent -> endListItem [label = "</li> (clear tmp)"];
atLeastTwoLineBreakTagsInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "whitespace or <br> (append to tmp)"];
atLeastTwoLineBreakTagsInListItemContent -> beginListItem [label = "<li> (emit tmp, line break)"];
atLeastTwoLineBreakTagsInListItemContent -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "start/end block"];
atLeastTwoLineBreakTagsInListItemContent -> listItemContent [label = "non-whitespace (emit tmp)"];
atLeastTwoLineBreakTagsInListItemContent -> endListItem [label = "</li> (clear tmp)"];
afterPreStartTag -> preformattedLineBreak [label = "<br> (append to tmp, append block break to tmp)"];
afterPreStartTag -> preformattedNonEmptyBlock [label = "non \\n (block break)"];
afterPreStartTag -> preformattedEmptyBlock [label = "\\n (skip)\nstart/end block"];
preformattedLineBreak -> preformattedNonEmptyBlock [label = "non-whitespace (emit tmp)"];
preformattedLineBreak -> preformattedNonEmptyBlockWithTrailingWhitespace [label = "other whitespace (append to tmp)"];
preformattedLineBreak -> preformattedAtLeastTwoLineBreaks [label = "\\n or <br> (append to tmp)"];
preformattedAtLeastTwoLineBreaks -> preformattedAtLeastTwoLineBreaks [label = "\\n or <br> (append to tmp)"];
preformattedAtLeastTwoLineBreaks -> preformattedNonEmptyBlock [label = "non \\n or <br> (emit tmp)"];
preformattedAtLeastTwoLineBreaks -> preformattedEmptyBlockWithLeadingWhitespace [label = "start/end block"];
preformattedEmptyBlockWithLeadingWhitespace -> preformattedEmptyBlockWithLeadingWhitespace [label = "whitespace (append to tmp)\nstart/end block\n</pre> if depth>1&&tmp.count>=2 (depth - 1, remove 1 from tmp)"];
preformattedEmptyBlockWithLeadingWhitespace -> preformattedLineBreak [label = "\\n or <br> (append to tmp)"];
preformattedEmptyBlockWithLeadingWhitespace -> afterPreStartTagWithLeadingWhitespace [label = "<pre> (depth + 1)"];
preformattedEmptyBlockWithLeadingWhitespace -> preformattedEmptyBlock [label = "</pre> if depth>1&&tmp.count<2 (depth - 1, remove 1 from tmp)"];
preformattedEmptyBlockWithLeadingWhitespace -> emptyBlock [label = "</pre> if depth<=1 (clear tmp)"];
preformattedEmptyBlock -> preformattedEmptyBlock [label = "start/end block\n</pre> if depth>1 (depth - 1)"];
preformattedEmptyBlock -> afterPreStartTag [label = "<pre> (depth + 1)"];
preformattedEmptyBlock -> preformattedNonEmptyBlock [label = "non-whitespace (block break)"];
preformattedEmptyBlock -> preformattedEmptyBlockWithLeadingWhitespace [label = "whitespace (append to tmp)"];
preformattedEmptyBlock -> preformattedLineBreak [label = "<br> (append to tmp)"];
preformattedNonEmptyBlock -> preformattedNonEmptyBlock [label = "non-whitespace"];
preformattedNonEmptyBlock -> preformattedLineBreak [label = "\\n or <br> (append to tmp)"];
preformattedNonEmptyBlock -> preformattedNonEmptyBlockWithTrailingWhitespace [label = "other whitespace (append to tmp)"];
preformattedNonEmptyBlock -> preformattedEmptyBlock [label = "start/end block"];
preformattedNonEmptyBlockWithTrailingWhitespace -> preformattedNonEmptyBlockWithTrailingWhitespace [label = "whitespace (append to tmp)"];
preformattedNonEmptyBlockWithTrailingWhitespace -> preformattedNonEmptyBlock [label = "non-whitespace (emit tmp)"];
preformattedNonEmptyBlockWithTrailingWhitespace -> preformattedLineBreak [label = "\\n or <br> (append to tmp)"];
preformattedNonEmptyBlockWithTrailingWhitespace -> preformattedEmptyBlockWithLeadingWhitespace [label = "start/end block (append block break to tmp)"];
afterPreStartTagWithLeadingWhitespace -> preformattedNonEmptyBlock [label = "non-whitespace (emit tmp)"];
afterPreStartTagWithLeadingWhitespace -> preformattedEmptyBlockWithLeadingWhitespace [label = "\\n (skip)\nother whitespace (append to tmp)\n<br> (append to tmp)\nstart/end block"];
preformattedStart -> preformattedStart [label = "<pre> (depth + 1)\n</pre> if depth>1 (depth - 1)\n\\n or <br> (skip)\nstart/end block"];
preformattedStart -> start [label = "</pre> if depth<=1"];
preformattedStart -> preformattedNonEmptyBlock [label = "non \\n"];
}

View File

@ -17,22 +17,34 @@ private typealias PlatformFont = UIFont
private typealias PlatformFont = NSFont
#endif
public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
public class AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
private let configuration: AttributedStringConverterConfiguration
private var fontCache: [FontTrait: PlatformFont] = [:]
private var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator>!
private var str: NSMutableAttributedString!
private var actionStack: [ElementAction] = []
private var actionStack: [ElementAction] = [] {
didSet {
hasSkipOrReplaceElementAction = actionStack.contains(where: {
switch $0 {
case .skip, .replace(_):
true
default:
false
}
})
}
}
private var hasSkipOrReplaceElementAction = false
private var styleStack: [Style] = []
private var blockState = BlockState.unstarted
private var blockStateMachine = BlockStateMachine(blockBreak: "", lineBreak: "", listIndentForContentOutsideItem: "", append: { _ in }, removeChar: {})
private var currentElementIsEmpty = true
private var previouslyFinishedListItem = false
// The current run of text w/o styles changing
private var currentRun: String = ""
public init(configuration: AttributedStringConverterConfiguration) where Callbacks == DefaultCallbacks {
public convenience init(configuration: AttributedStringConverterConfiguration) where Callbacks == DefaultCallbacks {
self.init(configuration: configuration, callbacks: DefaultCallbacks.self)
}
@ -40,13 +52,17 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
self.configuration = configuration
}
public mutating func convert(html: String) -> NSAttributedString {
public func convert(html: String) -> NSAttributedString {
tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
str = NSMutableAttributedString()
actionStack = []
styleStack = []
blockState = .unstarted
blockStateMachine = BlockStateMachine(blockBreak: "\n\n", lineBreak: "\n", listIndentForContentOutsideItem: "\t\t", append: { [unowned self] in
self.append($0)
}, removeChar: { [unowned self] in
self.removeChar()
})
currentElementIsEmpty = true
previouslyFinishedListItem = false
currentRun = ""
@ -55,12 +71,18 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
switch token {
case .character(let c):
currentElementIsEmpty = false
continueBlock()
currentRun.unicodeScalars.append(c)
if blockStateMachine.continueBlock(char: c),
!hasSkipOrReplaceElementAction {
currentRun.unicodeScalars.append(c)
}
case .characterSequence(let s):
currentElementIsEmpty = false
continueBlock()
currentRun.append(s)
for c in s.unicodeScalars {
if blockStateMachine.continueBlock(char: c),
!hasSkipOrReplaceElementAction {
currentRun.unicodeScalars.append(c)
}
}
case .comment:
// ignored
continue
@ -75,24 +97,27 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
case .endTag(let name):
handleEndTag(name)
// if we have a non-default action for the current element, the run finishes here
if actionStack.last != .default {
finishRun()
if let action = actionStack.last {
if action != .default {
finishRun()
}
actionStack.removeLast()
}
actionStack.removeLast()
case .doctype:
// ignored
continue
}
}
blockStateMachine.endBlocks()
finishRun()
return str
}
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
private func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
if name == "br" {
currentRun.append("\n")
blockStateMachine.breakTag()
return
}
// self closing tags are ignored since they have no content
@ -124,27 +149,25 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
finishRun()
styleStack.append(.monospace)
case "pre":
startBlockIfNecessary()
blockStateMachine.startOrEndBlock()
blockStateMachine.startPreformatted()
finishRun()
styleStack.append(.monospace)
case "blockquote":
startBlockIfNecessary()
blockStateMachine.startOrEndBlock()
finishRun()
styleStack.append(.blockquote)
case "p":
startBlockIfNecessary()
blockStateMachine.startOrEndBlock()
case "ol":
startBlockIfNecessary()
blockStateMachine.startOrEndBlock()
finishRun()
styleStack.append(.orderedList(nextElementOrdinal: 1))
case "ul":
startBlockIfNecessary()
blockStateMachine.startOrEndBlock()
finishRun()
styleStack.append(.unorderedList)
case "li":
if previouslyFinishedListItem {
currentRun.append("\n")
}
let marker: String
if case .orderedList(let nextElementOrdinal) = styleStack.last {
marker = orderedTextList.marker(forItemNumber: nextElementOrdinal)
@ -154,13 +177,14 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
} else {
break
}
blockStateMachine.startListItem()
currentRun.append("\t\(marker)\t")
default:
break
}
}
private mutating func handleEndTag(_ name: String) {
private func handleEndTag(_ name: String) {
switch name {
case "a":
if case .link(.some(_)) = lastStyle(.link) {
@ -182,72 +206,60 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
case "pre":
finishRun()
removeLastStyle(.monospace)
finishBlockElement()
blockStateMachine.startOrEndBlock()
blockStateMachine.endPreformatted()
case "blockquote":
finishRun()
removeLastStyle(.blockquote)
finishBlockElement()
blockStateMachine.startOrEndBlock()
case "p":
finishBlockElement()
blockStateMachine.startOrEndBlock()
case "ol":
finishRun()
removeLastStyle(.orderedList)
finishBlockElement()
blockStateMachine.startOrEndBlock()
previouslyFinishedListItem = false
case "ul":
finishRun()
removeLastStyle(.unorderedList)
finishBlockElement()
blockStateMachine.startOrEndBlock()
previouslyFinishedListItem = false
case "li":
finishRun()
previouslyFinishedListItem = true
blockStateMachine.endListItem()
default:
break
}
}
private mutating func startBlockIfNecessary() {
switch blockState {
case .unstarted:
blockState = .started(false)
case .started:
break
case .ongoing:
currentRun.append("\n\n")
blockState = .started(true)
case .finished(let nonEmpty):
if nonEmpty {
currentRun.append("\n\n")
}
blockState = .started(nonEmpty)
}
var blockBreak: String {
"\n\n"
}
private mutating func continueBlock() {
switch blockState {
case .unstarted, .started(_):
blockState = .ongoing
case .ongoing:
break
case .finished(let nonEmpty):
if nonEmpty {
currentRun.append("\n\n")
}
blockState = .ongoing
}
var lineBreak: String {
"\n"
}
private mutating func finishBlockElement() {
if blockState == .started(true) && currentElementIsEmpty {
currentRun.removeLast(2)
var listIndentForContentOutsideItem: String {
"\t\t"
}
func append(_ s: String) {
currentRun.append(s)
}
func removeChar() {
if currentRun.isEmpty {
str.deleteCharacters(in: NSRange(location: str.length - 1, length: 1))
} else {
currentRun.removeLast()
}
blockState = .finished(blockState == .ongoing)
}
// Finds the last currently-open style of the given type.
// We can't just use the last one because we need to handle mis-nested tags.
private mutating func removeLastStyle(_ type: Style.StyleType) {
private func removeLastStyle(_ type: Style.StyleType) {
var i = styleStack.index(before: styleStack.endIndex)
while i >= styleStack.startIndex {
if styleStack[i].type == type {
@ -282,14 +294,11 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
return style
}()
private mutating func finishRun() {
if actionStack.contains(.skip) {
currentRun = ""
return
} else if case .append(let s) = actionStack.last {
private func finishRun() {
if case .append(let s) = actionStack.last {
currentRun.append(s)
} else if case .replace(let replacement) = actionStack.first(where: \.isReplace) {
currentRun = replacement
} else if case .replace(let replacement) = actionStack.last {
currentRun.append(replacement)
}
guard !currentRun.isEmpty else {
@ -330,7 +339,7 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks> {
currentRun = ""
}
private mutating func getFont(traits: FontTrait) -> PlatformFont? {
private func getFont(traits: FontTrait) -> PlatformFont? {
if let cached = fontCache[traits] {
return cached
}
@ -450,13 +459,6 @@ private enum Style {
}
}
enum BlockState: Equatable {
case unstarted
case started(Bool)
case ongoing
case finished(Bool)
}
extension Collection where Element == Attribute {
public func attributeValue(for name: String) -> String? {
first(where: { $0.name == name })?.value

View File

@ -0,0 +1,551 @@
//
// BlockState.swift
// HTMLStreamer
//
// Created by Shadowfacts on 2/14/24.
//
import Foundation
/*
This gnarly mess of a state machine is responsible for:
1) Inserting line breaks in the right places corresponding to boundaries between block elements
2) Preventing leading/trailing whitespace from being emitted
3) Handling whitespace inside <pre> elements
DO NOT TOUCH THE CODE WITHOUT CHECKING/UPDATING THE DIAGRAM.
*/
/// State machine that decides where block/line breaks are emitted and which
/// whitespace is dropped or deferred while HTML text content is streamed in.
///
/// NOTE: the declaration order of the stored properties determines the
/// parameter order of the synthesized memberwise initializer; do not reorder.
struct BlockStateMachine {
/// The current state; a fresh machine begins in `.start`.
var blockState: BlockState = .start
/// Text emitted (or buffered) to separate one block from the next.
let blockBreak: String
/// Text buffered or emitted for a `<br>` tag, and for newlines inside preformatted content.
let lineBreak: String
/// Emitted after `lineBreak` when text content follows a closed list item.
let listIndentForContentOutsideItem: String
/// Pending whitespace / line breaks. They are passed to `append` only once
/// non-whitespace content follows; otherwise they are eventually discarded.
var temporaryBuffer: String = ""
/// Sink that receives text the machine wants written to the output.
let append: (String) -> Void
/// Sink for removing one previously written character.
/// NOTE(review): not invoked by any method visible in this file section.
let removeChar: () -> Void
}
extension BlockStateMachine {
/// Records that a block-level element boundary (a start or end tag) was encountered.
///
/// Moves the machine into the matching "empty block" state so that the next
/// non-whitespace character knows whether a block break must be emitted first.
/// Buffered single line breaks are discarded at a block boundary; two or more
/// are kept in `temporaryBuffer`.
mutating func startOrEndBlock() {
    switch blockState {
    case .start,
         .emptyBlock,
         .emptyBlockWithAtLeastTwoPreviousLineBreakTags,
         .beginListItem,
         .preformattedStart,
         .preformattedEmptyBlock,
         .preformattedEmptyBlockWithLeadingWhitespace:
        // Already sitting on a block boundary: nothing changes.
        break
    case .nonEmptyBlock, .endListItem, .listItemContent:
        blockState = .emptyBlock
    case .lineBreakTag, .lineBreakTagInListItemContent:
        // A lone trailing line break before a block boundary is dropped.
        blockState = .emptyBlock
        temporaryBuffer = ""
    case .atLeastTwoLineBreakTags, .atLeastTwoLineBreakTagsInListItemContent:
        blockState = .emptyBlockWithAtLeastTwoPreviousLineBreakTags
    case .preformattedNonEmptyBlock(let depth), .afterPreStartTag(let depth):
        blockState = .preformattedEmptyBlock(depth: depth)
    case .preformattedLineBreak(let depth):
        blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
        temporaryBuffer.append(lineBreak)
    case .preformattedAtLeastTwoLineBreaks(let depth),
         .afterPreStartTagWithLeadingWhitespace(let depth):
        blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
    case .preformattedNonEmptyBlockWithTrailingWhitespace(let depth):
        // The state diagram requires a transition here. Previously only the
        // buffer was updated and the state was left unchanged, so a following
        // startPreformatted()/endPreformatted() call hit fatalError("unreachable")
        // (e.g. for "<pre>a </pre>").
        blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
        temporaryBuffer.append(blockBreak)
    }
}
/// Feeds one scalar of text content into the state machine.
///
/// - Parameter char: the next Unicode scalar of character data.
/// - Returns: `true` when the caller should write `char` to its output;
///   `false` when the machine dropped it or stored it in `temporaryBuffer`.
mutating func continueBlock(char: UnicodeScalar) -> Bool {
    let charIsNewline = char == "\n"
    let charIsWhitespace = charIsNewline || isWhitespace(char)

    switch blockState {
    case .start:
        // Leading whitespace of the document is dropped.
        guard !charIsWhitespace else { return false }
        blockState = .nonEmptyBlock
        return true

    case .emptyBlock:
        guard !charIsWhitespace else { return false }
        blockState = .nonEmptyBlock
        append(blockBreak)
        return true

    case .nonEmptyBlock:
        guard charIsNewline else { return true }
        blockState = .lineBreakTag
        temporaryBuffer.append("\n")
        return false

    case .lineBreakTag:
        guard charIsWhitespace else { return emitBuffer(entering: .nonEmptyBlock) }
        if charIsNewline {
            blockState = .atLeastTwoLineBreakTags
        }
        temporaryBuffer.unicodeScalars.append(char)
        return false

    case .atLeastTwoLineBreakTags:
        guard charIsWhitespace else { return emitBuffer(entering: .nonEmptyBlock) }
        temporaryBuffer.unicodeScalars.append(char)
        return false

    case .emptyBlockWithAtLeastTwoPreviousLineBreakTags:
        guard charIsWhitespace else { return emitBuffer(entering: .nonEmptyBlock) }
        return false

    case .beginListItem:
        guard !charIsWhitespace else { return false }
        blockState = .listItemContent
        return true

    case .endListItem:
        guard !charIsWhitespace else { return false }
        // Content after a closed item goes on its own, indented line.
        blockState = .listItemContent
        append(lineBreak)
        append(listIndentForContentOutsideItem)
        return true

    case .listItemContent:
        guard charIsNewline else { return true }
        blockState = .lineBreakTagInListItemContent
        temporaryBuffer.append("\n")
        return false

    case .lineBreakTagInListItemContent:
        guard charIsWhitespace else { return emitBuffer(entering: .listItemContent) }
        if charIsNewline {
            blockState = .atLeastTwoLineBreakTagsInListItemContent
        }
        temporaryBuffer.unicodeScalars.append(char)
        return false

    case .atLeastTwoLineBreakTagsInListItemContent:
        guard charIsWhitespace else { return emitBuffer(entering: .listItemContent) }
        temporaryBuffer.unicodeScalars.append(char)
        return false

    case .preformattedStart(let depth):
        guard !charIsNewline else { return false }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        return true

    case .preformattedEmptyBlock(let depth):
        if charIsWhitespace {
            blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        append(blockBreak)
        return true

    case .preformattedNonEmptyBlock(let depth):
        if charIsNewline {
            blockState = .preformattedLineBreak(depth: depth)
            temporaryBuffer.append(lineBreak)
            return false
        }
        if charIsWhitespace {
            blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth)
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        return true

    case .preformattedLineBreak(let depth):
        if charIsNewline {
            blockState = .preformattedAtLeastTwoLineBreaks(depth: depth)
            temporaryBuffer.append(lineBreak)
            return false
        }
        if charIsWhitespace {
            blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth)
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        return emitBuffer(entering: .preformattedNonEmptyBlock(depth: depth))

    case .preformattedAtLeastTwoLineBreaks(let depth):
        guard charIsWhitespace else {
            return emitBuffer(entering: .preformattedNonEmptyBlock(depth: depth))
        }
        temporaryBuffer.unicodeScalars.append(char)
        return false

    case .afterPreStartTag(let depth):
        // A newline directly after the <pre> start tag is skipped.
        if charIsNewline {
            blockState = .preformattedEmptyBlock(depth: depth)
            return false
        }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        append(blockBreak)
        return true

    case .afterPreStartTagWithLeadingWhitespace(let depth):
        if charIsNewline {
            blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
            return false
        }
        if charIsWhitespace {
            blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        return emitBuffer(entering: .preformattedNonEmptyBlock(depth: depth))

    case .preformattedNonEmptyBlockWithTrailingWhitespace(let depth),
         .preformattedEmptyBlockWithLeadingWhitespace(let depth):
        if charIsNewline {
            blockState = .preformattedLineBreak(depth: depth)
            temporaryBuffer.append(lineBreak)
            return false
        }
        if charIsWhitespace {
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        return emitBuffer(entering: .preformattedNonEmptyBlock(depth: depth))
    }
}

/// Switches to `state`, hands the buffered text to `append`, empties the
/// buffer, and returns `true` so the caller emits the current character.
private mutating func emitBuffer(entering state: BlockState) -> Bool {
    blockState = state
    append(temporaryBuffer)
    temporaryBuffer = ""
    return true
}
/// Handles a `<br>` tag.
///
/// In most states the line break is only buffered in `temporaryBuffer`, so it
/// can still be discarded if no content follows; a few states write directly
/// through `append`.
mutating func breakTag() {
    switch blockState {
    case .start, .preformattedStart:
        // Line breaks before any content are ignored.
        break

    case .emptyBlock:
        blockState = .lineBreakTag
        append(blockBreak)
        temporaryBuffer.append(lineBreak)

    case .nonEmptyBlock:
        blockState = .lineBreakTag
        temporaryBuffer.append(lineBreak)

    case .lineBreakTag:
        blockState = .atLeastTwoLineBreakTags
        temporaryBuffer.append(lineBreak)

    case .atLeastTwoLineBreakTags,
         .atLeastTwoLineBreakTagsInListItemContent,
         .preformattedAtLeastTwoLineBreaks:
        // Already counting multiple breaks: just buffer one more.
        temporaryBuffer.append(lineBreak)

    case .emptyBlockWithAtLeastTwoPreviousLineBreakTags, .beginListItem:
        append(lineBreak)

    case .endListItem, .listItemContent:
        blockState = .lineBreakTagInListItemContent
        temporaryBuffer.append(lineBreak)

    case .lineBreakTagInListItemContent:
        blockState = .atLeastTwoLineBreakTagsInListItemContent
        temporaryBuffer.append(lineBreak)

    case .preformattedEmptyBlock(let depth),
         .preformattedNonEmptyBlock(let depth),
         .preformattedNonEmptyBlockWithTrailingWhitespace(let depth),
         .preformattedEmptyBlockWithLeadingWhitespace(let depth):
        blockState = .preformattedLineBreak(depth: depth)
        temporaryBuffer.append(lineBreak)

    case .preformattedLineBreak(let depth):
        blockState = .preformattedAtLeastTwoLineBreaks(depth: depth)
        temporaryBuffer.append(lineBreak)

    case .afterPreStartTag(let depth):
        // The pending block break is buffered together with the line break.
        blockState = .preformattedLineBreak(depth: depth)
        temporaryBuffer.append(blockBreak)
        temporaryBuffer.append(lineBreak)

    case .afterPreStartTagWithLeadingWhitespace(let depth):
        blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
        temporaryBuffer.append(lineBreak)
    }
}
/// Handles a `<pre>` start tag, entering preformatted mode or increasing the
/// nesting depth when already inside one.
///
/// The visible callers invoke `startOrEndBlock()` immediately before this
/// method; the states listed as unreachable are ones that call is expected
/// to have already left.
mutating func startPreformatted() {
    switch blockState {
    case .start:
        blockState = .preformattedStart(depth: 1)
    case .emptyBlock:
        blockState = .afterPreStartTag(depth: 1)
    case .emptyBlockWithAtLeastTwoPreviousLineBreakTags, .beginListItem:
        blockState = .afterPreStartTagWithLeadingWhitespace(depth: 1)
    case .preformattedStart(let depth):
        blockState = .preformattedStart(depth: depth + 1)
    case .preformattedEmptyBlock(let depth):
        blockState = .afterPreStartTag(depth: depth + 1)
    case .preformattedEmptyBlockWithLeadingWhitespace(let depth):
        blockState = .afterPreStartTagWithLeadingWhitespace(depth: depth + 1)
    case .nonEmptyBlock,
         .lineBreakTag,
         .atLeastTwoLineBreakTags,
         .endListItem,
         .listItemContent,
         .lineBreakTagInListItemContent,
         .atLeastTwoLineBreakTagsInListItemContent,
         .preformattedNonEmptyBlock,
         .preformattedLineBreak,
         .preformattedAtLeastTwoLineBreaks,
         .afterPreStartTag,
         .afterPreStartTagWithLeadingWhitespace,
         .preformattedNonEmptyBlockWithTrailingWhitespace:
        fatalError("unreachable")
    }
}
/// Handles a `</pre>` end tag, leaving preformatted mode or decreasing the
/// nesting depth.
///
/// The visible callers invoke `startOrEndBlock()` immediately before this
/// method; the states listed as unreachable are ones that call is expected
/// to have already left.
mutating func endPreformatted() {
    switch blockState {
    case .start,
         .emptyBlock,
         .emptyBlockWithAtLeastTwoPreviousLineBreakTags,
         .beginListItem:
        // A </pre> outside of preformatted content has no effect.
        break

    case .preformattedStart(let depth):
        blockState = depth <= 1 ? .start : .preformattedStart(depth: depth - 1)

    case .preformattedEmptyBlock(let depth):
        blockState = depth <= 1 ? .emptyBlock : .preformattedEmptyBlock(depth: depth - 1)

    case .preformattedEmptyBlockWithLeadingWhitespace(let depth):
        if depth <= 1 {
            // Leaving the outermost <pre>: pending whitespace is trailing and is dropped.
            blockState = .emptyBlock
            temporaryBuffer = ""
        } else {
            // Closing a nested <pre> removes one buffered character. The buffer
            // can legitimately be empty here (e.g. "<ul><li><pre><pre></pre>"),
            // and removeLast() traps on an empty string, so guard the removal.
            if !temporaryBuffer.isEmpty {
                temporaryBuffer.removeLast()
            }
            // Stay in the "leading whitespace" state only while something is
            // still buffered (i.e. the buffer held at least two characters).
            blockState = temporaryBuffer.isEmpty
                ? .preformattedEmptyBlock(depth: depth - 1)
                : .preformattedEmptyBlockWithLeadingWhitespace(depth: depth - 1)
        }

    case .nonEmptyBlock,
         .lineBreakTag,
         .atLeastTwoLineBreakTags,
         .endListItem,
         .listItemContent,
         .lineBreakTagInListItemContent,
         .atLeastTwoLineBreakTagsInListItemContent,
         .preformattedNonEmptyBlock,
         .preformattedLineBreak,
         .preformattedAtLeastTwoLineBreaks,
         .afterPreStartTag,
         .afterPreStartTagWithLeadingWhitespace,
         .preformattedNonEmptyBlockWithTrailingWhitespace:
        fatalError("unreachable")
    }
}
/// Handles an `<li>` start tag: emits whatever separator is needed before the
/// new item and moves to `.beginListItem`.
///
/// States not covered by a transition (including every preformatted state)
/// are left untouched.
mutating func startListItem() {
    switch blockState {
    case .start:
        blockState = .beginListItem

    case .emptyBlock, .nonEmptyBlock:
        blockState = .beginListItem
        append(blockBreak)

    case .endListItem, .listItemContent:
        blockState = .beginListItem
        append(lineBreak)

    case .lineBreakTagInListItemContent, .atLeastTwoLineBreakTagsInListItemContent:
        // Buffered breaks belong to the previous item, so write them out
        // before the break that separates the items.
        blockState = .beginListItem
        append(temporaryBuffer)
        temporaryBuffer = ""
        append(lineBreak)

    case .beginListItem,
         .lineBreakTag,
         .atLeastTwoLineBreakTags,
         .emptyBlockWithAtLeastTwoPreviousLineBreakTags,
         .preformattedStart,
         .preformattedEmptyBlock,
         .preformattedNonEmptyBlock,
         .preformattedLineBreak,
         .preformattedAtLeastTwoLineBreaks,
         .afterPreStartTag,
         .afterPreStartTagWithLeadingWhitespace,
         .preformattedNonEmptyBlockWithTrailingWhitespace,
         .preformattedEmptyBlockWithLeadingWhitespace:
        break
    }
}
/// Handles an `</li>` end tag by moving to `.endListItem`, discarding any
/// line breaks that were buffered at the end of the item's content.
///
/// States without a transition (including every preformatted state) are left
/// untouched.
mutating func endListItem() {
    switch blockState {
    case .beginListItem,
         .emptyBlock,
         .nonEmptyBlock,
         .listItemContent:
        // `.beginListItem` is included per the state diagram
        // (beginListItem -> endListItem on </li>). Without it an empty
        // "<li></li>" left the machine in `.beginListItem`, so the next <li>
        // was started without a separating line break.
        blockState = .endListItem

    case .lineBreakTagInListItemContent, .atLeastTwoLineBreakTagsInListItemContent:
        // Trailing line breaks inside the item are dropped.
        blockState = .endListItem
        temporaryBuffer = ""

    case .start,
         .lineBreakTag,
         .atLeastTwoLineBreakTags,
         .emptyBlockWithAtLeastTwoPreviousLineBreakTags,
         .endListItem,
         .preformattedStart,
         .preformattedEmptyBlock,
         .preformattedNonEmptyBlock,
         .preformattedLineBreak,
         .preformattedAtLeastTwoLineBreaks,
         .afterPreStartTag,
         .afterPreStartTagWithLeadingWhitespace,
         .preformattedNonEmptyBlockWithTrailingWhitespace,
         .preformattedEmptyBlockWithLeadingWhitespace:
        break
    }
}
/// Hook for the end of the input.
///
/// Currently performs no work for any state: whatever is still held in
/// `temporaryBuffer` is simply never emitted.
mutating func endBlocks() {
    // Intentionally empty.
}
}
/// The states of `BlockStateMachine`.
///
/// Preformatted states carry `depth`, the current `<pre>` nesting level
/// (1 for the outermost element).
enum BlockState: Equatable {
    // Regular flow content (outside <pre> and list items).
    case start, emptyBlock, nonEmptyBlock
    case lineBreakTag, atLeastTwoLineBreakTags
    case emptyBlockWithAtLeastTwoPreviousLineBreakTags

    // List items.
    case beginListItem, endListItem, listItemContent
    case lineBreakTagInListItemContent, atLeastTwoLineBreakTagsInListItemContent

    // Preformatted content.
    case preformattedStart(depth: Int32)
    case preformattedEmptyBlock(depth: Int32), preformattedNonEmptyBlock(depth: Int32)
    case preformattedLineBreak(depth: Int32), preformattedAtLeastTwoLineBreaks(depth: Int32)
    case afterPreStartTag(depth: Int32), afterPreStartTagWithLeadingWhitespace(depth: Int32)
    case preformattedNonEmptyBlockWithTrailingWhitespace(depth: Int32)
    case preformattedEmptyBlockWithLeadingWhitespace(depth: Int32)
}
/// Cheap whitespace test covering only space, newline and tab.
///
/// Deliberately not a full Unicode whitespace check: consulting the real
/// Unicode properties is slow, and these three cover the vast majority of
/// actual input.
@inline(__always)
private func isWhitespace(_ c: UnicodeScalar) -> Bool {
    switch c {
    case " ", "\n", "\t":
        return true
    default:
        return false
    }
}

View File

@ -17,14 +17,6 @@ public enum ElementAction: Equatable {
case skip
case replace(String)
case append(String)
/// Whether this action is a `.replace`, regardless of the replacement string.
var isReplace: Bool {
    guard case .replace = self else {
        return false
    }
    return true
}
}
public extension HTMLConversionCallbacks {

View File

@ -7,19 +7,30 @@
import Foundation
public struct TextConverter<Callbacks: HTMLConversionCallbacks> {
public class TextConverter<Callbacks: HTMLConversionCallbacks> {
private let configuration: TextConverterConfiguration
private var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator>!
private var str: String!
private var actionStack: [ElementAction] = []
private var blockState = BlockState.unstarted
/// Stack of element actions, pushed and popped as tags are processed.
///
/// Every mutation recomputes `hasSkipOrReplaceElementAction`, which is true
/// when at least one entry on the stack is `.skip` or `.replace`.
private var actionStack: [ElementAction] = [] {
    didSet {
        hasSkipOrReplaceElementAction = actionStack.contains { action in
            switch action {
            case .skip, .replace:
                return true
            default:
                return false
            }
        }
    }
}
private var hasSkipOrReplaceElementAction = false
var blockStateMachine = BlockStateMachine(blockBreak: "", lineBreak: "", listIndentForContentOutsideItem: "", append: { _ in }, removeChar: {})
private var currentElementIsEmpty = true
private var currentRun = ""
public init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks {
public convenience init(configuration: TextConverterConfiguration = .init()) where Callbacks == DefaultCallbacks {
self.init(configuration: configuration, callbacks: DefaultCallbacks.self)
}
@ -27,11 +38,19 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks> {
self.configuration = configuration
}
public mutating func convert(html: String) -> String {
public func convert(html: String) -> String {
tokenizer = Tokenizer(chars: html.unicodeScalars.makeIterator())
str = ""
blockState = .unstarted
blockStateMachine = BlockStateMachine(
blockBreak: configuration.insertNewlines ? "\n\n" : " " ,
lineBreak: configuration.insertNewlines ? "\n" : " " ,
listIndentForContentOutsideItem: "",
append: { [unowned self] in
self.append($0)
}, removeChar: { [unowned self] in
self.removeChar()
})
currentElementIsEmpty = true
currentRun = ""
@ -39,12 +58,18 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks> {
switch token {
case .character(let scalar):
currentElementIsEmpty = false
continueBlock()
currentRun.unicodeScalars.append(scalar)
if blockStateMachine.continueBlock(char: scalar),
!hasSkipOrReplaceElementAction {
currentRun.unicodeScalars.append(scalar)
}
case .characterSequence(let string):
currentElementIsEmpty = false
continueBlock()
currentRun.append(string)
for c in string.unicodeScalars {
if blockStateMachine.continueBlock(char: c),
!hasSkipOrReplaceElementAction {
currentRun.unicodeScalars.append(c)
}
}
case .startTag(let name, let selfClosing, let attributes):
currentElementIsEmpty = true
let action = Callbacks.elementAction(name: name, attributes: attributes)
@ -55,107 +80,81 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks> {
handleStartTag(name, selfClosing: selfClosing, attributes: attributes)
case .endTag(let name):
handleEndTag(name)
if actionStack.last != .default {
finishRun()
if let action = actionStack.last {
if action != .default {
finishRun()
}
actionStack.removeLast()
}
actionStack.removeLast()
case .comment, .doctype:
break
}
}
blockStateMachine.endBlocks()
finishRun()
return str
}
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
private func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
switch name {
case "br":
if configuration.insertNewlines {
currentRun.append("\n")
} else {
currentRun.append(" ")
}
blockStateMachine.breakTag()
case "pre", "blockquote", "p", "ol", "ul":
startBlockIfNecessary()
blockStateMachine.startOrEndBlock()
default:
break
}
}
private mutating func handleEndTag(_ name: String) {
private func handleEndTag(_ name: String) {
switch name {
case "pre", "blockquote", "p", "ol", "ul":
finishBlockElement()
blockStateMachine.startOrEndBlock()
finishRun()
default:
break
}
}
private mutating func startBlockIfNecessary() {
switch blockState {
case .unstarted:
blockState = .started(false)
case .started:
break
case .ongoing:
if configuration.insertNewlines {
currentRun.append("\n\n")
} else {
currentRun.append(" ")
}
blockState = .started(true)
case .finished(let nonEmpty):
if nonEmpty {
if configuration.insertNewlines {
currentRun.append("\n\n")
} else {
currentRun.append(" ")
}
}
blockState = .started(nonEmpty)
/// Text for a block break: two newlines when the configuration asks for
/// newlines to be inserted, otherwise a single space.
var blockBreak: String {
    configuration.insertNewlines ? "\n\n" : " "
}
private mutating func continueBlock() {
switch blockState {
case .unstarted, .started(_):
blockState = .ongoing
case .ongoing:
break
case .finished(let nonEmpty):
if nonEmpty {
if configuration.insertNewlines {
currentRun.append("\n\n")
} else {
currentRun.append(" ")
}
}
blockState = .ongoing
/// Text for a line break: one newline when the configuration asks for
/// newlines to be inserted, otherwise a single space.
var lineBreak: String {
    configuration.insertNewlines ? "\n" : " "
}
private mutating func finishBlockElement() {
if blockState == .started(true) && currentElementIsEmpty {
if configuration.insertNewlines {
currentRun.removeLast(2)
} else {
currentRun.removeLast(1)
}
}
blockState = .finished(blockState == .ongoing)
/// Indent string for content that appears in a list but outside any item.
var listIndentForContentOutsideItem: String {
    return " "
}
private mutating func finishRun() {
if actionStack.contains(.skip) {
currentRun = ""
return
} else if case .append(let s) = actionStack.last {
/// Adds `s` to the end of the run that is currently being built.
func append(_ s: String) {
    currentRun += s
}
/// Removes the most recently produced character.
///
/// The character is taken from `currentRun` when it has content; otherwise
/// it is taken from the accumulated output `str`. `removeLast()` requires a
/// non-empty string, so callers must only invoke this when a character has
/// actually been produced.
func removeChar() {
    guard currentRun.isEmpty else {
        currentRun.removeLast()
        return
    }
    str.removeLast()
}
private func finishRun() {
if case .append(let s) = actionStack.last {
currentRun.append(s)
} else if case .replace(let replacement) = actionStack.first(where: \.isReplace) {
currentRun = replacement
} else if case .replace(let replacement) = actionStack.last {
currentRun.append(replacement)
}
guard !currentRun.isEmpty else {

View File

@ -731,7 +731,7 @@ private extension Tokenizer {
switch c {
case "\t", "\n", "\u{000C}", " ":
// ignore the character
return next()
return tokenizeBeforeAttributeName()
case "/", ">", nil:
reconsume(c)
state = .afterAttributeName
@ -794,13 +794,16 @@ private extension Tokenizer {
switch nextChar() {
case "\t", "\n", "\u{000C}", " ":
// ignore the character
return tokenizeAttributeName()
return tokenizeAfterAttributeName()
case "/":
state = .selfClosingStartTag
return tokenizeSelfClosingStartTag()
case "=":
state = .beforeAttributeValue
return tokenizeBeforeAttributeValue()
case ">":
state = .data
return takeCurrentToken()
case nil:
// parse error: eof-in-tag
state = .endOfFile
@ -1552,12 +1555,12 @@ private extension Unicode.Scalar {
case "7": 7
case "8": 8
case "9": 9
case "A": 0xA
case "B": 0xB
case "C": 0xC
case "D": 0xD
case "E": 0xE
case "F": 0xF
case "A", "a": 0xA
case "B", "b": 0xB
case "C", "c": 0xC
case "D", "d": 0xD
case "E", "e": 0xE
case "F", "f": 0xF
default: nil
}
}

View File

@ -50,7 +50,7 @@ final class AttributedStringConverterTests: XCTestCase {
color: color,
paragraphStyle: .default
)
var converter = AttributedStringConverter<Callbacks>(configuration: config)
let converter = AttributedStringConverter<Callbacks>(configuration: config)
return converter.convert(html: html)
}
@ -206,20 +206,11 @@ final class AttributedStringConverterTests: XCTestCase {
}
func testMultipleBlockElements() {
let result = NSMutableAttributedString()
result.append(NSAttributedString(string: "a", attributes: [
let result = NSAttributedString(string: "a\n\nb", attributes: [
.font: italicFont,
.paragraphStyle: blockquoteParagraphStyle,
]))
result.append(NSAttributedString(string: "\n\n", attributes: [
.font: font,
.paragraphStyle: NSParagraphStyle.default,
]))
result.append(NSAttributedString(string: "b", attributes: [
.font: italicFont,
.paragraphStyle: blockquoteParagraphStyle,
]))
result.addAttribute(.foregroundColor, value: color, range: NSRange(location: 0, length: result.length))
.foregroundColor: color,
])
XCTAssertEqual(convert("<blockquote>a</blockquote><blockquote>b</blockquote>"), result)
}
@ -273,6 +264,12 @@ final class AttributedStringConverterTests: XCTestCase {
.paragraphStyle: NSParagraphStyle.default,
.foregroundColor: color,
]))
let replaceNested = convert("<span class='replace'><b>a</b></span>", callbacks: Callbacks.self)
XCTAssertEqual(replaceNested, NSAttributedString(string: "", attributes: [
.font: font,
.paragraphStyle: NSParagraphStyle.default,
.foregroundColor: color,
]))
let appended = convert("<span class='append'>test</span>", callbacks: Callbacks.self)
XCTAssertEqual(appended, NSAttributedString(string: "test…", attributes: [
.font: font,
@ -321,12 +318,12 @@ final class AttributedStringConverterTests: XCTestCase {
func testFollowedByList() {
let result = NSMutableAttributedString()
result.append(NSAttributedString(string: "a\n\n", attributes: [
result.append(NSAttributedString(string: "a", attributes: [
.font: font,
.paragraphStyle: NSParagraphStyle.default,
.foregroundColor: color,
]))
result.append(NSAttributedString(string: "\t1.\tb\n\t2.\tc", attributes: [
result.append(NSAttributedString(string: "\n\n\t1.\tb\n\t2.\tc", attributes: [
.font: font,
.paragraphStyle: listParagraphStyle,
.foregroundColor: color,
@ -336,6 +333,15 @@ final class AttributedStringConverterTests: XCTestCase {
XCTAssertEqual(convert("a<ol><li>b</li><li>c</li></ol>"), result)
}
/// An `<li>` that is not inside a list converts to its plain text with the
/// default paragraph style.
func testListItemOutsideList() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a", attributes: plainAttributes)
    XCTAssertEqual(convert("<li>a</li>"), expected)
}
func testSkipElementActionFollowingUnfinishedRun() {
struct Callbacks: HTMLConversionCallbacks {
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
@ -352,4 +358,334 @@ final class AttributedStringConverterTests: XCTestCase {
XCTAssertEqual(convert(#"<a href="https://example.com"><span class="invisible">https://</span><span>example.com</span><span class="invisible"></span></a>"#, callbacks: Callbacks.self), result)
}
/// A lone closing tag with no matching start tag converts to an empty result.
func testMalformedOnlyClosingTag() {
    let converted = convert("</span>")
    XCTAssertEqual(converted, .init())
}
/// Two consecutive closing block tags followed by a new block yield exactly
/// one block break between the blockquote text and the paragraph text.
func testMultipleClosingBlockTagsBeforeOpeningBlockTag() {
    let quoted = NSAttributedString(string: "a", attributes: [
        .font: italicFont,
        .paragraphStyle: blockquoteParagraphStyle,
        .foregroundColor: color,
    ])
    let paragraph = NSAttributedString(string: "\n\nb", attributes: [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])
    let expected = NSMutableAttributedString()
    expected.append(quoted)
    expected.append(paragraph)
    XCTAssertEqual(convert(#"<blockquote><p>a</p></blockquote><p>b</p>"#), expected)
}
/// A newline between two paragraphs, or at the very start of the second
/// paragraph, does not add anything beyond the block break.
func testNewlineBetweenClosingAndOpeningBlockTag() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a\n\nb", attributes: plainAttributes)
    XCTAssertEqual(convert("<p>a</p>\n<p>b</p>"), expected)
    XCTAssertEqual(convert("<p>a</p><p>\nb</p>"), expected)
}
/// Newlines at the start or end of a paragraph's content are dropped, while
/// newlines between two characters are kept verbatim.
func testEndAfterNewlineInBlockContent() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]

    let trimmed = NSAttributedString(string: "a", attributes: plainAttributes)
    XCTAssertEqual(convert("<p>a\n\n</p>"), trimmed)
    XCTAssertEqual(convert("<p>a\n\n</p>\n"), trimmed)
    XCTAssertEqual(convert("<p>\n\na</p>"), trimmed)
    XCTAssertEqual(convert("<p>\n\na</p>\n"), trimmed)

    let interior = NSAttributedString(string: "a\n\n\nb", attributes: plainAttributes)
    XCTAssertEqual(convert("<p>a\n\n\nb</p>"), interior)
}
/// A `<br>` right before a paragraph's end adds nothing to the block break,
/// whereas a `<br>` right after a paragraph's start adds one extra newline.
func testBRAtBlockElementBoundary() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]

    let twoNewlines = NSAttributedString(string: "a\n\nb", attributes: plainAttributes)
    XCTAssertEqual(convert("<p>a<br></p><p>b</p>"), twoNewlines)

    let threeNewlines = NSAttributedString(string: "a\n\n\nb", attributes: plainAttributes)
    XCTAssertEqual(convert("<p>a</p><p><br>b</p>"), threeNewlines)
}
/// A `<pre>` ending in `<br>` followed by a paragraph produces a single
/// block break; the break carries the paragraph's (non-monospace) font.
func testPreFollowedByP() {
    let preformatted = NSAttributedString(string: "a", attributes: [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])
    let paragraph = NSAttributedString(string: "\n\nb", attributes: [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])
    let expected = NSMutableAttributedString()
    expected.append(preformatted)
    expected.append(paragraph)
    XCTAssertEqual(convert("<pre>a<br></pre><p>b</p>"), expected)
}
/// Two adjacent `<pre>` elements are separated by one block break, all in
/// the monospace font.
func testPreFollowedByPre() {
    let monospaceAttributes: [NSAttributedString.Key: Any] = [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a\n\nb", attributes: monospaceAttributes)
    XCTAssertEqual(convert("<pre>a</pre><pre>b</pre>"), expected)
}
/// Same boundary rules as for paragraphs, but inside `<pre>`: a trailing
/// `<br>` adds nothing, a leading `<br>` adds one extra newline.
func testBRAtPreBoundary() {
    let monospaceAttributes: [NSAttributedString.Key: Any] = [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]

    let twoNewlines = NSAttributedString(string: "a\n\nb", attributes: monospaceAttributes)
    XCTAssertEqual(convert("<pre>a<br></pre><pre>b</pre>"), twoNewlines)

    let threeNewlines = NSAttributedString(string: "a\n\n\nb", attributes: monospaceAttributes)
    XCTAssertEqual(convert("<pre>a</pre><pre><br>b</pre>"), threeNewlines)
}
/// Nested `<pre>` elements: an inner `<pre>` behaves like a new block inside
/// the outer one, with the same `<br>` boundary handling as sibling blocks.
func testNestedPre() {
    let monospaceAttributes: [NSAttributedString.Key: Any] = [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]

    let single = NSAttributedString(string: "a", attributes: monospaceAttributes)
    XCTAssertEqual(convert("<pre><pre>a</pre></pre>"), single)

    let twoNewlines = NSAttributedString(string: "a\n\nb", attributes: monospaceAttributes)
    XCTAssertEqual(convert("<pre>a<pre>b</pre></pre>"), twoNewlines)
    XCTAssertEqual(convert("<pre>a<br><pre>b</pre></pre>"), twoNewlines)

    let threeNewlines = NSAttributedString(string: "a\n\n\nb", attributes: monospaceAttributes)
    XCTAssertEqual(convert("<pre>a<pre><br>b</pre></pre>"), threeNewlines)
}
/// A newline directly after the `<pre>` start tag is ignored, both at the
/// start of the document and after preceding text.
func testIgnoreLeadingNewlineInPre() {
    let monospaceAttributes: [NSAttributedString.Key: Any] = [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]

    let preOnly = NSAttributedString(string: "a", attributes: monospaceAttributes)
    XCTAssertEqual(convert("<pre>\na</pre>"), preOnly)

    let textThenPre = NSMutableAttributedString()
    textThenPre.append(NSAttributedString(string: "a", attributes: plainAttributes))
    textThenPre.append(NSAttributedString(string: "\n\nb", attributes: monospaceAttributes))
    XCTAssertEqual(convert("a<pre>\nb</pre>"), textThenPre)
}
/// Text followed by a `<pre>` gets a block break before the preformatted
/// content; the break carries the monospace font.
func testPreFollowingChar() {
    let leadingText = NSAttributedString(string: "a", attributes: [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])
    let preformatted = NSAttributedString(string: "\n\nb", attributes: [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])
    let expected = NSMutableAttributedString()
    expected.append(leadingText)
    expected.append(preformatted)
    XCTAssertEqual(convert("a<pre>b</pre>"), expected)
}
/// Whitespace before the first content and after the last content is
/// skipped, for bare text, paragraphs, and `<pre>` alike.
func testSkipLeadingTrailingWhitespace() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let monospaceAttributes: [NSAttributedString.Key: Any] = [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]

    let trimmed = NSAttributedString(string: "a", attributes: plainAttributes)
    XCTAssertEqual(convert(" \n\ta"), trimmed)
    XCTAssertEqual(convert(" \n\t<p>a</p>"), trimmed)
    XCTAssertEqual(convert("a\n\t"), trimmed)
    XCTAssertEqual(convert("<p>a</p> \n\t"), trimmed)

    let withTrailingSpace = NSAttributedString(string: "a ", attributes: plainAttributes)
    XCTAssertEqual(convert("a \n\t"), withTrailingSpace)

    let preformatted = NSAttributedString(string: "a", attributes: monospaceAttributes)
    XCTAssertEqual(convert(" \n\t<pre>a</pre>"), preformatted)
    XCTAssertEqual(convert("<pre>a</pre> \n\t"), preformatted)
}
/// Whitespace between two characters inside a paragraph is preserved as-is.
func testDoesNotCollapseWhitespace() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a \t\nb", attributes: plainAttributes)
    XCTAssertEqual(convert("<p>a \t\nb</p>"), expected)
}
/// A single paragraph wrapped inside each list item does not introduce any
/// block break of its own.
func testParagraphInsideListItem() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li><p>a</p></li><li><p>b</p></li></ol>"), expected)
}
/// Two paragraphs inside one list item are separated by a block break.
func testMultipleParagraphsInsideListItem() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\nb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li><p>a</p><p>b</p></li></ol>"), expected)
}
/// A `<br>` placed between two list items adds one extra newline between them.
func testBreakBetweenListItems() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li>a</li><br><li>b</li></ol>"), expected)
}
/// Text between two list items is placed on its own indented line; a
/// trailing space in that text is kept.
func testCharacterBetweenListItems() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]

    let withoutSpace = NSAttributedString(string: "\t1.\ta\n\t\tc\n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li>a</li>c<li>b</li></ol>"), withoutSpace)

    let withSpace = NSAttributedString(string: "\t1.\ta\n\t\tc \n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li>a</li>c <li>b</li></ol>"), withSpace)
}
/// Checks how whitespace inside text that sits between two list items is
/// rendered.
func testWhitespaceCollapsingInTextBetweenListItems() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\t\tc d\n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li>a</li>c d<li>b</li></ol>"), expected)
}
/// List items without explicit `</li>` end tags render the same as
/// explicitly closed ones.
func testImplicitlyClosedListItem() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li>a<li>b</ol>"), expected)
}
/// A `<pre>` as the sole content of a list item follows the item marker
/// directly, with no block break in between.
func testPreInsideListItem() {
    let marker = NSAttributedString(string: "\t1.\t", attributes: [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ])
    let preformatted = NSAttributedString(string: "a", attributes: [
        .font: monospaceFont,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ])
    let expected = NSMutableAttributedString()
    expected.append(marker)
    expected.append(preformatted)
    XCTAssertEqual(convert("<ol><li><pre>a</pre></li></ol>"), expected)
}
/// A skipped element at the start of a paragraph must not swallow the block
/// break that separates the paragraph from the previous one.
func testInvisibleAtBeginningOfParagraphDoesNotPreventParagraphBreak() {
    // Skips every element whose class attribute is exactly "invisible".
    struct Invisible: HTMLConversionCallbacks {
        static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
            return attributes.attributeValue(for: "class") == "invisible" ? .skip : .default
        }
    }

    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a\n\nc", attributes: plainAttributes)
    let html = """
    <p>a</p><p><span class="invisible">b</span><span class="ellipsis">c</span></p>
    """
    XCTAssertEqual(convert(html, callbacks: Invisible.self), expected)
}
/// A replaced element at the start of a paragraph must not swallow the block
/// break that separates the paragraph from the previous one.
func testReplaceAtBeginningOfParagraphDoesNotPreventParagraphBreak() {
    // Replaces every element whose class attribute is exactly "replace" with "c".
    struct Replace: HTMLConversionCallbacks {
        static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
            return attributes.attributeValue(for: "class") == "replace" ? .replace("c") : .default
        }
    }

    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a\n\nc", attributes: plainAttributes)
    let html = """
    <p>a</p><p><span class="replace">b</span></p>
    """
    XCTAssertEqual(convert(html, callbacks: Replace.self), expected)
}
/// A `<br>` at the start of a paragraph: as the paragraph's only content it
/// yields four newlines between the neighbours; followed by text it yields
/// three newlines before that text.
func testLineBreakAtBeginningOfBlockElement() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]

    let breakOnly = NSAttributedString(string: "a\n\n\n\nb", attributes: plainAttributes)
    XCTAssertEqual(convert("<p>a</p><p><br></p><p>b</p>"), breakOnly)

    let breakThenText = NSAttributedString(string: "a\n\n\nb\n\nc", attributes: plainAttributes)
    XCTAssertEqual(convert("<p>a</p><p><br>b</p><p>c</p>"), breakThenText)
}
}

View File

@ -83,4 +83,8 @@ final class TextConverterTests: XCTestCase {
XCTAssertEqual(convert(#"<a href="https://example.com"><span class="invisible">https://</span><span>example.com</span><span class="invisible"></span></a>"#, callbacks: Callbacks.self), "example.com")
}
/// A lone closing tag with no matching start tag converts to an empty string.
func testMalformedOnlyClosingTag() {
    let converted = convert("</span>")
    XCTAssertEqual(converted, "")
}
}

View File

@ -33,6 +33,8 @@ final class TokenizerTests: XCTestCase {
/// Decimal and hexadecimal numeric character references are decoded, and
/// hexadecimal digits are accepted in both upper and lower case.
func testNumericCharacterReference() {
    let decimal = tokenize("&#33;")
    XCTAssertEqual(decimal, [.characterSequence("!")])

    let hexDigitsOnly = tokenize("&#x21;")
    XCTAssertEqual(hexDigitsOnly, [.characterSequence("!")])

    let hexUppercase = tokenize("&#x4A;")
    XCTAssertEqual(hexUppercase, [.characterSequence("J")])

    let hexLowercase = tokenize("&#x4a;")
    XCTAssertEqual(hexLowercase, [.characterSequence("J")])
}
func testStartTag() {
@ -74,6 +76,10 @@ final class TokenizerTests: XCTestCase {
XCTAssertEqual(tokenize("🇺🇸"), [.characterSequence("\u{1F1FA}\u{1F1F8}")])
}
/// Whitespace between an attribute name and the closing `>` still produces
/// the start tag, with the attribute carrying an empty value.
func testWhitespaceAfterAttributeName() {
    let tokens = tokenize("<a foo >")
    XCTAssertEqual(tokens, [
        .startTag("a", selfClosing: false, attributes: [.init(name: "foo", value: "")]),
    ])
}
}
private struct PrintIterator<Inner: IteratorProtocol>: IteratorProtocol {