The great encomplicating

This commit is contained in:
Shadowfacts 2024-02-21 11:15:27 -05:00
parent fa03efedbb
commit 1f26c4923c
5 changed files with 999 additions and 49 deletions

126
BlockState.dot Normal file
View File

@ -0,0 +1,126 @@
/*
 * State diagram for the BlockState machine that drives block breaks,
 * whitespace collapsing and <pre> handling in the HTML converters.
 *
 * Edge labels list the input that triggers the transition, followed in
 * parentheses by the side effect, if any. "tmp" is the temporary buffer that
 * holds pending line breaks / whitespace until it is known whether they
 * should be emitted or discarded. "depth" is the <pre> nesting depth.
 */
digraph blockstate {
    /* rankdir=LR; */
    node [shape = doublecircle, fontsize = 18]; end;
    node [shape = circle, fontsize = 18];
    edge [fontsize = 18];

    /* States */
    init [label = "", shape=none, height = .0, width = .0];
    start;
    emptyBlock [label = "empty block"];
    nonEmptyBlock [label = "non-empty block"];
    emittedSpace [label = "emitted space"];
    lineBreakTag [label = "line break tag"];
    atLeastTwoLineBreakTags [label = ">=2 line break tags"];
    emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "empty block w/ >=2 prev line break tags"];
    beginListItem [label = "begin list item"];
    endListItem [label = "end list item"];
    listItemContent [label = "list item content"];
    emittedSpaceInListItemContent [label = "emitted space in text in list item content"];
    lineBreakTagInListItemContent [label = "line break tag in list item content"];
    atLeastTwoLineBreakTagsInListItemContent [label = ">= 2 line break tags in list item content"];
    preformattedStart [label = "preformatted start"];
    preformattedEmptyBlock [label = "preformatted empty block"];
    preformattedNonEmptyBlock [label = "preformatted non-empty block"];
    preformattedLineBreak [label = "preformatted line break"];
    preformattedAtLeastTwoLineBreaks [label = "preformatted >=2 line breaks"];
    afterPreStartTag [label = "after <pre> start tag"];
    afterPreStartTagWithLeadingWhitespace [label = "after <pre> start tag w/ leading whitespace"];
    preformattedNonEmptyBlockWithTrailingWhitespace [label = "preformatted non-empty block w/ trailing whitespace"];
    preformattedEmptyBlockWithLeadingWhitespace [label = "preformatted empty block w/ leading whitespace"];

    /* Transitions outside of lists and <pre> */
    init -> start;
    start -> start [label = "whitespace (skip)\n<br> (skip)\n</pre>\nstart/end block"];
    start -> nonEmptyBlock [label = "non-whitespace"];
    start -> preformattedStart [label = "<pre> (depth = 1)"];
    start -> beginListItem [label = "<li>"];
    nonEmptyBlock -> nonEmptyBlock [label = "non-whitespace"];
    nonEmptyBlock -> emptyBlock [label = "start/end block"];
    nonEmptyBlock -> emittedSpace [label = "whitespace (emit space)"];
    nonEmptyBlock -> lineBreakTag [label = "<br> (append to tmp)"];
    nonEmptyBlock -> beginListItem [label = "<li>"];
    nonEmptyBlock -> endListItem [label = "</li>"];
    emittedSpace -> nonEmptyBlock [label = "non-whitespace"];
    emittedSpace -> emittedSpace [label = "whitespace (skip)"];
    emittedSpace -> emptyBlock [label = "start/end block (remove 1)"];
    emittedSpace -> lineBreakTag [label = "<br> (append to tmp)"];
    emittedSpace -> end [label = "EOF (remove 1)"];
    emptyBlock -> nonEmptyBlock [label = "non-whitespace (block break)"];
    emptyBlock -> emptyBlock [label = "whitespace (skip)\n<br>\n</pre>\nstart/end block"];
    emptyBlock -> afterPreStartTag [label = "<pre> (depth = 1)"];
    emptyBlock -> beginListItem [label = "<li>"];
    emptyBlock -> endListItem [label = "</li>"];
    lineBreakTag -> lineBreakTag [label = "whitespace (skip)"];
    lineBreakTag -> atLeastTwoLineBreakTags [label = "<br> (append to tmp)"];
    lineBreakTag -> emptyBlock [label = "start/end block (clear tmp)"];
    lineBreakTag -> nonEmptyBlock [label = "non-whitespace (emit tmp)"];
    atLeastTwoLineBreakTags -> atLeastTwoLineBreakTags [label = "whitespace (skip)\n<br> (append to tmp)"];
    atLeastTwoLineBreakTags -> nonEmptyBlock [label = "non-whitespace (emit tmp)"];
    atLeastTwoLineBreakTags -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "start/end block"];
    emptyBlockWithAtLeastTwoPreviousLineBreakTags -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "whitespace (skip)\n<br>\n</pre>\nstart/end block"];
    emptyBlockWithAtLeastTwoPreviousLineBreakTags -> nonEmptyBlock [label = "non-whitespace (emit tmp)"];
    emptyBlockWithAtLeastTwoPreviousLineBreakTags -> afterPreStartTagWithLeadingWhitespace [label = "<pre> (depth = 1)"];

    /* Transitions inside lists */
    beginListItem -> beginListItem [label = "<li>\nwhitespace (skip)\n<br>\nstart/end block"];
    beginListItem -> listItemContent [label = "non-whitespace"];
    beginListItem -> endListItem [label = "</li>"];
    beginListItem -> afterPreStartTagWithLeadingWhitespace [label = "<pre>"];
    endListItem -> endListItem [label = "whitespace (skip)\n</li>"];
    endListItem -> beginListItem [label = "<li> (line break)"];
    endListItem -> emptyBlock [label = "start/end block"];
    endListItem -> listItemContent [label = "non-whitespace (line break, indent)"];
    endListItem -> lineBreakTagInListItemContent [label = "<br> (append to tmp)"];
    listItemContent -> listItemContent [label = "non-whitespace"];
    listItemContent -> beginListItem [label = "<li> (line break)"];
    listItemContent -> lineBreakTagInListItemContent [label = "<br> (append to tmp)"];
    listItemContent -> emittedSpaceInListItemContent [label = "whitespace (emit space)"];
    listItemContent -> emptyBlock [label = "start/end block"];
    listItemContent -> endListItem [label = "</li>"];
    emittedSpaceInListItemContent -> emittedSpaceInListItemContent [label = "whitespace (skip)"];
    emittedSpaceInListItemContent -> listItemContent [label = "non-whitespace"];
    emittedSpaceInListItemContent -> end [label = "EOF (remove 1)"];
    emittedSpaceInListItemContent -> emptyBlock [label = "start/end block (remove 1)"];
    emittedSpaceInListItemContent -> beginListItem [label = "<li> (remove 1, line break)"];
    emittedSpaceInListItemContent -> lineBreakTagInListItemContent [label = "<br> (append to tmp)"];
    emittedSpaceInListItemContent -> endListItem [label = "</li> (remove 1)"];
    lineBreakTagInListItemContent -> lineBreakTagInListItemContent [label = "whitespace (skip)"];
    lineBreakTagInListItemContent -> emptyBlock [label = "start/end block (clear tmp)"];
    lineBreakTagInListItemContent -> beginListItem [label = "<li> (emit tmp, line break)"];
    lineBreakTagInListItemContent -> listItemContent [label = "non-whitespace (emit tmp)"];
    lineBreakTagInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "<br> (append to tmp)"];
    lineBreakTagInListItemContent -> endListItem [label = "</li> (clear tmp)"];
    atLeastTwoLineBreakTagsInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "<br> (append to tmp)\nwhitespace (skip)"];
    atLeastTwoLineBreakTagsInListItemContent -> beginListItem [label = "<li> (emit tmp, line break)"];
    atLeastTwoLineBreakTagsInListItemContent -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "start/end block"];
    atLeastTwoLineBreakTagsInListItemContent -> listItemContent [label = "non-whitespace (emit tmp)"];
    atLeastTwoLineBreakTagsInListItemContent -> endListItem [label = "</li> (clear tmp)"];

    /* Transitions inside <pre> */
    afterPreStartTag -> preformattedLineBreak [label = "<br> (append to tmp, append block break to tmp)"];
    afterPreStartTag -> preformattedNonEmptyBlock [label = "non \\n (block break)"];
    afterPreStartTag -> preformattedEmptyBlock [label = "\\n (skip)\nstart/end block"];
    preformattedLineBreak -> preformattedNonEmptyBlock [label = "non-whitespace (emit tmp)"];
    preformattedLineBreak -> preformattedNonEmptyBlockWithTrailingWhitespace [label = "other whitespace (append to tmp)"];
    preformattedLineBreak -> preformattedAtLeastTwoLineBreaks [label = "\\n or <br> (append to tmp)"];
    preformattedLineBreak -> preformattedEmptyBlockWithLeadingWhitespace [label = "start/end block (append line break to tmp)"];
    preformattedAtLeastTwoLineBreaks -> preformattedAtLeastTwoLineBreaks [label = "\\n or <br> (append to tmp)"];
    preformattedAtLeastTwoLineBreaks -> preformattedNonEmptyBlock [label = "non \\n or <br> (emit tmp)"];
    preformattedAtLeastTwoLineBreaks -> preformattedEmptyBlockWithLeadingWhitespace [label = "start/end block"];
    preformattedEmptyBlockWithLeadingWhitespace -> preformattedEmptyBlockWithLeadingWhitespace [label = "whitespace (append to tmp)\nstart/end block\n</pre> if depth>1&&tmp.count>=2 (depth - 1, remove 1 from tmp)"];
    preformattedEmptyBlockWithLeadingWhitespace -> preformattedLineBreak [label = "\\n or <br> (append to tmp)"];
    preformattedEmptyBlockWithLeadingWhitespace -> afterPreStartTagWithLeadingWhitespace [label = "<pre> (depth + 1)"];
    preformattedEmptyBlockWithLeadingWhitespace -> preformattedEmptyBlock [label = "</pre> if depth>1&&tmp.count<2 (depth - 1, remove 1 from tmp)"];
    preformattedEmptyBlockWithLeadingWhitespace -> emptyBlock [label = "</pre> if depth<=1 (clear tmp)"];
    preformattedEmptyBlock -> preformattedEmptyBlock [label = "start/end block\n</pre> if depth>1 (depth - 1)"];
    preformattedEmptyBlock -> emptyBlock [label = "</pre> if depth<=1"];
    preformattedEmptyBlock -> afterPreStartTag [label = "<pre> (depth + 1)"];
    preformattedEmptyBlock -> preformattedNonEmptyBlock [label = "non-whitespace (block break)"];
    preformattedEmptyBlock -> preformattedEmptyBlockWithLeadingWhitespace [label = "whitespace (append to tmp)"];
    preformattedEmptyBlock -> preformattedLineBreak [label = "<br> (append to tmp)"];
    preformattedNonEmptyBlock -> preformattedNonEmptyBlock [label = "non-whitespace"];
    preformattedNonEmptyBlock -> preformattedLineBreak [label = "\\n or <br> (append to tmp)"];
    preformattedNonEmptyBlock -> preformattedNonEmptyBlockWithTrailingWhitespace [label = "other whitespace (append to tmp)"];
    preformattedNonEmptyBlock -> preformattedEmptyBlock [label = "start/end block"];
    preformattedNonEmptyBlockWithTrailingWhitespace -> preformattedNonEmptyBlockWithTrailingWhitespace [label = "whitespace (append to tmp)"];
    preformattedNonEmptyBlockWithTrailingWhitespace -> preformattedNonEmptyBlock [label = "non-whitespace (emit tmp)"];
    preformattedNonEmptyBlockWithTrailingWhitespace -> preformattedLineBreak [label = "\\n or <br> (append to tmp)"];
    preformattedNonEmptyBlockWithTrailingWhitespace -> preformattedEmptyBlockWithLeadingWhitespace [label = "start/end block (append block break to tmp)"];
    afterPreStartTagWithLeadingWhitespace -> preformattedNonEmptyBlock [label = "non-whitespace (emit tmp)"];
    afterPreStartTagWithLeadingWhitespace -> preformattedEmptyBlockWithLeadingWhitespace [label = "\\n (skip)\nother whitespace (append to tmp)\n<br> (append to tmp)\nstart/end block"];
    preformattedStart -> preformattedStart [label = "<pre> (depth + 1)\n</pre> if depth>1 (depth - 1)\n\\n or <br> (skip)\nstart/end block"];
    preformattedStart -> start [label = "</pre> if depth<=1"];
    preformattedStart -> preformattedNonEmptyBlock [label = "non \\n"];
}

View File

@ -27,6 +27,7 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
private var actionStack: [ElementAction] = []
private var styleStack: [Style] = []
var blockState = BlockState.start
var temporaryBuffer: String = ""
private var currentElementIsEmpty = true
private var previouslyFinishedListItem = false
// The current run of text w/o styles changing
@ -47,6 +48,7 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
actionStack = []
styleStack = []
blockState = .start
temporaryBuffer = ""
currentElementIsEmpty = true
previouslyFinishedListItem = false
currentRun = ""
@ -55,12 +57,16 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
switch token {
case .character(let c):
currentElementIsEmpty = false
continueBlock()
currentRun.unicodeScalars.append(c)
if continueBlock(char: c) {
currentRun.unicodeScalars.append(c)
}
case .characterSequence(let s):
currentElementIsEmpty = false
continueBlock()
currentRun.append(s)
for c in s.unicodeScalars {
if continueBlock(char: c) {
currentRun.unicodeScalars.append(c)
}
}
case .comment:
// ignored
continue
@ -87,6 +93,7 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
}
}
endBlocks()
finishRun()
return str
@ -94,7 +101,7 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
if name == "br" {
currentRun.append("\n")
breakTag()
return
}
// self closing tags are ignored since they have no content
@ -126,29 +133,25 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
finishRun()
styleStack.append(.monospace)
case "pre":
startOrFinishBlock()
startOrEndBlock()
startPreformatted()
finishRun()
styleStack.append(.monospace)
case "blockquote":
startOrFinishBlock()
startOrEndBlock()
finishRun()
styleStack.append(.blockquote)
case "p":
startOrFinishBlock()
startOrEndBlock()
case "ol":
startOrFinishBlock()
startOrEndBlock()
finishRun()
styleStack.append(.orderedList(nextElementOrdinal: 1))
case "ul":
startOrFinishBlock()
startOrEndBlock()
finishRun()
styleStack.append(.unorderedList)
case "li":
if previouslyFinishedListItem {
currentRun.append("\n")
} else {
continueBlock()
}
let marker: String
if case .orderedList(let nextElementOrdinal) = styleStack.last {
marker = orderedTextList.marker(forItemNumber: nextElementOrdinal)
@ -158,6 +161,7 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
} else {
break
}
startListItem()
currentRun.append("\t\(marker)\t")
default:
break
@ -186,33 +190,55 @@ public struct AttributedStringConverter<Callbacks: HTMLConversionCallbacks>: Blo
case "pre":
finishRun()
removeLastStyle(.monospace)
startOrFinishBlock()
startOrEndBlock()
endPreformatted()
case "blockquote":
finishRun()
removeLastStyle(.blockquote)
startOrFinishBlock()
startOrEndBlock()
case "p":
startOrFinishBlock()
startOrEndBlock()
case "ol":
finishRun()
removeLastStyle(.orderedList)
startOrFinishBlock()
startOrEndBlock()
previouslyFinishedListItem = false
case "ul":
finishRun()
removeLastStyle(.unorderedList)
startOrFinishBlock()
startOrEndBlock()
previouslyFinishedListItem = false
case "li":
finishRun()
previouslyFinishedListItem = true
endListItem()
default:
break
}
}
/// Text inserted between two consecutive blocks: an empty line.
/// The block state machine appends this (directly or via the temporary
/// buffer) instead of the converter writing it into the run itself.
var blockBreak: String {
    "\n\n"
}
/// Text the block state machine uses for a single line break
/// (e.g. for a `<br>` tag or between list items).
var lineBreak: String {
    return "\n"
}
/// Indentation emitted (after a line break) in front of text that follows a
/// closed list item without a new `<li>` having been opened.
var listIndentForContentOutsideItem: String {
    return "\t\t"
}
/// Appends text produced by the block state machine to the current run.
/// - Parameter s: The text to add at the end of `currentRun`.
mutating func append(_ s: String) {
    currentRun += s
}
/// Removes the most recently emitted character.
///
/// The character is taken from the unfinished `currentRun` when it has one,
/// otherwise from the end of the already-built `str`. When neither holds any
/// text this is a no-op: building a range at `str.length - 1` on an empty
/// string would produce an invalid range and crash.
mutating func removeChar() {
    if !currentRun.isEmpty {
        currentRun.removeLast()
    } else if str.length > 0 {
        str.deleteCharacters(in: NSRange(location: str.length - 1, length: 1))
    }
}
// Finds the last currently-open style of the given type.

View File

@ -7,39 +7,572 @@
import Foundation
/*
This gnarly mess of a state machine is responsible for:
1) Inserting line breaks in the right places corresponding to boundaries between block elements
2) Preventing leading/trailing whitespace from being emitted
3) Collapsing whitespace within the string like https://www.w3.org/TR/css-text-3/#white-space-phase-1
4) Handling whitespace inside <pre> elements
DO NOT TOUCH THE CODE WITHOUT CHECKING/UPDATING THE DIAGRAM.
*/
/// Requirements a converter must fulfil so that the shared block state
/// machine (implemented in the `BlockRenderer` extension) can drive it.
///
/// The state machine never writes output directly; it only goes through
/// `append(_:)` / `removeChar()` and uses the separator strings supplied here.
protocol BlockRenderer {
    /// Current state of the block state machine.
    var blockState: BlockState { get set }
    /// Separator emitted between two blocks.
    var blockBreak: String { get }
    /// Separator emitted for a single line break.
    var lineBreak: String { get }
    /// Indentation for text that follows a closed list item outside any `<li>`.
    var listIndentForContentOutsideItem: String { get }
    /// Pending line breaks / whitespace that are emitted only if more content
    /// follows, and discarded otherwise.
    var temporaryBuffer: String { get set }
    /// Appends text to the converter's output.
    mutating func append(_ s: String)
    /// Removes the last character previously emitted to the output.
    mutating func removeChar()
}
extension BlockRenderer {
/// Advances the state machine when a block-level element starts or ends.
///
/// Nothing is emitted here; the block break itself is only produced once
/// non-whitespace content follows (see `continueBlock(char:)`). This method
/// discards a trailing collapsed space or pending single line break, and
/// moves into the matching "empty block" state.
mutating func startOrEndBlock() {
    switch blockState {
    case .start,
         .emptyBlock,
         .emptyBlockWithAtLeastTwoPreviousLineBreakTags,
         .beginListItem,
         .preformattedStart,
         .preformattedEmptyBlock,
         .preformattedEmptyBlockWithLeadingWhitespace:
        // Already at a block boundary: nothing to do.
        break
    case .nonEmptyBlock, .endListItem, .listItemContent:
        blockState = .emptyBlock
    case .emittedSpace, .emittedSpaceInListItemContent:
        // The space that was emitted turned out to be trailing whitespace.
        blockState = .emptyBlock
        removeChar()
    case .lineBreakTag, .lineBreakTagInListItemContent:
        // A single <br> right before a block boundary is dropped.
        blockState = .emptyBlock
        temporaryBuffer = ""
    case .atLeastTwoLineBreakTags, .atLeastTwoLineBreakTagsInListItemContent:
        // Two or more <br>s stay buffered and are emitted with the next content.
        blockState = .emptyBlockWithAtLeastTwoPreviousLineBreakTags
    case .preformattedNonEmptyBlock(let depth), .afterPreStartTag(let depth):
        blockState = .preformattedEmptyBlock(depth: depth)
    case .preformattedLineBreak(let depth):
        blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
        temporaryBuffer.append(lineBreak)
    case .preformattedAtLeastTwoLineBreaks(let depth),
         .afterPreStartTagWithLeadingWhitespace(let depth):
        blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
    case .preformattedNonEmptyBlockWithTrailingWhitespace(let depth):
        // The state must change here as well: startPreformatted() and
        // endPreformatted() run right after this method and treat the
        // trailing-whitespace state as unreachable.
        blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
        temporaryBuffer.append(blockBreak)
    }
}
/// Feeds one character of text content into the state machine.
///
/// Outside `<pre>`, leading whitespace is dropped and runs of whitespace are
/// collapsed into a single space, which this method emits itself via
/// `append(_:)`. Inside `<pre>`, whitespace and line breaks are held in
/// `temporaryBuffer` and flushed only once non-whitespace content follows.
///
/// - Parameter char: The scalar that was read.
/// - Returns: `true` if the caller should append `char` to its output,
///   `false` if the character was consumed (skipped, buffered or replaced).
mutating func continueBlock(char: UnicodeScalar) -> Bool {
    let isWhitespace = char.properties.isWhitespace
    let isNewline = char == "\n"

    switch blockState {
    case .start, .emittedSpace:
        if isWhitespace {
            return false
        }
        blockState = .nonEmptyBlock
        return true

    case .emptyBlock:
        if isWhitespace {
            return false
        }
        blockState = .nonEmptyBlock
        append(blockBreak)
        return true

    case .nonEmptyBlock:
        guard isWhitespace else {
            return true
        }
        // Any whitespace character is emitted as a single space.
        blockState = .emittedSpace
        append(" ")
        return false

    case .lineBreakTag,
         .atLeastTwoLineBreakTags,
         .emptyBlockWithAtLeastTwoPreviousLineBreakTags:
        if isWhitespace {
            return false
        }
        // Content follows the buffered <br>s, so they are emitted now.
        blockState = .nonEmptyBlock
        append(temporaryBuffer)
        temporaryBuffer = ""
        return true

    case .beginListItem, .emittedSpaceInListItemContent:
        if isWhitespace {
            return false
        }
        blockState = .listItemContent
        return true

    case .endListItem:
        if isWhitespace {
            return false
        }
        // Text after </li> without a new <li>: put it on its own indented line.
        blockState = .listItemContent
        append(lineBreak)
        append(listIndentForContentOutsideItem)
        return true

    case .listItemContent:
        guard isWhitespace else {
            return true
        }
        blockState = .emittedSpaceInListItemContent
        append(" ")
        return false

    case .lineBreakTagInListItemContent, .atLeastTwoLineBreakTagsInListItemContent:
        if isWhitespace {
            return false
        }
        blockState = .listItemContent
        append(temporaryBuffer)
        temporaryBuffer = ""
        return true

    case .preformattedStart(let depth):
        // Only a newline is skipped here; other whitespace is kept as content.
        if isNewline {
            return false
        }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        return true

    case .preformattedEmptyBlock(let depth):
        if isWhitespace {
            blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        append(blockBreak)
        return true

    case .preformattedNonEmptyBlock(let depth):
        if isNewline {
            blockState = .preformattedLineBreak(depth: depth)
            temporaryBuffer.append(lineBreak)
            return false
        }
        if isWhitespace {
            blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth)
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        return true

    case .preformattedLineBreak(let depth):
        if isNewline {
            blockState = .preformattedAtLeastTwoLineBreaks(depth: depth)
            temporaryBuffer.append(lineBreak)
            return false
        }
        if isWhitespace {
            blockState = .preformattedNonEmptyBlockWithTrailingWhitespace(depth: depth)
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        append(temporaryBuffer)
        temporaryBuffer = ""
        return true

    case .preformattedAtLeastTwoLineBreaks(let depth):
        if isWhitespace {
            // Newlines included: the scalar itself is buffered, state unchanged.
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        append(temporaryBuffer)
        temporaryBuffer = ""
        return true

    case .afterPreStartTag(let depth):
        // A newline directly after <pre> is skipped.
        if isNewline {
            blockState = .preformattedEmptyBlock(depth: depth)
            return false
        }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        append(blockBreak)
        return true

    case .afterPreStartTagWithLeadingWhitespace(let depth):
        if isNewline {
            // Skipped, not buffered.
            blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
            return false
        }
        if isWhitespace {
            blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        append(temporaryBuffer)
        temporaryBuffer = ""
        return true

    case .preformattedNonEmptyBlockWithTrailingWhitespace(let depth),
         .preformattedEmptyBlockWithLeadingWhitespace(let depth):
        if isNewline {
            blockState = .preformattedLineBreak(depth: depth)
            temporaryBuffer.append(lineBreak)
            return false
        }
        if isWhitespace {
            temporaryBuffer.unicodeScalars.append(char)
            return false
        }
        blockState = .preformattedNonEmptyBlock(depth: depth)
        append(temporaryBuffer)
        temporaryBuffer = ""
        return true
    }
}
/// Advances the state machine for a `<br>` tag.
///
/// In most states the line break is only buffered in `temporaryBuffer`, so it
/// can still be dropped if no content follows. In the "empty" states that
/// emit directly, it is appended to the output immediately.
mutating func breakTag() {
    switch blockState {
    case .start, .preformattedStart:
        // Leading line breaks are ignored.
        break

    case .emptyBlock,
         .emptyBlockWithAtLeastTwoPreviousLineBreakTags,
         .beginListItem:
        append(lineBreak)

    case .nonEmptyBlock, .emittedSpace:
        blockState = .lineBreakTag
        temporaryBuffer.append(lineBreak)

    case .lineBreakTag:
        blockState = .atLeastTwoLineBreakTags
        temporaryBuffer.append(lineBreak)

    case .atLeastTwoLineBreakTags,
         .atLeastTwoLineBreakTagsInListItemContent,
         .preformattedAtLeastTwoLineBreaks:
        // State is unchanged; only one more line break is buffered.
        temporaryBuffer.append(lineBreak)

    case .endListItem, .listItemContent, .emittedSpaceInListItemContent:
        blockState = .lineBreakTagInListItemContent
        temporaryBuffer.append(lineBreak)

    case .lineBreakTagInListItemContent:
        blockState = .atLeastTwoLineBreakTagsInListItemContent
        temporaryBuffer.append(lineBreak)

    case .preformattedEmptyBlock(let depth),
         .preformattedNonEmptyBlock(let depth),
         .preformattedNonEmptyBlockWithTrailingWhitespace(let depth),
         .preformattedEmptyBlockWithLeadingWhitespace(let depth):
        blockState = .preformattedLineBreak(depth: depth)
        temporaryBuffer.append(lineBreak)

    case .preformattedLineBreak(let depth):
        blockState = .preformattedAtLeastTwoLineBreaks(depth: depth)
        temporaryBuffer.append(lineBreak)

    case .afterPreStartTag(let depth):
        // The pending block break is buffered ahead of the line break.
        blockState = .preformattedLineBreak(depth: depth)
        temporaryBuffer.append(blockBreak)
        temporaryBuffer.append(lineBreak)

    case .afterPreStartTagWithLeadingWhitespace(let depth):
        blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth)
        temporaryBuffer.append(lineBreak)
    }
}
/// Advances the state machine for a `<pre>` start tag, entering preformatted
/// mode at depth 1 or increasing the nesting depth by one.
///
/// The visible caller invokes `startOrEndBlock()` immediately before this
/// method, so only block-boundary states are expected here; every other
/// state traps as unreachable.
mutating func startPreformatted() {
    switch blockState {
    // Entering <pre> from outside.
    case .start:
        blockState = .preformattedStart(depth: 1)
    case .emptyBlock:
        blockState = .afterPreStartTag(depth: 1)
    case .emptyBlockWithAtLeastTwoPreviousLineBreakTags, .beginListItem:
        blockState = .afterPreStartTagWithLeadingWhitespace(depth: 1)

    // Nested <pre>.
    case .preformattedStart(let depth):
        blockState = .preformattedStart(depth: depth + 1)
    case .preformattedEmptyBlock(let depth):
        blockState = .afterPreStartTag(depth: depth + 1)
    case .preformattedEmptyBlockWithLeadingWhitespace(let depth):
        blockState = .afterPreStartTagWithLeadingWhitespace(depth: depth + 1)

    case .nonEmptyBlock,
         .emittedSpace,
         .lineBreakTag,
         .atLeastTwoLineBreakTags,
         .endListItem,
         .listItemContent,
         .emittedSpaceInListItemContent,
         .lineBreakTagInListItemContent,
         .atLeastTwoLineBreakTagsInListItemContent,
         .preformattedNonEmptyBlock,
         .preformattedLineBreak,
         .preformattedAtLeastTwoLineBreaks,
         .afterPreStartTag,
         .afterPreStartTagWithLeadingWhitespace,
         .preformattedNonEmptyBlockWithTrailingWhitespace:
        fatalError("unreachable")
    }
}
/// Advances the state machine for a `</pre>` end tag, leaving preformatted
/// mode at depth 1 or decreasing the nesting depth by one.
///
/// The visible caller invokes `startOrEndBlock()` immediately before this
/// method, so only block-boundary states are expected here; every other
/// state traps as unreachable.
mutating func endPreformatted() {
    switch blockState {
    case .start,
         .emptyBlock,
         .emptyBlockWithAtLeastTwoPreviousLineBreakTags,
         .beginListItem:
        // Stray </pre> outside of preformatted mode: ignored.
        break

    case .preformattedStart(let depth):
        if depth <= 1 {
            blockState = .start
        } else {
            blockState = .preformattedStart(depth: depth - 1)
        }

    case .preformattedEmptyBlock(let depth):
        if depth <= 1 {
            blockState = .emptyBlock
        } else {
            blockState = .preformattedEmptyBlock(depth: depth - 1)
        }

    case .preformattedEmptyBlockWithLeadingWhitespace(let depth):
        if depth <= 1 {
            // Leaving <pre> entirely: pending whitespace is trailing, drop it.
            blockState = .emptyBlock
            temporaryBuffer = ""
        } else {
            // Drop one buffered character for the level being closed. The
            // buffer can be empty here (e.g. a skipped newline or a nested
            // <pre> directly after <li>), and removeLast() traps on an empty
            // string, so guard it.
            if !temporaryBuffer.isEmpty {
                temporaryBuffer.removeLast()
            }
            if temporaryBuffer.isEmpty {
                blockState = .preformattedEmptyBlock(depth: depth - 1)
            } else {
                blockState = .preformattedEmptyBlockWithLeadingWhitespace(depth: depth - 1)
            }
        }

    case .nonEmptyBlock,
         .emittedSpace,
         .lineBreakTag,
         .atLeastTwoLineBreakTags,
         .endListItem,
         .listItemContent,
         .emittedSpaceInListItemContent,
         .lineBreakTagInListItemContent,
         .atLeastTwoLineBreakTagsInListItemContent,
         .preformattedNonEmptyBlock,
         .preformattedLineBreak,
         .preformattedAtLeastTwoLineBreaks,
         .afterPreStartTag,
         .afterPreStartTagWithLeadingWhitespace,
         .preformattedNonEmptyBlockWithTrailingWhitespace:
        fatalError("unreachable")
    }
}
/// Advances the state machine for a `<li>` start tag and emits the separator
/// that belongs in front of the new item: a block break when the list
/// follows other content, a line break when it follows a previous item.
/// States not listed are left untouched.
mutating func startListItem() {
    switch blockState {
    case .start:
        blockState = .beginListItem

    case .emptyBlock, .nonEmptyBlock:
        blockState = .beginListItem
        append(blockBreak)

    case .beginListItem:
        break

    case .endListItem, .listItemContent:
        blockState = .beginListItem
        append(lineBreak)

    case .emittedSpaceInListItemContent:
        // Take back the trailing collapsed space of the previous item.
        blockState = .beginListItem
        removeChar()
        append(lineBreak)

    case .lineBreakTagInListItemContent, .atLeastTwoLineBreakTagsInListItemContent:
        // Buffered <br>s are emitted before the item separator.
        blockState = .beginListItem
        append(temporaryBuffer)
        temporaryBuffer = ""
        append(lineBreak)

    default:
        break
    }
}
/// Advances the state machine for a `</li>` end tag.
///
/// Trailing collapsed whitespace is removed and pending `<br>`s are
/// discarded. States not listed are left untouched.
mutating func endListItem() {
    switch blockState {
    case .beginListItem, .emptyBlock, .nonEmptyBlock, .listItemContent:
        // .beginListItem covers an empty item (`<li></li>`): without it the
        // machine would stay in "begin list item", so the next <li> would get
        // no line break and text after the list would be treated as item
        // content.
        blockState = .endListItem

    case .emittedSpaceInListItemContent:
        blockState = .endListItem
        removeChar()

    case .lineBreakTagInListItemContent, .atLeastTwoLineBreakTagsInListItemContent:
        blockState = .endListItem
        temporaryBuffer = ""

    default:
        break
    }
}
/// Called at the end of the input. If the last thing emitted was a collapsed
/// space, it is trailing whitespace and gets removed again.
mutating func endBlocks() {
    let endsWithEmittedSpace =
        blockState == .emittedSpace || blockState == .emittedSpaceInListItemContent
    if endsWithEmittedSpace {
        removeChar()
    }
}
}
/// States of the block/whitespace state machine implemented in the
/// `BlockRenderer` extension. The `depth` payload of the preformatted states
/// is the current `<pre>` nesting depth (1 for the outermost `<pre>`).
enum BlockState: Equatable {
    // MARK: Regular content

    /// Nothing has been emitted yet.
    case start
    /// At a block boundary; the block break is emitted once content follows.
    case emptyBlock
    /// Inside a block, last character was non-whitespace.
    case nonEmptyBlock
    /// Inside a block, a single collapsed space was just emitted.
    case emittedSpace
    /// One `<br>` is pending in the temporary buffer.
    case lineBreakTag
    /// Two or more `<br>`s are pending in the temporary buffer.
    case atLeastTwoLineBreakTags
    /// At a block boundary with two or more `<br>`s still pending.
    case emptyBlockWithAtLeastTwoPreviousLineBreakTags

    // MARK: List items

    /// A `<li>` was started and has no content yet.
    case beginListItem
    /// A `</li>` was seen.
    case endListItem
    /// Inside list item text, last character was non-whitespace.
    case listItemContent
    /// Inside list item text, a single collapsed space was just emitted.
    case emittedSpaceInListItemContent
    /// One `<br>` is pending inside list item text.
    case lineBreakTagInListItemContent
    /// Two or more `<br>`s are pending inside list item text.
    case atLeastTwoLineBreakTagsInListItemContent

    // MARK: Preformatted content

    /// Inside `<pre>` that was opened before anything was emitted.
    case preformattedStart(depth: Int)
    /// Inside `<pre>`, at a block boundary with nothing pending.
    case preformattedEmptyBlock(depth: Int)
    /// Inside `<pre>`, last character was non-whitespace.
    case preformattedNonEmptyBlock(depth: Int)
    /// Inside `<pre>`, one line break is pending.
    case preformattedLineBreak(depth: Int)
    /// Inside `<pre>`, two or more line breaks are pending.
    case preformattedAtLeastTwoLineBreaks(depth: Int)
    /// Directly after a `<pre>` start tag that followed other content.
    case afterPreStartTag(depth: Int)
    /// Directly after a `<pre>` start tag, with whitespace possibly pending.
    case afterPreStartTagWithLeadingWhitespace(depth: Int)
    /// Inside `<pre>`, whitespace after content is pending.
    case preformattedNonEmptyBlockWithTrailingWhitespace(depth: Int)
    /// Inside `<pre>`, at a block boundary with whitespace possibly pending.
    case preformattedEmptyBlockWithLeadingWhitespace(depth: Int)
}

View File

@ -8,7 +8,6 @@
import Foundation
public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
private let configuration: TextConverterConfiguration
private var tokenizer: Tokenizer<String.UnicodeScalarView.Iterator>!
@ -16,6 +15,7 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
private var actionStack: [ElementAction] = []
var blockState = BlockState.start
var temporaryBuffer: String = ""
private var currentElementIsEmpty = true
private var currentRun = ""
@ -32,6 +32,7 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
str = ""
blockState = .start
temporaryBuffer = ""
currentElementIsEmpty = true
currentRun = ""
@ -39,12 +40,16 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
switch token {
case .character(let scalar):
currentElementIsEmpty = false
continueBlock()
currentRun.unicodeScalars.append(scalar)
if continueBlock(char: scalar) {
currentRun.unicodeScalars.append(scalar)
}
case .characterSequence(let string):
currentElementIsEmpty = false
continueBlock()
currentRun.append(string)
for c in string.unicodeScalars {
if continueBlock(char: c) {
currentRun.unicodeScalars.append(c)
}
}
case .startTag(let name, let selfClosing, let attributes):
currentElementIsEmpty = true
let action = Callbacks.elementAction(name: name, attributes: attributes)
@ -66,6 +71,7 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
}
}
endBlocks()
finishRun()
return str
@ -74,13 +80,9 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
private mutating func handleStartTag(_ name: String, selfClosing: Bool, attributes: [Attribute]) {
switch name {
case "br":
if configuration.insertNewlines {
currentRun.append("\n")
} else {
currentRun.append(" ")
}
breakTag()
case "pre", "blockquote", "p", "ol", "ul":
startOrFinishBlock()
startOrEndBlock()
default:
break
}
@ -89,21 +91,45 @@ public struct TextConverter<Callbacks: HTMLConversionCallbacks>: BlockRenderer {
private mutating func handleEndTag(_ name: String) {
switch name {
case "pre", "blockquote", "p", "ol", "ul":
startOrFinishBlock()
startOrEndBlock()
finishRun()
default:
break
}
}
/// Text inserted between two consecutive blocks: an empty line when the
/// configuration asks for newlines, otherwise a single space.
var blockBreak: String {
    configuration.insertNewlines ? "\n\n" : " "
}
/// Text used for a single line break: a newline when the configuration asks
/// for newlines, otherwise a single space.
var lineBreak: String {
    return configuration.insertNewlines ? "\n" : " "
}
/// A fixed one-space string.
/// NOTE(review): judging by the name, this is presumably the indent placed
/// before content that sits inside a list but outside any list item; confirm
/// against the call sites.
var listIndentForContentOutsideItem: String {
    return " "
}
/// Adds `s` to the end of `currentRun`.
///
/// - Parameter s: The text to append.
mutating func append(_ s: String) {
    currentRun += s
}
/// Drops the last character that was written: from `currentRun` when it has
/// content, otherwise from `str`.
mutating func removeChar() {
    guard currentRun.isEmpty else {
        currentRun.removeLast()
        return
    }
    // `removeLast()` requires a non-empty collection, so callers must only
    // invoke this when something has already been written to `str`.
    str.removeLast()
}
private mutating func finishRun() {
if actionStack.contains(.skip) {
currentRun = ""

View File

@ -327,6 +327,15 @@ final class AttributedStringConverterTests: XCTestCase {
XCTAssertEqual(convert("a<ol><li>b</li><li>c</li></ol>"), result)
}
/// A `<li>` that is not inside a list is expected to convert to just its
/// text, with the default paragraph style.
func testListItemOutsideList() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a", attributes: plainAttributes)
    XCTAssertEqual(convert("<li>a</li>"), expected)
}
func testSkipElementActionFollowingUnfinishedRun() {
struct Callbacks: HTMLConversionCallbacks {
static func elementAction(name: String, attributes: [Attribute]) -> ElementAction {
@ -347,7 +356,7 @@ final class AttributedStringConverterTests: XCTestCase {
XCTAssertEqual(convert("</span>"), .init())
}
func testWTF() {
func testMultipleClosingBlockTagsBeforeOpeningBlockTag() {
let result = NSMutableAttributedString()
result.append(NSAttributedString(string: "a", attributes: [
.font: italicFont,
@ -362,4 +371,234 @@ final class AttributedStringConverterTests: XCTestCase {
XCTAssertEqual(convert(#"<blockquote><p>a</p></blockquote><p>b</p>"#), result)
}
/// A newline between `</p>` and `<p>`, or directly after an opening `<p>`,
/// is expected to leave only the two-newline block separator in the output.
func testNewlineBetweenClosingAndOpeningBlockTag() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a\n\nb", attributes: plainAttributes)
    // Newline between the two paragraphs.
    XCTAssertEqual(convert("<p>a</p>\n<p>b</p>"), expected)
    // Newline at the very start of the second paragraph.
    XCTAssertEqual(convert("<p>a</p><p>\nb</p>"), expected)
}
/// Newlines at the start or end of a paragraph's content are expected to be
/// dropped, while a run of newlines between characters becomes one space.
func testEndAfterNewlineInBlockContent() {
    // Builds the expected value in the regular font and default paragraph style.
    func plain(_ text: String) -> NSAttributedString {
        NSAttributedString(string: text, attributes: [
            .font: font,
            .paragraphStyle: NSParagraphStyle.default,
            .foregroundColor: color,
        ])
    }

    let trimmed = plain("a")
    XCTAssertEqual(convert("<p>a\n\n</p>"), trimmed)
    XCTAssertEqual(convert("<p>a\n\n</p>\n"), trimmed)
    XCTAssertEqual(convert("<p>\n\na</p>"), trimmed)
    XCTAssertEqual(convert("<p>\n\na</p>\n"), trimmed)

    let collapsed = plain("a b")
    XCTAssertEqual(convert("<p>a\n\n\nb</p>"), collapsed)
}
/// A `<br>` just before `</p>` is expected to add nothing beyond the block
/// break, whereas a `<br>` just after `<p>` adds one extra newline.
func testBRAtBlockElementBoundary() {
    // Builds the expected value in the regular font and default paragraph style.
    func plain(_ text: String) -> NSAttributedString {
        NSAttributedString(string: text, attributes: [
            .font: font,
            .paragraphStyle: NSParagraphStyle.default,
            .foregroundColor: color,
        ])
    }

    let twoNewlines = plain("a\n\nb")
    XCTAssertEqual(convert("<p>a<br></p><p>b</p>"), twoNewlines)

    let threeNewlines = plain("a\n\n\nb")
    XCTAssertEqual(convert("<p>a</p><p><br>b</p>"), threeNewlines)
}
/// A `<pre>` ending in `<br>` followed by a `<p>` is expected to yield the
/// monospaced text, then a two-newline break and the paragraph text in the
/// regular font.
func testPreFollowedByP() {
    let preformattedPart = NSAttributedString(string: "a", attributes: [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])
    let paragraphPart = NSAttributedString(string: "\n\nb", attributes: [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])

    let expected = NSMutableAttributedString()
    expected.append(preformattedPart)
    expected.append(paragraphPart)
    XCTAssertEqual(convert("<pre>a<br></pre><p>b</p>"), expected)
}
/// Two adjacent `<pre>` blocks are expected to be separated by two newlines,
/// all in the monospace font.
func testPreFollowedByPre() {
    let monospaceAttributes: [NSAttributedString.Key: Any] = [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a\n\nb", attributes: monospaceAttributes)
    XCTAssertEqual(convert("<pre>a</pre><pre>b</pre>"), expected)
}
/// A `<br>` just before `</pre>` is expected to add nothing beyond the block
/// break, whereas a `<br>` just after `<pre>` adds one extra newline.
func testBRAtPreBoundary() {
    // Builds the expected value in the monospace font and default paragraph style.
    func mono(_ text: String) -> NSAttributedString {
        NSAttributedString(string: text, attributes: [
            .font: monospaceFont,
            .paragraphStyle: NSParagraphStyle.default,
            .foregroundColor: color,
        ])
    }

    let twoNewlines = mono("a\n\nb")
    XCTAssertEqual(convert("<pre>a<br></pre><pre>b</pre>"), twoNewlines)

    let threeNewlines = mono("a\n\n\nb")
    XCTAssertEqual(convert("<pre>a</pre><pre><br>b</pre>"), threeNewlines)
}
/// Exercises `<pre>` nested inside `<pre>`: a directly nested pair adds no
/// break, text before the inner block is separated by two newlines (with or
/// without a trailing `<br>`), and a leading `<br>` in the inner block adds a
/// third newline.
func testNestedPre() {
    // Builds the expected value in the monospace font and default paragraph style.
    func mono(_ text: String) -> NSAttributedString {
        NSAttributedString(string: text, attributes: [
            .font: monospaceFont,
            .paragraphStyle: NSParagraphStyle.default,
            .foregroundColor: color,
        ])
    }

    let single = mono("a")
    XCTAssertEqual(convert("<pre><pre>a</pre></pre>"), single)

    let twoNewlines = mono("a\n\nb")
    XCTAssertEqual(convert("<pre>a<pre>b</pre></pre>"), twoNewlines)
    XCTAssertEqual(convert("<pre>a<br><pre>b</pre></pre>"), twoNewlines)

    let threeNewlines = mono("a\n\n\nb")
    XCTAssertEqual(convert("<pre>a<pre><br>b</pre></pre>"), threeNewlines)
}
/// A newline directly after the `<pre>` start tag is expected to be dropped,
/// both when the `<pre>` starts the document and when it follows other text.
func testIgnoreLeadingNewlineInPre() {
    let monospaceAttributes: [NSAttributedString.Key: Any] = [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let preOnly = NSAttributedString(string: "a", attributes: monospaceAttributes)
    XCTAssertEqual(convert("<pre>\na</pre>"), preOnly)

    // Regular-font text followed by a block break and the monospaced content.
    let leadingText = NSAttributedString(string: "a", attributes: [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])
    let preContent = NSAttributedString(string: "\n\nb", attributes: [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])
    let textThenPre = NSMutableAttributedString()
    textThenPre.append(leadingText)
    textThenPre.append(preContent)
    XCTAssertEqual(convert("a<pre>\nb</pre>"), textThenPre)
}
/// A `<pre>` that directly follows plain text is expected to be separated
/// from it by two newlines, which carry the monospace font.
func testPreFollowingChar() {
    let leadingText = NSAttributedString(string: "a", attributes: [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])
    let preContent = NSAttributedString(string: "\n\nb", attributes: [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ])

    let expected = NSMutableAttributedString()
    expected.append(leadingText)
    expected.append(preContent)
    XCTAssertEqual(convert("a<pre>b</pre>"), expected)
}
/// Whitespace before the first content and after the last content is expected
/// to be dropped, for bare text, `<p>` and `<pre>` alike.
func testSkipLeadingTrailingWhitespace() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let plainExpected = NSAttributedString(string: "a", attributes: plainAttributes)
    // Leading whitespace.
    XCTAssertEqual(convert(" \n\ta"), plainExpected)
    XCTAssertEqual(convert(" \n\t<p>a</p>"), plainExpected)
    // Trailing whitespace.
    XCTAssertEqual(convert("a \n\t"), plainExpected)
    XCTAssertEqual(convert("<p>a</p> \n\t"), plainExpected)

    let monospaceAttributes: [NSAttributedString.Key: Any] = [
        .font: monospaceFont,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let preExpected = NSAttributedString(string: "a", attributes: monospaceAttributes)
    XCTAssertEqual(convert(" \n\t<pre>a</pre>"), preExpected)
    XCTAssertEqual(convert("<pre>a</pre> \n\t"), preExpected)
}
/// A mixed run of space, tab and newline inside a paragraph is expected to
/// collapse to a single space.
func testWhitespaceCollapsing() {
    let plainAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: NSParagraphStyle.default,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "a b", attributes: plainAttributes)
    XCTAssertEqual(convert("<p>a \t\nb</p>"), expected)
}
/// A `<p>` wrapped inside each `<li>` is expected to add no extra breaks: the
/// items are separated by a single newline.
func testParagraphInsideListItem() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li><p>a</p></li><li><p>b</p></li></ol>"), expected)
}
/// A `<br>` placed between two list items is expected to add one extra
/// newline between them.
func testBreakBetweenListItems() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li>a</li><br><li>b</li></ol>"), expected)
}
/// Text sitting between two `<li>` elements is expected to land on its own
/// line, prefixed by two tabs; a trailing space after that text is dropped.
func testCharacterBetweenListItems() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\t\tc\n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li>a</li>c<li>b</li></ol>"), expected)
    // Same input with a trailing space after the stray text.
    XCTAssertEqual(convert("<ol><li>a</li>c <li>b</li></ol>"), expected)
}
/// Text with inner whitespace between two `<li>` elements is expected to keep
/// a single space between its words.
func testWhitespaceCollapsingInTextBetweenListItems() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\t\tc d\n\t2.\tb", attributes: listAttributes)
    // NOTE(review): the input below contains only a single space between "c"
    // and "d", so no collapsing is actually exercised; confirm whether a
    // longer whitespace run was intended.
    XCTAssertEqual(convert("<ol><li>a</li>c d<li>b</li></ol>"), expected)
}
/// List items without `</li>` end tags are expected to produce one line per
/// item, separated by a single newline.
func testImplicitlyClosedListItem() {
    let listAttributes: [NSAttributedString.Key: Any] = [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ]
    let expected = NSAttributedString(string: "\t1.\ta\n\t2.\tb", attributes: listAttributes)
    XCTAssertEqual(convert("<ol><li>a<li>b</ol>"), expected)
}
/// A `<pre>` inside a list item is expected to render the list marker in the
/// regular font and the content in the monospace font, both with the list
/// paragraph style and with no break between them.
func testPreInsideListItem() {
    let marker = NSAttributedString(string: "\t1.\t", attributes: [
        .font: font,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ])
    let preContent = NSAttributedString(string: "a", attributes: [
        .font: monospaceFont,
        .paragraphStyle: listParagraphStyle,
        .foregroundColor: color,
    ])

    let expected = NSMutableAttributedString()
    expected.append(marker)
    expected.append(preContent)
    XCTAssertEqual(convert("<ol><li><pre>a</pre></li></ol>"), expected)
}
}