diff --git a/BlockState.dot b/BlockState.dot
index ab9fea6..65ddf70 100644
--- a/BlockState.dot
+++ b/BlockState.dot
@@ -7,14 +7,12 @@ digraph blockstate {
start;
emptyBlock [label = "empty block"];
nonEmptyBlock [label = "non-empty block"];
- emittedSpace [label = "emitted space"];
lineBreakTag [label = "line break tag"];
atLeastTwoLineBreakTags [label = ">=2 line break tags"];
emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "empty block w/ >=2 prev line break tags"];
beginListItem [label = "begin list item"];
endListItem [label = "end list item"];
listItemContent [label = "list item content"];
- emittedSpaceInListItemContent [label = "emitted space in text in list item content"];
lineBreakTagInListItemContent [label = "line break tag in list item content"];
atLeastTwoLineBreakTagsInListItemContent [label = ">= 2 line break tags in list item content"];
preformattedStart [label = "preformatted start"];
@@ -32,27 +30,21 @@ digraph blockstate {
start -> nonEmptyBlock [label = "non-whitespace"];
start -> preformattedStart [label = "
(depth = 1)"];
start -> beginListItem [label = "
"];
- nonEmptyBlock -> nonEmptyBlock [label = "non-whitespace"];
+ nonEmptyBlock -> nonEmptyBlock [label = "non-newline"];
nonEmptyBlock -> emptyBlock [label = "start/end block"];
- nonEmptyBlock -> emittedSpace [label = "whitespace (emit space)"];
- nonEmptyBlock -> lineBreakTag [label = "
(append to tmp)"];
+ nonEmptyBlock -> lineBreakTag [label = "
or \\n (append to tmp)"];
nonEmptyBlock -> beginListItem [label = ""];
nonEmptyBlock -> endListItem [label = ""];
- emittedSpace -> nonEmptyBlock [label = "non-whitespace"];
- emittedSpace -> emittedSpace [label = "whitespace (skip)"];
- emittedSpace -> emptyBlock [label = "start/end block (remove 1)"];
- emittedSpace -> lineBreakTag [label = "
(append to tmp)"];
- emittedSpace -> end [label = "EOF (remove 1)"];
emptyBlock -> nonEmptyBlock [label = "non-whitespace (block break)"];
emptyBlock -> emptyBlock [label = "whitespace (skip)\n
\n\nstart/end block"];
emptyBlock -> afterPreStartTag [label = " (depth = 1)"];
emptyBlock -> beginListItem [label = "
"];
emptyBlock -> endListItem [label = ""];
- lineBreakTag -> lineBreakTag [label = "whitespace (skip)"];
- lineBreakTag -> atLeastTwoLineBreakTags [label = "
(append to tmp)"];
+ lineBreakTag -> lineBreakTag [label = "whitespace (append to tmp)"];
+ lineBreakTag -> atLeastTwoLineBreakTags [label = "
or \\n (append to tmp)"];
lineBreakTag -> emptyBlock [label = "start/end block (clear tmp)"];
lineBreakTag -> nonEmptyBlock [label = "non-whitespace (emit tmp)"];
- atLeastTwoLineBreakTags -> atLeastTwoLineBreakTags [label = "whitespace (skip)\n
(append to tmp)"];
+ atLeastTwoLineBreakTags -> atLeastTwoLineBreakTags [label = "whitespace or
(append to tmp)"];
atLeastTwoLineBreakTags -> nonEmptyBlock [label = "non-whitespace (emit tmp)"];
atLeastTwoLineBreakTags -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "start/end block"];
emptyBlockWithAtLeastTwoPreviousLineBreakTags -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "whitespace (skip)\n
\n\nstart/end block"];
@@ -70,23 +62,15 @@ digraph blockstate {
listItemContent -> listItemContent [label = "non-whitespace"];
listItemContent -> beginListItem [label = " (line break)"];
listItemContent -> lineBreakTagInListItemContent [label = "
(append to tmp)"];
- listItemContent -> emittedSpaceInListItemContent [label = "whitespace (emit space)"];
listItemContent -> emptyBlock [label = "start/end block"];
listItemContent -> endListItem [label = ""];
- emittedSpaceInListItemContent -> emittedSpaceInListItemContent [label = "whitespace (skip)"];
- emittedSpaceInListItemContent -> listItemContent [label = "non-whitespace"];
- emittedSpaceInListItemContent -> end [label = "EOF (remove 1)"];
- emittedSpaceInListItemContent -> emptyBlock [label = "start/end block (remove 1)"];
- emittedSpaceInListItemContent -> beginListItem [label = " (remove 1, line break)"];
- emittedSpaceInListItemContent -> lineBreakTagInListItemContent [label = "
(append to tmp)"];
- emittedSpaceInListItemContent -> endListItem [label = " (remove 1)"];
- lineBreakTagInListItemContent -> lineBreakTagInListItemContent [label = "whitespace (skip)"];
+ lineBreakTagInListItemContent -> lineBreakTagInListItemContent [label = "whitespace (append to tmp)"];
lineBreakTagInListItemContent -> emptyBlock [label = "start/end block (clear tmp)"];
lineBreakTagInListItemContent -> beginListItem [label = " (emit tmp, line break)"];
lineBreakTagInListItemContent -> listItemContent [label = "non-whitespace (emit tmp)"];
- lineBreakTagInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "
(append to tmp)"];
+ lineBreakTagInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "
or \\n (append to tmp)"];
lineBreakTagInListItemContent -> endListItem [label = " (clear tmp)"];
- atLeastTwoLineBreakTagsInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "
(append to tmp)\nwhitespace (skip)"];
+ atLeastTwoLineBreakTagsInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "whitespace or
(append to tmp)"];
atLeastTwoLineBreakTagsInListItemContent -> beginListItem [label = " (emit tmp, line break)"];
atLeastTwoLineBreakTagsInListItemContent -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "start/end block"];
atLeastTwoLineBreakTagsInListItemContent -> listItemContent [label = "non-whitespace (emit tmp)"];
diff --git a/Sources/HTMLStreamer/BlockState.swift b/Sources/HTMLStreamer/BlockState.swift
index 77711ed..76b51bf 100644
--- a/Sources/HTMLStreamer/BlockState.swift
+++ b/Sources/HTMLStreamer/BlockState.swift
@@ -38,9 +38,6 @@ extension BlockStateMachine {
break
case .nonEmptyBlock:
blockState = .emptyBlock
- case .emittedSpace:
- blockState = .emptyBlock
- removeChar()
case .lineBreakTag:
blockState = .emptyBlock
temporaryBuffer = ""
@@ -54,9 +51,6 @@ extension BlockStateMachine {
blockState = .emptyBlock
case .listItemContent:
blockState = .emptyBlock
- case .emittedSpaceInListItemContent:
- blockState = .emptyBlock
- removeChar()
case .lineBreakTagInListItemContent:
blockState = .emptyBlock
temporaryBuffer = ""
@@ -104,22 +98,19 @@ extension BlockStateMachine {
return true
}
case .nonEmptyBlock:
- if isWhitespace {
- blockState = .emittedSpace
- append(" ")
+ if isNewline {
+ blockState = .lineBreakTag
+ temporaryBuffer.append("\n")
return false
} else {
return true
}
- case .emittedSpace:
- if isWhitespace {
- return false
- } else {
- blockState = .nonEmptyBlock
- return true
- }
case .lineBreakTag:
if isWhitespace {
+ if isNewline {
+ blockState = .atLeastTwoLineBreakTags
+ }
+ temporaryBuffer.unicodeScalars.append(char)
return false
} else {
blockState = .nonEmptyBlock
@@ -129,6 +120,7 @@ extension BlockStateMachine {
}
case .atLeastTwoLineBreakTags:
if isWhitespace {
+ temporaryBuffer.unicodeScalars.append(char)
return false
} else {
blockState = .nonEmptyBlock
@@ -162,22 +154,19 @@ extension BlockStateMachine {
return true
}
case .listItemContent:
- if isWhitespace {
- blockState = .emittedSpaceInListItemContent
- append(" ")
+ if isNewline {
+ blockState = .lineBreakTagInListItemContent
+ temporaryBuffer.append("\n")
return false
} else {
return true
}
- case .emittedSpaceInListItemContent:
- if isWhitespace {
- return false
- } else {
- blockState = .listItemContent
- return true
- }
case .lineBreakTagInListItemContent:
if isWhitespace {
+ if isNewline {
+ blockState = .atLeastTwoLineBreakTagsInListItemContent
+ }
+ temporaryBuffer.unicodeScalars.append(char)
return false
} else {
blockState = .listItemContent
@@ -187,6 +176,7 @@ extension BlockStateMachine {
}
case .atLeastTwoLineBreakTagsInListItemContent:
if isWhitespace {
+ temporaryBuffer.unicodeScalars.append(char)
return false
} else {
blockState = .listItemContent
@@ -311,9 +301,6 @@ extension BlockStateMachine {
case .nonEmptyBlock:
blockState = .lineBreakTag
temporaryBuffer.append(lineBreak)
- case .emittedSpace:
- blockState = .lineBreakTag
- temporaryBuffer.append(lineBreak)
case .lineBreakTag:
blockState = .atLeastTwoLineBreakTags
temporaryBuffer.append(lineBreak)
@@ -329,9 +316,6 @@ extension BlockStateMachine {
case .listItemContent:
blockState = .lineBreakTagInListItemContent
temporaryBuffer.append(lineBreak)
- case .emittedSpaceInListItemContent:
- blockState = .lineBreakTagInListItemContent
- temporaryBuffer.append(lineBreak)
case .lineBreakTagInListItemContent:
blockState = .atLeastTwoLineBreakTagsInListItemContent
temporaryBuffer.append(lineBreak)
@@ -374,8 +358,6 @@ extension BlockStateMachine {
blockState = .afterPreStartTag(depth: 1)
case .nonEmptyBlock:
fatalError("unreachable")
- case .emittedSpace:
- fatalError("unreachable")
case .lineBreakTag:
fatalError("unreachable")
case .atLeastTwoLineBreakTags:
@@ -388,8 +370,6 @@ extension BlockStateMachine {
fatalError("unreachable")
case .listItemContent:
fatalError("unreachable")
- case .emittedSpaceInListItemContent:
- fatalError("unreachable")
case .lineBreakTagInListItemContent:
fatalError("unreachable")
case .atLeastTwoLineBreakTagsInListItemContent:
@@ -423,8 +403,6 @@ extension BlockStateMachine {
break
case .nonEmptyBlock:
fatalError("unreachable")
- case .emittedSpace:
- fatalError("unreachable")
case .lineBreakTag:
fatalError("unreachable")
case .atLeastTwoLineBreakTags:
@@ -437,8 +415,6 @@ extension BlockStateMachine {
fatalError("unreachable")
case .listItemContent:
fatalError("unreachable")
- case .emittedSpaceInListItemContent:
- fatalError("unreachable")
case .lineBreakTagInListItemContent:
fatalError("unreachable")
case .atLeastTwoLineBreakTagsInListItemContent:
@@ -501,10 +477,6 @@ extension BlockStateMachine {
case .listItemContent:
blockState = .beginListItem
append(lineBreak)
- case .emittedSpaceInListItemContent:
- blockState = .beginListItem
- removeChar()
- append(lineBreak)
case .lineBreakTagInListItemContent:
blockState = .beginListItem
append(temporaryBuffer)
@@ -528,9 +500,6 @@ extension BlockStateMachine {
blockState = .endListItem
case .listItemContent:
blockState = .endListItem
- case .emittedSpaceInListItemContent:
- blockState = .endListItem
- removeChar()
case .lineBreakTagInListItemContent:
blockState = .endListItem
temporaryBuffer = ""
@@ -544,10 +513,6 @@ extension BlockStateMachine {
mutating func endBlocks() {
switch blockState {
- case .emittedSpace:
- removeChar()
- case .emittedSpaceInListItemContent:
- removeChar()
default:
break
}
@@ -558,14 +523,12 @@ enum BlockState: Equatable {
case start
case emptyBlock
case nonEmptyBlock
- case emittedSpace
case lineBreakTag
case atLeastTwoLineBreakTags
case emptyBlockWithAtLeastTwoPreviousLineBreakTags
case beginListItem
case endListItem
case listItemContent
- case emittedSpaceInListItemContent
case lineBreakTagInListItemContent
case atLeastTwoLineBreakTagsInListItemContent
case preformattedStart(depth: Int32)
@@ -583,5 +546,5 @@ enum BlockState: Equatable {
private func isWhitespace(_ c: UnicodeScalar) -> Bool {
// this is not strictly correct, but checking the actual unicode properties is slow
// and this should cover the vast majority of actual use
- c == " " || c == "\n" || c == "\t" || c == "\u{A0}" /* NO-BREAK SPACE */
+ c == " " || c == "\n" || c == "\t"
}
diff --git a/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift b/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift
index ed2e79e..9a5eb3d 100644
--- a/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift
+++ b/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift
@@ -50,7 +50,7 @@ final class AttributedStringConverterTests: XCTestCase {
color: color,
paragraphStyle: .default
)
- var converter = AttributedStringConverter(configuration: config)
+ let converter = AttributedStringConverter(configuration: config)
return converter.convert(html: html)
}
@@ -391,7 +391,7 @@ final class AttributedStringConverterTests: XCTestCase {
XCTAssertEqual(convert("a\n\n
\n"), result)
XCTAssertEqual(convert("\n\na
"), result)
XCTAssertEqual(convert("\n\na
\n"), result)
- let result2 = NSAttributedString(string: "a b", attributes: [
+ let result2 = NSAttributedString(string: "a\n\n\nb", attributes: [
.font: font,
.paragraphStyle: NSParagraphStyle.default,
.foregroundColor: color,
@@ -519,8 +519,14 @@ final class AttributedStringConverterTests: XCTestCase {
])
XCTAssertEqual(convert(" \n\ta"), result)
XCTAssertEqual(convert(" \n\ta
"), result)
- XCTAssertEqual(convert("a \n\t"), result)
+ XCTAssertEqual(convert("a\n\t"), result)
XCTAssertEqual(convert("a
\n\t"), result)
+ let result2 = NSAttributedString(string: "a ", attributes: [
+ .font: font,
+ .paragraphStyle: NSParagraphStyle.default,
+ .foregroundColor: color,
+ ])
+ XCTAssertEqual(convert("a \n\t"), result2)
let pre = NSAttributedString(string: "a", attributes: [
.font: monospaceFont,
@@ -531,8 +537,8 @@ final class AttributedStringConverterTests: XCTestCase {
XCTAssertEqual(convert("a
\n\t"), pre)
}
- func testWhitespaceCollapsing() {
- let result = NSAttributedString(string: "a b", attributes: [
+ func testDoesNotCollapseWhitespace() {
+ let result = NSAttributedString(string: "a \t\nb", attributes: [
.font: font,
.paragraphStyle: NSParagraphStyle.default,
.foregroundColor: color,
@@ -565,11 +571,16 @@ final class AttributedStringConverterTests: XCTestCase {
.foregroundColor: color,
])
XCTAssertEqual(convert("- a
c- b
"), result)
- XCTAssertEqual(convert("- a
c - b
"), result)
+ let result2 = NSAttributedString(string: "\t1.\ta\n\t\tc \n\t2.\tb", attributes: [
+ .font: font,
+ .paragraphStyle: listParagraphStyle,
+ .foregroundColor: color,
+ ])
+ XCTAssertEqual(convert("- a
c - b
"), result2)
}
func testWhitespaceCollapsingInTextBetweenListItems() {
- let result = NSAttributedString(string: "\t1.\ta\n\t\tc d\n\t2.\tb", attributes: [
+ let result = NSAttributedString(string: "\t1.\ta\n\t\tc d\n\t2.\tb", attributes: [
.font: font,
.paragraphStyle: listParagraphStyle,
.foregroundColor: color,