diff --git a/BlockState.dot b/BlockState.dot index ab9fea6..65ddf70 100644 --- a/BlockState.dot +++ b/BlockState.dot @@ -7,14 +7,12 @@ digraph blockstate { start; emptyBlock [label = "empty block"]; nonEmptyBlock [label = "non-empty block"]; - emittedSpace [label = "emitted space"]; lineBreakTag [label = "line break tag"]; atLeastTwoLineBreakTags [label = ">=2 line break tags"]; emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "empty block w/ >=2 prev line break tags"]; beginListItem [label = "begin list item"]; endListItem [label = "end list item"]; listItemContent [label = "list item content"]; - emittedSpaceInListItemContent [label = "emitted space in text in list item content"]; lineBreakTagInListItemContent [label = "line break tag in list item content"]; atLeastTwoLineBreakTagsInListItemContent [label = ">= 2 line break tags in list item content"]; preformattedStart [label = "preformatted start"]; @@ -32,27 +30,21 @@ digraph blockstate { start -> nonEmptyBlock [label = "non-whitespace"]; start -> preformattedStart [label = "
 (depth = 1)"];
 	start -> beginListItem [label = "
  • "]; - nonEmptyBlock -> nonEmptyBlock [label = "non-whitespace"]; + nonEmptyBlock -> nonEmptyBlock [label = "non-newline"]; nonEmptyBlock -> emptyBlock [label = "start/end block"]; - nonEmptyBlock -> emittedSpace [label = "whitespace (emit space)"]; - nonEmptyBlock -> lineBreakTag [label = "
    (append to tmp)"]; + nonEmptyBlock -> lineBreakTag [label = "
    or \\n (append to tmp)"]; nonEmptyBlock -> beginListItem [label = "
  • "]; nonEmptyBlock -> endListItem [label = "
  • "]; - emittedSpace -> nonEmptyBlock [label = "non-whitespace"]; - emittedSpace -> emittedSpace [label = "whitespace (skip)"]; - emittedSpace -> emptyBlock [label = "start/end block (remove 1)"]; - emittedSpace -> lineBreakTag [label = "
    (append to tmp)"]; - emittedSpace -> end [label = "EOF (remove 1)"]; emptyBlock -> nonEmptyBlock [label = "non-whitespace (block break)"]; emptyBlock -> emptyBlock [label = "whitespace (skip)\n
    \n
    \nstart/end block"]; emptyBlock -> afterPreStartTag [label = "
     (depth = 1)"];
     	emptyBlock -> beginListItem [label = "
  • "]; emptyBlock -> endListItem [label = "
  • "]; - lineBreakTag -> lineBreakTag [label = "whitespace (skip)"]; - lineBreakTag -> atLeastTwoLineBreakTags [label = "
    (append to tmp)"]; + lineBreakTag -> lineBreakTag [label = "whitespace (append to tmp)"]; + lineBreakTag -> atLeastTwoLineBreakTags [label = "
    or \\n (append to tmp)"]; lineBreakTag -> emptyBlock [label = "start/end block (clear tmp)"]; lineBreakTag -> nonEmptyBlock [label = "non-whitespace (emit tmp)"]; - atLeastTwoLineBreakTags -> atLeastTwoLineBreakTags [label = "whitespace (skip)\n
    (append to tmp)"]; + atLeastTwoLineBreakTags -> atLeastTwoLineBreakTags [label = "whitespace or
    (append to tmp)"]; atLeastTwoLineBreakTags -> nonEmptyBlock [label = "non-whitespace (emit tmp)"]; atLeastTwoLineBreakTags -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "start/end block"]; emptyBlockWithAtLeastTwoPreviousLineBreakTags -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "whitespace (skip)\n
    \n
    \nstart/end block"]; @@ -70,23 +62,15 @@ digraph blockstate { listItemContent -> listItemContent [label = "non-whitespace"]; listItemContent -> beginListItem [label = "
  • (line break)"]; listItemContent -> lineBreakTagInListItemContent [label = "
    (append to tmp)"]; - listItemContent -> emittedSpaceInListItemContent [label = "whitespace (emit space)"]; listItemContent -> emptyBlock [label = "start/end block"]; listItemContent -> endListItem [label = "
  • "]; - emittedSpaceInListItemContent -> emittedSpaceInListItemContent [label = "whitespace (skip)"]; - emittedSpaceInListItemContent -> listItemContent [label = "non-whitespace"]; - emittedSpaceInListItemContent -> end [label = "EOF (remove 1)"]; - emittedSpaceInListItemContent -> emptyBlock [label = "start/end block (remove 1)"]; - emittedSpaceInListItemContent -> beginListItem [label = "
  • (remove 1, line break)"]; - emittedSpaceInListItemContent -> lineBreakTagInListItemContent [label = "
    (append to tmp)"]; - emittedSpaceInListItemContent -> endListItem [label = "
  • (remove 1)"]; - lineBreakTagInListItemContent -> lineBreakTagInListItemContent [label = "whitespace (skip)"]; + lineBreakTagInListItemContent -> lineBreakTagInListItemContent [label = "whitespace (append to tmp)"]; lineBreakTagInListItemContent -> emptyBlock [label = "start/end block (clear tmp)"]; lineBreakTagInListItemContent -> beginListItem [label = "
  • (emit tmp, line break)"]; lineBreakTagInListItemContent -> listItemContent [label = "non-whitespace (emit tmp)"]; - lineBreakTagInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "
    (append to tmp)"]; + lineBreakTagInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "
    or \\n (append to tmp)"]; lineBreakTagInListItemContent -> endListItem [label = "
  • (clear tmp)"]; - atLeastTwoLineBreakTagsInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "
    (append to tmp)\nwhitespace (skip)"]; + atLeastTwoLineBreakTagsInListItemContent -> atLeastTwoLineBreakTagsInListItemContent [label = "whitespace or
    (append to tmp)"]; atLeastTwoLineBreakTagsInListItemContent -> beginListItem [label = "
  • (emit tmp, line break)"]; atLeastTwoLineBreakTagsInListItemContent -> emptyBlockWithAtLeastTwoPreviousLineBreakTags [label = "start/end block"]; atLeastTwoLineBreakTagsInListItemContent -> listItemContent [label = "non-whitespace (emit tmp)"]; diff --git a/Sources/HTMLStreamer/BlockState.swift b/Sources/HTMLStreamer/BlockState.swift index 77711ed..76b51bf 100644 --- a/Sources/HTMLStreamer/BlockState.swift +++ b/Sources/HTMLStreamer/BlockState.swift @@ -38,9 +38,6 @@ extension BlockStateMachine { break case .nonEmptyBlock: blockState = .emptyBlock - case .emittedSpace: - blockState = .emptyBlock - removeChar() case .lineBreakTag: blockState = .emptyBlock temporaryBuffer = "" @@ -54,9 +51,6 @@ extension BlockStateMachine { blockState = .emptyBlock case .listItemContent: blockState = .emptyBlock - case .emittedSpaceInListItemContent: - blockState = .emptyBlock - removeChar() case .lineBreakTagInListItemContent: blockState = .emptyBlock temporaryBuffer = "" @@ -104,22 +98,19 @@ extension BlockStateMachine { return true } case .nonEmptyBlock: - if isWhitespace { - blockState = .emittedSpace - append(" ") + if isNewline { + blockState = .lineBreakTag + temporaryBuffer.append("\n") return false } else { return true } - case .emittedSpace: - if isWhitespace { - return false - } else { - blockState = .nonEmptyBlock - return true - } case .lineBreakTag: if isWhitespace { + if isNewline { + blockState = .atLeastTwoLineBreakTags + } + temporaryBuffer.unicodeScalars.append(char) return false } else { blockState = .nonEmptyBlock @@ -129,6 +120,7 @@ extension BlockStateMachine { } case .atLeastTwoLineBreakTags: if isWhitespace { + temporaryBuffer.unicodeScalars.append(char) return false } else { blockState = .nonEmptyBlock @@ -162,22 +154,19 @@ extension BlockStateMachine { return true } case .listItemContent: - if isWhitespace { - blockState = .emittedSpaceInListItemContent - append(" ") + if isNewline { + blockState = .lineBreakTagInListItemContent + temporaryBuffer.append("\n") return false } else { return true } - case .emittedSpaceInListItemContent: - if isWhitespace { - return false - } else { - blockState = .listItemContent - return true - } case .lineBreakTagInListItemContent: if isWhitespace { + if isNewline { + blockState = .atLeastTwoLineBreakTagsInListItemContent + } + temporaryBuffer.unicodeScalars.append(char) return false } else { blockState = .listItemContent @@ -187,6 +176,7 @@ extension BlockStateMachine { } case .atLeastTwoLineBreakTagsInListItemContent: if isWhitespace { + temporaryBuffer.unicodeScalars.append(char) return false } else { blockState = .listItemContent @@ -311,9 +301,6 @@ extension BlockStateMachine { case .nonEmptyBlock: blockState = .lineBreakTag temporaryBuffer.append(lineBreak) - case .emittedSpace: - blockState = .lineBreakTag - temporaryBuffer.append(lineBreak) case .lineBreakTag: blockState = .atLeastTwoLineBreakTags temporaryBuffer.append(lineBreak) @@ -329,9 +316,6 @@ extension BlockStateMachine { case .listItemContent: blockState = .lineBreakTagInListItemContent temporaryBuffer.append(lineBreak) - case .emittedSpaceInListItemContent: - blockState = .lineBreakTagInListItemContent - temporaryBuffer.append(lineBreak) case .lineBreakTagInListItemContent: blockState = .atLeastTwoLineBreakTagsInListItemContent temporaryBuffer.append(lineBreak) @@ -374,8 +358,6 @@ extension BlockStateMachine { blockState = .afterPreStartTag(depth: 1) case .nonEmptyBlock: fatalError("unreachable") - case .emittedSpace: - fatalError("unreachable") case .lineBreakTag: fatalError("unreachable") case .atLeastTwoLineBreakTags: @@ -388,8 +370,6 @@ extension BlockStateMachine { fatalError("unreachable") case .listItemContent: fatalError("unreachable") - case .emittedSpaceInListItemContent: - fatalError("unreachable") case .lineBreakTagInListItemContent: fatalError("unreachable") case .atLeastTwoLineBreakTagsInListItemContent: @@ -423,8 +403,6 @@ extension BlockStateMachine { break case .nonEmptyBlock: fatalError("unreachable") - case .emittedSpace: - fatalError("unreachable") case .lineBreakTag: fatalError("unreachable") case .atLeastTwoLineBreakTags: @@ -437,8 +415,6 @@ extension BlockStateMachine { fatalError("unreachable") case .listItemContent: fatalError("unreachable") - case .emittedSpaceInListItemContent: - fatalError("unreachable") case .lineBreakTagInListItemContent: fatalError("unreachable") case .atLeastTwoLineBreakTagsInListItemContent: @@ -501,10 +477,6 @@ extension BlockStateMachine { case .listItemContent: blockState = .beginListItem append(lineBreak) - case .emittedSpaceInListItemContent: - blockState = .beginListItem - removeChar() - append(lineBreak) case .lineBreakTagInListItemContent: blockState = .beginListItem append(temporaryBuffer) @@ -528,9 +500,6 @@ extension BlockStateMachine { blockState = .endListItem case .listItemContent: blockState = .endListItem - case .emittedSpaceInListItemContent: - blockState = .endListItem - removeChar() case .lineBreakTagInListItemContent: blockState = .endListItem temporaryBuffer = "" @@ -544,10 +513,6 @@ extension BlockStateMachine { mutating func endBlocks() { switch blockState { - case .emittedSpace: - removeChar() - case .emittedSpaceInListItemContent: - removeChar() default: break } @@ -558,14 +523,12 @@ enum BlockState: Equatable { case start case emptyBlock case nonEmptyBlock - case emittedSpace case lineBreakTag case atLeastTwoLineBreakTags case emptyBlockWithAtLeastTwoPreviousLineBreakTags case beginListItem case endListItem case listItemContent - case emittedSpaceInListItemContent case lineBreakTagInListItemContent case atLeastTwoLineBreakTagsInListItemContent case preformattedStart(depth: Int32) @@ -583,5 +546,5 @@ enum BlockState: Equatable { private func isWhitespace(_ c: UnicodeScalar) -> Bool { // this is not strictly correct, but checking the actual unicode properties is slow // and this should cover the vast majority of actual use - c == " " || c == "\n" || c == "\t" || c == "\u{A0}" /* NO-BREAK SPACE */ + c == " " || c == "\n" || c == "\t" } diff --git a/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift b/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift index ed2e79e..9a5eb3d 100644 --- a/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift +++ b/Tests/HTMLStreamerTests/AttributedStringConverterTests.swift @@ -50,7 +50,7 @@ final class AttributedStringConverterTests: XCTestCase { color: color, paragraphStyle: .default ) - var converter = AttributedStringConverter(configuration: config) + let converter = AttributedStringConverter(configuration: config) return converter.convert(html: html) } @@ -391,7 +391,7 @@ final class AttributedStringConverterTests: XCTestCase { XCTAssertEqual(convert("

    a\n\n

    \n"), result) XCTAssertEqual(convert("

    \n\na

    "), result) XCTAssertEqual(convert("

    \n\na

    \n"), result) - let result2 = NSAttributedString(string: "a b", attributes: [ + let result2 = NSAttributedString(string: "a\n\n\nb", attributes: [ .font: font, .paragraphStyle: NSParagraphStyle.default, .foregroundColor: color, @@ -519,8 +519,14 @@ final class AttributedStringConverterTests: XCTestCase { ]) XCTAssertEqual(convert(" \n\ta"), result) XCTAssertEqual(convert(" \n\t

    a

    "), result) - XCTAssertEqual(convert("a \n\t"), result) + XCTAssertEqual(convert("a\n\t"), result) XCTAssertEqual(convert("

    a

    \n\t"), result) + let result2 = NSAttributedString(string: "a ", attributes: [ + .font: font, + .paragraphStyle: NSParagraphStyle.default, + .foregroundColor: color, + ]) + XCTAssertEqual(convert("a \n\t"), result2) let pre = NSAttributedString(string: "a", attributes: [ .font: monospaceFont, @@ -531,8 +537,8 @@ final class AttributedStringConverterTests: XCTestCase { XCTAssertEqual(convert("
    a
    \n\t"), pre) } - func testWhitespaceCollapsing() { - let result = NSAttributedString(string: "a b", attributes: [ + func testDoesNotCollapseWhitespace() { + let result = NSAttributedString(string: "a \t\nb", attributes: [ .font: font, .paragraphStyle: NSParagraphStyle.default, .foregroundColor: color, @@ -565,11 +571,16 @@ final class AttributedStringConverterTests: XCTestCase { .foregroundColor: color, ]) XCTAssertEqual(convert("
    1. a
    2. c
    3. b
    "), result) - XCTAssertEqual(convert("
    1. a
    2. c
    3. b
    "), result) + let result2 = NSAttributedString(string: "\t1.\ta\n\t\tc \n\t2.\tb", attributes: [ + .font: font, + .paragraphStyle: listParagraphStyle, + .foregroundColor: color, + ]) + XCTAssertEqual(convert("
    1. a
    2. c
    3. b
    "), result2) } func testWhitespaceCollapsingInTextBetweenListItems() { - let result = NSAttributedString(string: "\t1.\ta\n\t\tc d\n\t2.\tb", attributes: [ + let result = NSAttributedString(string: "\t1.\ta\n\t\tc d\n\t2.\tb", attributes: [ .font: font, .paragraphStyle: listParagraphStyle, .foregroundColor: color,