Gemini/GeminiFormat/GeminiParser.swift

//
//  GeminiParser.swift
//  GeminiFormat
//
//  Created by Shadowfacts on 7/12/20.
//

import Foundation

/// Namespace for parsing Gemini-formatted text into a `Document`.
///
/// The initializer is private, so the type is only used through its static `parse(text:baseURL:)` method.
public struct GeminiParser {

    private init() {}

    /// Parses Gemini-formatted text line by line into a `Document`.
    ///
    /// Each input line is classified in this order: preformatting toggle (three leading backticks),
    /// preformatted text (any line while a preformatting block is open), link (`=>`), heading (`#`),
    /// unordered list item (`* `), quote (`>`), and otherwise plain text.
    ///
    /// Link lines whose target cannot be turned into a `URL` are emitted as plain text lines containing
    /// the link label (if any) and the raw target string.
    ///
    /// - Parameters:
    ///   - text: The raw document text.
    ///   - baseURL: The URL the document was loaded from. It is stored on the returned document and used
    ///     to resolve relative and protocol-relative link targets.
    /// - Returns: A `Document` whose `lines` hold one entry per line of `text`.
    public static func parse(text: String, baseURL: URL) -> Document {
        var doc = Document(url: baseURL)

        var inPreformattingBlock = false
        text.enumerateLines { (line, _) in
            if line.starts(with: "```") {
                if inPreformattingBlock {
                    inPreformattingBlock = false
                    // todo: should the toggle off line be a separate line type?
                    doc.lines.append(.preformattedToggle(alt: nil))
                } else {
                    // Anything following the three backticks on the opening line is the alt text.
                    let remainder = line.dropFirst(3)
                    let alt: String? = remainder.isEmpty ? nil : String(remainder)
                    inPreformattingBlock = true
                    doc.lines.append(.preformattedToggle(alt: alt))
                }
            } else if inPreformattingBlock {
                doc.lines.append(.preformattedText(line))
            } else if line.starts(with: "=>") {
                // Link line: "=>" [whitespace] URL [whitespace label]
                let urlStart = line.firstNonWhitespaceIndex(after: line.index(line.startIndex, offsetBy: 2))
                let urlEnd = line.firstWhitespaceIndex(after: urlStart)
                let labelStart = line.firstNonWhitespaceIndex(after: urlEnd)

                var urlString = String(line[urlStart..<urlEnd])
                // URL(string:relativeTo:) does not handle // meaning the same protocol as the base URL.
                // If the base URL has no scheme there is nothing to prepend, so the string is left as-is
                // rather than crashing on a force unwrap.
                if urlString.hasPrefix("//"), let scheme = baseURL.scheme {
                    urlString = scheme + ":" + urlString
                }

                let label: String? = labelStart < line.endIndex ? String(line[labelStart...]) : nil

                if let url = GeminiParser.resolveLinkURL(urlString, relativeTo: baseURL) {
                    doc.lines.append(.link(url, text: label))
                } else {
                    // The target is not a usable URL; fall back to showing the line as plain text.
                    let str: String
                    if let label = label {
                        // todo: localize me?
                        str = "\(label): \(urlString)"
                    } else {
                        str = urlString
                    }
                    doc.lines.append(.text(str))
                }
            } else if line.starts(with: "#") {
                let level: Document.HeadingLevel
                if line.starts(with: "###") {
                    level = .h3
                } else if line.starts(with: "##") {
                    level = .h2
                } else {
                    level = .h1
                }
                // The level's raw value is used as the number of leading characters to skip.
                let headingStart = line.firstNonWhitespaceIndex(after: line.index(line.startIndex, offsetBy: level.rawValue))
                let headingText = String(line[headingStart...])
                doc.lines.append(.heading(headingText, level: level))
            } else if line.starts(with: "* ") {
                let listItemStart = line.firstNonWhitespaceIndex(after: line.index(line.startIndex, offsetBy: 2))
                let listItemText = String(line[listItemStart...])
                doc.lines.append(.unorderedListItem(listItemText))
            } else if line.starts(with: ">") {
                let quoteStartIndex = line.firstNonWhitespaceIndex(after: line.index(after: line.startIndex))
                let quoteText = String(line[quoteStartIndex...])
                doc.lines.append(.quote(quoteText))
            } else {
                doc.lines.append(.text(line))
            }
        }

        return doc
    }

    /// Resolves a link target against `baseURL`, returning the absolute URL or `nil` if it cannot be parsed.
    ///
    /// The string is first tried verbatim. Only if that fails is it percent-escaped and tried again.
    private static func resolveLinkURL(_ urlString: String, relativeTo baseURL: URL) -> URL? {
        if let url = URL(string: urlString, relativeTo: baseURL)?.absoluteURL {
            return url
        }
        // if URL initialization fails because there are unescaped chars in the doc, escape everything and try again.
        // I'm not certain, but it feels unsafe to always do this escaping
        guard let escaped = urlString.addingPercentEncoding(withAllowedCharacters: .URLAllowed) else {
            return nil
        }
        return URL(string: escaped, relativeTo: baseURL)?.absoluteURL
    }

}

fileprivate extension String {
    /// Returns the index of the first character at or after `index` that does not begin with a whitespace
    /// scalar, or `endIndex` if every remaining character does.
    ///
    /// Despite the `after` label, the character at `index` itself is the first one examined.
    func firstNonWhitespaceIndex(after index: String.Index) -> String.Index {
        return self[index...].firstIndex { !String.beginsWithWhitespace($0) } ?? endIndex
    }

    /// Returns the index of the first character at or after `index` that begins with a whitespace scalar,
    /// or `endIndex` if no remaining character does.
    ///
    /// Despite the `after` label, the character at `index` itself is the first one examined.
    func firstWhitespaceIndex(after index: String.Index) -> String.Index {
        return self[index...].firstIndex(where: String.beginsWithWhitespace) ?? endIndex
    }

    /// Whether the leading unicode scalar of `character` is a member of `CharacterSet.whitespaces`.
    ///
    /// Only the first scalar is inspected; this should be fine, since all whitespace characters are single scalars.
    // todo: could the first unicode scalar of a character be whitespace even though the whole character is not?
    private static func beginsWithWhitespace(_ character: Character) -> Bool {
        guard let leadingScalar = character.unicodeScalars.first else {
            // A Character always contains at least one scalar, so this is not expected to be reached.
            return false
        }
        return CharacterSet.whitespaces.contains(leadingScalar)
    }
}

private extension CharacterSet {
    /// Characters left untouched when percent-escaping a link target: ASCII letters and digits plus the
    /// punctuation listed in the literal below. `%` is included, so existing percent escapes in the
    /// input are not escaped a second time.
    static let URLAllowed: CharacterSet = {
        let permitted = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%"
        return CharacterSet(charactersIn: permitted)
    }()
}