Gemini/GeminiFormat/GeminiParser.swift

125 lines
5.3 KiB
Swift

//
// GeminiParser.swift
// GeminiFormat
//
// Created by Shadowfacts on 7/12/20.
//
import Foundation
/// Namespace for parsing Gemini-formatted text into a `Document`.
///
/// The type cannot be instantiated (its initializer is private); call the static
/// `parse(text:baseURL:)` function instead.
public struct GeminiParser {
    private init() {}

    /// Parses `text` one line at a time and returns the resulting document.
    ///
    /// Each line is classified by its prefix, checked in this order:
    /// 1. "```" toggles preformatting mode on or off. Any text after the marker on the
    ///    opening line is recorded as the alt text; the closing line always carries `nil`.
    /// 2. While preformatting mode is on, every other line is stored verbatim.
    /// 3. "=>" is a link line: the first whitespace-delimited token is the URL and the
    ///    remainder (if any) is the link text.
    /// 4. "#", "##" and "###" are headings of level 1, 2 and 3.
    /// 5. "* " is an unordered list item.
    /// 6. ">" is a quote.
    /// 7. Anything else is a plain text line.
    ///
    /// - Parameters:
    ///   - text: The raw document source.
    ///   - baseURL: The URL the document was loaded from. It becomes the document's URL and
    ///     is used to resolve relative link targets.
    /// - Returns: A `Document` whose `lines` hold one entry per input line.
    public static func parse(text: String, baseURL: URL) -> Document {
        var doc = Document(url: baseURL)
        var inPreformattingBlock = false

        text.enumerateLines { (line, _) in
            if line.starts(with: "```") {
                if inPreformattingBlock {
                    inPreformattingBlock = false
                    // todo: should the toggle off line be a separate line type?
                    doc.lines.append(.preformattedToggle(alt: nil))
                } else {
                    // Whatever follows the three backticks is the alt text; nothing means no alt text.
                    let remainder = line.dropFirst(3)
                    let alt: String? = remainder.isEmpty ? nil : String(remainder)
                    inPreformattingBlock = true
                    doc.lines.append(.preformattedToggle(alt: alt))
                }
            } else if inPreformattingBlock {
                doc.lines.append(.preformattedText(line))
            } else if line.starts(with: "=>") {
                // Link line: "=>" [whitespace] URL [whitespace link-text]
                let markerEnd = line.index(line.startIndex, offsetBy: 2)
                let urlStart = line.firstNonWhitespaceIndex(after: markerEnd)
                let urlEnd = line.firstWhitespaceIndex(after: urlStart)
                let textStart = line.firstNonWhitespaceIndex(after: urlEnd)

                var urlString = String(line[urlStart..<urlEnd])
                // URL(string:relativeTo:) does not handle // meaning the same protocol as the base URL,
                // so the base scheme is prepended by hand. If the base URL has no scheme there is
                // nothing to prepend; the reference is left untouched instead of crashing.
                if urlString.hasPrefix("//"), let scheme = baseURL.scheme {
                    urlString = scheme + ":" + urlString
                }

                let linkText: String? = textStart < line.endIndex ? String(line[textStart...]) : nil

                if let url = URL(string: urlString, relativeTo: baseURL)?.absoluteURL {
                    doc.lines.append(.link(url, text: linkText))
                } else if let escaped = urlString.addingPercentEncoding(withAllowedCharacters: .URLAllowed),
                          let url = URL(string: escaped, relativeTo: baseURL)?.absoluteURL {
                    // if URL initialization fails because there are unescaped chars in the doc, escape everything and try again.
                    // I'm not certain, but it feels unsafe to always do this escaping
                    doc.lines.append(.link(url, text: linkText))
                } else {
                    // The target could not be turned into a URL at all: degrade to a plain text line
                    // so that the content is still shown.
                    let str: String
                    if let linkText = linkText {
                        // todo: localize me?
                        str = "\(linkText): \(urlString)"
                    } else {
                        str = urlString
                    }
                    doc.lines.append(.text(str))
                }
            } else if line.starts(with: "#") {
                // Check the longest marker first so that "###" is not mistaken for "#".
                let level: Document.HeadingLevel
                if line.starts(with: "###") {
                    level = .h3
                } else if line.starts(with: "##") {
                    level = .h2
                } else {
                    level = .h1
                }
                // NOTE(review): assumes `rawValue` equals the number of "#" characters in the
                // marker (1, 2 or 3) - confirm against Document.HeadingLevel.
                let markerEnd = line.index(line.startIndex, offsetBy: level.rawValue)
                let headingStart = line.firstNonWhitespaceIndex(after: markerEnd)
                doc.lines.append(.heading(String(line[headingStart...]), level: level))
            } else if line.starts(with: "* ") {
                let markerEnd = line.index(line.startIndex, offsetBy: 2)
                let listItemStart = line.firstNonWhitespaceIndex(after: markerEnd)
                doc.lines.append(.unorderedListItem(String(line[listItemStart...])))
            } else if line.starts(with: ">") {
                let markerEnd = line.index(after: line.startIndex)
                let quoteStart = line.firstNonWhitespaceIndex(after: markerEnd)
                doc.lines.append(.quote(String(line[quoteStart...])))
            } else {
                doc.lines.append(.text(line))
            }
        }

        return doc
    }
}
fileprivate extension String {
    /// Returns the position of the first character at or after `index` that is not whitespace.
    ///
    /// - Parameter index: The position to start scanning from (inclusive).
    /// - Returns: The index of the first non-whitespace character, or `endIndex` when only
    ///   whitespace remains. An `index` at or beyond `endIndex` is returned unchanged.
    func firstNonWhitespaceIndex(after index: String.Index) -> String.Index {
        guard index < endIndex else { return index }
        return self[index...].firstIndex { !String.beginsWithWhitespaceScalar($0) } ?? endIndex
    }

    /// Returns the position of the first whitespace character at or after `index`.
    ///
    /// - Parameter index: The position to start scanning from (inclusive).
    /// - Returns: The index of the first whitespace character, or `endIndex` when none
    ///   remains. An `index` at or beyond `endIndex` is returned unchanged.
    func firstWhitespaceIndex(after index: String.Index) -> String.Index {
        guard index < endIndex else { return index }
        return self[index...].firstIndex(where: String.beginsWithWhitespaceScalar) ?? endIndex
    }

    /// Reports whether the first Unicode scalar of `character` is in `CharacterSet.whitespaces`.
    ///
    /// Only the leading scalar is inspected, which should be fine since all whitespace
    /// characters are single scalars.
    /// todo: could the first unicode scalar of a character be whitespace even though the whole character is not?
    private static func beginsWithWhitespaceScalar(_ character: Character) -> Bool {
        let scalars = character.unicodeScalars
        return CharacterSet.whitespaces.contains(scalars[scalars.startIndex])
    }
}
private extension CharacterSet {
    /// The characters that are left untouched when a link target is percent-encoded.
    ///
    /// Contains the ASCII letters and digits, the punctuation listed in the literal below,
    /// and `%` itself. Because `%` is allowed, passing this set to
    /// `addingPercentEncoding(withAllowedCharacters:)` does not re-encode existing `%` characters.
    static let URLAllowed: CharacterSet = {
        let permitted = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%"
        return CharacterSet(charactersIn: permitted)
    }()
}