2018-08-24 18:42:07 +02:00
|
|
|
/**
|
|
|
|
* Splash
|
|
|
|
* Copyright (c) John Sundell 2018
|
|
|
|
* MIT license - see LICENSE.md
|
|
|
|
*/
|
|
|
|
|
|
|
|
import Foundation
|
|
|
|
|
|
|
|
/// Splits source code into segments according to a grammar.
internal struct Tokenizer {
    /// Returns a sequence of the segments found in `code`.
    ///
    /// Every iteration of the returned sequence starts from a fresh `Buffer`
    /// wrapping a fresh `Iterator`, so segments are computed on demand while
    /// the sequence is being consumed.
    ///
    /// - Parameters:
    ///   - code: The source code to tokenize.
    ///   - grammar: The grammar handed to the underlying `Iterator`.
    /// - Returns: A type-erased sequence of `Segment` values.
    func segmentsByTokenizing(_ code: String,
                              using grammar: Grammar) -> AnySequence<Segment> {
        let makeBuffer: () -> Buffer = {
            let iterator = Iterator(code: code, grammar: grammar)
            return Buffer(iterator: iterator)
        }

        return AnySequence<Segment>(makeBuffer)
    }
}
|
|
|
|
|
|
|
|
private extension Tokenizer {
|
|
|
|
/// Iterator wrapper that reads one segment ahead of the one it returns, so
/// that each returned segment can be told which token follows it.
struct Buffer: IteratorProtocol {
    /// The underlying segment producer.
    private var iterator: Iterator
    /// The segment fetched ahead of time during the previous call to `next()`.
    private var lookahead: Segment?

    init(iterator: Iterator) {
        self.iterator = iterator
    }

    /// Returns the next segment with `tokens.next` set to the current token of
    /// the segment that follows it (or `nil` when there is none).
    mutating func next() -> Segment? {
        var upcoming: Segment?

        // Prefer the segment that was read ahead last time; otherwise pull one.
        if let buffered = lookahead {
            upcoming = buffered
        } else {
            upcoming = iterator.next()
        }

        // Always read one segment ahead, even when `upcoming` is nil.
        let following = iterator.next()
        lookahead = following
        upcoming?.tokens.next = following?.tokens.current

        return upcoming
    }
}
|
|
|
|
|
|
|
|
struct Iterator: IteratorProtocol {
|
2019-08-07 15:45:15 +02:00
|
|
|
/// A single character of the source code together with its classification.
struct Component {
    /// The classification assigned to a character.
    enum Kind {
        case token, delimiter, whitespace, newline
    }

    /// The character this component represents.
    let character: Character
    /// How `character` was classified.
    let kind: Kind
}
|
|
|
|
|
|
|
|
/// The source code being tokenized.
private let code: String
/// Grammar consulted for its delimiters and delimiter-merging rule.
private let grammar: Grammar
/// Index of the most recently consumed character; `nil` until the first
/// character has been consumed.
private var index: String.Index?
/// How many times each finished token has been seen so far.
private var tokenCounts: [String: Int] = [:]
/// Every finished token, in the order it was finished.
private var allTokens: [String] = []
/// Finished tokens on the current line; cleared when a segment marked as
/// last on its line is finished.
private var lineTokens: [String] = []
/// The segment currently being built and the most recently finished one.
private var segments: (current: Segment?, previous: Segment?) = (nil, nil)

/// Creates an iterator positioned before the first character of `code`.
init(code: String, grammar: Grammar) {
    self.code = code
    self.grammar = grammar
}
|
|
|
|
|
|
|
|
/// Produces the next segment, or `nil` once all of `code` has been consumed.
///
/// Characters are consumed one at a time and folded into `segments.current`
/// until a boundary is reached. The finished segment is then returned, while
/// the character that caused the boundary seeds the following segment.
///
/// This is a loop rather than a method that re-enters itself for every
/// consumed character, so the call stack stays flat no matter how long a
/// token or a run of whitespace is.
mutating func next() -> Segment? {
    while true {
        let nextIndex = makeNextIndex()

        // End of input: hand out whatever is still pending, then clear it so
        // that later calls return nil.
        guard nextIndex != code.endIndex else {
            let segment = segments.current
            segments.current = nil
            return segment
        }

        index = nextIndex
        let component = makeComponent(at: nextIndex)

        switch component.kind {
        case .token, .delimiter:
            // Nothing pending yet: this character starts a new segment.
            guard var segment = segments.current else {
                segments.current = makeSegment(with: component, at: nextIndex)
                continue
            }

            // The pending segment ends once it has trailing whitespace, or
            // when switching between delimiter and non-delimiter characters.
            guard segment.trailingWhitespace == nil,
                  component.isDelimiter == segment.currentTokenIsDelimiter else {
                return finish(segment, with: component, at: nextIndex)
            }

            if component.isDelimiter {
                // Within this file the current token is seeded with one
                // character by `makeSegment` and only ever appended to, so it
                // is expected to be non-empty here.
                let previousCharacter = segment.tokens.current.last!
                let shouldMerge = grammar.isDelimiter(previousCharacter,
                                                      mergableWith: component.character)

                // Consecutive delimiters only share a token when the grammar
                // says they can be merged.
                guard shouldMerge else {
                    return finish(segment, with: component, at: nextIndex)
                }
            }

            segment.tokens.current.append(component.character)
            segments.current = segment

        case .whitespace, .newline:
            // Whitespace with nothing pending: the new segment gets this
            // character as both its token and its trailing whitespace.
            guard var segment = segments.current else {
                var segment = makeSegment(with: component, at: nextIndex)
                segment.trailingWhitespace = component.token
                segment.isLastOnLine = component.isNewline
                segments.current = segment
                continue
            }

            if var existingWhitespace = segment.trailingWhitespace {
                existingWhitespace.append(component.character)
                segment.trailingWhitespace = existingWhitespace
            } else {
                segment.trailingWhitespace = component.token
            }

            if component.isNewline {
                segment.isLastOnLine = true
            }

            segments.current = segment
        }
    }
}
|
|
|
|
|
|
|
|
/// Returns the index of the character to consume next: the start of `code`
/// when nothing has been consumed yet, otherwise the index right after the
/// most recently consumed one.
private func makeNextIndex() -> String.Index {
    if let consumed = index {
        return code.index(after: consumed)
    }

    return code.startIndex
}
|
|
|
|
|
|
|
|
/// Reads the character at `index` and classifies it.
///
/// The checks run in a fixed order: whitespace, then newline, then the
/// grammar's delimiters. Anything else is treated as part of a token.
private func makeComponent(at index: String.Index) -> Component {
    let character = code[index]
    let kind: Component.Kind

    if character.isWhitespace {
        kind = .whitespace
    } else if character.isNewline {
        kind = .newline
    } else if grammar.delimiters.contains(character) {
        kind = .delimiter
    } else {
        kind = .token
    }

    return Component(character: character, kind: kind)
}
|
|
|
|
|
|
|
|
/// Builds a fresh segment whose current token is the single character of
/// `component`.
///
/// The segment captures the token history as it stands right now (all
/// tokens, their counts and the tokens on the current line), takes the
/// pending segment's token as its previous token, and uses everything in
/// `code` before `index` as its prefix.
private func makeSegment(with component: Component, at index: String.Index) -> Segment {
    let precedingToken = segments.current?.tokens.current

    return Segment(
        prefix: code[..<index],
        tokens: Segment.Tokens(
            all: allTokens,
            counts: tokenCounts,
            onSameLine: lineTokens,
            previous: precedingToken,
            current: component.token,
            next: nil
        ),
        trailingWhitespace: nil,
        currentTokenIsDelimiter: component.isDelimiter,
        isLastOnLine: false
    )
}
|
|
|
|
|
|
|
|
/// Records `segment` as finished and starts a new pending segment from
/// `component`.
///
/// The finished token is counted and appended to the token history. The
/// per-line history is cleared when the segment was the last one on its
/// line, otherwise the token is added to it.
///
/// - Returns: The finished `segment`, unchanged.
private mutating func finish(_ segment: Segment,
                             with component: Component,
                             at index: String.Index) -> Segment {
    let finishedToken = segment.tokens.current

    tokenCounts[finishedToken, default: 0] += 1
    allTokens.append(finishedToken)

    if segment.isLastOnLine {
        lineTokens = []
    } else {
        lineTokens.append(finishedToken)
    }

    segments.previous = segment
    // `makeSegment` still sees the finished segment as the pending one here,
    // which is how the new segment learns its previous token.
    segments.current = makeSegment(with: component, at: index)

    return segment
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
extension Tokenizer.Iterator.Component {
    /// The component's character as a one-character string.
    var token: String {
        return String(character)
    }

    /// Whether the component was classified as a delimiter.
    var isDelimiter: Bool {
        return kind == .delimiter
    }

    /// Whether the component was classified as a newline.
    var isNewline: Bool {
        return kind == .newline
    }
}
|
|
|
|
|
|
|
|
// NOTE(review): both properties pass a `Character` to `CharacterSet.contains`;
// that overload is not declared in this file and is presumably provided
// elsewhere in the module — confirm.
private extension Character {
    /// Whether the character belongs to `CharacterSet.whitespaces`.
    var isWhitespace: Bool {
        let whitespaces = CharacterSet.whitespaces
        return whitespaces.contains(self)
    }

    /// Whether the character belongs to `CharacterSet.newlines`.
    var isNewline: Bool {
        let newlines = CharacterSet.newlines
        return newlines.contains(self)
    }
}
|