splash/Sources/Splash/Tokenizing/Tokenizer.swift

/**
* Splash
* Copyright (c) John Sundell 2018
* MIT license - see LICENSE.md
*/
import Foundation
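
// Splits a piece of code into `Segment` values according to a `Grammar`'s
// delimiter rules. Tokenization happens lazily, as the returned sequence
// is iterated.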
internal struct Tokenizer {
    func segmentsByTokenizing(_ code: String,
                              using grammar: Grammar) -> AnySequence<Segment> {
        return AnySequence<Segment> {
            Buffer(iterator: Iterator(code: code, grammar: grammar))
        }
    }
}

private extension Tokenizer {
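    // Wraps an Iterator and looks one segment ahead, so that each segment's
    // `tokens.next` can be filled in before the segment is returned.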
    struct Buffer: IteratorProtocol {
        private var iterator: Iterator
        private var nextSegment: Segment?

        init(iterator: Iterator) {
            self.iterator = iterator
        }

        mutating func next() -> Segment? {
            var segment = nextSegment ?? iterator.next()
            nextSegment = iterator.next()
            segment?.tokens.next = nextSegment?.tokens.current
            return segment
        }
    }
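
    // Walks the code character by character, grouping characters into
    // segments and tracking the tokens produced so far.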
    struct Iterator: IteratorProtocol {
        struct Component {
            enum Kind {
                case token
                case delimiter
                case whitespace
                case newline
            }

            let character: Character
            let kind: Kind
        }
        private let code: String
        private let grammar: Grammar
        private var index: String.Index?
        // Running state: how many times each token has occurred, every token
        // emitted so far, and the tokens emitted on the current line
        private var tokenCounts = [String: Int]()
        private var allTokens = [String]()
        private var lineTokens = [String]()
        private var segments: (current: Segment?, previous: Segment?)

        init(code: String, grammar: Grammar) {
            self.code = code
            self.grammar = grammar
            segments = (nil, nil)
        }
        mutating func next() -> Segment? {
            let nextIndex = makeNextIndex()

            // Reached the end of the code: emit any in-progress segment, then nil
            guard nextIndex != code.endIndex else {
                let segment = segments.current
                segments.current = nil
                return segment
            }

            index = nextIndex
            let component = makeComponent(at: nextIndex)

            switch component.kind {
            case .token, .delimiter:
                guard var segment = segments.current else {
                    segments.current = makeSegment(with: component, at: nextIndex)
                    return next()
                }

                // Trailing whitespace, or a switch between token and delimiter
                // characters, ends the current segment
                guard segment.trailingWhitespace == nil,
                      component.isDelimiter == segment.currentTokenIsDelimiter else {
                    return finish(segment, with: component, at: nextIndex)
                }

                if component.isDelimiter {
                    // Consecutive delimiter characters are only merged into
                    // one token if the grammar allows it
                    let previousCharacter = segment.tokens.current.last!
                    let shouldMerge = grammar.isDelimiter(previousCharacter,
                                                          mergableWith: component.character)

                    guard shouldMerge else {
                        return finish(segment, with: component, at: nextIndex)
                    }
                }

                segment.tokens.current.append(component.character)
                segments.current = segment
                return next()
            case .whitespace, .newline:
                // Whitespace and newlines are collected as the current
                // segment's trailing whitespace
                guard var segment = segments.current else {
                    var segment = makeSegment(with: component, at: nextIndex)
                    segment.trailingWhitespace = component.token
                    segment.isLastOnLine = component.isNewline
                    segments.current = segment
                    return next()
                }

                if var existingWhitespace = segment.trailingWhitespace {
                    existingWhitespace.append(component.character)
                    segment.trailingWhitespace = existingWhitespace
                } else {
                    segment.trailingWhitespace = component.token
                }

                if component.isNewline {
                    segment.isLastOnLine = true
                }

                segments.current = segment
                return next()
            }
        }
        private func makeNextIndex() -> String.Index {
            guard let index = index else {
                return code.startIndex
            }

            return code.index(after: index)
        }
        private func makeComponent(at index: String.Index) -> Component {
            func kind(for character: Character) -> Component.Kind {
                if character.isWhitespace {
                    return .whitespace
                }

                if character.isNewline {
                    return .newline
                }

                if grammar.delimiters.contains(character) {
                    return .delimiter
                }

                return .token
            }

            let character = code[index]

            return Component(
                character: character,
                kind: kind(for: character)
            )
        }
        private func makeSegment(with component: Component, at index: String.Index) -> Segment {
            let tokens = Segment.Tokens(
                all: allTokens,
                counts: tokenCounts,
                onSameLine: lineTokens,
                previous: segments.current?.tokens.current,
                current: component.token,
                next: nil
            )

            return Segment(
                prefix: code[..<index],
                tokens: tokens,
                trailingWhitespace: nil,
                currentTokenIsDelimiter: component.isDelimiter,
                isLastOnLine: false
            )
        }
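
        // Records the finished segment's token in the running counts and
        // lists, then starts a new segment from the component that ended it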
        private mutating func finish(_ segment: Segment,
                                     with component: Component,
                                     at index: String.Index) -> Segment {
            var count = tokenCounts[segment.tokens.current] ?? 0
            count += 1
            tokenCounts[segment.tokens.current] = count

            allTokens.append(segment.tokens.current)

            if segment.isLastOnLine {
                lineTokens = []
            } else {
                lineTokens.append(segment.tokens.current)
            }

            segments.previous = segment
            segments.current = makeSegment(with: component, at: index)
            return segment
        }
    }
}

extension Tokenizer.Iterator.Component {
    var token: String {
        return String(character)
    }

    var isDelimiter: Bool {
        switch kind {
        case .token, .whitespace, .newline:
            return false
        case .delimiter:
            return true
        }
    }

    var isNewline: Bool {
        switch kind {
        case .token, .whitespace, .delimiter:
            return false
        case .newline:
            return true
        }
    }
}

private extension Character {
    // Foundation's `CharacterSet.contains` takes a `Unicode.Scalar`, so these
    // helpers test the character's scalars against the relevant set
    var isWhitespace: Bool {
        return unicodeScalars.allSatisfy(CharacterSet.whitespaces.contains)
    }

    var isNewline: Bool {
        return unicodeScalars.allSatisfy(CharacterSet.newlines.contains)
    }
}
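
// Usage sketch (illustrative, with assumptions): `Grammar` and `Segment` are
// defined elsewhere in Splash, and `SwiftGrammar` is the package's built-in
// Swift grammar. Within the module, the tokenizer can be driven like this:
//
//     let tokenizer = Tokenizer()
//     let segments = tokenizer.segmentsByTokenizing("let number = 7",
//                                                   using: SwiftGrammar())
//
//     for segment in segments {
//         // Each segment carries the current token plus surrounding context
//         print(segment.tokens.current, segment.trailingWhitespace ?? "")
//     }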