// Listing metadata captured with this file: 389 lines, 9.9 KiB, C++.
#include <tree_sitter/parser.h>
|
|
#include <string>
|
|
#include <cwctype>
|
|
|
|
namespace {
|
|
|
|
using std::string;
|
|
|
|
// Token kinds handled by the external scanner.
//
// The enumerator values are used as indices into the `valid_symbols` array
// passed to Scanner::scan and are written to `lexer->result_symbol`, so their
// order is significant. Per tree-sitter's external-scanner convention the
// order must match the grammar's `externals` list (the grammar is not part of
// this file; confirm against it before reordering).
enum TokenType {
  HEREDOC_START,           // Heredoc delimiter word that follows the arrow.
  SIMPLE_HEREDOC_BODY,     // Whole heredoc body, scanned without stopping at a `$`.
  HEREDOC_BODY_BEGINNING,  // Heredoc content up to the first unescaped `$`.
  HEREDOC_BODY_MIDDLE,     // Heredoc content between two `$` stops.
  HEREDOC_BODY_END,        // Heredoc content from the last `$` stop to the terminator.
  FILE_DESCRIPTOR,         // Digits immediately followed by `<` or `>`.
  EMPTY_VALUE,             // Zero-width token emitted when whitespace is the lookahead.
  CONCAT,                  // Zero-width token emitted when the lookahead continues a word.
  VARIABLE_NAME,           // Identifier followed by `=`, `+=` or `[`.
  REGEX,                   // Bracket-balanced text ending before an unmatched closer.
  CLOSING_BRACE,           // Only consulted via valid_symbols; never produced here.
  CLOSING_BRACKET,         // Only consulted via valid_symbols; never produced here.
  HEREDOC_ARROW,           // `<<`
  HEREDOC_ARROW_DASH,      // `<<-`
  NEWLINE,                 // Only consulted via valid_symbols; never produced here.
};
|
|
|
|
struct Scanner {
|
|
void skip(TSLexer *lexer) {
|
|
lexer->advance(lexer, true);
|
|
}
|
|
|
|
void advance(TSLexer *lexer) {
|
|
lexer->advance(lexer, false);
|
|
}
|
|
|
|
unsigned serialize(char *buffer) {
|
|
if (heredoc_delimiter.length() + 3 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) return 0;
|
|
buffer[0] = heredoc_is_raw;
|
|
buffer[1] = started_heredoc;
|
|
buffer[2] = heredoc_allows_indent;
|
|
heredoc_delimiter.copy(&buffer[3], heredoc_delimiter.length());
|
|
return heredoc_delimiter.length() + 3;
|
|
}
|
|
|
|
void deserialize(const char *buffer, unsigned length) {
|
|
if (length == 0) {
|
|
heredoc_is_raw = false;
|
|
started_heredoc = false;
|
|
heredoc_allows_indent = false;
|
|
heredoc_delimiter.clear();
|
|
} else {
|
|
heredoc_is_raw = buffer[0];
|
|
started_heredoc = buffer[1];
|
|
heredoc_allows_indent = buffer[2];
|
|
heredoc_delimiter.assign(&buffer[3], &buffer[length]);
|
|
}
|
|
}
|
|
|
|
bool scan_heredoc_start(TSLexer *lexer) {
|
|
while (iswspace(lexer->lookahead)) skip(lexer);
|
|
|
|
lexer->result_symbol = HEREDOC_START;
|
|
heredoc_is_raw = lexer->lookahead == '\'';
|
|
started_heredoc = false;
|
|
heredoc_delimiter.clear();
|
|
|
|
if (lexer->lookahead == '\\') {
|
|
advance(lexer);
|
|
}
|
|
|
|
int32_t quote = 0;
|
|
if (heredoc_is_raw || lexer->lookahead == '"') {
|
|
quote = lexer->lookahead;
|
|
advance(lexer);
|
|
}
|
|
|
|
while (iswalpha(lexer->lookahead)) {
|
|
heredoc_delimiter += lexer->lookahead;
|
|
advance(lexer);
|
|
}
|
|
|
|
if (lexer->lookahead == quote) {
|
|
advance(lexer);
|
|
}
|
|
|
|
return !heredoc_delimiter.empty();
|
|
}
|
|
|
|
bool scan_heredoc_end_identifier(TSLexer *lexer) {
|
|
current_leading_word.clear();
|
|
while (iswalpha(lexer->lookahead)) {
|
|
current_leading_word += lexer->lookahead;
|
|
advance(lexer);
|
|
}
|
|
return current_leading_word == heredoc_delimiter;
|
|
}
|
|
|
|
bool scan_heredoc_content(TSLexer *lexer, TokenType middle_type, TokenType end_type) {
|
|
bool did_advance = false;
|
|
|
|
for (;;) {
|
|
switch (lexer->lookahead) {
|
|
case '\0': {
|
|
if (did_advance) {
|
|
heredoc_is_raw = false;
|
|
started_heredoc = false;
|
|
heredoc_allows_indent = false;
|
|
heredoc_delimiter.clear();
|
|
lexer->result_symbol = end_type;
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
case '\\': {
|
|
did_advance = true;
|
|
advance(lexer);
|
|
advance(lexer);
|
|
break;
|
|
}
|
|
|
|
case '$': {
|
|
if (heredoc_is_raw) {
|
|
did_advance = true;
|
|
advance(lexer);
|
|
break;
|
|
} else if (did_advance) {
|
|
lexer->result_symbol = middle_type;
|
|
started_heredoc = true;
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
case '\n': {
|
|
did_advance = true;
|
|
advance(lexer);
|
|
if (heredoc_allows_indent) {
|
|
while (iswspace(lexer->lookahead)) {
|
|
advance(lexer);
|
|
}
|
|
}
|
|
if (scan_heredoc_end_identifier(lexer)) {
|
|
heredoc_is_raw = false;
|
|
started_heredoc = false;
|
|
heredoc_allows_indent = false;
|
|
heredoc_delimiter.clear();
|
|
lexer->result_symbol = end_type;
|
|
return true;
|
|
}
|
|
break;
|
|
}
|
|
|
|
default: {
|
|
did_advance = true;
|
|
advance(lexer);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bool scan(TSLexer *lexer, const bool *valid_symbols) {
|
|
if (valid_symbols[CONCAT]) {
|
|
if (!(
|
|
lexer->lookahead == 0 ||
|
|
iswspace(lexer->lookahead) ||
|
|
lexer->lookahead == '>' ||
|
|
lexer->lookahead == '<' ||
|
|
lexer->lookahead == ')' ||
|
|
lexer->lookahead == '(' ||
|
|
lexer->lookahead == ';' ||
|
|
lexer->lookahead == '&' ||
|
|
lexer->lookahead == '|' ||
|
|
lexer->lookahead == '`' ||
|
|
lexer->lookahead == '#' ||
|
|
(lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) ||
|
|
(lexer->lookahead == ']' && valid_symbols[CLOSING_BRACKET])
|
|
)) {
|
|
lexer->result_symbol = CONCAT;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (valid_symbols[EMPTY_VALUE]) {
|
|
if (iswspace(lexer->lookahead)) {
|
|
lexer->result_symbol = EMPTY_VALUE;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (valid_symbols[HEREDOC_BODY_BEGINNING] && !heredoc_delimiter.empty() && !started_heredoc) {
|
|
return scan_heredoc_content(lexer, HEREDOC_BODY_BEGINNING, SIMPLE_HEREDOC_BODY);
|
|
}
|
|
|
|
if (valid_symbols[HEREDOC_BODY_MIDDLE] && !heredoc_delimiter.empty() && started_heredoc) {
|
|
return scan_heredoc_content(lexer, HEREDOC_BODY_MIDDLE, HEREDOC_BODY_END);
|
|
}
|
|
|
|
if (valid_symbols[HEREDOC_START]) {
|
|
return scan_heredoc_start(lexer);
|
|
}
|
|
|
|
if (valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[HEREDOC_ARROW]) {
|
|
for (;;) {
|
|
if (
|
|
lexer->lookahead == ' ' ||
|
|
lexer->lookahead == '\t' ||
|
|
(lexer->lookahead == '\n' && !valid_symbols[NEWLINE])
|
|
) {
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == '\\') {
|
|
skip(lexer);
|
|
if (lexer->lookahead == '\n') {
|
|
skip(lexer);
|
|
} else {
|
|
return false;
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') {
|
|
advance(lexer);
|
|
if (lexer->lookahead == '<') {
|
|
advance(lexer);
|
|
if (lexer->lookahead == '-') {
|
|
advance(lexer);
|
|
heredoc_allows_indent = true;
|
|
lexer->result_symbol = HEREDOC_ARROW_DASH;
|
|
} else if (lexer->lookahead == '<') {
|
|
return false;
|
|
} else {
|
|
heredoc_allows_indent = false;
|
|
lexer->result_symbol = HEREDOC_ARROW;
|
|
}
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool is_number = true;
|
|
if (iswdigit(lexer->lookahead)) {
|
|
advance(lexer);
|
|
} else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
|
|
is_number = false;
|
|
advance(lexer);
|
|
} else {
|
|
return false;
|
|
}
|
|
|
|
for (;;) {
|
|
if (iswdigit(lexer->lookahead)) {
|
|
advance(lexer);
|
|
} else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
|
|
is_number = false;
|
|
advance(lexer);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (is_number &&
|
|
valid_symbols[FILE_DESCRIPTOR] &&
|
|
(lexer->lookahead == '>' || lexer->lookahead == '<')) {
|
|
lexer->result_symbol = FILE_DESCRIPTOR;
|
|
return true;
|
|
}
|
|
|
|
if (valid_symbols[VARIABLE_NAME]) {
|
|
if (lexer->lookahead == '+') {
|
|
lexer->mark_end(lexer);
|
|
advance(lexer);
|
|
if (lexer->lookahead == '=') {
|
|
lexer->result_symbol = VARIABLE_NAME;
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
} else if (lexer->lookahead == '=' || lexer->lookahead == '[') {
|
|
lexer->result_symbol = VARIABLE_NAME;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
if (valid_symbols[REGEX]) {
|
|
while (iswspace(lexer->lookahead)) skip(lexer);
|
|
|
|
if (
|
|
lexer->lookahead != '"' &&
|
|
lexer->lookahead != '\'' &&
|
|
lexer->lookahead != '$'
|
|
) {
|
|
struct State {
|
|
bool done;
|
|
uint32_t paren_depth;
|
|
uint32_t bracket_depth;
|
|
uint32_t brace_depth;
|
|
};
|
|
|
|
lexer->mark_end(lexer);
|
|
|
|
State state = {false, 0, 0, 0};
|
|
while (!state.done) {
|
|
switch (lexer->lookahead) {
|
|
case '\0':
|
|
return false;
|
|
case '(':
|
|
state.paren_depth++;
|
|
break;
|
|
case '[':
|
|
state.bracket_depth++;
|
|
break;
|
|
case '{':
|
|
state.brace_depth++;
|
|
break;
|
|
case ')':
|
|
if (state.paren_depth == 0) state.done = true;
|
|
state.paren_depth--;
|
|
break;
|
|
case ']':
|
|
if (state.bracket_depth == 0) state.done = true;
|
|
state.bracket_depth--;
|
|
break;
|
|
case '}':
|
|
if (state.brace_depth == 0) state.done = true;
|
|
state.brace_depth--;
|
|
break;
|
|
}
|
|
|
|
if (!state.done) {
|
|
bool was_space = iswspace(lexer->lookahead);
|
|
advance(lexer);
|
|
if (!was_space) lexer->mark_end(lexer);
|
|
}
|
|
}
|
|
|
|
lexer->result_symbol = REGEX;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
string heredoc_delimiter;
|
|
bool heredoc_is_raw;
|
|
bool started_heredoc;
|
|
bool heredoc_allows_indent;
|
|
string current_leading_word;
|
|
};
|
|
|
|
}
|
|
|
|
extern "C" {
|
|
|
|
void *tree_sitter_bash_external_scanner_create() {
|
|
return new Scanner();
|
|
}
|
|
|
|
// Forwards to Scanner::scan on the Scanner stored in `payload`.
// Returns whatever Scanner::scan returns.
bool tree_sitter_bash_external_scanner_scan(void *payload, TSLexer *lexer,
                                            const bool *valid_symbols) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  return scanner->scan(lexer, valid_symbols);
}
|
|
|
|
// Forwards to Scanner::serialize on the Scanner stored in `payload`,
// writing into `state`. Returns the byte count reported by the scanner.
unsigned tree_sitter_bash_external_scanner_serialize(void *payload, char *state) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  return scanner->serialize(state);
}
|
|
|
|
// Forwards to Scanner::deserialize on the Scanner stored in `payload`,
// passing the `length` bytes at `state`.
void tree_sitter_bash_external_scanner_deserialize(void *payload, const char *state, unsigned length) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  scanner->deserialize(state, length);
}
|
|
|
|
// Deletes the Scanner stored in `payload`; it pairs with the `new Scanner()`
// in tree_sitter_bash_external_scanner_create().
void tree_sitter_bash_external_scanner_destroy(void *payload) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  delete scanner;
}
|
|
|
|
}
|