2017-07-14 20:54:05 +00:00
|
|
|
#include <tree_sitter/parser.h>
|
|
|
|
#include <string>
|
|
|
|
#include <cwctype>
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
2017-08-01 17:12:16 +00:00
|
|
|
using std::string;
|
2017-07-14 20:54:05 +00:00
|
|
|
|
|
|
|
// Token kinds this external scanner can emit through `lexer->result_symbol`.
// The enumerators are also used as indices into the `valid_symbols` array
// handed to `Scanner::scan`, so their order is significant.
// NOTE(review): presumably this order must mirror the `externals` list of the
// grammar definition -- confirm before reordering or inserting entries.
enum TokenType {
  // Heredoc tokens.
  SIMPLE_HEREDOC,
  HEREDOC_BEGINNING,
  HEREDOC_MIDDLE,
  HEREDOC_END,

  // Word-like tokens and zero-width markers.
  FILE_DESCRIPTOR,
  WORD,
  EMPTY_VALUE,
  CONCAT,
  VARIABLE_NAME,

  // Only consulted via `valid_symbols`; never assigned to `result_symbol`
  // by the scanner itself.
  NEWLINE,
  CLOSING_BRACKET,
  CLOSING_BRACE,
};
|
|
|
|
|
|
|
|
struct Scanner {
|
|
|
|
void skip(TSLexer *lexer) {
|
|
|
|
lexer->advance(lexer, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
void advance(TSLexer *lexer) {
|
|
|
|
lexer->advance(lexer, false);
|
|
|
|
}
|
|
|
|
|
2017-08-01 17:12:16 +00:00
|
|
|
unsigned serialize(char *buffer) {
|
2018-01-19 17:52:45 +00:00
|
|
|
if (heredoc_delimiter.size() >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) return 0;
|
2017-08-01 17:26:42 +00:00
|
|
|
heredoc_delimiter.copy(buffer, heredoc_delimiter.length());
|
2017-08-01 17:12:16 +00:00
|
|
|
return heredoc_delimiter.length();
|
|
|
|
}
|
2017-07-14 20:54:05 +00:00
|
|
|
|
2017-08-01 17:12:16 +00:00
|
|
|
void deserialize(const char *buffer, unsigned length) {
|
|
|
|
if (length == 0) heredoc_delimiter.clear();
|
|
|
|
else heredoc_delimiter.assign(buffer, buffer + length);
|
|
|
|
}
|
2017-07-14 20:54:05 +00:00
|
|
|
|
2017-07-14 21:27:13 +00:00
|
|
|
bool scan_heredoc_end_identifier(TSLexer *lexer) {
|
|
|
|
current_leading_word.clear();
|
2017-07-14 20:54:05 +00:00
|
|
|
while (iswalpha(lexer->lookahead)) {
|
2017-07-14 21:27:13 +00:00
|
|
|
current_leading_word += lexer->lookahead;
|
2017-07-14 20:54:05 +00:00
|
|
|
advance(lexer);
|
|
|
|
}
|
2017-07-16 05:13:55 +00:00
|
|
|
return current_leading_word == heredoc_delimiter;
|
2017-07-14 21:27:13 +00:00
|
|
|
}
|
2017-07-14 20:54:05 +00:00
|
|
|
|
2017-07-14 21:27:13 +00:00
|
|
|
bool scan_heredoc_content(TSLexer *lexer, TokenType middle_type, TokenType end_type) {
|
|
|
|
bool did_advance = false;
|
2017-07-14 20:54:05 +00:00
|
|
|
|
|
|
|
for (;;) {
|
2017-07-14 21:27:13 +00:00
|
|
|
switch (lexer->lookahead) {
|
|
|
|
case '\0': {
|
|
|
|
lexer->result_symbol = end_type;
|
2017-08-01 17:12:16 +00:00
|
|
|
return did_advance;
|
2017-07-14 21:27:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
case '$': {
|
|
|
|
lexer->result_symbol = middle_type;
|
|
|
|
return did_advance;
|
|
|
|
}
|
|
|
|
|
|
|
|
case '\n': {
|
|
|
|
did_advance = true;
|
|
|
|
advance(lexer);
|
|
|
|
if (scan_heredoc_end_identifier(lexer)) {
|
|
|
|
lexer->result_symbol = end_type;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
default: {
|
|
|
|
did_advance = true;
|
|
|
|
advance(lexer);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool scan(TSLexer *lexer, const bool *valid_symbols) {
|
2017-07-17 17:19:35 +00:00
|
|
|
if (valid_symbols[CONCAT]) {
|
|
|
|
if (!(
|
|
|
|
iswspace(lexer->lookahead) ||
|
|
|
|
lexer->lookahead == '>' ||
|
|
|
|
lexer->lookahead == '<' ||
|
|
|
|
lexer->lookahead == ')' ||
|
|
|
|
lexer->lookahead == '(' ||
|
|
|
|
lexer->lookahead == '[' ||
|
2018-02-27 18:54:40 +00:00
|
|
|
lexer->lookahead == '|' ||
|
2017-07-17 17:19:35 +00:00
|
|
|
lexer->lookahead == ']' ||
|
|
|
|
lexer->lookahead == '}' ||
|
|
|
|
lexer->lookahead == ';' ||
|
|
|
|
lexer->lookahead == '&' ||
|
|
|
|
lexer->lookahead == '`'
|
|
|
|
)) {
|
|
|
|
lexer->result_symbol = CONCAT;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (valid_symbols[EMPTY_VALUE]) {
|
|
|
|
if (iswspace(lexer->lookahead)) {
|
|
|
|
lexer->result_symbol = EMPTY_VALUE;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-07-16 05:13:55 +00:00
|
|
|
if (valid_symbols[HEREDOC_MIDDLE] && !heredoc_delimiter.empty()) {
|
2017-07-14 21:27:13 +00:00
|
|
|
return scan_heredoc_content(lexer, HEREDOC_MIDDLE, HEREDOC_END);
|
2017-07-15 00:41:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (valid_symbols[HEREDOC_BEGINNING]) {
|
2017-07-16 05:13:55 +00:00
|
|
|
heredoc_delimiter.clear();
|
2017-07-15 00:14:23 +00:00
|
|
|
while (iswalpha(lexer->lookahead)) {
|
2017-07-16 05:13:55 +00:00
|
|
|
heredoc_delimiter += lexer->lookahead;
|
2017-07-15 00:14:23 +00:00
|
|
|
advance(lexer);
|
|
|
|
}
|
2017-07-14 21:27:13 +00:00
|
|
|
|
2017-07-15 00:14:23 +00:00
|
|
|
if (lexer->lookahead != '\n') return false;
|
2017-07-14 20:54:05 +00:00
|
|
|
advance(lexer);
|
|
|
|
|
2017-07-15 00:14:23 +00:00
|
|
|
if (scan_heredoc_end_identifier(lexer)) {
|
|
|
|
lexer->result_symbol = SIMPLE_HEREDOC;
|
|
|
|
return true;
|
|
|
|
}
|
2017-07-14 20:54:05 +00:00
|
|
|
|
2017-07-15 00:14:23 +00:00
|
|
|
return scan_heredoc_content(lexer, HEREDOC_BEGINNING, SIMPLE_HEREDOC);
|
2017-07-15 00:41:14 +00:00
|
|
|
}
|
|
|
|
|
2018-02-27 18:54:40 +00:00
|
|
|
if (valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[WORD]) {
|
|
|
|
unsigned length = 0;
|
|
|
|
|
2017-07-17 17:19:35 +00:00
|
|
|
for (;;) {
|
|
|
|
if (
|
|
|
|
lexer->lookahead == ' ' ||
|
|
|
|
lexer->lookahead == '\t' ||
|
|
|
|
(lexer->lookahead == '\n' && !valid_symbols[NEWLINE])
|
|
|
|
) {
|
|
|
|
skip(lexer);
|
|
|
|
} else if (lexer->lookahead == '\\') {
|
2018-02-27 18:54:40 +00:00
|
|
|
advance(lexer);
|
2017-07-17 17:19:35 +00:00
|
|
|
if (lexer->lookahead == '\n') {
|
|
|
|
skip(lexer);
|
|
|
|
} else {
|
2018-02-27 18:54:40 +00:00
|
|
|
length++;
|
|
|
|
break;
|
2017-07-17 17:19:35 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-02-28 06:22:57 +00:00
|
|
|
bool is_numeric = iswdigit(lexer->lookahead);
|
|
|
|
bool is_alphanumeric = iswalpha(lexer->lookahead);
|
|
|
|
|
2017-07-17 17:19:35 +00:00
|
|
|
for (;;) {
|
2018-02-28 06:22:57 +00:00
|
|
|
// These characters are not allowed in unquoted arguments
|
|
|
|
// or environment variable names
|
|
|
|
if (
|
|
|
|
lexer->lookahead == 0 ||
|
|
|
|
lexer->lookahead == ';' ||
|
|
|
|
lexer->lookahead == '"' ||
|
|
|
|
lexer->lookahead == '(' ||
|
|
|
|
lexer->lookahead == ')' ||
|
|
|
|
lexer->lookahead == '\'' ||
|
|
|
|
lexer->lookahead == '&' ||
|
|
|
|
lexer->lookahead == '#' ||
|
|
|
|
lexer->lookahead == '`' ||
|
|
|
|
lexer->lookahead == '|' ||
|
|
|
|
lexer->lookahead == '$' ||
|
|
|
|
iswspace(lexer->lookahead)
|
|
|
|
) break;
|
|
|
|
|
|
|
|
// Curly braces are not allowed in unquoted arguments within curly braces
|
|
|
|
// (e.g. inside of a variable expansion like `${key:arg}`).
|
|
|
|
if (
|
|
|
|
lexer->lookahead == '}' &&
|
|
|
|
valid_symbols[CLOSING_BRACE]
|
|
|
|
) break;
|
|
|
|
|
|
|
|
// Square brackets are not allowed in unquoted arguments within square brackets
|
|
|
|
// (e.g. inside of an array subscript like `a[arg]`).
|
|
|
|
if (
|
|
|
|
lexer->lookahead == ']' &&
|
|
|
|
valid_symbols[CLOSING_BRACKET]
|
|
|
|
) break;
|
|
|
|
|
|
|
|
// Numbers followed by '<' and '>' at the beginning of commands
|
|
|
|
// are parsed as file descriptors.
|
|
|
|
if (lexer->lookahead == '<' || lexer->lookahead == '>') {
|
|
|
|
if (is_numeric && valid_symbols[FILE_DESCRIPTOR]) {
|
|
|
|
lexer->result_symbol = FILE_DESCRIPTOR;
|
|
|
|
return true;
|
|
|
|
}
|
2017-07-17 17:19:35 +00:00
|
|
|
break;
|
2017-07-15 00:14:23 +00:00
|
|
|
}
|
2018-02-27 18:54:40 +00:00
|
|
|
|
2018-02-28 06:22:57 +00:00
|
|
|
if (!iswdigit(lexer->lookahead)) is_numeric = false;
|
|
|
|
|
|
|
|
if (!iswalnum(lexer->lookahead) && lexer->lookahead != '_') {
|
|
|
|
|
|
|
|
// Alphanumeric strings followed by '=', '[', or '+=' are treated
|
|
|
|
// as environment variable names.
|
|
|
|
if (is_alphanumeric && valid_symbols[VARIABLE_NAME] && length > 0) {
|
|
|
|
if (lexer->lookahead == '+') {
|
|
|
|
lexer->mark_end(lexer);
|
|
|
|
advance(lexer);
|
|
|
|
if (lexer->lookahead == '=') {
|
|
|
|
lexer->result_symbol = VARIABLE_NAME;
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
} else if (lexer->lookahead == '=' || lexer->lookahead == '[') {
|
|
|
|
lexer->result_symbol = VARIABLE_NAME;
|
|
|
|
return true;
|
|
|
|
}
|
2017-12-26 22:55:37 +00:00
|
|
|
}
|
2018-02-28 06:22:57 +00:00
|
|
|
|
|
|
|
is_alphanumeric = false;
|
2017-12-26 22:55:37 +00:00
|
|
|
}
|
2018-02-28 06:22:57 +00:00
|
|
|
|
|
|
|
advance(lexer);
|
|
|
|
length++;
|
2017-07-16 06:12:22 +00:00
|
|
|
}
|
2017-07-17 17:19:35 +00:00
|
|
|
|
2018-02-28 06:22:57 +00:00
|
|
|
// Do not handle strings containing only letters, because those
|
|
|
|
// might be keywords. Let the normal lexer handle those.
|
|
|
|
if (length > 0 && valid_symbols[WORD] && !is_alphanumeric) {
|
2018-02-27 18:54:40 +00:00
|
|
|
lexer->result_symbol = WORD;
|
|
|
|
return true;
|
|
|
|
}
|
2017-07-16 06:12:22 +00:00
|
|
|
}
|
|
|
|
|
2017-07-15 00:14:23 +00:00
|
|
|
return false;
|
2017-07-14 20:54:05 +00:00
|
|
|
}
|
2017-07-14 21:27:13 +00:00
|
|
|
|
2017-08-01 17:12:16 +00:00
|
|
|
string heredoc_delimiter;
|
|
|
|
string current_leading_word;
|
2017-07-14 20:54:05 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" {
|
|
|
|
|
|
|
|
void *tree_sitter_bash_external_scanner_create() {
|
|
|
|
return new Scanner();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool tree_sitter_bash_external_scanner_scan(void *payload, TSLexer *lexer,
|
|
|
|
const bool *valid_symbols) {
|
|
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
|
|
return scanner->scan(lexer, valid_symbols);
|
|
|
|
}
|
|
|
|
|
2017-08-01 17:12:16 +00:00
|
|
|
unsigned tree_sitter_bash_external_scanner_serialize(void *payload, char *state) {
|
2017-07-14 20:54:05 +00:00
|
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
|
|
return scanner->serialize(state);
|
|
|
|
}
|
|
|
|
|
2017-08-01 17:12:16 +00:00
|
|
|
void tree_sitter_bash_external_scanner_deserialize(void *payload, const char *state, unsigned length) {
|
2017-07-14 20:54:05 +00:00
|
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
2017-08-01 17:12:16 +00:00
|
|
|
scanner->deserialize(state, length);
|
2017-07-14 20:54:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Release the Scanner that tree_sitter_bash_external_scanner_create allocated.
void tree_sitter_bash_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}
|
|
|
|
|
|
|
|
}
|