tree-sitter-elixir/src/scanner.cc

641 lines
18 KiB
C++

#include <tree_sitter/parser.h>
namespace {
// See references in grammar.externals
enum TokenType {
QUOTED_CONTENT_I_SINGLE,
QUOTED_CONTENT_I_DOUBLE,
QUOTED_CONTENT_I_HEREDOC_SINGLE,
QUOTED_CONTENT_I_HEREDOC_DOUBLE,
QUOTED_CONTENT_I_PARENTHESIS,
QUOTED_CONTENT_I_CURLY,
QUOTED_CONTENT_I_SQUARE,
QUOTED_CONTENT_I_ANGLE,
QUOTED_CONTENT_I_BAR,
QUOTED_CONTENT_I_SLASH,
QUOTED_CONTENT_SINGLE,
QUOTED_CONTENT_DOUBLE,
QUOTED_CONTENT_HEREDOC_SINGLE,
QUOTED_CONTENT_HEREDOC_DOUBLE,
QUOTED_CONTENT_PARENTHESIS,
QUOTED_CONTENT_CURLY,
QUOTED_CONTENT_SQUARE,
QUOTED_CONTENT_ANGLE,
QUOTED_CONTENT_BAR,
QUOTED_CONTENT_SLASH,
NEWLINE_BEFORE_DO,
NEWLINE_BEFORE_BINARY_OPERATOR,
NEWLINE_BEFORE_COMMENT,
BEFORE_UNARY_OPERATOR,
NOT_IN,
QUOTED_ATOM_START
};
void advance(TSLexer* lexer) {
lexer->advance(lexer, false);
}
void skip(TSLexer *lexer) {
lexer->advance(lexer, true);
}
// Note: some checks require several lexer steps of lookahead
// and alter its state, for these we use names check_*
bool is_whitespace(int32_t c) {
return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}
bool is_inline_whitespace(int32_t c) {
return c == ' ' || c == '\t';
}
bool is_newline(int32_t c) {
// Note: this implies \r\n is treated as two line breaks,
// but in our case it's fine, since multiple line breaks
// make no difference
return c == '\n' || c == '\r';
}
bool is_digit(int32_t c) {
return '0' <= c && c <= '9';
}
bool check_keyword_end(TSLexer* lexer) {
if (lexer->lookahead == ':') {
advance(lexer);
return is_whitespace(lexer->lookahead);
}
return false;
}
bool check_operator_end(TSLexer* lexer) {
// Keyword
if (lexer->lookahead == ':') {
return !check_keyword_end(lexer);
}
while (is_inline_whitespace(lexer->lookahead)) {
advance(lexer);
}
// Operator identifier with arity
if (lexer->lookahead == '/') {
advance(lexer);
while (is_whitespace(lexer->lookahead)) {
advance(lexer);
}
if (is_digit(lexer->lookahead)) {
return false;
}
}
return true;
}
const char token_terminators[] = {
// Operator starts
'@', '.', '+', '-', '^', '-', '*', '/', '<', '>', '|', '~', '=', '&', '\\', '%',
// Delimiters
'{', '}', '[', ']', '(', ')', '"', '\'',
// Separators
',', ';',
// Comment
'#'
};
const uint8_t token_terminators_length = sizeof(token_terminators) / sizeof(char);
// Note: this is a heuristic as we only use this to distinguish word
// operators and we don't want to include complex Unicode ranges
bool is_token_end(int32_t c) {
for (uint8_t i = 0; i < token_terminators_length; i++) {
if (c == token_terminators[i]) {
return true;
}
}
return is_whitespace(c);
}
struct QuotedContentInfo {
const TokenType token_type;
const bool supports_interpol;
const int32_t end_delimiter;
const uint8_t delimiter_length;
};
const QuotedContentInfo quoted_content_infos[] = {
{ QUOTED_CONTENT_I_SINGLE, true, '\'', 1 },
{ QUOTED_CONTENT_I_DOUBLE, true, '"', 1 },
{ QUOTED_CONTENT_I_HEREDOC_SINGLE, true, '\'', 3 },
{ QUOTED_CONTENT_I_HEREDOC_DOUBLE, true, '"', 3 },
{ QUOTED_CONTENT_I_PARENTHESIS, true, ')', 1 },
{ QUOTED_CONTENT_I_CURLY, true, '}', 1 },
{ QUOTED_CONTENT_I_SQUARE, true, ']', 1 },
{ QUOTED_CONTENT_I_ANGLE, true, '>', 1 },
{ QUOTED_CONTENT_I_BAR, true, '|', 1 },
{ QUOTED_CONTENT_I_SLASH, true, '/', 1 },
{ QUOTED_CONTENT_SINGLE, false, '\'', 1 },
{ QUOTED_CONTENT_DOUBLE, false, '"', 1 },
{ QUOTED_CONTENT_HEREDOC_SINGLE, false, '\'', 3 },
{ QUOTED_CONTENT_HEREDOC_DOUBLE, false, '"', 3 },
{ QUOTED_CONTENT_PARENTHESIS, false, ')', 1 },
{ QUOTED_CONTENT_CURLY, false, '}', 1 },
{ QUOTED_CONTENT_SQUARE, false, ']', 1 },
{ QUOTED_CONTENT_ANGLE, false, '>', 1 },
{ QUOTED_CONTENT_BAR, false, '|', 1 },
{ QUOTED_CONTENT_SLASH, false, '/', 1 },
};
const uint8_t quoted_content_infos_length = sizeof(quoted_content_infos) / sizeof(QuotedContentInfo);
int8_t find_quoted_token_info(const bool* valid_symbols) {
// Quoted tokens are mutually exclusive and only one should be valid
// at a time. If multiple are valid it means we parse an arbitrary
// code outside quotes, in which case we don't want to tokenize it as
// quoted content.
if (valid_symbols[QUOTED_CONTENT_I_SINGLE] && valid_symbols[QUOTED_CONTENT_I_DOUBLE]) {
return -1;
}
for (uint8_t i = 0; i < quoted_content_infos_length; i++) {
if (valid_symbols[quoted_content_infos[i].token_type]) {
return i;
}
}
return -1;
}
bool scan_quoted_content(TSLexer* lexer, const QuotedContentInfo& info) {
lexer->result_symbol = info.token_type;
bool is_heredoc = (info.delimiter_length == 3);
for (bool has_content = false; true; has_content = true) {
bool newline = false;
if (is_newline(lexer->lookahead)) {
advance(lexer);
has_content = true;
newline = true;
while (is_whitespace(lexer->lookahead)) {
advance(lexer);
}
}
lexer->mark_end(lexer);
if (lexer->lookahead == info.end_delimiter) {
uint8_t length = 1;
while (length < info.delimiter_length) {
advance(lexer);
if (lexer->lookahead == info.end_delimiter) {
length++;
} else {
break;
}
}
if (length == info.delimiter_length && (!is_heredoc || newline)) {
return has_content;
}
} else {
if (lexer->lookahead == '#') {
advance(lexer);
if (info.supports_interpol && lexer->lookahead == '{') {
return has_content;
}
} else if (lexer->lookahead == '\\') {
advance(lexer);
if (is_heredoc && lexer->lookahead == '\n') {
// We need to know about the newline to correctly recognise
// heredoc end delimiter, so we intentionally ignore escaping
} else if (info.supports_interpol || lexer->lookahead == info.end_delimiter) {
return has_content;
}
} else if (lexer->lookahead == '\0') {
// If we reached the end of the file, this means there is no
// end delimiter, so the syntax is invalid. In that case we
// want to treat all the scanned content as quoted content.
return has_content;
} else {
advance(lexer);
}
}
}
return false;
}
bool scan_newline(TSLexer* lexer, const bool* valid_symbols) {
advance(lexer);
while (is_whitespace(lexer->lookahead)) {
advance(lexer);
}
// Note we include all the whitespace after newline, so that the
// parser doesn't have to go through it again
lexer->mark_end(lexer);
if (lexer->lookahead == '#') {
lexer->result_symbol = NEWLINE_BEFORE_COMMENT;
return true;
}
if (lexer->lookahead == 'd' && valid_symbols[NEWLINE_BEFORE_DO]) {
lexer->result_symbol = NEWLINE_BEFORE_DO;
advance(lexer);
if (lexer->lookahead == 'o') {
advance(lexer);
return is_token_end(lexer->lookahead);
}
return false;
}
if (valid_symbols[NEWLINE_BEFORE_BINARY_OPERATOR] ) {
lexer->result_symbol = NEWLINE_BEFORE_BINARY_OPERATOR;
// &&, &&&
if (lexer->lookahead == '&') {
advance(lexer);
if (lexer->lookahead == '&') {
advance(lexer);
if (lexer->lookahead == '&') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
}
// =, ==, ===, =~, =>
} else if (lexer->lookahead == '=') {
advance(lexer);
if (lexer->lookahead == '=') {
advance(lexer);
if (lexer->lookahead == '=') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
} else if (lexer->lookahead == '~') {
advance(lexer);
return check_operator_end(lexer);
} else if (lexer->lookahead == '>') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
// ::
} else if (lexer->lookahead == ':') {
advance(lexer);
if (lexer->lookahead == ':') {
advance(lexer);
// Ignore ::: atom
if (lexer->lookahead == ':') return false;
return check_operator_end(lexer);
}
// ++, +++
} else if (lexer->lookahead == '+') {
advance(lexer);
if (lexer->lookahead == '+') {
advance(lexer);
if (lexer->lookahead == '+') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
}
// --, ---, ->
} else if (lexer->lookahead == '-') {
advance(lexer);
if (lexer->lookahead == '-') {
advance(lexer);
if (lexer->lookahead == '-') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
} else if (lexer->lookahead == '>') {
advance(lexer);
return check_operator_end(lexer);
}
// <, <=, <-, <>, <~, <~>, <|>, <<<, <<~
} else if (lexer->lookahead == '<') {
advance(lexer);
if (lexer->lookahead == '=' ||
lexer->lookahead == '-' ||
lexer->lookahead == '>') {
advance(lexer);
return check_operator_end(lexer);
} else if (lexer->lookahead == '~') {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
} else if (lexer->lookahead == '|') {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
return check_operator_end(lexer);
}
} else if (lexer->lookahead == '<') {
advance(lexer);
if (lexer->lookahead == '<' ||
lexer->lookahead == '~') {
advance(lexer);
return check_operator_end(lexer);
}
} else {
return check_operator_end(lexer);
}
// >, >=, >>>
} else if (lexer->lookahead == '>') {
advance(lexer);
if (lexer->lookahead == '=') {
advance(lexer);
return check_operator_end(lexer);
} else if (lexer->lookahead == '>') {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
return check_operator_end(lexer);
}
} else {
return check_operator_end(lexer);
}
// ^^^
} else if (lexer->lookahead == '^') {
advance(lexer);
if (lexer->lookahead == '^') {
advance(lexer);
if (lexer->lookahead == '^') {
advance(lexer);
return check_operator_end(lexer);
}
}
// !=, !==
} else if (lexer->lookahead == '!') {
advance(lexer);
if (lexer->lookahead == '=') {
advance(lexer);
if (lexer->lookahead == '=') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
}
// ~>, ~>>
} else if (lexer->lookahead == '~') {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
}
// |, ||, |||, |>
} else if (lexer->lookahead == '|') {
advance(lexer);
if (lexer->lookahead == '|') {
advance(lexer);
if (lexer->lookahead == '|') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
} else if (lexer->lookahead == '>') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
// *, **
} else if (lexer->lookahead == '*') {
advance(lexer);
if (lexer->lookahead == '*') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
// / //
} else if (lexer->lookahead == '/') {
advance(lexer);
if (lexer->lookahead == '/') {
advance(lexer);
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
// ., ..
} else if (lexer->lookahead == '.') {
advance(lexer);
if (lexer->lookahead == '.') {
advance(lexer);
// Ignore ... identifier
if (lexer->lookahead == '.') return false;
return check_operator_end(lexer);
} else {
return check_operator_end(lexer);
}
// double slash
} else if (lexer->lookahead == '\\') {
advance(lexer);
if (lexer->lookahead == '\\') {
advance(lexer);
return check_operator_end(lexer);
}
} else if (lexer->lookahead == 'w') {
advance(lexer);
if (lexer->lookahead == 'h') {
advance(lexer);
if (lexer->lookahead == 'e') {
advance(lexer);
if (lexer->lookahead == 'n') {
advance(lexer);
return is_token_end(lexer->lookahead) && check_operator_end(lexer);
}
}
}
} else if (lexer->lookahead == 'a') {
advance(lexer);
if (lexer->lookahead == 'n') {
advance(lexer);
if (lexer->lookahead == 'd') {
advance(lexer);
return is_token_end(lexer->lookahead) && check_operator_end(lexer);
}
}
// or
} else if (lexer->lookahead == 'o') {
advance(lexer);
if (lexer->lookahead == 'r') {
advance(lexer);
return is_token_end(lexer->lookahead) && check_operator_end(lexer);
}
// in
} else if (lexer->lookahead == 'i') {
advance(lexer);
if (lexer->lookahead == 'n') {
advance(lexer);
return is_token_end(lexer->lookahead) && check_operator_end(lexer);
}
// not in
} else if (lexer->lookahead == 'n') {
advance(lexer);
if (lexer->lookahead == 'o') {
advance(lexer);
if (lexer->lookahead == 't') {
advance(lexer);
while (is_inline_whitespace(lexer->lookahead)) {
advance(lexer);
}
if (lexer->lookahead == 'i') {
advance(lexer);
if (lexer->lookahead == 'n') {
advance(lexer);
return is_token_end(lexer->lookahead) && check_operator_end(lexer);
}
}
}
}
}
}
return false;
}
bool scan(TSLexer* lexer, const bool* valid_symbols) {
int8_t quoted_content_info_idx = find_quoted_token_info(valid_symbols);
// Quoted content, which matches any character except for close
// delimiters, escapes and interpolations
if (quoted_content_info_idx != -1) {
const QuotedContentInfo& info = quoted_content_infos[quoted_content_info_idx];
return scan_quoted_content(lexer, info);
}
bool skipped_whitespace = false;
while (is_inline_whitespace(lexer->lookahead)) {
skipped_whitespace = true;
skip(lexer);
}
// Newline, which is either tokenized as a special newline or ignored
if (is_newline(lexer->lookahead) && (
valid_symbols[NEWLINE_BEFORE_DO] ||
valid_symbols[NEWLINE_BEFORE_BINARY_OPERATOR] ||
valid_symbols[NEWLINE_BEFORE_COMMENT])) {
return scan_newline(lexer, valid_symbols);
}
// before unary +
if (lexer->lookahead == '+') {
if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OPERATOR]) {
lexer->mark_end(lexer);
advance(lexer);
if (lexer->lookahead == '+' || lexer->lookahead == ':' || lexer->lookahead == '/') {
return false;
}
if (is_whitespace(lexer->lookahead)) {
return false;
}
lexer->result_symbol = BEFORE_UNARY_OPERATOR;
return true;
}
// before unary -
} else if (lexer->lookahead == '-') {
if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OPERATOR]) {
lexer->mark_end(lexer);
lexer->result_symbol = BEFORE_UNARY_OPERATOR;
advance(lexer);
if (lexer->lookahead == '-' || lexer->lookahead == '>' || lexer->lookahead == ':' || lexer->lookahead == '/') {
return false;
}
if (is_whitespace(lexer->lookahead)) {
return false;
}
return true;
}
// not in
} else if (lexer->lookahead == 'n') {
if (valid_symbols[NOT_IN]) {
lexer->result_symbol = NOT_IN;
advance(lexer);
if (lexer->lookahead == 'o') {
advance(lexer);
if (lexer->lookahead == 't') {
advance(lexer);
while (is_inline_whitespace(lexer->lookahead)) {
advance(lexer);
}
if (lexer->lookahead == 'i') {
advance(lexer);
if (lexer->lookahead == 'n') {
advance(lexer);
return is_token_end(lexer->lookahead);
}
}
}
}
}
// quoted atom start
} else if (lexer->lookahead == ':') {
if (valid_symbols[QUOTED_ATOM_START]) {
advance(lexer);
lexer->mark_end(lexer);
lexer->result_symbol = QUOTED_ATOM_START;
if (lexer->lookahead == '"' || lexer->lookahead == '\'') {
return true;
}
}
}
return false;
}
// Expose the API expected by tree-sitter
extern "C" {
void* tree_sitter_elixir_external_scanner_create() {
return NULL;
}
bool tree_sitter_elixir_external_scanner_scan(void* payload, TSLexer* lexer, const bool* valid_symbols) {
return scan(lexer, valid_symbols);
}
unsigned tree_sitter_elixir_external_scanner_serialize(void* payload, char* buffer) {
return 0;
}
void tree_sitter_elixir_external_scanner_deserialize(void* payload, const char* buffer, unsigned length) {}
void tree_sitter_elixir_external_scanner_destroy(void* payload) {}
}
// end anonymous namespace
}