// tree-sitter-elixir/src/scanner.cc (C++ source; 845 lines, 22 KiB)

#include <tree_sitter/parser.h>
namespace {
// Token types this external scanner can produce. The enumerators are used as
// indices into the valid_symbols array handed to scan().
// NOTE(review): the order presumably has to mirror the `externals` list of the
// grammar definition - confirm before reordering or inserting entries.
enum TokenType {
// All QUOTED_CONTENT_* tokens are expected to be mutually exclusive, i.e. the
// valid_symbols array contains at most one truthy entry among them
// (see quoted_token_type).
// TODO state this invariant next to the grammar rules as well
//
// Variants with the _I_ infix: quoted_is_interpol() returns true for them.
QUOTED_CONTENT_I_SINGLE,
QUOTED_CONTENT_I_DOUBLE,
QUOTED_CONTENT_I_HEREDOC_SINGLE,
QUOTED_CONTENT_I_HEREDOC_DOUBLE,
QUOTED_CONTENT_I_PARENTHESIS,
QUOTED_CONTENT_I_CURLY,
QUOTED_CONTENT_I_SQUARE,
QUOTED_CONTENT_I_ANGLE,
QUOTED_CONTENT_I_BAR,
QUOTED_CONTENT_I_SLASH,
// Variants without the _I_ infix: quoted_is_interpol() returns false for them.
QUOTED_CONTENT_SINGLE,
QUOTED_CONTENT_DOUBLE,
QUOTED_CONTENT_HEREDOC_SINGLE,
QUOTED_CONTENT_HEREDOC_DOUBLE,
QUOTED_CONTENT_PARENTHESIS,
QUOTED_CONTENT_CURLY,
QUOTED_CONTENT_SQUARE,
QUOTED_CONTENT_ANGLE,
QUOTED_CONTENT_BAR,
QUOTED_CONTENT_SLASH,
// Literal such as `...`, `..//`, `%`, `%{}`, `{}` or `<<>>` directly followed
// by ':' and whitespace (see finish_keyword).
KEYWORD_SPECIAL_LITERAL,
// The ':' that opens an atom (see finish_atom_start).
ATOM_START,
// A ':' followed by whitespace.
KEYWORD_END,
// Newline (plus following whitespace) that precedes `do`.
NEWLINE_BEFORE_DO,
// Newline (plus following whitespace) that precedes a binary operator.
NEWLINE_BEFORE_BINARY_OP,
// Newline (plus following whitespace) that precedes a '#'.
NEWLINE_BEFORE_COMMENT,
// Empty token emitted in front of a '+' or '-' that has whitespace before it
// but none after it.
BEFORE_UNARY_OP,
// The `not in` word operator; inline whitespace between the words is allowed.
NOT_IN
};
bool quoted_token_type(const bool* valid_symbols, TokenType& token_type) {
// Quoted symbols are mutually exclusive and only one should
// be valid at a time. If multiple are valid it means we parse
// an arbitrary code outside quotes, in which case we don't
// want to tokenize it as quoted content.
if (valid_symbols[QUOTED_CONTENT_I_SINGLE] && valid_symbols[QUOTED_CONTENT_I_DOUBLE]) {
return false;
}
if (valid_symbols[QUOTED_CONTENT_I_SINGLE]) {
token_type = QUOTED_CONTENT_I_SINGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_DOUBLE]) {
token_type = QUOTED_CONTENT_I_DOUBLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_HEREDOC_SINGLE]) {
token_type = QUOTED_CONTENT_I_HEREDOC_SINGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_HEREDOC_DOUBLE]) {
token_type = QUOTED_CONTENT_I_HEREDOC_DOUBLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_PARENTHESIS]) {
token_type = QUOTED_CONTENT_I_PARENTHESIS;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_CURLY]) {
token_type = QUOTED_CONTENT_I_CURLY;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_SQUARE]) {
token_type = QUOTED_CONTENT_I_SQUARE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_ANGLE]) {
token_type = QUOTED_CONTENT_I_ANGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_BAR]) {
token_type = QUOTED_CONTENT_I_BAR;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_SLASH]) {
token_type = QUOTED_CONTENT_I_SLASH;
return true;
}
if (valid_symbols[QUOTED_CONTENT_SINGLE]) {
token_type = QUOTED_CONTENT_SINGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_DOUBLE]) {
token_type = QUOTED_CONTENT_DOUBLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_HEREDOC_SINGLE]) {
token_type = QUOTED_CONTENT_HEREDOC_SINGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_HEREDOC_DOUBLE]) {
token_type = QUOTED_CONTENT_HEREDOC_DOUBLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_PARENTHESIS]) {
token_type = QUOTED_CONTENT_PARENTHESIS;
return true;
}
if (valid_symbols[QUOTED_CONTENT_CURLY]) {
token_type = QUOTED_CONTENT_CURLY;
return true;
}
if (valid_symbols[QUOTED_CONTENT_SQUARE]) {
token_type = QUOTED_CONTENT_SQUARE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_ANGLE]) {
token_type = QUOTED_CONTENT_ANGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_BAR]) {
token_type = QUOTED_CONTENT_BAR;
return true;
}
if (valid_symbols[QUOTED_CONTENT_SLASH]) {
token_type = QUOTED_CONTENT_SLASH;
return true;
}
return false;
}
// Returns the character that closes quoted content of the given kind. For
// heredoc kinds this is the single character that is repeated
// quoted_delimiter_length() times.
//
// Returns 0 for token types that are not quoted content. scan() only passes
// types obtained from quoted_token_type(), so that path is not expected to be
// taken; it is a defined fallback in place of a compiler-specific
// "unreachable" builtin, which is not available on every toolchain.
int32_t quoted_end_delimiter(TokenType token_type) {
  switch (token_type) {
    case QUOTED_CONTENT_I_SINGLE:
    case QUOTED_CONTENT_SINGLE:
    case QUOTED_CONTENT_I_HEREDOC_SINGLE:
    case QUOTED_CONTENT_HEREDOC_SINGLE:
      return '\'';
    case QUOTED_CONTENT_I_DOUBLE:
    case QUOTED_CONTENT_DOUBLE:
    case QUOTED_CONTENT_I_HEREDOC_DOUBLE:
    case QUOTED_CONTENT_HEREDOC_DOUBLE:
      return '\"';
    case QUOTED_CONTENT_I_PARENTHESIS:
    case QUOTED_CONTENT_PARENTHESIS:
      return ')';
    case QUOTED_CONTENT_I_CURLY:
    case QUOTED_CONTENT_CURLY:
      return '}';
    case QUOTED_CONTENT_I_SQUARE:
    case QUOTED_CONTENT_SQUARE:
      return ']';
    case QUOTED_CONTENT_I_ANGLE:
    case QUOTED_CONTENT_ANGLE:
      return '>';
    case QUOTED_CONTENT_I_BAR:
    case QUOTED_CONTENT_BAR:
      return '|';
    case QUOTED_CONTENT_I_SLASH:
    case QUOTED_CONTENT_SLASH:
      return '/';
    default:
      // Not a quoted-content token.
      return 0;
  }
}
// Number of consecutive end-delimiter characters that terminate the quoted
// content: 3 for the heredoc kinds, 1 for every other token type.
uint8_t quoted_delimiter_length(TokenType token_type) {
  const bool is_heredoc =
    token_type == QUOTED_CONTENT_I_HEREDOC_SINGLE ||
    token_type == QUOTED_CONTENT_I_HEREDOC_DOUBLE ||
    token_type == QUOTED_CONTENT_HEREDOC_SINGLE ||
    token_type == QUOTED_CONTENT_HEREDOC_DOUBLE;

  if (is_heredoc) {
    return 3;
  }
  return 1;
}
// Tells whether the given token type is one of the QUOTED_CONTENT_I_* kinds,
// i.e. quoted content in which scan() stops at "#{" and at backslashes.
// Every other token type yields false.
bool quoted_is_interpol(TokenType token_type) {
  return token_type == QUOTED_CONTENT_I_SINGLE ||
         token_type == QUOTED_CONTENT_I_DOUBLE ||
         token_type == QUOTED_CONTENT_I_HEREDOC_SINGLE ||
         token_type == QUOTED_CONTENT_I_HEREDOC_DOUBLE ||
         token_type == QUOTED_CONTENT_I_PARENTHESIS ||
         token_type == QUOTED_CONTENT_I_CURLY ||
         token_type == QUOTED_CONTENT_I_SQUARE ||
         token_type == QUOTED_CONTENT_I_ANGLE ||
         token_type == QUOTED_CONTENT_I_BAR ||
         token_type == QUOTED_CONTENT_I_SLASH;
}
// True for space, horizontal/vertical tab, line feed, form feed and carriage
// return; false for every other code point.
bool is_whitespace(int32_t c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\v':
    case '\n':
    case '\f':
    case '\r':
      return true;
    default:
      return false;
  }
}
// True for whitespace that does not end a line: space, horizontal tab and
// vertical tab.
bool is_inline_whitespace(int32_t c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\v':
      return true;
    default:
      return false;
  }
}
// True only for the line feed character.
// TODO decide how form feed (\f) and carriage return (\r) should be treated;
// for now neither counts as a newline.
bool is_newline(int32_t c) {
  const int32_t line_feed = '\n';
  return line_feed == c;
}
// Moves the lexer to the next character via its advance callback, passing
// false as the second (skip) argument.
void advance(TSLexer* lexer) {
  const bool skip_character = false;
  lexer->advance(lexer, skip_character);
}
// Moves the lexer to the next character via its advance callback, passing
// true as the second (skip) argument.
void skip(TSLexer *lexer) {
  const bool skip_character = true;
  lexer->advance(lexer, skip_character);
}
// Completes recognition of ATOM_START. The caller has already advanced past
// the leading ':'; the token end is marked right here, so anything advanced
// over below is only inspected.
//
// Returns true when the ':' is followed by a non-whitespace character other
// than ':', or when it is the first character of ":::"; returns false for
// ':' followed by whitespace and for "::" not followed by a third ':'.
bool finish_atom_start(TSLexer* lexer) {
  lexer->mark_end(lexer);
  lexer->result_symbol = ATOM_START;

  if (lexer->lookahead != ':') {
    return !is_whitespace(lexer->lookahead);
  }

  // Two colons so far: only a third one (":::") makes this an atom.
  advance(lexer);
  return lexer->lookahead == ':';
}
// Checks whether the lexer sits on a ':' that is followed by whitespace.
// The ':' is advanced over when present; nothing is consumed otherwise.
bool is_keyword_end(TSLexer* lexer) {
  if (lexer->lookahead != ':') {
    return false;
  }
  advance(lexer);
  return is_whitespace(lexer->lookahead);
}
// Completes recognition of KEYWORD_SPECIAL_LITERAL: marks the token end at the
// current position, sets the result symbol and reports whether the literal is
// followed by ':' and whitespace.
bool finish_keyword(TSLexer* lexer) {
  lexer->mark_end(lexer);
  lexer->result_symbol = KEYWORD_SPECIAL_LITERAL;
  const bool followed_by_keyword_end = is_keyword_end(lexer);
  return followed_by_keyword_end;
}
// True for the ASCII decimal digits '0' through '9'.
bool is_digit(int32_t c) {
  if (c < '0') {
    return false;
  }
  return c <= '9';
}
// Called right after the characters of an operator have been advanced over;
// decides whether they really are used as an operator.
//
// Returns false in two situations:
//  - the operator is followed by ':' and whitespace (a keyword),
//  - after optional inline whitespace comes '/', optional whitespace and a
//    digit (an operator identifier with arity).
// Returns true otherwise. The lexer may be advanced while checking.
bool is_operator_end(TSLexer* lexer) {
  // Keyword
  if (lexer->lookahead == ':') {
    return !is_keyword_end(lexer);
  }

  while (is_inline_whitespace(lexer->lookahead)) {
    advance(lexer);
  }

  if (lexer->lookahead != '/') {
    return true;
  }

  // Operator identifier with arity
  advance(lexer);
  while (is_whitespace(lexer->lookahead)) {
    advance(lexer);
  }
  return !is_digit(lexer->lookahead);
}
// Characters that may directly follow a word token and thereby end it.
// Each character is listed exactly once.
const char TOKEN_TERMINATORS[] = {
  // Operator starts
  '@', '.', '+', '-', '^', '*', '/', '<', '>', '|', '~', '=', '&', '\\', '%',
  // Delimiters
  '{', '}', '[', ']', '(', ')', '"', '\'',
  // Separators
  ',', ';',
  // Comment
  '#'
};
// Reports whether `c` ends a word token: either it is whitespace or it is one
// of TOKEN_TERMINATORS.
//
// Note: this is a heuristic. It is only used to tell word operators apart from
// longer identifiers, and complex Unicode ranges are deliberately left out.
bool is_token_end(int32_t c) {
  if (is_whitespace(c)) {
    return true;
  }
  for (const char terminator : TOKEN_TERMINATORS) {
    if (c == terminator) {
      return true;
    }
  }
  return false;
}
// Tries to recognise one external token at the current lexer position.
//
// `valid_symbols` is indexed by TokenType and tells which tokens the parser
// can accept here. On success `lexer->result_symbol` holds the recognised
// token and true is returned; false means no external token was found.
//
// The checks run in this order:
//   1. quoted content (when exactly one QUOTED_CONTENT_* kind is expected),
//   2. ':' as KEYWORD_END or ATOM_START,
//   3. after skipping inline whitespace: BEFORE_UNARY_OP, NOT_IN,
//   4. a newline followed by a comment, `do` or a binary operator,
//   5. special literals used as keywords and a ':' preceded by whitespace.
//
// Whenever lexer->mark_end() has been called, characters advanced over
// afterwards are only looked at and do not become part of the token.
bool scan(TSLexer* lexer, const bool* valid_symbols) {
  TokenType token_type;
  bool is_quoted_symbol = quoted_token_type(valid_symbols, token_type);

  // Quoted content, which matches any character except for close
  // delimiters, escapes and interpolations
  if (is_quoted_symbol) {
    // TODO naming
    // TODO move all of this into a separate function like scan_quoted_content
    int32_t end_delimiter = quoted_end_delimiter(token_type);
    bool supports_interpol = quoted_is_interpol(token_type);
    uint8_t delimiter_length = quoted_delimiter_length(token_type);

    lexer->result_symbol = token_type;

    // has_content becomes true once at least one iteration has completed, so
    // an empty run of content is never reported as a token.
    for (bool has_content = false; true; has_content = true) {
      // Everything consumed so far belongs to the token; what we look at
      // next may turn out to be a terminator.
      lexer->mark_end(lexer);

      if (lexer->lookahead == end_delimiter) {
        // Count consecutive delimiter characters (3 are needed for heredocs).
        uint8_t length = 1;

        while (length < delimiter_length) {
          advance(lexer);
          if (lexer->lookahead == end_delimiter) {
            length++;
          } else {
            break;
          }
        }

        if (length == delimiter_length) {
          return has_content;
        }
        // Too few delimiter characters: they are ordinary content.
      } else {
        switch (lexer->lookahead) {
          case '#':
            advance(lexer);
            // "#{" starts an interpolation; a lone '#' is content.
            if (supports_interpol && lexer->lookahead == '{') {
              return has_content;
            }
            break;
          case '\\':
            if (supports_interpol) {
              // Stop in front of the backslash.
              return has_content;
            } else {
              advance(lexer);
              // Without interpolation only an escaped end delimiter stops
              // the content; any other backslash is content.
              if (lexer->lookahead == end_delimiter) {
                return has_content;
              }
            }
            break;
          case '\0':
            // A zero lookahead (presumably end of input - confirm) before
            // the closing delimiter: no token.
            return false;
          default:
            advance(lexer);
        }
      }
    }

    return false;
  }

  // ':' directly at the current position: keyword end or atom start.
  if (lexer->lookahead == ':') {
    if (valid_symbols[ATOM_START] || valid_symbols[KEYWORD_END]) {
      advance(lexer);

      if (is_whitespace(lexer->lookahead)) {
        if (valid_symbols[KEYWORD_END]) {
          lexer->result_symbol = KEYWORD_END;
          return true;
        }
      } else {
        if (valid_symbols[ATOM_START]) {
          return finish_atom_start(lexer);
        }
      }

      return false;
    }
  }

  bool skipped_whitespace = false;

  while (is_inline_whitespace(lexer->lookahead)) {
    skipped_whitespace = true;
    skip(lexer);
  }

  // TODO moves this below together with other functions on this level
  //
  // '+' / '-' with whitespace before and none after. mark_end() is called
  // before the sign is advanced over, so BEFORE_UNARY_OP is an empty token
  // placed in front of the sign.
  if (lexer->lookahead == '+') {
    if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OP]) {
      lexer->mark_end(lexer);
      advance(lexer);
      // "++", "+:" and "+/" are rejected.
      if (lexer->lookahead == '+' || lexer->lookahead == ':' || lexer->lookahead == '/') {
        return false;
      }
      if (is_whitespace(lexer->lookahead)) {
        return false;
      }
      lexer->result_symbol = BEFORE_UNARY_OP;
      return true;
    }
  }

  if (lexer->lookahead == '-') {
    if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OP]) {
      lexer->mark_end(lexer);
      advance(lexer);
      // "--", "->", "-:" and "-/" are rejected.
      if (lexer->lookahead == '-' || lexer->lookahead == '>' || lexer->lookahead == ':' || lexer->lookahead == '/') {
        return false;
      }
      if (is_whitespace(lexer->lookahead)) {
        return false;
      }
      lexer->result_symbol = BEFORE_UNARY_OP;
      return true;
    }
  }

  // not in
  if (lexer->lookahead == 'n') {
    // Only produce NOT_IN where the parser can accept it. None of the checks
    // further down match a leading 'n', so returning here is equivalent to
    // falling through.
    if (!valid_symbols[NOT_IN]) {
      return false;
    }

    lexer->result_symbol = NOT_IN;
    advance(lexer);
    if (lexer->lookahead == 'o') {
      advance(lexer);
      if (lexer->lookahead == 't') {
        advance(lexer);
        while (is_inline_whitespace(lexer->lookahead)) {
          advance(lexer);
        }
        if (lexer->lookahead == 'i') {
          advance(lexer);
          if (lexer->lookahead == 'n') {
            advance(lexer);
            return is_token_end(lexer->lookahead);
          }
        }
      }
    }
    return false;
  }

  // TODO can be a separate function
  if (is_newline(lexer->lookahead) && (
      valid_symbols[NEWLINE_BEFORE_DO] ||
      valid_symbols[NEWLINE_BEFORE_BINARY_OP] ||
      valid_symbols[NEWLINE_BEFORE_COMMENT])) {
    advance(lexer);

    while (is_whitespace(lexer->lookahead)) {
      advance(lexer);
    }

    // Note we include all the whitespace after newline, so that the
    // parser doesn't have to go through it again
    lexer->mark_end(lexer);

    // NOTE(review): this is emitted whenever any of the three newline tokens
    // is valid, not only when NEWLINE_BEFORE_COMMENT itself is - confirm
    // that this is intended.
    if (lexer->lookahead == '#') {
      lexer->result_symbol = NEWLINE_BEFORE_COMMENT;
      return true;
    }

    if (valid_symbols[NEWLINE_BEFORE_DO] && lexer->lookahead == 'd') {
      lexer->result_symbol = NEWLINE_BEFORE_DO;
      advance(lexer);
      if (lexer->lookahead == 'o') {
        advance(lexer);
        return is_token_end(lexer->lookahead);
      }
      return false;
    }

    if (valid_symbols[NEWLINE_BEFORE_BINARY_OP]) {
      lexer->result_symbol = NEWLINE_BEFORE_BINARY_OP;

      // Each branch advances over one candidate operator and lets
      // is_operator_end() decide; a branch that does not return falls
      // through to the final `return false` of this block.

      // &&, &&&
      if (lexer->lookahead == '&') {
        advance(lexer);
        if (lexer->lookahead == '&') {
          advance(lexer);
          if (lexer->lookahead == '&') {
            advance(lexer);
          }
          return is_operator_end(lexer);
        }
      // =, ==, ===, =~, =>
      } else if (lexer->lookahead == '=') {
        advance(lexer);
        if (lexer->lookahead == '=') {
          advance(lexer);
          if (lexer->lookahead == '=') {
            advance(lexer);
          }
        } else if (lexer->lookahead == '~' || lexer->lookahead == '>') {
          advance(lexer);
        }
        return is_operator_end(lexer);
      // ::
      } else if (lexer->lookahead == ':') {
        advance(lexer);
        if (lexer->lookahead == ':') {
          advance(lexer);
          // Ignore ::: atom
          if (lexer->lookahead == ':') return false;
          return is_operator_end(lexer);
        }
      // ++, +++
      } else if (lexer->lookahead == '+') {
        advance(lexer);
        if (lexer->lookahead == '+') {
          advance(lexer);
          if (lexer->lookahead == '+') {
            advance(lexer);
          }
          return is_operator_end(lexer);
        }
      // --, ---, ->
      } else if (lexer->lookahead == '-') {
        advance(lexer);
        if (lexer->lookahead == '-') {
          advance(lexer);
          if (lexer->lookahead == '-') {
            advance(lexer);
          }
          return is_operator_end(lexer);
        } else if (lexer->lookahead == '>') {
          advance(lexer);
          return is_operator_end(lexer);
        }
      // <, <=, <-, <>, <~, <~>, <|>, <<<, <<~
      } else if (lexer->lookahead == '<') {
        advance(lexer);
        if (lexer->lookahead == '=' ||
            lexer->lookahead == '-' ||
            lexer->lookahead == '>') {
          advance(lexer);
          return is_operator_end(lexer);
        } else if (lexer->lookahead == '~') {
          advance(lexer);
          if (lexer->lookahead == '>') {
            advance(lexer);
          }
          return is_operator_end(lexer);
        } else if (lexer->lookahead == '|') {
          advance(lexer);
          if (lexer->lookahead == '>') {
            advance(lexer);
            return is_operator_end(lexer);
          }
        } else if (lexer->lookahead == '<') {
          advance(lexer);
          if (lexer->lookahead == '<' ||
              lexer->lookahead == '~') {
            advance(lexer);
            return is_operator_end(lexer);
          }
        } else {
          return is_operator_end(lexer);
        }
      // >, >=, >>>
      } else if (lexer->lookahead == '>') {
        advance(lexer);
        if (lexer->lookahead == '=') {
          advance(lexer);
          return is_operator_end(lexer);
        } else if (lexer->lookahead == '>') {
          advance(lexer);
          if (lexer->lookahead == '>') {
            advance(lexer);
            return is_operator_end(lexer);
          }
        } else {
          return is_operator_end(lexer);
        }
      // ^^^
      } else if (lexer->lookahead == '^') {
        advance(lexer);
        if (lexer->lookahead == '^') {
          advance(lexer);
          if (lexer->lookahead == '^') {
            advance(lexer);
            return is_operator_end(lexer);
          }
        }
      // !=, !==
      } else if (lexer->lookahead == '!') {
        advance(lexer);
        if (lexer->lookahead == '=') {
          advance(lexer);
          if (lexer->lookahead == '=') {
            advance(lexer);
          }
          return is_operator_end(lexer);
        }
      // ~>, ~>>
      } else if (lexer->lookahead == '~') {
        advance(lexer);
        if (lexer->lookahead == '>') {
          advance(lexer);
          if (lexer->lookahead == '>') {
            advance(lexer);
          }
          return is_operator_end(lexer);
        }
      // |, ||, |||, |>
      } else if (lexer->lookahead == '|') {
        advance(lexer);
        if (lexer->lookahead == '|') {
          advance(lexer);
          if (lexer->lookahead == '|') {
            advance(lexer);
          }
        } else if (lexer->lookahead == '>') {
          advance(lexer);
        }
        return is_operator_end(lexer);
      // *, **
      } else if (lexer->lookahead == '*') {
        advance(lexer);
        if (lexer->lookahead == '*') {
          advance(lexer);
        }
        return is_operator_end(lexer);
      // / //
      } else if (lexer->lookahead == '/') {
        advance(lexer);
        if (lexer->lookahead == '/') {
          advance(lexer);
        }
        return is_operator_end(lexer);
      // ., ..
      } else if (lexer->lookahead == '.') {
        advance(lexer);
        if (lexer->lookahead == '.') {
          advance(lexer);
          // Ignore ... identifier
          if (lexer->lookahead == '.') return false;
        }
        return is_operator_end(lexer);
      // double backslash
      } else if (lexer->lookahead == '\\') {
        advance(lexer);
        if (lexer->lookahead == '\\') {
          advance(lexer);
          return is_operator_end(lexer);
        }
      // when
      } else if (lexer->lookahead == 'w') {
        advance(lexer);
        if (lexer->lookahead == 'h') {
          advance(lexer);
          if (lexer->lookahead == 'e') {
            advance(lexer);
            if (lexer->lookahead == 'n') {
              advance(lexer);
              return is_token_end(lexer->lookahead) && is_operator_end(lexer);
            }
          }
        }
      // and
      } else if (lexer->lookahead == 'a') {
        advance(lexer);
        if (lexer->lookahead == 'n') {
          advance(lexer);
          if (lexer->lookahead == 'd') {
            advance(lexer);
            return is_token_end(lexer->lookahead) && is_operator_end(lexer);
          }
        }
      // or
      } else if (lexer->lookahead == 'o') {
        advance(lexer);
        if (lexer->lookahead == 'r') {
          advance(lexer);
          return is_token_end(lexer->lookahead) && is_operator_end(lexer);
        }
      // in
      } else if (lexer->lookahead == 'i') {
        advance(lexer);
        if (lexer->lookahead == 'n') {
          advance(lexer);
          return is_token_end(lexer->lookahead) && is_operator_end(lexer);
        }
      // not in
      } else if (lexer->lookahead == 'n') {
        advance(lexer);
        if (lexer->lookahead == 'o') {
          advance(lexer);
          if (lexer->lookahead == 't') {
            advance(lexer);
            while (is_inline_whitespace(lexer->lookahead)) {
              advance(lexer);
            }
            if (lexer->lookahead == 'i') {
              advance(lexer);
              if (lexer->lookahead == 'n') {
                advance(lexer);
                return is_token_end(lexer->lookahead) && is_operator_end(lexer);
              }
            }
          }
        }
      }
    }

    return false;
  }

  // Special literals that count as keywords when followed by ':' and
  // whitespace (checked by finish_keyword).

  // ... ..//
  if (lexer->lookahead == '.') {
    if (valid_symbols[KEYWORD_SPECIAL_LITERAL]) {
      advance(lexer);
      if (lexer->lookahead == '.') {
        advance(lexer);
        if (lexer->lookahead == '.') {
          advance(lexer);
          return finish_keyword(lexer);
        } else if (lexer->lookahead == '/') {
          advance(lexer);
          if (lexer->lookahead == '/') {
            advance(lexer);
            return finish_keyword(lexer);
          }
        }
      }
    }
  // % %{}
  } else if (lexer->lookahead == '%') {
    if (valid_symbols[KEYWORD_SPECIAL_LITERAL]) {
      advance(lexer);
      if (lexer->lookahead == '{') {
        advance(lexer);
        if (lexer->lookahead == '}') {
          advance(lexer);
          return finish_keyword(lexer);
        }
      } else {
        return finish_keyword(lexer);
      }
    }
  // {}
  } else if (lexer->lookahead == '{') {
    if (valid_symbols[KEYWORD_SPECIAL_LITERAL]) {
      advance(lexer);
      if (lexer->lookahead == '}') {
        advance(lexer);
        return finish_keyword(lexer);
      }
    }
  // <<>>
  } else if (lexer->lookahead == '<') {
    if (valid_symbols[KEYWORD_SPECIAL_LITERAL]) {
      advance(lexer);
      if (lexer->lookahead == '<') {
        advance(lexer);
        if (lexer->lookahead == '>') {
          advance(lexer);
          if (lexer->lookahead == '>') {
            advance(lexer);
            return finish_keyword(lexer);
          }
        }
      }
    }
  // atom start (reached when the ':' was preceded by inline whitespace)
  } else if (lexer->lookahead == ':') {
    if (valid_symbols[ATOM_START]) {
      advance(lexer);
      return finish_atom_start(lexer);
    }
  }

  return false;
}
// Entry points with C linkage that tree-sitter expects from an external
// scanner. The payload is always null and is never used: create() returns
// nullptr, serialize() writes nothing, and deserialize()/destroy() do nothing.
extern "C" {

// Returns the scanner payload; always nullptr.
void* tree_sitter_elixir_external_scanner_create() {
  return nullptr;
}

// Forwards to scan(); `payload` is ignored.
bool tree_sitter_elixir_external_scanner_scan(void* payload, TSLexer* lexer, const bool* valid_symbols) {
  (void)payload;
  return scan(lexer, valid_symbols);
}

// Writes nothing to `buffer` and reports zero bytes.
unsigned tree_sitter_elixir_external_scanner_serialize(void* payload, char* buffer) {
  (void)payload;
  (void)buffer;
  return 0;
}

// Ignores its arguments and does nothing.
void tree_sitter_elixir_external_scanner_deserialize(void* payload, const char* buffer, unsigned length) {
  (void)payload;
  (void)buffer;
  (void)length;
}

// Does nothing.
void tree_sitter_elixir_external_scanner_destroy(void* payload) {
  (void)payload;
}

}
}