#include namespace { // See references in grammar.externals enum TokenType { QUOTED_CONTENT_I_SINGLE, QUOTED_CONTENT_I_DOUBLE, QUOTED_CONTENT_I_HEREDOC_SINGLE, QUOTED_CONTENT_I_HEREDOC_DOUBLE, QUOTED_CONTENT_I_PARENTHESIS, QUOTED_CONTENT_I_CURLY, QUOTED_CONTENT_I_SQUARE, QUOTED_CONTENT_I_ANGLE, QUOTED_CONTENT_I_BAR, QUOTED_CONTENT_I_SLASH, QUOTED_CONTENT_SINGLE, QUOTED_CONTENT_DOUBLE, QUOTED_CONTENT_HEREDOC_SINGLE, QUOTED_CONTENT_HEREDOC_DOUBLE, QUOTED_CONTENT_PARENTHESIS, QUOTED_CONTENT_CURLY, QUOTED_CONTENT_SQUARE, QUOTED_CONTENT_ANGLE, QUOTED_CONTENT_BAR, QUOTED_CONTENT_SLASH, NEWLINE_BEFORE_DO, NEWLINE_BEFORE_BINARY_OPERATOR, NEWLINE_BEFORE_COMMENT, BEFORE_UNARY_OPERATOR, NOT_IN, QUOTED_ATOM_START }; void advance(TSLexer* lexer) { lexer->advance(lexer, false); } void skip(TSLexer *lexer) { lexer->advance(lexer, true); } // Note: some checks require several lexer steps of lookahead // and alter its state, for these we use names check_* bool is_whitespace(int32_t c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } bool is_inline_whitespace(int32_t c) { return c == ' ' || c == '\t'; } bool is_newline(int32_t c) { // Note: this implies \r\n is treated as two line breaks, // but in our case it's fine, since multiple line breaks // make no difference return c == '\n' || c == '\r'; } bool is_digit(int32_t c) { return '0' <= c && c <= '9'; } bool check_keyword_end(TSLexer* lexer) { if (lexer->lookahead == ':') { advance(lexer); return is_whitespace(lexer->lookahead); } return false; } bool check_operator_end(TSLexer* lexer) { // Keyword if (lexer->lookahead == ':') { return !check_keyword_end(lexer); } while (is_inline_whitespace(lexer->lookahead)) { advance(lexer); } // Operator identifier with arity if (lexer->lookahead == '/') { advance(lexer); while (is_whitespace(lexer->lookahead)) { advance(lexer); } if (is_digit(lexer->lookahead)) { return false; } } return true; } const char token_terminators[] = { // Operator starts '@', '.', '+', '-', '^', '-', '*', '/', '<', '>', '|', '~', '=', '&', '\\', '%', // Delimiters '{', '}', '[', ']', '(', ')', '"', '\'', // Separators ',', ';', // Comment '#' }; const uint8_t token_terminators_length = sizeof(token_terminators) / sizeof(char); // Note: this is a heuristic as we only use this to distinguish word // operators and we don't want to include complex Unicode ranges bool is_token_end(int32_t c) { for (uint8_t i = 0; i < token_terminators_length; i++) { if (c == token_terminators[i]) { return true; } } return is_whitespace(c); } struct QuotedContentInfo { const TokenType token_type; const bool supports_interpol; const int32_t end_delimiter; const uint8_t delimiter_length; }; const QuotedContentInfo quoted_content_infos[] = { { QUOTED_CONTENT_I_SINGLE, true, '\'', 1 }, { QUOTED_CONTENT_I_DOUBLE, true, '"', 1 }, { QUOTED_CONTENT_I_HEREDOC_SINGLE, true, '\'', 3 }, { QUOTED_CONTENT_I_HEREDOC_DOUBLE, true, '"', 3 }, { QUOTED_CONTENT_I_PARENTHESIS, true, ')', 1 }, { QUOTED_CONTENT_I_CURLY, true, '}', 1 }, { QUOTED_CONTENT_I_SQUARE, true, ']', 1 }, { QUOTED_CONTENT_I_ANGLE, true, '>', 1 }, { QUOTED_CONTENT_I_BAR, true, '|', 1 }, { QUOTED_CONTENT_I_SLASH, true, '/', 1 }, { QUOTED_CONTENT_SINGLE, false, '\'', 1 }, { QUOTED_CONTENT_DOUBLE, false, '"', 1 }, { QUOTED_CONTENT_HEREDOC_SINGLE, false, '\'', 3 }, { QUOTED_CONTENT_HEREDOC_DOUBLE, false, '"', 3 }, { QUOTED_CONTENT_PARENTHESIS, false, ')', 1 }, { QUOTED_CONTENT_CURLY, false, '}', 1 }, { QUOTED_CONTENT_SQUARE, false, ']', 1 }, { QUOTED_CONTENT_ANGLE, false, '>', 1 }, { QUOTED_CONTENT_BAR, false, '|', 1 }, { QUOTED_CONTENT_SLASH, false, '/', 1 }, }; const uint8_t quoted_content_infos_length = sizeof(quoted_content_infos) / sizeof(QuotedContentInfo); int8_t find_quoted_token_info(const bool* valid_symbols) { // Quoted tokens are mutually exclusive and only one should be valid // at a time. If multiple are valid it means we parse an arbitrary // code outside quotes, in which case we don't want to tokenize it as // quoted content. if (valid_symbols[QUOTED_CONTENT_I_SINGLE] && valid_symbols[QUOTED_CONTENT_I_DOUBLE]) { return -1; } for (uint8_t i = 0; i < quoted_content_infos_length; i++) { if (valid_symbols[quoted_content_infos[i].token_type]) { return i; } } return -1; } bool scan_quoted_content(TSLexer* lexer, const QuotedContentInfo& info) { lexer->result_symbol = info.token_type; bool is_heredoc = (info.delimiter_length == 3); for (bool has_content = false; true; has_content = true) { bool newline = false; if (is_newline(lexer->lookahead)) { advance(lexer); has_content = true; newline = true; while (is_whitespace(lexer->lookahead)) { advance(lexer); } } lexer->mark_end(lexer); if (lexer->lookahead == info.end_delimiter) { uint8_t length = 1; while (length < info.delimiter_length) { advance(lexer); if (lexer->lookahead == info.end_delimiter) { length++; } else { break; } } if (length == info.delimiter_length && (!is_heredoc || newline)) { return has_content; } } else { if (lexer->lookahead == '#') { advance(lexer); if (info.supports_interpol && lexer->lookahead == '{') { return has_content; } } else if (lexer->lookahead == '\\') { advance(lexer); if (is_heredoc && lexer->lookahead == '\n') { // We need to know about the newline to correctly recognise // heredoc end delimiter, so we intentionally ignore escaping } else if (info.supports_interpol || lexer->lookahead == info.end_delimiter) { return has_content; } } else if (lexer->lookahead == '\0') { // If we reached the end of the file, this means there is no // end delimiter, so the syntax is invalid. In that case we // want to treat all the scanned content as quoted content. return has_content; } else { advance(lexer); } } } return false; } bool scan_newline(TSLexer* lexer, const bool* valid_symbols) { advance(lexer); while (is_whitespace(lexer->lookahead)) { advance(lexer); } // Note we include all the whitespace after newline, so that the // parser doesn't have to go through it again lexer->mark_end(lexer); if (lexer->lookahead == '#') { lexer->result_symbol = NEWLINE_BEFORE_COMMENT; return true; } if (lexer->lookahead == 'd' && valid_symbols[NEWLINE_BEFORE_DO]) { lexer->result_symbol = NEWLINE_BEFORE_DO; advance(lexer); if (lexer->lookahead == 'o') { advance(lexer); return is_token_end(lexer->lookahead); } return false; } if (valid_symbols[NEWLINE_BEFORE_BINARY_OPERATOR] ) { lexer->result_symbol = NEWLINE_BEFORE_BINARY_OPERATOR; // &&, &&& if (lexer->lookahead == '&') { advance(lexer); if (lexer->lookahead == '&') { advance(lexer); if (lexer->lookahead == '&') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } } // =, ==, ===, =~, => } else if (lexer->lookahead == '=') { advance(lexer); if (lexer->lookahead == '=') { advance(lexer); if (lexer->lookahead == '=') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } } else if (lexer->lookahead == '~') { advance(lexer); return check_operator_end(lexer); } else if (lexer->lookahead == '>') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } // :: } else if (lexer->lookahead == ':') { advance(lexer); if (lexer->lookahead == ':') { advance(lexer); // Ignore ::: atom if (lexer->lookahead == ':') return false; return check_operator_end(lexer); } // ++, +++ } else if (lexer->lookahead == '+') { advance(lexer); if (lexer->lookahead == '+') { advance(lexer); if (lexer->lookahead == '+') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } } // --, ---, -> } else if (lexer->lookahead == '-') { advance(lexer); if (lexer->lookahead == '-') { advance(lexer); if (lexer->lookahead == '-') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } } else if (lexer->lookahead == '>') { advance(lexer); return check_operator_end(lexer); } // <, <=, <-, <>, <~, <~>, <|>, <<<, <<~ } else if (lexer->lookahead == '<') { advance(lexer); if (lexer->lookahead == '=' || lexer->lookahead == '-' || lexer->lookahead == '>') { advance(lexer); return check_operator_end(lexer); } else if (lexer->lookahead == '~') { advance(lexer); if (lexer->lookahead == '>') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } } else if (lexer->lookahead == '|') { advance(lexer); if (lexer->lookahead == '>') { advance(lexer); return check_operator_end(lexer); } } else if (lexer->lookahead == '<') { advance(lexer); if (lexer->lookahead == '<' || lexer->lookahead == '~') { advance(lexer); return check_operator_end(lexer); } } else { return check_operator_end(lexer); } // >, >=, >>> } else if (lexer->lookahead == '>') { advance(lexer); if (lexer->lookahead == '=') { advance(lexer); return check_operator_end(lexer); } else if (lexer->lookahead == '>') { advance(lexer); if (lexer->lookahead == '>') { advance(lexer); return check_operator_end(lexer); } } else { return check_operator_end(lexer); } // ^^^ } else if (lexer->lookahead == '^') { advance(lexer); if (lexer->lookahead == '^') { advance(lexer); if (lexer->lookahead == '^') { advance(lexer); return check_operator_end(lexer); } } // !=, !== } else if (lexer->lookahead == '!') { advance(lexer); if (lexer->lookahead == '=') { advance(lexer); if (lexer->lookahead == '=') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } } // ~>, ~>> } else if (lexer->lookahead == '~') { advance(lexer); if (lexer->lookahead == '>') { advance(lexer); if (lexer->lookahead == '>') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } } // |, ||, |||, |> } else if (lexer->lookahead == '|') { advance(lexer); if (lexer->lookahead == '|') { advance(lexer); if (lexer->lookahead == '|') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } } else if (lexer->lookahead == '>') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } // *, ** } else if (lexer->lookahead == '*') { advance(lexer); if (lexer->lookahead == '*') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } // / // } else if (lexer->lookahead == '/') { advance(lexer); if (lexer->lookahead == '/') { advance(lexer); return check_operator_end(lexer); } else { return check_operator_end(lexer); } // ., .. } else if (lexer->lookahead == '.') { advance(lexer); if (lexer->lookahead == '.') { advance(lexer); // Ignore ... identifier if (lexer->lookahead == '.') return false; return check_operator_end(lexer); } else { return check_operator_end(lexer); } // double slash } else if (lexer->lookahead == '\\') { advance(lexer); if (lexer->lookahead == '\\') { advance(lexer); return check_operator_end(lexer); } } else if (lexer->lookahead == 'w') { advance(lexer); if (lexer->lookahead == 'h') { advance(lexer); if (lexer->lookahead == 'e') { advance(lexer); if (lexer->lookahead == 'n') { advance(lexer); return is_token_end(lexer->lookahead) && check_operator_end(lexer); } } } } else if (lexer->lookahead == 'a') { advance(lexer); if (lexer->lookahead == 'n') { advance(lexer); if (lexer->lookahead == 'd') { advance(lexer); return is_token_end(lexer->lookahead) && check_operator_end(lexer); } } // or } else if (lexer->lookahead == 'o') { advance(lexer); if (lexer->lookahead == 'r') { advance(lexer); return is_token_end(lexer->lookahead) && check_operator_end(lexer); } // in } else if (lexer->lookahead == 'i') { advance(lexer); if (lexer->lookahead == 'n') { advance(lexer); return is_token_end(lexer->lookahead) && check_operator_end(lexer); } // not in } else if (lexer->lookahead == 'n') { advance(lexer); if (lexer->lookahead == 'o') { advance(lexer); if (lexer->lookahead == 't') { advance(lexer); while (is_inline_whitespace(lexer->lookahead)) { advance(lexer); } if (lexer->lookahead == 'i') { advance(lexer); if (lexer->lookahead == 'n') { advance(lexer); return is_token_end(lexer->lookahead) && check_operator_end(lexer); } } } } } } return false; } bool scan(TSLexer* lexer, const bool* valid_symbols) { int8_t quoted_content_info_idx = find_quoted_token_info(valid_symbols); // Quoted content, which matches any character except for close // delimiters, escapes and interpolations if (quoted_content_info_idx != -1) { const QuotedContentInfo& info = quoted_content_infos[quoted_content_info_idx]; return scan_quoted_content(lexer, info); } bool skipped_whitespace = false; while (is_inline_whitespace(lexer->lookahead)) { skipped_whitespace = true; skip(lexer); } // Newline, which is either tokenized as a special newline or ignored if (is_newline(lexer->lookahead) && ( valid_symbols[NEWLINE_BEFORE_DO] || valid_symbols[NEWLINE_BEFORE_BINARY_OPERATOR] || valid_symbols[NEWLINE_BEFORE_COMMENT])) { return scan_newline(lexer, valid_symbols); } // before unary + if (lexer->lookahead == '+') { if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OPERATOR]) { lexer->mark_end(lexer); advance(lexer); if (lexer->lookahead == '+' || lexer->lookahead == ':' || lexer->lookahead == '/') { return false; } if (is_whitespace(lexer->lookahead)) { return false; } lexer->result_symbol = BEFORE_UNARY_OPERATOR; return true; } // before unary - } else if (lexer->lookahead == '-') { if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OPERATOR]) { lexer->mark_end(lexer); lexer->result_symbol = BEFORE_UNARY_OPERATOR; advance(lexer); if (lexer->lookahead == '-' || lexer->lookahead == '>' || lexer->lookahead == ':' || lexer->lookahead == '/') { return false; } if (is_whitespace(lexer->lookahead)) { return false; } return true; } // not in } else if (lexer->lookahead == 'n') { if (valid_symbols[NOT_IN]) { lexer->result_symbol = NOT_IN; advance(lexer); if (lexer->lookahead == 'o') { advance(lexer); if (lexer->lookahead == 't') { advance(lexer); while (is_inline_whitespace(lexer->lookahead)) { advance(lexer); } if (lexer->lookahead == 'i') { advance(lexer); if (lexer->lookahead == 'n') { advance(lexer); return is_token_end(lexer->lookahead); } } } } } // quoted atom start } else if (lexer->lookahead == ':') { if (valid_symbols[QUOTED_ATOM_START]) { advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = QUOTED_ATOM_START; if (lexer->lookahead == '"' || lexer->lookahead == '\'') { return true; } } } return false; } // Expose the API expected by tree-sitter extern "C" { void* tree_sitter_elixir_external_scanner_create() { return NULL; } bool tree_sitter_elixir_external_scanner_scan(void* payload, TSLexer* lexer, const bool* valid_symbols) { return scan(lexer, valid_symbols); } unsigned tree_sitter_elixir_external_scanner_serialize(void* payload, char* buffer) { return 0; } void tree_sitter_elixir_external_scanner_deserialize(void* payload, const char* buffer, unsigned length) {} void tree_sitter_elixir_external_scanner_destroy(void* payload) {} } // end anonymous namespace }