Initial parser implementation

parent 3f19b87c70
commit dad92d2b87
@@ -1,3 +1,8 @@
# Deps
/node_modules/
/build/

# Temporary files
/tmp/

# Temporary files generated by Tree-sitter
log.html
@@ -9,7 +9,7 @@
      "sources": [
        "bindings/node/binding.cc",
        "src/parser.c",
        # If your language uses an external scanner, add it here.
        "src/scanner.cc"
      ],
      "cflags_c": [
        "-std=c99",
916  grammar.js
@@ -1,7 +1,921 @@
// Operator precedence:
// * https://hexdocs.pm/elixir/master/operators.html
// * https://github.com/elixir-lang/elixir/blob/master/lib/elixir/src/elixir_parser.yrl
const PREC = {
  IN_MATCH_OPS: 10,
  WHEN_OP: 20,
  TYPE_OP: 30,
  BAR_OP: 40,
  ASSOC_OP: 50,
  CAPTURE_OP: 60,
  MATCH_OP: 70,
  OR_OPS: 80,
  AND_OPS: 90,
  COMP_OPS: 100,
  REL_OPS: 110,
  ARROW_OPS: 120,
  IN_OPS: 130,
  XOR_OP: 140,
  TERNARY_OP: 150,
  CONCAT_OPS: 160,
  ADD_OPS: 170,
  MULT_OPS: 180,
  POWER_OP: 190,
  UNARY_OPS: 200,
  ACCESS: 205,
  DOT_OP: 210,
  AT_OP: 220,
  CAPTURE_OPERAND: 235,
};
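// For illustration: a higher value binds tighter, so `a = b or c` groups as
// `a = (b or c)` (OR_OPS 80 > MATCH_OP 70) and `-x.foo` groups as `-(x.foo)`
// (DOT_OP 210 > UNARY_OPS 200).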
const IN_MATCH_OPS = ["<-", "\\\\"];
const OR_OPS = ["||", "|||", "or"];
const AND_OPS = ["&&", "&&&", "and"];
const COMP_OPS = ["==", "!=", "=~", "===", "!=="];
const REL_OPS = ["<", ">", "<=", ">="];
const ARROW_OPS = ["|>", "<<<", ">>>", "<<~", "~>>", "<~", "~>", "<~>", "<|>"];
const IN_OPS = ["in", "not in"];
const CONCAT_OPS = ["++", "--", "+++", "---", "..", "<>"];
const ADD_OPS = ["+", "-"];
const MULT_OPS = ["*", "/"];
const UNARY_OPS = ["+", "-", "!", "^", "~~~", "not"];

const ALL_OPS = [
  ["->", "when", "::", "|", "=>", "&", "=", "^^^", "//", "**", ".", "@"],
  IN_MATCH_OPS,
  OR_OPS,
  AND_OPS,
  COMP_OPS,
  REL_OPS,
  ARROW_OPS,
  IN_OPS,
  CONCAT_OPS,
  ADD_OPS,
  MULT_OPS,
  UNARY_OPS,
].flat();

// Ignore word literals and "=>" which is not a valid atom
const ATOM_OPERATOR_LITERALS = ALL_OPS.filter(
  (operator) => !/[a-z]/.test(operator) && operator !== "=>"
);
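// For illustration: after this filter, entries such as "<>", "|>" and "+" remain
// (so :<>, :|> and :+ parse as operator atoms), while word operators like "and"
// are covered by the word-literal atom rule instead, and "=>" is dropped.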
// Note that for keywords we use external scanner (KEYWORD_SPECIAL_LITERAL),
// so it should be kept in sync
const ATOM_SPECIAL_LITERALS = ["...", "%{}", "{}", "%", "<<>>", "..//"];

// Word tokens used directly in the grammar
const RESERVED_WORD_TOKENS = [
  // Operators
  ["and", "in", "not", "or", "when"],
  // Literals
  ["true", "false", "nil"],
  // Other
  ["after", "catch", "do", "else", "end", "fn", "rescue"],
].flat();

const SPECIAL_IDENTIFIERS = [
  "__MODULE__",
  "__DIR__",
  "__ENV__",
  "__CALLER__",
  "__STACKTRACE__",
];

// Numbers

const DIGITS = /[0-9]+/;
const BIN_DIGITS = /[0-1]+/;
const OCT_DIGITS = /[0-7]+/;
const HEX_DIGITS = /[0-9a-fA-F]+/;

const numberDec = sep1(DIGITS, "_");
const numberBin = seq("0b", sep1(BIN_DIGITS, "_"));
const numberOct = seq("0o", sep1(OCT_DIGITS, "_"));
const numberHex = seq("0x", sep1(HEX_DIGITS, "_"));

const integer = choice(numberDec, numberBin, numberOct, numberHex);

const floatScientificPart = seq(/[eE]/, optional(choice("-", "+")), integer);
const float = seq(numberDec, ".", numberDec, optional(floatScientificPart));
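// Examples of literals covered by the number rules above: 1_000_000 and 42
// (decimal), 0b1010, 0o777, 0xFF, and floats such as 3.14 or 1.0e-10 (the
// scientific part is optional).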
const aliasPart = /[A-Z][_a-zA-Z0-9]*/;

module.exports = grammar({
  name: "elixir",

  // TODO describe stuff (also in the separate notes doc add clarification
  // how we use these verbose tokens to avoid needing scanner state)
  externals: ($) => [
    $._quoted_content_i_single,
    $._quoted_content_i_double,
    $._quoted_content_i_heredoc_single,
    $._quoted_content_i_heredoc_double,
    $._quoted_content_i_parenthesis,
    $._quoted_content_i_curly,
    $._quoted_content_i_square,
    $._quoted_content_i_angle,
    $._quoted_content_i_bar,
    $._quoted_content_i_slash,

    $._quoted_content_single,
    $._quoted_content_double,
    $._quoted_content_heredoc_single,
    $._quoted_content_heredoc_double,
    $._quoted_content_parenthesis,
    $._quoted_content_curly,
    $._quoted_content_square,
    $._quoted_content_angle,
    $._quoted_content_bar,
    $._quoted_content_slash,

    $._keyword_special_literal,
    $._atom_start,
    $._keyword_end,

    $._newline_before_do,
    $._newline_before_binary_op,
    // TODO explain this, basically we use newline ignored for newline before comment,
    // as after the comment there is another newline that we then consider as usual (so
    // that comments are skipped when considering newlines) <- this is chaotic need a better one
    $._newline_before_comment,

    // TODO explain this, basically we use this to force unary + and -
    // if there is no spacing before the operand
    $._before_unary_op,

    $._not_in,
  ],
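  // The externals above are produced by the external scanner (added in this
  // commit and referenced as "src/scanner.cc" in binding.gyp); tree-sitter
  // requires this list to stay in the same order as the scanner's token enum.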
  // TODO include in notes about why using extra for newline before binary op is fine
  // TODO figure out how "\n" helps with the behaviour in
  // [
  //   :a,
  // ]
  // and how it generally works with extras
  extras: ($) => [
    $.comment,
    /\s|\\\n/,
    $._newline_before_binary_op,
    $._newline_before_comment,
    "\n",
  ],

  // TODO check if the parser doesn't compile without each conflict rule,
  // otherwise it means we don't really use it (I think)
  conflicts: ($) => [
    // [$._newline_before_binary_op],
    [$.binary_operator],
    [$.keywords],
    // [$.identifier, $.atom_literal],
    [$._expression, $._local_call_with_arguments],
    [
      $._expression,
      $._local_call_with_arguments,
      $._local_call_without_arguments,
    ],

    [$._remote_call, $._parenthesised_remote_call],

    // stab clause `(x` may be either `(x;y) ->` or `(x, y) ->`
    // [$.block, $._stab_clause_arguments],
    [$.block, $._stab_clause_parentheses_arguments],
    [$.block, $._stab_clause_arguments],

    [$.block, $._stab_clause_arguments_expression],

    // when in stab clause
    [$.binary_operator, $._stab_clause_arguments_expression],

    [$.tuple, $.map],
    [$.tuple, $.map_content],
    [$.operator_identifier, $.stab_clause],
    [$.unary_operator, $.operator_identifier],
    // [$.alias],
    [$.body],
    // [$.block, $._stab_clause_arguments],
    // [$.block, $._stab_clause_parentheses_arguments],
    // [$.block, $._stab_clause_parentheses_arguments],
    [$.after_block],
    [$.rescue_block],
    [$.catch_block],
    [$.else_block],
  ],
  rules: {
    source: ($) => "TODO",
    source: ($) =>
      seq(
        optional($._terminator),
        optional(
          seq(sep1($._expression, $._terminator), optional($._terminator))
        )
      ),

    _terminator: ($) =>
      prec.right(choice(seq(repeat("\n"), ";"), repeat1("\n"))),

    _expression: ($) =>
      choice(
        $.block,
        $._identifier,
        $.alias,
        $.integer,
        $.float,
        $.atom,
        $.string,
        $.charlist,
        $.sigil,
        $.list,
        $.tuple,
        $.bitstring,
        $.map,
        $.char,
        $.boolean,
        $.nil,
        $.unary_operator,
        $.binary_operator,
        $.dot,
        $.call,
        $.access_call,
        $.anonymous_function
      ),
    block: ($) =>
      prec(
        PREC.WHEN_OP,
        seq(
          "(",
          seq(
            optional($._terminator),
            optional(
              seq(
                sep1(choice($._expression, $.stab_clause), $._terminator),
                optional($._terminator)
              )
            )
          ),
          ")"
        )
      ),

    _identifier: ($) =>
      choice($.identifier, $.unused_identifier, $.special_identifier),

    // Note: Elixir does not allow uppercase and titlecase letters
    // as a variable starting character, but this regex would match
    // those. This implies we would happily parse those cases, but
    // since they are not valid Elixir it's unlikely to stumble upon
    // them. TODO reword
    // Ref: https://hexdocs.pm/elixir/master/unicode-syntax.html#variables
    // TODO see if we need this in custom scanner in the end, if we do,
    // then we may use the generation script from the original repo instead
    // and make this an external (though I'd check if these custom unicode
    // functions are efficient, does compiler optimise such checks?)
    // identifier: ($) => choice(/[\p{ID_Start}][\p{ID_Continue}]*[?!]?/u, "..."),
    // identifier: ($) => choice(/[\p{Ll}\p{Lm}\p{Lo}\p{Nl}\p{Other_ID_Start}][\p{ID_Continue}]*[?!]?/u, "..."),
    // identifier: ($) => choice(/[\p{Ll}\p{Lm}\p{Lo}\p{Nl}][\p{ID_Continue}]*[?!]?/u, "..."),
    //
    // TODO elaborate, but basically
    //
    // we remove uppercase/titlecase letters from ID_Start as elixir does
    // we remove the subtractions (we cannot express group subtraction in regex),
    // but it's fine because at the time of writing these groups only really subtract
    // a single character
    // Unicode.Set.to_utf8_char "[[[:L:][:Nl:][:Other_ID_Start:]] & [[:Pattern_Syntax:][:Pattern_White_Space:]]]"
    // we use hardcoded codepoints for \p{Other_ID_Start} since treesitter/js regexp doesn't
    // recognise this group
    //
    // Other_ID_Start \u1885\u1886\u2118\u212E\u309B\u309C
    // (this is the list at the time of writing, it's for backward compatibility, see https://unicode.org/reports/tr31/#Backward_Compatibility)
    identifier: ($) =>
      choice(
        /[\p{Ll}\p{Lm}\p{Lo}\p{Nl}\u1885\u1886\u2118\u212E\u309B\u309C][\p{ID_Continue}]*[?!]?/u,
        "..."
      ),

    unused_identifier: ($) => /_[\p{ID_Continue}]*[?!]?/u,
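    // Illustrative identifiers matched above: `foo`, `valid?`, `héllo` (Unicode
    // start letter) and `...`; `_` and `_acc` match unused_identifier instead.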
    special_identifier: ($) => choice(...SPECIAL_IDENTIFIERS),

    // We have a separate rule for single-part alias, so that we
    // can use it in the keywords rule
    alias: ($) => choice($._alias_single, $._alias_multi),

    _alias_single: ($) => aliasPart,

    _alias_multi: ($) => token(sep1(aliasPart, /\s*\.\s*/)),

    integer: ($) => token(integer),

    float: ($) => token(float),

    atom: ($) =>
      seq(
        $._atom_start,
        choice(
          alias($._atom_word_literal, $.atom_literal),
          alias($._atom_operator_literal, $.atom_literal),
          alias($._atom_special_literal, $.atom_literal),
          $._quoted_i_double,
          $._quoted_i_single
        )
      ),

    // TODO comment on the unicode groups here
    _atom_word_literal: ($) => /[\p{ID_Start}_][\p{ID_Continue}@]*[?!]?/u,

    _atom_operator_literal: ($) => choice(...ATOM_OPERATOR_LITERALS),

    _atom_special_literal: ($) => choice(...ATOM_SPECIAL_LITERALS),

    // Defines $._quoted_content_i_{name} and $._quoted_content_{name} rules,
    // content with and without interpolation respectively
    ...defineQuoted(`"`, `"`, "double"),
    ...defineQuoted(`'`, `'`, "single"),
    ...defineQuoted(`'''`, `'''`, "heredoc_single"),
    ...defineQuoted(`"""`, `"""`, "heredoc_double"),
    ...defineQuoted(`(`, `)`, "parenthesis"),
    ...defineQuoted(`{`, `}`, "curly"),
    ...defineQuoted(`[`, `]`, "square"),
    ...defineQuoted(`<`, `>`, "angle"),
    ...defineQuoted(`|`, `|`, "bar"),
    ...defineQuoted(`/`, `/`, "slash"),

    string: ($) => choice($._quoted_i_double, $._quoted_i_heredoc_double),

    charlist: ($) => choice($._quoted_i_single, $._quoted_i_heredoc_single),

    interpolation: ($) => seq("#{", $._expression, "}"),

    escape_sequence: ($) =>
      token(
        seq(
          "\\",
          choice(
            // Single escaped character
            /[^ux]/,
            // Hex byte
            /x[0-9a-fA-F]{1,2}/,
            /x{[0-9a-fA-F]+}/,
            // Unicode code point
            /u{[0-9a-fA-F]+}/,
            /u[0-9a-fA-F]{4}/
          )
        )
      ),
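    // Escape sequences matched above include, for example: \n, \\, \xFF, \x{1F},
    // \u00E9 and \u{1F600}.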
    sigil: ($) =>
      seq(
        "~",
        choice(
          seq(
            alias(token.immediate(/[a-z]/), $.sigil_name),
            choice(
              $._quoted_i_double,
              $._quoted_i_single,
              $._quoted_i_heredoc_single,
              $._quoted_i_heredoc_double,
              $._quoted_i_parenthesis,
              $._quoted_i_curly,
              $._quoted_i_square,
              $._quoted_i_angle,
              $._quoted_i_bar,
              $._quoted_i_slash
            )
          ),
          seq(
            alias(token.immediate(/[A-Z]/), $.sigil_name),
            choice(
              $._quoted_double,
              $._quoted_single,
              $._quoted_heredoc_single,
              $._quoted_heredoc_double,
              $._quoted_parenthesis,
              $._quoted_curly,
              $._quoted_square,
              $._quoted_angle,
              $._quoted_bar,
              $._quoted_slash
            )
          )
        ),
        optional(alias(token.immediate(/[a-zA-Z]+/), $.sigil_modifiers))
      ),
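    // For example, `~r/hello #{name}/i` is a lowercase sigil (interpolation
    // allowed, modifier "i"), while in `~S(raw #{text})` the uppercase sigil
    // keeps the braces as literal content.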
    unary_operator: ($) =>
      choice(
        unaryOp($, prec, PREC.CAPTURE_OP, "&", $._capture_expression),
        unaryOp($, prec, PREC.UNARY_OPS, choice(...UNARY_OPS)),
        unaryOp($, prec, PREC.AT_OP, "@"),
        // Capture operand like &1 is a special case with higher precedence
        unaryOp($, prec, PREC.CAPTURE_OPERAND, "&", $.integer)
      ),

    _capture_expression: ($) =>
      choice(
        // TODO should parenthesised expression be generally used (?)
        // Precedence over block expression
        prec(PREC.WHEN_OP + 1, seq("(", $._expression, ")")),
        $._expression
      ),
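    // Capture example: in `&(&1 + 1)` the inner `&1` is matched at
    // CAPTURE_OPERAND precedence (unary `&` applied to an integer), while the
    // outer `&` uses the lower CAPTURE_OP precedence.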
    binary_operator: ($) =>
      choice(
        binaryOp($, prec.left, PREC.IN_MATCH_OPS, choice(...IN_MATCH_OPS)),
        binaryOp(
          $,
          prec.right,
          PREC.WHEN_OP,
          "when",
          $._expression,
          choice($._expression, $.keywords)
        ),
        binaryOp($, prec.right, PREC.TYPE_OP, "::"),
        binaryOp(
          $,
          prec.right,
          PREC.BAR_OP,
          "|",
          $._expression,
          choice($._expression, $.keywords)
        ),
        binaryOp($, prec.right, PREC.ASSOC_OP, "=>"),
        binaryOp($, prec.right, PREC.MATCH_OP, "="),
        binaryOp($, prec.left, PREC.OR_OPS, choice(...OR_OPS)),
        binaryOp($, prec.left, PREC.AND_OPS, choice(...AND_OPS)),
        binaryOp($, prec.left, PREC.COMP_OPS, choice(...COMP_OPS)),
        binaryOp($, prec.left, PREC.REL_OPS, choice(...REL_OPS)),
        binaryOp($, prec.left, PREC.ARROW_OPS, choice(...ARROW_OPS)),
        binaryOp($, prec.left, PREC.IN_OPS, choice("in", $._not_in)),
        binaryOp($, prec.left, PREC.XOR_OP, "^^^"),
        binaryOp($, prec.right, PREC.TERNARY_OP, "//"),
        binaryOp($, prec.right, PREC.CONCAT_OPS, choice(...CONCAT_OPS)),
        binaryOp($, prec.left, PREC.ADD_OPS, choice(...ADD_OPS)),
        binaryOp($, prec.left, PREC.MULT_OPS, choice(...MULT_OPS)),
        binaryOp($, prec.left, PREC.POWER_OP, "**"),
        // Operator with arity
        binaryOp(
          $,
          prec.left,
          PREC.MULT_OPS,
          "/",
          $.operator_identifier,
          $.integer
        )
      ),
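    // The "operator with arity" case above is aimed at captures such as `&+/2`,
    // where the left operand is an operator identifier and the right operand is
    // the arity.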
    operator_identifier: ($) =>
      // Operators with the following changes:
      // * exclude "=>" since it's not a valid atom/operator identifier anyway (valid only in map)
      // * we exclude // since it's only valid after ..
      // * we remove "-" and "+" since they are both unary and binary

      // We use the same precedence as unary operators, so that a sequence
      // like `& /` is a conflict and is resolved via $.conflicts
      // (could be either `& / 2` or `& / / 2`)
      choice(
        // Unary operators
        prec(PREC.CAPTURE_OP, "&"),
        prec(PREC.UNARY_OPS, choice(...UNARY_OPS)),
        prec(PREC.AT_OP, "@"),
        // Binary operators
        ...IN_MATCH_OPS,
        "when",
        "::",
        "|",
        "=",
        ...OR_OPS,
        ...AND_OPS,
        ...COMP_OPS,
        ...REL_OPS,
        ...ARROW_OPS,
        "in",
        $._not_in,
        "^^",
        ...CONCAT_OPS,
        ...MULT_OPS,
        "**",
        "->",
        "."
      ),
    dot: ($) =>
      prec(
        PREC.DOT_OP,
        seq(choice($._expression), ".", choice($.alias, $.tuple))
      ),

    keywords: ($) => sep1($.pair, ","),

    pair: ($) => seq($.keyword, $._expression),

    keyword: ($) =>
      seq(
        // Tree-sitter doesn't consider ambiguities within individual
        // tokens (in this case regexps). So both in [a] and [a: 1] it
        // would always parse "a" as the same node (based on whether
        // $.identifier or $.atom_literal is listed first in the rules).
        // However, since identifiers and alias parts are valid atom
        // literals, we can list them here, in which case the parser will
        // consider all paths and pick the valid one.
        // Also see https://github.com/tree-sitter/tree-sitter/issues/518
        choice(
          alias($._atom_word_literal, $.atom_literal),
          alias($._atom_operator_literal, $.atom_literal),
          alias($._keyword_special_literal, $.atom_literal),
          alias($.identifier, $.atom_literal),
          alias($.unused_identifier, $.atom_literal),
          alias($.special_identifier, $.atom_literal),
          alias($._alias_single, $.atom_literal),
          alias(choice(...RESERVED_WORD_TOKENS), $.atom_literal),
          $._quoted_i_double,
          $._quoted_i_single
        ),
        $._keyword_end
      ),
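    // Keyword examples handled above: `[a: 1]`, `[do: 1]` (reserved word),
    // `[Foo: 1]` (single alias) and `["some key": 1]` (quoted); the first three
    // keys are emitted as atom_literal nodes.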
    list: ($) => seq("[", optional($._items_with_trailing_separator), "]"),

    tuple: ($) => seq("{", optional($._items_with_trailing_separator), "}"),

    bitstring: ($) =>
      seq("<<", optional($._items_with_trailing_separator), ">>"),

    map: ($) => seq("%", optional($.struct), "{", optional($.map_content), "}"),

    struct: ($) =>
      prec.left(
        choice(
          $.alias,
          $.atom,
          $._identifier,
          $.unary_operator,
          $.dot,
          alias($._parenthesised_call, $.call)
        )
      ),

    map_content: ($) => $._items_with_trailing_separator,

    _items_with_trailing_separator: ($) =>
      seq(
        choice(
          seq(sep1($._expression, ","), optional(seq(",", $.keywords))),
          $.keywords
        ),
        optional(",")
      ),

    char: ($) => /\?(.|\\.)/,

    boolean: ($) => choice("true", "false"),

    nil: ($) => "nil",

    call: ($) =>
      choice(
        $._local_call_with_arguments,
        $._parenthesised_local_call_with_arguments,
        $._local_call_without_arguments,
        $._remote_call,
        $._parenthesised_remote_call,
        $._anonymous_call,
        $._call_on_call
      ),
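    // Call shapes covered above, for example: `foo bar` and `foo(bar)` (local),
    // `foo do end` (no arguments), `Mod.foo bar` and `mod.foo(bar)` (remote),
    // `fun.(1)` (anonymous) and `foo(1).(2)` (call on call).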
    _parenthesised_call: ($) =>
      choice(
        $._parenthesised_local_call_with_arguments,
        $._parenthesised_remote_call,
        $._anonymous_call,
        $._call_on_call
      ),

    _call_on_call: ($) =>
      prec.left(
        seq(
          alias(
            choice(
              $._parenthesised_local_call_with_arguments,
              $._parenthesised_remote_call,
              $._anonymous_call
            ),
            $.call
          ),
          // arguments in parentheses
          // alias($._local_or_remote_arguments, $.arguments),
          // TODO just make nonimmediate/immediate in the name
          alias($._anonymous_arguments, $.arguments),
          optional(seq(optional($._newline_before_do), $.do_block))
        )
      ),

    _local_call_with_arguments: ($) =>
      // Given `x + y` it can be interpreted either as a binary operator
      // or a call with unary operator. This is an actual ambiguity, so
      // we use dynamic precedence to penalize call
      // prec.dynamic(
      // TODO ideally we would penalize whitespace after unary op,
      // so that x + y is binary op and x +y is unary op, to reflect
      // Elixir ast
      // -1,
      prec.left(
        seq(
          $._identifier,
          alias($._call_arguments, $.arguments),
          // TODO include this in notes:
          // We use external scanner for _newline_before_do because
          // this way we can lookahead through any whitespace
          // (especially newlines). We cannot simply use repeat("\n")
          // and conflict with expression end, because this function
          // rule has left precedence (so that do-end sticks to the outermost
          // call), and thus expression end would always be preferred
          optional(seq(optional($._newline_before_do), $.do_block))
          // optional($.do_block)
        )
        // )
      ),
    _parenthesised_local_call_with_arguments: ($) =>
      // Given `x + y` it can be interpreted either as a binary operator
      // or a call with unary operator. This is an actual ambiguity, so
      // we use dynamic precedence to penalize call
      // prec.dynamic(
      // TODO ideally we would penalize whitespace after unary op,
      // so that x + y is binary op and x +y is unary op, to reflect
      // Elixir ast
      // -1,
      prec.left(
        seq(
          $._identifier,
          alias($._parenthesised_call_arguments, $.arguments),
          // TODO include this in notes:
          // We use external scanner for _newline_before_do because
          // this way we can lookahead through any whitespace
          // (especially newlines). We cannot simply use repeat("\n")
          // and conflict with expression end, because this function
          // rule has left precedence (so that do-end sticks to the outermost
          // call), and thus expression end would always be preferred
          optional(seq(optional($._newline_before_do), $.do_block))
          // optional($.do_block)
        )
        // )
      ),

    _local_call_without_arguments: ($) =>
      // We use lower precedence, so given `fun arg do end`
      // we don't tokenize `arg` as a call

      // we actually need a conflict because of `foo bar do end` vs `foo bar do: 1`
      // prec(-1,
      prec.dynamic(-1, seq($._identifier, $.do_block)),
    // )
    _remote_call: ($) =>
      prec.left(
        seq(
          alias($._remote_dot, $.dot),
          optional(alias($._call_arguments, $.arguments)),
          optional(seq(optional($._newline_before_do), $.do_block))
          // optional($.do_block)
        )
      ),

    _parenthesised_remote_call: ($) =>
      prec.left(
        seq(
          alias($._remote_dot, $.dot),
          alias($._parenthesised_call_arguments, $.arguments),
          optional(seq(optional($._newline_before_do), $.do_block))
          // optional($.do_block)
        )
      ),

    _remote_dot: ($) =>
      prec(
        PREC.DOT_OP,
        seq(
          $._expression,
          ".",
          // TODO can also be string, anything else?
          // compare with the other parser
          // TODO we don't want to support heredoc though
          choice(
            $._identifier,
            alias(choice(...RESERVED_WORD_TOKENS), $.identifier),
            $.operator_identifier,
            alias($._quoted_i_double, $.string),
            alias($._quoted_i_single, $.charlist)
          )
        )
      ),
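    // Remote dot examples: `Mod.fun`, `mod.fun`, `Kernel.+(1, 2)` (operator
    // after the dot) and `Mod."weird name"()` (quoted function name).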
    _parenthesised_call_arguments: ($) =>
      seq(token.immediate("("), optional($._call_arguments), ")"),

    _anonymous_call: ($) =>
      seq(
        alias($._anonymous_dot, $.dot),
        alias($._anonymous_arguments, $.arguments)
      ),

    _anonymous_dot: ($) => prec(PREC.DOT_OP, seq($._expression, ".")),

    _anonymous_arguments: ($) => seq("(", optional($._call_arguments), ")"),

    _call_arguments: ($) =>
      // Right precedence ensures that `fun1 fun2 x, y` is treated
      // as `fun1(fun2(x, y))` and not `fun1(fun2(x), y)`
      prec.right(
        seq(
          choice(
            seq(
              sep1($._expression, ","),
              optional(seq(",", $.keywords, optional(",")))
            ),
            seq($.keywords, optional(","))
          )
        )
      ),
    access_call: ($) =>
      prec(
        PREC.ACCESS,
        seq($._expression, token.immediate("["), $._expression, "]")
      ),

    do_block: ($) =>
      seq(
        sugarBlock($, "do"),
        repeat(
          choice($.after_block, $.rescue_block, $.catch_block, $.else_block)
        ),
        "end"
      ),

    after_block: ($) => sugarBlock($, "after"),

    rescue_block: ($) => sugarBlock($, "rescue"),

    catch_block: ($) => sugarBlock($, "catch"),

    else_block: ($) => sugarBlock($, "else"),
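    // For example, in
    //
    //     try do
    //       work()
    //     rescue
    //       e -> handle(e)
    //     after
    //       cleanup()
    //     end
    //
    // the do_block wraps a rescue_block and an after_block (work, handle and
    // cleanup are just placeholder names).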
    // Specify right precedence, so that we consume as much as we can
    stab_clause: ($) =>
      prec.right(seq(optional($._stab_clause_left), "->", optional($.body))),

    _stab_clause_left: ($) =>
      choice(
        // Note the first option has higher precedence, TODO clarify
        alias($._stab_clause_parentheses_arguments, $.arguments),
        // TODO naming/cleanup
        alias(
          $._stab_clause_parentheses_arguments_with_guard,
          $.binary_operator
        ),
        alias($._stab_clause_arguments, $.arguments),
        alias($._stab_clause_arguments_with_guard, $.binary_operator)
      ),

    _stab_clause_parentheses_arguments: ($) =>
      // `(1) ->` may be interpreted either as block argument
      // or argument in parentheses and we use dynamic precedence
      // to favour the latter
      prec(
        PREC.WHEN_OP,
        prec.dynamic(1, seq("(", optional($._stab_clause_arguments), ")"))
      ),

    _stab_clause_parentheses_arguments_with_guard: ($) =>
      seq(
        alias($._stab_clause_parentheses_arguments, $.arguments),
        "when",
        $._expression
      ),

    _stab_clause_arguments_with_guard: ($) =>
      // `a when b ->` may be interpreted either such that `a when b` is an argument
      // or a guard binary operator with argument `a` and right operand `b`,
      // we use dynamic precedence to favour the latter
      prec.dynamic(
        1,
        seq(alias($._stab_clause_arguments, $.arguments), "when", $._expression)
      ),

    _stab_clause_arguments: ($) =>
      // TODO this is a variant of _items_with_trailing_separator, cleanup
      choice(
        seq(
          sep1($._stab_clause_arguments_expression, ","),
          optional(seq(",", $.keywords))
        ),
        $.keywords
      ),

    _stab_clause_arguments_expression: ($) =>
      // Note here we use the same precedence as when operator,
      // so we get a conflict and resolve it dynamically
      prec(PREC.WHEN_OP, $._expression),
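    // For example, in `fn x, y when x > 0 -> x + y end` the clause head is
    // parsed through the "arguments with guard" variant, so `when` appears as a
    // binary_operator over the argument list.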
    body: ($) =>
      seq(
        choice(
          seq($._terminator, sep($._expression, $._terminator)),
          sep1($._expression, $._terminator)
        ),
        optional($._terminator)
      ),

    anonymous_function: ($) =>
      seq(
        "fn",
        optional($._terminator),
        sep1($.stab_clause, $._terminator),
        "end"
      ),

    // A comment may be anywhere, we give it a lower precedence,
    // so it doesn't intercept sequences such as interpolation
    comment: ($) => token(prec(-1, seq("#", /.*/))),
  },
});

function sep1(rule, separator) {
  return seq(rule, repeat(seq(separator, rule)));
}

function sep(rule, separator) {
  return optional(sep1(rule, separator));
}
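// For reference, sep1(rule, ",") expands to seq(rule, repeat(seq(",", rule))),
// i.e. one or more occurrences of rule separated by commas; sep(...) also
// allows zero occurrences.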
function unaryOp($, assoc, precedence, operator, right = null) {
  return assoc(
    precedence,
    // TODO clarify, we use lower precedence, so given `x + y`,
    // which can be interpreted as either `x + y` or `x(+y)`
    // we favour the former. The only exception is when
    // _before_unary_op matches which forces the latter interpretation
    // in case like `x +y`
    prec.dynamic(
      -1,
      seq(
        optional($._before_unary_op),
        field("operator", operator),
        right || $._expression
      )
    )
  );
}

function binaryOp($, assoc, precedence, operator, left = null, right = null) {
  return assoc(
    precedence,
    seq(
      field("left", left || $._expression),
      field("operator", operator),
      field("right", right || $._expression)
    )
  );
}
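// For illustration, binaryOp($, prec.left, PREC.ADD_OPS, "+") builds a rule that
// yields a binary_operator node with left, operator and right fields, e.g. for
// `1 + 2` the left and right fields hold integer nodes.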
function sugarBlock($, start) {
  return seq(
    start,
    optional($._terminator),
    optional(
      choice(
        sep1(choice($.stab_clause), $._terminator),
        seq(sep1(choice($._expression), $._terminator), optional($._terminator))
      )
    )
  );
}

function defineQuoted(start, end, name) {
  return {
    [`_quoted_i_${name}`]: ($) =>
      seq(
        start,
        repeat(
          choice(
            // TODO rename the externals to _content
            alias($[`_quoted_content_i_${name}`], $.string_content),
            $.interpolation,
            $.escape_sequence
          )
        ),
        end
      ),

    [`_quoted_${name}`]: ($) =>
      seq(
        start,
        repeat(
          choice(
            // TODO rename the externals to _content
            alias($[`_quoted_content_${name}`], $.string_content),
            // It's always possible to escape the end delimiter
            $.escape_sequence
          )
        ),
        end
      ),
  };
}
5663  src/grammar.json (file diff suppressed because it is too large)
3262  src/node-types.json (file diff suppressed because it is too large)
403241  src/parser.c (file diff suppressed because it is too large)
@@ -0,0 +1,844 @@
#include <tree_sitter/parser.h>

namespace {

enum TokenType {
  // TODO add a note that all QUOTED_CONTENT_* tokens are mutually exclusive
  // i.e. the valid_symbols array contains at most one truthy of these
  QUOTED_CONTENT_I_SINGLE,
  QUOTED_CONTENT_I_DOUBLE,
  QUOTED_CONTENT_I_HEREDOC_SINGLE,
  QUOTED_CONTENT_I_HEREDOC_DOUBLE,
  QUOTED_CONTENT_I_PARENTHESIS,
  QUOTED_CONTENT_I_CURLY,
  QUOTED_CONTENT_I_SQUARE,
  QUOTED_CONTENT_I_ANGLE,
  QUOTED_CONTENT_I_BAR,
  QUOTED_CONTENT_I_SLASH,

  QUOTED_CONTENT_SINGLE,
  QUOTED_CONTENT_DOUBLE,
  QUOTED_CONTENT_HEREDOC_SINGLE,
  QUOTED_CONTENT_HEREDOC_DOUBLE,
  QUOTED_CONTENT_PARENTHESIS,
  QUOTED_CONTENT_CURLY,
  QUOTED_CONTENT_SQUARE,
  QUOTED_CONTENT_ANGLE,
  QUOTED_CONTENT_BAR,
  QUOTED_CONTENT_SLASH,

  KEYWORD_SPECIAL_LITERAL,
  ATOM_START,
  KEYWORD_END,

  NEWLINE_BEFORE_DO,
  NEWLINE_BEFORE_BINARY_OP,
  NEWLINE_BEFORE_COMMENT,

  BEFORE_UNARY_OP,

  NOT_IN
};
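// The order of this enum must match the `externals` list in grammar.js;
// the generated parser passes `valid_symbols` indexed by these values into
// scan() below.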
bool quoted_token_type(const bool* valid_symbols, TokenType& token_type) {
  // Quoted symbols are mutually exclusive and only one should
  // be valid at a time. If multiple are valid it means we are parsing
  // arbitrary code outside quotes, in which case we don't
  // want to tokenize it as quoted content.
  if (valid_symbols[QUOTED_CONTENT_I_SINGLE] && valid_symbols[QUOTED_CONTENT_I_DOUBLE]) {
    return false;
  }

  if (valid_symbols[QUOTED_CONTENT_I_SINGLE]) {
    token_type = QUOTED_CONTENT_I_SINGLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_I_DOUBLE]) {
    token_type = QUOTED_CONTENT_I_DOUBLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_I_HEREDOC_SINGLE]) {
    token_type = QUOTED_CONTENT_I_HEREDOC_SINGLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_I_HEREDOC_DOUBLE]) {
    token_type = QUOTED_CONTENT_I_HEREDOC_DOUBLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_I_PARENTHESIS]) {
    token_type = QUOTED_CONTENT_I_PARENTHESIS;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_I_CURLY]) {
    token_type = QUOTED_CONTENT_I_CURLY;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_I_SQUARE]) {
    token_type = QUOTED_CONTENT_I_SQUARE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_I_ANGLE]) {
    token_type = QUOTED_CONTENT_I_ANGLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_I_BAR]) {
    token_type = QUOTED_CONTENT_I_BAR;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_I_SLASH]) {
    token_type = QUOTED_CONTENT_I_SLASH;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_SINGLE]) {
    token_type = QUOTED_CONTENT_SINGLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_DOUBLE]) {
    token_type = QUOTED_CONTENT_DOUBLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_HEREDOC_SINGLE]) {
    token_type = QUOTED_CONTENT_HEREDOC_SINGLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_HEREDOC_DOUBLE]) {
    token_type = QUOTED_CONTENT_HEREDOC_DOUBLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_PARENTHESIS]) {
    token_type = QUOTED_CONTENT_PARENTHESIS;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_CURLY]) {
    token_type = QUOTED_CONTENT_CURLY;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_SQUARE]) {
    token_type = QUOTED_CONTENT_SQUARE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_ANGLE]) {
    token_type = QUOTED_CONTENT_ANGLE;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_BAR]) {
    token_type = QUOTED_CONTENT_BAR;
    return true;
  }
  if (valid_symbols[QUOTED_CONTENT_SLASH]) {
    token_type = QUOTED_CONTENT_SLASH;
    return true;
  }

  return false;
}
int32_t quoted_end_delimiter(TokenType token_type) {
  switch (token_type) {
    case QUOTED_CONTENT_I_SINGLE:
    case QUOTED_CONTENT_SINGLE:
    case QUOTED_CONTENT_I_HEREDOC_SINGLE:
    case QUOTED_CONTENT_HEREDOC_SINGLE:
      return '\'';

    case QUOTED_CONTENT_I_DOUBLE:
    case QUOTED_CONTENT_DOUBLE:
    case QUOTED_CONTENT_I_HEREDOC_DOUBLE:
    case QUOTED_CONTENT_HEREDOC_DOUBLE:
      return '\"';

    case QUOTED_CONTENT_I_PARENTHESIS:
    case QUOTED_CONTENT_PARENTHESIS:
      return ')';

    case QUOTED_CONTENT_I_CURLY:
    case QUOTED_CONTENT_CURLY:
      return '}';

    case QUOTED_CONTENT_I_SQUARE:
    case QUOTED_CONTENT_SQUARE:
      return ']';

    case QUOTED_CONTENT_I_ANGLE:
    case QUOTED_CONTENT_ANGLE:
      return '>';

    case QUOTED_CONTENT_I_BAR:
    case QUOTED_CONTENT_BAR:
      return '|';

    case QUOTED_CONTENT_I_SLASH:
    case QUOTED_CONTENT_SLASH:
      return '/';
  }

  __builtin_unreachable();
}

uint8_t quoted_delimiter_length(TokenType token_type) {
  switch (token_type) {
    case QUOTED_CONTENT_I_HEREDOC_SINGLE:
    case QUOTED_CONTENT_I_HEREDOC_DOUBLE:
    case QUOTED_CONTENT_HEREDOC_SINGLE:
    case QUOTED_CONTENT_HEREDOC_DOUBLE:
      return 3;

    default:
      return 1;
  }
}

bool quoted_is_interpol(TokenType token_type) {
  switch (token_type) {
    case QUOTED_CONTENT_I_SINGLE:
    case QUOTED_CONTENT_I_DOUBLE:
    case QUOTED_CONTENT_I_HEREDOC_SINGLE:
    case QUOTED_CONTENT_I_HEREDOC_DOUBLE:
    case QUOTED_CONTENT_I_PARENTHESIS:
    case QUOTED_CONTENT_I_CURLY:
    case QUOTED_CONTENT_I_SQUARE:
    case QUOTED_CONTENT_I_ANGLE:
    case QUOTED_CONTENT_I_BAR:
    case QUOTED_CONTENT_I_SLASH:
      return true;

    default:
      return false;
  }
}
bool is_whitespace(int32_t c) {
  return c == ' ' || c == '\t' || c == '\v' ||
         c == '\n' || c == '\f' || c == '\r';
}

bool is_inline_whitespace(int32_t c) {
  return c == ' ' || c == '\t' || c == '\v';
}

// TODO what about these weird \f \r
bool is_newline(int32_t c) {
  return c == '\n';
}

void advance(TSLexer* lexer) {
  lexer->advance(lexer, false);
}

void skip(TSLexer *lexer) {
  lexer->advance(lexer, true);
}

bool finish_atom_start(TSLexer* lexer) {
  // The first ':' is already scanned and parser advanced
  lexer->mark_end(lexer);
  lexer->result_symbol = ATOM_START;

  if (lexer->lookahead == ':') {
    advance(lexer);
    if (lexer->lookahead == ':') {
      // :::
      return true;
    } else {
      return false;
    }
  } else {
    return !is_whitespace(lexer->lookahead);
  }
}

bool is_keyword_end(TSLexer* lexer) {
  if (lexer->lookahead == ':') {
    advance(lexer);
    return is_whitespace(lexer->lookahead);
  }
  return false;
}

bool finish_keyword(TSLexer* lexer) {
  lexer->mark_end(lexer);
  lexer->result_symbol = KEYWORD_SPECIAL_LITERAL;
  return is_keyword_end(lexer);
}

bool is_digit(int32_t c) {
  return '0' <= c && c <= '9';
}
bool is_operator_end(TSLexer* lexer) {
  // Keyword
  if (lexer->lookahead == ':') {
    return !is_keyword_end(lexer);
  }
  while (is_inline_whitespace(lexer->lookahead)) {
    advance(lexer);
  }
  // Operator identifier with arity
  if (lexer->lookahead == '/') {
    advance(lexer);
    while (is_whitespace(lexer->lookahead)) {
      advance(lexer);
    }
    if (is_digit(lexer->lookahead)) {
      return false;
    }
  }

  return true;
}

const char TOKEN_TERMINATORS[] = {
  // Operator starts
  '@', '.', '+', '-', '^', '-', '*', '/', '<', '>', '|', '~', '=', '&', '\\', '%',
  // Delimiters
  '{', '}', '[', ']', '(', ')', '"', '\'',
  // Separators
  ',', ';',
  // Comment
  '#'
};

// Note: this is a heuristic as we only use this to distinguish word
// operators and we don't want to include complex Unicode ranges.
bool is_token_end(int32_t c) {
  for (unsigned int i = 0; i < sizeof(TOKEN_TERMINATORS); i++) {
    if (c == TOKEN_TERMINATORS[i]) {
      return true;
    }
  }

  return is_whitespace(c);
}
bool scan(TSLexer* lexer, const bool* valid_symbols) {
  TokenType token_type;
  bool is_quoted_symbol = quoted_token_type(valid_symbols, token_type);

  // Quoted content, which matches any character except for close
  // delimiters, escapes and interpolations
  if (is_quoted_symbol) {
    // TODO naming
    // TODO move all of this into a separate function like scan_quoted_content
    int32_t end_delimiter = quoted_end_delimiter(token_type);
    bool supports_interpol = quoted_is_interpol(token_type);
    uint8_t delimiter_length = quoted_delimiter_length(token_type);

    lexer->result_symbol = token_type;

    for (bool has_content = false; true; has_content = true) {
      lexer->mark_end(lexer);

      if (lexer->lookahead == end_delimiter) {
        uint8_t length = 1;

        while (length < delimiter_length) {
          advance(lexer);
          if (lexer->lookahead == end_delimiter) {
            length++;
          } else {
            break;
          }
        }

        if (length == delimiter_length) {
          return has_content;
        }
      } else {
        switch (lexer->lookahead) {
          case '#':
            advance(lexer);

            if (supports_interpol && lexer->lookahead == '{') {
              return has_content;
            }

            break;

          case '\\':
            if (supports_interpol) {
              return has_content;
            } else {
              advance(lexer);

              if (lexer->lookahead == end_delimiter) {
                return has_content;
              }
            }

            break;

          case '\0':
            return false;

          default:
            advance(lexer);
        }
      }
    }

    return false;
  }
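  // For instance, while scanning "abc#{x}" the loop above emits a
  // QUOTED_CONTENT_I_DOUBLE token covering `abc` and stops before `#{`, so the
  // interpolation is parsed by the grammar itself.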
  if (lexer->lookahead == ':') {
    if (valid_symbols[ATOM_START] || valid_symbols[KEYWORD_END]) {
      advance(lexer);

      if (is_whitespace(lexer->lookahead)) {
        if (valid_symbols[KEYWORD_END]) {
          lexer->result_symbol = KEYWORD_END;
          return true;
        }
      } else {
        if (valid_symbols[ATOM_START]) {
          return finish_atom_start(lexer);
        }
      }

      return false;
    }
  }

  bool skipped_whitespace = false;

  while (is_inline_whitespace(lexer->lookahead)) {
    skipped_whitespace = true;
    skip(lexer);
  }
  // TODO move this below together with the other functions on this level
  if (lexer->lookahead == '+') {
    if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OP]) {
      lexer->mark_end(lexer);
      advance(lexer);
      if (lexer->lookahead == '+' || lexer->lookahead == ':' || lexer->lookahead == '/') {
        return false;
      }
      if (is_whitespace(lexer->lookahead)) {
        return false;
      }
      lexer->result_symbol = BEFORE_UNARY_OP;
      return true;
    }
  }

  if (lexer->lookahead == '-') {
    if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OP]) {
      lexer->mark_end(lexer);
      advance(lexer);
      if (lexer->lookahead == '-' || lexer->lookahead == '>' || lexer->lookahead == ':' || lexer->lookahead == '/') {
        return false;
      }
      if (is_whitespace(lexer->lookahead)) {
        return false;
      }
      lexer->result_symbol = BEFORE_UNARY_OP;
      return true;
    }
  }
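  // Example: in `x -1` there is whitespace before `-` but none after it, so
  // BEFORE_UNARY_OP is emitted and `-1` is parsed as a unary operator, whereas
  // `x - 1` keeps the binary interpretation.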
  if (lexer->lookahead == 'n') {
    lexer->result_symbol = NOT_IN;
    advance(lexer);
    if (lexer->lookahead == 'o') {
      advance(lexer);
      if (lexer->lookahead == 't') {
        advance(lexer);
        while (is_inline_whitespace(lexer->lookahead)) {
          advance(lexer);
        }
        if (lexer->lookahead == 'i') {
          advance(lexer);
          if (lexer->lookahead == 'n') {
            advance(lexer);
            return is_token_end(lexer->lookahead);
          }
        }
      }
    }
    return false;
  }

  // TODO can be a separate function
  if (is_newline(lexer->lookahead) && (
        valid_symbols[NEWLINE_BEFORE_DO] ||
        valid_symbols[NEWLINE_BEFORE_BINARY_OP] ||
        valid_symbols[NEWLINE_BEFORE_COMMENT])) {
    advance(lexer);

    while (is_whitespace(lexer->lookahead)) {
      advance(lexer);
    }

    // Note we include all the whitespace after newline, so that the
    // parser doesn't have to go through it again
    lexer->mark_end(lexer);

    if (lexer->lookahead == '#') {
      lexer->result_symbol = NEWLINE_BEFORE_COMMENT;
      return true;
    }

    if (valid_symbols[NEWLINE_BEFORE_DO] && lexer->lookahead == 'd') {
      lexer->result_symbol = NEWLINE_BEFORE_DO;
      advance(lexer);
      if (lexer->lookahead == 'o') {
        advance(lexer);
        return is_token_end(lexer->lookahead);
      }
      return false;
    }

    if (valid_symbols[NEWLINE_BEFORE_BINARY_OP]) {
      lexer->result_symbol = NEWLINE_BEFORE_BINARY_OP;
      // &&, &&&
      if (lexer->lookahead == '&') {
        advance(lexer);
        if (lexer->lookahead == '&') {
          advance(lexer);
          if (lexer->lookahead == '&') {
            advance(lexer);
            return is_operator_end(lexer);
          } else {
            return is_operator_end(lexer);
          }
        }
      // =, ==, ===, =~, =>
      } else if (lexer->lookahead == '=') {
        advance(lexer);
        if (lexer->lookahead == '=') {
          advance(lexer);
          if (lexer->lookahead == '=') {
            advance(lexer);
            return is_operator_end(lexer);
          } else {
            return is_operator_end(lexer);
          }
        } else if (lexer->lookahead == '~') {
          advance(lexer);
          return is_operator_end(lexer);
        } else if (lexer->lookahead == '>') {
          advance(lexer);
          return is_operator_end(lexer);
        } else {
          return is_operator_end(lexer);
        }
      // ::
      } else if (lexer->lookahead == ':') {
        advance(lexer);
        if (lexer->lookahead == ':') {
          advance(lexer);
          // Ignore ::: atom
          if (lexer->lookahead == ':') return false;
          return is_operator_end(lexer);
        }
      // ++, +++
      } else if (lexer->lookahead == '+') {
        advance(lexer);
        if (lexer->lookahead == '+') {
          advance(lexer);
          if (lexer->lookahead == '+') {
            advance(lexer);
            return is_operator_end(lexer);
          } else {
            return is_operator_end(lexer);
          }
        }
      // --, ---, ->
      } else if (lexer->lookahead == '-') {
        advance(lexer);
        if (lexer->lookahead == '-') {
          advance(lexer);
          if (lexer->lookahead == '-') {
            advance(lexer);
            return is_operator_end(lexer);
          } else {
            return is_operator_end(lexer);
          }
        } else if (lexer->lookahead == '>') {
          advance(lexer);
          return is_operator_end(lexer);
        }
      // <, <=, <-, <>, <~, <~>, <|>, <<<, <<~
      } else if (lexer->lookahead == '<') {
        advance(lexer);
        if (lexer->lookahead == '=' ||
            lexer->lookahead == '-' ||
            lexer->lookahead == '>') {
          advance(lexer);
          return is_operator_end(lexer);
        } else if (lexer->lookahead == '~') {
          advance(lexer);
          if (lexer->lookahead == '>') {
            advance(lexer);
            return is_operator_end(lexer);
          } else {
            return is_operator_end(lexer);
          }
        } else if (lexer->lookahead == '|') {
          advance(lexer);
          if (lexer->lookahead == '>') {
            advance(lexer);
            return is_operato