Initial parser implementation

This commit is contained in:
Jonatan Kłosko 2021-09-25 02:23:37 +02:00
parent 3f19b87c70
commit dad92d2b87
31 changed files with 415172 additions and 337 deletions

7
.gitignore vendored
View File

@ -1,3 +1,8 @@
# Deps
/node_modules/
/build/
# Temporary files
/tmp/
# Temporary files generated by Tree-sitter
log.html

View File

@ -9,7 +9,7 @@
"sources": [
"bindings/node/binding.cc",
"src/parser.c",
# If your language uses an external scanner, add it here.
"src/scanner.cc"
],
"cflags_c": [
"-std=c99",

View File

@ -1,7 +1,921 @@
// Operator precedence:
// * https://hexdocs.pm/elixir/master/operators.html
// * https://github.com/elixir-lang/elixir/blob/master/lib/elixir/src/elixir_parser.yrl
// Numeric precedence levels handed to prec()/prec.left()/prec.right() by the
// operator rules below. The entries are listed from the lowest value
// (IN_MATCH_OPS) to the highest (CAPTURE_OPERAND); in tree-sitter a higher
// value wins when two rules compete. Most levels are spaced by 10; ACCESS
// (205) is slotted between UNARY_OPS and DOT_OP.
const PREC = {
IN_MATCH_OPS: 10,
WHEN_OP: 20,
TYPE_OP: 30,
BAR_OP: 40,
ASSOC_OP: 50,
CAPTURE_OP: 60,
MATCH_OP: 70,
OR_OPS: 80,
AND_OPS: 90,
COMP_OPS: 100,
REL_OPS: 110,
ARROW_OPS: 120,
IN_OPS: 130,
XOR_OP: 140,
TERNARY_OP: 150,
CONCAT_OPS: 160,
ADD_OPS: 170,
MULT_OPS: 180,
POWER_OP: 190,
UNARY_OPS: 200,
// Used by the access_call rule (`expr[key]`)
ACCESS: 205,
DOT_OP: 210,
AT_OP: 220,
// Used for capture operands like `&1`, which bind tighter than a plain `&`
CAPTURE_OPERAND: 235,
};
// Operator literals grouped by precedence level. Each list is spread into
// a choice() by the matching binary_operator/unary_operator alternative.
// The second IN_MATCH_OPS entry is the two-character operator `\\`.
const IN_MATCH_OPS = ["<-", "\\\\"];
const OR_OPS = ["||", "|||", "or"];
const AND_OPS = ["&&", "&&&", "and"];
const COMP_OPS = ["==", "!=", "=~", "===", "!=="];
const REL_OPS = ["<", ">", "<=", ">="];
const ARROW_OPS = ["|>", "<<<", ">>>", "<<~", "~>>", "<~", "~>", "<~>", "<|>"];
// Only used to build ALL_OPS: binary_operator matches "in" and the external
// $._not_in token rather than these literals.
const IN_OPS = ["in", "not in"];
const CONCAT_OPS = ["++", "--", "+++", "---", "..", "<>"];
const ADD_OPS = ["+", "-"];
const MULT_OPS = ["*", "/"];
const UNARY_OPS = ["+", "-", "!", "^", "~~~", "not"];
// Every operator literal in one flat list. The first inline array holds the
// operators that have no list of their own. Note that "+" and "-" appear
// twice (via ADD_OPS and UNARY_OPS).
const ALL_OPS = [
["->", "when", "::", "|", "=>", "&", "=", "^^^", "//", "**", ".", "@"],
IN_MATCH_OPS,
OR_OPS,
AND_OPS,
COMP_OPS,
REL_OPS,
ARROW_OPS,
IN_OPS,
CONCAT_OPS,
ADD_OPS,
MULT_OPS,
UNARY_OPS,
].flat();
// Ignore word literals and "=>" which is not a valid atom
// (any operator containing a lowercase ASCII letter, e.g. "or" or "not in",
// is dropped by the regex test)
const ATOM_OPERATOR_LITERALS = ALL_OPS.filter(
(operator) => !/[a-z]/.test(operator) && operator !== "=>"
);
// Note that for keywords we use external scanner (KEYWORD_SPECIAL_LITERAL),
// so it should be kept in sync
const ATOM_SPECIAL_LITERALS = ["...", "%{}", "{}", "%", "<<>>", "..//"];
// Word tokens used directly in the grammar
// (the keyword and _remote_dot rules also accept them, aliased to
// atom_literal and identifier respectively)
const RESERVED_WORD_TOKENS = [
// Operators
["and", "in", "not", "or", "when"],
// Literals
["true", "false", "nil"],
// Other
["after", "catch", "do", "else", "end", "fn", "rescue"],
].flat();
// Literals matched by the special_identifier rule
const SPECIAL_IDENTIFIERS = [
"__MODULE__",
"__DIR__",
"__ENV__",
"__CALLER__",
"__STACKTRACE__",
];
// Numbers
// Building blocks for the integer and float rules, which wrap `integer` and
// `float` in token(). sep1 is a function declaration further down in this
// file, so it is hoisted and callable here.
const DIGITS = /[0-9]+/;
const BIN_DIGITS = /[0-1]+/;
const OCT_DIGITS = /[0-7]+/;
const HEX_DIGITS = /[0-9a-fA-F]+/;
// Digit groups may be separated by single underscores, e.g. 1_000
const numberDec = sep1(DIGITS, "_");
const numberBin = seq("0b", sep1(BIN_DIGITS, "_"));
const numberOct = seq("0o", sep1(OCT_DIGITS, "_"));
const numberHex = seq("0x", sep1(HEX_DIGITS, "_"));
const integer = choice(numberDec, numberBin, numberOct, numberHex);
// NOTE(review): the exponent reuses `integer`, so it also admits the
// 0b/0o/0x forms (e.g. `1.0e0x1F`) - confirm whether it should be
// decimal-only (numberDec).
const floatScientificPart = seq(/[eE]/, optional(choice("-", "+")), integer);
const float = seq(numberDec, ".", numberDec, optional(floatScientificPart));
// A single alias segment; the alias rules join segments with "."
const aliasPart = /[A-Z][_a-zA-Z0-9]*/;
module.exports = grammar({
name: "elixir",
// TODO describe stuff (also in the separate notes doc add clarification
// how we use this verbose tokens to avoid needing scanner state)
externals: ($) => [
$._quoted_content_i_single,
$._quoted_content_i_double,
$._quoted_content_i_heredoc_single,
$._quoted_content_i_heredoc_double,
$._quoted_content_i_parenthesis,
$._quoted_content_i_curly,
$._quoted_content_i_square,
$._quoted_content_i_angle,
$._quoted_content_i_bar,
$._quoted_content_i_slash,
$._quoted_content_single,
$._quoted_content_double,
$._quoted_content_heredoc_single,
$._quoted_content_heredoc_double,
$._quoted_content_parenthesis,
$._quoted_content_curly,
$._quoted_content_square,
$._quoted_content_angle,
$._quoted_content_bar,
$._quoted_content_slash,
$._keyword_special_literal,
$._atom_start,
$._keyword_end,
$._newline_before_do,
$._newline_before_binary_op,
// TODO explain this, basically we use newline ignored for newline before comment,
// as after the comment there is another newline that we then consider as usual (so
// that comments are skipped when considering newlines) <- this is chaotic need a better one
$._newline_before_comment,
// TODO explain this, basically we use this to force unary + and -
// if there is no spacing before the operand
$._before_unary_op,
$._not_in,
],
// TODO include in notes about why using extra for newline before binary op is fine
// TODO figure out how "\n" helps with the behaviour in
// [
// :a,
// ]
// and how it generally works with extras
extras: ($) => [
$.comment,
/\s|\\\n/,
$._newline_before_binary_op,
$._newline_before_comment,
"\n",
],
// TODO check if the parser doesn't compile without each conflict rule,
// otherwise it means we don't really use it (I think)
conflicts: ($) => [
// [$._newline_before_binary_op],
[$.binary_operator],
[$.keywords],
// [$.identifier, $.atom_literal],
[$._expression, $._local_call_with_arguments],
[
$._expression,
$._local_call_with_arguments,
$._local_call_without_arguments,
],
[$._remote_call, $._parenthesised_remote_call],
// stab clause `(x` may be either `(x;y) ->` or `(x, y) ->`
// [$.block, $._stab_clause_arguments],
[$.block, $._stab_clause_parentheses_arguments],
[$.block, $._stab_clause_arguments],
[$.block, $._stab_clause_arguments_expression],
// when in stab clause
[$.binary_operator, $._stab_clause_arguments_expression],
[$.tuple, $.map],
[$.tuple, $.map_content],
[$.operator_identifier, $.stab_clause],
[$.unary_operator, $.operator_identifier],
// [$.alias],
[$.body],
// [$.block, $._stab_clause_arguments],
// [$.block, $._stab_clause_parentheses_arguments],
// [$.block, $._stab_clause_parentheses_arguments],
[$.after_block],
[$.rescue_block],
[$.catch_block],
[$.else_block],
],
rules: {
source: ($) => "TODO",
source: ($) =>
seq(
optional($._terminator),
optional(
seq(sep1($._expression, $._terminator), optional($._terminator))
)
),
_terminator: ($) =>
prec.right(choice(seq(repeat("\n"), ";"), repeat1("\n"))),
_expression: ($) =>
choice(
$.block,
$._identifier,
$.alias,
$.integer,
$.float,
$.atom,
$.string,
$.charlist,
$.sigil,
$.list,
$.tuple,
$.bitstring,
$.map,
$.char,
$.boolean,
$.nil,
$.unary_operator,
$.binary_operator,
$.dot,
$.call,
$.access_call,
$.anonymous_function
),
block: ($) =>
prec(
PREC.WHEN_OP,
seq(
"(",
seq(
optional($._terminator),
optional(
seq(
sep1(choice($._expression, $.stab_clause), $._terminator),
optional($._terminator)
)
)
),
")"
)
),
_identifier: ($) =>
choice($.identifier, $.unused_identifier, $.special_identifier),
// Note: Elixir does not allow uppercase and titlecase letters
// as a variable starting character, but this regex would match
// those. This implies we would happily parse those cases, but
// since they are not valid Elixir it's unlikely to stumble upon
// them. TODO reword
// Ref: https://hexdocs.pm/elixir/master/unicode-syntax.html#variables
// TODO see if we need this in custom scanner in the end, if we do,
// then we may use the generation script from the original repo instead
// and make this an external (though I'd check if these custom unicode
// functions are efficient, does compiler optimise such checks?)
// identifier: ($) => choice(/[\p{ID_Start}][\p{ID_Continue}]*[?!]?/u, "..."),
// identifier: ($) => choice(/[\p{Ll}\p{Lm}\p{Lo}\p{Nl}\p{Other_ID_Start}][\p{ID_Continue}]*[?!]?/u, "..."),
// identifier: ($) => choice(/[\p{Ll}\p{Lm}\p{Lo}\p{Nl}][\p{ID_Continue}]*[?!]?/u, "..."),
//
// TODO elaborate, but basically
//
// we remove uppercase/titlecase letters from ID_Start as elixir does
// we remove the subtractions (we cannot express group subtraction in regex),
// but it's fine because at the time of writing these groups only really subtract
// a single character
// Unicode.Set.to_utf8_char "[[[:L:][:Nl:][:Other_ID_Start:]] & [[:Pattern_Syntax:][:Pattern_White_Space:]]]"
// we use hardcoded codepoints for \p{Other_ID_Start} since treesitter/js regexp doesn't
// recognise this group
//
// Other_ID_Start \u1885\u1886\u2118\u212E\u309B\u309C
// (this the list at the time of writing, it's for backward compatibility, see https://unicode.org/reports/tr31/#Backward_Compatibility)
identifier: ($) =>
choice(
/[\p{Ll}\p{Lm}\p{Lo}\p{Nl}\u1885\u1886\u2118\u212E\u309B\u309C][\p{ID_Continue}]*[?!]?/u,
"..."
),
unused_identifier: ($) => /_[\p{ID_Continue}]*[?!]?/u,
special_identifier: ($) => choice(...SPECIAL_IDENTIFIERS),
// We have a separate rule for single-part alias, so that we
// can use it in the keywords rule
alias: ($) => choice($._alias_single, $._alias_multi),
_alias_single: ($) => aliasPart,
_alias_multi: ($) => token(sep1(aliasPart, /\s*\.\s*/)),
integer: ($) => token(integer),
float: ($) => token(float),
atom: ($) =>
seq(
$._atom_start,
choice(
alias($._atom_word_literal, $.atom_literal),
alias($._atom_operator_literal, $.atom_literal),
alias($._atom_special_literal, $.atom_literal),
$._quoted_i_double,
$._quoted_i_single
)
),
// TODO comment on the unicode groups here
_atom_word_literal: ($) => /[\p{ID_Start}_][\p{ID_Continue}@]*[?!]?/u,
_atom_operator_literal: ($) => choice(...ATOM_OPERATOR_LITERALS),
_atom_special_literal: ($) => choice(...ATOM_SPECIAL_LITERALS),
// Defines $._quoted_content_i_{name} and $._quoted_content_{name} rules,
// content with and without interpolation respectively
...defineQuoted(`"`, `"`, "double"),
...defineQuoted(`'`, `'`, "single"),
...defineQuoted(`'''`, `'''`, "heredoc_single"),
...defineQuoted(`"""`, `"""`, "heredoc_double"),
...defineQuoted(`(`, `)`, "parenthesis"),
...defineQuoted(`{`, `}`, "curly"),
...defineQuoted(`[`, `]`, "square"),
...defineQuoted(`<`, `>`, "angle"),
...defineQuoted(`|`, `|`, "bar"),
...defineQuoted(`/`, `/`, "slash"),
string: ($) => choice($._quoted_i_double, $._quoted_i_heredoc_double),
charlist: ($) => choice($._quoted_i_single, $._quoted_i_heredoc_single),
interpolation: ($) => seq("#{", $._expression, "}"),
escape_sequence: ($) =>
token(
seq(
"\\",
choice(
// Single escaped character
/[^ux]/,
// Hex byte
/x[0-9a-fA-F]{1,2}/,
/x{[0-9a-fA-F]+}/,
// Unicode code point
/u{[0-9a-fA-F]+}/,
/u[0-9a-fA-F]{4}/
)
)
),
sigil: ($) =>
seq(
"~",
choice(
seq(
alias(token.immediate(/[a-z]/), $.sigil_name),
choice(
$._quoted_i_double,
$._quoted_i_single,
$._quoted_i_heredoc_single,
$._quoted_i_heredoc_double,
$._quoted_i_parenthesis,
$._quoted_i_curly,
$._quoted_i_square,
$._quoted_i_angle,
$._quoted_i_bar,
$._quoted_i_slash
)
),
seq(
alias(token.immediate(/[A-Z]/), $.sigil_name),
choice(
$._quoted_double,
$._quoted_single,
$._quoted_heredoc_single,
$._quoted_heredoc_double,
$._quoted_parenthesis,
$._quoted_curly,
$._quoted_square,
$._quoted_angle,
$._quoted_bar,
$._quoted_slash
)
)
),
optional(alias(token.immediate(/[a-zA-Z]+/), $.sigil_modifiers))
),
unary_operator: ($) =>
choice(
unaryOp($, prec, PREC.CAPTURE_OP, "&", $._capture_expression),
unaryOp($, prec, PREC.UNARY_OPS, choice(...UNARY_OPS)),
unaryOp($, prec, PREC.AT_OP, "@"),
// Capture operand like &1 is a special case with higher precedence
unaryOp($, prec, PREC.CAPTURE_OPERAND, "&", $.integer)
),
_capture_expression: ($) =>
choice(
// TODO should parenthesised expression be generally used (?)
// Precedence over block expression
prec(PREC.WHEN_OP + 1, seq("(", $._expression, ")")),
$._expression
),
binary_operator: ($) =>
choice(
binaryOp($, prec.left, PREC.IN_MATCH_OPS, choice(...IN_MATCH_OPS)),
binaryOp(
$,
prec.right,
PREC.WHEN_OP,
"when",
$._expression,
choice($._expression, $.keywords)
),
binaryOp($, prec.right, PREC.TYPE_OP, "::"),
binaryOp(
$,
prec.right,
PREC.BAR_OP,
"|",
$._expression,
choice($._expression, $.keywords)
),
binaryOp($, prec.right, PREC.ASSOC_OP, "=>"),
binaryOp($, prec.right, PREC.MATCH_OP, "="),
binaryOp($, prec.left, PREC.OR_OPS, choice(...OR_OPS)),
binaryOp($, prec.left, PREC.AND_OPS, choice(...AND_OPS)),
binaryOp($, prec.left, PREC.COMP_OPS, choice(...COMP_OPS)),
binaryOp($, prec.left, PREC.REL_OPS, choice(...REL_OPS)),
binaryOp($, prec.left, PREC.ARROW_OPS, choice(...ARROW_OPS)),
binaryOp($, prec.left, PREC.IN_OPS, choice("in", $._not_in)),
binaryOp($, prec.left, PREC.XOR_OP, "^^^"),
binaryOp($, prec.right, PREC.TERNARY_OP, "//"),
binaryOp($, prec.right, PREC.CONCAT_OPS, choice(...CONCAT_OPS)),
binaryOp($, prec.left, PREC.ADD_OPS, choice(...ADD_OPS)),
binaryOp($, prec.left, PREC.MULT_OPS, choice(...MULT_OPS)),
binaryOp($, prec.left, PREC.POWER_OP, "**"),
// Operator with arity
binaryOp(
$,
prec.left,
PREC.MULT_OPS,
"/",
$.operator_identifier,
$.integer
)
),
operator_identifier: ($) =>
// Operators with the following changes:
// * exclude "=>" since it's not a valid atom/operator identifier anyway (valid only in map)
// * we exclude // since it's only valid after ..
// * we remove "-" and "+" since they are both unary and binary
// We use the same precedence as unary operators, so that a sequence
// like `& /` is a conflict and is resolved via $.conflicts
// (could be be either `& / 2` or `& / / 2`)
choice(
// Unary operators
prec(PREC.CAPTURE_OP, "&"),
prec(PREC.UNARY_OPS, choice(...UNARY_OPS)),
prec(PREC.AT_OP, "@"),
// Binary operators
...IN_MATCH_OPS,
"when",
"::",
"|",
"=",
...OR_OPS,
...AND_OPS,
...COMP_OPS,
...REL_OPS,
...ARROW_OPS,
"in",
$._not_in,
"^^",
...CONCAT_OPS,
...MULT_OPS,
"**",
"->",
"."
),
dot: ($) =>
prec(
PREC.DOT_OP,
seq(choice($._expression), ".", choice($.alias, $.tuple))
),
keywords: ($) => sep1($.pair, ","),
pair: ($) => seq($.keyword, $._expression),
keyword: ($) =>
seq(
// Tree-sitter doesn't consider ambiguities within individual
// tokens (in this case regexps). So both in [a] and [a: 1] it
// would always parse "a" as the same node (based on whether
// $.identifier or $.atom_literal) is listed first in the rules.
// However, since identifiers and alias parts are valid atom
// literals, we can list them here, in which case the parser will
// consider all paths and pick the valid one.
// Also see https://github.com/tree-sitter/tree-sitter/issues/518
choice(
alias($._atom_word_literal, $.atom_literal),
alias($._atom_operator_literal, $.atom_literal),
alias($._keyword_special_literal, $.atom_literal),
alias($.identifier, $.atom_literal),
alias($.unused_identifier, $.atom_literal),
alias($.special_identifier, $.atom_literal),
alias($._alias_single, $.atom_literal),
alias(choice(...RESERVED_WORD_TOKENS), $.atom_literal),
$._quoted_i_double,
$._quoted_i_single
),
$._keyword_end
),
list: ($) => seq("[", optional($._items_with_trailing_separator), "]"),
tuple: ($) => seq("{", optional($._items_with_trailing_separator), "}"),
bitstring: ($) =>
seq("<<", optional($._items_with_trailing_separator), ">>"),
map: ($) => seq("%", optional($.struct), "{", optional($.map_content), "}"),
struct: ($) =>
prec.left(
choice(
$.alias,
$.atom,
$._identifier,
$.unary_operator,
$.dot,
alias($._parenthesised_call, $.call)
)
),
map_content: ($) => $._items_with_trailing_separator,
_items_with_trailing_separator: ($) =>
seq(
choice(
seq(sep1($._expression, ","), optional(seq(",", $.keywords))),
$.keywords
),
optional(",")
),
char: ($) => /\?(.|\\.)/,
boolean: ($) => choice("true", "false"),
nil: ($) => "nil",
call: ($) =>
choice(
$._local_call_with_arguments,
$._parenthesised_local_call_with_arguments,
$._local_call_without_arguments,
$._remote_call,
$._parenthesised_remote_call,
$._anonymous_call,
$._call_on_call
),
_parenthesised_call: ($) =>
choice(
$._parenthesised_local_call_with_arguments,
$._parenthesised_remote_call,
$._anonymous_call,
$._call_on_call
),
_call_on_call: ($) =>
prec.left(
seq(
alias(
choice(
$._parenthesised_local_call_with_arguments,
$._parenthesised_remote_call,
$._anonymous_call
),
$.call
),
// arguments in parentheses
// alias($._local_or_remote_arguments, $.arguments),
// TODO just make nonimmediate/immediate in the name
alias($._anonymous_arguments, $.arguments),
optional(seq(optional($._newline_before_do), $.do_block))
)
),
_local_call_with_arguments: ($) =>
// Given `x + y` it can be interpreted either as a binary operator
// or a call with unary operator. This is an actual ambiguity, so
// we use dynamic precedence to penalize call
// prec.dynamic(
// TODO ideally we would penalize whitespace after unary op,
// so that x + y is binary op and x +y is unary op, to reflect
// Elixir ast
// -1,
prec.left(
seq(
$._identifier,
alias($._call_arguments, $.arguments),
// TODO include this in notes:
// We use external scanner for _newline_before_do because
// this way we can lookahead through any whitespace
// (especially newlines). We cannot simply use repeat("\n")
// and conflict with expression end, because this function
// rule has left precedence (so that do-end sticks to the outermost
// call), and thus expression end would always be preferred
optional(seq(optional($._newline_before_do), $.do_block))
// optional($.do_block)
)
// )
),
_parenthesised_local_call_with_arguments: ($) =>
// Given `x + y` it can be interpreted either as a binary operator
// or a call with unary operator. This is an actual ambiguity, so
// we use dynamic precedence to penalize call
// prec.dynamic(
// TODO ideally we would penalize whitespace after unary op,
// so that x + y is binary op and x +y is unary op, to reflect
// Elixir ast
// -1,
prec.left(
seq(
$._identifier,
alias($._parenthesised_call_arguments, $.arguments),
// TODO include this in notes:
// We use external scanner for _newline_before_do because
// this way we can lookahead through any whitespace
// (especially newlines). We cannot simply use repeat("\n")
// and conflict with expression end, because this function
// rule has left precedence (so that do-end sticks to the outermost
// call), and thus expression end would always be preferred
optional(seq(optional($._newline_before_do), $.do_block))
// optional($.do_block)
)
// )
),
_local_call_without_arguments: ($) =>
// We use lower precedence, so given `fun arg do end`
// we don't tokenize `arg` as a call
// we actually need a conflict because of `foo bar do end` vs `foo bar do: 1`
// prec(-1,
prec.dynamic(-1, seq($._identifier, $.do_block)),
// )
_remote_call: ($) =>
prec.left(
seq(
alias($._remote_dot, $.dot),
optional(alias($._call_arguments, $.arguments)),
optional(seq(optional($._newline_before_do), $.do_block))
// optional($.do_block)
)
),
_parenthesised_remote_call: ($) =>
prec.left(
seq(
alias($._remote_dot, $.dot),
alias($._parenthesised_call_arguments, $.arguments),
optional(seq(optional($._newline_before_do), $.do_block))
// optional($.do_block)
)
),
_remote_dot: ($) =>
prec(
PREC.DOT_OP,
seq(
$._expression,
".",
// TODO can also be string, anything else?
// compare with the other parser
// TODO we don't want to support heredoc though
choice(
$._identifier,
alias(choice(...RESERVED_WORD_TOKENS), $.identifier),
$.operator_identifier,
alias($._quoted_i_double, $.string),
alias($._quoted_i_single, $.charlist)
)
)
),
_parenthesised_call_arguments: ($) =>
seq(token.immediate("("), optional($._call_arguments), ")"),
_anonymous_call: ($) =>
seq(
alias($._anonymous_dot, $.dot),
alias($._anonymous_arguments, $.arguments)
),
_anonymous_dot: ($) => prec(PREC.DOT_OP, seq($._expression, ".")),
_anonymous_arguments: ($) => seq("(", optional($._call_arguments), ")"),
_call_arguments: ($) =>
// Right precedence ensures that `fun1 fun2 x, y` is treated
// as `fun1(fun2(x, y))` and not `fun1(fun2(x), y)
prec.right(
seq(
choice(
seq(
sep1($._expression, ","),
optional(seq(",", $.keywords, optional(",")))
),
seq($.keywords, optional(","))
)
)
),
access_call: ($) =>
prec(
PREC.ACCESS,
seq($._expression, token.immediate("["), $._expression, "]")
),
do_block: ($) =>
seq(
sugarBlock($, "do"),
repeat(
choice($.after_block, $.rescue_block, $.catch_block, $.else_block)
),
"end"
),
after_block: ($) => sugarBlock($, "after"),
rescue_block: ($) => sugarBlock($, "rescue"),
catch_block: ($) => sugarBlock($, "catch"),
else_block: ($) => sugarBlock($, "else"),
// Specify right precedence, so that we consume as much as we can
stab_clause: ($) =>
prec.right(seq(optional($._stab_clause_left), "->", optional($.body))),
_stab_clause_left: ($) =>
choice(
// Note the first option has higher precedence, TODO clarify
alias($._stab_clause_parentheses_arguments, $.arguments),
// TODO naming/cleanup
alias(
$._stab_clause_parentheses_arguments_with_guard,
$.binary_operator
),
alias($._stab_clause_arguments, $.arguments),
alias($._stab_clause_arguments_with_guard, $.binary_operator)
),
_stab_clause_parentheses_arguments: ($) =>
// `(1) ->` may be interpreted either as block argument
// or argument in parentheses and we use dynamic precedence
// to favour the latter
prec(
PREC.WHEN_OP,
prec.dynamic(1, seq("(", optional($._stab_clause_arguments), ")"))
),
_stab_clause_parentheses_arguments_with_guard: ($) =>
seq(
alias($._stab_clause_parentheses_arguments, $.arguments),
"when",
$._expression
),
_stab_clause_arguments_with_guard: ($) =>
// `a when b ->` may be interpreted either such that `a when b` is an argument
// or a guard binary operator with argument `a` and right operand `b`,
// we use dynamic precedence to favour the latter
prec.dynamic(
1,
seq(alias($._stab_clause_arguments, $.arguments), "when", $._expression)
),
_stab_clause_arguments: ($) =>
// TODO this is a variant of _items_with_trailing_separator, cleanup
choice(
seq(
sep1($._stab_clause_arguments_expression, ","),
optional(seq(",", $.keywords))
),
$.keywords
),
_stab_clause_arguments_expression: ($) =>
// Note here we use the same precedence as when operator,
// so we get a conflict and resolve it dynamically
prec(PREC.WHEN_OP, $._expression),
body: ($) =>
seq(
choice(
seq($._terminator, sep($._expression, $._terminator)),
sep1($._expression, $._terminator)
),
optional($._terminator)
),
anonymous_function: ($) =>
seq(
"fn",
optional($._terminator),
sep1($.stab_clause, $._terminator),
"end"
),
// A comment may be anywhere, we give it a lower precedence,
// so it doesn't intercept sequences such as interpolation
comment: ($) => token(prec(-1, seq("#", /.*/))),
},
});
/**
 * Builds a rule matching one or more occurrences of `rule`, with
 * `separator` between consecutive occurrences.
 *
 * Kept as a function declaration because it is called by top-level
 * constants that appear earlier in the file and relies on hoisting.
 */
function sep1(rule, separator) {
  const separatedTail = repeat(seq(separator, rule));
  return seq(rule, separatedTail);
}
/**
 * Builds a rule matching zero or more occurrences of `rule`, with
 * `separator` between consecutive occurrences.
 */
function sep(rule, separator) {
  const oneOrMore = sep1(rule, separator);
  return optional(oneOrMore);
}
/**
 * Builds a unary operator rule.
 *
 * @param $ grammar rule references
 * @param assoc precedence wrapper, called as `assoc(precedence, rule)`
 * @param precedence precedence level passed to `assoc`
 * @param operator rule for the operator, exposed as the "operator" field
 * @param right operand rule; defaults to `$._expression` when falsy
 */
function unaryOp($, assoc, precedence, operator, right = null) {
  // The body gets a negative dynamic precedence, so given `x + y`,
  // which can be read either as `x + y` or as `x(+y)`, the former is
  // favoured. The exception is when the external _before_unary_op
  // token matches, which forces the latter reading (as in `x +y`).
  const penalisedBody = prec.dynamic(
    -1,
    seq(
      optional($._before_unary_op),
      field("operator", operator),
      right ? right : $._expression
    )
  );
  return assoc(precedence, penalisedBody);
}
/**
 * Builds a binary operator rule with "left", "operator" and "right" fields.
 *
 * @param $ grammar rule references
 * @param assoc precedence wrapper, called as `assoc(precedence, rule)`
 * @param precedence precedence level passed to `assoc`
 * @param operator rule for the operator
 * @param left left operand rule; defaults to `$._expression` when falsy
 * @param right right operand rule; defaults to `$._expression` when falsy
 */
function binaryOp($, assoc, precedence, operator, left = null, right = null) {
  const operands = seq(
    field("left", left ? left : $._expression),
    field("operator", operator),
    field("right", right ? right : $._expression)
  );
  return assoc(precedence, operands);
}
/**
 * Builds the body shared by `do`, `after`, `rescue`, `catch` and `else`
 * sections: the `start` keyword, an optional terminator, and then
 * optionally either a list of stab clauses or a list of expressions
 * (the latter with an optional trailing terminator).
 */
function sugarBlock($, start) {
  const stabClauses = sep1(choice($.stab_clause), $._terminator);
  const expressions = seq(
    sep1(choice($._expression), $._terminator),
    optional($._terminator)
  );
  const contents = optional(choice(stabClauses, expressions));
  return seq(start, optional($._terminator), contents);
}
/**
 * Defines the two quoted-content rules for one delimiter pair:
 * `_quoted_i_{name}` (content with interpolation) and `_quoted_{name}`
 * (content without interpolation).
 *
 * @param start opening delimiter
 * @param end closing delimiter
 * @param name suffix used in the rule names and in the external
 *   `_quoted_content_i_{name}` / `_quoted_content_{name}` token names
 * @returns an object with the two rule functions, to be spread into `rules`
 */
function defineQuoted(start, end, name) {
  // Wraps the allowed content pieces between the two delimiters
  const delimited = (...pieces) => seq(start, repeat(choice(...pieces)), end);

  return {
    [`_quoted_i_${name}`]: ($) =>
      delimited(
        // TODO rename the externals to _content
        alias($[`_quoted_content_i_${name}`], $.string_content),
        $.interpolation,
        $.escape_sequence
      ),
    [`_quoted_${name}`]: ($) =>
      delimited(
        // TODO rename the externals to _content
        alias($[`_quoted_content_${name}`], $.string_content),
        // It's always possible to escape the end delimiter
        $.escape_sequence
      ),
  };
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

403241
src/parser.c

File diff suppressed because it is too large Load Diff

844
src/scanner.cc Normal file
View File

@ -0,0 +1,844 @@
#include <tree_sitter/parser.h>
namespace {
// External token types produced by this scanner. The enumerators are
// listed in the same order as the `externals` array in grammar.js;
// tree-sitter matches external tokens to enum values by position, so the
// two lists have to be kept in sync.
enum TokenType {
// NOTE: the QUOTED_CONTENT_* tokens are meant to be mutually exclusive,
// i.e. while parsing quoted content the valid_symbols array contains at
// most one truthy entry among them (see quoted_token_type).
// The *_I_* variants are the ones that support interpolation.
QUOTED_CONTENT_I_SINGLE,
QUOTED_CONTENT_I_DOUBLE,
QUOTED_CONTENT_I_HEREDOC_SINGLE,
QUOTED_CONTENT_I_HEREDOC_DOUBLE,
QUOTED_CONTENT_I_PARENTHESIS,
QUOTED_CONTENT_I_CURLY,
QUOTED_CONTENT_I_SQUARE,
QUOTED_CONTENT_I_ANGLE,
QUOTED_CONTENT_I_BAR,
QUOTED_CONTENT_I_SLASH,
QUOTED_CONTENT_SINGLE,
QUOTED_CONTENT_DOUBLE,
QUOTED_CONTENT_HEREDOC_SINGLE,
QUOTED_CONTENT_HEREDOC_DOUBLE,
QUOTED_CONTENT_PARENTHESIS,
QUOTED_CONTENT_CURLY,
QUOTED_CONTENT_SQUARE,
QUOTED_CONTENT_ANGLE,
QUOTED_CONTENT_BAR,
QUOTED_CONTENT_SLASH,
KEYWORD_SPECIAL_LITERAL,
ATOM_START,
KEYWORD_END,
NEWLINE_BEFORE_DO,
NEWLINE_BEFORE_BINARY_OP,
NEWLINE_BEFORE_COMMENT,
BEFORE_UNARY_OP,
NOT_IN
};
bool quoted_token_type(const bool* valid_symbols, TokenType& token_type) {
// Quoted symbols are mutually exclusive and only one should
// be valid at a time. If multiple are valid it means we parse
// an arbitrary code outside quotes, in which case we don't
// want to tokenize it as quoted content.
if (valid_symbols[QUOTED_CONTENT_I_SINGLE] && valid_symbols[QUOTED_CONTENT_I_DOUBLE]) {
return false;
}
if (valid_symbols[QUOTED_CONTENT_I_SINGLE]) {
token_type = QUOTED_CONTENT_I_SINGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_DOUBLE]) {
token_type = QUOTED_CONTENT_I_DOUBLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_HEREDOC_SINGLE]) {
token_type = QUOTED_CONTENT_I_HEREDOC_SINGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_HEREDOC_DOUBLE]) {
token_type = QUOTED_CONTENT_I_HEREDOC_DOUBLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_PARENTHESIS]) {
token_type = QUOTED_CONTENT_I_PARENTHESIS;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_CURLY]) {
token_type = QUOTED_CONTENT_I_CURLY;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_SQUARE]) {
token_type = QUOTED_CONTENT_I_SQUARE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_ANGLE]) {
token_type = QUOTED_CONTENT_I_ANGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_BAR]) {
token_type = QUOTED_CONTENT_I_BAR;
return true;
}
if (valid_symbols[QUOTED_CONTENT_I_SLASH]) {
token_type = QUOTED_CONTENT_I_SLASH;
return true;
}
if (valid_symbols[QUOTED_CONTENT_SINGLE]) {
token_type = QUOTED_CONTENT_SINGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_DOUBLE]) {
token_type = QUOTED_CONTENT_DOUBLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_HEREDOC_SINGLE]) {
token_type = QUOTED_CONTENT_HEREDOC_SINGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_HEREDOC_DOUBLE]) {
token_type = QUOTED_CONTENT_HEREDOC_DOUBLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_PARENTHESIS]) {
token_type = QUOTED_CONTENT_PARENTHESIS;
return true;
}
if (valid_symbols[QUOTED_CONTENT_CURLY]) {
token_type = QUOTED_CONTENT_CURLY;
return true;
}
if (valid_symbols[QUOTED_CONTENT_SQUARE]) {
token_type = QUOTED_CONTENT_SQUARE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_ANGLE]) {
token_type = QUOTED_CONTENT_ANGLE;
return true;
}
if (valid_symbols[QUOTED_CONTENT_BAR]) {
token_type = QUOTED_CONTENT_BAR;
return true;
}
if (valid_symbols[QUOTED_CONTENT_SLASH]) {
token_type = QUOTED_CONTENT_SLASH;
return true;
}
return false;
}
// Returns the closing delimiter character for a quoted-content token
// type. For heredoc types this is the single character that the
// delimiter repeats (see quoted_delimiter_length for the repeat count).
//
// Returns 0 for token types that are not quoted content. The caller
// visible in this file only passes types obtained from
// quoted_token_type, so that branch is not expected to be taken; it
// replaces a compiler-specific __builtin_unreachable() so the scanner
// also builds with compilers that lack that builtin.
int32_t quoted_end_delimiter(TokenType token_type) {
  switch (token_type) {
    case QUOTED_CONTENT_I_SINGLE:
    case QUOTED_CONTENT_SINGLE:
    case QUOTED_CONTENT_I_HEREDOC_SINGLE:
    case QUOTED_CONTENT_HEREDOC_SINGLE:
      return '\'';
    case QUOTED_CONTENT_I_DOUBLE:
    case QUOTED_CONTENT_DOUBLE:
    case QUOTED_CONTENT_I_HEREDOC_DOUBLE:
    case QUOTED_CONTENT_HEREDOC_DOUBLE:
      return '\"';
    case QUOTED_CONTENT_I_PARENTHESIS:
    case QUOTED_CONTENT_PARENTHESIS:
      return ')';
    case QUOTED_CONTENT_I_CURLY:
    case QUOTED_CONTENT_CURLY:
      return '}';
    case QUOTED_CONTENT_I_SQUARE:
    case QUOTED_CONTENT_SQUARE:
      return ']';
    case QUOTED_CONTENT_I_ANGLE:
    case QUOTED_CONTENT_ANGLE:
      return '>';
    case QUOTED_CONTENT_I_BAR:
    case QUOTED_CONTENT_BAR:
      return '|';
    case QUOTED_CONTENT_I_SLASH:
    case QUOTED_CONTENT_SLASH:
      return '/';
    default:
      // Not a quoted-content token
      return 0;
  }
}
// Returns how many times the delimiter character is repeated in the
// closing delimiter: 3 for the heredoc token types, 1 for all others.
uint8_t quoted_delimiter_length(TokenType token_type) {
  const bool is_heredoc =
    token_type == QUOTED_CONTENT_I_HEREDOC_SINGLE ||
    token_type == QUOTED_CONTENT_I_HEREDOC_DOUBLE ||
    token_type == QUOTED_CONTENT_HEREDOC_SINGLE ||
    token_type == QUOTED_CONTENT_HEREDOC_DOUBLE;

  return is_heredoc ? 3 : 1;
}
// Tells whether the given token type is one of the QUOTED_CONTENT_I_*
// (interpolating) variants.
bool quoted_is_interpol(TokenType token_type) {
  // The interpolating variants are declared first in TokenType, as one
  // contiguous run from QUOTED_CONTENT_I_SINGLE to QUOTED_CONTENT_I_SLASH,
  // so a range check covers exactly those enumerators.
  return QUOTED_CONTENT_I_SINGLE <= token_type &&
         token_type <= QUOTED_CONTENT_I_SLASH;
}
// Tells whether `c` is a space, tab, vertical tab, line feed, form feed
// or carriage return.
bool is_whitespace(int32_t c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\v':
    case '\n':
    case '\f':
    case '\r':
      return true;
    default:
      return false;
  }
}
// Tells whether `c` is whitespace that does not end the line: a space,
// tab or vertical tab.
bool is_inline_whitespace(int32_t c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\v':
      return true;
    default:
      return false;
  }
}
// Tells whether `c` is a line feed.
// TODO decide how \f and \r should be treated; they currently do not
// count as newlines
bool is_newline(int32_t c) {
  const int32_t line_feed = '\n';
  return c == line_feed;
}
// Moves the lexer to the next character, passing `false` for the
// lexer's skip flag.
void advance(TSLexer* lexer) {
  const bool skip_character = false;
  lexer->advance(lexer, skip_character);
}
// Moves the lexer to the next character, passing `true` for the
// lexer's skip flag.
void skip(TSLexer *lexer) {
  const bool skip_character = true;
  lexer->advance(lexer, skip_character);
}
// Completes scanning of an ATOM_START token. The leading ':' has already
// been consumed by the caller; this marks the token end right after it
// and then looks ahead to decide whether an atom really starts here.
//
// Returns true when the ':' is followed by a non-whitespace character
// other than ':', or by exactly "::" (the `:::` case).
bool finish_atom_start(TSLexer* lexer) {
  lexer->mark_end(lexer);
  lexer->result_symbol = ATOM_START;

  if (lexer->lookahead != ':') {
    return !is_whitespace(lexer->lookahead);
  }

  // We have seen "::" so far; only ":::" is accepted
  advance(lexer);
  return lexer->lookahead == ':';
}
// Tells whether the lexer is positioned at a ':' that is followed by
// whitespace. When the lookahead is ':' the lexer is advanced past it,
// regardless of the result.
bool is_keyword_end(TSLexer* lexer) {
  if (lexer->lookahead != ':') {
    return false;
  }

  advance(lexer);
  return is_whitespace(lexer->lookahead);
}
// Completes scanning of a KEYWORD_SPECIAL_LITERAL token: the token ends
// at the current position, and it is accepted only if a keyword end
// (':' followed by whitespace) comes next.
bool finish_keyword(TSLexer* lexer) {
  lexer->result_symbol = KEYWORD_SPECIAL_LITERAL;
  lexer->mark_end(lexer);

  const bool followed_by_keyword_end = is_keyword_end(lexer);
  return followed_by_keyword_end;
}
// Tells whether `c` is an ASCII decimal digit.
bool is_digit(int32_t c) {
  return c >= '0' && c <= '9';
}
// Looks past an operator to tell whether it really ends here as an
// operator. Returns false when what follows makes it part of something
// else:
//   * a keyword end (':' followed by whitespace)
//   * an arity suffix, i.e. optional inline whitespace, '/', optional
//     whitespace and a digit (operator identifier with arity)
// The lexer is advanced while looking ahead.
bool is_operator_end(TSLexer* lexer) {
  // Keyword
  if (lexer->lookahead == ':') {
    return !is_keyword_end(lexer);
  }

  while (is_inline_whitespace(lexer->lookahead)) {
    advance(lexer);
  }

  if (lexer->lookahead != '/') {
    return true;
  }

  // Operator identifier with arity
  advance(lexer);
  while (is_whitespace(lexer->lookahead)) {
    advance(lexer);
  }

  return !is_digit(lexer->lookahead);
}
// Characters that end a word token, as checked by is_token_end. Each
// character is listed once.
const char TOKEN_TERMINATORS[] = {
  // Operator starts
  '@', '.', '+', '-', '^', '*', '/', '<', '>', '|', '~', '=', '&', '\\', '%',
  // Delimiters
  '{', '}', '[', ']', '(', ')', '"', '\'',
  // Separators
  ',', ';',
  // Comment
  '#'
};
// Tells whether `c` ends a word token: either whitespace or one of
// TOKEN_TERMINATORS.
// Note: this is a heuristic as we only use this to distinguish word
// operators and we don't want to include complex Unicode ranges.
bool is_token_end(int32_t c) {
  const char* const terminators_end = TOKEN_TERMINATORS + sizeof(TOKEN_TERMINATORS);

  for (const char* terminator = TOKEN_TERMINATORS; terminator != terminators_end; ++terminator) {
    if (*terminator == c) {
      return true;
    }
  }

  return is_whitespace(c);
}
bool scan(TSLexer* lexer, const bool* valid_symbols) {
TokenType token_type;
bool is_quoted_symbol = quoted_token_type(valid_symbols, token_type);
// Quoted content, which matches any character except for close
// delimiters, escapes and interpolations
if (is_quoted_symbol) {
// TODO naming
// TODO move all of this into a separate function like scan_quoted_content
int32_t end_delimiter = quoted_end_delimiter(token_type);
bool supports_interpol = quoted_is_interpol(token_type);
uint8_t delimiter_length = quoted_delimiter_length(token_type);
lexer->result_symbol = token_type;
for (bool has_content = false; true; has_content = true) {
lexer->mark_end(lexer);
if (lexer->lookahead == end_delimiter) {
uint8_t length = 1;
while (length < delimiter_length) {
advance(lexer);
if (lexer->lookahead == end_delimiter) {
length++;
} else {
break;
}
}
if (length == delimiter_length) {
return has_content;
}
} else {
switch (lexer->lookahead) {
case '#':
advance(lexer);
if (supports_interpol && lexer->lookahead == '{') {
return has_content;
}
break;
case '\\':
if (supports_interpol) {
return has_content;
} else {
advance(lexer);
if (lexer->lookahead == end_delimiter) {
return has_content;
}
}
break;
case '\0':
return false;
default:
advance(lexer);
}
}
}
return false;
}
if (lexer->lookahead == ':') {
if (valid_symbols[ATOM_START] || valid_symbols[KEYWORD_END]) {
advance(lexer);
if (is_whitespace(lexer->lookahead)) {
if (valid_symbols[KEYWORD_END]) {
lexer->result_symbol = KEYWORD_END;
return true;
}
} else {
if (valid_symbols[ATOM_START]) {
return finish_atom_start(lexer);
}
}
return false;
}
}
bool skipped_whitespace = false;
while (is_inline_whitespace(lexer->lookahead)) {
skipped_whitespace = true;
skip(lexer);
}
// TODO move this below, together with the other functions on this level
if (lexer->lookahead == '+') {
if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OP]) {
lexer->mark_end(lexer);
advance(lexer);
if (lexer->lookahead == '+' || lexer->lookahead == ':' || lexer->lookahead == '/') {
return false;
}
if (is_whitespace(lexer->lookahead)) {
return false;
}
lexer->result_symbol = BEFORE_UNARY_OP;
return true;
}
}
if (lexer->lookahead == '-') {
if (skipped_whitespace && valid_symbols[BEFORE_UNARY_OP]) {
lexer->mark_end(lexer);
advance(lexer);
if (lexer->lookahead == '-' || lexer->lookahead == '>' || lexer->lookahead == ':' || lexer->lookahead == '/') {
return false;
}
if (is_whitespace(lexer->lookahead)) {
return false;
}
lexer->result_symbol = BEFORE_UNARY_OP;
return true;
}
}
if (lexer->lookahead == 'n') {
lexer->result_symbol = NOT_IN;
advance(lexer);
if (lexer->lookahead == 'o') {
advance(lexer);
if (lexer->lookahead == 't') {
advance(lexer);
while (is_inline_whitespace(lexer->lookahead)) {
advance(lexer);
}
if (lexer->lookahead == 'i') {
advance(lexer);
if (lexer->lookahead == 'n') {
advance(lexer);
return is_token_end(lexer->lookahead);
}
}
}
}
return false;
}
// TODO can be a separate function
if (is_newline(lexer->lookahead) && (
valid_symbols[NEWLINE_BEFORE_DO] ||
valid_symbols[NEWLINE_BEFORE_BINARY_OP] ||
valid_symbols[NEWLINE_BEFORE_COMMENT])) {
advance(lexer);
while (is_whitespace(lexer->lookahead)) {
advance(lexer);
}
// Note we include all the whitespace after newline, so that the
// parser doesn't have to go through it again
lexer->mark_end(lexer);
if (lexer->lookahead == '#') {
lexer->result_symbol = NEWLINE_BEFORE_COMMENT;
return true;
}
if (valid_symbols[NEWLINE_BEFORE_DO] && lexer->lookahead == 'd') {
lexer->result_symbol = NEWLINE_BEFORE_DO;
advance(lexer);
if (lexer->lookahead == 'o') {
advance(lexer);
return is_token_end(lexer->lookahead);
}
return false;
}
if (valid_symbols[NEWLINE_BEFORE_BINARY_OP] ) {
lexer->result_symbol = NEWLINE_BEFORE_BINARY_OP;
// &&, &&&
if (lexer->lookahead == '&') {
advance(lexer);
if (lexer->lookahead == '&') {
advance(lexer);
if (lexer->lookahead == '&') {
advance(lexer);
return is_operator_end(lexer);
} else {
return is_operator_end(lexer);
}
}
// =, ==, ===, =~, =>
} else if (lexer->lookahead == '=') {
advance(lexer);
if (lexer->lookahead == '=') {
advance(lexer);
if (lexer->lookahead == '=') {
advance(lexer);
return is_operator_end(lexer);
} else {
return is_operator_end(lexer);
}
} else if (lexer->lookahead == '~') {
advance(lexer);
return is_operator_end(lexer);
} else if (lexer->lookahead == '>') {
advance(lexer);
return is_operator_end(lexer);
} else {
return is_operator_end(lexer);
}
// ::
} else if (lexer->lookahead == ':') {
advance(lexer);
if (lexer->lookahead == ':') {
advance(lexer);
// Ignore ::: atom
if (lexer->lookahead == ':') return false;
return is_operator_end(lexer);
}
// ++, +++
} else if (lexer->lookahead == '+') {
advance(lexer);
if (lexer->lookahead == '+') {
advance(lexer);
if (lexer->lookahead == '+') {
advance(lexer);
return is_operator_end(lexer);
} else {
return is_operator_end(lexer);
}
}
// --, ---, ->
} else if (lexer->lookahead == '-') {
advance(lexer);
if (lexer->lookahead == '-') {
advance(lexer);
if (lexer->lookahead == '-') {
advance(lexer);
return is_operator_end(lexer);
} else {
return is_operator_end(lexer);
}
} else if (lexer->lookahead == '>') {
advance(lexer);
return is_operator_end(lexer);
}
// <, <=, <-, <>, <~, <~>, <|>, <<<, <<~
} else if (lexer->lookahead == '<') {
advance(lexer);
if (lexer->lookahead == '=' ||
lexer->lookahead == '-' ||
lexer->lookahead == '>') {
advance(lexer);
return is_operator_end(lexer);
} else if (lexer->lookahead == '~') {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
return is_operator_end(lexer);
} else {
return is_operator_end(lexer);
}
} else if (lexer->lookahead == '|') {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
return