Always lex braces and brackets as separate tokens

The lexer needs to always return braces and brackets separately so that
the parser can decide if they are part of some construct like an array
subscript or a variable expansion. This means that there was no point
in moving bare-word tokenization into the external scanner. I've moved
it back into the normal scanner.

The tricky part is how to deal with the separate '}' and ']' tokens
in the case where they are *not* part of a subscript or an expansion.
For example, in code like `echo {hi}`, the syntax tree should still
clearly indicate that only *one* argument is being passed to echo.
For now, we achieve this by grouping the '{', 'hi', and '}' tokens into
a single `concatenation` node, which is a bit odd, but it's the best
we can do.
This commit is contained in:
Max Brunsfeld 2018-02-28 11:13:49 -08:00
parent c34619a1c4
commit 6f81608535
6 changed files with 55311 additions and 34710 deletions

View File

@ -139,12 +139,12 @@ function do_yet_another_thing {
(program
(function_definition
(variable_name)
(word)
(compound_statement (command (command_name (word)) (word))))
(function_definition
(variable_name)
(word)
(compound_statement (command (command_name (word)) (word))))
(function_definition
(variable_name)
(word)
(compound_statement (command (command_name (word)) (word)))
(file_redirect (file_descriptor) (word))))

View File

@ -15,6 +15,7 @@ echo a b
Words with special characters
=============================
echo {o[k]}
echo }}}
echo ]]] ===
[[ "35d8b" =~ ^[0-9a-fA-F] ]] || echo {nomatch}
@ -22,11 +23,19 @@ echo ]]] ===
---
(program
(command (command_name (word)) (concatenation (word) (word)))
(command (command_name (word)) (word))
(command (command_name (word)) (word) (word))
(list
(command (command_name (word)) (string) (word) (word) (word))
(command (command_name (word)) (word))))
(command
(command_name (word))
(string)
(word)
(concatenation (word) (word))
(word))
(command
(command_name (word))
(concatenation (word)))))
=============================
Simple variable expansions
@ -57,6 +66,32 @@ echo ${abc:
(command (command_name (word)) (expansion (variable_name)))
(command (command_name (word)) (expansion (variable_name))))
===================================
Variable expansions in strings
===================================
A="${A:-$B/c}"
A="${b/$c/$d}"
---
(program
(variable_assignment
(variable_name)
(string
(expansion
(variable_name)
(concatenation (simple_expansion (variable_name)) (word)))))
(variable_assignment
(variable_name)
(string
(expansion
(variable_name)
(concatenation
(simple_expansion (variable_name))
(word)
(simple_expansion (variable_name)))))))
===================================
Other variable expansion operators
===================================
@ -248,8 +283,8 @@ a+=(foo "bar" $(baz))
(program
(variable_assignment (variable_name) (array))
(variable_assignment (variable_name) (array (word) (word) (word)))
(command (command_name (word)) (expansion (variable_name)))
(command (command_name (word)) (expansion (variable_name)))
(command (command_name (word)) (expansion (subscript (variable_name) (word))))
(command (command_name (word)) (expansion (subscript (variable_name) (word))))
(variable_assignment
(subscript (variable_name) (simple_expansion (variable_name)))
(word))

View File

@ -1,3 +1,16 @@
// Characters that may not appear unescaped inside a bare `word` token.
// The `word` rule spreads this list into `noneOf(...)`, which splices each
// entry into a negated regex character class. That is why the square
// brackets are stored pre-escaped and whitespace is stored as the `\s` class.
const SPECIAL_CHARACTERS = [
  // Quote characters.
  "'",
  '"',
  // Angle brackets.
  '<',
  '>',
  // Braces, then square brackets (already escaped for a character class).
  '{',
  '}',
  '\\[',
  '\\]',
  // Parentheses.
  '(',
  ')',
  // Backtick and dollar sign.
  '`',
  '$',
  // Ampersand and semicolon.
  '&',
  ';',
  // A lone backslash; `noneOf` doubles it when building the class.
  '\\',
  // Any whitespace, expressed as a regex class.
  '\\s',
  // Hash sign.
  '#',
];
module.exports = grammar({
name: 'bash',
@ -6,9 +19,8 @@ module.exports = grammar({
$._terminator,
$._expression,
$._primary_expression,
$._variable_name,
$._simple_variable_name,
$._simple_word,
$._special_variable_name,
],
externals: $ => [
@ -17,13 +29,12 @@ module.exports = grammar({
$._heredoc_middle,
$._heredoc_end,
$.file_descriptor,
$.word,
$._empty_value,
$._concat,
$.variable_name, // Variable name followed by an operator like '=' or '+='
'\n',
']',
'}',
']',
'\n',
],
extras: $ => [
@ -118,8 +129,8 @@ module.exports = grammar({
function_definition: $ => seq(
choice(
seq('function', $._simple_variable_name, optional(seq('(', ')'))),
seq($._simple_variable_name, '(', ')')
seq('function', $.word, optional(seq('(', ')'))),
seq($.word, '(', ')')
),
$.compound_statement,
optional($.file_redirect)
@ -162,7 +173,8 @@ module.exports = grammar({
repeat($._expression),
repeat(choice(
$.file_redirect,
$.heredoc_redirect
$.heredoc_redirect,
$.herestring_redirect
))
)),
@ -201,7 +213,9 @@ module.exports = grammar({
$.variable_name,
'[',
$._expression,
']'
optional($._concat),
']',
optional($._concat)
),
file_redirect: $ => prec.left(seq(
@ -228,16 +242,21 @@ module.exports = grammar({
)
),
herestring_redirect: $ => seq(
'<<<',
$._expression
),
// Expressions
_expression: $ => choice(
$.concatenation,
$._primary_expression
$._primary_expression,
alias(prec(-2, $._special_characters), $.word)
),
_primary_expression: $ => choice(
$.word,
$._simple_word,
$.string,
$.raw_string,
$.expansion,
@ -247,22 +266,36 @@ module.exports = grammar({
),
concatenation: $ => prec(-1, seq(
choice(
$._primary_expression,
repeat1(seq($._concat, $._primary_expression))
$._special_characters,
),
repeat1(prec(-1, seq(
$._concat,
choice(
$._primary_expression,
$._special_characters,
)
))),
)),
_special_characters: $ => token(prec(-1, repeat1(choice('{', '}', '[', ']')))),
string: $ => seq(
'"',
repeat(choice(
repeat(seq(
choice(
$._string_content,
$.expansion,
$.simple_expansion,
$.command_substitution
),
optional($._concat)
)),
'"'
),
_string_content: $ => /([^"`$]|\\.)*/,
_string_content: $ => /([^"`$]|\\.)+/,
array: $ => seq(
'(',
@ -274,30 +307,37 @@ module.exports = grammar({
simple_expansion: $ => seq(
'$',
$._variable_name
choice($._simple_variable_name, $._special_variable_name)
),
expansion: $ => seq(
'${',
optional('#'),
choice(
$._variable_name,
seq('#', $._variable_name),
seq('#', $._variable_name, '[', '@', ']'),
seq($._variable_name, '[', '@', ']'),
seq(
$._variable_name,
choice(':', ':?', '=', ':-', '%', '/'),
optional(seq($._expression, optional($._concat)))
)
$.variable_name,
'=',
optional(seq(
$._expression
))
),
seq(
choice(
$.subscript,
$._simple_variable_name,
$._special_variable_name
),
optional(seq(
choice(':', ':?', '=', ':-', '%', '/', '-'),
optional(seq(
$._expression
))
))
),
),
'}'
),
_variable_name: $ => choice(
$._simple_variable_name,
$.special_variable_name
),
command_substitution: $ => choice(
seq('$(', $._statement, ')'),
prec(1, seq('`', $._statement, '`'))
@ -311,14 +351,20 @@ module.exports = grammar({
comment: $ => token(prec(-1, /#.*/)),
_simple_variable_name: $ => alias($.identifier, $.variable_name),
_simple_variable_name: $ => alias(/\w+/, $.variable_name),
_simple_word: $ => alias($.identifier, $.word),
_special_variable_name: $ => alias(choice('*', '@', '?', '-', '$', '0', '_'), $.special_variable_name),
identifier: $ => /\w+/,
special_variable_name: $ => choice('*', '@', '#', '?', '-', '$', '!', '0', '_'),
word: $ => token(repeat1(choice(
noneOf(...SPECIAL_CHARACTERS),
seq('\\', noneOf('\\s'))
))),
_terminator: $ => choice(';', ';;', '\n', '&')
}
});
/**
 * Builds a regular expression that matches any single character which is
 * NOT one of the given characters.
 *
 * Every argument is inserted verbatim into a negated character class, so
 * callers may pass regex fragments (an escaped bracket, the whitespace
 * class, ...) as well as plain characters. The only rewritten input is a
 * lone backslash: it is doubled so that it stands for a literal backslash
 * instead of escaping whatever fragment follows it in the class.
 *
 * @param {...string} characters - Characters or character-class fragments to exclude.
 * @returns {RegExp} A pattern of the form `[^...]`.
 */
function noneOf(...characters) {
  // Strict comparison: only an actual one-character backslash string is
  // doubled; no other value is coerced into matching it.
  const classBody = characters
    .map((c) => (c === '\\' ? '\\\\' : c))
    .join('');
  return new RegExp(`[^${classBody}]`);
}

324
src/grammar.json vendored
View File

@ -325,7 +325,7 @@
},
{
"type": "SYMBOL",
"name": "_simple_variable_name"
"name": "word"
},
{
"type": "CHOICE",
@ -355,7 +355,7 @@
"members": [
{
"type": "SYMBOL",
"name": "_simple_variable_name"
"name": "word"
},
{
"type": "STRING",
@ -548,6 +548,10 @@
{
"type": "SYMBOL",
"name": "heredoc_redirect"
},
{
"type": "SYMBOL",
"name": "herestring_redirect"
}
]
}
@ -681,9 +685,33 @@
"type": "SYMBOL",
"name": "_expression"
},
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_concat"
},
{
"type": "BLANK"
}
]
},
{
"type": "STRING",
"value": "]"
},
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_concat"
},
{
"type": "BLANK"
}
]
}
]
},
@ -809,6 +837,19 @@
}
]
},
"herestring_redirect": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "<<<"
},
{
"type": "SYMBOL",
"name": "_expression"
}
]
},
"_expression": {
"type": "CHOICE",
"members": [
@ -819,6 +860,19 @@
{
"type": "SYMBOL",
"name": "_primary_expression"
},
{
"type": "ALIAS",
"content": {
"type": "PREC",
"value": -2,
"content": {
"type": "SYMBOL",
"name": "_special_characters"
}
},
"named": true,
"value": "word"
}
]
},
@ -829,10 +883,6 @@
"type": "SYMBOL",
"name": "word"
},
{
"type": "SYMBOL",
"name": "_simple_word"
},
{
"type": "SYMBOL",
"name": "string"
@ -864,13 +914,25 @@
"value": -1,
"content": {
"type": "SEQ",
"members": [
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_primary_expression"
},
{
"type": "SYMBOL",
"name": "_special_characters"
}
]
},
{
"type": "REPEAT1",
"content": {
"type": "PREC",
"value": -1,
"content": {
"type": "SEQ",
"members": [
@ -878,16 +940,57 @@
"type": "SYMBOL",
"name": "_concat"
},
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_primary_expression"
},
{
"type": "SYMBOL",
"name": "_special_characters"
}
]
}
]
}
}
}
]
}
},
"_special_characters": {
"type": "TOKEN",
"content": {
"type": "PREC",
"value": -1,
"content": {
"type": "REPEAT1",
"content": {
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": "{"
},
{
"type": "STRING",
"value": "}"
},
{
"type": "STRING",
"value": "["
},
{
"type": "STRING",
"value": "]"
}
]
}
}
}
},
"string": {
"type": "SEQ",
"members": [
@ -898,6 +1001,9 @@
{
"type": "REPEAT",
"content": {
"type": "SEQ",
"members": [
{
"type": "CHOICE",
"members": [
{
@ -917,6 +1023,20 @@
"name": "command_substitution"
}
]
},
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_concat"
},
{
"type": "BLANK"
}
]
}
]
}
},
{
@ -927,7 +1047,7 @@
},
"_string_content": {
"type": "PATTERN",
"value": "([^\"`$]|\\\\.)*"
"value": "([^\"`$]|\\\\.)+"
},
"array": {
"type": "SEQ",
@ -960,9 +1080,18 @@
"type": "STRING",
"value": "$"
},
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_variable_name"
"name": "_simple_variable_name"
},
{
"type": "SYMBOL",
"name": "_special_variable_name"
}
]
}
]
},
@ -975,21 +1104,46 @@
},
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_variable_name"
},
{
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "#"
},
{
"type": "BLANK"
}
]
},
{
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_variable_name"
"name": "variable_name"
},
{
"type": "STRING",
"value": "="
},
{
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_expression"
}
]
},
{
"type": "BLANK"
}
]
}
]
},
@ -997,55 +1151,28 @@
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "#"
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "subscript"
},
{
"type": "SYMBOL",
"name": "_variable_name"
"name": "_simple_variable_name"
},
{
"type": "STRING",
"value": "["
},
{
"type": "STRING",
"value": "@"
},
{
"type": "STRING",
"value": "]"
"type": "SYMBOL",
"name": "_special_variable_name"
}
]
},
{
"type": "SEQ",
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_variable_name"
},
{
"type": "STRING",
"value": "["
},
{
"type": "STRING",
"value": "@"
},
{
"type": "STRING",
"value": "]"
}
]
},
{
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_variable_name"
},
{
"type": "CHOICE",
"members": [
@ -1072,6 +1199,10 @@
{
"type": "STRING",
"value": "/"
},
{
"type": "STRING",
"value": "-"
}
]
},
@ -1084,13 +1215,8 @@
{
"type": "SYMBOL",
"name": "_expression"
},
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_concat"
}
]
},
{
"type": "BLANK"
@ -1114,19 +1240,6 @@
}
]
},
"_variable_name": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_simple_variable_name"
},
{
"type": "SYMBOL",
"name": "special_variable_name"
}
]
},
"command_substitution": {
"type": "CHOICE",
"members": [
@ -1210,26 +1323,15 @@
"_simple_variable_name": {
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "identifier"
"type": "PATTERN",
"value": "\\w+"
},
"named": true,
"value": "variable_name"
},
"_simple_word": {
"_special_variable_name": {
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "identifier"
},
"named": true,
"value": "word"
},
"identifier": {
"type": "PATTERN",
"value": "\\w+"
},
"special_variable_name": {
"type": "CHOICE",
"members": [
{
@ -1240,10 +1342,6 @@
"type": "STRING",
"value": "@"
},
{
"type": "STRING",
"value": "#"
},
{
"type": "STRING",
"value": "?"
@ -1256,10 +1354,6 @@
"type": "STRING",
"value": "$"
},
{
"type": "STRING",
"value": "!"
},
{
"type": "STRING",
"value": "0"
@ -1270,6 +1364,37 @@
}
]
},
"named": true,
"value": "special_variable_name"
},
"word": {
"type": "TOKEN",
"content": {
"type": "REPEAT1",
"content": {
"type": "CHOICE",
"members": [
{
"type": "PATTERN",
"value": "[^'\"<>{}\\[\\]()`$&;\\\\\\s#]"
},
{
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "\\"
},
{
"type": "PATTERN",
"value": "[^\\s]"
}
]
}
]
}
}
},
"_terminator": {
"type": "CHOICE",
"members": [
@ -1336,10 +1461,6 @@
"type": "SYMBOL",
"name": "file_descriptor"
},
{
"type": "SYMBOL",
"name": "word"
},
{
"type": "SYMBOL",
"name": "_empty_value"
@ -1354,7 +1475,7 @@
},
{
"type": "STRING",
"value": "\n"
"value": "}"
},
{
"type": "STRING",
@ -1362,7 +1483,7 @@
},
{
"type": "STRING",
"value": "}"
"value": "\n"
}
],
"inline": [
@ -1370,8 +1491,7 @@
"_terminator",
"_expression",
"_primary_expression",
"_variable_name",
"_simple_variable_name",
"_simple_word"
"_special_variable_name"
]
}

89210
src/parser.c vendored

File diff suppressed because it is too large Load Diff

104
src/scanner.cc vendored
View File

@ -12,13 +12,12 @@ enum TokenType {
HEREDOC_MIDDLE,
HEREDOC_END,
FILE_DESCRIPTOR,
WORD,
EMPTY_VALUE,
CONCAT,
VARIABLE_NAME,
NEWLINE,
CLOSING_BRACKET,
CLOSING_BRACE,
CLOSING_BRACKET,
NEWLINE,
};
struct Scanner {
@ -92,13 +91,12 @@ struct Scanner {
lexer->lookahead == '<' ||
lexer->lookahead == ')' ||
lexer->lookahead == '(' ||
lexer->lookahead == '[' ||
lexer->lookahead == '|' ||
lexer->lookahead == ']' ||
lexer->lookahead == '}' ||
lexer->lookahead == ';' ||
lexer->lookahead == '&' ||
lexer->lookahead == '`'
lexer->lookahead == '`' ||
lexer->lookahead == 0 ||
(lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) ||
(lexer->lookahead == ']' && valid_symbols[CLOSING_BRACKET])
)) {
lexer->result_symbol = CONCAT;
return true;
@ -134,9 +132,7 @@ struct Scanner {
return scan_heredoc_content(lexer, HEREDOC_BEGINNING, SIMPLE_HEREDOC);
}
if (valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[WORD]) {
unsigned length = 0;
if (valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR]) {
for (;;) {
if (
lexer->lookahead == ' ' ||
@ -145,70 +141,46 @@ struct Scanner {
) {
skip(lexer);
} else if (lexer->lookahead == '\\') {
advance(lexer);
skip(lexer);
if (lexer->lookahead == '\n') {
skip(lexer);
} else {
length++;
break;
return false;
}
} else {
break;
}
}
bool is_numeric = iswdigit(lexer->lookahead);
bool is_alphanumeric = iswalpha(lexer->lookahead);
bool is_number = true;
if (iswdigit(lexer->lookahead)) {
advance(lexer);
} else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
is_number = false;
advance(lexer);
} else {
return false;
}
for (;;) {
// These characters are not allowed in unquoted arguments
// or environment variable names
if (
lexer->lookahead == 0 ||
lexer->lookahead == ';' ||
lexer->lookahead == '"' ||
lexer->lookahead == '(' ||
lexer->lookahead == ')' ||
lexer->lookahead == '\'' ||
lexer->lookahead == '&' ||
lexer->lookahead == '#' ||
lexer->lookahead == '`' ||
lexer->lookahead == '|' ||
lexer->lookahead == '$' ||
iswspace(lexer->lookahead)
) break;
if (iswdigit(lexer->lookahead)) {
advance(lexer);
} else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
is_number = false;
advance(lexer);
} else {
break;
}
}
// Curly braces are not allowed in unquoted arguments within curly braces
// (e.g. inside of a variable expansion like `${key:arg}`).
if (
lexer->lookahead == '}' &&
valid_symbols[CLOSING_BRACE]
) break;
// Square brackets are not allowed in unquoted arguments within square brackets
// (e.g. inside of an array subscript like `a[arg]`).
if (
lexer->lookahead == ']' &&
valid_symbols[CLOSING_BRACKET]
) break;
// Numbers followed by '<' and '>' at the beginning of commands
// are parsed as file descriptors.
if (lexer->lookahead == '<' || lexer->lookahead == '>') {
if (is_numeric && valid_symbols[FILE_DESCRIPTOR]) {
if (is_number &&
valid_symbols[FILE_DESCRIPTOR] &&
(lexer->lookahead == '>' || lexer->lookahead == '<')) {
lexer->result_symbol = FILE_DESCRIPTOR;
return true;
}
break;
}
if (!iswdigit(lexer->lookahead)) is_numeric = false;
if (!iswalnum(lexer->lookahead) && lexer->lookahead != '_') {
// Alphanumeric strings followed by '=', '[', or '+=' are treated
// as environment variable names.
if (is_alphanumeric && valid_symbols[VARIABLE_NAME] && length > 0) {
if (valid_symbols[VARIABLE_NAME]) {
if (lexer->lookahead == '+') {
lexer->mark_end(lexer);
advance(lexer);
@ -224,19 +196,7 @@ struct Scanner {
}
}
is_alphanumeric = false;
}
advance(lexer);
length++;
}
// Do not handle strings containing only letters, because those
// might be keywords. Let the normal lexer handle those.
if (length > 0 && valid_symbols[WORD] && !is_alphanumeric) {
lexer->result_symbol = WORD;
return true;
}
return false;
}
return false;