Allow special characters in bare word arguments
I've moved tokenization of bare words into the external scanner. This way we can keep the grammar simple while still supporting some finer-grained rules that I've inferred from experimenting with bash:

- Only allow '}' inside a bare word if '}' isn't a valid lookahead token (i.e. we're not inside a variable expansion).
- Only allow ']' at the *start* of a bare word if neither ']' nor ']]' is a valid lookahead token (i.e. we're not inside a square bracket command or an array subscript).
- Parentheses never seem to be allowed in bare words; you have to quote them.

For purely alphanumeric words, I fall through to the normal scanner so that it can continue to distinguish reserved words from other words.

Fixes #5
This commit is contained in:
parent
8a0a4a8501
commit
675a581839
|
@ -137,12 +137,12 @@ function do_yet_another_thing {
|
|||
|
||||
(program
|
||||
(function_definition
|
||||
(word)
|
||||
(variable_name)
|
||||
(compound_statement (command (command_name (word)) (word))))
|
||||
(function_definition
|
||||
(word)
|
||||
(variable_name)
|
||||
(compound_statement (command (command_name (word)) (word))))
|
||||
(function_definition
|
||||
(word)
|
||||
(variable_name)
|
||||
(compound_statement (command (command_name (word)) (word)))
|
||||
(file_redirect (file_descriptor) (word))))
|
||||
|
|
|
@ -11,6 +11,23 @@ echo a b
|
|||
(command (command_name (word)) (word))
|
||||
(command (command_name (word)) (word) (word)))
|
||||
|
||||
=============================
|
||||
Words with special characters
|
||||
=============================
|
||||
|
||||
echo }}}
|
||||
echo ]]] ===
|
||||
[[ "35d8b" =~ ^[0-9a-fA-F] ]] || echo {nomatch}
|
||||
|
||||
---
|
||||
|
||||
(program
|
||||
(command (command_name (word)) (word))
|
||||
(command (command_name (word)) (word) (word))
|
||||
(list
|
||||
(bracket_command (string) (word) (word))
|
||||
(command (command_name (word)) (word))))
|
||||
|
||||
=============================
|
||||
Simple variable expansions
|
||||
=============================
|
||||
|
@ -153,8 +170,8 @@ typeset -i -r var2=42 var3=10
|
|||
---
|
||||
|
||||
(program
|
||||
(declaration_command (simple_variable_name))
|
||||
(declaration_command (argument) (argument)
|
||||
(declaration_command (variable_name))
|
||||
(declaration_command (word) (word)
|
||||
(variable_assignment (variable_name) (word))
|
||||
(variable_assignment (variable_name) (word))))
|
||||
|
||||
|
@ -168,7 +185,7 @@ readonly var2=42
|
|||
---
|
||||
|
||||
(program
|
||||
(declaration_command (simple_variable_name))
|
||||
(declaration_command (variable_name))
|
||||
(declaration_command (variable_assignment (variable_name) (word))))
|
||||
|
||||
=========================================
|
||||
|
@ -183,8 +200,10 @@ local -r c
|
|||
(program
|
||||
(declaration_command
|
||||
(variable_assignment (variable_name) (word))
|
||||
(simple_variable_name))
|
||||
(declaration_command (argument) (simple_variable_name)))
|
||||
(variable_name))
|
||||
(declaration_command
|
||||
(word)
|
||||
(variable_name)))
|
||||
|
||||
=========================================
|
||||
Variable declaration: export
|
||||
|
@ -196,9 +215,9 @@ export FOOBAR PATH="$PATH:/usr/foobar/bin"
|
|||
---
|
||||
|
||||
(program
|
||||
(declaration_command (simple_variable_name))
|
||||
(declaration_command (variable_name))
|
||||
(declaration_command
|
||||
(simple_variable_name)
|
||||
(variable_name)
|
||||
(variable_assignment (variable_name) (string (simple_expansion (variable_name))))))
|
||||
|
||||
=========================================
|
||||
|
|
48
grammar.js
48
grammar.js
|
@ -1,14 +1,3 @@
|
|||
const SPECIAL_CHARACTERS = [
|
||||
"'", '"',
|
||||
'<', '>',
|
||||
'{', '}',
|
||||
'(', ')',
|
||||
'`', '$',
|
||||
'&', ';',
|
||||
'\\',
|
||||
'\\s',
|
||||
];
|
||||
|
||||
module.exports = grammar({
|
||||
name: 'bash',
|
||||
|
||||
|
@ -18,6 +7,8 @@ module.exports = grammar({
|
|||
$._expression,
|
||||
$._primary_expression,
|
||||
$._variable_name,
|
||||
$._simple_variable_name,
|
||||
$._simple_word,
|
||||
],
|
||||
|
||||
externals: $ => [
|
||||
|
@ -26,10 +17,14 @@ module.exports = grammar({
|
|||
$._heredoc_middle,
|
||||
$._heredoc_end,
|
||||
$.file_descriptor,
|
||||
$.word,
|
||||
$._empty_value,
|
||||
$._concat,
|
||||
$.variable_name, // Variable name followed by an operator like '=' or '+='
|
||||
'\n',
|
||||
']',
|
||||
']]',
|
||||
'}',
|
||||
],
|
||||
|
||||
extras: $ => [
|
||||
|
@ -64,7 +59,7 @@ module.exports = grammar({
|
|||
|
||||
for_statement: $ => seq(
|
||||
'for',
|
||||
$._variable_name,
|
||||
$._simple_variable_name,
|
||||
'in',
|
||||
repeat1($._expression),
|
||||
$._terminator,
|
||||
|
@ -125,8 +120,8 @@ module.exports = grammar({
|
|||
|
||||
function_definition: $ => seq(
|
||||
choice(
|
||||
seq('function', $.word, optional(seq('(', ')'))),
|
||||
seq($.word, '(', ')')
|
||||
seq('function', $._simple_variable_name, optional(seq('(', ')'))),
|
||||
seq($._simple_variable_name, '(', ')')
|
||||
),
|
||||
$.compound_statement,
|
||||
optional($.file_redirect)
|
||||
|
@ -190,9 +185,9 @@ module.exports = grammar({
|
|||
|
||||
declaration_command: $ => seq(
|
||||
choice('declare', 'typeset', 'export', 'readonly', 'local'),
|
||||
repeat(alias(seq('-', $.word), 'argument')),
|
||||
repeat(choice(
|
||||
$.simple_variable_name,
|
||||
$.word,
|
||||
$._simple_variable_name,
|
||||
$.variable_assignment
|
||||
))
|
||||
),
|
||||
|
@ -249,6 +244,7 @@ module.exports = grammar({
|
|||
|
||||
_primary_expression: $ => choice(
|
||||
$.word,
|
||||
$._simple_word,
|
||||
$.string,
|
||||
$.raw_string,
|
||||
$.expansion,
|
||||
|
@ -298,14 +294,14 @@ module.exports = grammar({
|
|||
seq(
|
||||
$._variable_name,
|
||||
choice(':', ':?', '=', ':-', '%', '/'),
|
||||
optional($._expression)
|
||||
optional(seq($._expression, optional($._concat)))
|
||||
)
|
||||
),
|
||||
'}'
|
||||
),
|
||||
|
||||
_variable_name: $ => choice(
|
||||
alias($.simple_variable_name, $.variable_name),
|
||||
$._simple_variable_name,
|
||||
$.special_variable_name
|
||||
),
|
||||
|
||||
|
@ -320,22 +316,16 @@ module.exports = grammar({
|
|||
')'
|
||||
),
|
||||
|
||||
word: $ => token(repeat1(choice(
|
||||
noneOf('#', ...SPECIAL_CHARACTERS),
|
||||
seq('\\', noneOf('\\s'))
|
||||
))),
|
||||
|
||||
comment: $ => token(prec(-1, /#.*/)),
|
||||
|
||||
simple_variable_name: $ => /\w+/,
|
||||
_simple_variable_name: $ => alias($.identifier, $.variable_name),
|
||||
|
||||
_simple_word: $ => alias($.identifier, $.word),
|
||||
|
||||
identifier: $ => /\w+/,
|
||||
|
||||
special_variable_name: $ => choice('*', '@', '#', '?', '-', '$', '!', '0', '_'),
|
||||
|
||||
_terminator: $ => choice(';', ';;', '\n', '&')
|
||||
}
|
||||
});
|
||||
|
||||
function noneOf(...characters) {
|
||||
const negatedString = characters.map(c => c == '\\' ? '\\\\' : c).join('')
|
||||
return new RegExp('[^' + negatedString + ']')
|
||||
}
|
||||
|
|
|
@ -83,7 +83,7 @@
|
|||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "_variable_name"
|
||||
"name": "_simple_variable_name"
|
||||
},
|
||||
{
|
||||
"type": "STRING",
|
||||
|
@ -329,7 +329,7 @@
|
|||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "word"
|
||||
"name": "_simple_variable_name"
|
||||
},
|
||||
{
|
||||
"type": "CHOICE",
|
||||
|
@ -359,7 +359,7 @@
|
|||
"members": [
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "word"
|
||||
"name": "_simple_variable_name"
|
||||
},
|
||||
{
|
||||
"type": "STRING",
|
||||
|
@ -658,27 +658,6 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "REPEAT",
|
||||
"content": {
|
||||
"type": "ALIAS",
|
||||
"content": {
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{
|
||||
"type": "STRING",
|
||||
"value": "-"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "word"
|
||||
}
|
||||
]
|
||||
},
|
||||
"named": false,
|
||||
"value": "argument"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "REPEAT",
|
||||
"content": {
|
||||
|
@ -686,7 +665,11 @@
|
|||
"members": [
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "simple_variable_name"
|
||||
"name": "word"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "_simple_variable_name"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
|
@ -895,6 +878,10 @@
|
|||
"type": "SYMBOL",
|
||||
"name": "word"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "_simple_word"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "string"
|
||||
|
@ -1141,8 +1128,25 @@
|
|||
"type": "CHOICE",
|
||||
"members": [
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "_expression"
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "_expression"
|
||||
},
|
||||
{
|
||||
"type": "CHOICE",
|
||||
"members": [
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "_concat"
|
||||
},
|
||||
{
|
||||
"type": "BLANK"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "BLANK"
|
||||
|
@ -1163,13 +1167,8 @@
|
|||
"type": "CHOICE",
|
||||
"members": [
|
||||
{
|
||||
"type": "ALIAS",
|
||||
"content": {
|
||||
"type": "SYMBOL",
|
||||
"name": "simple_variable_name"
|
||||
},
|
||||
"named": true,
|
||||
"value": "variable_name"
|
||||
"type": "SYMBOL",
|
||||
"name": "_simple_variable_name"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
|
@ -1246,34 +1245,6 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
"word": {
|
||||
"type": "TOKEN",
|
||||
"content": {
|
||||
"type": "REPEAT1",
|
||||
"content": {
|
||||
"type": "CHOICE",
|
||||
"members": [
|
||||
{
|
||||
"type": "PATTERN",
|
||||
"value": "[^#'\"<>{}()`$&;\\\\\\s]"
|
||||
},
|
||||
{
|
||||
"type": "SEQ",
|
||||
"members": [
|
||||
{
|
||||
"type": "STRING",
|
||||
"value": "\\"
|
||||
},
|
||||
{
|
||||
"type": "PATTERN",
|
||||
"value": "[^\\s]"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"comment": {
|
||||
"type": "TOKEN",
|
||||
"content": {
|
||||
|
@ -1285,7 +1256,25 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"simple_variable_name": {
|
||||
"_simple_variable_name": {
|
||||
"type": "ALIAS",
|
||||
"content": {
|
||||
"type": "SYMBOL",
|
||||
"name": "identifier"
|
||||
},
|
||||
"named": true,
|
||||
"value": "variable_name"
|
||||
},
|
||||
"_simple_word": {
|
||||
"type": "ALIAS",
|
||||
"content": {
|
||||
"type": "SYMBOL",
|
||||
"name": "identifier"
|
||||
},
|
||||
"named": true,
|
||||
"value": "word"
|
||||
},
|
||||
"identifier": {
|
||||
"type": "PATTERN",
|
||||
"value": "\\w+"
|
||||
},
|
||||
|
@ -1396,6 +1385,10 @@
|
|||
"type": "SYMBOL",
|
||||
"name": "file_descriptor"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "word"
|
||||
},
|
||||
{
|
||||
"type": "SYMBOL",
|
||||
"name": "_empty_value"
|
||||
|
@ -1411,6 +1404,18 @@
|
|||
{
|
||||
"type": "STRING",
|
||||
"value": "\n"
|
||||
},
|
||||
{
|
||||
"type": "STRING",
|
||||
"value": "]"
|
||||
},
|
||||
{
|
||||
"type": "STRING",
|
||||
"value": "]]"
|
||||
},
|
||||
{
|
||||
"type": "STRING",
|
||||
"value": "}"
|
||||
}
|
||||
],
|
||||
"inline": [
|
||||
|
@ -1418,6 +1423,8 @@
|
|||
"_terminator",
|
||||
"_expression",
|
||||
"_primary_expression",
|
||||
"_variable_name"
|
||||
"_variable_name",
|
||||
"_simple_variable_name",
|
||||
"_simple_word"
|
||||
]
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -12,10 +12,14 @@ enum TokenType {
|
|||
HEREDOC_MIDDLE,
|
||||
HEREDOC_END,
|
||||
FILE_DESCRIPTOR,
|
||||
WORD,
|
||||
EMPTY_VALUE,
|
||||
CONCAT,
|
||||
VARIABLE_NAME,
|
||||
NEWLINE,
|
||||
CLOSING_BRACKET,
|
||||
CLOSING_DOUBLE_BRACKET,
|
||||
CLOSING_BRACE,
|
||||
};
|
||||
|
||||
struct Scanner {
|
||||
|
@ -90,6 +94,7 @@ struct Scanner {
|
|||
lexer->lookahead == ')' ||
|
||||
lexer->lookahead == '(' ||
|
||||
lexer->lookahead == '[' ||
|
||||
lexer->lookahead == '|' ||
|
||||
lexer->lookahead == ']' ||
|
||||
lexer->lookahead == '}' ||
|
||||
lexer->lookahead == ';' ||
|
||||
|
@ -130,7 +135,9 @@ struct Scanner {
|
|||
return scan_heredoc_content(lexer, HEREDOC_BEGINNING, SIMPLE_HEREDOC);
|
||||
}
|
||||
|
||||
if (valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR]) {
|
||||
if (valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[WORD]) {
|
||||
unsigned length = 0;
|
||||
|
||||
for (;;) {
|
||||
if (
|
||||
lexer->lookahead == ' ' ||
|
||||
|
@ -139,11 +146,12 @@ struct Scanner {
|
|||
) {
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == '\\') {
|
||||
skip(lexer);
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '\n') {
|
||||
skip(lexer);
|
||||
} else {
|
||||
return false;
|
||||
length++;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
|
@ -151,26 +159,41 @@ struct Scanner {
|
|||
}
|
||||
|
||||
bool is_number = true;
|
||||
if (iswdigit(lexer->lookahead)) {
|
||||
advance(lexer);
|
||||
} else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
|
||||
is_number = false;
|
||||
advance(lexer);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_alphanumeric = true;
|
||||
for (;;) {
|
||||
if (iswdigit(lexer->lookahead)) {
|
||||
advance(lexer);
|
||||
} else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
|
||||
is_number = false;
|
||||
advance(lexer);
|
||||
} else if (
|
||||
!iswspace(lexer->lookahead) &&
|
||||
lexer->lookahead != 0 &&
|
||||
lexer->lookahead != '"' &&
|
||||
lexer->lookahead != '\'' &&
|
||||
lexer->lookahead != '`' &&
|
||||
lexer->lookahead != '>' &&
|
||||
lexer->lookahead != '<' &&
|
||||
lexer->lookahead != '#' &&
|
||||
lexer->lookahead != '|' &&
|
||||
lexer->lookahead != '(' &&
|
||||
lexer->lookahead != ')' &&
|
||||
lexer->lookahead != ';' &&
|
||||
lexer->lookahead != '&' &&
|
||||
lexer->lookahead != '$'
|
||||
) {
|
||||
if (lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) break;
|
||||
if (lexer->lookahead == ']' && length == 0 && (valid_symbols[CLOSING_BRACKET] || valid_symbols[CLOSING_DOUBLE_BRACKET])) break;
|
||||
if (is_alphanumeric && valid_symbols[VARIABLE_NAME] && (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == '+')) break;
|
||||
is_alphanumeric = false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
advance(lexer);
|
||||
length++;
|
||||
}
|
||||
|
||||
if (length == 0) return false;
|
||||
|
||||
if (is_number &&
|
||||
valid_symbols[FILE_DESCRIPTOR] &&
|
||||
(lexer->lookahead == '>' || lexer->lookahead == '<')) {
|
||||
|
@ -194,7 +217,10 @@ struct Scanner {
|
|||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
if (valid_symbols[WORD] && !is_alphanumeric) {
|
||||
lexer->result_symbol = WORD;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
|
|
Loading…
Reference in New Issue