Allow special characters in bare word arguments

I've moved tokenization of bare words into the external scanner. This
way we can keep the grammar simple, but support some fancy rules that
I've inferred from experimenting with bash:

- Only allow '}' inside of a bare word if '}' isn't a valid lookahead
token (i.e. we're not inside of a variable expansion).
- Only allow ']' at the *start* of a bare word if neither ']' nor ']]'
is a valid lookahead token (i.e. we're not inside of a square bracket
command or an array subscript).
- Parentheses seem to never be allowed in bare words. You have to quote
them.

For purely alphanumeric words, the external scanner declines to produce a
token and I fall through to the normal (internal) lexer, so that it can
continue to distinguish reserved words from other words.

Fixes #5
This commit is contained in:
Max Brunsfeld 2018-02-27 10:54:40 -08:00
parent 8a0a4a8501
commit 675a581839
6 changed files with 32596 additions and 31037 deletions

View File

@ -137,12 +137,12 @@ function do_yet_another_thing {
(program (program
(function_definition (function_definition
(word) (variable_name)
(compound_statement (command (command_name (word)) (word)))) (compound_statement (command (command_name (word)) (word))))
(function_definition (function_definition
(word) (variable_name)
(compound_statement (command (command_name (word)) (word)))) (compound_statement (command (command_name (word)) (word))))
(function_definition (function_definition
(word) (variable_name)
(compound_statement (command (command_name (word)) (word))) (compound_statement (command (command_name (word)) (word)))
(file_redirect (file_descriptor) (word)))) (file_redirect (file_descriptor) (word))))

View File

@ -11,6 +11,23 @@ echo a b
(command (command_name (word)) (word)) (command (command_name (word)) (word))
(command (command_name (word)) (word) (word))) (command (command_name (word)) (word) (word)))
=============================
Words with special characters
=============================
echo }}}
echo ]]] ===
[[ "35d8b" =~ ^[0-9a-fA-F] ]] || echo {nomatch}
---
(program
(command (command_name (word)) (word))
(command (command_name (word)) (word) (word))
(list
(bracket_command (string) (word) (word))
(command (command_name (word)) (word))))
============================= =============================
Simple variable expansions Simple variable expansions
============================= =============================
@ -153,8 +170,8 @@ typeset -i -r var2=42 var3=10
--- ---
(program (program
(declaration_command (simple_variable_name)) (declaration_command (variable_name))
(declaration_command (argument) (argument) (declaration_command (word) (word)
(variable_assignment (variable_name) (word)) (variable_assignment (variable_name) (word))
(variable_assignment (variable_name) (word)))) (variable_assignment (variable_name) (word))))
@ -168,7 +185,7 @@ readonly var2=42
--- ---
(program (program
(declaration_command (simple_variable_name)) (declaration_command (variable_name))
(declaration_command (variable_assignment (variable_name) (word)))) (declaration_command (variable_assignment (variable_name) (word))))
========================================= =========================================
@ -183,8 +200,10 @@ local -r c
(program (program
(declaration_command (declaration_command
(variable_assignment (variable_name) (word)) (variable_assignment (variable_name) (word))
(simple_variable_name)) (variable_name))
(declaration_command (argument) (simple_variable_name))) (declaration_command
(word)
(variable_name)))
========================================= =========================================
Variable declaration: export Variable declaration: export
@ -196,9 +215,9 @@ export FOOBAR PATH="$PATH:/usr/foobar/bin"
--- ---
(program (program
(declaration_command (simple_variable_name)) (declaration_command (variable_name))
(declaration_command (declaration_command
(simple_variable_name) (variable_name)
(variable_assignment (variable_name) (string (simple_expansion (variable_name)))))) (variable_assignment (variable_name) (string (simple_expansion (variable_name))))))
========================================= =========================================

View File

@ -1,14 +1,3 @@
const SPECIAL_CHARACTERS = [
"'", '"',
'<', '>',
'{', '}',
'(', ')',
'`', '$',
'&', ';',
'\\',
'\\s',
];
module.exports = grammar({ module.exports = grammar({
name: 'bash', name: 'bash',
@ -18,6 +7,8 @@ module.exports = grammar({
$._expression, $._expression,
$._primary_expression, $._primary_expression,
$._variable_name, $._variable_name,
$._simple_variable_name,
$._simple_word,
], ],
externals: $ => [ externals: $ => [
@ -26,10 +17,14 @@ module.exports = grammar({
$._heredoc_middle, $._heredoc_middle,
$._heredoc_end, $._heredoc_end,
$.file_descriptor, $.file_descriptor,
$.word,
$._empty_value, $._empty_value,
$._concat, $._concat,
$.variable_name, // Variable name followed by an operator like '=' or '+=' $.variable_name, // Variable name followed by an operator like '=' or '+='
'\n', '\n',
']',
']]',
'}',
], ],
extras: $ => [ extras: $ => [
@ -64,7 +59,7 @@ module.exports = grammar({
for_statement: $ => seq( for_statement: $ => seq(
'for', 'for',
$._variable_name, $._simple_variable_name,
'in', 'in',
repeat1($._expression), repeat1($._expression),
$._terminator, $._terminator,
@ -125,8 +120,8 @@ module.exports = grammar({
function_definition: $ => seq( function_definition: $ => seq(
choice( choice(
seq('function', $.word, optional(seq('(', ')'))), seq('function', $._simple_variable_name, optional(seq('(', ')'))),
seq($.word, '(', ')') seq($._simple_variable_name, '(', ')')
), ),
$.compound_statement, $.compound_statement,
optional($.file_redirect) optional($.file_redirect)
@ -190,9 +185,9 @@ module.exports = grammar({
declaration_command: $ => seq( declaration_command: $ => seq(
choice('declare', 'typeset', 'export', 'readonly', 'local'), choice('declare', 'typeset', 'export', 'readonly', 'local'),
repeat(alias(seq('-', $.word), 'argument')),
repeat(choice( repeat(choice(
$.simple_variable_name, $.word,
$._simple_variable_name,
$.variable_assignment $.variable_assignment
)) ))
), ),
@ -249,6 +244,7 @@ module.exports = grammar({
_primary_expression: $ => choice( _primary_expression: $ => choice(
$.word, $.word,
$._simple_word,
$.string, $.string,
$.raw_string, $.raw_string,
$.expansion, $.expansion,
@ -298,14 +294,14 @@ module.exports = grammar({
seq( seq(
$._variable_name, $._variable_name,
choice(':', ':?', '=', ':-', '%', '/'), choice(':', ':?', '=', ':-', '%', '/'),
optional($._expression) optional(seq($._expression, optional($._concat)))
) )
), ),
'}' '}'
), ),
_variable_name: $ => choice( _variable_name: $ => choice(
alias($.simple_variable_name, $.variable_name), $._simple_variable_name,
$.special_variable_name $.special_variable_name
), ),
@ -320,22 +316,16 @@ module.exports = grammar({
')' ')'
), ),
word: $ => token(repeat1(choice(
noneOf('#', ...SPECIAL_CHARACTERS),
seq('\\', noneOf('\\s'))
))),
comment: $ => token(prec(-1, /#.*/)), comment: $ => token(prec(-1, /#.*/)),
simple_variable_name: $ => /\w+/, _simple_variable_name: $ => alias($.identifier, $.variable_name),
_simple_word: $ => alias($.identifier, $.word),
identifier: $ => /\w+/,
special_variable_name: $ => choice('*', '@', '#', '?', '-', '$', '!', '0', '_'), special_variable_name: $ => choice('*', '@', '#', '?', '-', '$', '!', '0', '_'),
_terminator: $ => choice(';', ';;', '\n', '&') _terminator: $ => choice(';', ';;', '\n', '&')
} }
}); });
function noneOf(...characters) {
const negatedString = characters.map(c => c == '\\' ? '\\\\' : c).join('')
return new RegExp('[^' + negatedString + ']')
}

129
src/grammar.json vendored
View File

@ -83,7 +83,7 @@
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_variable_name" "name": "_simple_variable_name"
}, },
{ {
"type": "STRING", "type": "STRING",
@ -329,7 +329,7 @@
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "word" "name": "_simple_variable_name"
}, },
{ {
"type": "CHOICE", "type": "CHOICE",
@ -359,7 +359,7 @@
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "word" "name": "_simple_variable_name"
}, },
{ {
"type": "STRING", "type": "STRING",
@ -658,27 +658,6 @@
} }
] ]
}, },
{
"type": "REPEAT",
"content": {
"type": "ALIAS",
"content": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "-"
},
{
"type": "SYMBOL",
"name": "word"
}
]
},
"named": false,
"value": "argument"
}
},
{ {
"type": "REPEAT", "type": "REPEAT",
"content": { "content": {
@ -686,7 +665,11 @@
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "simple_variable_name" "name": "word"
},
{
"type": "SYMBOL",
"name": "_simple_variable_name"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
@ -895,6 +878,10 @@
"type": "SYMBOL", "type": "SYMBOL",
"name": "word" "name": "word"
}, },
{
"type": "SYMBOL",
"name": "_simple_word"
},
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "string" "name": "string"
@ -1139,11 +1126,28 @@
}, },
{ {
"type": "CHOICE", "type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_expression" "name": "_expression"
}, },
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_concat"
},
{
"type": "BLANK"
}
]
}
]
},
{ {
"type": "BLANK" "type": "BLANK"
} }
@ -1163,13 +1167,8 @@
"type": "CHOICE", "type": "CHOICE",
"members": [ "members": [
{ {
"type": "ALIAS",
"content": {
"type": "SYMBOL", "type": "SYMBOL",
"name": "simple_variable_name" "name": "_simple_variable_name"
},
"named": true,
"value": "variable_name"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
@ -1246,34 +1245,6 @@
} }
] ]
}, },
"word": {
"type": "TOKEN",
"content": {
"type": "REPEAT1",
"content": {
"type": "CHOICE",
"members": [
{
"type": "PATTERN",
"value": "[^#'\"<>{}()`$&;\\\\\\s]"
},
{
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "\\"
},
{
"type": "PATTERN",
"value": "[^\\s]"
}
]
}
]
}
}
},
"comment": { "comment": {
"type": "TOKEN", "type": "TOKEN",
"content": { "content": {
@ -1285,7 +1256,25 @@
} }
} }
}, },
"simple_variable_name": { "_simple_variable_name": {
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "identifier"
},
"named": true,
"value": "variable_name"
},
"_simple_word": {
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "identifier"
},
"named": true,
"value": "word"
},
"identifier": {
"type": "PATTERN", "type": "PATTERN",
"value": "\\w+" "value": "\\w+"
}, },
@ -1396,6 +1385,10 @@
"type": "SYMBOL", "type": "SYMBOL",
"name": "file_descriptor" "name": "file_descriptor"
}, },
{
"type": "SYMBOL",
"name": "word"
},
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_empty_value" "name": "_empty_value"
@ -1411,6 +1404,18 @@
{ {
"type": "STRING", "type": "STRING",
"value": "\n" "value": "\n"
},
{
"type": "STRING",
"value": "]"
},
{
"type": "STRING",
"value": "]]"
},
{
"type": "STRING",
"value": "}"
} }
], ],
"inline": [ "inline": [
@ -1418,6 +1423,8 @@
"_terminator", "_terminator",
"_expression", "_expression",
"_primary_expression", "_primary_expression",
"_variable_name" "_variable_name",
"_simple_variable_name",
"_simple_word"
] ]
} }

63355
src/parser.c vendored

File diff suppressed because it is too large Load Diff

56
src/scanner.cc vendored
View File

@ -12,10 +12,14 @@ enum TokenType {
HEREDOC_MIDDLE, HEREDOC_MIDDLE,
HEREDOC_END, HEREDOC_END,
FILE_DESCRIPTOR, FILE_DESCRIPTOR,
WORD,
EMPTY_VALUE, EMPTY_VALUE,
CONCAT, CONCAT,
VARIABLE_NAME, VARIABLE_NAME,
NEWLINE, NEWLINE,
CLOSING_BRACKET,
CLOSING_DOUBLE_BRACKET,
CLOSING_BRACE,
}; };
struct Scanner { struct Scanner {
@ -90,6 +94,7 @@ struct Scanner {
lexer->lookahead == ')' || lexer->lookahead == ')' ||
lexer->lookahead == '(' || lexer->lookahead == '(' ||
lexer->lookahead == '[' || lexer->lookahead == '[' ||
lexer->lookahead == '|' ||
lexer->lookahead == ']' || lexer->lookahead == ']' ||
lexer->lookahead == '}' || lexer->lookahead == '}' ||
lexer->lookahead == ';' || lexer->lookahead == ';' ||
@ -130,7 +135,9 @@ struct Scanner {
return scan_heredoc_content(lexer, HEREDOC_BEGINNING, SIMPLE_HEREDOC); return scan_heredoc_content(lexer, HEREDOC_BEGINNING, SIMPLE_HEREDOC);
} }
if (valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR]) { if (valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[WORD]) {
unsigned length = 0;
for (;;) { for (;;) {
if ( if (
lexer->lookahead == ' ' || lexer->lookahead == ' ' ||
@ -139,11 +146,12 @@ struct Scanner {
) { ) {
skip(lexer); skip(lexer);
} else if (lexer->lookahead == '\\') { } else if (lexer->lookahead == '\\') {
skip(lexer); advance(lexer);
if (lexer->lookahead == '\n') { if (lexer->lookahead == '\n') {
skip(lexer); skip(lexer);
} else { } else {
return false; length++;
break;
} }
} else { } else {
break; break;
@ -151,26 +159,41 @@ struct Scanner {
} }
bool is_number = true; bool is_number = true;
if (iswdigit(lexer->lookahead)) { bool is_alphanumeric = true;
advance(lexer);
} else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
is_number = false;
advance(lexer);
} else {
return false;
}
for (;;) { for (;;) {
if (iswdigit(lexer->lookahead)) { if (iswdigit(lexer->lookahead)) {
advance(lexer);
} else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') { } else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
is_number = false; is_number = false;
advance(lexer); } else if (
!iswspace(lexer->lookahead) &&
lexer->lookahead != 0 &&
lexer->lookahead != '"' &&
lexer->lookahead != '\'' &&
lexer->lookahead != '`' &&
lexer->lookahead != '>' &&
lexer->lookahead != '<' &&
lexer->lookahead != '#' &&
lexer->lookahead != '|' &&
lexer->lookahead != '(' &&
lexer->lookahead != ')' &&
lexer->lookahead != ';' &&
lexer->lookahead != '&' &&
lexer->lookahead != '$'
) {
if (lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) break;
if (lexer->lookahead == ']' && length == 0 && (valid_symbols[CLOSING_BRACKET] || valid_symbols[CLOSING_DOUBLE_BRACKET])) break;
if (is_alphanumeric && valid_symbols[VARIABLE_NAME] && (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == '+')) break;
is_alphanumeric = false;
} else { } else {
break; break;
} }
advance(lexer);
length++;
} }
if (length == 0) return false;
if (is_number && if (is_number &&
valid_symbols[FILE_DESCRIPTOR] && valid_symbols[FILE_DESCRIPTOR] &&
(lexer->lookahead == '>' || lexer->lookahead == '<')) { (lexer->lookahead == '>' || lexer->lookahead == '<')) {
@ -194,7 +217,10 @@ struct Scanner {
} }
} }
return false; if (valid_symbols[WORD] && !is_alphanumeric) {
lexer->result_symbol = WORD;
return true;
}
} }
return false; return false;