diff --git a/grammar.js b/grammar.js index f91bdbf..69cf600 100644 --- a/grammar.js +++ b/grammar.js @@ -888,11 +888,11 @@ function defineQuoted(start, end, name) { [`_quoted_i_${name}`]: ($) => seq( field("quoted_start", start), + optional(alias($[`_quoted_content_i_${name}`], $.quoted_content)), repeat( - choice( - alias($[`_quoted_content_i_${name}`], $.quoted_content), - $.interpolation, - $.escape_sequence + seq( + choice($.interpolation, $.escape_sequence), + optional(alias($[`_quoted_content_i_${name}`], $.quoted_content)) ) ), field("quoted_end", end) @@ -901,11 +901,12 @@ function defineQuoted(start, end, name) { [`_quoted_${name}`]: ($) => seq( field("quoted_start", start), + optional(alias($[`_quoted_content_${name}`], $.quoted_content)), repeat( - choice( - alias($[`_quoted_content_${name}`], $.quoted_content), - // The end delimiter may always be escaped - $.escape_sequence + seq( + // The end delimiter may be escaped in non-interpolating strings too + $.escape_sequence, + optional(alias($[`_quoted_content_${name}`], $.quoted_content)) ) ), field("quoted_end", end) diff --git a/src/scanner.cc b/src/scanner.cc index b8dd7b7..8889957 100644 --- a/src/scanner.cc +++ b/src/scanner.cc @@ -174,7 +174,22 @@ int8_t find_quoted_token_info(const bool* valid_symbols) { bool scan_quoted_content(TSLexer* lexer, const QuotedContentInfo& info) { lexer->result_symbol = info.token_type; + bool is_heredoc = (info.delimiter_length == 3); + for (bool has_content = false; true; has_content = true) { + bool newline = false; + + if (is_newline(lexer->lookahead)) { + advance(lexer); + + has_content = true; + newline = true; + + while (is_whitespace(lexer->lookahead)) { + advance(lexer); + } + } + lexer->mark_end(lexer); if (lexer->lookahead == info.end_delimiter) { @@ -189,7 +204,7 @@ bool scan_quoted_content(TSLexer* lexer, const QuotedContentInfo& info) { } } - if (length == info.delimiter_length) { + if (length == info.delimiter_length && (!is_heredoc || newline)) { return has_content; } } else { @@ -199,16 +214,18 @@ bool scan_quoted_content(TSLexer* lexer, const QuotedContentInfo& info) { return has_content; } } else if (lexer->lookahead == '\\') { - if (info.supports_interpol) { + advance(lexer); + if (is_heredoc && lexer->lookahead == '\n') { + // We need to know about the newline to correctly recognise + // heredoc end delimiter, so we intentionally ignore escaping + } else if (info.supports_interpol || lexer->lookahead == info.end_delimiter) { return has_content; - } else { - advance(lexer); - if (lexer->lookahead == info.end_delimiter) { - return has_content; - } } } else if (lexer->lookahead == '\0') { - return false; + // If we reached the end of the file, this means there is no + // end delimiter, so the syntax is invalid. In that case we + // want to treat all the scanned content as quoted content. + return has_content; } else { advance(lexer); } diff --git a/test/corpus/term/string.txt b/test/corpus/term/string.txt index 63c28f5..9869797 100644 --- a/test/corpus/term/string.txt +++ b/test/corpus/term/string.txt @@ -1,3 +1,14 @@ +===================================== +empty +===================================== + +"" + +--- + +(source + (string)) + ===================================== single line ===================================== @@ -171,6 +182,47 @@ this is #{ (quoted_content))) (quoted_content))) +===================================== +heredoc / delimiter in the middle +===================================== + +""" +hey """ +""" + +--- + +(source + (string + (quoted_content))) + +===================================== +heredoc / escaped newline (ignored) +===================================== + +""" +hey \ +""" + + """ + hey \ + """ + +""" +hey \ +there +""" + +--- + +(source + (string + (quoted_content)) + (string + (quoted_content)) + (string + (quoted_content))) + ===================================== heredoc / escaped delimiter =====================================