Add comments, allow slashes in unquoted attribute values

Co-Authored-By: Ashi Krishan <queerviolet@github.com>
2018-06-11 15:36:18 -07:00 · 2018-06-11 15:36:18 -07:00 · e56df0fc7f
parent 6ee8f55084
commit e56df0fc7f
6 changed files with 591 additions and 452 deletions
--- a/corpus/main.txt
+++ b/corpus/main.txt
@ -71,3 +71,19 @@ Void tags
        (attribute (attribute_name) (attribute_value))
        (attribute (attribute_name) (attribute_value))))
    (end_tag)))
+
+==================================
+Comments
+==================================
+<!-- hello -->
+<div>
+  <!-- <span>something</span> -->
+</div>
+---
+
+(fragment
+  (comment)
+  (element
+    (start_tag)
+    (comment)
+    (end_tag)))
--- a/grammar.js
+++ b/grammar.js
@ -1,6 +1,11 @@
 module.exports = grammar({
  name: 'html',

+  extras: $ => [
+    $.comment,
+    /\s+/,
+  ],
+
  externals: $ => [
    $._open_start_tag,
    $._close_start_tag,
@ -8,6 +13,7 @@ module.exports = grammar({
    $.end_tag,
    $._implicit_end_tag,
    $._erroneous_end_tag,
+    $.comment,
  ],

  rules: {
@ -41,17 +47,19 @@ module.exports = grammar({
    ),

    attribute: $ => seq(
-      alias($._attribute_part, $.attribute_name),
+      $.attribute_name,
      optional(seq(
        '=',
        choice(
-          alias($._attribute_part, $.attribute_value),
+          $.attribute_value,
          $.quoted_attribute_value
        )
      ))
    ),

-    _attribute_part: $ => /[^<>"'/=\s]+/,
+    attribute_name: $ => /[^<>"'/=\s]+/,
+
+    attribute_value: $ => /[^<>"'=\s]+/,

    quoted_attribute_value: $ => choice(
      seq("'", optional(alias(/[^']+/, $.attribute_value)), "'"),
--- a/src/grammar.json
+++ b/src/grammar.json
@ -107,13 +107,8 @@
      "type": "SEQ",
      "members": [
        {
-          "type": "ALIAS",
-          "content": {
          "type": "SYMBOL",
-            "name": "_attribute_part"
-          },
-          "named": true,
-          "value": "attribute_name"
+          "name": "attribute_name"
        },
        {
          "type": "CHOICE",
@ -129,13 +124,8 @@
                  "type": "CHOICE",
                  "members": [
                    {
-                      "type": "ALIAS",
-                      "content": {
                      "type": "SYMBOL",
-                        "name": "_attribute_part"
-                      },
-                      "named": true,
-                      "value": "attribute_value"
+                      "name": "attribute_value"
                    },
                    {
                      "type": "SYMBOL",
@ -152,10 +142,14 @@
        }
      ]
    },
-    "_attribute_part": {
+    "attribute_name": {
      "type": "PATTERN",
      "value": "[^<>\"'\\/=\\s]+"
    },
+    "attribute_value": {
+      "type": "PATTERN",
+      "value": "[^<>\"'=\\s]+"
+    },
    "quoted_attribute_value": {
      "type": "CHOICE",
      "members": [
@ -227,9 +221,13 @@
    }
  },
  "extras": [
+    {
+      "type": "SYMBOL",
+      "name": "comment"
+    },
    {
      "type": "PATTERN",
-      "value": "\\s"
+      "value": "\\s+"
    }
  ],
  "conflicts": [],
@ -257,6 +255,10 @@
    {
      "type": "SYMBOL",
      "name": "_erroneous_end_tag"
+    },
+    {
+      "type": "SYMBOL",
+      "name": "comment"
    }
  ],
  "inline": []
--- a/src/parser.c
+++ b/src/parser.c
--- a/src/scanner.cc
+++ b/src/scanner.cc
@ -18,6 +18,7 @@ enum TokenType {
  END_TAG,
  IMPLICIT_END_TAG,
  ERRONEOUS_END_TAG,
+  COMMENT,
 };

 struct Scanner {
@ -66,7 +67,43 @@ struct Scanner {
    return tag_name;
  }

+  // Scans the rest of an HTML comment. The caller has already consumed the
+  // leading "<!"; this method requires the next two characters to be "--"
+  // and then reads up to and including the first "-->" terminator.
+  //
+  // On success, sets result_symbol to COMMENT, marks the token end just past
+  // the closing '>', and returns true. Returns false if the "--" opener is
+  // missing or the input ends before a terminator is found.
+  bool comment(TSLexer *lexer) {
+    if (lexer->lookahead != '-') return false;
+    lexer->advance(lexer, false);
+    if (lexer->lookahead != '-') return false;
+    lexer->advance(lexer, false);
+
+    // Number of consecutive '-' characters immediately preceding the
+    // current lookahead.
+    unsigned dashes = 0;
+    while (lexer->lookahead) {
+      switch (lexer->lookahead) {
+        case '-':
+          ++dashes;
+          break;
+        case '>':
+          if (dashes >= 2) {
+            lexer->result_symbol = COMMENT;
+            lexer->advance(lexer, false);
+            lexer->mark_end(lexer);
+            return true;
+          }
+          // A '>' that does not close the comment interrupts the dash run.
+          // Without this reset, input such as "->->" would be counted as
+          // two adjacent dashes and end the comment prematurely.
+          dashes = 0;
+          break;
+        default:
+          dashes = 0;
+          break;
+      }
+      lexer->advance(lexer, false);
+    }
+
+    // Reached end of input without seeing "-->".
+    return false;
+  }
+
  bool start_tag(TSLexer *lexer) {
+    if (!tags.empty() && tags.back().is_void()) {
+      tags.pop_back();
+      lexer->result_symbol = IMPLICIT_END_TAG;
+      return true;
+    }
+
    auto tag_name = scan_tag_name(lexer);
    if (tag_name.empty()) return false;

@ -115,15 +152,22 @@ struct Scanner {

    switch (lexer->lookahead) {
      case '<':
-        if (valid_symbols[OPEN_START_TAG] || valid_symbols[END_TAG]) {
        lexer->mark_end(lexer);
        lexer->advance(lexer, false);
+
+        if (lexer->lookahead == '!') {
+          lexer->advance(lexer, false);
+          return comment(lexer);
+        }
+
+        if (valid_symbols[OPEN_START_TAG] || valid_symbols[END_TAG]) {
          if (lexer->lookahead == '/') {
            lexer->advance(lexer, false);
            return end_tag(lexer);
          }
          return start_tag(lexer);
        }
+
        break;

      case '>':
--- a/test.html
+++ b/test.html
@ -1 +0,0 @@
-<form><img src=something.png><br><input type=submit value=Ok /></form>
				`@ -1 +0,0 @@`
				`<form><img src=something.png><br><input type=submit value=Ok /></form>`