Parse tag names as separate tokens

2018-06-12 12:20:13 -07:00 · 2018-06-12 12:20:13 -07:00 · 5f2a122de7
parent 4d11a75675
commit 5f2a122de7
5 changed files with 1267 additions and 1007 deletions
--- a/corpus/main.txt
+++ b/corpus/main.txt
@ -6,9 +6,9 @@ Tags

 (fragment
  (element
-    (start_tag)
+    (start_tag (tag_name))
    (text)
-    (end_tag)))
+    (end_tag (tag_name))))

 ===================================
 Tags with attributes
@ -19,6 +19,7 @@ Tags with attributes
 (fragment
  (element
    (start_tag
+      (tag_name)
      (attribute
        (attribute_name)
        (attribute_value))
@ -27,7 +28,7 @@ Tags with attributes
        (quoted_attribute_value (attribute_value)))
      (attribute
        (attribute_name)))
-    (end_tag)))
+    (end_tag (tag_name))))

 ===================================
 Nested tags
@ -41,17 +42,19 @@ Nested tags

 (fragment
  (element
-    (start_tag)
-    (element
-      (start_tag)
-      (text)
-      (end_tag))
+    (start_tag (tag_name))
    (text)
    (element
-      (start_tag)
+      (start_tag (tag_name))
      (text)
-      (end_tag))
-    (end_tag)))
+      (end_tag (tag_name)))
+    (text)
+    (element
+      (start_tag (tag_name))
+      (text)
+      (end_tag (tag_name)))
+    (text)
+    (end_tag (tag_name))))

 ==================================
 Void tags
@ -61,16 +64,18 @@ Void tags

 (fragment
  (element
-    (start_tag)
+    (start_tag (tag_name))
    (element
      (start_tag
+        (tag_name)
        (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
-    (element (start_tag))
+    (element (start_tag (tag_name)))
    (element
      (self_closing_tag
+        (tag_name)
        (attribute (attribute_name) (attribute_value))
        (attribute (attribute_name) (attribute_value))))
-    (end_tag)))
+    (end_tag (tag_name))))

 ==================================
 Comments
@ -83,15 +88,16 @@ Comments

 (fragment
  (comment)
+  (text)
  (element
-    (start_tag)
+    (start_tag (tag_name))
    (comment)
-    (end_tag)))
+    (text)
+    (end_tag (tag_name))))

 ==================================
 Raw text elements
 ==================================
-
 <script>
  </s
  </sc
@ -108,11 +114,14 @@ Raw text elements

 (fragment
  (raw_element
-    (start_tag)
-    (end_tag))
+    (start_tag (tag_name))
+    (raw_text)
+    (end_tag (tag_name)))
+  (text)
  (raw_element
-    (start_tag)
-    (end_tag))
+    (start_tag (tag_name))
+    (raw_text)
+    (end_tag (tag_name)))
  (text))

 ==================================
@ -146,10 +155,11 @@ LI elements without close tags

 (fragment
  (element
-    (start_tag)
-    (element (start_tag) (text))
-    (element (start_tag) (text))
-    (end_tag)))
+    (start_tag (tag_name))
+    (text)
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (end_tag (tag_name))))

 ======================================
 DT and DL elements without close tags
@ -165,13 +175,14 @@ DT and DL elements without close tags

 (fragment
  (element
-    (start_tag)
-    (element (start_tag) (text))
-    (element (start_tag) (text))
-    (element (start_tag) (text))
-    (element (start_tag) (text))
-    (element (start_tag) (text))
-    (end_tag)))
+    (start_tag (tag_name))
+    (text)
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (end_tag (tag_name))))

 ======================================
 P elements without close tags
@ -184,11 +195,12 @@ P elements without close tags
 ---

 (fragment
-  (element (start_tag) (text))
-  (element (start_tag) (text) (end_tag))
-  (element (start_tag) (text))
-  (element (start_tag) (text))
-  (element (start_tag) (text) (end_tag)))
+  (element (start_tag (tag_name)) (text))
+  (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
+  (text)
+  (element (start_tag (tag_name)) (text))
+  (element (start_tag (tag_name)) (text))
+  (element (start_tag (tag_name)) (text) (end_tag (tag_name))))

 ======================================
 Ruby annotation elements without close tags
@ -198,17 +210,16 @@ Ruby annotation elements without close tags

 (fragment
  (element
-    (start_tag)
+    (start_tag (tag_name))
    (text)
-    (element (start_tag) (text))
-    (element (start_tag) (text))
-    (element (start_tag) (text))
-    (end_tag)))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (end_tag (tag_name))))

 =======================================
 COLGROUP elements without end tags
 =======================================
-
 <table>
  <colgroup>
    <col style="background-color: #0f0">
@ -223,18 +234,29 @@ COLGROUP elements without end tags

 (fragment
  (element
-    (start_tag)
+    (start_tag (tag_name))
+    (text)
    (element
-      (start_tag)
-      (element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
-      (element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
+      (start_tag (tag_name))
+      (text)
+      (element (start_tag
+        (tag_name)
+        (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
+      (element (start_tag
+        (tag_name)
+        (attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
    (element
-      (start_tag)
-      (element (start_tag) (text) (end_tag))
-      (element (start_tag) (text) (end_tag))
-      (element (start_tag) (text) (end_tag))
-      (end_tag))
-    (end_tag)))
+      (start_tag (tag_name))
+      (text)
+      (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
+      (text)
+      (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
+      (text)
+      (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
+      (text)
+      (end_tag (tag_name)))
+    (text)
+    (end_tag (tag_name))))

 =========================================
 TR, TD, and TH elements without end tags
@ -251,13 +273,16 @@ TR, TD, and TH elements without end tags

 (fragment
  (element
-    (start_tag)
+    (start_tag (tag_name))
+    (text)
    (element
-      (start_tag)
-      (element (start_tag) (text))
-      (element (start_tag) (text)))
+      (start_tag (tag_name))
+      (text)
+      (element (start_tag (tag_name)) (text))
+      (element (start_tag (tag_name)) (text)))
    (element
-      (start_tag)
-      (element (start_tag) (text))
-      (element (start_tag) (text)))
-    (end_tag)))
+      (start_tag (tag_name))
+      (text)
+      (element (start_tag (tag_name)) (text))
+      (element (start_tag (tag_name)) (text)))
+    (end_tag (tag_name))))
--- a/grammar.js
+++ b/grammar.js
@ -7,14 +7,13 @@ module.exports = grammar({
  ],

  externals: $ => [
-    $._open_start_tag,
-    $._open_raw_start_tag,
-    $._close_start_tag,
-    $._self_close_start_tag,
-    $.end_tag,
+    $._start_tag_name,
+    $._start_raw_tag_name,
+    $._end_tag_name,
+    $.erroneous_end_tag_name,
+    '/>',
    $._implicit_end_tag,
-    $._erroneous_end_tag,
-    $._raw_text,
+    $.raw_text,
    $.comment,
  ],

@ -31,8 +30,8 @@ module.exports = grammar({
    _node: $ => choice(
      $.doctype,
      $.text,
-      $._erroneous_end_tag,
      $.element,
+      $.erroneous_end_tag,
      $.raw_element
    ),

@ -47,26 +46,41 @@ module.exports = grammar({

    raw_element: $ => seq(
      alias($._raw_start_tag, $.start_tag),
-      optional($._raw_text),
+      optional($.raw_text),
      $.end_tag
    ),

    start_tag: $ => seq(
-      $._open_start_tag,
+      '<',
+      alias($._start_tag_name, $.tag_name),
      repeat($.attribute),
-      $._close_start_tag
+      '>'
    ),

    _raw_start_tag: $ => seq(
-      $._open_raw_start_tag,
+      '<',
+      alias($._start_raw_tag_name, $.tag_name),
      repeat($.attribute),
-      $._close_start_tag
+      '>'
    ),

    self_closing_tag: $ => seq(
-      $._open_start_tag,
+      '<',
+      alias($._start_tag_name, $.tag_name),
      repeat($.attribute),
-      $._self_close_start_tag
+      '/>'
+    ),
+
+    end_tag: $ => seq(
+      '</',
+      alias($._end_tag_name, $.tag_name),
+      '>'
+    ),
+
+    erroneous_end_tag: $ => seq(
+      '</',
+      $.erroneous_end_tag_name,
+      '>'
    ),

    attribute: $ => seq(
--- a/src/grammar.json
+++ b/src/grammar.json
@ -42,11 +42,11 @@
        },
        {
          "type": "SYMBOL",
-          "name": "_erroneous_end_tag"
+          "name": "element"
        },
        {
          "type": "SYMBOL",
-          "name": "element"
+          "name": "erroneous_end_tag"
        },
        {
          "type": "SYMBOL",
@ -109,7 +109,7 @@
          "members": [
            {
              "type": "SYMBOL",
-              "name": "_raw_text"
+              "name": "raw_text"
            },
            {
              "type": "BLANK"
@ -126,8 +126,17 @@
      "type": "SEQ",
      "members": [
        {
-          "type": "SYMBOL",
-          "name": "_open_start_tag"
+          "type": "STRING",
+          "value": "<"
+        },
+        {
+          "type": "ALIAS",
+          "content": {
+            "type": "SYMBOL",
+            "name": "_start_tag_name"
+          },
+          "named": true,
+          "value": "tag_name"
        },
        {
          "type": "REPEAT",
@ -137,8 +146,8 @@
          }
        },
        {
-          "type": "SYMBOL",
-          "name": "_close_start_tag"
+          "type": "STRING",
+          "value": ">"
        }
      ]
    },
@ -146,8 +155,17 @@
      "type": "SEQ",
      "members": [
        {
-          "type": "SYMBOL",
-          "name": "_open_raw_start_tag"
+          "type": "STRING",
+          "value": "<"
+        },
+        {
+          "type": "ALIAS",
+          "content": {
+            "type": "SYMBOL",
+            "name": "_start_raw_tag_name"
+          },
+          "named": true,
+          "value": "tag_name"
        },
        {
          "type": "REPEAT",
@ -157,8 +175,8 @@
          }
        },
        {
-          "type": "SYMBOL",
-          "name": "_close_start_tag"
+          "type": "STRING",
+          "value": ">"
        }
      ]
    },
@ -166,8 +184,17 @@
      "type": "SEQ",
      "members": [
        {
-          "type": "SYMBOL",
-          "name": "_open_start_tag"
+          "type": "STRING",
+          "value": "<"
+        },
+        {
+          "type": "ALIAS",
+          "content": {
+            "type": "SYMBOL",
+            "name": "_start_tag_name"
+          },
+          "named": true,
+          "value": "tag_name"
        },
        {
          "type": "REPEAT",
@ -176,9 +203,48 @@
            "name": "attribute"
          }
        },
+        {
+          "type": "STRING",
+          "value": "/>"
+        }
+      ]
+    },
+    "end_tag": {
+      "type": "SEQ",
+      "members": [
+        {
+          "type": "STRING",
+          "value": "</"
+        },
+        {
+          "type": "ALIAS",
+          "content": {
+            "type": "SYMBOL",
+            "name": "_end_tag_name"
+          },
+          "named": true,
+          "value": "tag_name"
+        },
+        {
+          "type": "STRING",
+          "value": ">"
+        }
+      ]
+    },
+    "erroneous_end_tag": {
+      "type": "SEQ",
+      "members": [
+        {
+          "type": "STRING",
+          "value": "</"
+        },
        {
          "type": "SYMBOL",
-          "name": "_self_close_start_tag"
+          "name": "erroneous_end_tag_name"
+        },
+        {
+          "type": "STRING",
+          "value": ">"
        }
      ]
    },
@ -313,23 +379,23 @@
  "externals": [
    {
      "type": "SYMBOL",
-      "name": "_open_start_tag"
+      "name": "_start_tag_name"
    },
    {
      "type": "SYMBOL",
-      "name": "_open_raw_start_tag"
+      "name": "_start_raw_tag_name"
    },
    {
      "type": "SYMBOL",
-      "name": "_close_start_tag"
+      "name": "_end_tag_name"
    },
    {
      "type": "SYMBOL",
-      "name": "_self_close_start_tag"
+      "name": "erroneous_end_tag_name"
    },
    {
-      "type": "SYMBOL",
-      "name": "end_tag"
+      "type": "STRING",
+      "value": "/>"
    },
    {
      "type": "SYMBOL",
@ -337,11 +403,7 @@
    },
    {
      "type": "SYMBOL",
-      "name": "_erroneous_end_tag"
-    },
-    {
-      "type": "SYMBOL",
-      "name": "_raw_text"
+      "name": "raw_text"
    },
    {
      "type": "SYMBOL",
--- a/src/parser.c
+++ b/src/parser.c
--- a/src/scanner.cc
+++ b/src/scanner.cc
@ -11,13 +11,12 @@ using std::vector;
 using std::string;

 enum TokenType {
-  OPEN_START_TAG,
-  OPEN_RAW_START_TAG,
-  CLOSE_START_TAG,
-  SELF_CLOSE_START_TAG,
-  END_TAG,
+  START_TAG_NAME,
+  START_RAW_TAG_NAME,
+  END_TAG_NAME,
+  ERRONEOUS_END_TAG_NAME,
+  SELF_CLOSING_TAG_DELIMITER,
  IMPLICIT_END_TAG,
-  ERRONEOUS_END_TAG,
  RAW_TEXT,
  COMMENT
 };
@ -68,7 +67,7 @@ struct Scanner {
    return tag_name;
  }

-  bool comment(TSLexer *lexer) {
+  bool scan_comment(TSLexer *lexer) {
    if (lexer->lookahead != '-') return false;
    lexer->advance(lexer, false);
    if (lexer->lookahead != '-') return false;
@ -98,7 +97,7 @@ struct Scanner {
    return false;
  }

-  bool raw_text(TSLexer *lexer) {
+  bool scan_raw_text(TSLexer *lexer) {
    if (!tags.size()) return false;

    lexer->mark_end(lexer);
@ -123,13 +122,19 @@ struct Scanner {
    return true;
  }

-  bool start_tag(TSLexer *lexer) {
+  bool scan_implicit_end_tag(TSLexer *lexer) {  // true => IMPLICIT_END_TAG emitted, top tag popped
    Tag *parent = tags.empty() ? nullptr : &tags.back();

-    if (parent && parent->is_void()) {
-      tags.pop_back();
-      lexer->result_symbol = IMPLICIT_END_TAG;
-      return true;
+    bool is_closing_tag = false;
+    if (lexer->lookahead == '/') {
+      is_closing_tag = true;
+      lexer->advance(lexer, false);
+    } else {
+      if (parent && parent->is_void()) {
+        tags.pop_back();
+        lexer->result_symbol = IMPLICIT_END_TAG;
+        return true;
+      }
    }

    auto tag_name = scan_tag_name(lexer);
@ -137,55 +142,70 @@ struct Scanner {

    Tag next_tag = Tag::for_name(tag_name);

-    if (parent && !parent->can_contain(next_tag)) {
+    if (is_closing_tag) {
+      // The tag closes the topmost open element: nothing implicit to emit.
+      if (!tags.empty() && next_tag == tags.back()) return false;  // back() on an empty vector is UB
+
+      // Otherwise, if the tag matches an element deeper in the stack, pop one
+      // level per call via an implicit end tag (lenient with malformed HTML)
+      if (std::find(tags.begin(), tags.end(), next_tag) != tags.end()) {
+        tags.pop_back();
+        lexer->result_symbol = IMPLICIT_END_TAG;
+        return true;
+      }
+    } else if (parent && !parent->can_contain(next_tag)) {
      tags.pop_back();
      lexer->result_symbol = IMPLICIT_END_TAG;
      return true;
    }

-    tags.push_back(next_tag);
-    lexer->mark_end(lexer);
-    lexer->result_symbol = next_tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
+    return false;
+  }
+
+  bool scan_start_tag_name(TSLexer *lexer) {
+    auto tag_name = scan_tag_name(lexer);
+    if (tag_name.empty()) return false;
+    Tag tag = Tag::for_name(tag_name);
+    tags.push_back(tag);
+    if (tag.is_raw()) {
+      lexer->result_symbol = START_RAW_TAG_NAME;
+    } else {
+      lexer->result_symbol = START_TAG_NAME;
+    }
    return true;
  }

-  bool end_tag(TSLexer *lexer) {
+  bool scan_end_tag_name(TSLexer *lexer) {
    auto tag_name = scan_tag_name(lexer);
    if (tag_name.empty()) return false;
-
-    lexer->advance(lexer, false);
-
    Tag tag = Tag::for_name(tag_name);
-
-    // The tag correctly closes the topmost element on the stack
-    if (tag == tags.back()) {
+    if (!tags.empty() && tags.back() == tag) {
      tags.pop_back();
-      lexer->mark_end(lexer);
-      lexer->result_symbol = END_TAG;
-      return true;
+      lexer->result_symbol = END_TAG_NAME;
+    } else {
+      lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
    }
-
-    // Otherwise, dig deeper and queue implicit end tags (to be nice in
-    // the case of malformed HTML)
-    if (std::find(tags.begin(), tags.end(), tag) != tags.end()) {
-      tags.pop_back();
-      lexer->result_symbol = IMPLICIT_END_TAG;
-      return true;
-    }
-
-    // You closed a tag you never opened 😭
-    lexer->mark_end(lexer);
-    lexer->result_symbol = ERRONEOUS_END_TAG;
    return true;
  }

+  bool scan_self_closing_tag_delimiter(TSLexer *lexer) {  // matches "/>"; false if '/' is not followed by '>'
+    lexer->advance(lexer, false);  // consume the '/' that scan() dispatched on
+    if (lexer->lookahead == '>') {
+      lexer->advance(lexer, false);
+      if (!tags.empty()) tags.pop_back();  // drop the just-opened tag; pop_back() on an empty vector is UB
+      lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
+      return true;
+    }
+    return false;
+  }
+
  bool scan(TSLexer *lexer, const bool *valid_symbols) {
    while (iswspace(lexer->lookahead)) {
      lexer->advance(lexer, true);
    }

-    if (valid_symbols[RAW_TEXT] && !valid_symbols[OPEN_START_TAG] && !valid_symbols[CLOSE_START_TAG]) {
-      return raw_text(lexer);
+    if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
+      return scan_raw_text(lexer);
    }

    switch (lexer->lookahead) {
@ -195,38 +215,26 @@ struct Scanner {

        if (lexer->lookahead == '!') {
          lexer->advance(lexer, false);
-          return comment(lexer);
+          return scan_comment(lexer);
        }

-        if (valid_symbols[OPEN_START_TAG] || valid_symbols[END_TAG]) {
-          if (lexer->lookahead == '/') {
-            lexer->advance(lexer, false);
-            return end_tag(lexer);
-          }
-          return start_tag(lexer);
-        }
-
-        break;
-
-      case '>':
-        if (valid_symbols[CLOSE_START_TAG]) {
-          lexer->advance(lexer, false);
-          lexer->result_symbol = CLOSE_START_TAG;
-          return true;
+        if (valid_symbols[IMPLICIT_END_TAG]) {
+          return scan_implicit_end_tag(lexer);
        }
        break;

      case '/':
-        if (valid_symbols[SELF_CLOSE_START_TAG]) {
-          lexer->advance(lexer, false);
-          if (lexer->lookahead == '>') {
-            lexer->advance(lexer, false);
-            tags.pop_back();
-            lexer->result_symbol = SELF_CLOSE_START_TAG;
-            return true;
-          }
+        if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
+          return scan_self_closing_tag_delimiter(lexer);
        }
        break;
+
+      default:
+        if (valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME]) {
+          return valid_symbols[START_TAG_NAME]
+            ? scan_start_tag_name(lexer)
+            : scan_end_tag_name(lexer);
+        }
    }

    return false;