From 7cfe147792719c4517aa01d402229d1bce70804f Mon Sep 17 00:00:00 2001
From: Max Brunsfeld <maxbrunsfeld@gmail.com>
Date: Tue, 12 Jun 2018 10:51:03 -0700
Subject: [PATCH] Handle elements with optional end tags (li, p, etc)

Co-Authored-By: Ashi Krishnan <queerviolet@github.com>
---
 corpus/main.txt | 127 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/scanner.cc  |  18 +++++--
 src/tag.h       |  69 ++++++++++++++++++++++++--
 3 files changed, 204 insertions(+), 10 deletions(-)
diff --git a/corpus/main.txt b/corpus/main.txt
index 3099359..c6998ed 100644
--- a/corpus/main.txt
+++ b/corpus/main.txt
@@ -134,3 +134,130 @@ Lowercase doctype
 
 (fragment
   (doctype))
+
+==================================
+LI elements without close tags
+==================================
+<ul>
+  <li>One
+  <li>Two
+</ul>
+---
+
+(fragment
+  (element
+    (start_tag)
+    (element (start_tag) (text))
+    (element (start_tag) (text))
+    (end_tag)))
+
+======================================
+DT and DD elements without close tags
+======================================
+<dl>
+  <dt>Coffee
+  <dt>Café
+  <dd>Black hot drink
+  <dt>Milk
+  <dd>White cold drink
+</dl>
+---
+
+(fragment
+  (element
+    (start_tag)
+    (element (start_tag) (text))
+    (element (start_tag) (text))
+    (element (start_tag) (text))
+    (element (start_tag) (text))
+    (element (start_tag) (text))
+    (end_tag)))
+
+======================================
+P elements without close tags
+======================================
+<p>One
+<div>Two</div>
+<p>Three
+<p>Four
+<h1>Five</h1>
+---
+
+(fragment
+  (element (start_tag) (text))
+  (element (start_tag) (text) (end_tag))
+  (element (start_tag) (text))
+  (element (start_tag) (text))
+  (element (start_tag) (text) (end_tag)))
+
+======================================
+Ruby annotation elements without close tags
+======================================
+<ruby>東<rb>京<rt>とう<rt>きょう</ruby>
+---
+
+(fragment
+  (element
+    (start_tag)
+    (text)
+    (element (start_tag) (text))
+    (element (start_tag) (text))
+    (element (start_tag) (text))
+    (end_tag)))
+
+=======================================
+COLGROUP elements without end tags
+=======================================
+
+<table>
+  <colgroup>
+    <col style="background-color: #0f0">
+    <col span="2">
+  <tr>
+    <th>Lime</th>
+    <th>Lemon</th>
+    <th>Orange</th>
+  </tr>
+</table>
+---
+
+(fragment
+  (element
+    (start_tag)
+    (element
+      (start_tag)
+      (element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
+      (element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
+    (element
+      (start_tag)
+      (element (start_tag) (text) (end_tag))
+      (element (start_tag) (text) (end_tag))
+      (element (start_tag) (text) (end_tag))
+      (end_tag))
+    (end_tag)))
+
+=========================================
+TR, TD, and TH elements without end tags
+=========================================
+<table>
+  <tr>
+    <th>One
+    <th>Two
+  <tr>
+    <td>Three
+    <td>Four
+</table>
+---
+
+(fragment
+  (element
+    (start_tag)
+    (element
+      (start_tag)
+      (element (start_tag) (text))
+      (element (start_tag) (text)))
+    (element
+      (start_tag)
+      (element (start_tag) (text))
+      (element (start_tag) (text)))
+    (end_tag)))
diff --git a/src/scanner.cc b/src/scanner.cc
index 9f2c8be..1c511c0 100644
--- a/src/scanner.cc
+++ b/src/scanner.cc
@@ -61,7 +61,7 @@ struct Scanner {
 
   string scan_tag_name(TSLexer *lexer) {
     string tag_name;
-    while (iswalpha(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
+    while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
       tag_name += towupper(lexer->lookahead);
       lexer->advance(lexer, false);
     }
@@ -124,7 +124,9 @@ struct Scanner {
   }
 
   bool start_tag(TSLexer *lexer) {
-    if (!tags.empty() && tags.back().is_void()) {
+    Tag *parent = tags.empty() ? nullptr : &tags.back();
+
+    if (parent && parent->is_void()) {
       tags.pop_back();
       lexer->result_symbol = IMPLICIT_END_TAG;
       return true;
@@ -133,11 +135,17 @@ struct Scanner {
     auto tag_name = scan_tag_name(lexer);
     if (tag_name.empty()) return false;
 
-    Tag tag = Tag::for_name(tag_name);
-    tags.push_back(tag);
+    Tag next_tag = Tag::for_name(tag_name);
 
+    if (parent && !parent->can_contain(next_tag)) {
+      tags.pop_back();
+      lexer->result_symbol = IMPLICIT_END_TAG;
+      return true;
+    }
+
+    tags.push_back(next_tag);
     lexer->mark_end(lexer);
-    lexer->result_symbol = tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
+    lexer->result_symbol = next_tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
     return true;
   }
 
diff --git a/src/tag.h b/src/tag.h
index 91a5ca7..6e15327 100644
--- a/src/tag.h
+++ b/src/tag.h
@@ -265,6 +265,35 @@ static const unordered_map<string, TagType> TAG_TYPES_BY_TAG_NAME = {
   {"VIDEO", VIDEO},
 };
 
+// Lookup table indexed by TagType: true marks a tag whose start tag
+// implicitly closes an open <p> element (see Tag::can_contain).
+//
+// Entries are set by assignment in a constructor instead of with array
+// designators ([TAG] = true): those are C99 syntax that ISO C++ does not
+// provide, so only compilers offering them as an extension accept them.
+//
+// NOTE(review): the HTML spec's list for <p> also names hgroup, menu,
+// table and ul; confirm whether they should be added here.
+static const struct ParagraphCannotContain {
+  bool flags[CUSTOM + 1];
+
+  ParagraphCannotContain() : flags() {  // flags() zero-fills the table
+    static const TagType closers[] = {
+      ADDRESS, ARTICLE, ASIDE, BLOCKQUOTE,
+      DETAILS, DIV, DL,
+      FIELDSET, FIGCAPTION, FIGURE, FOOTER, FORM,
+      H1, H2, H3, H4, H5, H6,
+      HEADER, HR, MAIN, NAV, OL, P, PRE, SECTION,
+    };
+    for (TagType closer : closers) {
+      flags[closer] = true;
+    }
+  }
+
+  // Keeps the PARAGRAPH_CANNOT_CONTAIN[type] lookup syntax unchanged.
+  bool operator[](TagType tag_type) const { return flags[tag_type]; }
+} PARAGRAPH_CANNOT_CONTAIN;
+
 struct Tag {
   TagType type;
   string custom_tag_name;
@@ -283,11 +312,41 @@ struct Tag {
     return type == SCRIPT || type == STYLE;
   }
 
-  // string name() const {
-  //   return type == TagType::CUSTOM
-  //     ? custom_tag_name
-  //     : TAG_TYPES_BY_TAG_NAME.
-  // }
+  // Whether an element of this type may directly contain `tag`. When this
+  // returns false the scanner emits an implicit end tag for this element
+  // instead of nesting `tag`. Only types with optional end tags restrict
+  // their children. Const so it can be called through a const Tag.
+  bool can_contain(const Tag &tag) const {
+    const TagType child = tag.type;
+
+    switch (type) {
+      case LI: return child != LI;
+
+      case DT: case DD:
+        return child != DT && child != DD;
+
+      case P:
+        return !PARAGRAPH_CANNOT_CONTAIN[child];
+
+      case COLGROUP:
+        return child == COL;
+
+      case RB: case RT: case RP:
+        return child != RB && child != RT && child != RP;
+
+      case OPTGROUP:
+        return child != OPTGROUP;
+
+      case TR:
+        return child != TR;
+
+      case TD: case TH:
+        return child != TD && child != TH && child != TR;
+
+      default:
+        return true;
+    }
+  }
 
   static Tag for_name(const string &name) {
     auto type = TAG_TYPES_BY_TAG_NAME.find(name);