From 5f2a122de7d7ad3e74a3c219d05c7a61ada2112d Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 12 Jun 2018 12:20:13 -0700 Subject: [PATCH] Parse tag names as separate tokens --- corpus/main.txt | 147 ++-- grammar.js | 44 +- src/grammar.json | 112 ++- src/parser.c | 1833 +++++++++++++++++++++++++--------------------- src/scanner.cc | 138 ++-- 5 files changed, 1267 insertions(+), 1007 deletions(-) diff --git a/corpus/main.txt b/corpus/main.txt index c6998ed..1c19046 100644 --- a/corpus/main.txt +++ b/corpus/main.txt @@ -6,9 +6,9 @@ Tags (fragment (element - (start_tag) + (start_tag (tag_name)) (text) - (end_tag))) + (end_tag (tag_name)))) =================================== Tags with attributes @@ -19,6 +19,7 @@ Tags with attributes (fragment (element (start_tag + (tag_name) (attribute (attribute_name) (attribute_value)) @@ -27,7 +28,7 @@ Tags with attributes (quoted_attribute_value (attribute_value))) (attribute (attribute_name))) - (end_tag))) + (end_tag (tag_name)))) =================================== Nested tags @@ -41,17 +42,19 @@ Nested tags (fragment (element - (start_tag) - (element - (start_tag) - (text) - (end_tag)) + (start_tag (tag_name)) (text) (element - (start_tag) + (start_tag (tag_name)) (text) - (end_tag)) - (end_tag))) + (end_tag (tag_name))) + (text) + (element + (start_tag (tag_name)) + (text) + (end_tag (tag_name))) + (text) + (end_tag (tag_name)))) ================================== Void tags @@ -61,16 +64,18 @@ Void tags (fragment (element - (start_tag) + (start_tag (tag_name)) (element (start_tag + (tag_name) (attribute (attribute_name) (quoted_attribute_value (attribute_value))))) - (element (start_tag)) + (element (start_tag (tag_name))) (element (self_closing_tag + (tag_name) (attribute (attribute_name) (attribute_value)) (attribute (attribute_name) (attribute_value)))) - (end_tag))) + (end_tag (tag_name)))) ================================== Comments @@ -83,15 +88,16 @@ Comments (fragment (comment) + (text) (element - (start_tag) + (start_tag (tag_name)) (comment) - (end_tag))) + (text) + (end_tag (tag_name)))) ================================== Raw text elements ================================== -