Parse tag names as separate tokens

This commit is contained in:
Max Brunsfeld 2018-06-12 12:20:13 -07:00
parent 4d11a75675
commit 5f2a122de7
5 changed files with 1267 additions and 1007 deletions

View File

@ -6,9 +6,9 @@ Tags
(fragment
(element
(start_tag)
(start_tag (tag_name))
(text)
(end_tag)))
(end_tag (tag_name))))
===================================
Tags with attributes
@ -19,6 +19,7 @@ Tags with attributes
(fragment
(element
(start_tag
(tag_name)
(attribute
(attribute_name)
(attribute_value))
@ -27,7 +28,7 @@ Tags with attributes
(quoted_attribute_value (attribute_value)))
(attribute
(attribute_name)))
(end_tag)))
(end_tag (tag_name))))
===================================
Nested tags
@ -41,17 +42,19 @@ Nested tags
(fragment
(element
(start_tag)
(element
(start_tag)
(text)
(end_tag))
(start_tag (tag_name))
(text)
(element
(start_tag)
(start_tag (tag_name))
(text)
(end_tag))
(end_tag)))
(end_tag (tag_name)))
(text)
(element
(start_tag (tag_name))
(text)
(end_tag (tag_name)))
(text)
(end_tag (tag_name))))
==================================
Void tags
@ -61,16 +64,18 @@ Void tags
(fragment
(element
(start_tag)
(start_tag (tag_name))
(element
(start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
(element (start_tag))
(element (start_tag (tag_name)))
(element
(self_closing_tag
(tag_name)
(attribute (attribute_name) (attribute_value))
(attribute (attribute_name) (attribute_value))))
(end_tag)))
(end_tag (tag_name))))
==================================
Comments
@ -83,15 +88,16 @@ Comments
(fragment
(comment)
(text)
(element
(start_tag)
(start_tag (tag_name))
(comment)
(end_tag)))
(text)
(end_tag (tag_name))))
==================================
Raw text elements
==================================
<script>
</s
</sc
@ -108,11 +114,14 @@ Raw text elements
(fragment
(raw_element
(start_tag)
(end_tag))
(start_tag (tag_name))
(raw_text)
(end_tag (tag_name)))
(text)
(raw_element
(start_tag)
(end_tag))
(start_tag (tag_name))
(raw_text)
(end_tag (tag_name)))
(text))
==================================
@ -146,10 +155,11 @@ LI elements without close tags
(fragment
(element
(start_tag)
(element (start_tag) (text))
(element (start_tag) (text))
(end_tag)))
(start_tag (tag_name))
(text)
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(end_tag (tag_name))))
======================================
DT and DL elements without close tags
@ -165,13 +175,14 @@ DT and DL elements without close tags
(fragment
(element
(start_tag)
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text))
(end_tag)))
(start_tag (tag_name))
(text)
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(end_tag (tag_name))))
======================================
P elements without close tags
@ -184,11 +195,12 @@ P elements without close tags
---
(fragment
(element (start_tag) (text))
(element (start_tag) (text) (end_tag))
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text) (end_tag)))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(text)
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text) (end_tag (tag_name))))
======================================
Ruby annotation elements without close tags
@ -198,17 +210,16 @@ Ruby annotation elements without close tags
(fragment
(element
(start_tag)
(start_tag (tag_name))
(text)
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text))
(end_tag)))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(end_tag (tag_name))))
=======================================
COLGROUP elements without end tags
=======================================
<table>
<colgroup>
<col style="background-color: #0f0">
@ -223,18 +234,29 @@ COLGROUP elements without end tags
(fragment
(element
(start_tag)
(start_tag (tag_name))
(text)
(element
(start_tag)
(element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
(element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
(start_tag (tag_name))
(text)
(element (start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
(element (start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
(element
(start_tag)
(element (start_tag) (text) (end_tag))
(element (start_tag) (text) (end_tag))
(element (start_tag) (text) (end_tag))
(end_tag))
(end_tag)))
(start_tag (tag_name))
(text)
(element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(text)
(element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(text)
(element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(text)
(end_tag (tag_name)))
(text)
(end_tag (tag_name))))
=========================================
TR, TD, and TH elements without end tags
@ -251,13 +273,16 @@ TR, TD, and TH elements without end tags
(fragment
(element
(start_tag)
(start_tag (tag_name))
(text)
(element
(start_tag)
(element (start_tag) (text))
(element (start_tag) (text)))
(start_tag (tag_name))
(text)
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text)))
(element
(start_tag)
(element (start_tag) (text))
(element (start_tag) (text)))
(end_tag)))
(start_tag (tag_name))
(text)
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text)))
(end_tag (tag_name))))

View File

@ -7,14 +7,13 @@ module.exports = grammar({
],
externals: $ => [
$._open_start_tag,
$._open_raw_start_tag,
$._close_start_tag,
$._self_close_start_tag,
$.end_tag,
$._start_tag_name,
$._start_raw_tag_name,
$._end_tag_name,
$.erroneous_end_tag_name,
'/>',
$._implicit_end_tag,
$._erroneous_end_tag,
$._raw_text,
$.raw_text,
$.comment,
],
@ -31,8 +30,8 @@ module.exports = grammar({
_node: $ => choice(
$.doctype,
$.text,
$._erroneous_end_tag,
$.element,
$.erroneous_end_tag,
$.raw_element
),
@ -47,26 +46,41 @@ module.exports = grammar({
raw_element: $ => seq(
alias($._raw_start_tag, $.start_tag),
optional($._raw_text),
optional($.raw_text),
$.end_tag
),
start_tag: $ => seq(
$._open_start_tag,
'<',
alias($._start_tag_name, $.tag_name),
repeat($.attribute),
$._close_start_tag
'>'
),
_raw_start_tag: $ => seq(
$._open_raw_start_tag,
'<',
alias($._start_raw_tag_name, $.tag_name),
repeat($.attribute),
$._close_start_tag
'>'
),
self_closing_tag: $ => seq(
$._open_start_tag,
'<',
alias($._start_tag_name, $.tag_name),
repeat($.attribute),
$._self_close_start_tag
'/>'
),
// Closing tag: the literal `</`, then the `_end_tag_name` symbol (surfaced in
// the tree under the name `tag_name` via `alias`), then the literal `>`.
end_tag: $ => seq(
'</',
alias($._end_tag_name, $.tag_name),
'>'
),
// Closing tag whose name is an `erroneous_end_tag_name` token rather than an
// `_end_tag_name`: the literal `</`, that name token (kept under its own
// name, not aliased to `tag_name`), then the literal `>`.
erroneous_end_tag: $ => seq(
'</',
$.erroneous_end_tag_name,
'>'
),
attribute: $ => seq(

112
src/grammar.json vendored
View File

@ -42,11 +42,11 @@
},
{
"type": "SYMBOL",
"name": "_erroneous_end_tag"
"name": "element"
},
{
"type": "SYMBOL",
"name": "element"
"name": "erroneous_end_tag"
},
{
"type": "SYMBOL",
@ -109,7 +109,7 @@
"members": [
{
"type": "SYMBOL",
"name": "_raw_text"
"name": "raw_text"
},
{
"type": "BLANK"
@ -126,8 +126,17 @@
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_open_start_tag"
"type": "STRING",
"value": "<"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_start_tag_name"
},
"named": true,
"value": "tag_name"
},
{
"type": "REPEAT",
@ -137,8 +146,8 @@
}
},
{
"type": "SYMBOL",
"name": "_close_start_tag"
"type": "STRING",
"value": ">"
}
]
},
@ -146,8 +155,17 @@
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_open_raw_start_tag"
"type": "STRING",
"value": "<"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_start_raw_tag_name"
},
"named": true,
"value": "tag_name"
},
{
"type": "REPEAT",
@ -157,8 +175,8 @@
}
},
{
"type": "SYMBOL",
"name": "_close_start_tag"
"type": "STRING",
"value": ">"
}
]
},
@ -166,8 +184,17 @@
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_open_start_tag"
"type": "STRING",
"value": "<"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_start_tag_name"
},
"named": true,
"value": "tag_name"
},
{
"type": "REPEAT",
@ -176,9 +203,48 @@
"name": "attribute"
}
},
{
"type": "STRING",
"value": "/>"
}
]
},
"end_tag": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "</"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_end_tag_name"
},
"named": true,
"value": "tag_name"
},
{
"type": "STRING",
"value": ">"
}
]
},
"erroneous_end_tag": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "</"
},
{
"type": "SYMBOL",
"name": "_self_close_start_tag"
"name": "erroneous_end_tag_name"
},
{
"type": "STRING",
"value": ">"
}
]
},
@ -313,23 +379,23 @@
"externals": [
{
"type": "SYMBOL",
"name": "_open_start_tag"
"name": "_start_tag_name"
},
{
"type": "SYMBOL",
"name": "_open_raw_start_tag"
"name": "_start_raw_tag_name"
},
{
"type": "SYMBOL",
"name": "_close_start_tag"
"name": "_end_tag_name"
},
{
"type": "SYMBOL",
"name": "_self_close_start_tag"
"name": "erroneous_end_tag_name"
},
{
"type": "SYMBOL",
"name": "end_tag"
"type": "STRING",
"value": "/>"
},
{
"type": "SYMBOL",
@ -337,11 +403,7 @@
},
{
"type": "SYMBOL",
"name": "_erroneous_end_tag"
},
{
"type": "SYMBOL",
"name": "_raw_text"
"name": "raw_text"
},
{
"type": "SYMBOL",

1833
src/parser.c vendored

File diff suppressed because it is too large Load Diff

138
src/scanner.cc vendored
View File

@ -11,13 +11,12 @@ using std::vector;
using std::string;
enum TokenType {
OPEN_START_TAG,
OPEN_RAW_START_TAG,
CLOSE_START_TAG,
SELF_CLOSE_START_TAG,
END_TAG,
START_TAG_NAME,
START_RAW_TAG_NAME,
END_TAG_NAME,
ERRONEOUS_END_TAG_NAME,
SELF_CLOSING_TAG_DELIMITER,
IMPLICIT_END_TAG,
ERRONEOUS_END_TAG,
RAW_TEXT,
COMMENT
};
@ -68,7 +67,7 @@ struct Scanner {
return tag_name;
}
bool comment(TSLexer *lexer) {
bool scan_comment(TSLexer *lexer) {
if (lexer->lookahead != '-') return false;
lexer->advance(lexer, false);
if (lexer->lookahead != '-') return false;
@ -98,7 +97,7 @@ struct Scanner {
return false;
}
bool raw_text(TSLexer *lexer) {
bool scan_raw_text(TSLexer *lexer) {
if (!tags.size()) return false;
lexer->mark_end(lexer);
@ -123,13 +122,19 @@ struct Scanner {
return true;
}
bool start_tag(TSLexer *lexer) {
bool scan_implicit_end_tag(TSLexer *lexer) {
Tag *parent = tags.empty() ? nullptr : &tags.back();
if (parent && parent->is_void()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
bool is_closing_tag = false;
if (lexer->lookahead == '/') {
is_closing_tag = true;
lexer->advance(lexer, false);
} else {
if (parent && parent->is_void()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
}
auto tag_name = scan_tag_name(lexer);
@ -137,55 +142,70 @@ struct Scanner {
Tag next_tag = Tag::for_name(tag_name);
if (parent && !parent->can_contain(next_tag)) {
if (is_closing_tag) {
// The tag correctly closes the topmost element on the stack
if (next_tag == tags.back()) return false;
// Otherwise, dig deeper and queue implicit end tags (to be nice in
// the case of malformed HTML)
if (std::find(tags.begin(), tags.end(), next_tag) != tags.end()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
} else if (parent && !parent->can_contain(next_tag)) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
tags.push_back(next_tag);
lexer->mark_end(lexer);
lexer->result_symbol = next_tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
return false;
}
// Scans the name of an opening tag at the lexer's current position.
//
// Returns false (emitting nothing) when no tag name could be read. Otherwise
// pushes the corresponding Tag onto `tags` and emits START_RAW_TAG_NAME when
// the tag reports itself as raw, or START_TAG_NAME for every other tag.
bool scan_start_tag_name(TSLexer *lexer) {
  auto name = scan_tag_name(lexer);
  if (name.empty()) {
    return false;
  }

  // Remember the element we just opened so later end tags can be matched
  // against it.
  Tag opened = Tag::for_name(name);
  tags.push_back(opened);

  lexer->result_symbol = opened.is_raw() ? START_RAW_TAG_NAME : START_TAG_NAME;
  return true;
}
bool end_tag(TSLexer *lexer) {
bool scan_end_tag_name(TSLexer *lexer) {
auto tag_name = scan_tag_name(lexer);
if (tag_name.empty()) return false;
lexer->advance(lexer, false);
Tag tag = Tag::for_name(tag_name);
// The tag correctly closes the topmost element on the stack
if (tag == tags.back()) {
if (!tags.empty() && tags.back() == tag) {
tags.pop_back();
lexer->mark_end(lexer);
lexer->result_symbol = END_TAG;
return true;
lexer->result_symbol = END_TAG_NAME;
} else {
lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
}
// Otherwise, dig deeper and queue implicit end tags (to be nice in
// the case of malformed HTML)
if (std::find(tags.begin(), tags.end(), tag) != tags.end()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
// You closed a tag you never opened 😭
lexer->mark_end(lexer);
lexer->result_symbol = ERRONEOUS_END_TAG;
return true;
}
// Scans the `/>` delimiter that terminates a self-closing tag.
//
// The caller dispatches here when the lookahead is '/', so the first advance
// consumes that slash. If the next character is '>', it is consumed as well,
// the tag most recently pushed onto `tags` is popped (a self-closing element
// has no separate end tag), and SELF_CLOSING_TAG_DELIMITER is emitted.
// Returns false when the slash is not followed by '>'.
bool scan_self_closing_tag_delimiter(TSLexer *lexer) {
  lexer->advance(lexer, false);
  if (lexer->lookahead != '>') {
    return false;
  }

  lexer->advance(lexer, false);

  // The stack can be empty if this token is requested outside of a start tag
  // (e.g. while the parser is recovering from an error); calling pop_back()
  // on an empty vector is undefined behaviour, so guard it.
  if (!tags.empty()) {
    tags.pop_back();
  }

  lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
  return true;
}
bool scan(TSLexer *lexer, const bool *valid_symbols) {
while (iswspace(lexer->lookahead)) {
lexer->advance(lexer, true);
}
if (valid_symbols[RAW_TEXT] && !valid_symbols[OPEN_START_TAG] && !valid_symbols[CLOSE_START_TAG]) {
return raw_text(lexer);
if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
return scan_raw_text(lexer);
}
switch (lexer->lookahead) {
@ -195,38 +215,26 @@ struct Scanner {
if (lexer->lookahead == '!') {
lexer->advance(lexer, false);
return comment(lexer);
return scan_comment(lexer);
}
if (valid_symbols[OPEN_START_TAG] || valid_symbols[END_TAG]) {
if (lexer->lookahead == '/') {
lexer->advance(lexer, false);
return end_tag(lexer);
}
return start_tag(lexer);
}
break;
case '>':
if (valid_symbols[CLOSE_START_TAG]) {
lexer->advance(lexer, false);
lexer->result_symbol = CLOSE_START_TAG;
return true;
if (valid_symbols[IMPLICIT_END_TAG]) {
return scan_implicit_end_tag(lexer);
}
break;
case '/':
if (valid_symbols[SELF_CLOSE_START_TAG]) {
lexer->advance(lexer, false);
if (lexer->lookahead == '>') {
lexer->advance(lexer, false);
tags.pop_back();
lexer->result_symbol = SELF_CLOSE_START_TAG;
return true;
}
if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
return scan_self_closing_tag_delimiter(lexer);
}
break;
default:
if (valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME]) {
return valid_symbols[START_TAG_NAME]
? scan_start_tag_name(lexer)
: scan_end_tag_name(lexer);
}
}
return false;