Parse tag names as separate tokens

This commit is contained in:
Max Brunsfeld 2018-06-12 12:20:13 -07:00
parent 4d11a75675
commit 5f2a122de7
5 changed files with 1267 additions and 1007 deletions

View File

@ -6,9 +6,9 @@ Tags
(fragment (fragment
(element (element
(start_tag) (start_tag (tag_name))
(text) (text)
(end_tag))) (end_tag (tag_name))))
=================================== ===================================
Tags with attributes Tags with attributes
@ -19,6 +19,7 @@ Tags with attributes
(fragment (fragment
(element (element
(start_tag (start_tag
(tag_name)
(attribute (attribute
(attribute_name) (attribute_name)
(attribute_value)) (attribute_value))
@ -27,7 +28,7 @@ Tags with attributes
(quoted_attribute_value (attribute_value))) (quoted_attribute_value (attribute_value)))
(attribute (attribute
(attribute_name))) (attribute_name)))
(end_tag))) (end_tag (tag_name))))
=================================== ===================================
Nested tags Nested tags
@ -41,17 +42,19 @@ Nested tags
(fragment (fragment
(element (element
(start_tag) (start_tag (tag_name))
(element
(start_tag)
(text)
(end_tag))
(text) (text)
(element (element
(start_tag) (start_tag (tag_name))
(text) (text)
(end_tag)) (end_tag (tag_name)))
(end_tag))) (text)
(element
(start_tag (tag_name))
(text)
(end_tag (tag_name)))
(text)
(end_tag (tag_name))))
================================== ==================================
Void tags Void tags
@ -61,16 +64,18 @@ Void tags
(fragment (fragment
(element (element
(start_tag) (start_tag (tag_name))
(element (element
(start_tag (start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value))))) (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
(element (start_tag)) (element (start_tag (tag_name)))
(element (element
(self_closing_tag (self_closing_tag
(tag_name)
(attribute (attribute_name) (attribute_value)) (attribute (attribute_name) (attribute_value))
(attribute (attribute_name) (attribute_value)))) (attribute (attribute_name) (attribute_value))))
(end_tag))) (end_tag (tag_name))))
================================== ==================================
Comments Comments
@ -83,15 +88,16 @@ Comments
(fragment (fragment
(comment) (comment)
(text)
(element (element
(start_tag) (start_tag (tag_name))
(comment) (comment)
(end_tag))) (text)
(end_tag (tag_name))))
================================== ==================================
Raw text elements Raw text elements
================================== ==================================
<script> <script>
</s </s
</sc </sc
@ -108,11 +114,14 @@ Raw text elements
(fragment (fragment
(raw_element (raw_element
(start_tag) (start_tag (tag_name))
(end_tag)) (raw_text)
(end_tag (tag_name)))
(text)
(raw_element (raw_element
(start_tag) (start_tag (tag_name))
(end_tag)) (raw_text)
(end_tag (tag_name)))
(text)) (text))
================================== ==================================
@ -146,10 +155,11 @@ LI elements without close tags
(fragment (fragment
(element (element
(start_tag) (start_tag (tag_name))
(element (start_tag) (text)) (text)
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(end_tag))) (element (start_tag (tag_name)) (text))
(end_tag (tag_name))))
====================================== ======================================
DT and DL elements without close tags DT and DL elements without close tags
@ -165,13 +175,14 @@ DT and DL elements without close tags
(fragment (fragment
(element (element
(start_tag) (start_tag (tag_name))
(element (start_tag) (text)) (text)
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(end_tag))) (element (start_tag (tag_name)) (text))
(end_tag (tag_name))))
====================================== ======================================
P elements without close tags P elements without close tags
@ -184,11 +195,12 @@ P elements without close tags
--- ---
(fragment (fragment
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(element (start_tag) (text) (end_tag)) (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(element (start_tag) (text)) (text)
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(element (start_tag) (text) (end_tag))) (element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text) (end_tag (tag_name))))
====================================== ======================================
Ruby annotation elements without close tags Ruby annotation elements without close tags
@ -198,17 +210,16 @@ Ruby annotation elements without close tags
(fragment (fragment
(element (element
(start_tag) (start_tag (tag_name))
(text) (text)
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(element (start_tag) (text)) (element (start_tag (tag_name)) (text))
(end_tag))) (end_tag (tag_name))))
======================================= =======================================
COLGROUP elements without end tags COLGROUP elements without end tags
======================================= =======================================
<table> <table>
<colgroup> <colgroup>
<col style="background-color: #0f0"> <col style="background-color: #0f0">
@ -223,18 +234,29 @@ COLGROUP elements without end tags
(fragment (fragment
(element (element
(start_tag) (start_tag (tag_name))
(text)
(element (element
(start_tag) (start_tag (tag_name))
(element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value))))) (text)
(element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))) (element (start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
(element (start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
(element (element
(start_tag) (start_tag (tag_name))
(element (start_tag) (text) (end_tag)) (text)
(element (start_tag) (text) (end_tag)) (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(element (start_tag) (text) (end_tag)) (text)
(end_tag)) (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(end_tag))) (text)
(element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(text)
(end_tag (tag_name)))
(text)
(end_tag (tag_name))))
========================================= =========================================
TR, TD, and TH elements without end tags TR, TD, and TH elements without end tags
@ -251,13 +273,16 @@ TR, TD, and TH elements without end tags
(fragment (fragment
(element (element
(start_tag) (start_tag (tag_name))
(text)
(element (element
(start_tag) (start_tag (tag_name))
(element (start_tag) (text)) (text)
(element (start_tag) (text))) (element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text)))
(element (element
(start_tag) (start_tag (tag_name))
(element (start_tag) (text)) (text)
(element (start_tag) (text))) (element (start_tag (tag_name)) (text))
(end_tag))) (element (start_tag (tag_name)) (text)))
(end_tag (tag_name))))

View File

@ -7,14 +7,13 @@ module.exports = grammar({
], ],
externals: $ => [ externals: $ => [
$._open_start_tag, $._start_tag_name,
$._open_raw_start_tag, $._start_raw_tag_name,
$._close_start_tag, $._end_tag_name,
$._self_close_start_tag, $.erroneous_end_tag_name,
$.end_tag, '/>',
$._implicit_end_tag, $._implicit_end_tag,
$._erroneous_end_tag, $.raw_text,
$._raw_text,
$.comment, $.comment,
], ],
@ -31,8 +30,8 @@ module.exports = grammar({
_node: $ => choice( _node: $ => choice(
$.doctype, $.doctype,
$.text, $.text,
$._erroneous_end_tag,
$.element, $.element,
$.erroneous_end_tag,
$.raw_element $.raw_element
), ),
@ -47,26 +46,41 @@ module.exports = grammar({
raw_element: $ => seq( raw_element: $ => seq(
alias($._raw_start_tag, $.start_tag), alias($._raw_start_tag, $.start_tag),
optional($._raw_text), optional($.raw_text),
$.end_tag $.end_tag
), ),
start_tag: $ => seq( start_tag: $ => seq(
$._open_start_tag, '<',
alias($._start_tag_name, $.tag_name),
repeat($.attribute), repeat($.attribute),
$._close_start_tag '>'
), ),
_raw_start_tag: $ => seq( _raw_start_tag: $ => seq(
$._open_raw_start_tag, '<',
alias($._start_raw_tag_name, $.tag_name),
repeat($.attribute), repeat($.attribute),
$._close_start_tag '>'
), ),
self_closing_tag: $ => seq( self_closing_tag: $ => seq(
$._open_start_tag, '<',
alias($._start_tag_name, $.tag_name),
repeat($.attribute), repeat($.attribute),
$._self_close_start_tag '/>'
),
end_tag: $ => seq(
'</',
alias($._end_tag_name, $.tag_name),
'>'
),
erroneous_end_tag: $ => seq(
'</',
$.erroneous_end_tag_name,
'>'
), ),
attribute: $ => seq( attribute: $ => seq(

112
src/grammar.json vendored
View File

@ -42,11 +42,11 @@
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_erroneous_end_tag" "name": "element"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "element" "name": "erroneous_end_tag"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
@ -109,7 +109,7 @@
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_raw_text" "name": "raw_text"
}, },
{ {
"type": "BLANK" "type": "BLANK"
@ -126,8 +126,17 @@
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "STRING",
"name": "_open_start_tag" "value": "<"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_start_tag_name"
},
"named": true,
"value": "tag_name"
}, },
{ {
"type": "REPEAT", "type": "REPEAT",
@ -137,8 +146,8 @@
} }
}, },
{ {
"type": "SYMBOL", "type": "STRING",
"name": "_close_start_tag" "value": ">"
} }
] ]
}, },
@ -146,8 +155,17 @@
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "STRING",
"name": "_open_raw_start_tag" "value": "<"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_start_raw_tag_name"
},
"named": true,
"value": "tag_name"
}, },
{ {
"type": "REPEAT", "type": "REPEAT",
@ -157,8 +175,8 @@
} }
}, },
{ {
"type": "SYMBOL", "type": "STRING",
"name": "_close_start_tag" "value": ">"
} }
] ]
}, },
@ -166,8 +184,17 @@
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "STRING",
"name": "_open_start_tag" "value": "<"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_start_tag_name"
},
"named": true,
"value": "tag_name"
}, },
{ {
"type": "REPEAT", "type": "REPEAT",
@ -176,9 +203,48 @@
"name": "attribute" "name": "attribute"
} }
}, },
{
"type": "STRING",
"value": "/>"
}
]
},
"end_tag": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "</"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_end_tag_name"
},
"named": true,
"value": "tag_name"
},
{
"type": "STRING",
"value": ">"
}
]
},
"erroneous_end_tag": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "</"
},
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_self_close_start_tag" "name": "erroneous_end_tag_name"
},
{
"type": "STRING",
"value": ">"
} }
] ]
}, },
@ -313,23 +379,23 @@
"externals": [ "externals": [
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_open_start_tag" "name": "_start_tag_name"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_open_raw_start_tag" "name": "_start_raw_tag_name"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_close_start_tag" "name": "_end_tag_name"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_self_close_start_tag" "name": "erroneous_end_tag_name"
}, },
{ {
"type": "SYMBOL", "type": "STRING",
"name": "end_tag" "value": "/>"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
@ -337,11 +403,7 @@
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_erroneous_end_tag" "name": "raw_text"
},
{
"type": "SYMBOL",
"name": "_raw_text"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",

1833
src/parser.c vendored

File diff suppressed because it is too large Load Diff

138
src/scanner.cc vendored
View File

@ -11,13 +11,12 @@ using std::vector;
using std::string; using std::string;
enum TokenType { enum TokenType {
OPEN_START_TAG, START_TAG_NAME,
OPEN_RAW_START_TAG, START_RAW_TAG_NAME,
CLOSE_START_TAG, END_TAG_NAME,
SELF_CLOSE_START_TAG, ERRONEOUS_END_TAG_NAME,
END_TAG, SELF_CLOSING_TAG_DELIMITER,
IMPLICIT_END_TAG, IMPLICIT_END_TAG,
ERRONEOUS_END_TAG,
RAW_TEXT, RAW_TEXT,
COMMENT COMMENT
}; };
@ -68,7 +67,7 @@ struct Scanner {
return tag_name; return tag_name;
} }
bool comment(TSLexer *lexer) { bool scan_comment(TSLexer *lexer) {
if (lexer->lookahead != '-') return false; if (lexer->lookahead != '-') return false;
lexer->advance(lexer, false); lexer->advance(lexer, false);
if (lexer->lookahead != '-') return false; if (lexer->lookahead != '-') return false;
@ -98,7 +97,7 @@ struct Scanner {
return false; return false;
} }
bool raw_text(TSLexer *lexer) { bool scan_raw_text(TSLexer *lexer) {
if (!tags.size()) return false; if (!tags.size()) return false;
lexer->mark_end(lexer); lexer->mark_end(lexer);
@ -123,13 +122,19 @@ struct Scanner {
return true; return true;
} }
bool start_tag(TSLexer *lexer) { bool scan_implicit_end_tag(TSLexer *lexer) {
Tag *parent = tags.empty() ? nullptr : &tags.back(); Tag *parent = tags.empty() ? nullptr : &tags.back();
if (parent && parent->is_void()) { bool is_closing_tag = false;
tags.pop_back(); if (lexer->lookahead == '/') {
lexer->result_symbol = IMPLICIT_END_TAG; is_closing_tag = true;
return true; lexer->advance(lexer, false);
} else {
if (parent && parent->is_void()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
} }
auto tag_name = scan_tag_name(lexer); auto tag_name = scan_tag_name(lexer);
@ -137,55 +142,70 @@ struct Scanner {
Tag next_tag = Tag::for_name(tag_name); Tag next_tag = Tag::for_name(tag_name);
if (parent && !parent->can_contain(next_tag)) { if (is_closing_tag) {
// The tag correctly closes the topmost element on the stack
if (next_tag == tags.back()) return false;
// Otherwise, dig deeper and queue implicit end tags (to be nice in
// the case of malformed HTML)
if (std::find(tags.begin(), tags.end(), next_tag) != tags.end()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
} else if (parent && !parent->can_contain(next_tag)) {
tags.pop_back(); tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG; lexer->result_symbol = IMPLICIT_END_TAG;
return true; return true;
} }
tags.push_back(next_tag); return false;
lexer->mark_end(lexer); }
lexer->result_symbol = next_tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
bool scan_start_tag_name(TSLexer *lexer) {
auto tag_name = scan_tag_name(lexer);
if (tag_name.empty()) return false;
Tag tag = Tag::for_name(tag_name);
tags.push_back(tag);
if (tag.is_raw()) {
lexer->result_symbol = START_RAW_TAG_NAME;
} else {
lexer->result_symbol = START_TAG_NAME;
}
return true; return true;
} }
bool end_tag(TSLexer *lexer) { bool scan_end_tag_name(TSLexer *lexer) {
auto tag_name = scan_tag_name(lexer); auto tag_name = scan_tag_name(lexer);
if (tag_name.empty()) return false; if (tag_name.empty()) return false;
lexer->advance(lexer, false);
Tag tag = Tag::for_name(tag_name); Tag tag = Tag::for_name(tag_name);
if (!tags.empty() && tags.back() == tag) {
// The tag correctly closes the topmost element on the stack
if (tag == tags.back()) {
tags.pop_back(); tags.pop_back();
lexer->mark_end(lexer); lexer->result_symbol = END_TAG_NAME;
lexer->result_symbol = END_TAG; } else {
return true; lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
} }
// Otherwise, dig deeper and queue implicit end tags (to be nice in
// the case of malformed HTML)
if (std::find(tags.begin(), tags.end(), tag) != tags.end()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
// You closed a tag you never opened 😭
lexer->mark_end(lexer);
lexer->result_symbol = ERRONEOUS_END_TAG;
return true; return true;
} }
bool scan_self_closing_tag_delimiter(TSLexer *lexer) {
lexer->advance(lexer, false);
if (lexer->lookahead == '>') {
lexer->advance(lexer, false);
tags.pop_back();
lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
return true;
}
return false;
}
bool scan(TSLexer *lexer, const bool *valid_symbols) { bool scan(TSLexer *lexer, const bool *valid_symbols) {
while (iswspace(lexer->lookahead)) { while (iswspace(lexer->lookahead)) {
lexer->advance(lexer, true); lexer->advance(lexer, true);
} }
if (valid_symbols[RAW_TEXT] && !valid_symbols[OPEN_START_TAG] && !valid_symbols[CLOSE_START_TAG]) { if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
return raw_text(lexer); return scan_raw_text(lexer);
} }
switch (lexer->lookahead) { switch (lexer->lookahead) {
@ -195,38 +215,26 @@ struct Scanner {
if (lexer->lookahead == '!') { if (lexer->lookahead == '!') {
lexer->advance(lexer, false); lexer->advance(lexer, false);
return comment(lexer); return scan_comment(lexer);
} }
if (valid_symbols[OPEN_START_TAG] || valid_symbols[END_TAG]) { if (valid_symbols[IMPLICIT_END_TAG]) {
if (lexer->lookahead == '/') { return scan_implicit_end_tag(lexer);
lexer->advance(lexer, false);
return end_tag(lexer);
}
return start_tag(lexer);
}
break;
case '>':
if (valid_symbols[CLOSE_START_TAG]) {
lexer->advance(lexer, false);
lexer->result_symbol = CLOSE_START_TAG;
return true;
} }
break; break;
case '/': case '/':
if (valid_symbols[SELF_CLOSE_START_TAG]) { if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
lexer->advance(lexer, false); return scan_self_closing_tag_delimiter(lexer);
if (lexer->lookahead == '>') {
lexer->advance(lexer, false);
tags.pop_back();
lexer->result_symbol = SELF_CLOSE_START_TAG;
return true;
}
} }
break; break;
default:
if (valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME]) {
return valid_symbols[START_TAG_NAME]
? scan_start_tag_name(lexer)
: scan_end_tag_name(lexer);
}
} }
return false; return false;