Add raw text elements and doctypes

Co-Authored-By: Ashi Krishnan <queerviolet@github.com>
This commit is contained in:
Max Brunsfeld 2018-06-11 16:56:33 -07:00
parent d9a504136e
commit 21b5fad6f0
6 changed files with 1156 additions and 465 deletions

View File

@ -87,3 +87,50 @@ Comments
(start_tag) (start_tag)
(comment) (comment)
(end_tag))) (end_tag)))
==================================
Raw text elements
==================================
<script>
</s
</sc
</scr
</scri
</scrip
</script>
<style>
</ </s </st </sty </styl
</style>
---
(fragment
(raw_element
(start_tag)
(end_tag))
(raw_element
(start_tag)
(end_tag))
(text))
==================================
All-caps doctype
==================================
<!DOCTYPE html PUBLIC
"-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
---
(fragment
(doctype))
==================================
Lowercase doctype
==================================
<!doctype html>
---
(fragment
(doctype))

View File

@ -8,21 +8,32 @@ module.exports = grammar({
externals: $ => [ externals: $ => [
$._open_start_tag, $._open_start_tag,
$._open_raw_start_tag,
$._close_start_tag, $._close_start_tag,
$._self_close_start_tag, $._self_close_start_tag,
$.end_tag, $.end_tag,
$._implicit_end_tag, $._implicit_end_tag,
$._erroneous_end_tag, $._erroneous_end_tag,
$._raw_text,
$.comment, $.comment,
], ],
rules: { rules: {
fragment: $ => repeat($._node), fragment: $ => repeat($._node),
// A doctype declaration: the literal `<!`, the word "doctype" in any mix of
// upper- and lower-case letters, one or more characters other than `>`,
// and the closing `>`.
doctype: $ => seq(
'<!',
/[Dd][Oo][Cc][Tt][Yy][Pp][Ee]/,
// Everything up to the closing bracket; at least one character is required.
/[^>]+/,
'>'
),
_node: $ => choice( _node: $ => choice(
$.doctype,
$.text, $.text,
$._erroneous_end_tag, $._erroneous_end_tag,
$.element $.element,
$.raw_element
), ),
element: $ => choice( element: $ => choice(
@ -34,12 +45,24 @@ module.exports = grammar({
$.self_closing_tag $.self_closing_tag
), ),
// An element opened by a raw start tag: the start tag (aliased so it appears
// in the tree as `start_tag`), an optional run of raw text produced by the
// external `_raw_text` token, and an end tag.
raw_element: $ => seq(
alias($._raw_start_tag, $.start_tag),
optional($._raw_text),
$.end_tag
),
start_tag: $ => seq( start_tag: $ => seq(
$._open_start_tag, $._open_start_tag,
repeat($.attribute), repeat($.attribute),
$._close_start_tag $._close_start_tag
), ),
// Same shape as `start_tag`, but opened by the external
// `_open_raw_start_tag` token instead of `_open_start_tag`.
_raw_start_tag: $ => seq(
$._open_raw_start_tag,
repeat($.attribute),
$._close_start_tag
),
self_closing_tag: $ => seq( self_closing_tag: $ => seq(
$._open_start_tag, $._open_start_tag,
repeat($.attribute), repeat($.attribute),

87
src/grammar.json vendored
View File

@ -8,9 +8,34 @@
"name": "_node" "name": "_node"
} }
}, },
"doctype": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "<!"
},
{
"type": "PATTERN",
"value": "[Dd][Oo][Cc][Tt][Yy][Pp][Ee]"
},
{
"type": "PATTERN",
"value": "[^>]+"
},
{
"type": "STRING",
"value": ">"
}
]
},
"_node": { "_node": {
"type": "CHOICE", "type": "CHOICE",
"members": [ "members": [
{
"type": "SYMBOL",
"name": "doctype"
},
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "text" "name": "text"
@ -22,6 +47,10 @@
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "element" "name": "element"
},
{
"type": "SYMBOL",
"name": "raw_element"
} }
] ]
}, },
@ -63,6 +92,36 @@
} }
] ]
}, },
"raw_element": {
"type": "SEQ",
"members": [
{
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_raw_start_tag"
},
"named": true,
"value": "start_tag"
},
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_raw_text"
},
{
"type": "BLANK"
}
]
},
{
"type": "SYMBOL",
"name": "end_tag"
}
]
},
"start_tag": { "start_tag": {
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
@ -83,6 +142,26 @@
} }
] ]
}, },
"_raw_start_tag": {
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_open_raw_start_tag"
},
{
"type": "REPEAT",
"content": {
"type": "SYMBOL",
"name": "attribute"
}
},
{
"type": "SYMBOL",
"name": "_close_start_tag"
}
]
},
"self_closing_tag": { "self_closing_tag": {
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
@ -236,6 +315,10 @@
"type": "SYMBOL", "type": "SYMBOL",
"name": "_open_start_tag" "name": "_open_start_tag"
}, },
{
"type": "SYMBOL",
"name": "_open_raw_start_tag"
},
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_close_start_tag" "name": "_close_start_tag"
@ -256,6 +339,10 @@
"type": "SYMBOL", "type": "SYMBOL",
"name": "_erroneous_end_tag" "name": "_erroneous_end_tag"
}, },
{
"type": "SYMBOL",
"name": "_raw_text"
},
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "comment" "name": "comment"

1422
src/parser.c vendored

File diff suppressed because it is too large Load Diff

36
src/scanner.cc vendored
View File

@ -3,7 +3,6 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <locale> #include <locale>
#include "tag.h" #include "tag.h"
namespace { namespace {
@ -13,12 +12,14 @@ using std::string;
enum TokenType { enum TokenType {
OPEN_START_TAG, OPEN_START_TAG,
OPEN_RAW_START_TAG,
CLOSE_START_TAG, CLOSE_START_TAG,
SELF_CLOSE_START_TAG, SELF_CLOSE_START_TAG,
END_TAG, END_TAG,
IMPLICIT_END_TAG, IMPLICIT_END_TAG,
ERRONEOUS_END_TAG, ERRONEOUS_END_TAG,
COMMENT, RAW_TEXT,
COMMENT
}; };
struct Scanner { struct Scanner {
@ -97,6 +98,31 @@ struct Scanner {
return false; return false;
} }
// Scans the body of the innermost open tag as a single RAW_TEXT token.
//
// The token covers every character up to, but not including, the closing
// delimiter: "</script" when the innermost tag's type is SCRIPT, "</style"
// otherwise. The delimiter is compared against both its lowercase and its
// uppercase spelling, character by character, so mixed-case closers such as
// "</Script" are recognized too.
//
// Returns false when no tag is open; otherwise sets the result symbol to
// RAW_TEXT and returns true. The token is empty when the delimiter is the
// first thing in the input. If the input ends before the delimiter is
// completed, the token ends before any trailing partial delimiter.
bool raw_text(TSLexer *lexer) {
  if (tags.empty()) return false;

  // Begin with an empty token; the end mark is only moved past characters
  // that are known to be content rather than part of the delimiter.
  lexer->mark_end(lexer);

  const bool in_script = tags.back().type == SCRIPT;
  const string lower_delimiter = in_script ? "</script" : "</style";
  const string upper_delimiter = in_script ? "</SCRIPT" : "</STYLE";

  unsigned delimiter_index = 0;
  while (lexer->lookahead) {
    const bool matches_delimiter =
      lexer->lookahead == lower_delimiter[delimiter_index] ||
      lexer->lookahead == upper_delimiter[delimiter_index];

    if (matches_delimiter) {
      delimiter_index++;
      // Full delimiter seen: the end mark still sits just before its '<'.
      if (delimiter_index == lower_delimiter.size()) break;
      lexer->advance(lexer, false);
    } else if (delimiter_index > 0) {
      // A partial match fell through, so the characters consumed so far are
      // content. Do not advance: the current character is re-tested against
      // the start of the delimiter on the next iteration, so that input like
      // "<</script" still terminates the token.
      delimiter_index = 0;
      lexer->mark_end(lexer);
    } else {
      // Ordinary content character: consume it first, then extend the token
      // so that it is included.
      lexer->advance(lexer, false);
      lexer->mark_end(lexer);
    }
  }

  lexer->result_symbol = RAW_TEXT;
  return true;
}
bool start_tag(TSLexer *lexer) { bool start_tag(TSLexer *lexer) {
if (!tags.empty() && tags.back().is_void()) { if (!tags.empty() && tags.back().is_void()) {
tags.pop_back(); tags.pop_back();
@ -111,7 +137,7 @@ struct Scanner {
tags.push_back(tag); tags.push_back(tag);
lexer->mark_end(lexer); lexer->mark_end(lexer);
lexer->result_symbol = OPEN_START_TAG; lexer->result_symbol = tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
return true; return true;
} }
@ -150,6 +176,10 @@ struct Scanner {
lexer->advance(lexer, true); lexer->advance(lexer, true);
} }
if (valid_symbols[RAW_TEXT] && !valid_symbols[OPEN_START_TAG] && !valid_symbols[CLOSE_START_TAG]) {
return raw_text(lexer);
}
switch (lexer->lookahead) { switch (lexer->lookahead) {
case '<': case '<':
lexer->mark_end(lexer); lexer->mark_end(lexer);

4
src/tag.h vendored
View File

@ -279,6 +279,10 @@ struct Tag {
return type < END_OF_VOID_TAGS; return type < END_OF_VOID_TAGS;
} }
// Reports whether this tag's type is SCRIPT or STYLE.
inline bool is_raw() const {
  if (type == SCRIPT) {
    return true;
  }
  return type == STYLE;
}
// string name() const { // string name() const {
// return type == TagType::CUSTOM // return type == TagType::CUSTOM
// ? custom_tag_name // ? custom_tag_name