Start tracking element nesting in external scanner

Co-Authored-By: Ashi Krishan <queerviolet@github.com>
This commit is contained in:
Max Brunsfeld 2018-06-11 15:12:01 -07:00
parent e74d96c27b
commit 6ee8f55084
8 changed files with 1102 additions and 1128 deletions

View File

@ -8,6 +8,7 @@
], ],
"sources": [ "sources": [
"src/parser.c", "src/parser.c",
"src/scanner.cc",
"src/binding.cc" "src/binding.cc"
], ],
"cflags_c": [ "cflags_c": [

View File

@ -6,9 +6,9 @@ Tags
(fragment (fragment
(element (element
(start_tag (tag_name)) (start_tag)
(text) (text)
(end_tag (tag_name)))) (end_tag)))
=================================== ===================================
Tags with attributes Tags with attributes
@ -17,9 +17,8 @@ Tags with attributes
--- ---
(fragment (fragment
(void_element (element
(void_start_tag (start_tag
(void_tag_name)
(attribute (attribute
(attribute_name) (attribute_name)
(attribute_value)) (attribute_value))
@ -28,7 +27,7 @@ Tags with attributes
(quoted_attribute_value (attribute_value))) (quoted_attribute_value (attribute_value)))
(attribute (attribute
(attribute_name))) (attribute_name)))
(end_tag (tag_name)))) (end_tag)))
=================================== ===================================
Nested tags Nested tags
@ -42,37 +41,33 @@ Nested tags
(fragment (fragment
(element (element
(start_tag (tag_name)) (start_tag)
(element
(start_tag)
(text)
(end_tag))
(text) (text)
(element (element
(start_tag (tag_name)) (start_tag)
(text) (text)
(end_tag (tag_name))) (end_tag))
(text) (end_tag)))
(element
(start_tag (tag_name))
(text)
(end_tag (tag_name)))
(text)
(end_tag (tag_name))))
================================== ==================================
Void tags Void tags
================================== ==================================
<form><img src="somethign.png"><br><input type=submit value=Ok /></form> <form><img src="something.png"><br><input type=submit value=Ok /></form>
--- ---
(fragment (fragment
(element (element
(start_tag (tag_name)) (start_tag)
(void_element (element
(void_start_tag (start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value))))) (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
(void_element (void_start_tag (tag_name))) (element (start_tag))
(void_element (element
(self_closing_tag (self_closing_tag
(tag_name)
(attribute (attribute_name) (attribute_value)) (attribute (attribute_name) (attribute_value))
(attribute (attribute_name) (attribute_value)))) (attribute (attribute_name) (attribute_value))))
(end_tag (tag_name)))) (end_tag)))

View File

@ -1,16 +1,13 @@
const startTag = ($, tag) => seq(
'<',
alias(tag, $.tag_name),
repeat($.attribute),
'>'
)
module.exports = grammar({ module.exports = grammar({
name: 'html', name: 'html',
externals: $ => [ externals: $ => [
$.tag_name, $._open_start_tag,
$._close_start_tag,
$._self_close_start_tag,
$.end_tag,
$._implicit_end_tag,
$._erroneous_end_tag,
], ],
rules: { rules: {
@ -18,31 +15,29 @@ module.exports = grammar({
_node: $ => choice( _node: $ => choice(
$.text, $.text,
$.element, $._erroneous_end_tag,
$.void_element $.element
), ),
element: $ => seq( element: $ => choice(
seq(
$.start_tag, $.start_tag,
repeat($._node), repeat($._node),
$.end_tag choice($.end_tag, $._implicit_end_tag)
), ),
void_element: $ => choice(
seq($.void_start_tag, optional($.end_tag)),
$.self_closing_tag $.self_closing_tag
), ),
start_tag: $ => startTag($, $.tag_name), start_tag: $ => seq(
$._open_start_tag,
void_start_tag: $ => startTag($, $.void_tag_name), repeat($.attribute),
$._close_start_tag
),
self_closing_tag: $ => seq( self_closing_tag: $ => seq(
'<', $._open_start_tag,
choice($.tag_name, $.void_tag_name),
repeat($.attribute), repeat($.attribute),
'/', $._self_close_start_tag
'>'
), ),
attribute: $ => seq( attribute: $ => seq(
@ -63,40 +58,6 @@ module.exports = grammar({
seq('"', optional(alias(/[^"]+/, $.attribute_value)), '"') seq('"', optional(alias(/[^"]+/, $.attribute_value)), '"')
), ),
end_tag: $ => seq(
'</',
choice($.tag_name, $.void_tag_name),
'>'
),
tag_name: $ => /[a-zA-Z\-]+/,
void_tag_name: $ => token(prec(1, choice(
'area',
'base',
'basefont',
'bgsound',
'br',
'col',
'command',
'embed',
'frame',
'hr',
'image',
'img',
'input',
'isindex',
'keygen',
'link',
'menuitem',
'meta',
'nextid',
'param',
'source',
'track',
'wbr'
))),
text: $ => /[^<>]+/ text: $ => /[^<>]+/
} }
}); });

244
src/grammar.json vendored
View File

@ -17,15 +17,18 @@
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "element" "name": "_erroneous_end_tag"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "void_element" "name": "element"
} }
] ]
}, },
"element": { "element": {
"type": "CHOICE",
"members": [
{
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
@ -40,17 +43,19 @@
} }
}, },
{ {
"type": "SYMBOL",
"name": "end_tag"
}
]
},
"void_element": {
"type": "CHOICE", "type": "CHOICE",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "void_start_tag" "name": "end_tag"
},
{
"type": "SYMBOL",
"name": "_implicit_end_tag"
}
]
}
]
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
@ -62,17 +67,8 @@
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
"type": "STRING",
"value": "<"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL", "type": "SYMBOL",
"name": "tag_name" "name": "_open_start_tag"
},
"named": true,
"value": "tag_name"
}, },
{ {
"type": "REPEAT", "type": "REPEAT",
@ -82,59 +78,17 @@
} }
}, },
{ {
"type": "STRING",
"value": ">"
}
]
},
"void_start_tag": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "<"
},
{
"type": "ALIAS",
"content": {
"type": "SYMBOL", "type": "SYMBOL",
"name": "void_tag_name" "name": "_close_start_tag"
},
"named": true,
"value": "tag_name"
},
{
"type": "REPEAT",
"content": {
"type": "SYMBOL",
"name": "attribute"
}
},
{
"type": "STRING",
"value": ">"
} }
] ]
}, },
"self_closing_tag": { "self_closing_tag": {
"type": "SEQ", "type": "SEQ",
"members": [
{
"type": "STRING",
"value": "<"
},
{
"type": "CHOICE",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "tag_name" "name": "_open_start_tag"
},
{
"type": "SYMBOL",
"name": "void_tag_name"
}
]
}, },
{ {
"type": "REPEAT", "type": "REPEAT",
@ -144,12 +98,8 @@
} }
}, },
{ {
"type": "STRING", "type": "SYMBOL",
"value": "/" "name": "_self_close_start_tag"
},
{
"type": "STRING",
"value": ">"
} }
] ]
}, },
@ -271,135 +221,6 @@
} }
] ]
}, },
"end_tag": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "<"
},
{
"type": "STRING",
"value": "/"
},
{
"type": "SYMBOL",
"name": "tag_name"
},
{
"type": "STRING",
"value": ">"
}
]
},
"tag_name": {
"type": "PATTERN",
"value": "[a-zA-Z\\-]+"
},
"void_tag_name": {
"type": "TOKEN",
"content": {
"type": "PREC",
"value": 1,
"content": {
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": "area"
},
{
"type": "STRING",
"value": "base"
},
{
"type": "STRING",
"value": "basefont"
},
{
"type": "STRING",
"value": "bgsound"
},
{
"type": "STRING",
"value": "br"
},
{
"type": "STRING",
"value": "col"
},
{
"type": "STRING",
"value": "command"
},
{
"type": "STRING",
"value": "embed"
},
{
"type": "STRING",
"value": "frame"
},
{
"type": "STRING",
"value": "hr"
},
{
"type": "STRING",
"value": "image"
},
{
"type": "STRING",
"value": "img"
},
{
"type": "STRING",
"value": "input"
},
{
"type": "STRING",
"value": "isindex"
},
{
"type": "STRING",
"value": "keygen"
},
{
"type": "STRING",
"value": "link"
},
{
"type": "STRING",
"value": "menuitem"
},
{
"type": "STRING",
"value": "meta"
},
{
"type": "STRING",
"value": "nextid"
},
{
"type": "STRING",
"value": "param"
},
{
"type": "STRING",
"value": "source"
},
{
"type": "STRING",
"value": "track"
},
{
"type": "STRING",
"value": "wbr"
}
]
}
}
},
"text": { "text": {
"type": "PATTERN", "type": "PATTERN",
"value": "[^<>]+" "value": "[^<>]+"
@ -412,6 +233,31 @@
} }
], ],
"conflicts": [], "conflicts": [],
"externals": [], "externals": [
{
"type": "SYMBOL",
"name": "_open_start_tag"
},
{
"type": "SYMBOL",
"name": "_close_start_tag"
},
{
"type": "SYMBOL",
"name": "_self_close_start_tag"
},
{
"type": "SYMBOL",
"name": "end_tag"
},
{
"type": "SYMBOL",
"name": "_implicit_end_tag"
},
{
"type": "SYMBOL",
"name": "_erroneous_end_tag"
}
],
"inline": [] "inline": []
} }

1340
src/parser.c vendored

File diff suppressed because it is too large Load Diff

185
src/scanner.cc vendored Normal file
View File

@ -0,0 +1,185 @@
#include <tree_sitter/parser.h>
#include <algorithm>
#include <cwctype>
#include <locale>
#include <string>
#include <vector>
#include "tag.h"
namespace {
using std::vector;
using std::string;
// Kinds of token this external scanner can produce. The enumerators are used
// both as indices into `valid_symbols` and as values for
// `lexer->result_symbol`; their order mirrors the `externals` list in
// grammar.js.
enum TokenType {
// `<` followed by a tag name; pushes the tag onto the scanner's stack.
OPEN_START_TAG,
// The `>` that finishes a start tag.
CLOSE_START_TAG,
// The `/>` that finishes a self-closing tag; pops the tag pushed when it opened.
SELF_CLOSE_START_TAG,
// An end tag whose name matches the tag on top of the stack.
END_TAG,
// Zero-width token emitted when an end tag matches a tag deeper in the stack.
IMPLICIT_END_TAG,
// An end tag whose name matches nothing on the stack.
ERRONEOUS_END_TAG,
};
struct Scanner {
Scanner() {}
unsigned serialize(char *buffer) {
unsigned i = 0;
for (Tag &tag : tags) {
buffer[i] = static_cast<char>(tag.type);
i++;
if (tag.type == CUSTOM) {
buffer[i++] = tag.custom_tag_name.size();
for (char c : tag.custom_tag_name) {
buffer[i++] = c;
}
}
}
return i;
}
void deserialize(const char *buffer, unsigned length) {
tags.clear();
unsigned i = 0;
while (i < length) {
Tag tag { static_cast<TagType>(buffer[i]), "" };
i++;
if (tag.type == CUSTOM) {
tag.custom_tag_name.resize(buffer[i++]);
for (unsigned j = 0; j < tag.custom_tag_name.size(); j++) {
tag.custom_tag_name[j] = buffer[i++];
}
}
tags.push_back(tag);
}
}
string scan_tag_name(TSLexer *lexer) {
string tag_name;
while (iswalpha(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
tag_name += std::toupper(lexer->lookahead);
lexer->advance(lexer, false);
}
return tag_name;
}
bool start_tag(TSLexer *lexer) {
auto tag_name = scan_tag_name(lexer);
if (tag_name.empty()) return false;
Tag tag = Tag::for_name(tag_name);
tags.push_back(tag);
lexer->mark_end(lexer);
lexer->result_symbol = OPEN_START_TAG;
return true;
}
bool end_tag(TSLexer *lexer) {
auto tag_name = scan_tag_name(lexer);
if (tag_name.empty()) return false;
lexer->advance(lexer, false);
Tag tag = Tag::for_name(tag_name);
// The tag correctly closes the topmost element on the stack
if (tag == tags.back()) {
tags.pop_back();
lexer->mark_end(lexer);
lexer->result_symbol = END_TAG;
return true;
}
// Otherwise, dig deeper and queue implicit end tags (to be nice in
// the case of malformed HTML)
if (std::find(tags.begin(), tags.end(), tag) != tags.end()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
// You closed a tag you never opened 😭
lexer->mark_end(lexer);
lexer->result_symbol = ERRONEOUS_END_TAG;
return true;
}
bool scan(TSLexer *lexer, const bool *valid_symbols) {
while (iswspace(lexer->lookahead)) {
lexer->advance(lexer, true);
}
switch (lexer->lookahead) {
case '<':
if (valid_symbols[OPEN_START_TAG] || valid_symbols[END_TAG]) {
lexer->mark_end(lexer);
lexer->advance(lexer, false);
if (lexer->lookahead == '/') {
lexer->advance(lexer, false);
return end_tag(lexer);
}
return start_tag(lexer);
}
break;
case '>':
if (valid_symbols[CLOSE_START_TAG]) {
lexer->advance(lexer, false);
lexer->result_symbol = CLOSE_START_TAG;
return true;
}
break;
case '/':
if (valid_symbols[SELF_CLOSE_START_TAG]) {
lexer->advance(lexer, false);
if (lexer->lookahead == '>') {
lexer->advance(lexer, false);
tags.pop_back();
lexer->result_symbol = SELF_CLOSE_START_TAG;
return true;
}
}
break;
}
return false;
}
vector<Tag> tags;
};
}
extern "C" {
void *tree_sitter_html_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
unsigned tree_sitter_html_external_scanner_serialize(void *payload, char *buffer) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->serialize(buffer);
}
void tree_sitter_html_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = static_cast<Scanner *>(payload);
scanner->deserialize(buffer, length);
}
// Destroys the Scanner stored behind `payload`.
void tree_sitter_html_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}
}

296
src/tag.h vendored Normal file
View File

@ -0,0 +1,296 @@
#include <string>
#include <unordered_map>
using std::string;
using std::unordered_map;
// Identifies an HTML element by tag name. The underlying type is `char`, so a
// value occupies a single byte.
//
// The enumerator order is significant: everything declared before
// END_OF_VOID_TAGS is a void tag (see Tag::is_void), and CUSTOM stands for any
// name that has no entry in TAG_TYPES_BY_TAG_NAME.
enum TagType : char {
// Void tags
AREA,
BASE,
BASEFONT,
BGSOUND,
BR,
COL,
COMMAND,
EMBED,
FRAME,
HR,
IMAGE,
IMG,
INPUT,
ISINDEX,
KEYGEN,
LINK,
MENUITEM,
META,
NEXTID,
PARAM,
SOURCE,
TRACK,
WBR,
// Sentinel separating void tags (above) from all other tags (below); it has
// no entry in the name table.
END_OF_VOID_TAGS,
A,
ABBR,
ADDRESS,
ARTICLE,
ASIDE,
AUDIO,
B,
BDI,
BDO,
BLOCKQUOTE,
BODY,
BUTTON,
CANVAS,
CAPTION,
CITE,
CODE,
COLGROUP,
DATA,
DATALIST,
DD,
DEL,
DETAILS,
DFN,
DIALOG,
DIV,
DL,
DT,
EM,
FIELDSET,
FIGCAPTION,
FIGURE,
FOOTER,
FORM,
H1,
H2,
H3,
H4,
H5,
H6,
HEAD,
HEADER,
HGROUP,
HTML,
I,
IFRAME,
INS,
KBD,
LABEL,
LEGEND,
LI,
MAIN,
MAP,
MARK,
MATH,
MENU,
METER,
NAV,
NOSCRIPT,
OBJECT,
OL,
OPTGROUP,
OPTION,
OUTPUT,
P,
PICTURE,
PRE,
PROGRESS,
Q,
RB,
RP,
RT,
RTC,
RUBY,
S,
SAMP,
SCRIPT,
SECTION,
SELECT,
SLOT,
SMALL,
SPAN,
STRONG,
STYLE,
SUB,
SUMMARY,
SUP,
SVG,
TABLE,
TBODY,
TD,
TEMPLATE,
TEXTAREA,
TFOOT,
TH,
THEAD,
TIME,
TITLE,
TR,
U,
UL,
VAR,
VIDEO,
// Any tag name without an entry in TAG_TYPES_BY_TAG_NAME; the name itself is
// kept in Tag::custom_tag_name.
CUSTOM,
};
// Lookup table from tag name to TagType, used by Tag::for_name. Keys are
// upper-case and the lookup is an exact string match, so callers must
// upper-case a name before looking it up. END_OF_VOID_TAGS and CUSTOM
// intentionally have no entry.
//
// NOTE(review): being `static` in a header, every translation unit that
// includes this file gets its own copy of the map.
static const unordered_map<string, TagType> TAG_TYPES_BY_TAG_NAME = {
// Void tags
{"AREA", AREA},
{"BASE", BASE},
{"BASEFONT", BASEFONT},
{"BGSOUND", BGSOUND},
{"BR", BR},
{"COL", COL},
{"COMMAND", COMMAND},
{"EMBED", EMBED},
{"FRAME", FRAME},
{"HR", HR},
{"IMAGE", IMAGE},
{"IMG", IMG},
{"INPUT", INPUT},
{"ISINDEX", ISINDEX},
{"KEYGEN", KEYGEN},
{"LINK", LINK},
{"MENUITEM", MENUITEM},
{"META", META},
{"NEXTID", NEXTID},
{"PARAM", PARAM},
{"SOURCE", SOURCE},
{"TRACK", TRACK},
{"WBR", WBR},
// All other known tags
{"A", A},
{"ABBR", ABBR},
{"ADDRESS", ADDRESS},
{"ARTICLE", ARTICLE},
{"ASIDE", ASIDE},
{"AUDIO", AUDIO},
{"B", B},
{"BDI", BDI},
{"BDO", BDO},
{"BLOCKQUOTE", BLOCKQUOTE},
{"BODY", BODY},
{"BUTTON", BUTTON},
{"CANVAS", CANVAS},
{"CAPTION", CAPTION},
{"CITE", CITE},
{"CODE", CODE},
{"COLGROUP", COLGROUP},
{"DATA", DATA},
{"DATALIST", DATALIST},
{"DD", DD},
{"DEL", DEL},
{"DETAILS", DETAILS},
{"DFN", DFN},
{"DIALOG", DIALOG},
{"DIV", DIV},
{"DL", DL},
{"DT", DT},
{"EM", EM},
{"FIELDSET", FIELDSET},
{"FIGCAPTION", FIGCAPTION},
{"FIGURE", FIGURE},
{"FOOTER", FOOTER},
{"FORM", FORM},
{"H1", H1},
{"H2", H2},
{"H3", H3},
{"H4", H4},
{"H5", H5},
{"H6", H6},
{"HEAD", HEAD},
{"HEADER", HEADER},
{"HGROUP", HGROUP},
{"HTML", HTML},
{"I", I},
{"IFRAME", IFRAME},
{"INS", INS},
{"KBD", KBD},
{"LABEL", LABEL},
{"LEGEND", LEGEND},
{"LI", LI},
{"MAIN", MAIN},
{"MAP", MAP},
{"MARK", MARK},
{"MATH", MATH},
{"MENU", MENU},
{"METER", METER},
{"NAV", NAV},
{"NOSCRIPT", NOSCRIPT},
{"OBJECT", OBJECT},
{"OL", OL},
{"OPTGROUP", OPTGROUP},
{"OPTION", OPTION},
{"OUTPUT", OUTPUT},
{"P", P},
{"PICTURE", PICTURE},
{"PRE", PRE},
{"PROGRESS", PROGRESS},
{"Q", Q},
{"RB", RB},
{"RP", RP},
{"RT", RT},
{"RTC", RTC},
{"RUBY", RUBY},
{"S", S},
{"SAMP", SAMP},
{"SCRIPT", SCRIPT},
{"SECTION", SECTION},
{"SELECT", SELECT},
{"SLOT", SLOT},
{"SMALL", SMALL},
{"SPAN", SPAN},
{"STRONG", STRONG},
{"STYLE", STYLE},
{"SUB", SUB},
{"SUMMARY", SUMMARY},
{"SUP", SUP},
{"SVG", SVG},
{"TABLE", TABLE},
{"TBODY", TBODY},
{"TD", TD},
{"TEMPLATE", TEMPLATE},
{"TEXTAREA", TEXTAREA},
{"TFOOT", TFOOT},
{"TH", TH},
{"THEAD", THEAD},
{"TIME", TIME},
{"TITLE", TITLE},
{"TR", TR},
{"U", U},
{"UL", UL},
{"VAR", VAR},
{"VIDEO", VIDEO},
};
// An HTML element identity: a known TagType, or CUSTOM together with the
// element's name. Kept as a plain aggregate so it can be brace-initialised.
struct Tag {
  TagType type;
  // Only meaningful when `type` is CUSTOM; empty for known tags.
  string custom_tag_name;

  // Two tags are equal when their types agree and, for CUSTOM tags, their
  // names agree as well. The name is ignored for every other type.
  bool operator==(const Tag &other) const {
    if (type == other.type) {
      return type != TagType::CUSTOM || custom_tag_name == other.custom_tag_name;
    }
    return false;
  }

  // True for tags declared before END_OF_VOID_TAGS in TagType.
  inline bool is_void() const {
    return END_OF_VOID_TAGS > type;
  }

  // TODO: a `name()` accessor (custom name for CUSTOM tags, table lookup
  // otherwise) was sketched here but never finished.

  // Builds the Tag for `name`. The lookup in TAG_TYPES_BY_TAG_NAME is an
  // exact match; names without an entry become CUSTOM tags carrying `name`.
  static Tag for_name(const string &name) {
    const auto entry = TAG_TYPES_BY_TAG_NAME.find(name);
    if (entry == TAG_TYPES_BY_TAG_NAME.end()) {
      return Tag { CUSTOM, name };
    }
    return Tag { entry->second, "" };
  }
};

View File

@ -1 +1 @@
<input value=stuff></input> <form><img src=something.png><br><input type=submit value=Ok /></form>