diff --git a/binding.gyp b/binding.gyp index 2c933cf..5ed124d 100644 --- a/binding.gyp +++ b/binding.gyp @@ -8,6 +8,7 @@ ], "sources": [ "src/parser.c", + "src/scanner.cc", "src/binding.cc" ], "cflags_c": [ diff --git a/corpus/main.txt b/corpus/main.txt index 2546f0a..e2f06db 100644 --- a/corpus/main.txt +++ b/corpus/main.txt @@ -6,9 +6,9 @@ Tags (fragment (element - (start_tag (tag_name)) + (start_tag) (text) - (end_tag (tag_name)))) + (end_tag))) =================================== Tags with attributes @@ -17,9 +17,8 @@ Tags with attributes --- (fragment - (void_element - (void_start_tag - (void_tag_name) + (element + (start_tag (attribute (attribute_name) (attribute_value)) @@ -28,7 +27,7 @@ Tags with attributes (quoted_attribute_value (attribute_value))) (attribute (attribute_name))) - (end_tag (tag_name)))) + (end_tag))) =================================== Nested tags @@ -42,37 +41,33 @@ Nested tags (fragment (element - (start_tag (tag_name)) + (start_tag) + (element + (start_tag) + (text) + (end_tag)) (text) (element - (start_tag (tag_name)) + (start_tag) (text) - (end_tag (tag_name))) - (text) - (element - (start_tag (tag_name)) - (text) - (end_tag (tag_name))) - (text) - (end_tag (tag_name)))) + (end_tag)) + (end_tag))) ================================== Void tags ================================== -

+

--- (fragment (element - (start_tag (tag_name)) - (void_element - (void_start_tag - (tag_name) + (start_tag) + (element + (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value))))) - (void_element (void_start_tag (tag_name))) - (void_element + (element (start_tag)) + (element (self_closing_tag - (tag_name) (attribute (attribute_name) (attribute_value)) (attribute (attribute_name) (attribute_value)))) - (end_tag (tag_name)))) + (end_tag))) diff --git a/grammar.js b/grammar.js index b9d07cb..e696dbd 100644 --- a/grammar.js +++ b/grammar.js @@ -1,16 +1,13 @@ -const startTag = ($, tag) => seq( - '<', - alias(tag, $.tag_name), - repeat($.attribute), - '>' -) - module.exports = grammar({ name: 'html', externals: $ => [ - $.tag_name, - + $._open_start_tag, + $._close_start_tag, + $._self_close_start_tag, + $.end_tag, + $._implicit_end_tag, + $._erroneous_end_tag, ], rules: { @@ -18,31 +15,29 @@ module.exports = grammar({ _node: $ => choice( $.text, - $.element, - $.void_element + $._erroneous_end_tag, + $.element ), - element: $ => seq( - $.start_tag, - repeat($._node), - $.end_tag - ), - - void_element: $ => choice( - seq($.void_start_tag, optional($.end_tag)), + element: $ => choice( + seq( + $.start_tag, + repeat($._node), + choice($.end_tag, $._implicit_end_tag) + ), $.self_closing_tag ), - start_tag: $ => startTag($, $.tag_name), - - void_start_tag: $ => startTag($, $.void_tag_name), + start_tag: $ => seq( + $._open_start_tag, + repeat($.attribute), + $._close_start_tag + ), self_closing_tag: $ => seq( - '<', - choice($.tag_name, $.void_tag_name), + $._open_start_tag, repeat($.attribute), - '/', - '>' + $._self_close_start_tag ), attribute: $ => seq( @@ -63,40 +58,6 @@ module.exports = grammar({ seq('"', optional(alias(/[^"]+/, $.attribute_value)), '"') ), - end_tag: $ => seq( - '' - ), - - tag_name: $ => /[a-zA-Z\-]+/, - - void_tag_name: $ => token(prec(1, choice( - 'area', - 'base', - 'basefont', - 'bgsound', - 'br', - 'col', - 'command', - 'embed', - 'frame', - 'hr', - 'image', - 'img', - 'input', - 'isindex', - 'keygen', - 'link', - 'menuitem', - 'meta', - 'nextid', - 'param', - 'source', - 'track', - 'wbr' - ))), - text: $ => /[^<>]+/ } }); diff --git a/src/grammar.json b/src/grammar.json index 0d4f600..5ca074d 100644 --- a/src/grammar.json +++ b/src/grammar.json @@ -17,40 +17,45 @@ }, { "type": "SYMBOL", - "name": "element" + "name": "_erroneous_end_tag" }, { "type": "SYMBOL", - "name": "void_element" + "name": "element" } ] }, "element": { - "type": "SEQ", - "members": [ - { - "type": "SYMBOL", - "name": "start_tag" - }, - { - "type": "REPEAT", - "content": { - "type": "SYMBOL", - "name": "_node" - } - }, - { - "type": "SYMBOL", - "name": "end_tag" - } - ] - }, - "void_element": { "type": "CHOICE", "members": [ { - "type": "SYMBOL", - "name": "void_start_tag" + "type": "SEQ", + "members": [ + { + "type": "SYMBOL", + "name": "start_tag" + }, + { + "type": "REPEAT", + "content": { + "type": "SYMBOL", + "name": "_node" + } + }, + { + "type": "CHOICE", + "members": [ + { + "type": "SYMBOL", + "name": "end_tag" + }, + { + "type": "SYMBOL", + "name": "_implicit_end_tag" + } + ] + } + ] }, { "type": "SYMBOL", @@ -62,17 +67,8 @@ "type": "SEQ", "members": [ { - "type": "STRING", - "value": "<" - }, - { - "type": "ALIAS", - "content": { - "type": "SYMBOL", - "name": "tag_name" - }, - "named": true, - "value": "tag_name" + "type": "SYMBOL", + "name": "_open_start_tag" }, { "type": "REPEAT", @@ -82,37 +78,8 @@ } }, { - "type": "STRING", - "value": ">" - } - ] - }, - "void_start_tag": { - "type": "SEQ", - "members": [ - { - "type": "STRING", - "value": "<" - }, - { - "type": "ALIAS", - "content": { - "type": "SYMBOL", - "name": "void_tag_name" - }, - "named": true, - "value": "tag_name" - }, - { - "type": "REPEAT", - "content": { - "type": "SYMBOL", - "name": "attribute" - } - }, - { - "type": "STRING", - "value": ">" + "type": "SYMBOL", + "name": "_close_start_tag" } ] }, @@ -120,21 +87,8 @@ "type": "SEQ", "members": [ { - "type": "STRING", - "value": "<" - }, - { - "type": "CHOICE", - "members": [ - { - "type": "SYMBOL", - "name": "tag_name" - }, - { - "type": "SYMBOL", - "name": "void_tag_name" - } - ] + "type": "SYMBOL", + "name": "_open_start_tag" }, { "type": "REPEAT", @@ -144,12 +98,8 @@ } }, { - "type": "STRING", - "value": "/" - }, - { - "type": "STRING", - "value": ">" + "type": "SYMBOL", + "name": "_self_close_start_tag" } ] }, @@ -271,135 +221,6 @@ } ] }, - "end_tag": { - "type": "SEQ", - "members": [ - { - "type": "STRING", - "value": "<" - }, - { - "type": "STRING", - "value": "/" - }, - { - "type": "SYMBOL", - "name": "tag_name" - }, - { - "type": "STRING", - "value": ">" - } - ] - }, - "tag_name": { - "type": "PATTERN", - "value": "[a-zA-Z\\-]+" - }, - "void_tag_name": { - "type": "TOKEN", - "content": { - "type": "PREC", - "value": 1, - "content": { - "type": "CHOICE", - "members": [ - { - "type": "STRING", - "value": "area" - }, - { - "type": "STRING", - "value": "base" - }, - { - "type": "STRING", - "value": "basefont" - }, - { - "type": "STRING", - "value": "bgsound" - }, - { - "type": "STRING", - "value": "br" - }, - { - "type": "STRING", - "value": "col" - }, - { - "type": "STRING", - "value": "command" - }, - { - "type": "STRING", - "value": "embed" - }, - { - "type": "STRING", - "value": "frame" - }, - { - "type": "STRING", - "value": "hr" - }, - { - "type": "STRING", - "value": "image" - }, - { - "type": "STRING", - "value": "img" - }, - { - "type": "STRING", - "value": "input" - }, - { - "type": "STRING", - "value": "isindex" - }, - { - "type": "STRING", - "value": "keygen" - }, - { - "type": "STRING", - "value": "link" - }, - { - "type": "STRING", - "value": "menuitem" - }, - { - "type": "STRING", - "value": "meta" - }, - { - "type": "STRING", - "value": "nextid" - }, - { - "type": "STRING", - "value": "param" - }, - { - "type": "STRING", - "value": "source" - }, - { - "type": "STRING", - "value": "track" - }, - { - "type": "STRING", - "value": "wbr" - } - ] - } - } - }, "text": { "type": "PATTERN", "value": "[^<>]+" @@ -412,6 +233,31 @@ } ], "conflicts": [], - "externals": [], + "externals": [ + { + "type": "SYMBOL", + "name": "_open_start_tag" + }, + { + "type": "SYMBOL", + "name": "_close_start_tag" + }, + { + "type": "SYMBOL", + "name": "_self_close_start_tag" + }, + { + "type": "SYMBOL", + "name": "end_tag" + }, + { + "type": "SYMBOL", + "name": "_implicit_end_tag" + }, + { + "type": "SYMBOL", + "name": "_erroneous_end_tag" + } + ], "inline": [] } \ No newline at end of file diff --git a/src/parser.c b/src/parser.c index d4b969f..a874a2c 100644 --- a/src/parser.c +++ b/src/parser.c @@ -6,90 +6,96 @@ #endif #define LANGUAGE_VERSION 8 -#define STATE_COUNT 40 -#define SYMBOL_COUNT 25 -#define ALIAS_COUNT 3 -#define TOKEN_COUNT 13 -#define EXTERNAL_TOKEN_COUNT 0 -#define MAX_ALIAS_SEQUENCE_LENGTH 4 +#define STATE_COUNT 39 +#define SYMBOL_COUNT 23 +#define ALIAS_COUNT 2 +#define TOKEN_COUNT 14 +#define EXTERNAL_TOKEN_COUNT 6 +#define MAX_ALIAS_SEQUENCE_LENGTH 3 enum { - anon_sym_LT = 1, - anon_sym_GT = 2, - anon_sym_SLASH = 3, - anon_sym_EQ = 4, - sym__attribute_part = 5, - anon_sym_SQUOTE = 6, - aux_sym_SLASH_LBRACK_CARET_SQUOTE_RBRACK_PLUS_SLASH = 7, - anon_sym_DQUOTE = 8, - aux_sym_SLASH_LBRACK_CARET_DQUOTE_RBRACK_PLUS_SLASH = 9, - sym_tag_name = 10, - sym_void_tag_name = 11, - sym_text = 12, - sym_fragment = 13, - sym__node = 14, - sym_element = 15, - sym_void_element = 16, + sym__open_start_tag = 1, + sym__close_start_tag = 2, + sym__self_close_start_tag = 3, + sym_end_tag = 4, + sym__implicit_end_tag = 5, + sym__erroneous_end_tag = 6, + anon_sym_EQ = 7, + sym__attribute_part = 8, + anon_sym_SQUOTE = 9, + aux_sym_SLASH_LBRACK_CARET_SQUOTE_RBRACK_PLUS_SLASH = 10, + anon_sym_DQUOTE = 11, + aux_sym_SLASH_LBRACK_CARET_DQUOTE_RBRACK_PLUS_SLASH = 12, + sym_text = 13, + sym_fragment = 14, + sym__node = 15, + sym_element = 16, sym_start_tag = 17, - sym_void_start_tag = 18, - sym_self_closing_tag = 19, - sym_attribute = 20, - sym_quoted_attribute_value = 21, - sym_end_tag = 22, - aux_sym_fragment_repeat1 = 23, - aux_sym_start_tag_repeat1 = 24, - alias_sym_attribute_name = 25, - alias_sym_attribute_value = 26, - alias_sym_tag_name = 27, + sym_self_closing_tag = 18, + sym_attribute = 19, + sym_quoted_attribute_value = 20, + aux_sym_fragment_repeat1 = 21, + aux_sym_start_tag_repeat1 = 22, + alias_sym_attribute_name = 23, + alias_sym_attribute_value = 24, }; static const char *ts_symbol_names[] = { + [sym__open_start_tag] = "_open_start_tag", + [sym__close_start_tag] = "_close_start_tag", + [sym__self_close_start_tag] = "_self_close_start_tag", + [sym_end_tag] = "end_tag", + [sym__implicit_end_tag] = "_implicit_end_tag", + [sym__erroneous_end_tag] = "_erroneous_end_tag", [ts_builtin_sym_end] = "END", - [anon_sym_LT] = "<", - [anon_sym_GT] = ">", - [anon_sym_SLASH] = "/", [anon_sym_EQ] = "=", [sym__attribute_part] = "_attribute_part", [anon_sym_SQUOTE] = "'", [aux_sym_SLASH_LBRACK_CARET_SQUOTE_RBRACK_PLUS_SLASH] = "/[^']+/", [anon_sym_DQUOTE] = "\"", [aux_sym_SLASH_LBRACK_CARET_DQUOTE_RBRACK_PLUS_SLASH] = "/[^\"]+/", - [sym_tag_name] = "tag_name", - [sym_void_tag_name] = "void_tag_name", [sym_text] = "text", [sym_fragment] = "fragment", [sym__node] = "_node", [sym_element] = "element", - [sym_void_element] = "void_element", [sym_start_tag] = "start_tag", - [sym_void_start_tag] = "void_start_tag", [sym_self_closing_tag] = "self_closing_tag", [sym_attribute] = "attribute", [sym_quoted_attribute_value] = "quoted_attribute_value", - [sym_end_tag] = "end_tag", [aux_sym_fragment_repeat1] = "fragment_repeat1", [aux_sym_start_tag_repeat1] = "start_tag_repeat1", [alias_sym_attribute_name] = "attribute_name", [alias_sym_attribute_value] = "attribute_value", - [alias_sym_tag_name] = "tag_name", }; static const TSSymbolMetadata ts_symbol_metadata[] = { - [ts_builtin_sym_end] = { + [sym__open_start_tag] = { .visible = false, .named = true, }, - [anon_sym_LT] = { - .visible = true, - .named = false, + [sym__close_start_tag] = { + .visible = false, + .named = true, }, - [anon_sym_GT] = { - .visible = true, - .named = false, + [sym__self_close_start_tag] = { + .visible = false, + .named = true, }, - [anon_sym_SLASH] = { + [sym_end_tag] = { .visible = true, - .named = false, + .named = true, + }, + [sym__implicit_end_tag] = { + .visible = false, + .named = true, + }, + [sym__erroneous_end_tag] = { + .visible = false, + .named = true, + }, + [ts_builtin_sym_end] = { + .visible = false, + .named = true, }, [anon_sym_EQ] = { .visible = true, @@ -115,14 +121,6 @@ static const TSSymbolMetadata ts_symbol_metadata[] = { .visible = false, .named = false, }, - [sym_tag_name] = { - .visible = true, - .named = true, - }, - [sym_void_tag_name] = { - .visible = true, - .named = true, - }, [sym_text] = { .visible = true, .named = true, @@ -139,18 +137,10 @@ static const TSSymbolMetadata ts_symbol_metadata[] = { .visible = true, .named = true, }, - [sym_void_element] = { - .visible = true, - .named = true, - }, [sym_start_tag] = { .visible = true, .named = true, }, - [sym_void_start_tag] = { - .visible = true, - .named = true, - }, [sym_self_closing_tag] = { .visible = true, .named = true, @@ -163,10 +153,6 @@ static const TSSymbolMetadata ts_symbol_metadata[] = { .visible = true, .named = true, }, - [sym_end_tag] = { - .visible = true, - .named = true, - }, [aux_sym_fragment_repeat1] = { .visible = false, .named = false, @@ -183,24 +169,17 @@ static const TSSymbolMetadata ts_symbol_metadata[] = { .visible = true, .named = true, }, - [alias_sym_tag_name] = { - .visible = true, - .named = true, - }, }; -static TSSymbol ts_alias_sequences[5][MAX_ALIAS_SEQUENCE_LENGTH] = { +static TSSymbol ts_alias_sequences[4][MAX_ALIAS_SEQUENCE_LENGTH] = { [1] = { - [1] = alias_sym_tag_name, + [0] = alias_sym_attribute_name, }, [2] = { [0] = alias_sym_attribute_name, - }, - [3] = { - [0] = alias_sym_attribute_name, [2] = alias_sym_attribute_value, }, - [4] = { + [3] = { [1] = alias_sym_attribute_value, }, }; @@ -215,24 +194,16 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) { ADVANCE(2); if (lookahead == '\'') ADVANCE(3); - if (lookahead == '/') - ADVANCE(4); - if (lookahead == '<') - ADVANCE(5); if (lookahead == '=') - ADVANCE(6); - if (lookahead == '>') - ADVANCE(7); + ADVANCE(4); if (lookahead == '\t' || lookahead == '\n' || lookahead == '\r' || lookahead == ' ') SKIP(0); - if (lookahead == '-' || - ('A' <= lookahead && lookahead <= 'Z') || - ('a' <= lookahead && lookahead <= 'z')) - ADVANCE(8); - ADVANCE(9); + if (lookahead != '/' && + (lookahead < '<' || lookahead > '>')) + ADVANCE(5); END_STATE(); case 1: ACCEPT_TOKEN(ts_builtin_sym_end); @@ -244,522 +215,160 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) { ACCEPT_TOKEN(anon_sym_SQUOTE); END_STATE(); case 4: - ACCEPT_TOKEN(anon_sym_SLASH); - END_STATE(); - case 5: - ACCEPT_TOKEN(anon_sym_LT); - END_STATE(); - case 6: ACCEPT_TOKEN(anon_sym_EQ); END_STATE(); - case 7: - ACCEPT_TOKEN(anon_sym_GT); - END_STATE(); - case 8: - ACCEPT_TOKEN(sym__attribute_part); - if (lookahead == '-' || - ('A' <= lookahead && lookahead <= 'Z') || - ('a' <= lookahead && lookahead <= 'z')) - ADVANCE(8); - if (lookahead != 0 && - lookahead != '\t' && - lookahead != '\n' && - lookahead != '\r' && - lookahead != ' ' && - lookahead != '\"' && - lookahead != '\'' && - lookahead != '/' && - (lookahead < '<' || lookahead > '>')) - ADVANCE(9); - END_STATE(); - case 9: - ACCEPT_TOKEN(sym__attribute_part); - if (lookahead != 0 && - lookahead != '\t' && - lookahead != '\n' && - lookahead != '\r' && - lookahead != ' ' && - lookahead != '\"' && - lookahead != '\'' && - lookahead != '/' && - (lookahead < '<' || lookahead > '>')) - ADVANCE(9); - END_STATE(); - case 10: - if (lookahead == 0) - ADVANCE(1); - if (lookahead == '<') - ADVANCE(5); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - ADVANCE(11); - if (lookahead != '>') - ADVANCE(12); - END_STATE(); - case 11: - ACCEPT_TOKEN(sym_text); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - ADVANCE(11); - if (lookahead != 0 && - lookahead != '<' && - lookahead != '>') - ADVANCE(12); - END_STATE(); - case 12: - ACCEPT_TOKEN(sym_text); - if (lookahead != 0 && - lookahead != '<' && - lookahead != '>') - ADVANCE(12); - END_STATE(); - case 13: - if (lookahead == '/') - ADVANCE(4); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - SKIP(13); - if (lookahead == '-' || - ('A' <= lookahead && lookahead <= 'Z') || - ('a' <= lookahead && lookahead <= 'z')) - ADVANCE(14); - END_STATE(); - case 14: - ACCEPT_TOKEN(sym_tag_name); - if (lookahead == '-' || - ('A' <= lookahead && lookahead <= 'Z') || - ('a' <= lookahead && lookahead <= 'z')) - ADVANCE(14); - END_STATE(); - case 15: - if (lookahead == 0) - ADVANCE(1); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - SKIP(15); - END_STATE(); - case 16: - if (lookahead == '/') - ADVANCE(4); - if (lookahead == '=') - ADVANCE(6); - if (lookahead == '>') - ADVANCE(7); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - SKIP(16); - if (lookahead != 0 && - lookahead != '\"' && - lookahead != '\'' && - (lookahead < '<' || lookahead > '>')) - ADVANCE(9); - END_STATE(); - case 17: - if (lookahead == '\"') - ADVANCE(2); - if (lookahead == '\'') - ADVANCE(3); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - SKIP(17); - if (lookahead != 0 && - lookahead != '/' && - (lookahead < '<' || lookahead > '>')) - ADVANCE(9); - END_STATE(); - case 18: - if (lookahead == '\'') - ADVANCE(3); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - ADVANCE(19); - if (lookahead != 0) - ADVANCE(20); - END_STATE(); - case 19: - ACCEPT_TOKEN(aux_sym_SLASH_LBRACK_CARET_SQUOTE_RBRACK_PLUS_SLASH); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - ADVANCE(19); - if (lookahead != 0 && - lookahead != '\'') - ADVANCE(20); - END_STATE(); - case 20: - ACCEPT_TOKEN(aux_sym_SLASH_LBRACK_CARET_SQUOTE_RBRACK_PLUS_SLASH); - if (lookahead != 0 && - lookahead != '\'') - ADVANCE(20); - END_STATE(); - case 21: - if (lookahead == '\"') - ADVANCE(2); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - ADVANCE(22); - if (lookahead != 0) - ADVANCE(23); - END_STATE(); - case 22: - ACCEPT_TOKEN(aux_sym_SLASH_LBRACK_CARET_DQUOTE_RBRACK_PLUS_SLASH); - if (lookahead == '\t' || - lookahead == '\n' || - lookahead == '\r' || - lookahead == ' ') - ADVANCE(22); - if (lookahead != 0 && - lookahead != '\"') - ADVANCE(23); - END_STATE(); - case 23: - ACCEPT_TOKEN(aux_sym_SLASH_LBRACK_CARET_DQUOTE_RBRACK_PLUS_SLASH); - if (lookahead != 0 && - lookahead != '\"') - ADVANCE(23); - END_STATE(); - default: - return false; - } -} - -static bool ts_lex_keywords(TSLexer *lexer, TSStateId state) { - START_LEXER(); - switch (state) { - case 0: - if (lookahead == 'a') - ADVANCE(1); - if (lookahead == 'b') - ADVANCE(5); - if (lookahead == 'c') - ADVANCE(17); - if (lookahead == 'e') - ADVANCE(21); - if (lookahead == 'f') - ADVANCE(24); - if (lookahead == 'h') - ADVANCE(28); - if (lookahead == 'i') - ADVANCE(29); - if (lookahead == 'k') - ADVANCE(39); - if (lookahead == 'l') - ADVANCE(44); - if (lookahead == 'm') - ADVANCE(47); - if (lookahead == 'n') - ADVANCE(54); - if (lookahead == 'p') - ADVANCE(58); - if (lookahead == 's') - ADVANCE(61); - if (lookahead == 't') - ADVANCE(65); - if (lookahead == 'w') - ADVANCE(68); - END_STATE(); - case 1: - if (lookahead == 'r') - ADVANCE(2); - END_STATE(); - case 2: - if (lookahead == 'e') - ADVANCE(3); - END_STATE(); - case 3: - if (lookahead == 'a') - ADVANCE(4); - END_STATE(); - case 4: - ACCEPT_TOKEN(sym_void_tag_name); - END_STATE(); case 5: - if (lookahead == 'a') - ADVANCE(6); - if (lookahead == 'g') - ADVANCE(12); - if (lookahead == 'r') - ADVANCE(4); + ACCEPT_TOKEN(sym__attribute_part); + if (lookahead != 0 && + lookahead != '\t' && + lookahead != '\n' && + lookahead != '\r' && + lookahead != ' ' && + lookahead != '\"' && + lookahead != '\'' && + lookahead != '/' && + (lookahead < '<' || lookahead > '>')) + ADVANCE(5); END_STATE(); case 6: - if (lookahead == 's') + if (lookahead == 0) + ADVANCE(1); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') ADVANCE(7); + if (lookahead != '<' && + lookahead != '>') + ADVANCE(8); END_STATE(); case 7: - if (lookahead == 'e') + ACCEPT_TOKEN(sym_text); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') + ADVANCE(7); + if (lookahead != 0 && + lookahead != '<' && + lookahead != '>') ADVANCE(8); END_STATE(); case 8: - ACCEPT_TOKEN(sym_void_tag_name); - if (lookahead == 'f') - ADVANCE(9); + ACCEPT_TOKEN(sym_text); + if (lookahead != 0 && + lookahead != '<' && + lookahead != '>') + ADVANCE(8); END_STATE(); case 9: - if (lookahead == 'o') - ADVANCE(10); + if (lookahead == '\"') + ADVANCE(2); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') + SKIP(9); + if (lookahead != 0 && + lookahead != '\'' && + lookahead != '/' && + (lookahead < '<' || lookahead > '>')) + ADVANCE(5); END_STATE(); case 10: - if (lookahead == 'n') - ADVANCE(11); + if (lookahead == 0) + ADVANCE(1); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') + SKIP(10); END_STATE(); case 11: - if (lookahead == 't') + if (lookahead == '=') ADVANCE(4); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') + SKIP(11); + if (lookahead != 0 && + lookahead != '\"' && + lookahead != '\'' && + lookahead != '/' && + (lookahead < '<' || lookahead > '>')) + ADVANCE(5); END_STATE(); case 12: - if (lookahead == 's') - ADVANCE(13); + if (lookahead == '\"') + ADVANCE(2); + if (lookahead == '\'') + ADVANCE(3); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') + SKIP(12); + if (lookahead != 0 && + lookahead != '/' && + (lookahead < '<' || lookahead > '>')) + ADVANCE(5); END_STATE(); case 13: - if (lookahead == 'o') + if (lookahead == '\'') + ADVANCE(3); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') ADVANCE(14); + if (lookahead != 0) + ADVANCE(15); END_STATE(); case 14: - if (lookahead == 'u') + ACCEPT_TOKEN(aux_sym_SLASH_LBRACK_CARET_SQUOTE_RBRACK_PLUS_SLASH); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') + ADVANCE(14); + if (lookahead != 0 && + lookahead != '\'') ADVANCE(15); END_STATE(); case 15: - if (lookahead == 'n') - ADVANCE(16); + ACCEPT_TOKEN(aux_sym_SLASH_LBRACK_CARET_SQUOTE_RBRACK_PLUS_SLASH); + if (lookahead != 0 && + lookahead != '\'') + ADVANCE(15); END_STATE(); case 16: - if (lookahead == 'd') - ADVANCE(4); + if (lookahead == '\"') + ADVANCE(2); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') + ADVANCE(17); + if (lookahead != 0) + ADVANCE(18); END_STATE(); case 17: - if (lookahead == 'o') + ACCEPT_TOKEN(aux_sym_SLASH_LBRACK_CARET_DQUOTE_RBRACK_PLUS_SLASH); + if (lookahead == '\t' || + lookahead == '\n' || + lookahead == '\r' || + lookahead == ' ') + ADVANCE(17); + if (lookahead != 0 && + lookahead != '\"') ADVANCE(18); END_STATE(); case 18: - if (lookahead == 'l') - ADVANCE(4); - if (lookahead == 'm') - ADVANCE(19); - END_STATE(); - case 19: - if (lookahead == 'm') - ADVANCE(20); - END_STATE(); - case 20: - if (lookahead == 'a') - ADVANCE(15); - END_STATE(); - case 21: - if (lookahead == 'm') - ADVANCE(22); - END_STATE(); - case 22: - if (lookahead == 'b') - ADVANCE(23); - END_STATE(); - case 23: - if (lookahead == 'e') - ADVANCE(16); - END_STATE(); - case 24: - if (lookahead == 'r') - ADVANCE(25); - END_STATE(); - case 25: - if (lookahead == 'a') - ADVANCE(26); - END_STATE(); - case 26: - if (lookahead == 'm') - ADVANCE(27); - END_STATE(); - case 27: - if (lookahead == 'e') - ADVANCE(4); - END_STATE(); - case 28: - if (lookahead == 'r') - ADVANCE(4); - END_STATE(); - case 29: - if (lookahead == 'm') - ADVANCE(30); - if (lookahead == 'n') - ADVANCE(32); - if (lookahead == 's') - ADVANCE(34); - END_STATE(); - case 30: - if (lookahead == 'a') - ADVANCE(31); - if (lookahead == 'g') - ADVANCE(4); - END_STATE(); - case 31: - if (lookahead == 'g') - ADVANCE(27); - END_STATE(); - case 32: - if (lookahead == 'p') - ADVANCE(33); - END_STATE(); - case 33: - if (lookahead == 'u') - ADVANCE(11); - END_STATE(); - case 34: - if (lookahead == 'i') - ADVANCE(35); - END_STATE(); - case 35: - if (lookahead == 'n') - ADVANCE(36); - END_STATE(); - case 36: - if (lookahead == 'd') - ADVANCE(37); - END_STATE(); - case 37: - if (lookahead == 'e') - ADVANCE(38); - END_STATE(); - case 38: - if (lookahead == 'x') - ADVANCE(4); - END_STATE(); - case 39: - if (lookahead == 'e') - ADVANCE(40); - END_STATE(); - case 40: - if (lookahead == 'y') - ADVANCE(41); - END_STATE(); - case 41: - if (lookahead == 'g') - ADVANCE(42); - END_STATE(); - case 42: - if (lookahead == 'e') - ADVANCE(43); - END_STATE(); - case 43: - if (lookahead == 'n') - ADVANCE(4); - END_STATE(); - case 44: - if (lookahead == 'i') - ADVANCE(45); - END_STATE(); - case 45: - if (lookahead == 'n') - ADVANCE(46); - END_STATE(); - case 46: - if (lookahead == 'k') - ADVANCE(4); - END_STATE(); - case 47: - if (lookahead == 'e') - ADVANCE(48); - END_STATE(); - case 48: - if (lookahead == 'n') - ADVANCE(49); - if (lookahead == 't') - ADVANCE(3); - END_STATE(); - case 49: - if (lookahead == 'u') - ADVANCE(50); - END_STATE(); - case 50: - if (lookahead == 'i') - ADVANCE(51); - END_STATE(); - case 51: - if (lookahead == 't') - ADVANCE(52); - END_STATE(); - case 52: - if (lookahead == 'e') - ADVANCE(53); - END_STATE(); - case 53: - if (lookahead == 'm') - ADVANCE(4); - END_STATE(); - case 54: - if (lookahead == 'e') - ADVANCE(55); - END_STATE(); - case 55: - if (lookahead == 'x') - ADVANCE(56); - END_STATE(); - case 56: - if (lookahead == 't') - ADVANCE(57); - END_STATE(); - case 57: - if (lookahead == 'i') - ADVANCE(16); - END_STATE(); - case 58: - if (lookahead == 'a') - ADVANCE(59); - END_STATE(); - case 59: - if (lookahead == 'r') - ADVANCE(60); - END_STATE(); - case 60: - if (lookahead == 'a') - ADVANCE(53); - END_STATE(); - case 61: - if (lookahead == 'o') - ADVANCE(62); - END_STATE(); - case 62: - if (lookahead == 'u') - ADVANCE(63); - END_STATE(); - case 63: - if (lookahead == 'r') - ADVANCE(64); - END_STATE(); - case 64: - if (lookahead == 'c') - ADVANCE(27); - END_STATE(); - case 65: - if (lookahead == 'r') - ADVANCE(66); - END_STATE(); - case 66: - if (lookahead == 'a') - ADVANCE(67); - END_STATE(); - case 67: - if (lookahead == 'c') - ADVANCE(46); - END_STATE(); - case 68: - if (lookahead == 'b') - ADVANCE(28); + ACCEPT_TOKEN(aux_sym_SLASH_LBRACK_CARET_DQUOTE_RBRACK_PLUS_SLASH); + if (lookahead != 0 && + lookahead != '\"') + ADVANCE(18); END_STATE(); default: return false; @@ -767,359 +376,433 @@ static bool ts_lex_keywords(TSLexer *lexer, TSStateId state) { } static TSLexMode ts_lex_modes[STATE_COUNT] = { - [0] = {.lex_state = 0}, - [1] = {.lex_state = 10}, - [2] = {.lex_state = 13}, - [3] = {.lex_state = 15}, + [0] = {.lex_state = 0, .external_lex_state = 1}, + [1] = {.lex_state = 6, .external_lex_state = 2}, + [2] = {.lex_state = 9, .external_lex_state = 3}, + [3] = {.lex_state = 6, .external_lex_state = 2}, [4] = {.lex_state = 10}, - [5] = {.lex_state = 10}, - [6] = {.lex_state = 10}, - [7] = {.lex_state = 10}, - [8] = {.lex_state = 16}, - [9] = {.lex_state = 16}, - [10] = {.lex_state = 13}, - [11] = {.lex_state = 10}, - [12] = {.lex_state = 10}, - [13] = {.lex_state = 10}, - [14] = {.lex_state = 10}, - [15] = {.lex_state = 16}, - [16] = {.lex_state = 16}, - [17] = {.lex_state = 16}, - [18] = {.lex_state = 10}, - [19] = {.lex_state = 16}, - [20] = {.lex_state = 13}, - [21] = {.lex_state = 10}, - [22] = {.lex_state = 10}, - [23] = {.lex_state = 10}, - [24] = {.lex_state = 17}, - [25] = {.lex_state = 10}, - [26] = {.lex_state = 16}, - [27] = {.lex_state = 16}, - [28] = {.lex_state = 10}, - [29] = {.lex_state = 16}, - [30] = {.lex_state = 16}, - [31] = {.lex_state = 18}, - [32] = {.lex_state = 21}, - [33] = {.lex_state = 16}, - [34] = {.lex_state = 10}, - [35] = {.lex_state = 10}, - [36] = {.lex_state = 16}, - [37] = {.lex_state = 17}, - [38] = {.lex_state = 17}, - [39] = {.lex_state = 16}, + [5] = {.lex_state = 6, .external_lex_state = 4}, + [6] = {.lex_state = 6, .external_lex_state = 2}, + [7] = {.lex_state = 6, .external_lex_state = 2}, + [8] = {.lex_state = 6, .external_lex_state = 4}, + [9] = {.lex_state = 6, .external_lex_state = 2}, + [10] = {.lex_state = 11, .external_lex_state = 3}, + [11] = {.lex_state = 9, .external_lex_state = 3}, + [12] = {.lex_state = 9, .external_lex_state = 3}, + [13] = {.lex_state = 6, .external_lex_state = 2}, + [14] = {.lex_state = 6, .external_lex_state = 4}, + [15] = {.lex_state = 6, .external_lex_state = 4}, + [16] = {.lex_state = 6, .external_lex_state = 4}, + [17] = {.lex_state = 6, .external_lex_state = 4}, + [18] = {.lex_state = 6, .external_lex_state = 2}, + [19] = {.lex_state = 12}, + [20] = {.lex_state = 6, .external_lex_state = 4}, + [21] = {.lex_state = 6, .external_lex_state = 2}, + [22] = {.lex_state = 9, .external_lex_state = 3}, + [23] = {.lex_state = 6, .external_lex_state = 4}, + [24] = {.lex_state = 9, .external_lex_state = 3}, + [25] = {.lex_state = 6, .external_lex_state = 4}, + [26] = {.lex_state = 6, .external_lex_state = 4}, + [27] = {.lex_state = 6, .external_lex_state = 2}, + [28] = {.lex_state = 6, .external_lex_state = 4}, + [29] = {.lex_state = 9, .external_lex_state = 3}, + [30] = {.lex_state = 13}, + [31] = {.lex_state = 16}, + [32] = {.lex_state = 9, .external_lex_state = 3}, + [33] = {.lex_state = 6, .external_lex_state = 4}, + [34] = {.lex_state = 6, .external_lex_state = 4}, + [35] = {.lex_state = 9, .external_lex_state = 3}, + [36] = {.lex_state = 12}, + [37] = {.lex_state = 9}, + [38] = {.lex_state = 9, .external_lex_state = 3}, +}; + +enum { + ts_external_token__open_start_tag, + ts_external_token__close_start_tag, + ts_external_token__self_close_start_tag, + ts_external_token_end_tag, + ts_external_token__implicit_end_tag, + ts_external_token__erroneous_end_tag, +}; + +static TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = { + [ts_external_token__open_start_tag] = sym__open_start_tag, + [ts_external_token__close_start_tag] = sym__close_start_tag, + [ts_external_token__self_close_start_tag] = sym__self_close_start_tag, + [ts_external_token_end_tag] = sym_end_tag, + [ts_external_token__implicit_end_tag] = sym__implicit_end_tag, + [ts_external_token__erroneous_end_tag] = sym__erroneous_end_tag, +}; + +static bool ts_external_scanner_states[5][EXTERNAL_TOKEN_COUNT] = { + [1] = { + [ts_external_token__open_start_tag] = true, + [ts_external_token__close_start_tag] = true, + [ts_external_token__self_close_start_tag] = true, + [ts_external_token_end_tag] = true, + [ts_external_token__implicit_end_tag] = true, + [ts_external_token__erroneous_end_tag] = true, + }, + [2] = { + [ts_external_token__open_start_tag] = true, + [ts_external_token__erroneous_end_tag] = true, + }, + [3] = { + [ts_external_token__close_start_tag] = true, + [ts_external_token__self_close_start_tag] = true, + }, + [4] = { + [ts_external_token__open_start_tag] = true, + [ts_external_token_end_tag] = true, + [ts_external_token__implicit_end_tag] = true, + [ts_external_token__erroneous_end_tag] = true, + }, }; static uint16_t ts_parse_table[STATE_COUNT][SYMBOL_COUNT] = { [0] = { + [sym__open_start_tag] = ACTIONS(1), + [sym__close_start_tag] = ACTIONS(1), + [sym__self_close_start_tag] = ACTIONS(1), + [sym_end_tag] = ACTIONS(1), + [sym__implicit_end_tag] = ACTIONS(1), + [sym__erroneous_end_tag] = ACTIONS(1), [ts_builtin_sym_end] = ACTIONS(1), - [anon_sym_LT] = ACTIONS(1), - [anon_sym_GT] = ACTIONS(1), - [anon_sym_SLASH] = ACTIONS(1), [anon_sym_EQ] = ACTIONS(1), - [sym__attribute_part] = ACTIONS(3), + [sym__attribute_part] = ACTIONS(1), [anon_sym_SQUOTE] = ACTIONS(1), [anon_sym_DQUOTE] = ACTIONS(1), - [sym_tag_name] = ACTIONS(3), - [sym_void_tag_name] = ACTIONS(1), }, [1] = { - [sym_fragment] = STATE(3), - [sym__node] = STATE(4), - [sym_element] = STATE(4), - [sym_void_element] = STATE(4), + [sym_fragment] = STATE(4), + [sym__node] = STATE(7), + [sym_element] = STATE(3), [sym_start_tag] = STATE(5), - [sym_void_start_tag] = STATE(6), [sym_self_closing_tag] = STATE(6), [aux_sym_fragment_repeat1] = STATE(7), - [ts_builtin_sym_end] = ACTIONS(5), - [anon_sym_LT] = ACTIONS(7), - [sym_text] = ACTIONS(9), + [sym__open_start_tag] = ACTIONS(3), + [sym__erroneous_end_tag] = ACTIONS(5), + [ts_builtin_sym_end] = ACTIONS(7), + [sym_text] = ACTIONS(5), }, [2] = { - [sym_tag_name] = ACTIONS(11), - [sym_void_tag_name] = ACTIONS(13), + [sym_attribute] = STATE(11), + [aux_sym_start_tag_repeat1] = STATE(11), + [sym__close_start_tag] = ACTIONS(9), + [sym__self_close_start_tag] = ACTIONS(11), + [sym__attribute_part] = ACTIONS(13), }, [3] = { + [sym__open_start_tag] = ACTIONS(15), + [sym__erroneous_end_tag] = ACTIONS(15), [ts_builtin_sym_end] = ACTIONS(15), + [sym_text] = ACTIONS(15), }, [4] = { [ts_builtin_sym_end] = ACTIONS(17), - [anon_sym_LT] = ACTIONS(19), - [sym_text] = ACTIONS(17), }, [5] = { - [sym__node] = STATE(4), - [sym_element] = STATE(4), - [sym_void_element] = STATE(4), - [sym_start_tag] = STATE(5), - [sym_void_start_tag] = STATE(6), - [sym_self_closing_tag] = STATE(6), - [sym_end_tag] = STATE(11), - [aux_sym_fragment_repeat1] = STATE(12), - [anon_sym_LT] = ACTIONS(21), - [sym_text] = ACTIONS(9), - }, - [6] = { - [ts_builtin_sym_end] = ACTIONS(23), - [anon_sym_LT] = ACTIONS(25), + [sym__node] = STATE(17), + [sym_element] = STATE(14), + [sym_start_tag] = STATE(15), + [sym_self_closing_tag] = STATE(16), + [aux_sym_fragment_repeat1] = STATE(17), + [sym__open_start_tag] = ACTIONS(19), + [sym_end_tag] = ACTIONS(21), + [sym__implicit_end_tag] = ACTIONS(21), + [sym__erroneous_end_tag] = ACTIONS(23), [sym_text] = ACTIONS(23), }, + [6] = { + [sym__open_start_tag] = ACTIONS(25), + [sym__erroneous_end_tag] = ACTIONS(25), + [ts_builtin_sym_end] = ACTIONS(25), + [sym_text] = ACTIONS(25), + }, [7] = { - [sym__node] = STATE(4), - [sym_element] = STATE(4), - [sym_void_element] = STATE(4), + [sym__node] = STATE(18), + [sym_element] = STATE(3), [sym_start_tag] = STATE(5), - [sym_void_start_tag] = STATE(6), [sym_self_closing_tag] = STATE(6), - [aux_sym_fragment_repeat1] = STATE(13), + [aux_sym_fragment_repeat1] = STATE(18), + [sym__open_start_tag] = ACTIONS(3), + [sym__erroneous_end_tag] = ACTIONS(5), [ts_builtin_sym_end] = ACTIONS(27), - [anon_sym_LT] = ACTIONS(7), - [sym_text] = ACTIONS(9), + [sym_text] = ACTIONS(5), }, [8] = { - [sym_attribute] = STATE(17), - [aux_sym_start_tag_repeat1] = STATE(17), - [anon_sym_GT] = ACTIONS(29), - [anon_sym_SLASH] = ACTIONS(31), - [sym__attribute_part] = ACTIONS(33), + [sym__open_start_tag] = ACTIONS(29), + [sym_end_tag] = ACTIONS(29), + [sym__implicit_end_tag] = ACTIONS(29), + [sym__erroneous_end_tag] = ACTIONS(29), + [sym_text] = ACTIONS(29), }, [9] = { - [sym_attribute] = STATE(19), - [aux_sym_start_tag_repeat1] = STATE(19), - [anon_sym_GT] = ACTIONS(35), - [anon_sym_SLASH] = ACTIONS(31), - [sym__attribute_part] = ACTIONS(33), + [sym__open_start_tag] = ACTIONS(31), + [sym__erroneous_end_tag] = ACTIONS(31), + [ts_builtin_sym_end] = ACTIONS(31), + [sym_text] = ACTIONS(31), }, [10] = { - [anon_sym_SLASH] = ACTIONS(37), - [sym_tag_name] = ACTIONS(11), - [sym_void_tag_name] = ACTIONS(13), + [sym__close_start_tag] = ACTIONS(33), + [sym__self_close_start_tag] = ACTIONS(33), + [anon_sym_EQ] = ACTIONS(35), + [sym__attribute_part] = ACTIONS(33), }, [11] = { - [ts_builtin_sym_end] = ACTIONS(39), - [anon_sym_LT] = ACTIONS(41), - [sym_text] = ACTIONS(39), + [sym_attribute] = STATE(22), + [aux_sym_start_tag_repeat1] = STATE(22), + [sym__close_start_tag] = ACTIONS(37), + [sym__self_close_start_tag] = ACTIONS(39), + [sym__attribute_part] = ACTIONS(13), }, [12] = { - [sym__node] = STATE(4), - [sym_element] = STATE(4), - [sym_void_element] = STATE(4), - [sym_start_tag] = STATE(5), - [sym_void_start_tag] = STATE(6), - [sym_self_closing_tag] = STATE(6), - [sym_end_tag] = STATE(21), - [aux_sym_fragment_repeat1] = STATE(22), - [anon_sym_LT] = ACTIONS(21), - [sym_text] = ACTIONS(9), + [sym_attribute] = STATE(24), + [aux_sym_start_tag_repeat1] = STATE(24), + [sym__close_start_tag] = ACTIONS(9), + [sym__self_close_start_tag] = ACTIONS(41), + [sym__attribute_part] = ACTIONS(13), }, [13] = { - [sym__node] = STATE(4), - [sym_element] = STATE(4), - [sym_void_element] = STATE(4), - [sym_start_tag] = STATE(5), - [sym_void_start_tag] = STATE(6), - [sym_self_closing_tag] = STATE(6), - [aux_sym_fragment_repeat1] = STATE(13), + [sym__open_start_tag] = ACTIONS(43), + [sym__erroneous_end_tag] = ACTIONS(43), [ts_builtin_sym_end] = ACTIONS(43), - [anon_sym_LT] = ACTIONS(45), - [sym_text] = ACTIONS(48), + [sym_text] = ACTIONS(43), }, [14] = { - [anon_sym_LT] = ACTIONS(51), - [sym_text] = ACTIONS(53), + [sym__open_start_tag] = ACTIONS(15), + [sym_end_tag] = ACTIONS(15), + [sym__implicit_end_tag] = ACTIONS(15), + [sym__erroneous_end_tag] = ACTIONS(15), + [sym_text] = ACTIONS(15), }, [15] = { - [anon_sym_GT] = ACTIONS(55), + [sym__node] = STATE(26), + [sym_element] = STATE(14), + [sym_start_tag] = STATE(15), + [sym_self_closing_tag] = STATE(16), + [aux_sym_fragment_repeat1] = STATE(26), + [sym__open_start_tag] = ACTIONS(19), + [sym_end_tag] = ACTIONS(45), + [sym__implicit_end_tag] = ACTIONS(45), + [sym__erroneous_end_tag] = ACTIONS(23), + [sym_text] = ACTIONS(23), }, [16] = { - [anon_sym_GT] = ACTIONS(57), - [anon_sym_SLASH] = ACTIONS(57), - [anon_sym_EQ] = ACTIONS(59), - [sym__attribute_part] = ACTIONS(57), + [sym__open_start_tag] = ACTIONS(25), + [sym_end_tag] = ACTIONS(25), + [sym__implicit_end_tag] = ACTIONS(25), + [sym__erroneous_end_tag] = ACTIONS(25), + [sym_text] = ACTIONS(25), }, [17] = { - [sym_attribute] = STATE(27), - [aux_sym_start_tag_repeat1] = STATE(27), - [anon_sym_GT] = ACTIONS(61), - [anon_sym_SLASH] = ACTIONS(63), - [sym__attribute_part] = ACTIONS(33), + [sym__node] = STATE(28), + [sym_element] = STATE(14), + [sym_start_tag] = STATE(15), + [sym_self_closing_tag] = STATE(16), + [aux_sym_fragment_repeat1] = STATE(28), + [sym__open_start_tag] = ACTIONS(19), + [sym_end_tag] = ACTIONS(47), + [sym__implicit_end_tag] = ACTIONS(47), + [sym__erroneous_end_tag] = ACTIONS(23), + [sym_text] = ACTIONS(23), }, [18] = { - [ts_builtin_sym_end] = ACTIONS(65), - [anon_sym_LT] = ACTIONS(67), - [sym_text] = ACTIONS(65), + [sym__node] = STATE(18), + [sym_element] = STATE(3), + [sym_start_tag] = STATE(5), + [sym_self_closing_tag] = STATE(6), + [aux_sym_fragment_repeat1] = STATE(18), + [sym__open_start_tag] = ACTIONS(49), + [sym__erroneous_end_tag] = ACTIONS(52), + [ts_builtin_sym_end] = ACTIONS(55), + [sym_text] = ACTIONS(52), }, [19] = { - [sym_attribute] = STATE(27), - [aux_sym_start_tag_repeat1] = STATE(27), - [anon_sym_GT] = ACTIONS(69), - [anon_sym_SLASH] = ACTIONS(63), - [sym__attribute_part] = ACTIONS(33), + [sym_quoted_attribute_value] = STATE(32), + [sym__attribute_part] = ACTIONS(57), + [anon_sym_SQUOTE] = ACTIONS(59), + [anon_sym_DQUOTE] = ACTIONS(61), }, [20] = { - [sym_tag_name] = ACTIONS(71), + [sym__open_start_tag] = ACTIONS(63), + [sym_end_tag] = ACTIONS(63), + [sym__implicit_end_tag] = ACTIONS(63), + [sym__erroneous_end_tag] = ACTIONS(63), + [sym_text] = ACTIONS(63), }, [21] = { - [ts_builtin_sym_end] = ACTIONS(73), - [anon_sym_LT] = ACTIONS(75), - [sym_text] = ACTIONS(73), + [sym__open_start_tag] = ACTIONS(65), + [sym__erroneous_end_tag] = ACTIONS(65), + [ts_builtin_sym_end] = ACTIONS(65), + [sym_text] = ACTIONS(65), }, [22] = { - [sym__node] = STATE(4), - [sym_element] = STATE(4), - [sym_void_element] = STATE(4), - [sym_start_tag] = STATE(5), - [sym_void_start_tag] = STATE(6), - [sym_self_closing_tag] = STATE(6), - [aux_sym_fragment_repeat1] = STATE(22), - [anon_sym_LT] = ACTIONS(45), - [sym_text] = ACTIONS(48), + [sym_attribute] = STATE(22), + [aux_sym_start_tag_repeat1] = STATE(22), + [sym__close_start_tag] = ACTIONS(67), + [sym__self_close_start_tag] = ACTIONS(67), + [sym__attribute_part] = ACTIONS(69), }, [23] = { - [ts_builtin_sym_end] = ACTIONS(77), - [anon_sym_LT] = ACTIONS(79), - [sym_text] = ACTIONS(77), + [sym__open_start_tag] = ACTIONS(31), + [sym_end_tag] = ACTIONS(31), + [sym__implicit_end_tag] = ACTIONS(31), + [sym__erroneous_end_tag] = ACTIONS(31), + [sym_text] = ACTIONS(31), }, [24] = { - [sym_quoted_attribute_value] = STATE(33), - [sym__attribute_part] = ACTIONS(81), - [anon_sym_SQUOTE] = ACTIONS(83), - [anon_sym_DQUOTE] = ACTIONS(85), + [sym_attribute] = STATE(22), + [aux_sym_start_tag_repeat1] = STATE(22), + [sym__close_start_tag] = ACTIONS(37), + [sym__self_close_start_tag] = ACTIONS(72), + [sym__attribute_part] = ACTIONS(13), }, [25] = { - [anon_sym_LT] = ACTIONS(87), - [sym_text] = ACTIONS(89), + [sym__open_start_tag] = ACTIONS(43), + [sym_end_tag] = ACTIONS(43), + [sym__implicit_end_tag] = ACTIONS(43), + [sym__erroneous_end_tag] = ACTIONS(43), + [sym_text] = ACTIONS(43), }, [26] = { - [anon_sym_GT] = ACTIONS(91), + [sym__node] = STATE(28), + [sym_element] = STATE(14), + [sym_start_tag] = STATE(15), + [sym_self_closing_tag] = STATE(16), + [aux_sym_fragment_repeat1] = STATE(28), + [sym__open_start_tag] = ACTIONS(19), + [sym_end_tag] = ACTIONS(74), + [sym__implicit_end_tag] = ACTIONS(74), + [sym__erroneous_end_tag] = ACTIONS(23), + [sym_text] = ACTIONS(23), }, [27] = { - [sym_attribute] = STATE(27), - [aux_sym_start_tag_repeat1] = STATE(27), - [anon_sym_GT] = ACTIONS(93), - [anon_sym_SLASH] = ACTIONS(93), - [sym__attribute_part] = ACTIONS(95), + [sym__open_start_tag] = ACTIONS(76), + [sym__erroneous_end_tag] = ACTIONS(76), + [ts_builtin_sym_end] = ACTIONS(76), + [sym_text] = ACTIONS(76), }, [28] = { - [ts_builtin_sym_end] = ACTIONS(98), - [anon_sym_LT] = ACTIONS(100), - [sym_text] = ACTIONS(98), + [sym__node] = STATE(28), + [sym_element] = STATE(14), + [sym_start_tag] = STATE(15), + [sym_self_closing_tag] = STATE(16), + [aux_sym_fragment_repeat1] = STATE(28), + [sym__open_start_tag] = ACTIONS(78), + [sym_end_tag] = ACTIONS(55), + [sym__implicit_end_tag] = ACTIONS(55), + [sym__erroneous_end_tag] = ACTIONS(81), + [sym_text] = ACTIONS(81), }, [29] = { - [anon_sym_GT] = ACTIONS(102), + [sym__close_start_tag] = ACTIONS(84), + [sym__self_close_start_tag] = ACTIONS(84), + [sym__attribute_part] = ACTIONS(84), }, [30] = { - [anon_sym_GT] = ACTIONS(104), - [anon_sym_SLASH] = ACTIONS(104), - [sym__attribute_part] = ACTIONS(104), + [anon_sym_SQUOTE] = ACTIONS(86), + [aux_sym_SLASH_LBRACK_CARET_SQUOTE_RBRACK_PLUS_SLASH] = ACTIONS(88), }, [31] = { - [anon_sym_SQUOTE] = ACTIONS(106), - [aux_sym_SLASH_LBRACK_CARET_SQUOTE_RBRACK_PLUS_SLASH] = ACTIONS(108), + [anon_sym_DQUOTE] = ACTIONS(86), + [aux_sym_SLASH_LBRACK_CARET_DQUOTE_RBRACK_PLUS_SLASH] = ACTIONS(90), }, [32] = { - [anon_sym_DQUOTE] = ACTIONS(106), - [aux_sym_SLASH_LBRACK_CARET_DQUOTE_RBRACK_PLUS_SLASH] = ACTIONS(110), + [sym__close_start_tag] = ACTIONS(92), + [sym__self_close_start_tag] = ACTIONS(92), + [sym__attribute_part] = ACTIONS(92), }, [33] = { - [anon_sym_GT] = ACTIONS(112), - [anon_sym_SLASH] = ACTIONS(112), - [sym__attribute_part] = ACTIONS(112), + [sym__open_start_tag] = ACTIONS(65), + [sym_end_tag] = ACTIONS(65), + [sym__implicit_end_tag] = ACTIONS(65), + [sym__erroneous_end_tag] = ACTIONS(65), + [sym_text] = ACTIONS(65), }, [34] = { - [ts_builtin_sym_end] = ACTIONS(114), - [anon_sym_LT] = ACTIONS(116), - [sym_text] = ACTIONS(114), + [sym__open_start_tag] = ACTIONS(76), + [sym_end_tag] = ACTIONS(76), + [sym__implicit_end_tag] = ACTIONS(76), + [sym__erroneous_end_tag] = ACTIONS(76), + [sym_text] = ACTIONS(76), }, [35] = { - [ts_builtin_sym_end] = ACTIONS(118), - [anon_sym_LT] = ACTIONS(120), - [sym_text] = ACTIONS(118), + [sym__close_start_tag] = ACTIONS(94), + [sym__self_close_start_tag] = ACTIONS(94), + [sym__attribute_part] = ACTIONS(94), }, [36] = { - [anon_sym_GT] = ACTIONS(122), - [anon_sym_SLASH] = ACTIONS(122), - [sym__attribute_part] = ACTIONS(122), + [anon_sym_SQUOTE] = ACTIONS(96), }, [37] = { - [anon_sym_SQUOTE] = ACTIONS(124), + [anon_sym_DQUOTE] = ACTIONS(96), }, [38] = { - [anon_sym_DQUOTE] = ACTIONS(124), - }, - [39] = { - [anon_sym_GT] = ACTIONS(126), - [anon_sym_SLASH] = ACTIONS(126), - [sym__attribute_part] = ACTIONS(126), + [sym__close_start_tag] = ACTIONS(98), + [sym__self_close_start_tag] = ACTIONS(98), + [sym__attribute_part] = ACTIONS(98), }, }; static TSParseActionEntry ts_parse_actions[] = { [0] = {.count = 0, .reusable = false}, [1] = {.count = 1, .reusable = true}, RECOVER(), - [3] = {.count = 1, .reusable = false}, RECOVER(), - [5] = {.count = 1, .reusable = true}, REDUCE(sym_fragment, 0), - [7] = {.count = 1, .reusable = false}, SHIFT(2), - [9] = {.count = 1, .reusable = true}, SHIFT(4), - [11] = {.count = 1, .reusable = false}, SHIFT(8), - [13] = {.count = 1, .reusable = true}, SHIFT(9), - [15] = {.count = 1, .reusable = true}, ACCEPT_INPUT(), - [17] = {.count = 1, .reusable = true}, REDUCE(aux_sym_fragment_repeat1, 1), - [19] = {.count = 1, .reusable = false}, REDUCE(aux_sym_fragment_repeat1, 1), - [21] = {.count = 1, .reusable = false}, SHIFT(10), - [23] = {.count = 1, .reusable = true}, REDUCE(sym_void_element, 1), - [25] = {.count = 1, .reusable = false}, REDUCE(sym_void_element, 1), + [3] = {.count = 1, .reusable = true}, SHIFT(2), + [5] = {.count = 1, .reusable = true}, SHIFT(3), + [7] = {.count = 1, .reusable = true}, REDUCE(sym_fragment, 0), + [9] = {.count = 1, .reusable = true}, SHIFT(8), + [11] = {.count = 1, .reusable = true}, SHIFT(9), + [13] = {.count = 1, .reusable = true}, SHIFT(10), + [15] = {.count = 1, .reusable = true}, REDUCE(sym__node, 1), + [17] = {.count = 1, .reusable = true}, ACCEPT_INPUT(), + [19] = {.count = 1, .reusable = true}, SHIFT(12), + [21] = {.count = 1, .reusable = true}, SHIFT(13), + [23] = {.count = 1, .reusable = true}, SHIFT(14), + [25] = {.count = 1, .reusable = true}, REDUCE(sym_element, 1), [27] = {.count = 1, .reusable = true}, REDUCE(sym_fragment, 1), - [29] = {.count = 1, .reusable = true}, SHIFT(14), - [31] = {.count = 1, .reusable = true}, SHIFT(15), - [33] = {.count = 1, .reusable = true}, SHIFT(16), - [35] = {.count = 1, .reusable = true}, SHIFT(18), + [29] = {.count = 1, .reusable = true}, REDUCE(sym_start_tag, 2), + [31] = {.count = 1, .reusable = true}, REDUCE(sym_self_closing_tag, 2), + [33] = {.count = 1, .reusable = true}, REDUCE(sym_attribute, 1, .alias_sequence_id = 1), + [35] = {.count = 1, .reusable = true}, SHIFT(19), [37] = {.count = 1, .reusable = true}, SHIFT(20), - [39] = {.count = 1, .reusable = true}, REDUCE(sym_element, 2), - [41] = {.count = 1, .reusable = false}, REDUCE(sym_element, 2), - [43] = {.count = 1, .reusable = true}, REDUCE(aux_sym_fragment_repeat1, 2), - [45] = {.count = 2, .reusable = false}, REDUCE(aux_sym_fragment_repeat1, 2), SHIFT_REPEAT(2), - [48] = {.count = 2, .reusable = true}, REDUCE(aux_sym_fragment_repeat1, 2), SHIFT_REPEAT(4), - [51] = {.count = 1, .reusable = false}, REDUCE(sym_start_tag, 3, .alias_sequence_id = 1), - [53] = {.count = 1, .reusable = true}, REDUCE(sym_start_tag, 3, .alias_sequence_id = 1), - [55] = {.count = 1, .reusable = true}, SHIFT(23), - [57] = {.count = 1, .reusable = true}, REDUCE(sym_attribute, 1, .alias_sequence_id = 2), - [59] = {.count = 1, .reusable = true}, SHIFT(24), - [61] = {.count = 1, .reusable = true}, SHIFT(25), - [63] = {.count = 1, .reusable = true}, SHIFT(26), - [65] = {.count = 1, .reusable = true}, REDUCE(sym_void_start_tag, 3, .alias_sequence_id = 1), - [67] = {.count = 1, .reusable = false}, REDUCE(sym_void_start_tag, 3, .alias_sequence_id = 1), - [69] = {.count = 1, .reusable = true}, SHIFT(28), - [71] = {.count = 1, .reusable = true}, SHIFT(29), - [73] = {.count = 1, .reusable = true}, REDUCE(sym_element, 3), - [75] = {.count = 1, .reusable = false}, REDUCE(sym_element, 3), - [77] = {.count = 1, .reusable = true}, REDUCE(sym_self_closing_tag, 4), - [79] = {.count = 1, .reusable = false}, REDUCE(sym_self_closing_tag, 4), - [81] = {.count = 1, .reusable = true}, SHIFT(30), - [83] = {.count = 1, .reusable = true}, SHIFT(31), - [85] = {.count = 1, .reusable = true}, SHIFT(32), - [87] = {.count = 1, .reusable = false}, REDUCE(sym_start_tag, 4, .alias_sequence_id = 1), - [89] = {.count = 1, .reusable = true}, REDUCE(sym_start_tag, 4, .alias_sequence_id = 1), - [91] = {.count = 1, .reusable = true}, SHIFT(34), - [93] = {.count = 1, .reusable = true}, REDUCE(aux_sym_start_tag_repeat1, 2), - [95] = {.count = 2, .reusable = true}, REDUCE(aux_sym_start_tag_repeat1, 2), SHIFT_REPEAT(16), - [98] = {.count = 1, .reusable = true}, REDUCE(sym_void_start_tag, 4, .alias_sequence_id = 1), - [100] = {.count = 1, .reusable = false}, REDUCE(sym_void_start_tag, 4, .alias_sequence_id = 1), - [102] = {.count = 1, .reusable = true}, SHIFT(35), - [104] = {.count = 1, .reusable = true}, REDUCE(sym_attribute, 3, .alias_sequence_id = 3), - [106] = {.count = 1, .reusable = false}, SHIFT(36), - [108] = {.count = 1, .reusable = true}, SHIFT(37), - [110] = {.count = 1, .reusable = true}, SHIFT(38), - [112] = {.count = 1, .reusable = true}, REDUCE(sym_attribute, 3, .alias_sequence_id = 2), - [114] = {.count = 1, .reusable = true}, REDUCE(sym_self_closing_tag, 5), - [116] = {.count = 1, .reusable = false}, REDUCE(sym_self_closing_tag, 5), - [118] = {.count = 1, .reusable = true}, REDUCE(sym_end_tag, 4), - [120] = {.count = 1, .reusable = false}, REDUCE(sym_end_tag, 4), - [122] = {.count = 1, .reusable = true}, REDUCE(sym_quoted_attribute_value, 2), - [124] = {.count = 1, .reusable = true}, SHIFT(39), - [126] = {.count = 1, .reusable = true}, REDUCE(sym_quoted_attribute_value, 3, .alias_sequence_id = 4), + [39] = {.count = 1, .reusable = true}, SHIFT(21), + [41] = {.count = 1, .reusable = true}, SHIFT(23), + [43] = {.count = 1, .reusable = true}, REDUCE(sym_element, 2), + [45] = {.count = 1, .reusable = true}, SHIFT(25), + [47] = {.count = 1, .reusable = true}, SHIFT(27), + [49] = {.count = 2, .reusable = true}, REDUCE(aux_sym_fragment_repeat1, 2), SHIFT_REPEAT(2), + [52] = {.count = 2, .reusable = true}, REDUCE(aux_sym_fragment_repeat1, 2), SHIFT_REPEAT(3), + [55] = {.count = 1, .reusable = true}, REDUCE(aux_sym_fragment_repeat1, 2), + [57] = {.count = 1, .reusable = true}, SHIFT(29), + [59] = {.count = 1, .reusable = true}, SHIFT(30), + [61] = {.count = 1, .reusable = true}, SHIFT(31), + [63] = {.count = 1, .reusable = true}, REDUCE(sym_start_tag, 3), + [65] = {.count = 1, .reusable = true}, REDUCE(sym_self_closing_tag, 3), + [67] = {.count = 1, .reusable = true}, REDUCE(aux_sym_start_tag_repeat1, 2), + [69] = {.count = 2, .reusable = true}, REDUCE(aux_sym_start_tag_repeat1, 2), SHIFT_REPEAT(10), + [72] = {.count = 1, .reusable = true}, SHIFT(33), + [74] = {.count = 1, .reusable = true}, SHIFT(34), + [76] = {.count = 1, .reusable = true}, REDUCE(sym_element, 3), + [78] = {.count = 2, .reusable = true}, REDUCE(aux_sym_fragment_repeat1, 2), SHIFT_REPEAT(12), + [81] = {.count = 2, .reusable = true}, REDUCE(aux_sym_fragment_repeat1, 2), SHIFT_REPEAT(14), + [84] = {.count = 1, .reusable = true}, REDUCE(sym_attribute, 3, .alias_sequence_id = 2), + [86] = {.count = 1, .reusable = false}, SHIFT(35), + [88] = {.count = 1, .reusable = true}, SHIFT(36), + [90] = {.count = 1, .reusable = true}, SHIFT(37), + [92] = {.count = 1, .reusable = true}, REDUCE(sym_attribute, 3, .alias_sequence_id = 1), + [94] = {.count = 1, .reusable = true}, REDUCE(sym_quoted_attribute_value, 2), + [96] = {.count = 1, .reusable = true}, SHIFT(38), + [98] = {.count = 1, .reusable = true}, REDUCE(sym_quoted_attribute_value, 3, .alias_sequence_id = 3), }; +void *tree_sitter_html_external_scanner_create(); +void tree_sitter_html_external_scanner_destroy(void *); +bool tree_sitter_html_external_scanner_scan(void *, TSLexer *, const bool *); +unsigned tree_sitter_html_external_scanner_serialize(void *, char *); +void tree_sitter_html_external_scanner_deserialize(void *, const char *, unsigned); + #ifdef _WIN32 #define extern __declspec(dllexport) #endif @@ -1138,9 +821,16 @@ extern const TSLanguage *tree_sitter_html() { .alias_sequences = (const TSSymbol *)ts_alias_sequences, .max_alias_sequence_length = MAX_ALIAS_SEQUENCE_LENGTH, .lex_fn = ts_lex, - .keyword_lex_fn = ts_lex_keywords, - .keyword_capture_token = sym_tag_name, .external_token_count = EXTERNAL_TOKEN_COUNT, + .external_scanner = { + (const bool *)ts_external_scanner_states, + ts_external_scanner_symbol_map, + tree_sitter_html_external_scanner_create, + tree_sitter_html_external_scanner_destroy, + tree_sitter_html_external_scanner_scan, + tree_sitter_html_external_scanner_serialize, + tree_sitter_html_external_scanner_deserialize, + }, }; return &language; } diff --git a/src/scanner.cc b/src/scanner.cc new file mode 100644 index 0000000..008b856 --- /dev/null +++ b/src/scanner.cc @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include + +#include "tag.h" + +namespace { + +using std::vector; +using std::string; + +enum TokenType { + OPEN_START_TAG, + CLOSE_START_TAG, + SELF_CLOSE_START_TAG, + END_TAG, + IMPLICIT_END_TAG, + ERRONEOUS_END_TAG, +}; + +struct Scanner { + Scanner() {} + + unsigned serialize(char *buffer) { + unsigned i = 0; + for (Tag &tag : tags) { + buffer[i] = static_cast(tag.type); + i++; + + if (tag.type == CUSTOM) { + buffer[i++] = tag.custom_tag_name.size(); + for (char c : tag.custom_tag_name) { + buffer[i++] = c; + } + } + } + + return i; + } + + void deserialize(const char *buffer, unsigned length) { + tags.clear(); + + unsigned i = 0; + while (i < length) { + Tag tag { static_cast(buffer[i]), "" }; + i++; + if (tag.type == CUSTOM) { + tag.custom_tag_name.resize(buffer[i++]); + for (unsigned j = 0; j < tag.custom_tag_name.size(); j++) { + tag.custom_tag_name[j] = buffer[i++]; + } + } + tags.push_back(tag); + } + } + + string scan_tag_name(TSLexer *lexer) { + string tag_name; + while (iswalpha(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') { + tag_name += std::toupper(lexer->lookahead); + lexer->advance(lexer, false); + } + return tag_name; + } + + bool start_tag(TSLexer *lexer) { + auto tag_name = scan_tag_name(lexer); + if (tag_name.empty()) return false; + + Tag tag = Tag::for_name(tag_name); + tags.push_back(tag); + + lexer->mark_end(lexer); + lexer->result_symbol = OPEN_START_TAG; + return true; + } + + bool end_tag(TSLexer *lexer) { + auto tag_name = scan_tag_name(lexer); + if (tag_name.empty()) return false; + + lexer->advance(lexer, false); + + Tag tag = Tag::for_name(tag_name); + + // The tag correctly closes the topmost element on the stack + if (tag == tags.back()) { + tags.pop_back(); + lexer->mark_end(lexer); + lexer->result_symbol = END_TAG; + return true; + } + + // Otherwise, dig deeper and queue implicit end tags (to be nice in + // the case of malformed HTML) + if (std::find(tags.begin(), tags.end(), tag) != tags.end()) { + tags.pop_back(); + lexer->result_symbol = IMPLICIT_END_TAG; + return true; + } + + // You closed a tag you never opened 😭 + lexer->mark_end(lexer); + lexer->result_symbol = ERRONEOUS_END_TAG; + return true; + } + + bool scan(TSLexer *lexer, const bool *valid_symbols) { + while (iswspace(lexer->lookahead)) { + lexer->advance(lexer, true); + } + + switch (lexer->lookahead) { + case '<': + if (valid_symbols[OPEN_START_TAG] || valid_symbols[END_TAG]) { + lexer->mark_end(lexer); + lexer->advance(lexer, false); + if (lexer->lookahead == '/') { + lexer->advance(lexer, false); + return end_tag(lexer); + } + return start_tag(lexer); + } + break; + + case '>': + if (valid_symbols[CLOSE_START_TAG]) { + lexer->advance(lexer, false); + lexer->result_symbol = CLOSE_START_TAG; + return true; + } + break; + + case '/': + if (valid_symbols[SELF_CLOSE_START_TAG]) { + lexer->advance(lexer, false); + if (lexer->lookahead == '>') { + lexer->advance(lexer, false); + tags.pop_back(); + lexer->result_symbol = SELF_CLOSE_START_TAG; + return true; + } + } + break; + } + + return false; + } + + vector tags; +}; + +} + +extern "C" { + +void *tree_sitter_html_external_scanner_create() { + return new Scanner(); +} + +bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = static_cast(payload); + return scanner->scan(lexer, valid_symbols); +} + +unsigned tree_sitter_html_external_scanner_serialize(void *payload, char *buffer) { + Scanner *scanner = static_cast(payload); + return scanner->serialize(buffer); +} + +void tree_sitter_html_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { + Scanner *scanner = static_cast(payload); + scanner->deserialize(buffer, length); +} + +void tree_sitter_html_external_scanner_destroy(void *payload) { + Scanner *scanner = static_cast(payload); + delete scanner; +} + +} diff --git a/src/tag.h b/src/tag.h new file mode 100644 index 0000000..5246433 --- /dev/null +++ b/src/tag.h @@ -0,0 +1,296 @@ +#include +#include + +using std::string; +using std::unordered_map; + +enum TagType : char { + // Void tags + AREA, + BASE, + BASEFONT, + BGSOUND, + BR, + COL, + COMMAND, + EMBED, + FRAME, + HR, + IMAGE, + IMG, + INPUT, + ISINDEX, + KEYGEN, + LINK, + MENUITEM, + META, + NEXTID, + PARAM, + SOURCE, + TRACK, + WBR, + END_OF_VOID_TAGS, + + A, + ABBR, + ADDRESS, + ARTICLE, + ASIDE, + AUDIO, + B, + BDI, + BDO, + BLOCKQUOTE, + BODY, + BUTTON, + CANVAS, + CAPTION, + CITE, + CODE, + COLGROUP, + DATA, + DATALIST, + DD, + DEL, + DETAILS, + DFN, + DIALOG, + DIV, + DL, + DT, + EM, + FIELDSET, + FIGCAPTION, + FIGURE, + FOOTER, + FORM, + H1, + H2, + H3, + H4, + H5, + H6, + HEAD, + HEADER, + HGROUP, + HTML, + I, + IFRAME, + INS, + KBD, + LABEL, + LEGEND, + LI, + MAIN, + MAP, + MARK, + MATH, + MENU, + METER, + NAV, + NOSCRIPT, + OBJECT, + OL, + OPTGROUP, + OPTION, + OUTPUT, + P, + PICTURE, + PRE, + PROGRESS, + Q, + RB, + RP, + RT, + RTC, + RUBY, + S, + SAMP, + SCRIPT, + SECTION, + SELECT, + SLOT, + SMALL, + SPAN, + STRONG, + STYLE, + SUB, + SUMMARY, + SUP, + SVG, + TABLE, + TBODY, + TD, + TEMPLATE, + TEXTAREA, + TFOOT, + TH, + THEAD, + TIME, + TITLE, + TR, + U, + UL, + VAR, + VIDEO, + + CUSTOM, +}; + +static const unordered_map TAG_TYPES_BY_TAG_NAME = { + {"AREA", AREA}, + {"BASE", BASE}, + {"BASEFONT", BASEFONT}, + {"BGSOUND", BGSOUND}, + {"BR", BR}, + {"COL", COL}, + {"COMMAND", COMMAND}, + {"EMBED", EMBED}, + {"FRAME", FRAME}, + {"HR", HR}, + {"IMAGE", IMAGE}, + {"IMG", IMG}, + {"INPUT", INPUT}, + {"ISINDEX", ISINDEX}, + {"KEYGEN", KEYGEN}, + {"LINK", LINK}, + {"MENUITEM", MENUITEM}, + {"META", META}, + {"NEXTID", NEXTID}, + {"PARAM", PARAM}, + {"SOURCE", SOURCE}, + {"TRACK", TRACK}, + {"WBR", WBR}, + {"A", A}, + {"ABBR", ABBR}, + {"ADDRESS", ADDRESS}, + {"ARTICLE", ARTICLE}, + {"ASIDE", ASIDE}, + {"AUDIO", AUDIO}, + {"B", B}, + {"BDI", BDI}, + {"BDO", BDO}, + {"BLOCKQUOTE", BLOCKQUOTE}, + {"BODY", BODY}, + {"BUTTON", BUTTON}, + {"CANVAS", CANVAS}, + {"CAPTION", CAPTION}, + {"CITE", CITE}, + {"CODE", CODE}, + {"COLGROUP", COLGROUP}, + {"DATA", DATA}, + {"DATALIST", DATALIST}, + {"DD", DD}, + {"DEL", DEL}, + {"DETAILS", DETAILS}, + {"DFN", DFN}, + {"DIALOG", DIALOG}, + {"DIV", DIV}, + {"DL", DL}, + {"DT", DT}, + {"EM", EM}, + {"FIELDSET", FIELDSET}, + {"FIGCAPTION", FIGCAPTION}, + {"FIGURE", FIGURE}, + {"FOOTER", FOOTER}, + {"FORM", FORM}, + {"H1", H1}, + {"H2", H2}, + {"H3", H3}, + {"H4", H4}, + {"H5", H5}, + {"H6", H6}, + {"HEAD", HEAD}, + {"HEADER", HEADER}, + {"HGROUP", HGROUP}, + {"HTML", HTML}, + {"I", I}, + {"IFRAME", IFRAME}, + {"INS", INS}, + {"KBD", KBD}, + {"LABEL", LABEL}, + {"LEGEND", LEGEND}, + {"LI", LI}, + {"MAIN", MAIN}, + {"MAP", MAP}, + {"MARK", MARK}, + {"MATH", MATH}, + {"MENU", MENU}, + {"METER", METER}, + {"NAV", NAV}, + {"NOSCRIPT", NOSCRIPT}, + {"OBJECT", OBJECT}, + {"OL", OL}, + {"OPTGROUP", OPTGROUP}, + {"OPTION", OPTION}, + {"OUTPUT", OUTPUT}, + {"P", P}, + {"PICTURE", PICTURE}, + {"PRE", PRE}, + {"PROGRESS", PROGRESS}, + {"Q", Q}, + {"RB", RB}, + {"RP", RP}, + {"RT", RT}, + {"RTC", RTC}, + {"RUBY", RUBY}, + {"S", S}, + {"SAMP", SAMP}, + {"SCRIPT", SCRIPT}, + {"SECTION", SECTION}, + {"SELECT", SELECT}, + {"SLOT", SLOT}, + {"SMALL", SMALL}, + {"SPAN", SPAN}, + {"STRONG", STRONG}, + {"STYLE", STYLE}, + {"SUB", SUB}, + {"SUMMARY", SUMMARY}, + {"SUP", SUP}, + {"SVG", SVG}, + {"TABLE", TABLE}, + {"TBODY", TBODY}, + {"TD", TD}, + {"TEMPLATE", TEMPLATE}, + {"TEXTAREA", TEXTAREA}, + {"TFOOT", TFOOT}, + {"TH", TH}, + {"THEAD", THEAD}, + {"TIME", TIME}, + {"TITLE", TITLE}, + {"TR", TR}, + {"U", U}, + {"UL", UL}, + {"VAR", VAR}, + {"VIDEO", VIDEO}, +}; + +struct Tag { + TagType type; + string custom_tag_name; + + bool operator==(const Tag &other) const { + if (type != other.type) return false; + if (type == TagType::CUSTOM && custom_tag_name != other.custom_tag_name) return false; + return true; + } + + inline bool is_void() const { + return type < END_OF_VOID_TAGS; + } + + // string name() const { + // return type == TagType::CUSTOM + // ? custom_tag_name + // : TAG_TYPES_BY_TAG_NAME. + // } + + static Tag for_name(const string &name) { + auto type = TAG_TYPES_BY_TAG_NAME.find(name); + if (type != TAG_TYPES_BY_TAG_NAME.end()) { + return Tag { type->second, "" }; + } + + return Tag { CUSTOM, name }; + } +}; diff --git a/test.html b/test.html index ba655d2..3f121b3 100644 --- a/test.html +++ b/test.html @@ -1 +1 @@ - +