2018-06-11 22:12:01 +00:00
|
|
|
#include <tree_sitter/parser.h>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <vector>
|
|
|
|
#include <string>
|
2018-06-12 00:12:34 +00:00
|
|
|
#include <cwctype>
|
2018-07-18 18:30:49 +00:00
|
|
|
#include <cstring>
|
2018-06-11 22:12:01 +00:00
|
|
|
#include "tag.h"
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
using std::vector;
|
|
|
|
using std::string;
|
|
|
|
|
|
|
|
// Symbols this external scanner can report through `lexer->result_symbol`.
// The same values are used as indices into the `valid_symbols` array that is
// handed to Scanner::scan.
// NOTE(review): the enumerator order presumably has to mirror the order of the
// `externals` list in the grammar definition -- confirm before reordering.
enum TokenType {
  START_TAG_NAME,              // start tag name other than script/style
  SCRIPT_START_TAG_NAME,       // start tag name whose Tag type is SCRIPT
  STYLE_START_TAG_NAME,        // start tag name whose Tag type is STYLE
  END_TAG_NAME,                // end tag name matching the innermost open tag
  ERRONEOUS_END_TAG_NAME,      // end tag name that does not match it
  SELF_CLOSING_TAG_DELIMITER,  // the "/>" sequence
  IMPLICIT_END_TAG,            // synthesized close of the innermost open tag
  RAW_TEXT,                    // contents of a script/style element
  COMMENT                      // "<!-- ... -->"
};
|
|
|
|
|
|
|
|
struct Scanner {
|
|
|
|
Scanner() {}
|
|
|
|
|
|
|
|
unsigned serialize(char *buffer) {
|
2018-08-29 18:01:00 +00:00
|
|
|
uint16_t tag_count = tags.size() > UINT16_MAX ? UINT16_MAX : tags.size();
|
|
|
|
uint16_t serialized_tag_count = 0;
|
|
|
|
|
2018-08-07 20:38:54 +00:00
|
|
|
unsigned i = sizeof(tag_count);
|
2018-08-29 18:01:00 +00:00
|
|
|
std::memcpy(&buffer[i], &tag_count, sizeof(tag_count));
|
|
|
|
i += sizeof(tag_count);
|
2018-08-07 20:38:54 +00:00
|
|
|
|
2018-08-29 18:01:00 +00:00
|
|
|
for (; serialized_tag_count < tag_count; serialized_tag_count++) {
|
|
|
|
Tag &tag = tags[serialized_tag_count];
|
2018-06-11 22:12:01 +00:00
|
|
|
if (tag.type == CUSTOM) {
|
2018-06-18 17:04:46 +00:00
|
|
|
unsigned name_length = tag.custom_tag_name.size();
|
2018-08-29 18:01:00 +00:00
|
|
|
if (name_length > UINT8_MAX) name_length = UINT8_MAX;
|
2018-08-07 20:38:54 +00:00
|
|
|
if (i + 2 + name_length >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) break;
|
2018-07-18 18:24:15 +00:00
|
|
|
buffer[i++] = static_cast<char>(tag.type);
|
2018-06-18 17:04:46 +00:00
|
|
|
buffer[i++] = name_length;
|
|
|
|
tag.custom_tag_name.copy(&buffer[i], name_length);
|
|
|
|
i += name_length;
|
2018-07-18 18:24:15 +00:00
|
|
|
} else {
|
2018-08-07 20:38:54 +00:00
|
|
|
if (i + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) break;
|
2018-07-18 18:24:15 +00:00
|
|
|
buffer[i++] = static_cast<char>(tag.type);
|
2018-06-11 22:12:01 +00:00
|
|
|
}
|
|
|
|
}
|
2018-08-07 20:38:54 +00:00
|
|
|
|
2018-08-29 18:01:00 +00:00
|
|
|
std::memcpy(&buffer[0], &serialized_tag_count, sizeof(serialized_tag_count));
|
2018-06-11 22:12:01 +00:00
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
void deserialize(const char *buffer, unsigned length) {
|
|
|
|
tags.clear();
|
2018-06-18 17:04:46 +00:00
|
|
|
if (length > 0) {
|
|
|
|
unsigned i = 0;
|
2018-08-29 18:01:00 +00:00
|
|
|
uint16_t tag_count, serialized_tag_count;
|
|
|
|
|
|
|
|
std::memcpy(&serialized_tag_count, &buffer[i], sizeof(serialized_tag_count));
|
|
|
|
i += sizeof(serialized_tag_count);
|
|
|
|
|
|
|
|
std::memcpy(&tag_count, &buffer[i], sizeof(tag_count));
|
|
|
|
i += sizeof(tag_count);
|
|
|
|
|
|
|
|
tags.resize(tag_count);
|
|
|
|
for (unsigned j = 0; j < serialized_tag_count; j++) {
|
2018-06-18 17:04:46 +00:00
|
|
|
Tag &tag = tags[j];
|
|
|
|
tag.type = static_cast<TagType>(buffer[i++]);
|
|
|
|
if (tag.type == CUSTOM) {
|
2018-08-29 18:01:00 +00:00
|
|
|
uint16_t name_length = (uint16_t)buffer[i++];
|
2018-06-18 17:04:46 +00:00
|
|
|
tag.custom_tag_name.assign(&buffer[i], &buffer[i + name_length]);
|
|
|
|
i += name_length;
|
|
|
|
}
|
2018-06-11 22:12:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
string scan_tag_name(TSLexer *lexer) {
|
|
|
|
string tag_name;
|
2018-06-18 17:04:46 +00:00
|
|
|
while (iswalnum(lexer->lookahead) ||
|
|
|
|
lexer->lookahead == '-' ||
|
|
|
|
lexer->lookahead == ':') {
|
2018-06-12 00:12:34 +00:00
|
|
|
tag_name += towupper(lexer->lookahead);
|
2018-06-11 22:12:01 +00:00
|
|
|
lexer->advance(lexer, false);
|
|
|
|
}
|
|
|
|
return tag_name;
|
|
|
|
}
|
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
bool scan_comment(TSLexer *lexer) {
|
2018-06-11 22:36:18 +00:00
|
|
|
if (lexer->lookahead != '-') return false;
|
|
|
|
lexer->advance(lexer, false);
|
|
|
|
if (lexer->lookahead != '-') return false;
|
|
|
|
lexer->advance(lexer, false);
|
|
|
|
|
|
|
|
unsigned dashes = 0;
|
2018-06-15 22:32:21 +00:00
|
|
|
while (lexer->lookahead) {
|
|
|
|
switch (lexer->lookahead) {
|
2018-06-11 22:36:18 +00:00
|
|
|
case '-':
|
|
|
|
++dashes;
|
|
|
|
break;
|
|
|
|
case '>':
|
|
|
|
if (dashes >= 2) {
|
|
|
|
lexer->result_symbol = COMMENT;
|
|
|
|
lexer->advance(lexer, false);
|
|
|
|
lexer->mark_end(lexer);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
dashes = 0;
|
|
|
|
}
|
|
|
|
lexer->advance(lexer, false);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
bool scan_raw_text(TSLexer *lexer) {
|
2018-06-11 23:56:33 +00:00
|
|
|
if (!tags.size()) return false;
|
|
|
|
|
|
|
|
lexer->mark_end(lexer);
|
|
|
|
|
|
|
|
const string &end_delimiter = tags.back().type == SCRIPT
|
|
|
|
? "</script"
|
|
|
|
: "</style";
|
|
|
|
|
|
|
|
unsigned delimiter_index = 0;
|
|
|
|
while (lexer->lookahead) {
|
|
|
|
if (lexer->lookahead == end_delimiter[delimiter_index]) {
|
|
|
|
delimiter_index++;
|
|
|
|
if (delimiter_index == end_delimiter.size()) break;
|
2018-06-29 03:32:27 +00:00
|
|
|
lexer->advance(lexer, false);
|
2018-06-11 23:56:33 +00:00
|
|
|
} else {
|
|
|
|
delimiter_index = 0;
|
2018-06-29 03:32:27 +00:00
|
|
|
lexer->advance(lexer, false);
|
2018-06-11 23:56:33 +00:00
|
|
|
lexer->mark_end(lexer);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
lexer->result_symbol = RAW_TEXT;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
bool scan_implicit_end_tag(TSLexer *lexer) {
|
2018-06-18 17:17:38 +00:00
|
|
|
Tag *parent = tags.empty() ? NULL : &tags.back();
|
2018-06-12 17:51:03 +00:00
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
bool is_closing_tag = false;
|
|
|
|
if (lexer->lookahead == '/') {
|
|
|
|
is_closing_tag = true;
|
|
|
|
lexer->advance(lexer, false);
|
|
|
|
} else {
|
|
|
|
if (parent && parent->is_void()) {
|
|
|
|
tags.pop_back();
|
|
|
|
lexer->result_symbol = IMPLICIT_END_TAG;
|
|
|
|
return true;
|
|
|
|
}
|
2018-06-11 22:36:18 +00:00
|
|
|
}
|
|
|
|
|
2018-06-15 22:32:21 +00:00
|
|
|
string tag_name = scan_tag_name(lexer);
|
2018-06-11 22:12:01 +00:00
|
|
|
if (tag_name.empty()) return false;
|
|
|
|
|
2018-06-12 17:51:03 +00:00
|
|
|
Tag next_tag = Tag::for_name(tag_name);
|
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
if (is_closing_tag) {
|
|
|
|
// The tag correctly closes the topmost element on the stack
|
2018-06-15 22:35:50 +00:00
|
|
|
if (!tags.empty() && tags.back() == next_tag) return false;
|
2018-06-12 19:20:13 +00:00
|
|
|
|
|
|
|
// Otherwise, dig deeper and queue implicit end tags (to be nice in
|
|
|
|
// the case of malformed HTML)
|
|
|
|
if (std::find(tags.begin(), tags.end(), next_tag) != tags.end()) {
|
|
|
|
tags.pop_back();
|
|
|
|
lexer->result_symbol = IMPLICIT_END_TAG;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} else if (parent && !parent->can_contain(next_tag)) {
|
2018-06-12 17:51:03 +00:00
|
|
|
tags.pop_back();
|
|
|
|
lexer->result_symbol = IMPLICIT_END_TAG;
|
|
|
|
return true;
|
|
|
|
}
|
2018-06-11 22:12:01 +00:00
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
return false;
|
2018-06-11 22:12:01 +00:00
|
|
|
}
|
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
bool scan_start_tag_name(TSLexer *lexer) {
|
2018-06-15 22:32:21 +00:00
|
|
|
string tag_name = scan_tag_name(lexer);
|
2018-06-11 22:12:01 +00:00
|
|
|
if (tag_name.empty()) return false;
|
|
|
|
Tag tag = Tag::for_name(tag_name);
|
2018-06-12 19:20:13 +00:00
|
|
|
tags.push_back(tag);
|
2018-12-15 01:43:41 +00:00
|
|
|
switch (tag.type) {
|
|
|
|
case SCRIPT:
|
|
|
|
lexer->result_symbol = SCRIPT_START_TAG_NAME;
|
|
|
|
break;
|
|
|
|
case STYLE:
|
|
|
|
lexer->result_symbol = STYLE_START_TAG_NAME;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
lexer->result_symbol = START_TAG_NAME;
|
|
|
|
break;
|
2018-06-12 19:20:13 +00:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2018-06-11 22:12:01 +00:00
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
bool scan_end_tag_name(TSLexer *lexer) {
|
2018-06-15 22:32:21 +00:00
|
|
|
string tag_name = scan_tag_name(lexer);
|
2018-06-12 19:20:13 +00:00
|
|
|
if (tag_name.empty()) return false;
|
|
|
|
Tag tag = Tag::for_name(tag_name);
|
|
|
|
if (!tags.empty() && tags.back() == tag) {
|
2018-06-11 22:12:01 +00:00
|
|
|
tags.pop_back();
|
2018-06-12 19:20:13 +00:00
|
|
|
lexer->result_symbol = END_TAG_NAME;
|
|
|
|
} else {
|
|
|
|
lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
|
2018-06-11 22:12:01 +00:00
|
|
|
}
|
2018-06-12 19:20:13 +00:00
|
|
|
return true;
|
|
|
|
}
|
2018-06-11 22:12:01 +00:00
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
bool scan_self_closing_tag_delimiter(TSLexer *lexer) {
|
|
|
|
lexer->advance(lexer, false);
|
|
|
|
if (lexer->lookahead == '>') {
|
|
|
|
lexer->advance(lexer, false);
|
2018-08-07 18:18:18 +00:00
|
|
|
if (!tags.empty()) {
|
|
|
|
tags.pop_back();
|
|
|
|
lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
|
|
|
|
}
|
2018-06-11 22:12:01 +00:00
|
|
|
return true;
|
|
|
|
}
|
2018-06-12 19:20:13 +00:00
|
|
|
return false;
|
2018-06-11 22:12:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool scan(TSLexer *lexer, const bool *valid_symbols) {
|
|
|
|
while (iswspace(lexer->lookahead)) {
|
|
|
|
lexer->advance(lexer, true);
|
|
|
|
}
|
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
|
|
|
|
return scan_raw_text(lexer);
|
2018-06-11 23:56:33 +00:00
|
|
|
}
|
|
|
|
|
2018-06-11 22:12:01 +00:00
|
|
|
switch (lexer->lookahead) {
|
|
|
|
case '<':
|
2018-06-11 22:36:18 +00:00
|
|
|
lexer->mark_end(lexer);
|
|
|
|
lexer->advance(lexer, false);
|
|
|
|
|
|
|
|
if (lexer->lookahead == '!') {
|
2018-06-11 22:12:01 +00:00
|
|
|
lexer->advance(lexer, false);
|
2018-06-12 19:20:13 +00:00
|
|
|
return scan_comment(lexer);
|
2018-06-11 22:36:18 +00:00
|
|
|
}
|
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
if (valid_symbols[IMPLICIT_END_TAG]) {
|
|
|
|
return scan_implicit_end_tag(lexer);
|
2018-06-11 22:12:01 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2018-06-12 21:06:16 +00:00
|
|
|
case '\0':
|
|
|
|
if (valid_symbols[IMPLICIT_END_TAG]) {
|
|
|
|
return scan_implicit_end_tag(lexer);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
case '/':
|
|
|
|
if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
|
|
|
|
return scan_self_closing_tag_delimiter(lexer);
|
2018-06-11 22:12:01 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2018-06-12 19:20:13 +00:00
|
|
|
default:
|
|
|
|
if (valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME]) {
|
|
|
|
return valid_symbols[START_TAG_NAME]
|
|
|
|
? scan_start_tag_name(lexer)
|
|
|
|
: scan_end_tag_name(lexer);
|
2018-06-11 22:12:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
vector<Tag> tags;
|
|
|
|
};
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" {
|
|
|
|
|
|
|
|
void *tree_sitter_html_external_scanner_create() {
|
|
|
|
return new Scanner();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer,
|
|
|
|
const bool *valid_symbols) {
|
|
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
|
|
return scanner->scan(lexer, valid_symbols);
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned tree_sitter_html_external_scanner_serialize(void *payload, char *buffer) {
|
|
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
|
|
return scanner->serialize(buffer);
|
|
|
|
}
|
|
|
|
|
|
|
|
void tree_sitter_html_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
|
|
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
|
|
scanner->deserialize(buffer, length);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Destroys the Scanner stored in `payload`.
void tree_sitter_html_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}
|
|
|
|
|
|
|
|
}
|