From 7cfe147792719c4517aa01d402229d1bce70804f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 12 Jun 2018 10:51:03 -0700 Subject: [PATCH] Handle elements with optional end tags (li, p, etc) Co-Authored-By: Ashi Krishnan --- corpus/main.txt | 127 ++++++++++++++++++++++++++++++++++++++++++++++++ src/scanner.cc | 18 +++++-- src/tag.h | 69 ++++++++++++++++++++++++-- 3 files changed, 204 insertions(+), 10 deletions(-) diff --git a/corpus/main.txt b/corpus/main.txt index 3099359..c6998ed 100644 --- a/corpus/main.txt +++ b/corpus/main.txt @@ -134,3 +134,130 @@ Lowercase doctype (fragment (doctype)) + +================================== +LI elements without close tags +================================== + +--- + +(fragment + (element + (start_tag) + (element (start_tag) (text)) + (element (start_tag) (text)) + (end_tag))) + +====================================== +DT and DD elements without close tags +====================================== +
+
Coffee +
Café +
Black hot drink +
Milk +
White cold drink +
+--- + +(fragment + (element + (start_tag) + (element (start_tag) (text)) + (element (start_tag) (text)) + (element (start_tag) (text)) + (element (start_tag) (text)) + (element (start_tag) (text)) + (end_tag))) + +====================================== +P elements without close tags +====================================== +

One +

Two
+

Three +

Four +

Five

+--- + +(fragment + (element (start_tag) (text)) + (element (start_tag) (text) (end_tag)) + (element (start_tag) (text)) + (element (start_tag) (text)) + (element (start_tag) (text) (end_tag))) + +====================================== +Ruby annotation elements without close tags +====================================== +とうきょう +--- + +(fragment + (element + (start_tag) + (text) + (element (start_tag) (text)) + (element (start_tag) (text)) + (element (start_tag) (text)) + (end_tag))) + +======================================= +COLGROUP elements without end tags +======================================= + + + + + + + + + + +
LimeLemonOrange
+--- + +(fragment + (element + (start_tag) + (element + (start_tag) + (element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value))))) + (element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))) + (element + (start_tag) + (element (start_tag) (text) (end_tag)) + (element (start_tag) (text) (end_tag)) + (element (start_tag) (text) (end_tag)) + (end_tag)) + (end_tag))) + +========================================= +TR, TD, and TH elements without end tags +========================================= + + + +
One + Two +
Three + Four +
+--- + +(fragment + (element + (start_tag) + (element + (start_tag) + (element (start_tag) (text)) + (element (start_tag) (text))) + (element + (start_tag) + (element (start_tag) (text)) + (element (start_tag) (text))) + (end_tag))) diff --git a/src/scanner.cc b/src/scanner.cc index 9f2c8be..1c511c0 100644 --- a/src/scanner.cc +++ b/src/scanner.cc @@ -61,7 +61,7 @@ struct Scanner { string scan_tag_name(TSLexer *lexer) { string tag_name; - while (iswalpha(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') { + while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') { tag_name += towupper(lexer->lookahead); lexer->advance(lexer, false); } @@ -124,7 +124,9 @@ struct Scanner { } bool start_tag(TSLexer *lexer) { - if (!tags.empty() && tags.back().is_void()) { + Tag *parent = tags.empty() ? nullptr : &tags.back(); + + if (parent && parent->is_void()) { tags.pop_back(); lexer->result_symbol = IMPLICIT_END_TAG; return true; @@ -133,11 +135,17 @@ struct Scanner { auto tag_name = scan_tag_name(lexer); if (tag_name.empty()) return false; - Tag tag = Tag::for_name(tag_name); - tags.push_back(tag); + Tag next_tag = Tag::for_name(tag_name); + if (parent && !parent->can_contain(next_tag)) { + tags.pop_back(); + lexer->result_symbol = IMPLICIT_END_TAG; + return true; + } + + tags.push_back(next_tag); lexer->mark_end(lexer); - lexer->result_symbol = tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG; + lexer->result_symbol = next_tag.is_raw() ? 
OPEN_RAW_START_TAG : OPEN_START_TAG; return true; } diff --git a/src/tag.h b/src/tag.h index 91a5ca7..6e15327 100644 --- a/src/tag.h +++ b/src/tag.h @@ -265,6 +265,35 @@ static const unordered_map TAG_TYPES_BY_TAG_NAME = { {"VIDEO", VIDEO}, }; +static const bool PARAGRAPH_CANNOT_CONTAIN[CUSTOM + 1] = { + [ADDRESS] = true, + [ARTICLE] = true, + [ASIDE] = true, + [BLOCKQUOTE] = true, + [DETAILS] = true, + [DIV] = true, + [DL] = true, + [FIELDSET] = true, + [FIGCAPTION] = true, + [FIGURE] = true, + [FOOTER] = true, + [FORM] = true, + [H1] = true, + [H2] = true, + [H3] = true, + [H4] = true, + [H5] = true, + [H6] = true, + [HEADER] = true, + [HR] = true, + [MAIN] = true, + [NAV] = true, + [OL] = true, + [P] = true, + [PRE] = true, + [SECTION] = true, +}; + struct Tag { TagType type; string custom_tag_name; @@ -283,11 +312,41 @@ struct Tag { return type == SCRIPT || type == STYLE; } - // string name() const { - // return type == TagType::CUSTOM - // ? custom_tag_name - // : TAG_TYPES_BY_TAG_NAME. - // } + inline bool can_contain(const Tag &tag) { + TagType child = tag.type; + + switch (type) { + case LI: return child != LI; + + case DT: + case DD: + return child != DT && child != DD; + + case P: + return !PARAGRAPH_CANNOT_CONTAIN[child]; + + case COLGROUP: + return child == COL; + + case RB: + case RT: + case RP: + return child != RB && child != RT && child != RP; + + case OPTGROUP: + return child != OPTGROUP; + + case TR: + return child != TR; + + case TD: + case TH: + return child != TD && child != TH && child != TR; + + default: + return true; + } + } static Tag for_name(const string &name) { auto type = TAG_TYPES_BY_TAG_NAME.find(name);