Handle elements with optional end tags (li, p, etc)

Co-Authored-By: Ashi Krishnan <queerviolet@github.com>
This commit is contained in:
Max Brunsfeld 2018-06-12 10:51:03 -07:00
parent 43e4826bb7
commit 7cfe147792
3 changed files with 204 additions and 10 deletions

View File

@ -134,3 +134,130 @@ Lowercase doctype
(fragment
(doctype))
==================================
LI elements without close tags
==================================
<ul>
<li>One
<li>Two
</ul>
---
(fragment
(element
(start_tag)
(element (start_tag) (text))
(element (start_tag) (text))
(end_tag)))
======================================
DT and DD elements without close tags
======================================
<dl>
<dt>Coffee
<dt>Café
<dd>Black hot drink
<dt>Milk
<dd>White cold drink
</dl>
---
(fragment
(element
(start_tag)
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text))
(end_tag)))
======================================
P elements without close tags
======================================
<p>One
<div>Two</div>
<p>Three
<p>Four
<h1>Five</h1>
---
(fragment
(element (start_tag) (text))
(element (start_tag) (text) (end_tag))
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text) (end_tag)))
======================================
Ruby annotation elements without close tags
======================================
<ruby>東<rb>京<rt>とう<rt>きょう</ruby>
---
(fragment
(element
(start_tag)
(text)
(element (start_tag) (text))
(element (start_tag) (text))
(element (start_tag) (text))
(end_tag)))
=======================================
COLGROUP elements without end tags
=======================================
<table>
<colgroup>
<col style="background-color: #0f0">
<col span="2">
<tr>
<th>Lime</th>
<th>Lemon</th>
<th>Orange</th>
</tr>
</table>
---
(fragment
(element
(start_tag)
(element
(start_tag)
(element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
(element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
(element
(start_tag)
(element (start_tag) (text) (end_tag))
(element (start_tag) (text) (end_tag))
(element (start_tag) (text) (end_tag))
(end_tag))
(end_tag)))
=========================================
TR, TD, and TH elements without end tags
=========================================
<table>
<tr>
<th>One
<th>Two
<tr>
<td>Three
<td>Four
</table>
---
(fragment
(element
(start_tag)
(element
(start_tag)
(element (start_tag) (text))
(element (start_tag) (text)))
(element
(start_tag)
(element (start_tag) (text))
(element (start_tag) (text)))
(end_tag)))

18
src/scanner.cc vendored
View File

@ -61,7 +61,7 @@ struct Scanner {
string scan_tag_name(TSLexer *lexer) {
string tag_name;
while (iswalpha(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
tag_name += towupper(lexer->lookahead);
lexer->advance(lexer, false);
}
@ -124,7 +124,9 @@ struct Scanner {
}
bool start_tag(TSLexer *lexer) {
if (!tags.empty() && tags.back().is_void()) {
Tag *parent = tags.empty() ? nullptr : &tags.back();
if (parent && parent->is_void()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
@ -133,11 +135,17 @@ struct Scanner {
auto tag_name = scan_tag_name(lexer);
if (tag_name.empty()) return false;
Tag tag = Tag::for_name(tag_name);
tags.push_back(tag);
Tag next_tag = Tag::for_name(tag_name);
if (parent && !parent->can_contain(next_tag)) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
tags.push_back(next_tag);
lexer->mark_end(lexer);
lexer->result_symbol = tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
lexer->result_symbol = next_tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
return true;
}

69
src/tag.h vendored
View File

@ -265,6 +265,35 @@ static const unordered_map<string, TagType> TAG_TYPES_BY_TAG_NAME = {
{"VIDEO", VIDEO},
};
// Lookup table indexed by TagType: an entry is true when a tag of that type
// may not appear inside a <p> element. Tag::can_contain() reads it for the P
// case, and the scanner's start_tag emits an implicit end tag when
// can_contain() reports false.
//
// Only the listed types are set; every other element of the array is
// zero-initialized, i.e. false (allowed inside a paragraph).
//
// The array has CUSTOM + 1 entries — presumably CUSTOM is the last TagType
// enumerator so that every tag type is a valid index; confirm against the enum.
//
// NOTE(review): the HTML standard's list of elements that implicitly close a
// <p> also names hgroup, menu, table and ul, which are not set here — confirm
// whether the omission is intentional.
// NOTE(review): array designators ([X] = value) come from C99 and are accepted
// by C++ compilers only as an extension — confirm every target toolchain
// supports them.
static const bool PARAGRAPH_CANNOT_CONTAIN[CUSTOM + 1] = {
[ADDRESS] = true,
[ARTICLE] = true,
[ASIDE] = true,
[BLOCKQUOTE] = true,
[DETAILS] = true,
[DIV] = true,
[DL] = true,
[FIELDSET] = true,
[FIGCAPTION] = true,
[FIGURE] = true,
[FOOTER] = true,
[FORM] = true,
[H1] = true,
[H2] = true,
[H3] = true,
[H4] = true,
[H5] = true,
[H6] = true,
[HEADER] = true,
[HR] = true,
[MAIN] = true,
[NAV] = true,
[OL] = true,
[P] = true,
[PRE] = true,
[SECTION] = true,
};
struct Tag {
TagType type;
string custom_tag_name;
@ -283,11 +312,41 @@ struct Tag {
return type == SCRIPT || type == STYLE;
}
// string name() const {
// return type == TagType::CUSTOM
// ? custom_tag_name
// : TAG_TYPES_BY_TAG_NAME.
// }
inline bool can_contain(const Tag &tag) {
TagType child = tag.type;
switch (type) {
case LI: return child != LI;
case DT:
case DD:
return child != DT && child != DD;
case P:
return !PARAGRAPH_CANNOT_CONTAIN[child];
case COLGROUP:
return child == COL;
case RB:
case RT:
case RP:
return child != RB && child != RT && child != RP;
case OPTGROUP:
return child != OPTGROUP;
case TR:
return child != TR;
case TD:
case TH:
return child != TD && child != TH && child != TR;
default:
return true;
}
}
static Tag for_name(const string &name) {
auto type = TAG_TYPES_BY_TAG_NAME.find(name);