diff --git a/corpus/main.txt b/corpus/main.txt
index 3099359..c6998ed 100644
--- a/corpus/main.txt
+++ b/corpus/main.txt
@@ -134,3 +134,130 @@ Lowercase doctype
(fragment
(doctype))
+
+==================================
+LI elements without close tags
+==================================
+
+---
+
+(fragment
+ (element
+ (start_tag)
+ (element (start_tag) (text))
+ (element (start_tag) (text))
+ (end_tag)))
+
+======================================
+DT and DL elements without close tags
+======================================
+
+ - Coffee
+
- Café
+
- Black hot drink
+
- Milk
+
- White cold drink
+
+---
+
+(fragment
+ (element
+ (start_tag)
+ (element (start_tag) (text))
+ (element (start_tag) (text))
+ (element (start_tag) (text))
+ (element (start_tag) (text))
+ (element (start_tag) (text))
+ (end_tag)))
+
+======================================
+P elements without close tags
+======================================
+One
+
Two
+Three
+
Four
+
Five
+---
+
+(fragment
+ (element (start_tag) (text))
+ (element (start_tag) (text) (end_tag))
+ (element (start_tag) (text))
+ (element (start_tag) (text))
+ (element (start_tag) (text) (end_tag)))
+
+======================================
+Ruby annotation elements without close tags
+======================================
+東京
+---
+
+(fragment
+ (element
+ (start_tag)
+ (text)
+ (element (start_tag) (text))
+ (element (start_tag) (text))
+ (element (start_tag) (text))
+ (end_tag)))
+
+=======================================
+COLGROUP elements without end tags
+=======================================
+
+
+
+
+
+
+ Lime |
+ Lemon |
+ Orange |
+
+
+---
+
+(fragment
+ (element
+ (start_tag)
+ (element
+ (start_tag)
+ (element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
+ (element (start_tag (attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
+ (element
+ (start_tag)
+ (element (start_tag) (text) (end_tag))
+ (element (start_tag) (text) (end_tag))
+ (element (start_tag) (text) (end_tag))
+ (end_tag))
+ (end_tag)))
+
+=========================================
+TR, TD, and TH elements without end tags
+=========================================
+
+
+ One
+ | Two
+ |
+ Three
+ | Four
+ |
+---
+
+(fragment
+ (element
+ (start_tag)
+ (element
+ (start_tag)
+ (element (start_tag) (text))
+ (element (start_tag) (text)))
+ (element
+ (start_tag)
+ (element (start_tag) (text))
+ (element (start_tag) (text)))
+ (end_tag)))
diff --git a/src/scanner.cc b/src/scanner.cc
index 9f2c8be..1c511c0 100644
--- a/src/scanner.cc
+++ b/src/scanner.cc
@@ -61,7 +61,7 @@ struct Scanner {
string scan_tag_name(TSLexer *lexer) {
string tag_name;
- while (iswalpha(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
+ while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
tag_name += towupper(lexer->lookahead);
lexer->advance(lexer, false);
}
@@ -124,7 +124,9 @@ struct Scanner {
}
bool start_tag(TSLexer *lexer) {
- if (!tags.empty() && tags.back().is_void()) {
+ Tag *parent = tags.empty() ? nullptr : &tags.back();
+
+ if (parent && parent->is_void()) {
tags.pop_back();
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
@@ -133,11 +135,17 @@ struct Scanner {
auto tag_name = scan_tag_name(lexer);
if (tag_name.empty()) return false;
- Tag tag = Tag::for_name(tag_name);
- tags.push_back(tag);
+ Tag next_tag = Tag::for_name(tag_name);
+ if (parent && !parent->can_contain(next_tag)) {
+ tags.pop_back();
+ lexer->result_symbol = IMPLICIT_END_TAG;
+ return true;
+ }
+
+ tags.push_back(next_tag);
lexer->mark_end(lexer);
- lexer->result_symbol = tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
+ lexer->result_symbol = next_tag.is_raw() ? OPEN_RAW_START_TAG : OPEN_START_TAG;
return true;
}
diff --git a/src/tag.h b/src/tag.h
index 91a5ca7..6e15327 100644
--- a/src/tag.h
+++ b/src/tag.h
@@ -265,6 +265,35 @@ static const unordered_map TAG_TYPES_BY_TAG_NAME = {
{"VIDEO", VIDEO},
};
+// Table, indexed by TagType, flagging the tags that an open P element cannot
+// contain. Tag::can_contain reads it for `case P`.
+//
+// The flags are set in a constructor instead of with array designators
+// (`[ADDRESS] = true`): array designators come from C99 and are not part of
+// standard C++, so compilers accept them only as an extension, if at all.
+struct ParagraphCannotContainTable {
+  // One flag per TagType value, up to and including CUSTOM.
+  bool flags[CUSTOM + 1];
+
+  // `flags()` value-initializes the array, so every entry starts out false.
+  ParagraphCannotContainTable() : flags() {
+    static const TagType BLOCKED[] = {
+      ADDRESS,
+      ARTICLE,
+      ASIDE,
+      BLOCKQUOTE,
+      DETAILS,
+      DIV,
+      DL,
+      FIELDSET,
+      FIGCAPTION,
+      FIGURE,
+      FOOTER,
+      FORM,
+      H1,
+      H2,
+      H3,
+      H4,
+      H5,
+      H6,
+      HEADER,
+      HR,
+      MAIN,
+      NAV,
+      OL,
+      P,
+      PRE,
+      SECTION,
+    };
+
+    for (unsigned i = 0; i < sizeof(BLOCKED) / sizeof(BLOCKED[0]); i++) {
+      flags[BLOCKED[i]] = true;
+    }
+  }
+
+  // Keeps the `PARAGRAPH_CANNOT_CONTAIN[tag_type]` lookup syntax working.
+  bool operator[](TagType tag_type) const { return flags[tag_type]; }
+};
+
+static const ParagraphCannotContainTable PARAGRAPH_CANNOT_CONTAIN;
+
+
struct Tag {
TagType type;
string custom_tag_name;
@@ -283,11 +312,41 @@ struct Tag {
return type == SCRIPT || type == STYLE;
}
- // string name() const {
- // return type == TagType::CUSTOM
- // ? custom_tag_name
- // : TAG_TYPES_BY_TAG_NAME.
- // }
+  // Reports whether an element of this tag's type may have `tag` as a child.
+  //
+  // Only `type` and `tag.type` are compared; custom tag names are ignored.
+  // Scanner::start_tag calls this on the innermost open tag: when the result
+  // is false it pops that tag and emits IMPLICIT_END_TAG instead of opening
+  // the new element.
+  //
+  // Declared const because it only reads state, which lets it be called on a
+  // const Tag.
+  bool can_contain(const Tag &tag) const {
+    TagType child = tag.type;
+
+    switch (type) {
+      // A list item is ended by the next list item.
+      case LI:
+        return child != LI;
+
+      // Terms and descriptions end each other.
+      case DT:
+      case DD:
+        return child != DT && child != DD;
+
+      // Paragraphs are ended by the tags flagged in the lookup table.
+      case P:
+        return !PARAGRAPH_CANNOT_CONTAIN[child];
+
+      // A column group holds COL elements only; anything else ends it.
+      case COLGROUP:
+        return child == COL;
+
+      // Ruby annotation parts end each other.
+      case RB:
+      case RT:
+      case RP:
+        return child != RB && child != RT && child != RP;
+
+      case OPTGROUP:
+        return child != OPTGROUP;
+
+      // A row is ended by the next row.
+      case TR:
+        return child != TR;
+
+      // A cell is ended by the next cell or by the next row.
+      case TD:
+      case TH:
+        return child != TD && child != TH && child != TR;
+
+      // Every other tag type places no restriction on its children.
+      default:
+        return true;
+    }
+  }
static Tag for_name(const string &name) {
auto type = TAG_TYPES_BY_TAG_NAME.find(name);