[libxml2] Skip incorrectly opened HTML comments
- From: Nick Wellnhofer <nwellnhof src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [libxml2] Skip incorrectly opened HTML comments
- Date: Tue, 2 Aug 2022 12:51:13 +0000 (UTC)
commit e986d09cf531e77a9ab46af2d1a219072f310190
Author: Nick Wellnhofer <wellnhofer aevum de>
Date: Fri Jul 15 14:02:26 2022 +0200
Skip incorrectly opened HTML comments
Commit 4fd69f3e fixed handling of '<' characters not followed by an
ASCII letter. But a '<!' sequence followed by invalid characters should
be treated as bogus comment and skipped.
Fixes #380.
HTMLparser.c | 145 +++++++++++++++++++++++++-----------------
include/libxml/xmlerror.h | 1 +
result/HTML/issue380.html | 6 ++
result/HTML/issue380.html.err | 6 ++
result/HTML/issue380.html.sax | 20 ++++++
test/HTML/issue380.html | 5 ++
6 files changed, 123 insertions(+), 60 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index 46744900..abc4e905 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2545,6 +2545,21 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
+static void
+htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
+ int c;
+
+ htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
+ "Incorrectly opened comment\n", NULL, NULL);
+
+ do {
+ c = CUR;
+ if (c == 0)
+ break;
+ NEXT;
+ } while (c != '>');
+}
+
/**
* htmlParseHTMLName:
* @ctxt: an HTML parser context
@@ -4380,26 +4395,28 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
htmlParseScript(ctxt);
}
- /*
- * Sometimes DOCTYPE arrives in the middle of the document
- */
- else if ((CUR == '<') && (NXT(1) == '!') &&
- (UPP(2) == 'D') && (UPP(3) == 'O') &&
- (UPP(4) == 'C') && (UPP(5) == 'T') &&
- (UPP(6) == 'Y') && (UPP(7) == 'P') &&
- (UPP(8) == 'E')) {
- htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
- "Misplaced DOCTYPE declaration\n",
- BAD_CAST "DOCTYPE" , NULL);
- htmlParseDocTypeDecl(ctxt);
- }
-
- /*
- * First case : a comment
- */
- else if ((CUR == '<') && (NXT(1) == '!') &&
- (NXT(2) == '-') && (NXT(3) == '-')) {
- htmlParseComment(ctxt);
+ else if ((CUR == '<') && (NXT(1) == '!')) {
+ /*
+ * Sometimes DOCTYPE arrives in the middle of the document
+ */
+ if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
+ (UPP(4) == 'C') && (UPP(5) == 'T') &&
+ (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+ (UPP(8) == 'E')) {
+ htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+ "Misplaced DOCTYPE declaration\n",
+ BAD_CAST "DOCTYPE" , NULL);
+ htmlParseDocTypeDecl(ctxt);
+ }
+ /*
+ * First case : a comment
+ */
+ else if ((NXT(2) == '-') && (NXT(3) == '-')) {
+ htmlParseComment(ctxt);
+ }
+ else {
+ htmlSkipBogusComment(ctxt);
+ }
}
/*
@@ -4785,26 +4802,28 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
htmlParseScript(ctxt);
}
- /*
- * Sometimes DOCTYPE arrives in the middle of the document
- */
- else if ((CUR == '<') && (NXT(1) == '!') &&
- (UPP(2) == 'D') && (UPP(3) == 'O') &&
- (UPP(4) == 'C') && (UPP(5) == 'T') &&
- (UPP(6) == 'Y') && (UPP(7) == 'P') &&
- (UPP(8) == 'E')) {
- htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
- "Misplaced DOCTYPE declaration\n",
- BAD_CAST "DOCTYPE" , NULL);
- htmlParseDocTypeDecl(ctxt);
- }
-
- /*
- * First case : a comment
- */
- else if ((CUR == '<') && (NXT(1) == '!') &&
- (NXT(2) == '-') && (NXT(3) == '-')) {
- htmlParseComment(ctxt);
+ else if ((CUR == '<') && (NXT(1) == '!')) {
+ /*
+ * Sometimes DOCTYPE arrives in the middle of the document
+ */
+ if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
+ (UPP(4) == 'C') && (UPP(5) == 'T') &&
+ (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+ (UPP(8) == 'E')) {
+ htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+ "Misplaced DOCTYPE declaration\n",
+ BAD_CAST "DOCTYPE" , NULL);
+ htmlParseDocTypeDecl(ctxt);
+ }
+ /*
+ * First case : a comment
+ */
+ else if ((NXT(2) == '-') && (NXT(3) == '-')) {
+ htmlParseComment(ctxt);
+ }
+ else {
+ htmlSkipBogusComment(ctxt);
+ }
}
/*
@@ -5949,31 +5968,37 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
#endif
break;
}
- } else if ((cur == '<') && (next == '!') &&
- (UPP(2) == 'D') && (UPP(3) == 'O') &&
- (UPP(4) == 'C') && (UPP(5) == 'T') &&
- (UPP(6) == 'Y') && (UPP(7) == 'P') &&
- (UPP(8) == 'E')) {
+ } else if ((cur == '<') && (next == '!')) {
/*
* Sometimes DOCTYPE arrives in the middle of the document
*/
- if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
- goto done;
- htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
- "Misplaced DOCTYPE declaration\n",
- BAD_CAST "DOCTYPE" , NULL);
- htmlParseDocTypeDecl(ctxt);
- } else if ((cur == '<') && (next == '!') &&
- (in->cur[2] == '-') && (in->cur[3] == '-')) {
- if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
- goto done;
+ if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
+ (UPP(4) == 'C') && (UPP(5) == 'T') &&
+ (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+ (UPP(8) == 'E')) {
+ if ((!terminate) &&
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
+ goto done;
+ htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+ "Misplaced DOCTYPE declaration\n",
+ BAD_CAST "DOCTYPE" , NULL);
+ htmlParseDocTypeDecl(ctxt);
+ } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
+ if ((!terminate) &&
+ (htmlParseLookupCommentEnd(ctxt) < 0))
+ goto done;
#ifdef DEBUG_PUSH
- xmlGenericError(xmlGenericErrorContext,
- "HPP: Parsing Comment\n");
+ xmlGenericError(xmlGenericErrorContext,
+ "HPP: Parsing Comment\n");
#endif
- htmlParseComment(ctxt);
- ctxt->instate = XML_PARSER_CONTENT;
+ htmlParseComment(ctxt);
+ ctxt->instate = XML_PARSER_CONTENT;
+ } else {
+ if ((!terminate) &&
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
+ goto done;
+ htmlSkipBogusComment(ctxt);
+ }
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
diff --git a/include/libxml/xmlerror.h b/include/libxml/xmlerror.h
index 7b68e401..ee95be9c 100644
--- a/include/libxml/xmlerror.h
+++ b/include/libxml/xmlerror.h
@@ -260,6 +260,7 @@ typedef enum {
XML_DTD_DUP_TOKEN, /* 541 */
XML_HTML_STRUCURE_ERROR = 800,
XML_HTML_UNKNOWN_TAG, /* 801 */
+ XML_HTML_INCORRECTLY_OPENED_COMMENT, /* 802 */
XML_RNGP_ANYNAME_ATTR_ANCESTOR = 1000,
XML_RNGP_ATTR_CONFLICT, /* 1001 */
XML_RNGP_ATTRIBUTE_CHILDREN, /* 1002 */
diff --git a/result/HTML/issue380.html b/result/HTML/issue380.html
new file mode 100644
index 00000000..1fcf4965
--- /dev/null
+++ b/result/HTML/issue380.html
@@ -0,0 +1,6 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
+<html>
+ <body>
+ ...
+ </body>
+</html>
diff --git a/result/HTML/issue380.html.err b/result/HTML/issue380.html.err
new file mode 100644
index 00000000..efbb8bdf
--- /dev/null
+++ b/result/HTML/issue380.html.err
@@ -0,0 +1,6 @@
+./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment
+ <![if !supportLists]>...<![endif]>
+ ^
+./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment
+ <![if !supportLists]>...<![endif]>
+ ^
diff --git a/result/HTML/issue380.html.sax b/result/HTML/issue380.html.sax
new file mode 100644
index 00000000..5df2b506
--- /dev/null
+++ b/result/HTML/issue380.html.sax
@@ -0,0 +1,20 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElement(html)
+SAX.characters(
+ , 3)
+SAX.startElement(body)
+SAX.characters(
+ , 5)
+SAX.error: Incorrectly opened comment
+SAX.characters(..., 3)
+SAX.error: Incorrectly opened comment
+SAX.characters(
+ , 3)
+SAX.endElement(body)
+SAX.characters(
+, 1)
+SAX.endElement(html)
+SAX.characters(
+, 1)
+SAX.endDocument()
diff --git a/test/HTML/issue380.html b/test/HTML/issue380.html
new file mode 100644
index 00000000..46c07f26
--- /dev/null
+++ b/test/HTML/issue380.html
@@ -0,0 +1,5 @@
+<html>
+ <body>
+ <![if !supportLists]>...<![endif]>
+ </body>
+</html>
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]