[libxml2] Skip incorrectly opened HTML comments



commit e986d09cf531e77a9ab46af2d1a219072f310190
Author: Nick Wellnhofer <wellnhofer aevum de>
Date:   Fri Jul 15 14:02:26 2022 +0200

    Skip incorrectly opened HTML comments
    
    Commit 4fd69f3e fixed handling of '<' characters not followed by an
    ASCII letter. But a '<!' sequence followed by invalid characters should
    be treated as bogus comment and skipped.
    
    Fixes #380.

 HTMLparser.c                  | 145 +++++++++++++++++++++++++-----------------
 include/libxml/xmlerror.h     |   1 +
 result/HTML/issue380.html     |   6 ++
 result/HTML/issue380.html.err |   6 ++
 result/HTML/issue380.html.sax |  20 ++++++
 test/HTML/issue380.html       |   5 ++
 6 files changed, 123 insertions(+), 60 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index 46744900..abc4e905 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2545,6 +2545,21 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
 
 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
 
+static void
+htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
+    int c;
+
+    htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
+                 "Incorrectly opened comment\n", NULL, NULL);
+
+    do {
+        c = CUR;
+        if (c == 0)
+            break;
+        NEXT;
+    } while (c != '>');
+}
+
 /**
  * htmlParseHTMLName:
  * @ctxt:  an HTML parser context
@@ -4380,26 +4395,28 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
            htmlParseScript(ctxt);
        }
 
-        /*
-         * Sometimes DOCTYPE arrives in the middle of the document
-         */
-        else if ((CUR == '<') && (NXT(1) == '!') &&
-            (UPP(2) == 'D') && (UPP(3) == 'O') &&
-            (UPP(4) == 'C') && (UPP(5) == 'T') &&
-            (UPP(6) == 'Y') && (UPP(7) == 'P') &&
-            (UPP(8) == 'E')) {
-            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
-                         "Misplaced DOCTYPE declaration\n",
-                         BAD_CAST "DOCTYPE" , NULL);
-            htmlParseDocTypeDecl(ctxt);
-        }
-
-        /*
-         * First case :  a comment
-         */
-        else if ((CUR == '<') && (NXT(1) == '!') &&
-            (NXT(2) == '-') && (NXT(3) == '-')) {
-            htmlParseComment(ctxt);
+        else if ((CUR == '<') && (NXT(1) == '!')) {
+            /*
+             * Sometimes DOCTYPE arrives in the middle of the document
+             */
+            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
+                (UPP(4) == 'C') && (UPP(5) == 'T') &&
+                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+                (UPP(8) == 'E')) {
+                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+                             "Misplaced DOCTYPE declaration\n",
+                             BAD_CAST "DOCTYPE" , NULL);
+                htmlParseDocTypeDecl(ctxt);
+            }
+            /*
+             * First case :  a comment
+             */
+            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
+                htmlParseComment(ctxt);
+            }
+            else {
+                htmlSkipBogusComment(ctxt);
+            }
         }
 
         /*
@@ -4785,26 +4802,28 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
            htmlParseScript(ctxt);
        }
 
-        /*
-         * Sometimes DOCTYPE arrives in the middle of the document
-         */
-        else if ((CUR == '<') && (NXT(1) == '!') &&
-            (UPP(2) == 'D') && (UPP(3) == 'O') &&
-            (UPP(4) == 'C') && (UPP(5) == 'T') &&
-            (UPP(6) == 'Y') && (UPP(7) == 'P') &&
-            (UPP(8) == 'E')) {
-            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
-                         "Misplaced DOCTYPE declaration\n",
-                         BAD_CAST "DOCTYPE" , NULL);
-            htmlParseDocTypeDecl(ctxt);
-        }
-
-        /*
-         * First case :  a comment
-         */
-        else if ((CUR == '<') && (NXT(1) == '!') &&
-            (NXT(2) == '-') && (NXT(3) == '-')) {
-            htmlParseComment(ctxt);
+        else if ((CUR == '<') && (NXT(1) == '!')) {
+            /*
+             * Sometimes DOCTYPE arrives in the middle of the document
+             */
+            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
+                (UPP(4) == 'C') && (UPP(5) == 'T') &&
+                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+                (UPP(8) == 'E')) {
+                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+                             "Misplaced DOCTYPE declaration\n",
+                             BAD_CAST "DOCTYPE" , NULL);
+                htmlParseDocTypeDecl(ctxt);
+            }
+            /*
+             * First case :  a comment
+             */
+            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
+                htmlParseComment(ctxt);
+            }
+            else {
+                htmlSkipBogusComment(ctxt);
+            }
         }
 
         /*
@@ -5949,31 +5968,37 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 #endif
                        break;
                    }
-               } else if ((cur == '<') && (next == '!') &&
-                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
-                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
-                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
-                    (UPP(8) == 'E')) {
+               } else if ((cur == '<') && (next == '!')) {
                     /*
                      * Sometimes DOCTYPE arrives in the middle of the document
                      */
-                    if ((!terminate) &&
-                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
-                        goto done;
-                    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
-                                 "Misplaced DOCTYPE declaration\n",
-                                 BAD_CAST "DOCTYPE" , NULL);
-                    htmlParseDocTypeDecl(ctxt);
-                } else if ((cur == '<') && (next == '!') &&
-                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
-                        goto done;
+                    if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
+                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
+                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+                        (UPP(8) == 'E')) {
+                        if ((!terminate) &&
+                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
+                            goto done;
+                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+                                     "Misplaced DOCTYPE declaration\n",
+                                     BAD_CAST "DOCTYPE" , NULL);
+                        htmlParseDocTypeDecl(ctxt);
+                    } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
+                        if ((!terminate) &&
+                            (htmlParseLookupCommentEnd(ctxt) < 0))
+                            goto done;
 #ifdef DEBUG_PUSH
-                    xmlGenericError(xmlGenericErrorContext,
-                            "HPP: Parsing Comment\n");
+                        xmlGenericError(xmlGenericErrorContext,
+                                "HPP: Parsing Comment\n");
 #endif
-                    htmlParseComment(ctxt);
-                    ctxt->instate = XML_PARSER_CONTENT;
+                        htmlParseComment(ctxt);
+                        ctxt->instate = XML_PARSER_CONTENT;
+                    } else {
+                        if ((!terminate) &&
+                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
+                            goto done;
+                        htmlSkipBogusComment(ctxt);
+                    }
                 } else if ((cur == '<') && (next == '?')) {
                     if ((!terminate) &&
                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
diff --git a/include/libxml/xmlerror.h b/include/libxml/xmlerror.h
index 7b68e401..ee95be9c 100644
--- a/include/libxml/xmlerror.h
+++ b/include/libxml/xmlerror.h
@@ -260,6 +260,7 @@ typedef enum {
     XML_DTD_DUP_TOKEN, /* 541 */
     XML_HTML_STRUCURE_ERROR = 800,
     XML_HTML_UNKNOWN_TAG, /* 801 */
+    XML_HTML_INCORRECTLY_OPENED_COMMENT, /* 802 */
     XML_RNGP_ANYNAME_ATTR_ANCESTOR = 1000,
     XML_RNGP_ATTR_CONFLICT, /* 1001 */
     XML_RNGP_ATTRIBUTE_CHILDREN, /* 1002 */
diff --git a/result/HTML/issue380.html b/result/HTML/issue380.html
new file mode 100644
index 00000000..1fcf4965
--- /dev/null
+++ b/result/HTML/issue380.html
@@ -0,0 +1,6 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd";>
+<html>
+  <body>
+    ...
+  </body>
+</html>
diff --git a/result/HTML/issue380.html.err b/result/HTML/issue380.html.err
new file mode 100644
index 00000000..efbb8bdf
--- /dev/null
+++ b/result/HTML/issue380.html.err
@@ -0,0 +1,6 @@
+./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment
+    <![if !supportLists]>...<![endif]>
+    ^
+./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment
+    <![if !supportLists]>...<![endif]>
+                            ^
diff --git a/result/HTML/issue380.html.sax b/result/HTML/issue380.html.sax
new file mode 100644
index 00000000..5df2b506
--- /dev/null
+++ b/result/HTML/issue380.html.sax
@@ -0,0 +1,20 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.startElement(html)
+SAX.characters(
+  , 3)
+SAX.startElement(body)
+SAX.characters(
+    , 5)
+SAX.error: Incorrectly opened comment
+SAX.characters(..., 3)
+SAX.error: Incorrectly opened comment
+SAX.characters(
+  , 3)
+SAX.endElement(body)
+SAX.characters(
+, 1)
+SAX.endElement(html)
+SAX.characters(
+, 1)
+SAX.endDocument()
diff --git a/test/HTML/issue380.html b/test/HTML/issue380.html
new file mode 100644
index 00000000..46c07f26
--- /dev/null
+++ b/test/HTML/issue380.html
@@ -0,0 +1,5 @@
+<html>
+  <body>
+    <![if !supportLists]>...<![endif]>
+  </body>
+</html>


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]