[libxml2] use new htmlParseLookupCommentEnd to find comment ends



commit a67b63d183f5ab5d5af70fe47ef3a3d53fa3cb09
Author: Mike Dalessio <mike dalessio gmail com>
Date:   Sun Oct 11 14:15:37 2020 -0400

    use new htmlParseLookupCommentEnd to find comment ends
    
    Note that the caret in error messages generated during comment parsing
    may have moved by one byte.
    
    See guidance provided on incorrectly-closed comments here:
    
    https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment

 HTMLparser.c                  | 46 ++++++++++++++++++++++++++++++++++---------
 result/HTML/758606.html.err   |  2 +-
 result/HTML/758606_2.html.err |  2 +-
 3 files changed, 39 insertions(+), 11 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index 41ab4aa5..2877f4b7 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -5220,6 +5220,39 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
     return (-1);
 }
 
+/**
+ * htmlParseLookupCommentEnd:
+ * @ctxt: an HTML parser context
+ *
+ * Try to find the end of a comment in the input stream.
+ * The search accepts "-->" as well as the incorrectly-closed form "--!>".
+ * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
+ * Each "--" reported by htmlParseLookupSequence() that is not followed by ">"
+ * or "!>" is skipped by setting ctxt->checkIndex before searching again.
+ * This DOES change the state of the parser; do not use liberally.
+ * This wraps htmlParseLookupSequence().
+ *
+ * Returns the htmlParseLookupSequence() result for the "--" that starts the comment end, or its negative result if none is available.
+ */
+static int
+htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
+{
+    int mark = 0;
+    int cur = CUR_PTR - BASE_PTR; /* captured once, before any lookup */
+
+    while (mark >= 0) {
+       mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0); /* next "--" */
+       if ((mark < 0) || /* no "--" available: hand the failure back */
+           (NXT(mark+2) == '>') || /* "-->" */
+           ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) { /* "--!>" */
+           return mark;
+       }
+       ctxt->checkIndex = cur + mark + 1; /* NOTE(review): presumably makes the next lookup start past this "--" - confirm */
+    }
+    return mark; /* not reached: the loop only exits through the return above */
+}
+
+
 /**
  * htmlParseTryOrFinish:
  * @ctxt:  an HTML parser context
@@ -5405,8 +5438,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                cur = in->cur[0];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-                   if ((!terminate) &&
-                       (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
+                   if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                        goto done;
 #ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
@@ -5466,8 +5498,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-                   if ((!terminate) &&
-                       (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
+                   if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                        goto done;
 #ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
@@ -5514,8 +5545,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-                   if ((!terminate) &&
-                       (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
+                   if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                        goto done;
 #ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
@@ -5769,9 +5799,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
-                       if ((!terminate) &&
-                           (htmlParseLookupSequence(
-                               ctxt, '-', '-', '>', 0) < 0))
+                       if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                            goto done;
 #ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
diff --git a/result/HTML/758606.html.err b/result/HTML/758606.html.err
index e3e61265..bcb253eb 100644
--- a/result/HTML/758606.html.err
+++ b/result/HTML/758606.html.err
@@ -1,6 +1,6 @@
 ./test/HTML/758606.html:1: HTML parser error : Invalid char in comment 0xC
 <!--<!doctype
-      ^
+       ^
 ./test/HTML/758606.html:2: HTML parser error : Comment not terminated 
 <!--<!doctyp
 
diff --git a/result/HTML/758606_2.html.err b/result/HTML/758606_2.html.err
index e9bf4060..88bcde6b 100644
--- a/result/HTML/758606_2.html.err
+++ b/result/HTML/758606_2.html.err
@@ -1,6 +1,6 @@
 ./test/HTML/758606_2.html:1: HTML parser error : Invalid char in comment 0xC
 ‘<!dOctYPE
-  ^
+   ^
 ./test/HTML/758606_2.html:2: HTML parser error : Comment not terminated 
 <!--‘<!dOctYP
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]