[libxml2] Recover unescaped less-than character in HTML recovery parsing

From: Daniel Veillard <veillard src gnome org>
To: commits-list gnome org
Cc:
Subject: [libxml2] Recover unescaped less-than character in HTML recovery parsing
Date: Tue, 30 Jun 2015 03:42:39 +0000 (UTC)

commit 140c251e8e5653572edcca91b9d675f871735cb4
Author: Daniel Veillard <veillard redhat com>
Date:   Tue Jun 30 11:36:28 2015 +0800

    Recover unescaped less-than character in HTML recovery parsing
    
    As pointed by Christian Schoenebeck <schoenebeck crudebyte com>
    on the list and based on some of his early patches, this preserve
    content when unescaped opening angle brackets are not escaped in
    textual content like:
      <p>  a < b </p>
      <p> a <0 </p>
      <p> a <=0 </p>
    
    while still reporting the error.

 HTMLparser.c |   33 ++++++++++++++++++++++++++++++---
 1 files changed, 30 insertions(+), 3 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index d329d3b..19c10c3 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2948,8 +2948,9 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
 
 
 /**
- * htmlParseCharData:
+ * htmlParseCharDataInternal:
  * @ctxt:  an HTML parser context
+ * @readahead: optional read ahead character in ascii range
  *
  * parse a CharData section.
  * if we are within a CDATA section ']]>' marks an end of section.
@@ -2958,12 +2959,15 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
  */
 
 static void
-htmlParseCharData(htmlParserCtxtPtr ctxt) {
-    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
+htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
+    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
     int nbchar = 0;
     int cur, l;
     int chunk = 0;
 
+    if (readahead)
+        buf[nbchar++] = readahead;
+
     SHRINK;
     cur = CUR_CHAR(l);
     while (((cur != '<') || (ctxt->token == '<')) &&
@@ -3043,6 +3047,21 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
 }
 
 /**
+ * htmlParseCharData:
+ * @ctxt:  an HTML parser context
+ *
+ * parse a CharData section.
+ * if we are within a CDATA section ']]>' marks an end of section.
+ *
+ * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+ */
+
+static void
+htmlParseCharData(htmlParserCtxtPtr ctxt) {
+    htmlParseCharDataInternal(ctxt, 0);
+}
+
+/**
  * htmlParseExternalID:
  * @ctxt:  an HTML parser context
  * @publicID:  a xmlChar** receiving PubidLiteral
@@ -3690,6 +3709,14 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                     "htmlParseStartTag: invalid element name\n",
                     NULL, NULL);
+       /* if recover preserve text on classic misconstructs */
+       if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
+           (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
+           htmlParseCharDataInternal(ctxt, '<');
+           return(-1);
+       }
+
+
        /* Dump the bogus tag like browsers do */
        while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
                (ctxt->instate != XML_PARSER_EOF))

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]