[xml] Patch to improve HTMLparser's robustness



I've been running into problems parsing incoming email messages through libxml2's HTML parser. When it sees a tag such as <html xml:lang="en" xmlns:....> in an unexpected place, it just eats the '<html' part and turns the attributes of that html tag into normal text, causing odd code to appear at the top of email messages. This mostly affects Outlook/Exchange-generated messages.

The attached patch tries to fix this. It works for me, but I am not sure whether I have introduced memory allocation issues with it. I hope the patch (or a similar solution) can be integrated into a future libxml release.

With regards,
Arnold Hendriks
diff -ur libxml2-2.6.30/HTMLparser.c libxml2-2.6.30patched/HTMLparser.c
--- libxml2-2.6.30/HTMLparser.c Tue Jun 12 04:36:23 2007
+++ libxml2-2.6.30patched/HTMLparser.c  Tue Nov 20 08:40:59 2007
@@ -3431,6 +3431,7 @@
     int maxatts;
     int meta = 0;
     int i;
+    int discardtag = 0;
 
     if ((ctxt == NULL) || (ctxt->input == NULL)) {
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
@@ -3475,14 +3476,14 @@
        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                     "htmlParseStartTag: misplaced <html> tag\n",
                     name, NULL);
-       return 0;
+       discardtag = 1;
     }
     if ((ctxt->nameNr != 1) && 
        (xmlStrEqual(name, BAD_CAST"head"))) {
        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                     "htmlParseStartTag: misplaced <head> tag\n",
                     name, NULL);
-       return 0;
+       discardtag = 1;
     }
     if (xmlStrEqual(name, BAD_CAST"body")) {
        int indx;
@@ -3491,9 +3492,7 @@
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                             "htmlParseStartTag: misplaced <body> tag\n",
                             name, NULL);
-               while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
-                   NEXT;
-               return 0;
+               discardtag = 1;
            }
        }
     }
@@ -3592,12 +3591,14 @@
     /*
      * SAX: Start of Element !
      */
-    htmlnamePush(ctxt, name);
-    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
-       if (nbatts != 0)
-            ctxt->sax->startElement(ctxt->userData, name, atts);
-       else
-            ctxt->sax->startElement(ctxt->userData, name, NULL);
+    if (!discardtag) {
+       htmlnamePush(ctxt, name);
+       if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
+           if (nbatts != 0)
+               ctxt->sax->startElement(ctxt->userData, name, atts);
+           else
+               ctxt->sax->startElement(ctxt->userData, name, NULL);
+       }
     }
 
     if (atts != NULL) {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]