[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[xml] Patch to improve HTMLparser's robustness



I've been running into problems parsing incoming email messages with libxml2's HTML parser. When it sees a tag such as <html xml:lang="en" xmlns:....> in an unexpected place, it consumes only the '<html' part and turns the attributes of that html tag into normal text, causing odd code to appear at the top of email messages. This mostly affects messages generated by Outlook/Exchange.

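The attached patch tries to fix this: instead of returning as soon as a misplaced <html>, <head> or <body> tag is detected, it marks the tag as discarded, lets parsing of the tag continue, and then skips the htmlnamePush/startElement step for it. It works for me, but I am not sure whether I have introduced memory allocation issues with it. I hope the patch (or a similar solution) can be integrated into a future libxml release.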

With regards,
Arnold Hendriks
diff -ur libxml2-2.6.30/HTMLparser.c libxml2-2.6.30patched/HTMLparser.c
--- libxml2-2.6.30/HTMLparser.c	Tue Jun 12 04:36:23 2007
+++ libxml2-2.6.30patched/HTMLparser.c	Tue Nov 20 08:40:59 2007
@@ -3431,6 +3431,7 @@
     int maxatts;
     int meta = 0;
     int i;
+    int discardtag = 0;
 
     if ((ctxt == NULL) || (ctxt->input == NULL)) {
 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
@@ -3475,14 +3476,14 @@
 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
 	             "htmlParseStartTag: misplaced <html> tag\n",
 		     name, NULL);
-	return 0;
+	discardtag = 1;
     }
     if ((ctxt->nameNr != 1) && 
 	(xmlStrEqual(name, BAD_CAST"head"))) {
 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
 	             "htmlParseStartTag: misplaced <head> tag\n",
 		     name, NULL);
-	return 0;
+	discardtag = 1;
     }
     if (xmlStrEqual(name, BAD_CAST"body")) {
 	int indx;
@@ -3491,9 +3492,7 @@
 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
 		             "htmlParseStartTag: misplaced <body> tag\n",
 			     name, NULL);
-		while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
-		    NEXT;
-		return 0;
+		discardtag = 1;
 	    }
 	}
     }
@@ -3592,12 +3591,14 @@
     /*
      * SAX: Start of Element !
      */
-    htmlnamePush(ctxt, name);
-    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
-	if (nbatts != 0)
-            ctxt->sax->startElement(ctxt->userData, name, atts);
-	else
-            ctxt->sax->startElement(ctxt->userData, name, NULL);
+    if (!discardtag) {
+	htmlnamePush(ctxt, name);
+	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
+	    if (nbatts != 0)
+		ctxt->sax->startElement(ctxt->userData, name, atts);
+	    else
+		ctxt->sax->startElement(ctxt->userData, name, NULL);
+	}
     }
 
     if (atts != NULL) {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]