[libxml2] 572129 speed up parasing of large HTML text nodes

From: Daniel Veillard <veillard src gnome org>
To: svn-commits-list gnome org
Cc:
Subject: [libxml2] 572129 speed up parasing of large HTML text nodes
Date: Mon, 24 Aug 2009 17:01:49 +0000 (UTC)
commit 56a03035bf94defaf0777db54db59530146f5f9b
Author: Markus Kull <markus kull 1und1 de>
Date:   Mon Aug 24 19:00:23 2009 +0200

    572129 speed up parasing of large HTML text nodes
    
    * HTMLparser.c: use a different lookup function htmlParseLookupChars()
      to avoid the quadratic behaviour

 HTMLparser.c |   79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 77 insertions(+), 2 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index 65571ae..c64590e 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -4726,6 +4726,80 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
 }
 
 /**
+ * htmlParseLookupChars:
+ * @ctxt: an HTML parser context
+ * @stop: Array of chars, which stop the lookup.
+ * @stopLen: Length of stop-Array
+ *
+ * Try to find if any char of the stop-Array is available in the input 
+ * stream.
+ * This function has a side effect of (possibly) incrementing ctxt->checkIndex
+ * to avoid rescanning sequences of bytes, it DOES change the state of the
+ * parser, do not use liberally.
+ *
+ * Returns the index to the current parsing point if a stopChar 
+ *      is available, -1 otherwise.
+ */
+static int
+htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
+                     int stopLen)
+{
+    int base, len;
+    htmlParserInputPtr in;
+    const xmlChar *buf;
+    int incomment = 0;
+    int i;
+
+    in = ctxt->input;
+    if (in == NULL)
+        return (-1);
+
+    base = in->cur - in->base;
+    if (base < 0)
+        return (-1);
+
+    if (ctxt->checkIndex > base)
+        base = ctxt->checkIndex;
+
+    if (in->buf == NULL) {
+        buf = in->base;
+        len = in->length;
+    } else {
+        buf = in->buf->buffer->content;
+        len = in->buf->buffer->use;
+    }
+
+    for (; base < len; base++) {
+        if (!incomment && (base + 4 < len)) {
+            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
+                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
+                incomment = 1;
+                /* do not increment past <! - some people use <!--> */
+                base += 2;
+            }
+        }
+        if (incomment) {
+            if (base + 3 > len)
+                return (-1);
+            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
+                (buf[base + 2] == '>')) {
+                incomment = 0;
+                base += 2;
+            }
+            continue;
+        }
+        for (i = 0; i < stopLen; ++i) {
+            if (buf[base] == stop[i]) {
+                ctxt->checkIndex = 0;
+                return (base - (in->cur - in->base));
+            }
+        }
+    }
+    ctxt->checkIndex = base;
+    return (-1);
+}
+
+/**
  * htmlParseTryOrFinish:
  * @ctxt:  an HTML parser context
  * @terminate:  last chunk indicator
@@ -5254,7 +5328,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 			break;
 		    } else if (cur == '&') {
 			if ((!terminate) &&
-			    (htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
+			    (htmlParseLookupChars(ctxt,
+                                                  BAD_CAST "; >/", 4) < 0))
 			    goto done;
 #ifdef DEBUG_PUSH
 			xmlGenericError(xmlGenericErrorContext,
@@ -5270,7 +5345,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 			 * data detection.
 			 */
 			if ((!terminate) &&
-			    (htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
+                            (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
 			    goto done;
 			ctxt->checkIndex = 0;
 #ifdef DEBUG_PUSH
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]