[gmime-devel] parser optimization for mmap streams



In my obsessive quest to make sure that GMime is so insanely fast that
no other parser can come close to touching it, I've poked around a bit
with optimizing the parser in the case where the developer has given it
an mmap'd stream.

Attached is the patch.

Unfortunately I've had to introduce more branches into the code (which
makes it a little uglier to maintain) because the mmap'd case cannot
take advantage of a wonderful hack I've been using to avoid unnecessary
compares in the inner-most loops (writing a sentinel byte at the end of
the input buffer so that the scan loops need no bounds check).

Even with the added compares in the inner-most loops, parsing an mmap'd
stream with the patch is still slightly faster than parsing more typical
streams.

I haven't yet tested this code on a modern machine, so maybe the
difference isn't really worth it; I'm not sure.

On my antique desktop, the difference is about 0.3s for parsing 333 MB
of data (6.4s w/ mmap patch, 6.7s w/o).

Previously, when I was working on improving performance to overcome the
overhead of GObject and GSignal, my Core2 laptop was parsing about half
a gig in roughly 4 or 4.5 seconds (I can't remember exactly, and my
laptop is currently dead so I can't check), so I expect that the
difference on that machine will likely be under 0.2s.

I'm not sure if it's worth committing or not, but it's probably worth
playing with :)

Jeff

Index: gmime/gmime-parser.c
===================================================================
--- gmime/gmime-parser.c	(revision 1465)
+++ gmime/gmime-parser.c	(working copy)
@@ -31,6 +31,7 @@
 #include "gmime-parser.h"
 
 #include "gmime-table-private.h"
+#include "gmime-stream-mmap.h"
 #include "gmime-stream-mem.h"
 #include "gmime-message-part.h"
 #include "gmime-multipart.h"
@@ -136,7 +137,8 @@
 	
 	short int state;
 	
-	unsigned short int unused:10;
+	unsigned short int unused:9;
+	unsigned short int mmaped:1;
 	unsigned short int midline:1;
 	unsigned short int seekable:1;
 	unsigned short int scan_from:1;
@@ -326,12 +328,24 @@
 	
 	priv->stream = stream;
 	
-	priv->offset = offset;
+	if (GMIME_IS_STREAM_MMAP (stream)) {
+		GMimeStreamMmap *mmap = (GMimeStreamMmap *) stream;
+		
+		priv->inbuf = (unsigned char *) mmap->map;
+		priv->inend = priv->inbuf + mmap->maplen;
+		priv->inptr = priv->inbuf;
+		
+		priv->offset = offset + mmap->maplen;
+		priv->mmaped = TRUE;
+	} else {
+		priv->inbuf = priv->realbuf + SCAN_HEAD;
+		priv->inptr = priv->inbuf;
+		priv->inend = priv->inbuf;
+		
+		priv->offset = offset;
+		priv->mmaped = FALSE;
+	}
 	
-	priv->inbuf = priv->realbuf + SCAN_HEAD;
-	priv->inptr = priv->inbuf;
-	priv->inend = priv->inbuf;
-	
 	priv->from_offset = -1;
 	priv->from_line = g_byte_array_new ();
 	
@@ -612,7 +626,7 @@
 	
 	g_assert (inptr <= inend);
 	
-	if (inlen > atleast)
+	if (priv->mmaped || inlen > atleast)
 		return inlen;
 	
 	/* attempt to align 'inend' with realbuf + SCAN_HEAD */
@@ -723,12 +737,22 @@
 		
 		inptr = priv->inptr;
 		inend = priv->inend;
-		*inend = '\n';
 		
+		if (!priv->mmaped) {
+			/* Note: see optimization comment [1] */
+			*inend = '\n';
+		}
+		
 		while (inptr < inend) {
 			start = inptr;
-			while (*inptr != '\n')
-				inptr++;
+			if (priv->mmaped) {
+				while (inptr < inend && *inptr != '\n')
+					inptr++;
+			} else {
+				/* Note: see optimization comment [1] */
+				while (*inptr != '\n')
+					inptr++;
+			}
 			
 			if (inptr + 1 >= inend) {
 				/* we don't have enough data; if we can't get more we have to bail */
@@ -908,9 +932,12 @@
 		
 		inptr = priv->inptr;
 		inend = priv->inend;
-		/* Note: see optimization comment [1] */
-		*inend = '\n';
 		
+		if (!priv->mmaped) {
+			/* Note: see optimization comment [1] */
+			*inend = '\n';
+		}
+		
 		g_assert (inptr <= inend);
 		
 		while (inptr < inend) {
@@ -929,16 +956,29 @@
 			if (fieldname && !eoln) {
 				/* scan and validate the field name */
 				if (*inptr != ':') {
-					/* Note: see optimization comment [1] */
-					*inend = ':';
-					
-					while (*inptr != ':') {
-						if (is_type (*inptr, IS_SPACE | IS_CTRL)) {
-							valid = FALSE;
-							break;
+					if (!priv->mmaped) {
+						/* Note: see optimization comment [1] */
+						*inend = ':';
+						
+						while (*inptr != ':') {
+							if (is_type (*inptr, IS_SPACE | IS_CTRL)) {
+								valid = FALSE;
+								break;
+							}
+							
+							inptr++;
 						}
 						
-						inptr++;
+						*inend = '\n';
+					} else {
+						while (inptr < inend && *inptr != ':') {
+							if (is_type (*inptr, IS_SPACE | IS_CTRL)) {
+								valid = FALSE;
+								break;
+							}
+							
+							inptr++;
+						}
 					}
 					
 					if (inptr == inend) {
@@ -947,9 +987,6 @@
 						priv->inptr = start;
 						goto refill;
 					}
-					
-					/* Note: see optimization comment [1] */
-					*inend = '\n';
 				} else if (*inptr == ':') {
 					valid = FALSE;
 				}
@@ -970,9 +1007,14 @@
 			
 			fieldname = FALSE;
 			
-			/* Note: see optimization comment [1] */
-			while (*inptr != '\n')
-				inptr++;
+			if (priv->mmaped) {
+				while (inptr < inend && *inptr != '\n')
+					inptr++;
+			} else {
+				/* Note: see optimization comment [1] */
+				while (*inptr != '\n')
+					inptr++;
+			}
 			
 			len = (size_t) (inptr - start);
 			
@@ -1115,10 +1157,17 @@
 	do {
 		inptr = priv->inptr;
 		inend = priv->inend;
-		*inend = '\n';
 		
-		while (*inptr != '\n')
-			inptr++;
+		if (!priv->mmaped) {
+			/* Note: see optimization comment [1] */
+			*inend = '\n';
+			
+			while (*inptr != '\n')
+				inptr++;
+		} else {
+			while (inptr < inend && *inptr != '\n')
+				inptr++;
+		}
 		
 		if (inptr < inend)
 			break;
@@ -1285,9 +1334,12 @@
 		
 		inptr = priv->inptr;
 		inend = priv->inend;
-		/* Note: see optimization comment [1] */
-		*inend = '\n';
 		
+		if (!priv->mmaped) {
+			/* Note: see optimization comment [1] */
+			*inend = '\n';
+		}
+		
 		len = (size_t) (inend - inptr);
 		if (priv->midline && len == nleft)
 			found = FOUND_EOS;
@@ -1296,9 +1348,14 @@
 		
 		while (inptr < inend) {
 			start = inptr;
-			/* Note: see optimization comment [1] */
-			while (*inptr != '\n')
-				inptr++;
+			if (priv->mmaped) {
+				while (inptr < inend && *inptr != '\n')
+					inptr++;
+			} else {
+				/* Note: see optimization comment [1] */
+				while (*inptr != '\n')
+					inptr++;
+			}
 			
 			len = (size_t) (inptr - start);
 			
Index: tests/test-mbox.c
===================================================================
--- tests/test-mbox.c	(revision 1465)
+++ tests/test-mbox.c	(working copy)
@@ -27,6 +27,7 @@
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/mman.h>
 #include <unistd.h>
 #include <dirent.h>
 #include <fcntl.h>
@@ -305,7 +306,7 @@
 							      input, strerror (errno)));
 				}
 				
-				istream = g_mime_stream_fs_new (fd);
+				istream = g_mime_stream_mmap_new (fd, PROT_READ, MAP_PRIVATE);
 				
 				if ((fd = open (output, O_RDONLY)) == -1) {
 					throw (exception_new ("could not open `%s': %s",
@@ -383,7 +384,7 @@
 		if ((fd = open (path, O_RDONLY)) == -1)
 			goto exit;
 		
-		istream = g_mime_stream_fs_new (fd);
+		istream = g_mime_stream_mmap_new (fd, PROT_READ, MAP_PRIVATE);
 		parser = g_mime_parser_new_with_stream (istream);
 		g_mime_parser_set_scan_from (parser, TRUE);
 		


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]