[tracker] Fixes GB#616329: Improve and fix reading msoffice/excel files

From: Aleksander Morgado <aleksm src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker] Fixes GB#616329: Improve and fix reading msoffice/excel files
Date: Wed, 21 Apr 2010 12:05:35 +0000 (UTC)
commit d20c3adbb6b8a2445bdb241fd66b937ebbe6319c
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue Apr 20 14:45:36 2010 +0200

    Fixes GB#616329: Improve and fix reading msoffice/excel files
    
     * Strings which are split into two ExcelExtendedStringRecord are now properly read.
     * String contents are now converted from CP1252 or UTF-16 to UTF-8
     * Reading the string is now done in either 1 or 2 GSF reads, not byte per byte.
    
     * Limited the max number of bytes to be read from the stream, to the common safe
         limit of 3*max_words*max_word_size.
     * Contents are now normalized and word-counted in a buffered way.
     * Stop reading the contents when max bytes reached.
     * Stop reading the contents when max number of words reached.

 src/tracker-extract/tracker-extract-msoffice.c |  575 +++++++++++++++++-------
 1 files changed, 403 insertions(+), 172 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 355b92e..40c9c14 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -114,13 +114,13 @@ typedef enum {
 /* ExcelBiffHeader to read excel spec header */
 typedef struct {
 	ExcelRecordType id;
-	gint length;
+	guint length;
 } ExcelBiffHeader;
 
 /* ExtendendString Record offset in stream and length */
 typedef struct {
-	guint32 offset;
-	guint32	length;
+	gsf_off_t offset; /* 64 bits!! */
+	gsize     length;
 } ExcelExtendedStringRecord;
 
 typedef enum {
@@ -367,7 +367,7 @@ read_8bit (const guint8 *buffer)
  * @param buffer data to read integer from
  * @return 16 bit unsigned integer
  */
-static gint
+static guint16
 read_16bit (const guint8 *buffer)
 {
 	return buffer[0] + (buffer[1] << 8);
@@ -378,7 +378,7 @@ read_16bit (const guint8 *buffer)
  * @param buffer data to read integer from
  * @return 32 bit unsigned integer
  */
-static gint
+static guint32
 read_32bit (const guint8 *buffer)
 {
 	return buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
@@ -784,6 +784,91 @@ open_uri (const gchar *uri)
 	return infile;
 }
 
+/* Reads 'chunk_size' bytes from 'stream' into 'buffer', then converts from
+ * UTF-16 or CP1252 to UTF-8, normalizes the string, and limits it to
+ * 'n_words_remaining' max words, updating this value accordingly */
+static void
+read_convert_and_normalize_chunk (guint8    *buffer,
+                                  gsize      chunk_size,
+                                  gboolean   is_ansi,
+                                  gint      *p_words_remaining,
+                                  gsize     *p_bytes_remaining,
+                                  GString  **p_content)
+{
+	gsize n_bytes_utf8;
+	gchar *converted_text;
+	GError *error = NULL;
+
+	g_return_if_fail (buffer != NULL);
+	g_return_if_fail (chunk_size > 0);
+	g_return_if_fail (p_words_remaining != NULL);
+	g_return_if_fail (p_bytes_remaining != NULL);
+	g_return_if_fail (p_content != NULL);
+
+
+	/* chunks can have different encoding
+	 *  TODO: Using g_iconv, this extra heap allocation could be
+	 *   avoided, re-using over and over again the same output buffer
+	 *   for the UTF-8 encoded string */
+	converted_text = g_convert (buffer,
+	                            chunk_size,
+	                            "UTF-8",
+	                            is_ansi ? "CP1252" : "UTF-16",
+	                            NULL,
+	                            &n_bytes_utf8,
+	                            &error);
+
+	if (converted_text) {
+		gchar *normalized_chunk;
+		guint n_words_normalized;
+
+		/* Get normalized chunk */
+		normalized_chunk = tracker_text_normalize (converted_text,
+		                                           *p_words_remaining,
+		                                           &n_words_normalized);
+
+		/* Update number of words remaining.
+		 * Note that n_words_normalized should always be less or
+		 * equal than n_words_remaining */
+		*p_words_remaining = (n_words_normalized <= *p_words_remaining ?
+		                      *p_words_remaining - n_words_normalized : 0);
+
+		/* Update accumulated UTF-8 bytes read */
+		*p_bytes_remaining = (n_bytes_utf8 <= *p_bytes_remaining ?
+		                      *p_bytes_remaining - n_bytes_utf8 : 0);
+
+		/* g_debug ("Words normalized: %u (remaining: %u); " */
+		/*          "Bytes read (UTF-8): %" G_GSIZE_FORMAT " bytes " */
+		/*          "(remaining: %" G_GSIZE_FORMAT ")", */
+		/*          n_words_normalized, *p_words_remaining, */
+		/*          n_bytes_utf8, *p_bytes_remaining); */
+
+		/* Append normalized chunk to the string to be returned */
+		if (*p_content) {
+			g_string_append (*p_content, normalized_chunk);
+		} else {
+			*p_content = g_string_new (normalized_chunk);
+		}
+
+		/* A whitespace is added to separate next strings appended */
+		g_string_append (*p_content, " ");
+
+		g_free (converted_text);
+		g_free (normalized_chunk);
+	} else {
+		g_warning ("Couldn't convert %d bytes from %s to UTF-8: %s",
+		           chunk_size,
+		           is_ansi ? "CP1252" : "UTF-16",
+		           error ? error->message : "no error given");
+	}
+
+	/* Note that error may be set even if some converted text is
+	 * available, due to G_CONVERT_ERROR_ILLEGAL_SEQUENCE for example */
+	g_clear_error (&error);
+}
+
+
+
 /* This function was programmed by using ideas and algorithms from
  * b2xtranslator project (http://b2xtranslator.sourceforge.net/)
  */
@@ -890,7 +975,6 @@ extract_msword_content (GsfInfile *infile,
 	while (n_words_remaining > 0 &&
 	       n_bytes_remaining > 0 &&
 	       i < piece_count) {
-		gchar *converted_text;
 		guint8 *piece_descriptor;
 		gint piece_start;
 		gint piece_end;
@@ -917,7 +1001,6 @@ extract_msword_content (GsfInfile *infile,
 			fc = (fc & 0xBFFFFFFF) >> 1;
 		}
 
-
 		piece_size  = piece_end - piece_start;
 
 		/* NOTE: Very very long pieces may appear. In fact, a single
@@ -937,9 +1020,6 @@ extract_msword_content (GsfInfile *infile,
 
 		/* Avoid empty pieces */
 		if (piece_size >= 1) {
-			GError *error = NULL;
-			gsize n_bytes_utf8;
-			guint n_words_normalized;
 
 			/* Re-allocate buffer to make it bigger if needed.
 			 *  This text buffer is re-used over and over in each
@@ -949,66 +1029,16 @@ extract_msword_content (GsfInfile *infile,
 				text_buffer_size = piece_size;
 			}
 
-			/* read single text piece from document_stream */
+			/* read and parse single text piece from document_stream */
 			gsf_input_seek (document_stream, fc, G_SEEK_SET);
 			gsf_input_read (document_stream, piece_size, text_buffer);
 
-			/* pieces can have different encoding
-			 *  TODO: Using g_iconv, this extra heap allocation could be
-			 *   avoided, re-using over and over again the same output buffer
-			 *   for the UTF-8 encoded string */
-			converted_text = g_convert (text_buffer,
-			                            piece_size,
-			                            "UTF-8",
-			                            is_ansi ? "CP1252" : "UTF-16",
-			                            NULL,
-			                            &n_bytes_utf8,
-			                            &error);
-
-			if (converted_text) {
-				gchar *normalized_chunk;
-
-				/* Get normalized chunk */
-				normalized_chunk = tracker_text_normalize (converted_text,
-				                                           n_words_remaining,
-				                                           &n_words_normalized);
-
-				/* Update number of words remaining.
-				 * Note that n_words_normalized should always be less or
-				 * equal than n_words_remaining */
-				n_words_remaining = (n_words_normalized <= n_words_remaining ?
-				                     n_words_remaining - n_words_normalized : 0);
-
-				/* Update accumulated UTF-8 bytes read */
-				n_bytes_remaining = (n_bytes_utf8 <= n_bytes_remaining ?
-				                     n_bytes_remaining - n_bytes_utf8 : 0);
-
-				g_debug ("(%s) Piece %u; Words normalized: %u (remaining: %u); "
-				         "Bytes read (UTF-8): %" G_GSIZE_FORMAT " bytes "
-				         "(remaining: %" G_GSIZE_FORMAT ")",
-				         __FUNCTION__, i, n_words_normalized, n_words_remaining,
-				         n_bytes_utf8, n_bytes_remaining);
-
-				/* Append normalized chunk to the string to be returned */
-				if (!content) {
-					content = g_string_new (normalized_chunk);
-				} else {
-					g_string_append (content, normalized_chunk);
-				}
-
-				g_free (converted_text);
-				g_free (normalized_chunk);
-			}
-			else {
-				g_warning ("Couldn't convert %d bytes from %s to UTF-8: %s",
-				           piece_size,
-				           is_ansi ? "CP1252" : "UTF-16",
-				           error ? error->message : NULL);
-			}
-
-			/* Note that error may be set even if some converted text is
-			 * available, due to G_CONVERT_ERROR_ILLEGAL_SEQUENCE for example */
-			g_clear_error (&error);
+			read_convert_and_normalize_chunk (text_buffer,
+			                                  piece_size,
+			                                  is_ansi,
+			                                  &n_words_remaining,
+			                                  &n_bytes_remaining,
+			                                  &content);
 		}
 
 		/* Go on to next piece */
@@ -1023,8 +1053,209 @@ extract_msword_content (GsfInfile *infile,
 	return content ? g_string_free (content, FALSE) : NULL;
 }
 
+/* Reads and interprets the flags of a given string. May be
+ *  used just to skip the fields, as when this bitmask-byte
+ *  comes as the first byte of a new record.
+ * NOTE: For a detailed meaning of each field parsed here,
+ *  take a look at the XLUnicodeRichExtendedString format:
+ *  http://msdn.microsoft.com/en-us/library/dd943830.aspx
+ **/
+static void
+read_excel_string_flags (GsfInput *stream,
+                         gboolean *p_is_high_byte,
+                         guint16  *p_c_run,
+                         guint16  *p_cb_ext_rst)
+{
+	guint8 tmp_buffer[4] = { 0 };
+	guint8 bit_mask;
+	gboolean is_ext_string;
+	gboolean is_rich_string;
+
+	/* Note that output arguments may be NULL if we don't need
+	 * their values... */
+
+	/* Reading 1 byte for mask */
+	gsf_input_read (stream, 1, tmp_buffer);
+	bit_mask = read_8bit (tmp_buffer);
+
+	/* Get flags */
+	if (p_is_high_byte) {
+		*p_is_high_byte = (bit_mask & 0x01) == 0x01;
+	}
+	is_ext_string = (bit_mask & 0x04) == 0x04;
+	is_rich_string = (bit_mask & 0x08) == 0x08;
+
+	/* If the c_run value is required as output, read it */
+	if (p_c_run) {
+		if (is_rich_string) {
+			/* Reading 2 Bytes */
+			gsf_input_read (stream, 2, tmp_buffer);
+
+			/* Reading cRun */
+			*p_c_run = read_16bit (tmp_buffer);
+		} else {
+			*p_c_run = 0;
+		}
+	} else if (is_rich_string) {
+		/* If not required, just skip those bytes */
+		gsf_input_seek (stream, 2, G_SEEK_CUR);
+	}
+
+	/* If the cb_ext_rst value is required as output, read it */
+	if (p_cb_ext_rst) {
+		if (is_ext_string) {
+			/* Reading 4 Bytes */
+			gsf_input_read (stream, 4, tmp_buffer);
+
+			/* Reading cRun */
+			*p_cb_ext_rst = read_16bit (tmp_buffer);
+		} else {
+			*p_cb_ext_rst = 0;
+		}
+	} else if (is_ext_string) {
+		/* If not required, just skip those bytes */
+		gsf_input_seek (stream, 4, G_SEEK_CUR);
+	}
+}
+
+/* Returns TRUE if record was changed. BUT, the value of the
+ *  current_record should be checked by the caller to know
+ *  if there are no more records */
+static gboolean
+change_excel_record_if_needed (GsfInput *stream,
+                               GArray   *record_array,
+                               guint    *p_current_record)
+{
+	ExcelExtendedStringRecord *record;
+
+	/* Get current record */
+	record = &g_array_index (record_array,
+	                         ExcelExtendedStringRecord,
+	                         *p_current_record);
+
+	/* We may already have surpassed the record, so adjust if so */
+	if (gsf_input_tell (stream) >= (record->offset + record->length)) {
+		/* Switch records and read from the second one... */
+		(*p_current_record)++;
+
+		if (*p_current_record < record_array->len) {
+			record = &g_array_index (record_array,
+			                         ExcelExtendedStringRecord,
+			                         *p_current_record);
+
+			gsf_input_seek (stream, record->offset, G_SEEK_SET);
+		}
+
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+/* Returns TRUE if correctly read
+ *
+ *  Note that p_current_record may get changed if the required
+ *  bytes to read were split into two different records.
+ */
+static gboolean
+read_excel_string (GsfInput *stream,
+                   guint8   *buffer,
+                   gsize     chunk_size,
+                   GArray   *record_array,
+                   guint    *p_current_record)
+{
+	ExcelExtendedStringRecord *record;
+	gsf_off_t current_position;
+	gsf_off_t current_record_end;
+
+	/* Record may have changed when we want to read the string contents
+	 *  This is a pretty special case, where the new CONTINUE record
+	 * shouldn't start with a bitmask */
+	if (change_excel_record_if_needed (stream, record_array, p_current_record) &&
+	    *p_current_record >= record_array->len) {
+		/* When reached max number of records, just return */
+		return FALSE;
+	}
+
+	/* Get current record */
+	record = &g_array_index (record_array,
+	                         ExcelExtendedStringRecord,
+	                         *p_current_record);
+
+	/* Compute current position in the stream and end of current record*/
+	current_position = gsf_input_tell (stream);
+	current_record_end = record->offset + record->length;
+
+	/* The best case is when the whole number of bytes to read are in the
+	 * current record, as no record switching is therefore needed */
+	if (current_position + chunk_size <= current_record_end) {
+		return gsf_input_read (stream, chunk_size, buffer) != NULL ? TRUE : FALSE;
+	} else if (current_record_end < current_position) {
+		/* Safety check, actually pretty important */
+		return FALSE;
+	} else {
+		/* Read the string in two chunks */
+		gsize chunk_size_first_record;
+		gsize chunk_size_second_record;
+
+		/* Compute how much to read in each record */
+		chunk_size_first_record = current_record_end - current_position;
+		chunk_size_second_record = chunk_size - chunk_size_first_record;
+
+		/* g_debug ("Current position:      %" GSF_OFF_T_FORMAT, current_position); */
+		/* g_debug ("Current record index:  %u", *p_current_record); */
+		/* g_debug ("Current record start:  %" GSF_OFF_T_FORMAT, record->offset); */
+		/* g_debug ("Current record length: %" G_GSIZE_FORMAT, record->length); */
+		/* g_debug ("Current record end:    %" GSF_OFF_T_FORMAT, current_record_end); */
+		/* g_debug ("Bytes to read:         %" G_GSIZE_FORMAT,   chunk_size); */
+		/* g_debug ("Bytes to read (1st):   %" G_GSIZE_FORMAT,   chunk_size_first_record); */
+		/* g_debug ("Bytes to read (2nd):   %" G_GSIZE_FORMAT,   chunk_size_second_record); */
+
+		/* Now, read from first record... */
+		if (gsf_input_read (stream,
+		                    chunk_size_first_record,
+		                    buffer)) {
+			/* Now switch records and read from the second one... */
+			(*p_current_record)++;
+
+			if (*p_current_record < record_array->len) {
+				record = &g_array_index (record_array,
+				                         ExcelExtendedStringRecord,
+				                         *p_current_record);
+
+				/* g_debug ("New record index:  %u", *p_current_record); */
+				/* g_debug ("New record start:  %" GSF_OFF_T_FORMAT, record->offset); */
+				/* g_debug ("New record length: %" G_GSIZE_FORMAT, record->length); */
+
+				/* Move stream pointer to the new location, beginning of next record */
+				gsf_input_seek (stream, record->offset, G_SEEK_SET);
+
+				/* Every CONTINUE records starts with a bitmask + optional fields that
+				 * should be skipped properly */
+				read_excel_string_flags (stream, NULL, NULL, NULL);
+
+				/* And finally, read the second part */
+				if (gsf_input_read (stream,
+				                    chunk_size_second_record,
+				                    &buffer[chunk_size_first_record])) {
+					/* All OK! */
+					return TRUE;
+				}
+			}
+		}
+
+		return FALSE;
+	}
+}
+
+
 
 /**
+ * [MS-XLS] â?? v20090708
+ * Excel Binary File Format (.xls) Structure Specification
+ * Copyright Â© 2009 Microsoft Corporation.
+ *  Release: Wednesday, July 8, 2009
+ *
  * 2.5.293 XLUnicodeRichExtendedString
  * This structure specifies a Unicode string, which can contain
  * formatting information and phoneticstring data.
@@ -1040,11 +1271,6 @@ extract_msword_content (GsfInfile *infile,
  *                            cch    A B C D reserved2 cRun (optional)
  *               ...                   cbExtRst (optional)
  *               ...                   rgb (variable)
- *                                                        951 / 1165
- * [MS-XLS] â?? v20090708
- * Excel Binary File Format (.xls) Structure Specification
- * Copyright Â© 2009 Microsoft Corporation.
- *  Release: Wednesday, July 8, 2009
  *               ...
  *                         rgRun (variable, optional)
  *               ...
@@ -1094,20 +1320,22 @@ extract_msword_content (GsfInfile *infile,
  * only if fExtSt is 0x1.
  */
 static void
-xls_get_extended_record_string (GsfInput *stream,
-                                GArray   *list,
-                                GString  *content)
+xls_get_extended_record_string (GsfInput  *stream,
+                                GArray    *list,
+                                guint     *p_words_remaining,
+                                gsize     *p_bytes_remaining,
+                                GString  **p_content)
 {
 	ExcelExtendedStringRecord *record;
 	guint32 cst_total;
 	guint32 cst_unique;
-	guint8 parsing_record = 0;
+	guint parsing_record = 0;
 	guint8 tmp_buffer[4] = { 0 };
 	guint i;
+	guint8 *buffer;
+	gsize buffer_size;
 
-	/* g_debug ("#Entering extract_est_string #"); */
-
-	/* Parsing the record from the list*/
+	/* Parsing the record from the list */
 	record = &g_array_index (list, ExcelExtendedStringRecord, parsing_record);
 
 	/* First record parsing */
@@ -1115,7 +1343,15 @@ xls_get_extended_record_string (GsfInput *stream,
 		return;
 	}
 
-	parsing_record++;
+	/* Note: The first record is ALWAYS the SST, so coming with cst_total and
+	 * cst_unique values.
+	 * Some extra background: Records with data longer than 8,224 bytes MUST be
+	 * split into several records, so in this case, if the SST record is big
+	 * enough, it will have one or more CONTINUE records
+	 *
+	 * SST record: http://msdn.microsoft.com/en-us/library/dd773037%28v=office.12%29.aspx
+	 * CONTINUE record: http://msdn.microsoft.com/en-us/library/dd949081%28v=office.12%29.aspx
+	 **/
 
 	/* Reading cst total */
 	gsf_input_read (stream, 4, tmp_buffer);
@@ -1125,118 +1361,107 @@ xls_get_extended_record_string (GsfInput *stream,
 	gsf_input_read (stream, 4, tmp_buffer);
 	cst_unique = read_32bit (tmp_buffer);
 
-	/* g_debug ("cst_total :%d,cst_unique %d ",cst_total,cst_unique); */
-
-	for (i = 0; i < cst_unique; i++) {
+	/* Iterate over chunks...
+	 *   Loop is halted whenever one of this conditions is met:
+	 *     a) Max bytes to be read reached
+	 *     b) Already read up to the max number of words configured
+	 *     c) No more chunks to read
+	 */
+	i = 0;
+	while (*p_words_remaining > 0 &&
+	       *p_bytes_remaining > 0 &&
+	       i < cst_unique) {
 		guint16 cch;
 		guint16 c_run;
 		guint16 cb_ext_rst;
-		guint8 bit_mask;
-		guint char_index;
 		gboolean is_high_byte;
-		gboolean is_ext_string;
-		gboolean is_rich_string;
+		gsize chunk_size;
 
-		/* Switching the stream */
-		if (gsf_input_tell (stream) >= (record->offset + record->length)) {
-			if (parsing_record < list->len) {
-				record = &g_array_index (list, ExcelExtendedStringRecord, parsing_record);
-				gsf_input_seek (stream, record->offset, G_SEEK_SET);
-				parsing_record++;
-			} else {
-				break;
-			}
+		/* RECORD may have been changed here */
+		if (change_excel_record_if_needed (stream, list, &parsing_record) &&
+		    parsing_record >= list->len) {
+			/* When reached max number of records, stop loop */
+			break;
 		}
 
-		/* Resetting record format values */
+		/* Reading 2 bytes for cch */
+		gsf_input_read (stream, 2, tmp_buffer);
 
-		/* Reading 3 Btyes 2 bytes for cch and 1 byte for mask */
-		gsf_input_read (stream, 3, tmp_buffer);
 		/* Reading cch - char count of current string */
 		cch = read_16bit (tmp_buffer);
 
-		/* Get bitMask */
-		bit_mask = read_8bit (tmp_buffer + 2);
-		/* g_debug ("cch: %d, bit_mask: 0x%x", cch, bit_mask); */
+		/* Read string flags */
+		read_excel_string_flags (stream,
+		                         &is_high_byte,
+		                         &c_run,
+		                         &cb_ext_rst);
 
-		/* is big and litte endian problem effect this ? */
-		is_high_byte = (bit_mask & 0x01) == 0x01;
-		is_ext_string = (bit_mask & 0x04) == 0x04;
-		is_rich_string = (bit_mask & 0x08) == 0x08;
+		/* RECORD may have been changed here, but it is managed when reading the
+		 *  string contents */
 
-		if (is_rich_string) {
-			/* Reading 2 Btyes */
-			gsf_input_read (stream, 2, tmp_buffer);
-			/* Reading cRun */
-			c_run = read_16bit (tmp_buffer);
-		} else {
-			c_run = 0;
-		}
 
-		if (is_ext_string) {
-			/* Reading 4 Btyes */
-			gsf_input_read (stream, 4, tmp_buffer);
-			/* Reading cRun */
-			cb_ext_rst = read_16bit (tmp_buffer);
-		} else {
-			cb_ext_rst = 0;
-		}
+		/* NOTE: In order to avoid reading unnecessary bytes, limit it based
+		 * on the number of bytes remaining */
+		chunk_size = MIN (cch, *p_bytes_remaining);
 
-		/* Switching the stream */
-		if (gsf_input_tell (stream) >= (record->offset + record->length)) {
-			if (parsing_record < list->len) {
-				record = &g_array_index (list, ExcelExtendedStringRecord, parsing_record);
-				gsf_input_seek (stream, record->offset, G_SEEK_SET);
-				parsing_record++;
-			} else {
-				break;
-			}
+		/* If High Byte, chunk size *2 as stream is in UTF-16 */
+		if (is_high_byte) {
+			chunk_size *= 2;
 		}
 
-		/* Reading string */
-		for (char_index = 0; char_index < cch; char_index++) {
-			/* Note everytime we need to reset the buffer
-			 * that why declaring inside the for loop
-			 */
-			gchar buffer[4] = { 0 };
-
-			if (is_high_byte) {
-				/* Reading two byte */
-				gsf_input_read (stream, 2, buffer);
-				g_string_append (content, (gchar*) buffer);
-			} else {
-				/* Reading one byte */
-				gsf_input_read (stream, 1, buffer);
-				g_string_append_c (content, (gchar) buffer[0]);
-			}
+		/* If the new chunk size is longer than our reused buffer,
+		 * make the buffer bigger */
+		if (chunk_size > buffer_size) {
+			buffer = g_realloc (buffer, chunk_size);
+			buffer_size = chunk_size;
 		}
 
-		g_string_append (content, " ");
+		/* Read the chunk! NOTE that it may be split in several records... */
+		if (!read_excel_string (stream, buffer, chunk_size, list, &parsing_record)) {
+			break;
+		}
 
-		/* g_debug ("cRun %d cb_ext_rst %d", c_run, cb_ext_rst); */
+		/* Read whole stream in one operation */
+		read_convert_and_normalize_chunk (buffer,
+		                                  chunk_size,
+		                                  !is_high_byte,
+		                                  p_words_remaining,
+		                                  p_bytes_remaining,
+		                                  p_content);
 
 		/* Formatting string */
-		if (is_rich_string) {
+		if (c_run > 0) {
 			/* rgRun (variable): An optional array of
 			 * FormatRun structures that specifies the
 			 * formatting for each ext run. The number of
 			 * elements in the array is cRun. MUST exist
 			 * if and only if fRichSt is 0x1.
 			 *
+			 * Note: As defined in MSDN, a FormatRun structure has a size
+			 *  of 4 bytes, so the size of this rgRun variable is really
+			 *  (4*cRun) bytes.
+			 *  http://msdn.microsoft.com/en-us/library/dd921712.aspx
+			 *
 			 * Skiping this as it will not be useful in
 			 * our case.
 			 */
-			gsf_input_seek (stream, c_run, G_SEEK_CUR);
+			gsf_input_seek (stream, 4 * c_run, G_SEEK_CUR);
+			/* Note that we may be now out of the current record after having
+			 * done this seek operation. */
 		}
 
 		/* ExtString */
-		if (is_ext_string) {
+		if (cb_ext_rst > 0) {
 			/* Again its not so clear may be it will not
 			 * useful in our case.
 			 */
 			gsf_input_seek (stream, cb_ext_rst, G_SEEK_CUR);
-
+			/* Note that we may be now out of the current record after having
+			 * done this seek operation. */
 		}
+
+		/* Go to next chunk */
+		i++;
 	}
 }
 
@@ -1244,6 +1469,7 @@ xls_get_extended_record_string (GsfInput *stream,
  * @brief Extract excel content from specified infile
  * @param infile file to read summary from
  * @param n_words number of max words to extract
+ * @param n_bytes max number of bytes to extract
  * @param is_encrypted
  * @Notes :- About SST record
  *
@@ -1277,13 +1503,15 @@ xls_get_extended_record_string (GsfInput *stream,
 static gchar*
 extract_excel_content (GsfInfile *infile,
                        gint       n_words,
+                       gsize      n_bytes,
                        gboolean  *is_encrypted)
 {
 	ExcelBiffHeader header1;
-	GString *content;
+	GString *content = NULL;
 	GsfInput *stream;
-	gchar *normalized;
 	guint saved_offset;
+	guint n_words_remaining = n_words;
+	gsize n_bytes_remaining = n_bytes;
 
 	stream = gsf_infile_child_by_name (infile, "Workbook");
 
@@ -1291,12 +1519,11 @@ extract_excel_content (GsfInfile *infile,
 		return NULL;
 	}
 
-	content = g_string_new ("");
-
-	/* Read until we reach eof. */
-	while (!gsf_input_eof (stream)) {
+	/* Read until we reach eof or any of our limits reached */
+	while (n_words_remaining > 0 &&
+	       n_bytes_remaining > 0 &&
+	       !gsf_input_eof (stream)) {
 		guint8 tmp_buffer[4] = { 0 };
-		guint8 *data = NULL;
 
 		/* Reading 4 bytes to read header */
 		gsf_input_read (stream, 4, tmp_buffer);
@@ -1341,7 +1568,7 @@ extract_excel_content (GsfInfile *infile,
 			 * Note: we are justing parsing notrequired
 			 * to read data so passing null data
 			 */
-			gsf_input_read (stream, length, data);
+			gsf_input_seek (stream, length, G_SEEK_CUR);
 
 			/* Reading & Assigning biff header 4 bytes */
 			gsf_input_read (stream, 4, tmp_buffer);
@@ -1364,7 +1591,7 @@ extract_excel_content (GsfInfile *infile,
 				/*           record.offset, record.length); */
 
 				/* Then parse the data from the stream */
-				gsf_input_read (stream, header2.length, data);
+				gsf_input_seek (stream, header2.length, G_SEEK_CUR);
 
 				/* Reading and assigning biff header */
 				gsf_input_read (stream, 4, tmp_buffer);
@@ -1375,7 +1602,11 @@ extract_excel_content (GsfInfile *infile,
 			};
 
 			/* Read extended string */
-			xls_get_extended_record_string (stream, list, content);
+			xls_get_extended_record_string (stream,
+			                                list,
+			                                &n_words_remaining,
+			                                &n_bytes_remaining,
+			                                &content);
 
 			g_array_unref (list);
 
@@ -1392,10 +1623,11 @@ extract_excel_content (GsfInfile *infile,
 
 	g_object_unref (stream);
 
-	normalized = tracker_text_normalize (content->str, n_words, NULL);
-	g_string_free (content, TRUE);
+	g_debug ("Words normalized: %u, Bytes: %" G_GSIZE_FORMAT,
+	         n_words - n_words_remaining,
+	         n_bytes - n_bytes_remaining);
 
-	return normalized;
+	return content ? g_string_free (content, FALSE) : NULL;
 }
 
 /**
@@ -1549,9 +1781,8 @@ extract_msoffice (const gchar          *uri,
 		 *  TODO: Limit max bytes to read */
 		content = extract_powerpoint_content (infile, max_words, &is_encrypted);
 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-excel") == 0) {
-		/* Excel File
-		 *  TODO: Limit max bytes to read */
-		content = extract_excel_content (infile, max_words, &is_encrypted);
+		/* Excel File */
+		content = extract_excel_content (infile, max_words, max_bytes, &is_encrypted);
 	} else {
 		g_message ("Mime type was not recognised:'%s'", mime_used);
 	}
@@ -1934,17 +2165,17 @@ parse_xml_contents (const gchar *file_uri,
 	if ((filename = g_filename_from_uri (file_uri,
 	                                     NULL, &error)) == NULL) {
 		g_warning ("Can't get filename from uri '%s': %s",
-		           file_uri, error ? error->message : NULL);
+		           file_uri, error ? error->message : "no error given");
 	}
 	/* Create a new Input GSF object for the given file */
 	else if ((src = gsf_input_stdio_new (filename, &error)) == NULL) {
 		g_warning ("Failed creating a GSF Input object for '%s': %s",
-		           filename, error ? error->message : NULL);
+		           filename, error ? error->message : "no error given");
 	}
 	/* Input object is a Zip file */
 	else if ((infile = gsf_infile_zip_new (src, &error)) == NULL) {
 		g_warning ("'%s' Not a zip file: %s",
-		           filename, error ? error->message : NULL);
+		           filename, error ? error->message : "no error given");
 	}
 	/* Look for requested filename inside the ZIP file */
 	else if ((member = find_member (infile, xml_filename)) == NULL) {
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]