[tracker/tracker-0.8] Fixes GB#615948 - Improved reading msoffice/xml files
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/tracker-0.8] Fixes GB#615948 - Improved reading msoffice/xml files
- Date: Thu, 22 Apr 2010 11:27:31 +0000 (UTC)
commit 67f67fc4d095c1474eeb36e42b4608068fa73ee0
Author: Aleksander Morgado <aleksander lanedo com>
Date: Fri Apr 16 12:15:48 2010 +0200
Fixes GB#615948 - Improved reading msoffice/xml files
* Don't store the whole output of libgsf on the heap; use a fixed-size
buffer on the stack instead, and read and parse the contents in chunks.
* Limit the number of bytes that can be read from the uncompressed XML
file to a maximum of 20 MBytes.
src/tracker-extract/tracker-extract-msoffice.c | 70 ++++++++++++++----------
1 files changed, 40 insertions(+), 30 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index f99d2c5..355b92e 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -1912,17 +1912,24 @@ find_member (GsfInfile *arch,
}
-static gchar *
-load_xml_contents (const gchar *file_uri,
- const gchar *xml_filename)
+#define XML_BUFFER_SIZE 8192 /* bytes */
+/* Note: 20 MBytes of max size is really assumed to be a safe limit. */
+#define XML_MAX_BYTES_READ (20u << 20) /* bytes */
+
+static void
+parse_xml_contents (const gchar *file_uri,
+ const gchar *xml_filename,
+ GMarkupParseContext *context)
{
gchar *filename;
- gchar *xml = NULL;
GError *error = NULL;
GsfInfile *infile = NULL;;
GsfInput *src = NULL;
GsfInput *member = NULL;
+ g_debug ("Parsing '%s' XML file from '%s' zip archive...",
+ xml_filename, file_uri);
+
/* Get filename from the given URI */
if ((filename = g_filename_from_uri (file_uri,
NULL, &error)) == NULL) {
@@ -1946,21 +1953,30 @@ load_xml_contents (const gchar *file_uri,
}
/* Load whole contents of the internal file in the xml buffer */
else {
- size_t size;
+ guint8 buf[XML_BUFFER_SIZE];
+ size_t remaining_size, chunk_size, accum;
+
/* Get whole size of the contents to read */
- size = (size_t) gsf_input_size (GSF_INPUT (member));
-
- /* Allocate buffer to return, and make sure it will be
- * NIL-terminated */
- xml = g_malloc (size + 1);
- xml [size] = '\0';
-
- /* And read all the bytes in one operation */
- if(gsf_input_read (GSF_INPUT (member), size, xml) == NULL) {
- g_warning ("Couldn't read '%u' bytes from '%s'",
- size, xml_filename);
- g_free (xml);
- xml = NULL;
+ remaining_size = (size_t) gsf_input_size (GSF_INPUT (member));
+
+ /* Note that gsf_input_read() needs to be able to read ALL specified
+ * number of bytes, or it will fail */
+ chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
+
+ accum = 0;
+ while (accum <= XML_MAX_BYTES_READ &&
+ chunk_size > 0 &&
+ gsf_input_read (GSF_INPUT (member), chunk_size, buf) != NULL) {
+
+ /* update accumulated count */
+ accum += chunk_size;
+
+ /* Pass the read stream to the context parser... */
+ g_markup_parse_context_parse (context, buf, chunk_size, NULL);
+
+ /* update bytes to be read */
+ remaining_size -= chunk_size;
+ chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
}
}
@@ -1975,8 +1991,6 @@ load_xml_contents (const gchar *file_uri,
g_object_unref (src);
if (member)
g_object_unref (member);
-
- return xml;
}
@@ -2036,13 +2050,10 @@ xml_read (MsOfficeXMLParserInfo *parser_info,
}
if (context) {
- gchar *xml = load_xml_contents (parser_info->uri,
- xml_filename);
-
- g_markup_parse_context_parse (context, xml, -1, NULL);
+ /* Load the internal XML file from the Zip archive, and parse it
+ * using the given context */
+ parse_xml_contents (parser_info->uri, xml_filename, context);
g_markup_parse_context_free (context);
-
- g_free (xml);
}
return TRUE;
@@ -2129,7 +2140,6 @@ extract_msoffice_xml (const gchar *uri,
NULL,
NULL
};
- gchar *xml = NULL;
const gchar *mime_used;
file = g_file_new_for_uri (uri);
@@ -2183,9 +2193,9 @@ extract_msoffice_xml (const gchar *uri,
info.content = g_string_new ("");
context = g_markup_parse_context_new (&parser, 0, &info, NULL);
- xml = load_xml_contents (uri, "[Content_Types].xml");
- g_markup_parse_context_parse (context, xml, -1, NULL);
- g_free (xml);
+ /* Load the internal XML file from the Zip archive, and parse it
+ * using the given context */
+ parse_xml_contents (uri, "[Content_Types].xml", context);
if (info.content) {
gchar *content;
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]