[tracker] tracker-extract: Cleaned up MSOffice code



commit e80b15afe188fb144b95950f0de8ebb8bd68bbf9
Author: Martyn Russell <martyn lanedo com>
Date:   Thu Mar 4 10:48:42 2010 +0000

    tracker-extract: Cleaned up MSOffice code

 src/tracker-extract/tracker-extract-msoffice.c |  790 ++++++++++++------------
 1 files changed, 379 insertions(+), 411 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 8ba683e..dee4aa1 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -42,38 +42,68 @@
 
 #include "tracker-main.h"
 
-/*
- * Powerpoint files comprise of structures. Each structure contains a header.
- * Within that header is a record type that specifies what strcture it is. It is
- * called record type.
+/* Powerpoint files consist of structures. Each structure contains a
+ * header. Within that header is a record type that specifies what
+ * structure it is. It is called the record type.
  *
- * Here are are some record types and description of the structure (called atom)
- * they contain.
+ * Here are some record types and a description of the structure
+ * (called an atom) that each one contains.
  */
 
-/*
- * An atom record that specifies Unicode characters with no high byte of a UTF-16
- * Unicode character. High byte is always 0.
+/* An atom record that specifies Unicode characters with no high byte
+ * of a UTF-16 Unicode character. High byte is always 0.
  */
-#define TEXTBYTESATOM_RECORD_TYPE               0x0FA0
+#define TEXTBYTESATOM_RECORD_TYPE      0x0FA0
 
-/*
- * An atom record that specifies Unicode characters.
+/* An atom record that specifies Unicode characters. */
+#define TEXTCHARSATOM_RECORD_TYPE      0x0FA8
+
+/* A container record that specifies information about the powerpoint
+ * document.
  */
-#define TEXTCHARSATOM_RECORD_TYPE               0x0FA8
+#define DOCUMENTCONTAINER_RECORD_TYPE  0x03E8
 
-/*
- * A container record that specifies information about the powerpoint document.
+/* Variant type of record. Within Powerpoint text extraction we are
+ * interested in the SlideListWithTextContainer type that contains the
+ * textual content of the slide(s).
+ *
  */
-#define DOCUMENTCONTAINER_RECORD_TYPE           0x03E8
+#define SLIDELISTWITHTEXT_RECORD_TYPE  0x0FF0
 
-/*
- * Variant type of record. Within Powerpoint text extraction we are interested
- * of SlideListWithTextContainer type that contains the textual content
- * of the slide(s).
+/**
+ * @brief Header for all powerpoint structures
  *
+ * A structure at the beginning of each container record and each atom record in
+ * the file. The values in the record header and the context of the record are
+ * used to identify and interpret the record data that follows.
  */
-#define SLIDELISTWITHTEXT_RECORD_TYPE           0x0FF0
+typedef struct {
+	/**
+	 * @brief An unsigned integer that specifies the version of the record
+	 * data that follows the record header. A value of 0xF specifies that the
+	 * record is a container record.
+	 */
+	guint recVer;
+
+	/**
+	 * @brief An unsigned integer that specifies the record instance data.
+	 * Interpretation of the value is dependent on the particular record
+	 * type.
+	 */
+	guint recInstance;
+
+	/**
+	 * @brief A RecordType enumeration that specifies the type of the record
+	 * data that follows the record header.
+	 */
+	gint recType;
+
+	/**
+	 * @brief An unsigned integer that specifies the length, in bytes, of the
+	 * record data that follows the record header.
+	 */
+	guint recLen;
+} PowerPointRecordHeader;
 
 typedef enum {
 	TAG_TYPE_INVALID,
@@ -114,41 +144,41 @@ typedef struct {
 	GString *content;
 } MsOfficeXMLParserInfo;
 
-static void extract_msoffice            (const gchar          *uri,
-                                         TrackerSparqlBuilder *preupdate,
-                                         TrackerSparqlBuilder *metadata);
-static void extract_powerpoint          (const gchar          *uri,
-                                         TrackerSparqlBuilder *preupdate,
-                                         TrackerSparqlBuilder *metadata);
-static void extract_msoffice_xml_format (const gchar          *uri,
-                                         TrackerSparqlBuilder *preupdate,
-                                         TrackerSparqlBuilder *metadata);
+typedef struct {
+	TrackerSparqlBuilder *metadata;
+	const gchar *uri;
+} MetadataInfo;
+
+static void extract_msoffice     (const gchar          *uri,
+                                  TrackerSparqlBuilder *preupdate,
+                                  TrackerSparqlBuilder *metadata);
+static void extract_msoffice_xml (const gchar          *uri,
+                                  TrackerSparqlBuilder *preupdate,
+                                  TrackerSparqlBuilder *metadata);
+static void extract_ppt          (const gchar          *uri,
+                                  TrackerSparqlBuilder *preupdate,
+                                  TrackerSparqlBuilder *metadata);
 
 static TrackerExtractData data[] = {
 	{ "application/msword",            extract_msoffice },
 	/* Powerpoint files */
-	{ "application/vnd.ms-powerpoint", extract_powerpoint },
+	{ "application/vnd.ms-powerpoint", extract_ppt },
 	{ "application/vnd.ms-*",          extract_msoffice },
 	/* MSoffice2007*/
-	{ "application/vnd.openxmlformats-officedocument.presentationml.presentation", extract_msoffice_xml_format },
-	{ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",         extract_msoffice_xml_format },
-	{ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",   extract_msoffice_xml_format },
+	{ "application/vnd.openxmlformats-officedocument.presentationml.presentation", extract_msoffice_xml },
+	{ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",         extract_msoffice_xml },
+	{ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",   extract_msoffice_xml },
 	{ NULL, NULL }
 };
 
-typedef struct {
-	TrackerSparqlBuilder *metadata;
-	const gchar *uri;
-} ForeachInfo;
-
 static void
-add_gvalue_in_metadata (TrackerSparqlBuilder *metadata,
-                        const gchar          *uri,
-                        const gchar          *key,
-                        GValue const         *val,
-                        const gchar          *type,
-                        const gchar          *predicate,
-                        gboolean              is_date)
+metadata_add_gvalue (TrackerSparqlBuilder *metadata,
+                     const gchar          *uri,
+                     const gchar          *key,
+                     GValue const         *val,
+                     const gchar          *type,
+                     const gchar          *predicate,
+                     gboolean              is_date)
 {
 	gchar *s;
 
@@ -169,7 +199,8 @@ add_gvalue_in_metadata (TrackerSparqlBuilder *metadata,
 		gchar *str_val;
 
 		/* Some fun: strings are always written "str" with double quotes
-		 * around, but not numbers! */
+		 * around, but not numbers!
+		 */
 		if (s[0] == '"') {
 			size_t len;
 
@@ -184,24 +215,28 @@ add_gvalue_in_metadata (TrackerSparqlBuilder *metadata,
 					} else {
 						str_val = NULL;
 					}
-				} else
-					str_val = (len > 2 ? g_strndup (s + 1, len - 2) : NULL); 
+				} else {
+					str_val = len > 2 ? g_strndup (s + 1, len - 2) : NULL;
+				}
 			} else {
 				/* We have a string that begins with a double
 				 * quote but which finishes by something
 				 * different... We copy the string from the
-				 * beginning. */
-				if (is_date)
+				 * beginning.
+				 */
+				if (is_date) {
 					str_val = tracker_extract_guess_date (s);
-				else
+				} else {
 					str_val = g_strdup (s);
+				}
 			}
 		} else {
 			/* Here, we probably have a number */
-			if (is_date)
+			if (is_date) {
 				str_val = tracker_extract_guess_date (s);
-			else
+			} else {
 				str_val = g_strdup (s);
+			}
 		}
 
 		if (str_val) {
@@ -228,83 +263,75 @@ add_gvalue_in_metadata (TrackerSparqlBuilder *metadata,
 }
 
 static void
-metadata_cb (gpointer key,
-             gpointer value,
-             gpointer user_data)
+summary_metadata_cb (gpointer key,
+                     gpointer value,
+                     gpointer user_data)
 {
-	ForeachInfo          *info = user_data;
-	gchar                *name;
-	GsfDocProp           *property;
-	TrackerSparqlBuilder *metadata = info->metadata;
-	GValue const         *val;
-	const gchar          *uri = info->uri;
-
-	name = key;
-	property = value;
-	metadata = info->metadata;
-	val = gsf_doc_prop_get_val (property);
-
-	if (g_strcmp0 (name, "dc:title") == 0) {
-		add_gvalue_in_metadata (metadata, uri, "nie:title", val, NULL, NULL, FALSE);
-	} else if (g_strcmp0 (name, "dc:subject") == 0) {
-		add_gvalue_in_metadata (metadata, uri, "nie:subject", val, NULL, NULL, FALSE);
-	} else if (g_strcmp0 (name, "dc:creator") == 0) {
-		add_gvalue_in_metadata (metadata, uri, "nco:creator", val, "nco:Contact", "nco:fullname", FALSE);
-	} else if (g_strcmp0 (name, "dc:keywords") == 0) {
+	MetadataInfo *info = user_data;
+	GValue const *val;
+
+	val = gsf_doc_prop_get_val (value);
+
+	if (g_strcmp0 (key, "dc:title") == 0) {
+		metadata_add_gvalue (info->metadata, info->uri, "nie:title", val, NULL, NULL, FALSE);
+	} else if (g_strcmp0 (key, "dc:subject") == 0) {
+		metadata_add_gvalue (info->metadata, info->uri, "nie:subject", val, NULL, NULL, FALSE);
+	} else if (g_strcmp0 (key, "dc:creator") == 0) {
+		metadata_add_gvalue (info->metadata, info->uri, "nco:creator", val, "nco:Contact", "nco:fullname", FALSE);
+	} else if (g_strcmp0 (key, "dc:keywords") == 0) {
 		gchar *keywords = g_strdup_value_contents (val);
-		char  *lasts, *keyw;
+		gchar *lasts, *keyw;
 		size_t len;
 
 		keyw = keywords;
 		keywords = strchr (keywords, '"');
-		if (keywords)
+
+		if (keywords) {
 			keywords++;
-		else
+		} else {
 			keywords = keyw;
+		}
 
 		len = strlen (keywords);
-		if (keywords[len - 1] == '"')
+		if (keywords[len - 1] == '"') {
 			keywords[len - 1] = '\0';
+		}
 
 		for (keyw = strtok_r (keywords, ",; ", &lasts); keyw;
 		     keyw = strtok_r (NULL, ",; ", &lasts)) {
-			tracker_sparql_builder_predicate (metadata, "nie:keyword");
-			tracker_sparql_builder_object_unvalidated (metadata, keyw);
+			tracker_sparql_builder_predicate (info->metadata, "nie:keyword");
+			tracker_sparql_builder_object_unvalidated (info->metadata, keyw);
 		}
 
 		g_free (keyw);
-	} else if (g_strcmp0 (name, "dc:description") == 0) {
-		add_gvalue_in_metadata (metadata, uri, "nie:comment", val, NULL, NULL, FALSE);
-	} else if (g_strcmp0 (name, "gsf:page-count") == 0) {
-		add_gvalue_in_metadata (metadata, uri, "nfo:pageCount", val, NULL, NULL, FALSE);
-	} else if (g_strcmp0 (name, "gsf:word-count") == 0) {
-		add_gvalue_in_metadata (metadata, uri, "nfo:wordCount", val, NULL, NULL, FALSE);
-	} else if (g_strcmp0 (name, "meta:creation-date") == 0) {
-		add_gvalue_in_metadata (metadata, uri, "nie:contentCreated", val, NULL, NULL, TRUE);
-	} else if (g_strcmp0 (name, "meta:generator") == 0) {
-		add_gvalue_in_metadata (metadata, uri, "nie:generator", val, NULL, NULL, FALSE);
+	} else if (g_strcmp0 (key, "dc:description") == 0) {
+		metadata_add_gvalue (info->metadata, info->uri, "nie:comment", val, NULL, NULL, FALSE);
+	} else if (g_strcmp0 (key, "gsf:page-count") == 0) {
+		metadata_add_gvalue (info->metadata, info->uri, "nfo:pageCount", val, NULL, NULL, FALSE);
+	} else if (g_strcmp0 (key, "gsf:word-count") == 0) {
+		metadata_add_gvalue (info->metadata, info->uri, "nfo:wordCount", val, NULL, NULL, FALSE);
+	} else if (g_strcmp0 (key, "meta:creation-date") == 0) {
+		metadata_add_gvalue (info->metadata, info->uri, "nie:contentCreated", val, NULL, NULL, TRUE);
+	} else if (g_strcmp0 (key, "meta:generator") == 0) {
+		metadata_add_gvalue (info->metadata, info->uri, "nie:generator", val, NULL, NULL, FALSE);
 	}
 }
 
 static void
-doc_metadata_cb (gpointer key,
-                 gpointer value,
-                 gpointer user_data)
+document_metadata_cb (gpointer key,
+                      gpointer value,
+                      gpointer user_data)
 {
-	ForeachInfo          *info = user_data;
-	gchar                *name;
-	GsfDocProp           *property;
-	TrackerSparqlBuilder *metadata = info->metadata;
-	GValue const         *val;
-	const gchar          *uri = info->uri;
-
-	name = key;
-	property = value;
-	metadata = user_data;
-	val = gsf_doc_prop_get_val (property);
-
-	if (g_strcmp0 (name, "CreativeCommons_LicenseURL") == 0) {
-		add_gvalue_in_metadata (metadata, uri, "nie:license", val, NULL, NULL, FALSE);
+	if (g_strcmp0 (key, "CreativeCommons_LicenseURL") == 0) {
+		MetadataInfo *info = user_data;
+
+		metadata_add_gvalue (info->metadata,
+		                     info->uri,
+		                     "nie:license",
+		                     gsf_doc_prop_get_val (value),
+		                     NULL,
+		                     NULL,
+		                     FALSE);
 	}
 }
 
@@ -331,59 +358,25 @@ read_32bit (const guint8* buffer)
 }
 
 /**
- * @brief Header for all powerpoint structures
- *
- * A structure at the beginning of each container record and each atom record in
- * the file. The values in the record header and the context of the record are
- * used to identify and interpret the record data that follows.
- */
-typedef struct {
-	/**
-	 * @brief An unsigned integer that specifies the version of the record
-	 * data that follows the record header. A value of 0xF specifies that the
-	 * record is a container record.
-	 */
-	guint recVer;
-
-	/**
-	 * @brief An unsigned integer that specifies the record instance data.
-	 * Interpretation of the value is dependent on the particular record
-	 * type.
-	 */
-	guint recInstance;
-
-	/**
-	 * @brief A RecordType enumeration that specifies the type of the record
-	 * data that follows the record header.
-	 */
-	gint recType;
-
-	/**
-	 * @brief An unsigned integer that specifies the length, in bytes, of the
-	 * record data that follows the record header.
-	 */
-	guint recLen;
-}RecordHeader;
-
-/**
  * @brief Read header data from given stream
  * @param stream Stream to read header data
  * @param header Pointer to header where to store results
  */
 static gboolean
-read_header (GsfInput *stream, RecordHeader *header) {
+read_header (GsfInput               *stream,
+             PowerPointRecordHeader *header)
+{
 	guint8 buffer[8] = {0};
 
-	g_return_val_if_fail(stream,FALSE);
-	g_return_val_if_fail(header,FALSE);
-	g_return_val_if_fail(!gsf_input_eof(stream),FALSE);
+	g_return_val_if_fail (stream, FALSE);
+	g_return_val_if_fail (header, FALSE);
+	g_return_val_if_fail (!gsf_input_eof (stream), FALSE);
 
 
 	/* Header is always 8 bytes, read it */
-	g_return_val_if_fail(gsf_input_read(stream,8,buffer),FALSE);
+	g_return_val_if_fail (gsf_input_read (stream, 8, buffer), FALSE);
 
-	/*
-	 * Then parse individual details
+	/* Then parse individual details
 	 *
 	 * Record header is 8 bytes long. Data is split as follows:
 	 * recVer (4 bits)
@@ -396,10 +389,10 @@ read_header (GsfInput *stream, RecordHeader *header) {
 	 * Here we parse each of those fields.
 	 */
 
-	header->recType = read_16bit(&buffer[2]);
-	header->recLen = read_32bit(&buffer[4]);
-	header->recVer = (read_16bit(buffer) & 0xF000) >> 12;
-	header->recInstance = read_16bit(buffer) & 0x0FFF;
+	header->recType = read_16bit (&buffer[2]);
+	header->recLen = read_32bit (&buffer[4]);
+	header->recVer = (read_16bit (buffer) & 0xF000) >> 12;
+	header->recInstance = read_16bit (buffer) & 0x0FFF;
 
 	return TRUE;
 }
@@ -426,20 +419,18 @@ read_header (GsfInput *stream, RecordHeader *header) {
 static gchar*
 read_text (GsfInput *stream)
 {
-	gint          i = 0;
-	RecordHeader  header;
-	guint8       *data = NULL;
+	gint i = 0;
+	PowerPointRecordHeader header;
+	guint8 *data = NULL;
 
-	g_return_val_if_fail (stream,NULL);
+	g_return_val_if_fail (stream, NULL);
 
-	/*
-	 * First read the header that describes the structures type
+	/* First read the header that describes the structure's type
 	 * (TextBytesAtom or TextCharsAtom) and it's length.
 	 */
-	g_return_val_if_fail (read_header(stream, &header),NULL);
+	g_return_val_if_fail (read_header (stream, &header),NULL);
 
-	/*
-	 * We only want header with type either TEXTBYTESATOM_RECORD_TYPE
+	/* We only want header with type either TEXTBYTESATOM_RECORD_TYPE
 	 * (TextBytesAtom) or TEXTCHARSATOM_RECORD_TYPE (TextCharsAtom).
 	 *
 	 * We don't care about anything else
@@ -451,8 +442,7 @@ read_text (GsfInput *stream)
 
 	/* Then we'll allocate data for the actual texts */
 	if (header.recType == TEXTBYTESATOM_RECORD_TYPE) {
-		/*
-		 * TextBytesAtom doesn't include high bytes propably in order to
+		/* TextBytesAtom doesn't include high bytes, probably in order to
 		 * save space on the ppt files. We'll have to allocate double the
 		 * size for it to get the high bytes
 		 */
@@ -461,35 +451,31 @@ read_text (GsfInput *stream)
 		data = g_try_new0 (guint8,header.recLen);
 	}
 
-	g_return_val_if_fail (data,NULL);
+	g_return_val_if_fail (data, NULL);
 
 	/* Then read the textual data from the stream */
-	if (!gsf_input_read (stream,header.recLen,data)) {
+	if (!gsf_input_read (stream, header.recLen, data)) {
 		g_free (data);
 		return NULL;
 	}
 
-
-	/*
-	 * Again if we are reading TextBytesAtom we'll need to add those utf16
+	/* Again if we are reading TextBytesAtom we'll need to add those utf16
 	 * high bytes ourselves. They are zero as specified in [MS-PPT].pdf
 	 * and this function's comments
 	 */
 	if (header.recType == TEXTBYTESATOM_RECORD_TYPE) {
-		for(i = 0; i < header.recLen; i++) {
-
-			/*
-			 * We'll add an empty 0 byte between each byte in the
+		for (i = 0; i < header.recLen; i++) {
+			/* We'll add an empty 0 byte between each byte in the
 			 * array
 			 */
 			data[(header.recLen - i - 1) * 2] = data[header.recLen - i - 1];
+
 			if ((header.recLen - i - 1) % 2) {
 				data[header.recLen - i - 1] = 0;
 			}
 		}
 
-		/*
-		 * Then double the recLen now that we have the high bytes added
+		/* Then double the recLen now that we have the high bytes added
 		 * between read bytes
 		 */
 		header.recLen *= 2;
@@ -515,27 +501,21 @@ seek_header (GsfInput *stream,
              gint      type2,
              gboolean  rewind)
 {
-	RecordHeader header;
+	PowerPointRecordHeader header;
 
-	g_return_val_if_fail(stream,FALSE);
+	g_return_val_if_fail (stream,FALSE);
 
 	/*
 	 * Read until we reach eof
 	 */
-	while(!gsf_input_eof(stream)) {
+	while (!gsf_input_eof (stream)) {
 
-		/*
-		 * Read first header
-		 */
+		/* Read first header */
 		g_return_val_if_fail(read_header(stream, &header),FALSE);
 
-		/*
-		 * Check if it's the correct type
-		 */
+		/* Check if it's the correct type */
 		if (header.recType == type1 || header.recType == type2) {
-
-			/*
-			 * Sometimes it's needed to rewind to the start of the
+			/* Sometimes it's needed to rewind to the start of the
 			 * header
 			 */
 			if (rewind) {
@@ -544,14 +524,13 @@ seek_header (GsfInput *stream,
 			return TRUE;
 		}
 
-		/*
-		 * If it's not the correct type, seek to the beginning of the
+		/* If it's not the correct type, seek to the beginning of the
 		 * next header
 		 */
-		g_return_val_if_fail(!gsf_input_seek(stream,
-		                                     header.recLen,
-		                                     G_SEEK_CUR),
-		                     FALSE);
+		g_return_val_if_fail (!gsf_input_seek (stream,
+		                                       header.recLen,
+		                                       G_SEEK_CUR),
+		                      FALSE);
 	}
 
 	return FALSE;
@@ -566,70 +545,64 @@ seek_header (GsfInput *stream,
  * @return number of words appended to all_text
  */
 static gint
-append_text (gchar   *text,
-             GString *all_texts,
-             gint     words,
-             gint     max_words)
+ppt_append_text (gchar   *text,
+                 GString *all_texts,
+                 gint     words,
+                 gint     max_words)
 {
-	guint count = 0;
 	gchar *normalized_text;
+	guint count = 0;
 
-	g_return_val_if_fail(text,-1);
-	g_return_val_if_fail(all_texts,-1);
+	g_return_val_if_fail (text, -1);
+	g_return_val_if_fail (all_texts, -1);
 
 	normalized_text = tracker_extract_text_normalize (text,
 	                                                  max_words - words,
 	                                                  &count);
 
 	if (normalized_text) {
-		/*
-		 * If the last added text didn't end in a space, we'll append a
-		 * space between this text and previous text so the last word of
-		 * previous text and first word of this text don't become one big
-		 * word.
+		/* If the last added text didn't end in a space, we'll
+		 * append a space between this text and previous text
+		 * so the last word of previous text and first word of
+		 * this text don't become one big word.
 		 */
 		if (all_texts->len > 0 &&
 		    all_texts->str[all_texts->len-1] != ' ') {
-
 			g_string_append_c(all_texts,' ');
 		}
 
-		g_string_append(all_texts,normalized_text);
-		g_free(normalized_text);
+		g_string_append (all_texts,normalized_text);
+		g_free (normalized_text);
 	}
 
-	g_free(text);
+	g_free (text);
+
 	return count;
 }
 
 static void
-read_powerpoint (GsfInfile            *infile,
-                 TrackerSparqlBuilder *metadata,
-                 gint                  max_words)
+ppt_read (GsfInfile            *infile,
+          TrackerSparqlBuilder *metadata,
+          gint                  max_words)
 {
-	/*
-	 * Try to find Powerpoint Document stream
-	 */
-	gsf_off_t  lastDocumentContainer = -1;
-	GsfInput  *stream = gsf_infile_child_by_name(infile,
-	                                             "PowerPoint Document");
+	/* Try to find Powerpoint Document stream */
+	GsfInput *stream;
+	gsf_off_t last_document_container = -1;
+
+	stream = gsf_infile_child_by_name (infile, "PowerPoint Document");
 
 	g_return_if_fail (stream);
 
-	/*
-	 * Powerpoint documents have a "editing history" stored within them.
+	/* Powerpoint documents have an "editing history" stored within them.
 	 * There is a structure that defines what changes were made each time
 	 * but it is just easier to get the current/latest version just by
 	 * finding the last occurrence of DocumentContainer structure
 	 */
+	last_document_container = -1;
 
-	lastDocumentContainer = -1;
-
-	/*
-	 * Read until we reach eof.
-	 */
-	while(!gsf_input_eof (stream)) {
-		RecordHeader header;
+	/* Read until we reach eof. */
+	while (!gsf_input_eof (stream)) {
+		PowerPointRecordHeader header;
 
 		/*
 		 * We only read headers of data structures
@@ -638,36 +611,35 @@ read_powerpoint (GsfInfile            *infile,
 			break;
 		}
 
-		/*
-		 * And we only care about headers with type 1000,
+		/* And we only care about headers with type 1000,
 		 * DocumentContainer
 		 */
 
 		if (header.recType == DOCUMENTCONTAINER_RECORD_TYPE) {
-			lastDocumentContainer = gsf_input_tell(stream);
+			last_document_container = gsf_input_tell (stream);
 		}
 
-		/*
-		 * and then seek to the start of the next data structure so it is
-		 * fast and we don't have to read through the whole file
+		/* and then seek to the start of the next data
+		 * structure so it is fast and we don't have to read
+		 * through the whole file
 		 */
 		if (gsf_input_seek (stream, header.recLen, G_SEEK_CUR)) {
 			break;
 		}
 	}
 
-	/*
-	 * If a DocumentContainer was found and we are able to seek to it.
+	/* If a DocumentContainer was found and we are able to seek to it.
 	 *
 	 * Then we'll have to find the second header with type
-	 * SLIDELISTWITHTEXT_RECORD_TYPE since DocumentContainer contains
-	 * MasterListWithTextContainer and SlideListWithTextContainer structures
-	 * with both having the same header type. We however only want
-	 * SlideListWithTextContainer which contains the textual content
-	 * of the power point file.
+	 * SLIDELISTWITHTEXT_RECORD_TYPE since DocumentContainer
+	 * contains MasterListWithTextContainer and
+	 * SlideListWithTextContainer structures with both having the
+	 * same header type. We however only want
+	 * SlideListWithTextContainer which contains the textual
+	 * content of the power point file.
 	 */
-	if (lastDocumentContainer >= 0 &&
-	    !gsf_input_seek(stream,lastDocumentContainer,G_SEEK_SET) &&
+	if (last_document_container >= 0 &&
+	    !gsf_input_seek (stream, last_document_container, G_SEEK_SET) &&
 	    seek_header (stream,
 	                 SLIDELISTWITHTEXT_RECORD_TYPE,
 	                 SLIDELISTWITHTEXT_RECORD_TYPE,
@@ -676,78 +648,100 @@ read_powerpoint (GsfInfile            *infile,
 	                 SLIDELISTWITHTEXT_RECORD_TYPE,
 	                 SLIDELISTWITHTEXT_RECORD_TYPE,
 	                 FALSE)) {
-
 		GString *all_texts = g_string_new ("");
-		int word_count = 0;
+		gint word_count = 0;
 
 		/*
 		 * Read while we have either TextBytesAtom or
 		 * TextCharsAtom and we have read less than max_words
 		 * amount of words
 		 */
-		while(seek_header (stream,
-		                   TEXTBYTESATOM_RECORD_TYPE,
-		                   TEXTCHARSATOM_RECORD_TYPE,
-		                   TRUE) &&
-		      word_count < max_words) {
-
+		while (seek_header (stream,
+		                    TEXTBYTESATOM_RECORD_TYPE,
+		                    TEXTCHARSATOM_RECORD_TYPE,
+		                    TRUE) &&
+		       word_count < max_words) {
 			gchar *text = read_text(stream);
-			if(text) {
-				int count = append_text (text,
-			                         all_texts,
-			                         word_count,
-			                         max_words);
+
+			if (text) {
+				gint count;
+
+				count = ppt_append_text (text, all_texts, word_count, max_words);
 				if (count < 0) {
 					break;
 				}
+
 				word_count += count;
 			}
 		}
 
-		/*
-		 * If we have any text read
-		 */
+		/* If we have any text read */
 		if (all_texts->len > 0) {
-			/*
-			 * Send it to tracker
-			 */
-			tracker_sparql_builder_predicate (metadata,
-			                                  "nie:plainTextContent");
-			tracker_sparql_builder_object_unvalidated (metadata,
-			                                           all_texts->str);
+			/* Send it to tracker */
+			tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+			tracker_sparql_builder_object_unvalidated (metadata, all_texts->str);
 		}
 
-		g_string_free (all_texts,TRUE);
-
+		g_string_free (all_texts, TRUE);
 	}
 
 	g_object_unref (stream);
 }
 
-/* This function was programmed by using ideas and algorithms from 
- * b2xtranslator project (http://b2xtranslator.sourceforge.net/) */
+/**
+ * @brief get maximum number of words to index
+ * @return maximum number of words to index
+ */
+static gint
+fts_max_words (void)
+{
+	TrackerFTSConfig *fts_config = tracker_main_get_fts_config ();
+	return tracker_fts_config_get_max_words_to_index (fts_config);
+}
 
-static gchar* 
-extract_msword_content (GsfInfile *infile, 
+/**
+ * @brief Open specified uri for reading and initialize gsf
+ * @param uri URI of the file to open
+ * @return GsfInfile for the opened file, or NULL if the file could not be opened
+ */
+static GsfInfile *
+open_uri (const gchar *uri)
+{
+	GsfInput *input;
+	GsfInfile *infile;
+	gchar *filename;
+
+	filename = g_filename_from_uri (uri, NULL, NULL);
+	input = gsf_input_stdio_new (filename, NULL);
+	g_free (filename);
+
+	if (!input) {
+		return NULL;
+	}
+
+	infile = gsf_infile_msole_new (input, NULL);
+	g_object_unref (G_OBJECT (input));
+
+	return infile;
+}
+
+/* This function was programmed by using ideas and algorithms from
+ * b2xtranslator project (http://b2xtranslator.sourceforge.net/)
+ */
+static gchar *
+extract_msword_content (GsfInfile *infile,
                         gint       n_words,
-                        gboolean  *is_encrypted) 
+                        gboolean  *is_encrypted)
 {
-	GsfInput *document_stream = NULL, *table_stream = NULL;
+	GsfInput *document_stream, *table_stream;
 	gint16 i = 0;
-	guint8 tmp_buffer[4] = {0};
+	guint8 tmp_buffer[4] = { 0 };
 	gint fcClx, lcbClx;
 	guint8 *piece_table = NULL;
 	guint8 *clx = NULL;
 	gint lcb_piece_table;
 	gint piece_count;
-	gint piece_start;
-	gint piece_end;
-	guint8 *piece_descriptor = NULL;
-	gint piece_size;
 	gint32 fc;
-	guint32 is_ansi;
-	guint8 *text_buffer = NULL;
-	gchar *converted_text = NULL;
 	GString *content = NULL;
 	gchar *normalized = NULL;
 
@@ -774,17 +768,16 @@ extract_msword_content (GsfInfile *infile,
 	} else
 		*is_encrypted = FALSE;
 
-	/* document can have 0Table or 1Table or both. If flag 0x0200 is 
-	 * set to true in word 0x000A of the FIB then 1Table is used */
-
+	/* document can have 0Table or 1Table or both. If flag 0x0200 is
+	 * set to true in word 0x000A of the FIB then 1Table is used
+	 */
 	gsf_input_seek (document_stream, 0x000A, G_SEEK_SET);
 	gsf_input_read (document_stream, 2, tmp_buffer);
 	i = read_16bit (tmp_buffer);
 
 	if ((i & 0x0200) == 0x0200) {
 		table_stream = gsf_infile_child_by_name (infile, "1Table");
-	}
-	else {
+	} else {
 		table_stream = gsf_infile_child_by_name (infile, "0Table");
 	}
 
@@ -808,41 +801,47 @@ extract_msword_content (GsfInfile *infile,
 	/* find out piece table from clx and set piece_table -pointer to it */
 	i = 0;
 	lcb_piece_table = 0;
+
 	while (TRUE) {
 		if (clx[i] == 2) {
-			lcb_piece_table = read_32bit (clx+(i+1));
-			piece_table = clx+i+5;
+			lcb_piece_table = read_32bit (clx + (i + 1));
+			piece_table = clx + i + 5;
 			piece_count = (lcb_piece_table - 4) / 12;
 			break;
-		}
-		else if (clx[i] == 1) {
-			i = i + 2 + clx[i+1];
-		}
-		else {
+		} else if (clx[i] == 1) {
+			i = i + 2 + clx[i + 1];
+		} else {
 			break;
 		}
 	}
 
 	/* iterate over pieces and save text to the content -variable */
 	for (i = 0; i < piece_count; i++) {
+		gchar *converted_text;
+		guint8 *text_buffer;
+		guint8 *piece_descriptor;
+		gint piece_start;
+		gint piece_end;
+		gint piece_size;
+		gboolean is_ansi;
+
 		/* logical position of the text piece in the document_stream */
-		piece_start = read_32bit (piece_table+(i*4));
-		piece_end = read_32bit (piece_table+((i+1)*4));
+		piece_start = read_32bit (piece_table + (i * 4));
+		piece_end = read_32bit (piece_table + ((i + 1) * 4));
 
 		/* descriptor of single piece from piece table */
-		piece_descriptor = piece_table + ((piece_count+1)*4) + (i*8);
+		piece_descriptor = piece_table + ((piece_count + 1) * 4) + (i * 8);
 
 		/* file character position */
-		fc = read_32bit (piece_descriptor+2);
+		fc = read_32bit (piece_descriptor + 2);
 
 		/* second bit is set to 1 if text is saved in ANSI encoding */
-		is_ansi = ((fc & 0x40000000) == 0x40000000);
+		is_ansi = (fc & 0x40000000) == 0x40000000;
 
 		/* modify file character position according to text encoding */
 		if (!is_ansi) {
 			fc = (fc & 0xBFFFFFFF);
-		}
-		else {
+		} else {
 			fc = (fc & 0xBFFFFFFF) >> 1;
 		}
 
@@ -862,30 +861,20 @@ extract_msword_content (GsfInfile *infile,
 		gsf_input_read (document_stream, piece_size, text_buffer);
 
 		/* pieces can have different encoding */
-		if(is_ansi) {
-			converted_text = g_convert (text_buffer, 
-			                            piece_size, 
-			                            "UTF-8", 
-			                            "CP1252", 
-			                            NULL, 
-			                            NULL, 
-			                            NULL);
-		}
-		else {
-			converted_text = g_convert (text_buffer, 
-			                            piece_size, 
-			                            "UTF-8", 
-			                            "UTF-16", 
-			                            NULL, 
-			                            NULL, 
-			                            NULL);
-		}
+		converted_text = g_convert (text_buffer,
+		                            piece_size,
+		                            "UTF-8",
+		                            is_ansi ? "CP1252" : "UTF-16",
+		                            NULL,
+		                            NULL,
+		                            NULL);
 
 		if (converted_text) {
-			if (!content)
+			if (!content) {
 				content = g_string_new (converted_text);
-			else
+			} else {
 				g_string_append (content, converted_text);
+			}
 
 			g_free (converted_text);
 		}
@@ -906,30 +895,19 @@ extract_msword_content (GsfInfile *infile,
 }
 
 /**
- * @brief get maximum number of words to index
- * @return maximum number of words to index
- */
-static gint
-max_words (void)
-{
-	TrackerFTSConfig *fts_config = tracker_main_get_fts_config ();
-	return tracker_fts_config_get_max_words_to_index (fts_config);
-}
-
-/**
  * @brief Extract summary OLE stream from specified uri
  * @param metadata where to store summary
  * @param infile file to read summary from
  * @param uri uri of the file
  */
-static void
+static gboolean
 extract_summary (TrackerSparqlBuilder *metadata,
                  GsfInfile            *infile,
                  const gchar          *uri)
 {
 	GsfInput *stream;
-	gchar    *content;
-	gboolean  is_encrypted = FALSE;
+	gchar *content;
+	gboolean is_encrypted = FALSE;
 
 	tracker_sparql_builder_predicate (metadata, "a");
 	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
@@ -938,21 +916,28 @@ extract_summary (TrackerSparqlBuilder *metadata,
 
 	if (stream) {
 		GsfDocMetaData *md;
-		GError *err = NULL;
-		ForeachInfo     info = { metadata, uri };
+		MetadataInfo info;
+		GError *error = NULL;
 
 		md = gsf_doc_meta_data_new ();
-		err = gsf_msole_metadata_read (stream, md);
+		error = gsf_msole_metadata_read (stream, md);
+
+		if (error) {
+			g_warning ("Could not extract summary information, %s",
+			           error->message ? error->message : "no error given");
 
-		if (err) {
-			g_error_free (err);
+			g_error_free (error);
 			g_object_unref (md);
 			g_object_unref (stream);
 			gsf_shutdown ();
-			return;
+
+			return FALSE;
 		}
 
-		gsf_doc_meta_data_foreach (md, metadata_cb, &info);
+		info.metadata = metadata;
+		info.uri = uri;
+
+		gsf_doc_meta_data_foreach (md, summary_metadata_cb, &info);
 
 		g_object_unref (md);
 		g_object_unref (stream);
@@ -962,77 +947,47 @@ extract_summary (TrackerSparqlBuilder *metadata,
 
 	if (stream) {
 		GsfDocMetaData *md;
-		GError *err = NULL;
-		ForeachInfo     info = { metadata, uri };
+		MetadataInfo info;
+		GError *error = NULL;
 
 		md = gsf_doc_meta_data_new ();
 
-		err = gsf_msole_metadata_read (stream, md);
-		if (err) {
-			g_error_free (err);
+		error = gsf_msole_metadata_read (stream, md);
+		if (error) {
+			g_warning ("Could not extract document summary information, %s",
+			           error->message ? error->message : "no error given");
+
+			g_error_free (error);
 			g_object_unref (md);
 			g_object_unref (stream);
 			gsf_shutdown ();
-			return;
+
+			return FALSE;
 		}
 
-		gsf_doc_meta_data_foreach (md, doc_metadata_cb, &info);
+		info.metadata = metadata;
+		info.uri = uri;
+
+		gsf_doc_meta_data_foreach (md, document_metadata_cb, &info);
 
 		g_object_unref (md);
 		g_object_unref (stream);
 	}
 
-	content = extract_msword_content(infile, max_words (), &is_encrypted);
+	content = extract_msword_content (infile, fts_max_words (), &is_encrypted);
 
 	if (content) {
-		tracker_sparql_builder_predicate (metadata,
-		                                  "nie:plainTextContent");
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
 		tracker_sparql_builder_object_unvalidated (metadata, content);
 		g_free (content);
 	}
 
 	if (is_encrypted) {
-		tracker_sparql_builder_predicate (metadata,
-		                                  "nfo:isContentEncrypted");
+		tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
 		tracker_sparql_builder_object_boolean (metadata, TRUE);
 	}
-}
-
-/**
- * @brief Open specified uri for reading and initialize gsf
- * @param uri URI of the file to open
- * @return GsfInFile of the opened file or NULL if failed to open file
- */
-static GsfInfile *
-open_uri (const gchar *uri)
-{
-	GsfInput  *input;
-	GsfInfile *infile;
-	gchar     *filename;
-
-	gsf_init ();
-
-	filename = g_filename_from_uri (uri, NULL, NULL);
-
-	input = gsf_input_stdio_new (filename, NULL);
-
-	if (!input) {
-		g_free (filename);
-		gsf_shutdown ();
-		return NULL;
-	}
-
-	infile = gsf_infile_msole_new (input, NULL);
-	g_object_unref (G_OBJECT (input));
-
-	if (!infile) {
-		g_free (filename);
-		gsf_shutdown ();
-		return NULL;
-	}
 
-	g_free (filename);
-	return infile;
+	return TRUE;
 }
 
 /**
@@ -1047,9 +1002,16 @@ extract_msoffice (const gchar          *uri,
                   TrackerSparqlBuilder *preupdate,
                   TrackerSparqlBuilder *metadata)
 {
-	GsfInfile *infile = open_uri(uri);
-	extract_summary(metadata,infile,uri);
-	g_object_unref (infile);
+	GsfInfile *infile;
+
+	gsf_init ();
+
+	infile = open_uri (uri);
+	if (infile) {
+		extract_summary (metadata, infile, uri);
+		g_object_unref (infile);
+	}
+
 	gsf_shutdown ();
 }
 
@@ -1061,15 +1023,21 @@ extract_msoffice (const gchar          *uri,
  * @param metadata where to store extracted data to
  */
 static void
-extract_powerpoint (const gchar          *uri,
-                    TrackerSparqlBuilder *preupdate,
-                    TrackerSparqlBuilder *metadata)
+extract_ppt (const gchar          *uri,
+             TrackerSparqlBuilder *preupdate,
+             TrackerSparqlBuilder *metadata)
 {
-	GsfInfile *infile = open_uri(uri);
-	extract_summary(metadata,infile,uri);
-	read_powerpoint(infile,metadata,max_words());
+	GsfInfile *infile;
+
+	gsf_init ();
+
+	infile = open_uri (uri);
+	if (infile) {
+		extract_summary (metadata, infile, uri);
+		ppt_read (infile, metadata, fts_max_words());
+		g_object_unref (infile);
+	}
 
-	g_object_unref (infile);
 	gsf_shutdown ();
 }
 
@@ -1377,9 +1345,9 @@ text_handler_document_data (GMarkupParseContext  *context,
 }
 
 static gboolean
-read_file_xml_data (MsOfficeXMLParserInfo *parser_info,
-                    const gchar           *xml_filename,
-                    TagType	           type)
+xml_read (MsOfficeXMLParserInfo *parser_info,
+          const gchar           *xml_filename,
+          TagType                type)
 {
 	GMarkupParseContext *context;
 	MsOfficeXMLParserInfo info;
@@ -1498,28 +1466,28 @@ start_element_handler_content_types (GMarkupParseContext  *context,
 
 	if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-package.core-properties+xml") == 0) ||
 	    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.extended-properties+xml") == 0)) {
-		read_file_xml_data (info, part_name + 1, TAG_TYPE_DOCUMENT_CORE_DATA);
+		xml_read (info, part_name + 1, TAG_TYPE_DOCUMENT_CORE_DATA);
 		return;
 	}
 
 	switch (info->file_type) {
 	case FILE_TYPE_DOCX:
 		if (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml") == 0) {
-			read_file_xml_data (info, part_name + 1, TAG_TYPE_DOCUMENT_TEXT_DATA);
+			xml_read (info, part_name + 1, TAG_TYPE_DOCUMENT_TEXT_DATA);
 		}
 		break;
 
 	case FILE_TYPE_PPTX:
 		if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.presentationml.slide+xml") == 0) ||
 		    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml") == 0)) {
-			read_file_xml_data (info, part_name + 1, TAG_TYPE_DOCUMENT_TEXT_DATA);
+			xml_read (info, part_name + 1, TAG_TYPE_DOCUMENT_TEXT_DATA);
 		}
 		break;
 
 	case FILE_TYPE_XLSX:
 		if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml") == 0) ||
 		    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml") == 0)) {
-			read_file_xml_data (info, part_name + 1, TAG_TYPE_DOCUMENT_TEXT_DATA);
+			xml_read (info, part_name + 1, TAG_TYPE_DOCUMENT_TEXT_DATA);
 		}
 		break;
 
@@ -1530,9 +1498,9 @@ start_element_handler_content_types (GMarkupParseContext  *context,
 }
 
 static void
-extract_msoffice_xml_format (const gchar          *uri,
-			     TrackerSparqlBuilder *preupdate,
-                             TrackerSparqlBuilder *metadata)
+extract_msoffice_xml (const gchar          *uri,
+                      TrackerSparqlBuilder *preupdate,
+                      TrackerSparqlBuilder *metadata)
 {
 	MsOfficeXMLParserInfo info;
 	FileType file_type;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]