[tracker] Added support for msoffice extractor to parse textual content from .ppt Powerpoint Files.
- From: Philip Van Hoof <pvanhoof src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker] Added support for msoffice extractor to parse textual content from .ppt Powerpoint Files.
- Date: Mon, 7 Dec 2009 14:41:06 +0000 (UTC)
commit 2c463f73e10c5417dbeaadae12ba4ef7205facdf
Author: Vesa Pikki <vesa pikki ixonos com>
Date: Mon Dec 7 14:48:00 2009 +0100
Added support for msoffice extractor to parse textual content from .ppt Powerpoint Files.
configure.ac | 7 +
src/tracker-extract/Makefile.am | 1 +
src/tracker-extract/tracker-extract-msoffice.c | 712 +++++++++++++++++++++---
3 files changed, 633 insertions(+), 87 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 62ed994..97116e2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1109,6 +1109,13 @@ if test "x$enable_libgsf" != "xno" ; then
AC_SUBST(LIBGSF_CFLAGS)
AC_SUBST(LIBGSF_LIBS)
+ AC_PATH_PROG(WVWAREBIN, wvWare, no)
+ AC_SUBST(WVWAREBIN)
+
+ if test "x$WVWAREBIN" != "xno"; then
+ AC_DEFINE(HAVE_WVWARE, [], [Define if we have wvWare])
+ fi
+
if test "x$have_libgsf" = "xyes"; then
AC_DEFINE(HAVE_LIBGSF, [], [Define if we have libgsf])
fi
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 77b3bff..697a3f7 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -7,6 +7,7 @@ INCLUDES = \
-DLOCALEDIR=\""$(localedir)"\" \
-DMODULESDIR=\"$(modulesdir)\" \
-DG_LOG_DOMAIN=\"Tracker\" \
+ -DWVWAREBIN=\"$(WVWAREBIN)\" \
-DTRACKER_COMPILATION \
-I$(top_srcdir)/src \
$(WARN_CFLAGS) \
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 9572c05..465153e 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -40,19 +40,60 @@
#include "tracker-main.h"
-#define NIE_PREFIX TRACKER_NIE_PREFIX
-#define NFO_PREFIX TRACKER_NFO_PREFIX
-#define NCO_PREFIX TRACKER_NCO_PREFIX
+#define NIE_PREFIX TRACKER_NIE_PREFIX
+#define NFO_PREFIX TRACKER_NFO_PREFIX
+#define NCO_PREFIX TRACKER_NCO_PREFIX
-#define RDF_PREFIX TRACKER_RDF_PREFIX
-#define RDF_TYPE RDF_PREFIX "type"
+#define RDF_PREFIX TRACKER_RDF_PREFIX
+#define RDF_TYPE RDF_PREFIX "type"
-static void extract_msoffice (const gchar *uri,
- TrackerSparqlBuilder *metadata);
+
+/*
+* Powerpoint files are composed of structures. Each structure contains a header.
+* Within that header is a field that specifies what structure it is; this field
+* is called the record type.
+*
+* Here are some record types and descriptions of the structure (called atom)
+* they contain.
+*/
+
+/*
+* An atom record that specifies Unicode characters with no high byte of a UTF-16
+* Unicode character. High byte is always 0.
+*/
+#define TEXTBYTESATOM_RECORD_TYPE 0x0FA0
+
+/*
+* An atom record that specifies Unicode characters.
+*/
+#define TEXTCHARSATOM_RECORD_TYPE 0x0FA8
+
+/*
+* A container record that specifies information about the powerpoint document.
+*/
+#define DOCUMENTCONTAINER_RECORD_TYPE 0x1000
+
+/*
+* Variant type of record. Within Powerpoint text extraction we are interested
+* in the SlideListWithTextContainer type that contains the textual content
+* of the slide(s).
+*
+*/
+
+#define SLIDELISTWITHTEXT_RECORD_TYPE 0x0FF0
+
+
+static void extract_msoffice (const gchar *uri,
+ TrackerSparqlBuilder *metadata);
+
+static void extract_powerpoint (const gchar *uri,
+ TrackerSparqlBuilder *metadata);
static TrackerExtractData data[] = {
- { "application/msword", extract_msoffice },
- { "application/vnd.ms-*", extract_msoffice },
+ { "application/msword", extract_msoffice },
+ /* Powerpoint files */
+ { "application/vnd.ms-powerpoint", extract_powerpoint },
+ { "application/vnd.ms-*", extract_msoffice },
{ NULL, NULL }
};
@@ -63,11 +104,11 @@ typedef struct {
static void
add_gvalue_in_metadata (TrackerSparqlBuilder *metadata,
- const gchar *uri,
- const gchar *key,
- GValue const *val,
- const gchar *type,
- const gchar *predicate)
+ const gchar *uri,
+ const gchar *key,
+ GValue const *val,
+ const gchar *type,
+ const gchar *predicate)
{
gchar *s;
@@ -87,10 +128,8 @@ add_gvalue_in_metadata (TrackerSparqlBuilder *metadata,
if (!tracker_is_empty_string (s)) {
gchar *str_val;
- /* Some fun: strings are always
- * written "str" with double quotes
- * around, but not numbers!
- */
+ /* Some fun: strings are always written "str" with double quotes
+ * around, but not numbers! */
if (s[0] == '"') {
size_t len;
@@ -99,15 +138,10 @@ add_gvalue_in_metadata (TrackerSparqlBuilder *metadata,
if (s[len - 1] == '"') {
str_val = (len > 2 ? g_strndup (s + 1, len - 2) : NULL);
} else {
- /* We have a string
- * that begins with a
- * double quote but
- * which finishes by
- * something different...
- * We copy the string
- * from the
- * beginning.
- */
+ /* We have a string that begins with a double
+ * quote but which finishes by something
+ * different... We copy the string from the
+ * beginning. */
str_val = g_strdup (s);
}
} else {
@@ -140,59 +174,59 @@ add_gvalue_in_metadata (TrackerSparqlBuilder *metadata,
static void
metadata_cb (gpointer key,
- gpointer value,
- gpointer user_data)
+ gpointer value,
+ gpointer user_data)
{
- ForeachInfo *info = user_data;
- gchar *name;
- GsfDocProp *property;
- TrackerSparqlBuilder *metadata = info->metadata;
- GValue const *val;
- const gchar *uri = info->uri;
+ ForeachInfo *info = user_data;
+ gchar *name;
+ GsfDocProp *property;
+ TrackerSparqlBuilder *metadata = info->metadata;
+ GValue const *val;
+ const gchar *uri = info->uri;
name = key;
property = value;
metadata = info->metadata;
val = gsf_doc_prop_get_val (property);
- if (strcmp (name, "dc:title") == 0) {
+ if (g_strcmp0 (name, "dc:title") == 0) {
add_gvalue_in_metadata (metadata, uri, "nie:title", val, NULL, NULL);
- } else if (strcmp (name, "dc:subject") == 0) {
+ } else if (g_strcmp0 (name, "dc:subject") == 0) {
add_gvalue_in_metadata (metadata, uri, "nie:subject", val, NULL, NULL);
- } else if (strcmp (name, "dc:creator") == 0) {
+ } else if (g_strcmp0 (name, "dc:creator") == 0) {
add_gvalue_in_metadata (metadata, uri, "nco:creator", val, "nco:Contact", "nco:fullname");
- } else if (strcmp (name, "dc:keywords") == 0) {
+ } else if (g_strcmp0 (name, "dc:keywords") == 0) {
gchar *keywords = g_strdup_value_contents (val);
- char *lasts, *keyw;
+ char *lasts, *keyw;
size_t len;
keyw = keywords;
keywords = strchr (keywords, '"');
if (keywords)
keywords++;
- else
+ else
keywords = keyw;
len = strlen (keywords);
if (keywords[len - 1] == '"')
keywords[len - 1] = '\0';
- for (keyw = strtok_r (keywords, ",; ", &lasts); keyw;
+ for (keyw = strtok_r (keywords, ",; ", &lasts); keyw;
keyw = strtok_r (NULL, ",; ", &lasts)) {
tracker_sparql_builder_predicate (metadata, "nie:keyword");
tracker_sparql_builder_object_unvalidated (metadata, keyw);
}
g_free (keyw);
- } else if (strcmp (name, "dc:description") == 0) {
+ } else if (g_strcmp0 (name, "dc:description") == 0) {
add_gvalue_in_metadata (metadata, uri, "nie:comment", val, NULL, NULL);
- } else if (strcmp (name, "gsf:page-count") == 0) {
+ } else if (g_strcmp0 (name, "gsf:page-count") == 0) {
add_gvalue_in_metadata (metadata, uri, "nfo:pageCount", val, NULL, NULL);
- } else if (strcmp (name, "gsf:word-count") == 0) {
+ } else if (g_strcmp0 (name, "gsf:word-count") == 0) {
add_gvalue_in_metadata (metadata, uri, "nfo:wordCount", val, NULL, NULL);
- } else if (strcmp (name, "meta:creation-date") == 0) {
+ } else if (g_strcmp0 (name, "meta:creation-date") == 0) {
add_gvalue_in_metadata (metadata, uri, "nie:contentCreated", val, NULL, NULL);
- } else if (strcmp (name, "meta:generator") == 0) {
+ } else if (g_strcmp0 (name, "meta:generator") == 0) {
add_gvalue_in_metadata (metadata, uri, "nie:generator", val, NULL, NULL);
}
}
@@ -202,27 +236,33 @@ doc_metadata_cb (gpointer key,
gpointer value,
gpointer user_data)
{
- ForeachInfo *info = user_data;
- gchar *name;
- GsfDocProp *property;
- TrackerSparqlBuilder *metadata = info->metadata;
- GValue const *val;
- const gchar *uri = info->uri;
+ ForeachInfo *info = user_data;
+ gchar *name;
+ GsfDocProp *property;
+ TrackerSparqlBuilder *metadata = info->metadata;
+ GValue const *val;
+ const gchar *uri = info->uri;
name = key;
property = value;
metadata = user_data;
val = gsf_doc_prop_get_val (property);
- if (strcmp (name, "CreativeCommons_LicenseURL") == 0) {
+ if (g_strcmp0 (name, "CreativeCommons_LicenseURL") == 0) {
add_gvalue_in_metadata (metadata, uri, "nie:license", val, NULL, NULL);
}
}
static gchar *
extract_content (const gchar *uri,
- guint n_words)
+ guint n_words)
{
+#ifdef HAVE_WVWARE
+
+ /* TODO, question: can't we replace this command-calling with a function
+ * in libwmf-dev or something? If yes and somebody wants to contribute
+ * replacing this with libwmf-dev, go ahead */
+
gchar *path, *command, *output, *text;
GError *error = NULL;
@@ -232,12 +272,13 @@ extract_content (const gchar *uri,
return NULL;
}
- command = g_strdup_printf ("wvWare --charset utf-8 -1 -x wvText.xml %s", path);
+ command = g_strdup_printf (WVWAREBIN " --charset utf-8 -1 -x wvText.xml %s", path);
g_free (path);
if (!g_spawn_command_line_sync (command, &output, NULL, NULL, &error)) {
- g_warning ("Could not extract text from '%s': %s", uri, error->message);
+ g_warning ("Could not extract text from '%s': %s",
+ uri, error->message);
g_error_free (error);
g_free (command);
@@ -250,55 +291,485 @@ extract_content (const gchar *uri,
g_free (output);
return text;
+#else
+ return NULL;
+#endif
}
-static void
-extract_msoffice (const gchar *uri,
- TrackerSparqlBuilder *metadata)
+/**
+* @brief Read 16 bit unsigned integer
+* @param buffer data to read integer from
+* @return 16 bit unsigned integer
+*/
+static gint
+read_16bit (const guint8* buffer)
{
- GsfInput *input;
- GsfInfile *infile;
- GsfInput *stream;
- gchar *filename, *content;
- TrackerFTSConfig *fts_config;
- guint n_words;
+ return buffer[0] + (buffer[1] << 8);
+}
- gsf_init ();
+/**
+* @brief Read 32 bit unsigned integer
+* @param buffer data to read integer from
+* @return 32 bit unsigned integer
+*/
+static gint
+read_32bit (const guint8* buffer)
+{
+ return buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
+}
- filename = g_filename_from_uri (uri, NULL, NULL);
+/**
+* @brief Header for all powerpoint structures
+*
+* A structure at the beginning of each container record and each atom record in
+* the file. The values in the record header and the context of the record are
+* used to identify and interpret the record data that follows.
+*/
+typedef struct {
+ /**
+ * @brief An unsigned integer that specifies the version of the record
+ * data that follows the record header. A value of 0xF specifies that the
+ * record is a container record.
+ */
+ guint recVer;
+
+ /**
+ * @brief An unsigned integer that specifies the record instance data.
+ * Interpretation of the value is dependent on the particular record
+ * type.
+ */
+ guint recInstance;
+
+ /**
+ * @brief A RecordType enumeration that specifies the type of the record
+ * data that follows the record header.
+ */
+ gint recType;
+
+ /**
+ * @brief An unsigned integer that specifies the length, in bytes, of the
+ * record data that follows the record header.
+ */
+ guint recLen;
+}RecordHeader;
+
+/**
+* @brief Read header data from given stream
+* @param stream Stream to read header data
+* @param header Pointer to header where to store results
+*/
+static gboolean
+read_header (GsfInput *stream, RecordHeader *header) {
+ guint8 buffer[8] = {0};
+
+ g_return_val_if_fail(stream,FALSE);
+ g_return_val_if_fail(header,FALSE);
+ g_return_val_if_fail(!gsf_input_eof(stream),FALSE);
+
+
+ /* Header is always 8 bytes, read it */
+ g_return_val_if_fail(gsf_input_read(stream,8,buffer),FALSE);
+
+ /*
+ * Then parse individual details
+ *
+ * Record header is 8 bytes long. Data is split as follows:
+ * recVer (4 bits)
+ * recInstance (12 bits)
+ * recType (2 bytes)
+ * recLen (4 bytes)
+ *
+ * See RecordHeader for more detailed explanation of each field.
+ *
+ * Here we parse each of those fields.
+ */
+
+ header->recType = read_16bit(&buffer[2]);
+ header->recLen = read_32bit(&buffer[4]);
+ header->recVer = (read_16bit(buffer) & 0xF000) >> 12;
+ header->recInstance = read_16bit(buffer) & 0x0FFF;
+
+ return TRUE;
+}
- input = gsf_input_stdio_new (filename, NULL);
+/**
+* @brief Read powerpoint text from given stream.
+*
+* Powerpoint contains texts in either TextBytesAtom or TextCharsAtom. Below
+* are excerpts from the [MS-PPT].pdf file describing the ppt file structure:
+*
+* TextCharsAtom contains an array of UTF-16 Unicode [RFC2781] characters that
+* specifies the characters of the corresponding text. The length, in bytes, of
+* the array is specified by rh.recLen. The array MUST NOT contain the NUL
+* character 0x0000.
+*
+* TextBytesAtom contains an array of bytes that specifies the characters of the
+* corresponding text. Each item represents the low byte of a UTF-16 Unicode
+* [RFC2781] character whose high byte is 0x00. The length, in bytes, of the
+* array is specified by rh.recLen. The array MUST NOT contain a 0x00 byte.
+*
+* @param stream Stream to read text bytes/chars atom
+* @return read text or NULL if no text was read. Has to be freed by the caller
+*/
+static gchar*
+read_text (GsfInput *stream)
+{
+ gint i = 0;
+ RecordHeader header;
+ guint8 *data = NULL;
+ gsize written = 0;
+ gchar *converted = 0;
+
+ g_return_val_if_fail (stream,NULL);
+
+ /*
+ * First read the header that describes the structure's type
+ * (TextBytesAtom or TextCharsAtom) and its length.
+ */
+ g_return_val_if_fail (read_header(stream, &header),NULL);
+
+ /*
+ * We only want header with type either TEXTBYTESATOM_RECORD_TYPE
+ * (TextBytesAtom) or TEXTCHARSATOM_RECORD_TYPE (TextCharsAtom).
+ *
+ * We don't care about anything else
+ */
+ if (header.recType != TEXTBYTESATOM_RECORD_TYPE &&
+ header.recType != TEXTCHARSATOM_RECORD_TYPE) {
+ return NULL;
+ }
- if (!input) {
- g_free (filename);
- gsf_shutdown ();
- return;
+ /* Then we'll allocate data for the actual texts */
+ if (header.recType == TEXTBYTESATOM_RECORD_TYPE) {
+ /*
+ * TextBytesAtom doesn't include high bytes probably in order to
+ * save space on the ppt files. We'll have to allocate double the
+ * size for it to get the high bytes
+ */
+ data = g_try_new0 (guint8,header.recLen * 2);
+ } else {
+ data = g_try_new0 (guint8,header.recLen);
}
- infile = gsf_infile_msole_new (input, NULL);
- g_object_unref (G_OBJECT (input));
+ g_return_val_if_fail (data,NULL);
+
+ /* Then read the textual data from the stream */
+ if (!gsf_input_read (stream,header.recLen,data)) {
+ g_free (data);
+ return NULL;
+ }
+
+
+ /*
+ * Again if we are reading TextBytesAtom we'll need to add those utf16
+ * high bytes ourselves. They are zero as specified in [MS-PPT].pdf
+ * and this function's comments
+ */
+ if (header.recType == TEXTBYTESATOM_RECORD_TYPE) {
+ for(i = 0; i < header.recLen; i++) {
+
+ /*
+ * We'll add an empty 0 byte between each byte in the
+ * array
+ */
+ data[(header.recLen - i - 1) * 2] = data[header.recLen - i - 1];
+ if ((header.recLen - i - 1) % 2) {
+ data[header.recLen - i - 1] = 0;
+ }
+ }
+
+ /*
+ * Then double the recLen now that we have the high bytes added
+ * between read bytes
+ */
+ header.recLen *= 2;
+ }
+
+ /*
+ * Then we'll convert the text from UTF-16 to UTF-8 for the tracker
+ */
+ converted = g_convert(data,header.recLen,
+ "UTF-8",
+ "UTF-16",
+ NULL,
+ &written,
+ NULL);
+
+ /*
+ * And free the data
+ */
+ g_free(data);
+
+ /* Return read text */
+ return converted;
+}
+
+/**
+* @brief Find a specific header from given stream
+* @param stream Stream to parse headers from
+* @param type1 first type of header to look for
+* @param type2 convenience parameter if we are looking for either of two
+* header types
+* @param rewind if a proper header is found should this function seek
+* to the start of the header (TRUE)
+* @return TRUE if either of specified headers was found
+*/
+static gboolean
+seek_header (GsfInput *stream,
+ gint type1,
+ gint type2,
+ gboolean rewind)
+{
+ RecordHeader header;
+
+ g_return_val_if_fail(stream,FALSE);
+
+ /*
+ * Read until we reach eof
+ */
+ while(!gsf_input_eof(stream)) {
+
+ /*
+ * Read first header
+ */
+ g_return_val_if_fail(read_header(stream, &header),FALSE);
+
+ /*
+ * Check if it's the correct type
+ */
+ if (header.recType == type1 || header.recType == type2) {
+
+ /*
+ * Sometimes it's needed to rewind to the start of the
+ * header
+ */
+ if (rewind) {
+ gsf_input_seek(stream,-8,G_SEEK_CUR);
+ }
+ return TRUE;
+ }
+
+ /*
+ * If it's not the correct type, seek to the beginning of the
+ * next header
+ */
+ g_return_val_if_fail(!gsf_input_seek(stream,
+ header.recLen,
+ G_SEEK_CUR),
+ FALSE);
+ }
+
+ return FALSE;
+}
+
+/**
+* @brief Normalize and append given text to all_texts variable
+* @param text text to append
+* @param all_texts GString to append text after normalizing it
+* @param words number of words already in all_texts
+* @param max_words maximum number of words allowed in all_texts
+* @return number of words appended to all_text
+*/
+static gint
+append_text (gchar *text,
+ GString *all_texts,
+ gint words,
+ gint max_words)
+{
+ guint count = 0;
+ gchar *normalized_text;
+
+ g_return_val_if_fail(text,-1);
+ g_return_val_if_fail(all_texts,-1);
+
+ normalized_text = tracker_text_normalize(text,
+ max_words - words,
+ &count);
+
+ if (normalized_text) {
+ /*
+ * If the last added text didn't end in a space, we'll append a
+ * space between this text and previous text so the last word of
+ * previous text and first word of this text don't become one big
+ * word.
+ */
+ if (all_texts->len > 0 &&
+ all_texts->str[all_texts->len-1] != ' ') {
+
+ g_string_append_c(all_texts,' ');
+ }
+
+ g_string_append(all_texts,normalized_text);
+ g_free(normalized_text);
+ }
+
+ g_free(text);
+ return count;
+}
+
+static void
+read_powerpoint (GsfInfile *infile,
+ TrackerSparqlBuilder *metadata,
+ gint max_words)
+{
+ /*
+ * Try to find Powerpoint Document stream
+ */
+ gsf_off_t lastDocumentContainer = -1;
+ GsfInput *stream = gsf_infile_child_by_name(infile,
+ "PowerPoint Document");
+
+ g_return_if_fail (stream);
+
+ /*
+ * Powerpoint documents have a "editing history" stored within them.
+ * There is a structure that defines what changes were made each time
+ * but it is just easier to get the current/latest version just by
+ * finding the last occurrence of DocumentContainer structure
+ */
+
+ lastDocumentContainer = -1;
+
+ /*
+ * Read until we reach eof.
+ */
+ while(!gsf_input_eof (stream)) {
+ RecordHeader header;
+
+ /*
+ * We only read headers of data structures
+ */
+ if (!read_header (stream,&header)) {
+ break;
+ }
+
+ /*
+ * And we only care about headers with type 1000,
+ * DocumentContainer
+ */
+
+ if (header.recType == DOCUMENTCONTAINER_RECORD_TYPE) {
+ lastDocumentContainer = gsf_input_tell(stream);
+ }
+
+ /*
+ * and then seek to the start of the next data structure so it is
+ * fast and we don't have to read through the whole file
+ */
+ if (gsf_input_seek (stream, header.recLen, G_SEEK_CUR)) {
+ break;
+ }
+ }
+
+ /*
+ * If a DocumentContainer was found and we are able to seek to it.
+ *
+ * Then we'll have to find the second header with type
+ * SLIDELISTWITHTEXT_RECORD_TYPE since DocumentContainer contains
+ * MasterListWithTextContainer and SlideListWithTextContainer structures
+ * with both having the same header type. We however only want
+ * SlideListWithTextContainer which contains the textual content
+ * of the power point file.
+ */
+ if (lastDocumentContainer >= 0 &&
+ !gsf_input_seek(stream,lastDocumentContainer,G_SEEK_SET) &&
+ seek_header (stream,
+ SLIDELISTWITHTEXT_RECORD_TYPE,
+ SLIDELISTWITHTEXT_RECORD_TYPE,
+ FALSE) &&
+ seek_header (stream,
+ SLIDELISTWITHTEXT_RECORD_TYPE,
+ SLIDELISTWITHTEXT_RECORD_TYPE,
+ FALSE)) {
+
+ GString *all_texts = g_string_new ("");
+ int word_count = 0;
+
+ /*
+ * Read while we have either TextBytesAtom or
+ * TextCharsAtom and we have read less than max_words
+ * amount of words
+ */
+ while(seek_header (stream,
+ TEXTBYTESATOM_RECORD_TYPE,
+ TEXTCHARSATOM_RECORD_TYPE,
+ TRUE) &&
+ word_count < max_words) {
+
+ gchar *text = read_text(stream);
+
+ int count = append_text (text,
+ all_texts,
+ word_count,
+ max_words);
+
+ if (count < 0) {
+ break;
+ }
+
+ word_count += count;
+ }
+
+ /*
+ * If we have any text read
+ */
+ if (all_texts->len > 0) {
+ /*
+ * Send it to tracker
+ */
+ tracker_sparql_builder_predicate (metadata,
+ "nie:plainTextContent");
+ tracker_sparql_builder_object_unvalidated (metadata,
+ all_texts->str);
+ }
+
+ g_string_free (all_texts,TRUE);
- if (!infile) {
- g_free (filename);
- gsf_shutdown ();
- return;
}
+ g_object_unref (stream);
+}
+
+/**
+* @brief get maximum number of words to index
+* @return maximum number of words to index
+*/
+static gint
+max_words (void)
+{
+ TrackerFTSConfig *fts_config = tracker_main_get_fts_config ();
+ return tracker_fts_config_get_max_words_to_index (fts_config);
+}
+
+/**
+* @brief Extract summary OLE stream from specified uri
+* @param metadata where to store summary
+* @param infile file to read summary from
+* @param uri uri of the file
+*/
+static void
+extract_summary (TrackerSparqlBuilder *metadata,
+ GsfInfile *infile,
+ const gchar *uri)
+{
+ gchar *content;
+ GsfInput *stream;
+
tracker_sparql_builder_subject_iri (metadata, uri);
tracker_sparql_builder_predicate (metadata, "a");
tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
stream = gsf_infile_child_by_name (infile, "\05SummaryInformation");
+
if (stream) {
GsfDocMetaData *md;
- ForeachInfo info = { metadata, uri };
+ ForeachInfo info = { metadata, uri };
md = gsf_doc_meta_data_new ();
if (gsf_msole_metadata_read (stream, md)) {
g_object_unref (md);
g_object_unref (stream);
- g_free (filename);
gsf_shutdown ();
return;
}
@@ -310,9 +781,10 @@ extract_msoffice (const gchar *uri,
}
stream = gsf_infile_child_by_name (infile, "\05DocumentSummaryInformation");
+
if (stream) {
GsfDocMetaData *md;
- ForeachInfo info = { metadata, uri };
+ ForeachInfo info = { metadata, uri };
md = gsf_doc_meta_data_new ();
@@ -320,7 +792,6 @@ extract_msoffice (const gchar *uri,
g_object_unref (md);
g_object_unref (stream);
gsf_shutdown ();
- g_free (filename);
return;
}
@@ -330,19 +801,86 @@ extract_msoffice (const gchar *uri,
g_object_unref (stream);
}
- fts_config = tracker_main_get_fts_config ();
- n_words = tracker_fts_config_get_max_words_to_index (fts_config);
- content = extract_content (uri, n_words);
+ content = extract_content (uri, max_words());
if (content) {
tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
tracker_sparql_builder_object_unvalidated (metadata, content);
g_free (content);
}
+}
+
+/**
+* @brief Open specified uri for reading and initialize gsf
+* @param uri URI of the file to open
+* @return GsfInFile of the opened file or NULL if failed to open file
+*/
+static GsfInfile *
+open_uri (const gchar *uri)
+{
+ GsfInput *input;
+ GsfInfile *infile;
+ gchar *filename;
+
+ gsf_init ();
+
+ filename = g_filename_from_uri (uri, NULL, NULL);
+
+ input = gsf_input_stdio_new (filename, NULL);
+
+ if (!input) {
+ g_free (filename);
+ gsf_shutdown ();
+ return NULL;
+ }
+
+ infile = gsf_infile_msole_new (input, NULL);
+ g_object_unref (G_OBJECT (input));
+
+ if (!infile) {
+ g_free (filename);
+ gsf_shutdown ();
+ return NULL;
+ }
- g_object_unref (infile);
g_free (filename);
+ return infile;
+}
+
+/**
+* @brief Extract data from generic office files
+*
+* At the moment only extracts document summary from summary OLE stream.
+* @param uri URI of the file to extract data
+* @param metadata where to store extracted data to
+*/
+static void
+extract_msoffice (const gchar *uri,
+ TrackerSparqlBuilder *metadata)
+{
+ GsfInfile *infile = open_uri(uri);
+ extract_summary(metadata,infile,uri);
+ g_object_unref (infile);
+ gsf_shutdown ();
+}
+
+/**
+* @brief Extract data from powerpoint files
+*
+* At the moment can extract textual content and summary.
+* @param uri URI of the file to extract data
+* @param metadata where to store extracted data to
+*/
+static void
+extract_powerpoint (const gchar *uri,
+ TrackerSparqlBuilder *metadata)
+{
+ GsfInfile *infile = open_uri(uri);
+ extract_summary(metadata,infile,uri);
+ read_powerpoint(infile,metadata,max_words());
+
+ g_object_unref (infile);
gsf_shutdown ();
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]