[tracker/msword] Added new WV2 based MS Document extractor.
- From: Philip Van Hoof <pvanhoof src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker/msword] Added new WV2 based MS Document extractor.
- Date: Mon, 7 Dec 2009 15:42:49 +0000 (UTC)
commit 1fb592fc750aa6d985fa249efd45588c9ef8752a
Author: Philip Van Hoof <philip codeminded be>
Date: Mon Dec 7 12:57:53 2009 +0100
Added new WV2 based MS Document extractor.
configure.ac | 49 ++++++
src/tracker-extract/Makefile.am | 10 +
src/tracker-extract/tracker-extract-msoffice.c | 82 +++++-----
src/tracker-extract/tracker-extract-msword.cpp | 213 ++++++++++++++++++++++++
4 files changed, 314 insertions(+), 40 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 97116e2..99f4d32 100644
--- a/configure.ac
+++ b/configure.ac
@@ -147,6 +147,7 @@ GDK_REQUIRED=1.0
LIBVORBIS_REQUIRED=0.22
LIBEXIF_REQUIRED=0.6
LIBGSF_REQUIRED=1.13
+LIBWV2_REQUIRED=0.4.0
EXEMPI_REQUIRED=2.1.0
HILDON_THUMBNAIL_REQUIRED=3.0.10
EVO_REQUIRED=2.25.5
@@ -1132,6 +1133,53 @@ fi
AM_CONDITIONAL(HAVE_LIBGSF, test "x$have_libgsf" = "xyes")
##################################################################
+# Check for libwv2
+##################################################################
+
+# FIXME This should be package based. Unfortunately in several main
+# distros, it is not.
+
+AC_ARG_ENABLE(libwv2,
+ AS_HELP_STRING([--enable-libwv2],
+ [enable content extractor for MS documents [[default=auto]]]),,
+ [enable_libwv2=auto])
+
+if test "x$enable_libwv2" != "xno" ; then
+	AC_MSG_CHECKING(for wv2)
+
+	# wv2 is a C++ library, so the link test has to run with the C++
+	# compiler. Push/pop keeps the language switch local to this check.
+	AC_LANG_PUSH([C++])
+
+	# Libraries go in LIBS so that -lwv2 follows the test object on the
+	# link line. Shell assignments must not have spaces around the equals
+	# sign, otherwise the saved value is lost and the restore below would
+	# wipe the flags the user passed in.
+	save_LIBS="$LIBS"
+	LIBS="$LIBS -lwv2"
+	AC_LINK_IFELSE(
+		[AC_LANG_PROGRAM([#include <wv2/wv2version.h>],
+		                 [wvWare::version();])],
+		[TEST_LIBS="$TEST_LIBS -lwv2"
+		 have_libwv2=yes],
+		[have_libwv2=no])
+	LIBS="$save_LIBS"
+
+	AC_LANG_POP([C++])
+
+	LIBWV2_LIBS="-lwv2"
+	AC_SUBST(LIBWV2_LIBS)
+
+	if test "x$have_libwv2" = "xyes"; then
+		AC_DEFINE(HAVE_LIBWV2, [], [Define if we have libwv2])
+		AC_MSG_RESULT(yes)
+	else
+		AC_MSG_RESULT(no)
+	fi
+else
+	have_libwv2="no (disabled)"
+fi
+
+if test "x$enable_libwv2" = "xyes"; then
+ if test "x$have_libwv2" != "xyes"; then
+ AC_MSG_ERROR([Couldn't find libwv2 >= $LIBWV2_REQUIRED.])
+ fi
+fi
+
+AM_CONDITIONAL(HAVE_LIBWV2, test "x$have_libwv2" = "xyes")
+
+##################################################################
# Check for libjpeg
##################################################################
@@ -1607,6 +1655,7 @@ Metadata Extractors:
Support TIFF: $have_libtiff (xmp: $have_exempi, exif: yes, iptc: $have_libiptcdata)
Support Vorbis (ogg/etc): $have_libvorbis
Support MS & Open Office: $have_libgsf
+ Support MS DOC content extraction: $have_libwv2
Support XML / HTML: $have_libxml2
Support embedded / sidecar XMP: $have_exempi
Support video formats: $have_video_handler ($have_video_handler_app) (tagreadbin: $enable_tagreadbin)
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 697a3f7..c200589 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -69,6 +69,10 @@ if HAVE_LIBGSF
modules_LTLIBRARIES += libextract-msoffice.la
endif
+if HAVE_LIBWV2
+modules_LTLIBRARIES += libextract-msword.la
+endif
+
if HAVE_POPPLER_GLIB
modules_LTLIBRARIES += libextract-pdf.la
endif
@@ -186,6 +190,12 @@ libextract_msoffice_la_LDFLAGS = $(module_flags)
libextract_msoffice_la_LIBADD = $(GLIB2_LIBS) $(LIBGSF_LIBS) $(GCOV_LIBS) \
$(top_builddir)/src/libtracker-common/libtracker-common.la
+# MS Word
+libextract_msword_la_SOURCES = tracker-extract-msword.cpp
+libextract_msword_la_LDFLAGS = $(module_flags)
+libextract_msword_la_LIBADD = $(GLIB2_LIBS) $(LIBWV2_LIBS) $(GCOV_LIBS) \
+ $(top_builddir)/src/libtracker-common/libtracker-common.la
+
# PDF
libextract_pdf_la_SOURCES = tracker-extract-pdf.c $(xmp_sources)
libextract_pdf_la_LDFLAGS = $(module_flags)
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 465153e..e4572ea 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -90,7 +90,9 @@ static void extract_powerpoint (const gchar *uri,
TrackerSparqlBuilder *metadata);
static TrackerExtractData data[] = {
+#ifndef HAVE_LIBWV2
{ "application/msword", extract_msoffice },
+#endif
/* Powerpoint files */
{ "application/vnd.ms-powerpoint", extract_powerpoint },
{ "application/vnd.ms-*", extract_msoffice },
@@ -362,13 +364,13 @@ static gboolean
read_header (GsfInput *stream, RecordHeader *header) {
guint8 buffer[8] = {0};
- g_return_val_if_fail(stream,FALSE);
- g_return_val_if_fail(header,FALSE);
- g_return_val_if_fail(!gsf_input_eof(stream),FALSE);
+ g_return_val_if_fail (stream, FALSE);
+ g_return_val_if_fail (header, FALSE);
+ g_return_val_if_fail (!gsf_input_eof (stream), FALSE);
/* Header is always 8 bytes, read it */
- g_return_val_if_fail(gsf_input_read(stream,8,buffer),FALSE);
+ g_return_val_if_fail (gsf_input_read (stream, 8, buffer), FALSE);
/*
* Then parse individual details
@@ -384,10 +386,10 @@ read_header (GsfInput *stream, RecordHeader *header) {
* Here we parse each of those fields.
*/
- header->recType = read_16bit(&buffer[2]);
- header->recLen = read_32bit(&buffer[4]);
- header->recVer = (read_16bit(buffer) & 0xF000) >> 12;
- header->recInstance = read_16bit(buffer) & 0x0FFF;
+ header->recType = read_16bit (&buffer[2]);
+ header->recLen = read_32bit (&buffer[4]);
+ header->recVer = (read_16bit (buffer) & 0xF000) >> 12;
+ header->recInstance = read_16bit (buffer) & 0x0FFF;
return TRUE;
}
@@ -426,7 +428,7 @@ read_text (GsfInput *stream)
* First read the header that describes the structures type
* (TextBytesAtom or TextCharsAtom) and it's length.
*/
- g_return_val_if_fail (read_header(stream, &header),NULL);
+ g_return_val_if_fail (read_header (stream, &header),NULL);
/*
* We only want header with type either TEXTBYTESATOM_RECORD_TYPE
@@ -446,15 +448,15 @@ read_text (GsfInput *stream)
* save space on the ppt files. We'll have to allocate double the
* size for it to get the high bytes
*/
- data = g_try_new0 (guint8,header.recLen * 2);
+ data = g_try_new0 (guint8, header.recLen * 2);
} else {
- data = g_try_new0 (guint8,header.recLen);
+ data = g_try_new0 (guint8, header.recLen);
}
g_return_val_if_fail (data,NULL);
/* Then read the textual data from the stream */
- if (!gsf_input_read (stream,header.recLen,data)) {
+ if (!gsf_input_read (stream, header.recLen,data)) {
g_free (data);
return NULL;
}
@@ -522,17 +524,17 @@ seek_header (GsfInput *stream,
{
RecordHeader header;
- g_return_val_if_fail(stream,FALSE);
+ g_return_val_if_fail (stream,FALSE);
/*
* Read until we reach eof
*/
- while(!gsf_input_eof(stream)) {
+ while(!gsf_input_eof (stream)) {
/*
* Read first header
*/
- g_return_val_if_fail(read_header(stream, &header),FALSE);
+ g_return_val_if_fail (read_header (stream, &header),FALSE);
/*
* Check if it's the correct type
@@ -544,7 +546,7 @@ seek_header (GsfInput *stream,
* header
*/
if (rewind) {
- gsf_input_seek(stream,-8,G_SEEK_CUR);
+ gsf_input_seek (stream,-8,G_SEEK_CUR);
}
return TRUE;
}
@@ -553,10 +555,10 @@ seek_header (GsfInput *stream,
* If it's not the correct type, seek to the beginning of the
* next header
*/
- g_return_val_if_fail(!gsf_input_seek(stream,
- header.recLen,
- G_SEEK_CUR),
- FALSE);
+ g_return_val_if_fail (!gsf_input_seek (stream,
+ header.recLen,
+ G_SEEK_CUR),
+ FALSE);
}
return FALSE;
@@ -579,12 +581,12 @@ append_text (gchar *text,
guint count = 0;
gchar *normalized_text;
- g_return_val_if_fail(text,-1);
- g_return_val_if_fail(all_texts,-1);
+ g_return_val_if_fail (text,-1);
+ g_return_val_if_fail (all_texts,-1);
- normalized_text = tracker_text_normalize(text,
- max_words - words,
- &count);
+ normalized_text = tracker_text_normalize (text,
+ max_words - words,
+ &count);
if (normalized_text) {
/*
@@ -596,11 +598,11 @@ append_text (gchar *text,
if (all_texts->len > 0 &&
all_texts->str[all_texts->len-1] != ' ') {
- g_string_append_c(all_texts,' ');
+ g_string_append_c (all_texts,' ');
}
- g_string_append(all_texts,normalized_text);
- g_free(normalized_text);
+ g_string_append (all_texts, normalized_text);
+ g_free (normalized_text);
}
g_free(text);
@@ -616,8 +618,8 @@ read_powerpoint (GsfInfile *infile,
* Try to find Powerpoint Document stream
*/
gsf_off_t lastDocumentContainer = -1;
- GsfInput *stream = gsf_infile_child_by_name(infile,
- "PowerPoint Document");
+ GsfInput *stream = gsf_infile_child_by_name (infile,
+ "PowerPoint Document");
g_return_if_fail (stream);
@@ -639,7 +641,7 @@ read_powerpoint (GsfInfile *infile,
/*
* We only read headers of data structures
*/
- if (!read_header (stream,&header)) {
+ if (!read_header (stream, &header)) {
break;
}
@@ -649,7 +651,7 @@ read_powerpoint (GsfInfile *infile,
*/
if (header.recType == DOCUMENTCONTAINER_RECORD_TYPE) {
- lastDocumentContainer = gsf_input_tell(stream);
+ lastDocumentContainer = gsf_input_tell (stream);
}
/*
@@ -672,7 +674,7 @@ read_powerpoint (GsfInfile *infile,
* of the power point file.
*/
if (lastDocumentContainer >= 0 &&
- !gsf_input_seek(stream,lastDocumentContainer,G_SEEK_SET) &&
+ !gsf_input_seek (stream, lastDocumentContainer, G_SEEK_SET) &&
seek_header (stream,
SLIDELISTWITHTEXT_RECORD_TYPE,
SLIDELISTWITHTEXT_RECORD_TYPE,
@@ -696,7 +698,7 @@ read_powerpoint (GsfInfile *infile,
TRUE) &&
word_count < max_words) {
- gchar *text = read_text(stream);
+ gchar *text = read_text (stream);
int count = append_text (text,
all_texts,
@@ -801,7 +803,7 @@ extract_summary (TrackerSparqlBuilder *metadata,
g_object_unref (stream);
}
- content = extract_content (uri, max_words());
+ content = extract_content (uri, max_words ());
if (content) {
tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
@@ -858,8 +860,8 @@ static void
extract_msoffice (const gchar *uri,
TrackerSparqlBuilder *metadata)
{
- GsfInfile *infile = open_uri(uri);
- extract_summary(metadata,infile,uri);
+ GsfInfile *infile = open_uri (uri);
+ extract_summary (metadata, infile, uri);
g_object_unref (infile);
gsf_shutdown ();
}
@@ -876,9 +878,9 @@ static void
extract_powerpoint (const gchar *uri,
TrackerSparqlBuilder *metadata)
{
- GsfInfile *infile = open_uri(uri);
- extract_summary(metadata,infile,uri);
- read_powerpoint(infile,metadata,max_words());
+ GsfInfile *infile = open_uri (uri);
+ extract_summary (metadata, infile, uri);
+ read_powerpoint (infile, metadata, max_words ());
g_object_unref (infile);
gsf_shutdown ();
diff --git a/src/tracker-extract/tracker-extract-msword.cpp b/src/tracker-extract/tracker-extract-msword.cpp
new file mode 100644
index 0000000..a87d3b7
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-msword.cpp
@@ -0,0 +1,213 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009, Nokia
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include <glib.h>
+#include <glib/gprintf.h>
+
+#include <wv2/parser.h>
+#include <wv2/handlers.h>
+#include <wv2/parserfactory.h>
+#include <wv2/word97_generated.h>
+#include <wv2/ustring.h>
+#include <wv2/associatedstrings.h>
+
+extern "C" {
+#include <libtracker-common/tracker-utils.h>
+}
+
+#include <libtracker-common/tracker-statement-list.h>
+#include <libtracker-common/tracker-ontology.h>
+#include <libtracker-common/tracker-file-utils.h>
+#include <libtracker-common/tracker-type-utils.h>
+
+#include "tracker-fts-config.h"
+#include "tracker-main.h"
+
+#define NIE_PREFIX TRACKER_NIE_PREFIX
+#define NFO_PREFIX TRACKER_NFO_PREFIX
+#define NCO_PREFIX TRACKER_NCO_PREFIX
+
+#define RDF_PREFIX TRACKER_RDF_PREFIX
+#define RDF_TYPE RDF_PREFIX "type"
+
+
+using namespace wvWare;
+
+extern "C" void extract_msword (const gchar *uri,
+ TrackerSparqlBuilder *metadata);
+
+
+/* MIME types handled by this module and the function that extracts each.
+ * The all-NULL entry terminates the table, which is handed out by
+ * tracker_get_extract_data(). */
+static TrackerExtractData data[] = {
+ { "application/msword", extract_msword },
+ { NULL, NULL }
+};
+
+
+/* Text handler that gathers every run of text reported to it into a
+ * single string, in the order the runs arrive. */
+class TextExtractor : public TextHandler
+{
+public:
+	/* Concatenation of all text passed to runOfText() so far. */
+	UString content;
+
+	/* Appends @text to content; the character properties are ignored. */
+	virtual void runOfText (const UString &text,
+	                        SharedPtr<const Word97::CHP> /* chp */)
+	{
+		content += text;
+	}
+};
+
+
+/*
+ * Converts @ustr to UTF-8 and normalizes the result with
+ * tracker_text_normalize(), passing @n_words through unchanged.
+ *
+ * The 16-bit code units of @ustr are converted directly, so characters
+ * outside Latin-1 survive. Going through UString::cstring() first would
+ * narrow every character to 8 bits; that path is kept only as a fallback
+ * for strings that are not valid UTF-16 (e.g. unpaired surrogates).
+ *
+ * Returns: the value returned by tracker_text_normalize(), to be released
+ * with g_free(), or NULL when the text cannot be converted to UTF-8.
+ */
+static gchar*
+ustring2utf (const UString& ustr, guint n_words)
+{
+	const int length = ustr.length ();
+	gunichar2 *utf16 = g_new (gunichar2, length + 1);
+
+	for (int i = 0; i < length; i++) {
+		utf16[i] = ustr[i].unicode ();
+	}
+	utf16[length] = 0;
+
+	gchar *utf8 = g_utf16_to_utf8 (utf16, length, NULL, NULL, NULL);
+	g_free (utf16);
+
+	if (!utf8) {
+		/* Best effort: treat the narrowed string as Latin-1. */
+		CString cstring = ustr.cstring ();
+
+		utf8 = g_convert (cstring.c_str (), cstring.length (),
+		                  "UTF-8", "ISO-8859-1",
+		                  NULL, NULL, NULL);
+	}
+
+	if (!utf8) {
+		return NULL;
+	}
+
+	gchar *normalized = tracker_text_normalize (utf8, n_words, NULL);
+	g_free (utf8);
+
+	return normalized;
+}
+
+
+/**
+ * extract_msword:
+ * @uri: URI of the MS Word document to extract from
+ * @metadata: SPARQL builder that receives the extracted statements
+ *
+ * Parses the document with wv2 and, when parsing succeeds, describes @uri
+ * as an nfo:PaginatedTextDocument together with its plain text content,
+ * title, subject, author, keywords and word count.
+ *
+ * Nothing is added to @metadata when @uri cannot be turned into a local
+ * file name, when no parser can be created for the file, or when parsing
+ * fails.
+ */
+extern "C" void
+extract_msword (const gchar          *uri,
+                TrackerSparqlBuilder *metadata)
+{
+	gchar *filename = g_filename_from_uri (uri, NULL, NULL);
+
+	if (!filename) {
+		return;
+	}
+
+	SharedPtr<Parser> parser (ParserFactory::createParser (filename));
+
+	/* The name is not used again below, so release it here instead of
+	 * on every exit path.
+	 * NOTE(review): assumes the parser keeps its own copy of the file
+	 * name - confirm against the wv2 ParserFactory API. */
+	g_free (filename);
+
+	if (!parser) {
+		return;
+	}
+
+	/* Automatic storage: the handler is released on every return path,
+	 * and because it is declared after the parser it is destroyed before
+	 * the parser reference is dropped. */
+	TextExtractor extractor;
+
+	parser->setTextHandler (&extractor);
+	if (!parser->parse ()) {
+		return;
+	}
+
+	tracker_sparql_builder_subject_iri (metadata, uri);
+	tracker_sparql_builder_predicate (metadata, "a");
+	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+
+	TrackerFTSConfig *fts_config = tracker_main_get_fts_config ();
+	guint n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+	gchar *str;
+
+	str = ustring2utf (extractor.content, n_words);
+	if (str) {
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_object_unvalidated (metadata, str);
+		g_free (str);
+	}
+
+	AssociatedStrings strings = parser->associatedStrings ();
+
+	if (strings.title () != "") {
+		str = ustring2utf (strings.title (), n_words);
+		if (str) {
+			tracker_sparql_builder_predicate (metadata, "nie:title");
+			tracker_sparql_builder_object_unvalidated (metadata, str);
+			g_free (str);
+		}
+	}
+
+	if (strings.subject () != "") {
+		str = ustring2utf (strings.subject (), n_words);
+		if (str) {
+			tracker_sparql_builder_predicate (metadata, "nie:subject");
+			tracker_sparql_builder_object_unvalidated (metadata, str);
+			g_free (str);
+		}
+	}
+
+	if (strings.author () != "") {
+		str = ustring2utf (strings.author (), n_words);
+		if (str) {
+			/* The author becomes an anonymous nco:Contact node. */
+			tracker_sparql_builder_predicate (metadata, "nco:creator");
+			tracker_sparql_builder_object_blank_open (metadata);
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "nco:Contact");
+			tracker_sparql_builder_predicate (metadata, "nco:fullname");
+			tracker_sparql_builder_object_unvalidated (metadata, str);
+			tracker_sparql_builder_object_blank_close (metadata);
+			g_free (str);
+		}
+	}
+
+	if (strings.keywords () != "") {
+		str = ustring2utf (strings.keywords (), n_words);
+		if (str) {
+			gchar **keys = g_strsplit_set (str, ",; ", 0);
+
+			for (gchar **key = keys; *key != NULL; key++) {
+				/* Adjacent delimiters ("a, b") yield empty
+				 * tokens; those are not keywords. */
+				if (**key == '\0') {
+					continue;
+				}
+
+				tracker_sparql_builder_predicate (metadata, "nie:keyword");
+				tracker_sparql_builder_object_unvalidated (metadata, *key);
+			}
+
+			g_strfreev (keys);
+			g_free (str);
+		}
+	}
+
+	const Word97::DOP &dop = parser->dop ();
+
+	/* A value printed with %d can need 11 characters plus the
+	 * terminator, so size the buffer accordingly and bound the write. */
+	gchar word_count[16];
+
+	g_snprintf (word_count, sizeof (word_count), "%d", dop.cWords);
+	tracker_sparql_builder_predicate (metadata, "nfo:wordCount");
+	tracker_sparql_builder_object_unvalidated (metadata, word_count);
+}
+
+
+/* Module entry point: returns the first entry of this module's
+ * NULL-terminated table of MIME types and extract functions. */
+extern "C" TrackerExtractData *
+tracker_get_extract_data (void)
+{
+	return &data[0];
+}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]