[tracker] Refactored the MSWord extractor to use libwv2 for content extraction



commit 84c23b4da780abfe490638deffd807f6fa7582ad
Author: Philip Van Hoof <philip codeminded be>
Date:   Tue Dec 8 21:08:00 2009 +0100

    Refactored the MSWord extractor to use libwv2 for content extraction

 configure.ac                                   |   54 ++++++++++--
 src/tracker-extract/Makefile.am                |    7 ++-
 src/tracker-extract/tracker-extract-msoffice.c |   53 ++----------
 src/tracker-extract/tracker-msword.cpp         |  113 ++++++++++++++++++++++++
 src/tracker-extract/tracker-msword.h           |   31 +++++++
 5 files changed, 205 insertions(+), 53 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 97116e2..2d6bbfd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -147,6 +147,7 @@ GDK_REQUIRED=1.0
 LIBVORBIS_REQUIRED=0.22
 LIBEXIF_REQUIRED=0.6
 LIBGSF_REQUIRED=1.13
+LIBWV2_REQUIRED=0.3.1
 EXEMPI_REQUIRED=2.1.0
 HILDON_THUMBNAIL_REQUIRED=3.0.10
 EVO_REQUIRED=2.25.5
@@ -1109,13 +1110,6 @@ if test "x$enable_libgsf" != "xno" ; then
    AC_SUBST(LIBGSF_CFLAGS)
    AC_SUBST(LIBGSF_LIBS)
 
-   AC_PATH_PROG(WVWAREBIN, wvWare, no)
-   AC_SUBST(WVWAREBIN)
-    
-   if test "x$WVWAREBIN" != "xno"; then
-      AC_DEFINE(HAVE_WVWARE, [], [Define if we have wvWare])
-   fi
-
    if test "x$have_libgsf" = "xyes"; then
       AC_DEFINE(HAVE_LIBGSF, [], [Define if we have libgsf])
    fi
@@ -1132,6 +1126,52 @@ fi
 AM_CONDITIONAL(HAVE_LIBGSF, test "x$have_libgsf" = "xyes")
 
 ##################################################################
+# Check for libwv2
+##################################################################
+
+# FIXME This should be package based. Unfortunately in several main
+# distros, it is not.
+
+AC_ARG_ENABLE(libwv2,
+              AS_HELP_STRING([--enable-libwv2],
+                             [enable content extractor for MS documents [[default=auto]]]),,
+                             [enable_libwv2=auto])
+
+if test "x$enable_libwv2" != "xno" ; then
+   AC_MSG_CHECKING(for wv2)
+
+   # Probe with the C++ compiler and restore the previous language
+   # afterwards, so that later checks are not run as C++.
+   AC_LANG_PUSH([C++])
+   # -l options belong in LIBS so they follow the test object when linking.
+   oldlibs=$LIBS
+   LIBS="-lwv2 $LIBS"
+   AC_LINK_IFELSE(
+   [AC_LANG_PROGRAM([#include <wv2/wv2version.h>],
+                    [wvWare::version()])],
+                    [TEST_LIBS="$TEST_LIBS -lwv2"; have_libwv2=yes],
+                    [have_libwv2=no])
+   LIBS=$oldlibs
+   AC_LANG_POP([C++])
+
+   LIBWV2_LIBS="-lwv2"
+   AC_SUBST(LIBWV2_LIBS)
+   if test "x$have_libwv2" = "xyes"; then
+     AC_DEFINE(HAVE_LIBWV2, [], [Define if we have libwv2])
+     AC_MSG_RESULT(yes)
+   else
+     AC_MSG_RESULT(no)
+   fi
+else
+   have_libwv2="no (disabled)"
+fi
+
+if test "x$enable_libwv2" = "xyes" && test "x$have_libwv2" != "xyes"; then
+   AC_MSG_ERROR([Couldn't find libwv2 >= $LIBWV2_REQUIRED.])
+fi
+AM_CONDITIONAL(HAVE_LIBWV2, test "x$have_libwv2" = "xyes")
+
+##################################################################
 # Check for libjpeg
 ##################################################################
 
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 697a3f7..c5d2316 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -7,7 +7,6 @@ INCLUDES = 								\
 	-DLOCALEDIR=\""$(localedir)"\" 					\
 	-DMODULESDIR=\"$(modulesdir)\"					\
 	-DG_LOG_DOMAIN=\"Tracker\"					\
-	-DWVWAREBIN=\"$(WVWAREBIN)\"					\
 	-DTRACKER_COMPILATION						\
 	-I$(top_srcdir)/src 						\
 	$(WARN_CFLAGS)							\
@@ -22,6 +21,7 @@ INCLUDES = 								\
 	$(LIBEXIF_CFLAGS) 						\
 	$(LIBIPTCDATA_CFLAGS)						\
 	$(LIBGSF_CFLAGS) 						\
+	$(LIBWV2_CFLAGS) 						\
 	$(LIBXML2_CFLAGS) 						\
 	$(LIBPNG_CFLAGS) 						\
 	$(POPPLER_GLIB_CFLAGS) 						\
@@ -186,6 +186,11 @@ libextract_msoffice_la_LDFLAGS = $(module_flags)
 libextract_msoffice_la_LIBADD = $(GLIB2_LIBS) $(LIBGSF_LIBS) $(GCOV_LIBS) \
 	$(top_builddir)/src/libtracker-common/libtracker-common.la
 
+if HAVE_LIBWV2
+libextract_msoffice_la_SOURCES += tracker-msword.cpp tracker-msword.h
+libextract_msoffice_la_LIBADD += $(LIBWV2_LIBS)
+endif
+
 # PDF
 libextract_pdf_la_SOURCES = tracker-extract-pdf.c $(xmp_sources)
 libextract_pdf_la_LDFLAGS = $(module_flags)
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 465153e..ba1f0de 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -39,6 +39,7 @@
 #include <libtracker-common/tracker-ontology.h>
 
 #include "tracker-main.h"
+#include "tracker-msword.h"
 
 #define NIE_PREFIX                              TRACKER_NIE_PREFIX
 #define NFO_PREFIX                              TRACKER_NFO_PREFIX
@@ -253,49 +254,6 @@ doc_metadata_cb (gpointer key,
 	}
 }
 
-static gchar *
-extract_content (const gchar *uri,
-                 guint        n_words)
-{
-#ifdef HAVE_WVWARE
-
-	/* TODO, question: can't we replace this command-calling with a function
-	 * in libwmf-dev or something? If yes and somebody wants to contribute 
-	 * replacing this with libwmf-dev, go ahead */
-
-	gchar *path, *command, *output, *text;
-	GError *error = NULL;
-
-	path = g_filename_from_uri (uri, NULL, NULL);
-
-	if (!path) {
-		return NULL;
-	}
-
-	command = g_strdup_printf (WVWAREBIN " --charset utf-8 -1 -x wvText.xml %s", path);
-
-	g_free (path);
-
-	if (!g_spawn_command_line_sync (command, &output, NULL, NULL, &error)) {
-		g_warning ("Could not extract text from '%s': %s", 
-		           uri, error->message);
-		g_error_free (error);
-		g_free (command);
-
-		return NULL;
-	}
-
-	text = tracker_text_normalize (output, n_words, NULL);
-
-	g_free (command);
-	g_free (output);
-
-	return text;
-#else
-	return NULL;
-#endif
-}
-
 /**
 * @brief Read 16 bit unsigned integer
 * @param buffer data to read integer from
@@ -752,8 +710,10 @@ extract_summary (TrackerSparqlBuilder *metadata,
                  GsfInfile            *infile,
                  const gchar          *uri)
 {
-	gchar    *content;
 	GsfInput *stream;
+#ifdef HAVE_LIBWV2
+	gchar    *content;
+#endif
 
 	tracker_sparql_builder_subject_iri (metadata, uri);
 	tracker_sparql_builder_predicate (metadata, "a");
@@ -801,13 +761,16 @@ extract_summary (TrackerSparqlBuilder *metadata,
 		g_object_unref (stream);
 	}
 
-	content = extract_content (uri, max_words());
+
+#ifdef HAVE_LIBWV2
+	content = extract_msword_content (uri, max_words ());
 
 	if (content) {
 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
 		tracker_sparql_builder_object_unvalidated (metadata, content);
 		g_free (content);
 	}
+#endif
 }
 
 /**
diff --git a/src/tracker-extract/tracker-msword.cpp b/src/tracker-extract/tracker-msword.cpp
new file mode 100644
index 0000000..741860a
--- /dev/null
+++ b/src/tracker-extract/tracker-msword.cpp
@@ -0,0 +1,113 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009, Nokia
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include "config.h"
+#include "tracker-msword.h"
+
+#include <glib.h>
+#include <glib/gprintf.h>
+
+#include <wv2/wvlog.h>
+#include <wv2/parser.h>
+#include <wv2/handlers.h>
+#include <wv2/parserfactory.h>
+#include <wv2/word97_generated.h>
+#include <wv2/ustring.h>
+
+
+extern "C" {
+#include <libtracker-common/tracker-utils.h>
+}
+
+
+using namespace wvWare;
+
+
+/* Text handler that appends every run of text it is handed to a single
+ * string, which the caller reads once parsing has finished. */
+class TextExtractor : public TextHandler
+{
+public:
+	/* Text collected so far, in the order the runs were delivered. */
+	UString content;
+
+	virtual void runOfText (const UString                &run,
+	                        SharedPtr<const Word97::CHP>  props)
+	{
+		/* The character properties are not used. */
+		content += run;
+	}
+};
+
+
+/* Converts @ustr to UTF-8 from its 16 bit characters (cstring () only gives
+ * 8 bit ones) and returns the tracker_text_normalize () result, or NULL. */
+static gchar*
+ustring2utf (const UString& ustr, guint n_words)
+{
+	int        len = ustr.length ();
+	gunichar2 *utf16 = g_new (gunichar2, len + 1); /* never a NULL buffer */
+	for (int i = 0; i < len; i++) {
+		utf16[i] = ustr[i].unicode ();
+	}
+	gchar *utf8 = g_utf16_to_utf8 (utf16, len, NULL, NULL, NULL);
+	gchar *text = utf8 ? tracker_text_normalize (utf8, n_words, NULL) : NULL;
+	g_free (utf8);
+	g_free (utf16);
+	return text;
+}
+
+/* Extracts the text of the MS Word document at @uri.
+ *
+ * @uri: URI of the document; it must map to a local filename.
+ * @max_words: word limit handed on to tracker_text_normalize ().
+ *
+ * Returns: a newly allocated string, or NULL if @uri has no local path,
+ * no parser could be created, parsing failed or conversion failed.
+ */
+gchar*
+extract_msword_content (const gchar *uri, gint max_words)
+{
+	gchar *filename = g_filename_from_uri (uri, NULL, NULL);
+
+	if (!filename) {
+		return NULL;
+	}
+
+	SharedPtr<Parser> parser (ParserFactory::createParser (filename));
+
+	/* The name is only needed to create the parser. */
+	g_free (filename);
+
+	if (!parser) {
+		return NULL;
+	}
+
+	/* Automatic storage: released on every exit path. A throwing
+	 * operator new never returns NULL, so the old check was dead. */
+	TextExtractor extractor;
+
+	parser->setTextHandler (&extractor);
+	if (!parser->parse ()) {
+		return NULL;
+	}
+
+	return ustring2utf (extractor.content, max_words);
+}
diff --git a/src/tracker-extract/tracker-msword.h b/src/tracker-extract/tracker-msword.h
new file mode 100644
index 0000000..cd31044
--- /dev/null
+++ b/src/tracker-extract/tracker-msword.h
@@ -0,0 +1,31 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009, Nokia
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __TRACKERD_MSWORD_H__
+#define __TRACKERD_MSWORD_H__
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+/* Returns the text of the MS Word document at @uri, or NULL on failure. */
+gchar* extract_msword_content (const gchar *uri, gint max_words);
+G_END_DECLS
+
+#endif /* __TRACKERD_MSWORD_H__ */



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]