[tracker] Implementation for the streamanalyzer extractor



commit 03f8dc5883172bde64aefa5346a254d562b03d09
Author: Philip Van Hoof <philip codeminded be>
Date:   Thu Apr 9 18:36:58 2009 +0200

    Implementation for the streamanalyzer extractor
---
 configure.ac                                |    4 +
 src/tracker-extract/Makefile.am             |   12 +-
 src/tracker-extract/tracker-extract.c       |   15 ++
 src/tracker-extract/tracker-topanalyzer.cpp |  339 +++++++++++++++++++++++++++
 src/tracker-extract/tracker-topanalyzer.h   |   38 +++
 5 files changed, 407 insertions(+), 1 deletions(-)

diff --git a/configure.ac b/configure.ac
index dc80edd..99c4489 100644
--- a/configure.ac
+++ b/configure.ac
@@ -84,6 +84,7 @@ AC_SUBST(LT_CURRENT_MINUS_AGE)
 
 # Checks for programs.
 AC_PROG_CC
+AC_PROG_CXX
 AC_PROG_LN_S
 AC_PROG_INSTALL
 AC_PROG_MAKE_SET
@@ -148,6 +149,8 @@ EXEMPI_REQUIRED=1.99.2
 HILDON_THUMBNAIL_REQUIRED=3.0.10
 EVO_REQUIRED=2.25.5
 EDS_REQUIRED=2.25.5
+# Unlikely version for now, Nepomuk integration isn't finished in streamanalyzer atm
+STREAMANALYZER_REQUIRED=9.9.9
 LIBGNOME_DESKTOP_REQUIRED=2.9.91
 LIBGNOME_REQUIRED=2.13.2
 LIBGNOMEUI_REQUIRED=2.13.7
@@ -1475,6 +1478,7 @@ Applications:
 
 Metadata Extractors:
 
+	Support StreamAnalyzer:			$have_streamanalyzer
 	Support PNG:				yes
 	Support PDF:				$have_poppler_glib
 	Support JPEG:				$have_libjpeg (xmp: $have_exempi, exif: $have_libexif, iptc: $have_libiptcdata)
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 2919dc0..653d701 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -28,6 +28,10 @@ INCLUDES = 								\
 	$(XINE_CFLAGS) 							\
 	$(TOTEM_PL_PARSER_CFLAGS)					
 
+if HAVE_STREAMANALYZER
+INCLUDES += $(STREAMANALYZER_CFLAGS) -DHAVE_STREAMANALYZER
+endif
+
 modules_LTLIBRARIES = 							\
 	libextract-abw.la 						\
 	libextract-mp3.la				 		\
@@ -236,7 +240,13 @@ tracker_extract_LDADD = 						\
 	$(GMODULE_LIBS)							\
 	$(GTHREAD_LIBS)							\
 	$(GCOV_LIBS)							\
-	$(GLIB2_LIBS)
+	$(GLIB2_LIBS)							\
+	$(STREAMANALYZER_LIBS)
+
+if HAVE_STREAMANALYZER
+tracker_extract_SOURCES += tracker-topanalyzer.cpp tracker-topanalyzer.h
+tracker_extract_LDADD += $(STREAMANALYZER_LIBS)
+endif
 
 dbus_sources = 								\
 	tracker-extract-glue.h
diff --git a/src/tracker-extract/tracker-extract.c b/src/tracker-extract/tracker-extract.c
index e588817..d380784 100644
--- a/src/tracker-extract/tracker-extract.c
+++ b/src/tracker-extract/tracker-extract.c
@@ -32,6 +32,10 @@
 #include "tracker-dbus.h"
 #include "tracker-extract.h"
 
+#ifdef HAVE_STREAMANALYZER
+#include "tracker-topanalyzer.h"
+#endif
+
 #define MAX_EXTRACT_TIME 5
 #define TRACKER_EXTRACT_GET_PRIVATE(obj) (G_TYPE_INSTANCE_GET_PRIVATE ((obj), TRACKER_TYPE_EXTRACT, TrackerExtractPrivate))
 
@@ -58,6 +62,9 @@ tracker_extract_class_init (TrackerExtractClass *klass)
 static void
 tracker_extract_init (TrackerExtract *object)
 {
+#ifdef HAVE_STREAMANALYZER
+	tracker_topanalyzer_init ();
+#endif
 }
 
 static void
@@ -67,6 +74,10 @@ tracker_extract_finalize (GObject *object)
 
 	priv = TRACKER_EXTRACT_GET_PRIVATE (object);
 
+#ifdef HAVE_STREAMANALYZER
+	tracker_topanalyzer_shutdown ();
+#endif
+
 	g_array_free (priv->extractors, TRUE);
 
 	G_OBJECT_CLASS (tracker_extract_parent_class)->finalize (object);
@@ -220,6 +231,10 @@ get_file_metadata (TrackerExtract *extract,
 	/* Create hash table to send back */
 	statements = g_ptr_array_new ();
 
+#ifdef HAVE_STREAMANALYZER
+	tracker_topanalyzer_extract (uri, statements, &content_type);
+#endif
+
 	if ((!mime || mime[0]=='\0') && content_type)
 		mime = content_type;
 
diff --git a/src/tracker-extract/tracker-topanalyzer.cpp b/src/tracker-extract/tracker-topanalyzer.cpp
new file mode 100644
index 0000000..fae01cc
--- /dev/null
+++ b/src/tracker-extract/tracker-topanalyzer.cpp
@@ -0,0 +1,339 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2008, Nokia
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ *
+  * Authors: Philip Van Hoof <philip codeminded be>
+ */
+
+#include <glib.h>
+#include <glib/gstdio.h>
+
+#include <strigi/indexwriter.h>
+#include <strigi/analysisresult.h>
+#include <strigi/analyzerconfiguration.h>
+#include <strigi/fileinputstream.h>
+
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <map>
+#include <sstream>
+#include <algorithm>
+
+#include <libtracker-common/tracker-type-utils.h>
+#include <libtracker-common/tracker-os-dependant.h>
+#include <libtracker-common/tracker-statement-list.h>
+
+#include <libtracker-common/tracker-ontology.h>
+
+#define NIE_PREFIX TRACKER_NIE_PREFIX
+
+#include "tracker-main.h"
+#include "tracker-topanalyzer.h"
+
+using namespace std;
+using namespace Strigi;
+
+static GStaticPrivate private_key = G_STATIC_PRIVATE_INIT;
+
+namespace Tracker {
+	class TripleCollector : public Strigi::IndexWriter
+	{
+	public:
+		TripleCollector ();
+		~TripleCollector ();
+
+		void commit();
+		void deleteEntries( const std::vector<std::string>& entries );
+		void deleteAllEntries();
+		void initWriterData( const Strigi::FieldRegister& );
+		void releaseWriterData( const Strigi::FieldRegister& );
+		void startAnalysis( const AnalysisResult* );
+		void addText( const AnalysisResult*, const char* text, int32_t length );
+		void addValue( const AnalysisResult*, const RegisteredField* field,
+			       const std::string& value );
+		void addValue( const AnalysisResult*, const RegisteredField* field,
+			       const unsigned char* data, uint32_t size );
+		void addValue( const AnalysisResult*, const RegisteredField* field,
+			       int32_t value );
+		void addValue( const AnalysisResult*, const RegisteredField* field,
+			       uint32_t value );
+		void addValue( const AnalysisResult*, const RegisteredField* field,
+			       double value );
+		void addTriplet( const std::string& subject,
+				 const std::string& predicate, const std::string& object );
+		void addValue( const AnalysisResult*, const RegisteredField* field,
+			       const std::string& name, const std::string& value );
+		void finishAnalysis( const AnalysisResult* );
+		void setParams (const gchar *uri_, GPtrArray *metadata_);
+
+		gchar *content_type;
+
+	private:
+		const gchar* PredicateMapping (const RegisteredField *field);
+		const gchar* PredicateMapping (const std::string &key);
+
+		const gchar *uri;
+		GPtrArray *metadata;
+	};
+
+	Tracker::TripleCollector::TripleCollector () 
+	{ 
+		content_type = NULL;
+	}
+
+	void Tracker::TripleCollector::setParams (const gchar *uri_, GPtrArray *metadata_)
+	{
+		uri = uri_;
+		metadata = metadata_;
+		if (content_type)
+			g_free (content_type);
+		content_type = NULL;
+	}
+
+	Tracker::TripleCollector::~TripleCollector ()
+	{
+		if (content_type)
+			g_free (content_type);
+	}
+
+	void Tracker::TripleCollector::commit () { }
+	void Tracker::TripleCollector::deleteEntries (const std::vector<std::string>& entries ) { }
+	void Tracker::TripleCollector::deleteAllEntries () { }
+	void Tracker::TripleCollector::initWriterData (const Strigi::FieldRegister&) { }
+	void Tracker::TripleCollector::releaseWriterData (const Strigi::FieldRegister&) { }
+	void Tracker::TripleCollector::startAnalysis (const AnalysisResult* idx) { }
+
+	void Tracker::TripleCollector::addText (const AnalysisResult* idx, 
+	                                        const char* text, 
+	                                        int32_t length)
+	{
+		tracker_statement_list_insert (metadata, idx->path().c_str(),
+		                          NIE_PREFIX "plainTextContent",
+		                          text);
+	}
+
+	const gchar* Tracker::TripleCollector::PredicateMapping (const std::string &key)
+	{
+		return (const gchar *) key.c_str();
+	}
+
+	const gchar* Tracker::TripleCollector::PredicateMapping (const RegisteredField *field)
+	{
+		return (const gchar *) field->key().c_str();
+	}
+
+	/* The methods below basically just convert the C++ world to the C world
+	 * of tracker_statement_list_insert. Nothing magical about it. */
+
+	void Tracker::TripleCollector::addValue (const AnalysisResult* idx, 
+	                                         const RegisteredField* field,
+	                                         const std::string& value)
+	{
+		if (field->key() == FieldRegister::mimetypeFieldName && idx->depth() == 0) {
+			if (content_type)
+				g_free (content_type);
+			content_type = g_strdup (value.c_str());
+		}
+
+		tracker_statement_list_insert (metadata, idx->path().c_str(), 
+		                          PredicateMapping (field),
+		                          value.c_str());
+	}
+
+	void Tracker::TripleCollector::addValue (const AnalysisResult* idx, 
+	                                         const RegisteredField* field,
+	                                         const unsigned char* data, 
+	                                         uint32_t size )
+	{
+		tracker_statement_list_insert (metadata, idx->path().c_str(),
+		                          PredicateMapping (field),
+		                          (const gchar*) data);
+	}
+
+	void Tracker::TripleCollector::addValue (const AnalysisResult* idx, 
+	                                         const RegisteredField* field,
+	                                         int32_t value)
+	{
+		tracker_statement_list_insert_with_int (metadata, idx->path().c_str(), 
+		                                   PredicateMapping (field),
+		                                   (gint) value);
+	}
+
+	void Tracker::TripleCollector::addValue (const AnalysisResult* idx, 
+	                                         const RegisteredField* field,
+	                                         uint32_t value ) 
+	{
+		tracker_statement_list_insert_with_int (metadata, idx->path().c_str(),
+		                                   PredicateMapping (field),
+		                                   (gint) value);
+	}
+
+	void Tracker::TripleCollector::addValue (const AnalysisResult* idx, 
+	                                         const RegisteredField* field,
+	                                         double value ) 
+	{
+		tracker_statement_list_insert_with_double (metadata, idx->path().c_str(), 
+		                                      PredicateMapping (field),
+		                                      (gdouble) value);
+	}
+
+	void Tracker::TripleCollector::addTriplet (const std::string& subject,
+	                                           const std::string& predicate, 
+	                                           const std::string& object ) 
+	{
+		tracker_statement_list_insert (metadata, subject.c_str(), 
+		                          PredicateMapping (predicate),
+		                          object.c_str());
+	}
+
+	void Tracker::TripleCollector::addValue (const AnalysisResult* idx, 
+	                                         const RegisteredField* field,
+	                                         const std::string& name, 
+	                                         const std::string& value )
+	{
+		if (field->key() == FieldRegister::mimetypeFieldName && idx->depth() == 0) {
+			if (content_type)
+				g_free (content_type);
+			content_type = g_strdup (value.c_str());
+		}
+
+		tracker_statement_list_insert (metadata, idx->path().c_str(),
+		                          PredicateMapping (name),
+		                          value.c_str());
+	}
+
+	void Tracker::TripleCollector::finishAnalysis (const AnalysisResult* ) { }
+}
+
+typedef struct {
+	Strigi::AnalyzerConfiguration *mconfig;
+	Strigi::StreamAnalyzer *streamindexer;
+	Tracker::TripleCollector *m_writer;
+} TrackerTopanalyzerPrivate;
+
+
+static void
+private_free (gpointer data)
+{
+	TrackerTopanalyzerPrivate *priv = (TrackerTopanalyzerPrivate *) data;
+
+	delete priv->mconfig;
+	delete priv->streamindexer;
+	delete priv->m_writer;
+
+	g_free (priv);
+}
+
+void
+tracker_topanalyzer_init (void)
+{
+	TrackerTopanalyzerPrivate *priv;
+
+	/* For added granularity of what analyzer should be elected for which
+	 * filetype or file, you can inherit a Strigi::AnalyzerConfiguration
+	 * and have some tuning this way. */
+
+	FieldRegister::FieldRegister();
+
+	priv = g_new0 (TrackerTopanalyzerPrivate, 1);
+
+	priv->mconfig = new Strigi::AnalyzerConfiguration ();
+	priv->streamindexer = new Strigi::StreamAnalyzer (*priv->mconfig);
+	priv->m_writer = new Tracker::TripleCollector ();
+
+	priv->streamindexer->setIndexWriter (*priv->m_writer);
+
+	g_static_private_set (&private_key,
+			      priv,
+			      private_free);
+}
+
+void
+tracker_topanalyzer_shutdown (void)
+{
+	g_static_private_set (&private_key, NULL, NULL);
+}
+
+void
+tracker_topanalyzer_extract (const gchar *uri, GPtrArray  *metadata, gchar **content_type)
+{
+	gchar *filename;
+	TrackerTopanalyzerPrivate *priv;
+
+	priv = (TrackerTopanalyzerPrivate *) g_static_private_get (&private_key);
+	g_return_if_fail (priv != NULL);
+
+	/* We need the filename from the URI because we'll use stat() and because
+	 * in this experiment I used FileInputStream. But any kind of stream could
+	 * work with StreamAnalyzer's analyzers. */
+
+	filename = g_filename_from_uri (uri, NULL, NULL);
+
+	if (filename) {
+		struct stat s;
+
+		/* We use our own strategy as writer. Our writer writes to the @metadata
+		 * array. I decided to call it a collector because that's what its 
+		 * implementation does (collecting triples) */
+
+		priv->m_writer->setParams (uri, metadata);
+		stat(filename, &s);
+
+		/* The first parameter that we pass here will influence what 
+		 * idx->path() will be above. StreamAnalyzer only ever appends
+		 * path chunks to this initial stringvalue. So if we pass
+		 * our://URI then idx->path will end up being: 
+		 *
+		 * our://URI
+		 * our://URI/child
+		 * our://URI/child/child.
+		 *
+		 * For example the URI of a tar.gz will go like this:
+		 *
+		 * file:///path/to/my.tar.gz
+		 * file:///path/to.my.tar.gz/dir_in_tar/file1.txt
+		 * file:///path/to.my.tar.gz/dir_in_tar/file2.txt
+		 *
+		 * The URI passed here doesn't mean the stream passed later must
+		 * not really resemble the URI. Usually it will of course.
+		 */
+
+		AnalysisResult analysisresult (uri, s.st_mtime, *priv->m_writer, 
+		                               *priv->streamindexer);
+
+		/* If we want a remote stream, then we implement a Stream in C++
+		 * for it and use that instead of FileInputStream. We could for
+		 * example make a C++ wrapper for GInputStream and enjoy using
+		 * GIO and GNIO here that way. */
+
+		FileInputStream resource (filename);
+
+		if (resource.status() == Ok) {
+			analysisresult.index(&resource);
+
+			if (content_type && priv->m_writer->content_type)
+				*content_type = g_strdup (priv->m_writer->content_type);
+		}
+
+		g_free (filename);
+	}
+}
+
diff --git a/src/tracker-extract/tracker-topanalyzer.h b/src/tracker-extract/tracker-topanalyzer.h
new file mode 100644
index 0000000..7e16acb
--- /dev/null
+++ b/src/tracker-extract/tracker-topanalyzer.h
@@ -0,0 +1,38 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2008, Nokia
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ *
+  * Authors: Philip Van Hoof <philip codeminded be>
+ */
+
+#ifndef _TRACKER_TOPANALYZER_H_
+#define _TRACKER_TOPANALYZER_H_
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+void tracker_topanalyzer_init     (void);
+void tracker_topanalyzer_extract  (const gchar *uri, 
+                                   GPtrArray *metadata, 
+                                   gchar **content_type);
+void tracker_topanalyzer_shutdown (void);
+
+G_END_DECLS
+
+#endif



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]