[tracker] Implementation for the streamanalyzer extractor
- From: Jürg Billeter <juergbi src gnome org>
- To: svn-commits-list gnome org
- Subject: [tracker] Implementation for the streamanalyzer extractor
- Date: Thu, 16 Apr 2009 11:27:45 -0400 (EDT)
commit 03f8dc5883172bde64aefa5346a254d562b03d09
Author: Philip Van Hoof <philip codeminded be>
Date: Thu Apr 9 18:36:58 2009 +0200
Implementation for the streamanalyzer extractor
---
configure.ac | 4 +
src/tracker-extract/Makefile.am | 12 +-
src/tracker-extract/tracker-extract.c | 15 ++
src/tracker-extract/tracker-topanalyzer.cpp | 339 +++++++++++++++++++++++++++
src/tracker-extract/tracker-topanalyzer.h | 38 +++
5 files changed, 407 insertions(+), 1 deletions(-)
diff --git a/configure.ac b/configure.ac
index dc80edd..99c4489 100644
--- a/configure.ac
+++ b/configure.ac
@@ -84,6 +84,7 @@ AC_SUBST(LT_CURRENT_MINUS_AGE)
# Checks for programs.
AC_PROG_CC
+AC_PROG_CXX
AC_PROG_LN_S
AC_PROG_INSTALL
AC_PROG_MAKE_SET
@@ -148,6 +149,8 @@ EXEMPI_REQUIRED=1.99.2
HILDON_THUMBNAIL_REQUIRED=3.0.10
EVO_REQUIRED=2.25.5
EDS_REQUIRED=2.25.5
+# Deliberately unsatisfiable version for now: Nepomuk integration isn't finished in streamanalyzer yet
+STREAMANALYZER_REQUIRED=9.9.9
LIBGNOME_DESKTOP_REQUIRED=2.9.91
LIBGNOME_REQUIRED=2.13.2
LIBGNOMEUI_REQUIRED=2.13.7
@@ -1475,6 +1478,7 @@ Applications:
Metadata Extractors:
+ Support StreamAnalyzer: $have_streamanalyzer
Support PNG: yes
Support PDF: $have_poppler_glib
Support JPEG: $have_libjpeg (xmp: $have_exempi, exif: $have_libexif, iptc: $have_libiptcdata)
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 2919dc0..653d701 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -28,6 +28,10 @@ INCLUDES = \
$(XINE_CFLAGS) \
$(TOTEM_PL_PARSER_CFLAGS)
+if HAVE_STREAMANALYZER
+INCLUDES += $(STREAMANALYZER_CFLAGS) -DHAVE_STREAMANALYZER
+endif
+
modules_LTLIBRARIES = \
libextract-abw.la \
libextract-mp3.la \
@@ -236,7 +240,13 @@ tracker_extract_LDADD = \
$(GMODULE_LIBS) \
$(GTHREAD_LIBS) \
$(GCOV_LIBS) \
- $(GLIB2_LIBS)
+ $(GLIB2_LIBS) \
+ $(STREAMANALYZER_LIBS)
+
+if HAVE_STREAMANALYZER
+# $(STREAMANALYZER_LIBS) is already part of tracker_extract_LDADD above, so
+# only the extra sources are added here; appending it again would put the
+# library on the link line twice.
+tracker_extract_SOURCES += tracker-topanalyzer.cpp tracker-topanalyzer.h
+endif
dbus_sources = \
tracker-extract-glue.h
diff --git a/src/tracker-extract/tracker-extract.c b/src/tracker-extract/tracker-extract.c
index e588817..d380784 100644
--- a/src/tracker-extract/tracker-extract.c
+++ b/src/tracker-extract/tracker-extract.c
@@ -32,6 +32,10 @@
#include "tracker-dbus.h"
#include "tracker-extract.h"
+#ifdef HAVE_STREAMANALYZER
+#include "tracker-topanalyzer.h"
+#endif
+
#define MAX_EXTRACT_TIME 5
#define TRACKER_EXTRACT_GET_PRIVATE(obj) (G_TYPE_INSTANCE_GET_PRIVATE ((obj), TRACKER_TYPE_EXTRACT, TrackerExtractPrivate))
@@ -58,6 +62,9 @@ tracker_extract_class_init (TrackerExtractClass *klass)
static void
tracker_extract_init (TrackerExtract *object)
{
+#ifdef HAVE_STREAMANALYZER
+ tracker_topanalyzer_init ();
+#endif
}
static void
@@ -67,6 +74,10 @@ tracker_extract_finalize (GObject *object)
priv = TRACKER_EXTRACT_GET_PRIVATE (object);
+#ifdef HAVE_STREAMANALYZER
+ tracker_topanalyzer_shutdown ();
+#endif
+
g_array_free (priv->extractors, TRUE);
G_OBJECT_CLASS (tracker_extract_parent_class)->finalize (object);
@@ -220,6 +231,10 @@ get_file_metadata (TrackerExtract *extract,
/* Create hash table to send back */
statements = g_ptr_array_new ();
+#ifdef HAVE_STREAMANALYZER
+ tracker_topanalyzer_extract (uri, statements, &content_type);
+#endif
+
if ((!mime || mime[0]=='\0') && content_type)
mime = content_type;
diff --git a/src/tracker-extract/tracker-topanalyzer.cpp b/src/tracker-extract/tracker-topanalyzer.cpp
new file mode 100644
index 0000000..fae01cc
--- /dev/null
+++ b/src/tracker-extract/tracker-topanalyzer.cpp
@@ -0,0 +1,339 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2008, Nokia
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ * Authors: Philip Van Hoof <philip codeminded be>
+ */
+
+#include <glib.h>
+#include <glib/gstdio.h>
+
+#include <strigi/indexwriter.h>
+#include <strigi/analysisresult.h>
+#include <strigi/analyzerconfiguration.h>
+#include <strigi/fileinputstream.h>
+
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <map>
+#include <sstream>
+#include <algorithm>
+
+#include <libtracker-common/tracker-type-utils.h>
+#include <libtracker-common/tracker-os-dependant.h>
+#include <libtracker-common/tracker-statement-list.h>
+
+#include <libtracker-common/tracker-ontology.h>
+
+#define NIE_PREFIX TRACKER_NIE_PREFIX
+
+#include "tracker-main.h"
+#include "tracker-topanalyzer.h"
+
+using namespace std;
+using namespace Strigi;
+
+static GStaticPrivate private_key = G_STATIC_PRIVATE_INIT;
+
+namespace Tracker {
+ /* Strigi::IndexWriter implementation that, instead of writing to an index,
+  * appends what the analyzers report as statements to a GPtrArray through
+  * the tracker_statement_list_insert* helpers. The target array and URI are
+  * supplied with setParams() before each analysis run. */
+ class TripleCollector : public Strigi::IndexWriter
+ {
+ public:
+ TripleCollector ();
+ ~TripleCollector ();
+
+ /* IndexWriter callbacks that this collector implements as no-ops */
+ void commit();
+ void deleteEntries( const std::vector<std::string>& entries );
+ void deleteAllEntries();
+ void initWriterData( const Strigi::FieldRegister& );
+ void releaseWriterData( const Strigi::FieldRegister& );
+ void startAnalysis( const AnalysisResult* );
+ /* Callbacks that turn analyzer output into statements on @metadata */
+ void addText( const AnalysisResult*, const char* text, int32_t length );
+ void addValue( const AnalysisResult*, const RegisteredField* field,
+ const std::string& value );
+ void addValue( const AnalysisResult*, const RegisteredField* field,
+ const unsigned char* data, uint32_t size );
+ void addValue( const AnalysisResult*, const RegisteredField* field,
+ int32_t value );
+ void addValue( const AnalysisResult*, const RegisteredField* field,
+ uint32_t value );
+ void addValue( const AnalysisResult*, const RegisteredField* field,
+ double value );
+ void addTriplet( const std::string& subject,
+ const std::string& predicate, const std::string& object );
+ void addValue( const AnalysisResult*, const RegisteredField* field,
+ const std::string& name, const std::string& value );
+ void finishAnalysis( const AnalysisResult* );
+ /* Selects the URI and output array for the next run and resets
+  * content_type. Both pointers are stored as-is, not copied. */
+ void setParams (const gchar *uri_, GPtrArray *metadata_);
+
+ /* MIME type reported for the depth-0 resource, or NULL if none was
+  * reported. Owned by this object and released with g_free(). */
+ gchar *content_type;
+
+ private:
+ /* Map a Strigi field (or a raw key) to the predicate string that is
+  * stored in the statement; currently the key is used unchanged. */
+ const gchar* PredicateMapping (const RegisteredField *field);
+ const gchar* PredicateMapping (const std::string &key);
+
+ /* Borrowed pointers set by setParams() */
+ const gchar *uri;
+ GPtrArray *metadata;
+ };
+
+ /* Creates a collector with no target yet. Every member is initialised so
+  * that a callback arriving before setParams() sees NULL pointers rather
+  * than indeterminate ones. */
+ Tracker::TripleCollector::TripleCollector ()
+ 	: content_type (NULL),
+ 	  uri (NULL),
+ 	  metadata (NULL)
+ {
+ }
+
+ /* Prepares the collector for a new analysis run: forgets the content type
+  * of the previous run and remembers where statements must go. The two
+  * pointers are stored as-is, not copied. */
+ void Tracker::TripleCollector::setParams (const gchar *uri_, GPtrArray *metadata_)
+ {
+ 	/* g_free() accepts NULL, so no guard is needed */
+ 	g_free (content_type);
+ 	content_type = NULL;
+
+ 	metadata = metadata_;
+ 	uri = uri_;
+ }
+
+ /* Releases the content type string, the only resource this object owns. */
+ Tracker::TripleCollector::~TripleCollector ()
+ {
+ 	/* g_free() accepts NULL, so no guard is needed */
+ 	g_free (content_type);
+ }
+
+ /* IndexWriter callbacks the collector has no use for: intentionally empty. */
+ void Tracker::TripleCollector::commit ()
+ {
+ }
+
+ void Tracker::TripleCollector::deleteEntries (const std::vector<std::string>&)
+ {
+ }
+
+ void Tracker::TripleCollector::deleteAllEntries ()
+ {
+ }
+
+ void Tracker::TripleCollector::initWriterData (const Strigi::FieldRegister&)
+ {
+ }
+
+ void Tracker::TripleCollector::releaseWriterData (const Strigi::FieldRegister&)
+ {
+ }
+
+ void Tracker::TripleCollector::startAnalysis (const AnalysisResult*)
+ {
+ }
+
+ /* Stores a chunk of extracted plain text as a plainTextContent statement
+  * for the resource being analyzed.
+  *
+  * @text comes with an explicit @length, so it is copied into a
+  * NUL-terminated buffer first instead of being read as a C string, which
+  * could run past the end of the chunk. */
+ void Tracker::TripleCollector::addText (const AnalysisResult* idx,
+ 	const char* text,
+ 	int32_t length)
+ {
+ 	gchar *chunk;
+
+ 	if (!text || length < 0)
+ 		return;
+
+ 	chunk = g_strndup (text, (gsize) length);
+
+ 	/* NOTE(review): assumes tracker_statement_list_insert() copies the
+ 	 * value, as the other callers here (which pass c_str() of strings they
+ 	 * do not own) already rely on - confirm. */
+ 	tracker_statement_list_insert (metadata, idx->path().c_str(),
+ 		NIE_PREFIX "plainTextContent",
+ 		chunk);
+
+ 	g_free (chunk);
+ }
+
+ /* Predicate for a raw key: the key itself. The returned pointer belongs to
+  * @key and is only valid for as long as that string is. */
+ const gchar* Tracker::TripleCollector::PredicateMapping (const std::string &key)
+ {
+ 	const gchar *predicate = key.c_str();
+
+ 	return predicate;
+ }
+
+ /* Predicate for a registered Strigi field: the field's key, unchanged. */
+ const gchar* Tracker::TripleCollector::PredicateMapping (const RegisteredField *field)
+ {
+ 	const gchar *predicate = field->key().c_str();
+
+ 	return predicate;
+ }
+
+ /* The methods below basically just convert the C++ world to the C world
+ * of tracker_statement_list_insert. Nothing magical about it. */
+
+ /* String-valued field: remembers the MIME type when it is reported for
+  * the top-level (depth 0) resource, then records the value as a statement
+  * about the resource's path. */
+ void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
+ 	const RegisteredField* field,
+ 	const std::string& value)
+ {
+ 	const gchar *object = value.c_str();
+
+ 	if (field->key() == FieldRegister::mimetypeFieldName && idx->depth() == 0) {
+ 		/* Replace any earlier value; g_free() accepts NULL */
+ 		g_free (content_type);
+ 		content_type = g_strdup (object);
+ 	}
+
+ 	tracker_statement_list_insert (metadata,
+ 		idx->path().c_str(),
+ 		PredicateMapping (field),
+ 		object);
+ }
+
+ /* Binary-valued field: records up to @size bytes of @data as a statement
+  * about the resource's path.
+  *
+  * @data is a sized buffer, so it is copied into a NUL-terminated string
+  * first rather than cast to a C string, which would read past the buffer.
+  * The copy stops at the first embedded NUL byte. */
+ void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
+ 	const RegisteredField* field,
+ 	const unsigned char* data,
+ 	uint32_t size )
+ {
+ 	gchar *copy;
+
+ 	if (!data)
+ 		return;
+
+ 	copy = g_strndup ((const gchar*) data, (gsize) size);
+
+ 	/* NOTE(review): assumes tracker_statement_list_insert() copies the
+ 	 * value, as the other callers here already rely on - confirm. */
+ 	tracker_statement_list_insert (metadata, idx->path().c_str(),
+ 		PredicateMapping (field),
+ 		copy);
+
+ 	g_free (copy);
+ }
+
+ /* Signed integer field: recorded as an integer statement about the
+  * resource's path. */
+ void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
+ 	const RegisteredField* field,
+ 	int32_t value)
+ {
+ 	const gint number = (gint) value;
+
+ 	tracker_statement_list_insert_with_int (metadata,
+ 		idx->path().c_str(),
+ 		PredicateMapping (field),
+ 		number);
+ }
+
+ /* Unsigned integer field: recorded as an integer statement about the
+  * resource's path.
+  *
+  * NOTE(review): the value is narrowed to gint, so anything above G_MAXINT
+  * does not survive the conversion - confirm whether such values can occur. */
+ void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
+ 	const RegisteredField* field,
+ 	uint32_t value )
+ {
+ 	const gint number = (gint) value;
+
+ 	tracker_statement_list_insert_with_int (metadata,
+ 		idx->path().c_str(),
+ 		PredicateMapping (field),
+ 		number);
+ }
+
+ /* Floating point field: recorded as a double statement about the
+  * resource's path. */
+ void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
+ 	const RegisteredField* field,
+ 	double value )
+ {
+ 	const gdouble number = (gdouble) value;
+
+ 	tracker_statement_list_insert_with_double (metadata,
+ 		idx->path().c_str(),
+ 		PredicateMapping (field),
+ 		number);
+ }
+
+ /* Ready-made triple from an analyzer: stored as-is, with the predicate
+  * passed through PredicateMapping(). */
+ void Tracker::TripleCollector::addTriplet (const std::string& subject,
+ 	const std::string& predicate,
+ 	const std::string& object )
+ {
+ 	const gchar *s = subject.c_str();
+ 	const gchar *p = PredicateMapping (predicate);
+ 	const gchar *o = object.c_str();
+
+ 	tracker_statement_list_insert (metadata, s, p, o);
+ }
+
+ /* Named value within a field: remembers the MIME type when @field is the
+  * mimetype field of the top-level (depth 0) resource, then records the
+  * value using @name (not the field key) as the predicate. */
+ void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
+ 	const RegisteredField* field,
+ 	const std::string& name,
+ 	const std::string& value )
+ {
+ 	const gchar *object = value.c_str();
+
+ 	if (field->key() == FieldRegister::mimetypeFieldName && idx->depth() == 0) {
+ 		/* Replace any earlier value; g_free() accepts NULL */
+ 		g_free (content_type);
+ 		content_type = g_strdup (object);
+ 	}
+
+ 	tracker_statement_list_insert (metadata,
+ 		idx->path().c_str(),
+ 		PredicateMapping (name),
+ 		object);
+ }
+
+ /* Nothing to flush: statements are added to the array as they arrive. */
+ void Tracker::TripleCollector::finishAnalysis (const AnalysisResult* ) { }
+}
+
+/* Analyzer state stored in the private_key GStaticPrivate slot. All three
+ * objects are created in tracker_topanalyzer_init() and destroyed in
+ * private_free(). */
+typedef struct {
+ Strigi::AnalyzerConfiguration *mconfig; /* configuration streamindexer is built from */
+ Strigi::StreamAnalyzer *streamindexer; /* analyzer; uses m_writer as its index writer */
+ Tracker::TripleCollector *m_writer; /* collects the analyzer output as statements */
+} TrackerTopanalyzerPrivate;
+
+
+/* Destroy notify for the data stored under private_key: deletes the Strigi
+ * objects and then the structure itself. Accepts NULL. */
+static void
+private_free (gpointer data)
+{
+	TrackerTopanalyzerPrivate *priv = (TrackerTopanalyzerPrivate *) data;
+
+	if (!priv)
+		return;
+
+	/* Dependents first: streamindexer was constructed from *mconfig and
+	 * was handed *m_writer, so it has to go before either of them. */
+	delete priv->streamindexer;
+	delete priv->m_writer;
+	delete priv->mconfig;
+
+	g_free (priv);
+}
+
+/* Creates the Strigi configuration, analyzer and triple collector, wires the
+ * collector in as the analyzer's index writer and stores the lot under
+ * private_key, with private_free() registered to destroy it. */
+void
+tracker_topanalyzer_init (void)
+{
+	TrackerTopanalyzerPrivate *priv;
+
+	/* For added granularity of what analyzer should be elected for which
+	 * filetype or file, you can inherit a Strigi::AnalyzerConfiguration
+	 * and have some tuning this way. */
+
+	/* Construct and discard a FieldRegister. This was previously spelled
+	 * "FieldRegister::FieldRegister ();", which names the constructor
+	 * directly and is not valid C++; a function-style cast creates the
+	 * same temporary.
+	 * NOTE(review): presumably here to force Strigi's field names to be
+	 * set up before use - confirm it is needed at all. */
+	(void) FieldRegister ();
+
+	priv = g_new0 (TrackerTopanalyzerPrivate, 1);
+
+	priv->mconfig = new Strigi::AnalyzerConfiguration ();
+	priv->streamindexer = new Strigi::StreamAnalyzer (*priv->mconfig);
+	priv->m_writer = new Tracker::TripleCollector ();
+
+	priv->streamindexer->setIndexWriter (*priv->m_writer);
+
+	g_static_private_set (&private_key,
+			      priv,
+			      private_free);
+}
+
+/* Clears the private_key slot. Per the GLib documentation, replacing the
+ * value makes GLib call the destroy notify registered for the previous one,
+ * i.e. private_free() for the state created by tracker_topanalyzer_init().
+ * NOTE(review): GStaticPrivate data is per-thread, so presumably this has to
+ * run on the thread that called tracker_topanalyzer_init() - confirm. */
+void
+tracker_topanalyzer_shutdown (void)
+{
+ g_static_private_set (&private_key, NULL, NULL);
+}
+
+/* Runs Strigi's stream analyzers over the local file behind @uri and
+ * appends the resulting statements to @metadata.
+ *
+ * @uri:          URI of the file; must be convertible to a local filename
+ * @metadata:     array the statements are appended to
+ * @content_type: if non-NULL and a MIME type was detected for the top-level
+ *                resource, receives a newly allocated copy of it (free with
+ *                g_free()); left untouched otherwise
+ *
+ * Does nothing if tracker_topanalyzer_init() has not set up the analyzer
+ * state, if @uri has no local filename, or if the file cannot be stat()ed
+ * or opened. */
+void
+tracker_topanalyzer_extract (const gchar *uri, GPtrArray *metadata, gchar **content_type)
+{
+	gchar *filename;
+	struct stat s;
+	TrackerTopanalyzerPrivate *priv;
+
+	priv = (TrackerTopanalyzerPrivate *) g_static_private_get (&private_key);
+	g_return_if_fail (priv != NULL);
+
+	/* We need the filename from the URI because we'll use stat() and because
+	 * in this experiment I used FileInputStream. But any kind of stream could
+	 * work with StreamAnalyzer's analyzers. */
+
+	filename = g_filename_from_uri (uri, NULL, NULL);
+
+	if (!filename)
+		return;
+
+	/* Without a successful stat() the modification time passed to the
+	 * analyzer below would be indeterminate, so give up on this file. */
+	if (stat (filename, &s) != 0) {
+		g_free (filename);
+		return;
+	}
+
+	/* We use our own strategy as writer. Our writer writes to the @metadata
+	 * array. I decided to call it a collector because that's what its
+	 * implementation does (collecting triples) */
+
+	priv->m_writer->setParams (uri, metadata);
+
+	/* Inner scope: the Strigi objects are destroyed before filename is
+	 * released. */
+	{
+		/* The first parameter that we pass here will influence what
+		 * idx->path() will be in the collector. StreamAnalyzer only
+		 * ever appends path chunks to this initial string value. So
+		 * if we pass our://URI then idx->path will end up being:
+		 *
+		 * our://URI
+		 * our://URI/child
+		 * our://URI/child/child.
+		 *
+		 * For example the URI of a tar.gz will go like this:
+		 *
+		 * file:///path/to/my.tar.gz
+		 * file:///path/to/my.tar.gz/dir_in_tar/file1.txt
+		 * file:///path/to/my.tar.gz/dir_in_tar/file2.txt
+		 *
+		 * The URI passed here does not have to match the stream that
+		 * is analyzed later, although it usually will.
+		 */
+
+		AnalysisResult analysisresult (uri, s.st_mtime, *priv->m_writer,
+					       *priv->streamindexer);
+
+		/* If we want a remote stream, then we implement a Stream in C++
+		 * for it and use that instead of FileInputStream. We could for
+		 * example make a C++ wrapper for GInputStream and enjoy using
+		 * GIO and GNIO here that way. */
+
+		FileInputStream resource (filename);
+
+		if (resource.status() == Ok) {
+			analysisresult.index (&resource);
+
+			if (content_type && priv->m_writer->content_type)
+				*content_type = g_strdup (priv->m_writer->content_type);
+		}
+	}
+
+	g_free (filename);
+}
+
diff --git a/src/tracker-extract/tracker-topanalyzer.h b/src/tracker-extract/tracker-topanalyzer.h
new file mode 100644
index 0000000..7e16acb
--- /dev/null
+++ b/src/tracker-extract/tracker-topanalyzer.h
@@ -0,0 +1,38 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2008, Nokia
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ * Authors: Philip Van Hoof <philip codeminded be>
+ */
+
+#ifndef _TRACKER_TOPANALYZER_H_
+#define _TRACKER_TOPANALYZER_H_
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+/* Sets up the analyzer state used by tracker_topanalyzer_extract(). */
+void tracker_topanalyzer_init (void);
+/* Analyzes the file behind @uri and appends the resulting statements to
+ * @metadata. If @content_type is non-NULL and a MIME type was detected, it
+ * receives a newly allocated copy of that MIME type. */
+void tracker_topanalyzer_extract (const gchar *uri,
+ GPtrArray *metadata,
+ gchar **content_type);
+/* Releases the state created by tracker_topanalyzer_init(). */
+void tracker_topanalyzer_shutdown (void);
+
+G_END_DECLS
+
+#endif
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]