[tracker/miner-userguide: 3/15] tracker-miner-fs: Added userguide extraction code
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/miner-userguide: 3/15] tracker-miner-fs: Added userguide extraction code
- Date: Thu, 24 Nov 2011 16:43:24 +0000 (UTC)
commit 270948d2995058f78d9260c7beaea7eb21437a5e
Author: Martyn Russell <martyn lanedo com>
Date: Tue Nov 22 16:59:27 2011 +0000
tracker-miner-fs: Added userguide extraction code
Mostly works with the exception of a pipe leak by the looks of it.
configure.ac | 7 +
src/miners/fs/Makefile.am | 14 +-
src/miners/fs/tracker-miner-userguides.c | 283 ++++++++++++++++++++++++++++--
3 files changed, 281 insertions(+), 23 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 180289e..d31a005 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1418,6 +1418,13 @@ AC_ARG_ENABLE(libxml2,
[disable HTML/XML extractors [[default=yes]]]),,
[enable_libxml2=yes])
+# Force libxml requirement for HAVE_MAEMO,
+# given the miner-fs needs it for the userguide extraction.
+if test "x$enable_maemo" = "xyes" ; then
+ enable_libxml2="yes"
+fi
+
+
if test "x$enable_libxml2" != "xno"; then
PKG_CHECK_MODULES(LIBXML2,
[libxml-2.0 >= $LIBXML2_REQUIRED],
diff --git a/src/miners/fs/Makefile.am b/src/miners/fs/Makefile.am
index c49560e..75ff702 100644
--- a/src/miners/fs/Makefile.am
+++ b/src/miners/fs/Makefile.am
@@ -47,12 +47,6 @@ tracker_miner_fs_SOURCES = \
tracker-writeback.c \
tracker-writeback.h
-if HAVE_MAEMO
-tracker_miner_fs_SOURCES += \
- tracker-miner-userguides.c \
- tracker-mienr-userguides.h
-endif
-
tracker_miner_fs_LDADD = \
$(top_builddir)/src/libtracker-miner/libtracker-miner-@TRACKER_API_VERSION@.la \
$(top_builddir)/src/libtracker-sparql-backend/libtracker-sparql-@TRACKER_API_VERSION@.la \
@@ -62,6 +56,14 @@ tracker_miner_fs_LDADD = \
$(BUILD_LIBS) \
$(TRACKER_MINER_FS_LIBS)
+if HAVE_MAEMO
+tracker_miner_fs_SOURCES += \
+ tracker-miner-userguides.c \
+ tracker-miner-userguides.h
+tracker_miner_fs_CFLAGS = $(LIBXML2_CFLAGS)
+tracker_miner_fs_LDADD += $(LIBXML2_LIBS)
+endif
+
if HAVE_MEEGOTOUCH
tracker_miner_fs_SOURCES += \
tracker-miner-applications-meego.cpp \
diff --git a/src/miners/fs/tracker-miner-userguides.c b/src/miners/fs/tracker-miner-userguides.c
index 14548b5..06b960f 100644
--- a/src/miners/fs/tracker-miner-userguides.c
+++ b/src/miners/fs/tracker-miner-userguides.c
@@ -21,6 +21,8 @@
#include "config.h"
+#include <libxml/HTMLparser.h>
+
#include <libtracker-common/tracker-utils.h>
#include <libtracker-common/tracker-ontologies.h>
#include <libtracker-common/tracker-locale.h>
@@ -30,6 +32,28 @@
/* FIXME: Should we rename this to just -locale not -applications-locale ? */
#include "tracker-miner-applications-locale.h"
+// FIXME: get this value from tracker conf
+#define MAX_EXTRACT_SIZE 1024 * 1024 // 1 MiB
+#define MAX_TITLE_LENGTH 1000
+
+typedef struct {
+ gchar *uri;
+ GString *title;
+ gboolean in_text;
+ gboolean in_title;
+ GString *plain_text;
+ gssize max_length;
+} ParserContext;
+
+typedef struct {
+ TrackerMinerFS *miner;
+ GFile *file;
+ TrackerSparqlBuilder *sparql;
+ GCancellable *cancellable;
+ GKeyFile *key_file;
+ gchar *type;
+} ProcessUserguideData;
+
static void miner_userguides_initable_iface_init (GInitableIface *iface);
static gboolean miner_userguides_initable_init (GInitable *initable,
GCancellable *cancellable,
@@ -49,21 +73,13 @@ static gboolean miner_userguides_process_file_attributes (TrackerMinerFS *
static gboolean miner_userguides_monitor_directory (TrackerMinerFS *fs,
GFile *file);
static void miner_userguides_finalize (GObject *object);
-
+static void parser_get_file_content (const gchar *uri,
+ gssize max_extract_size,
+ gchar **content,
+ gchar **title);
static GQuark miner_userguides_error_quark = 0;
-typedef struct ProcessUserguideData ProcessUserguideData;
-
-struct ProcessUserguideData {
- TrackerMinerFS *miner;
- GFile *file;
- TrackerSparqlBuilder *sparql;
- GCancellable *cancellable;
- GKeyFile *key_file;
- gchar *type;
-};
-
static GInitableIface* miner_userguides_initable_parent_iface;
G_DEFINE_TYPE_WITH_CODE (TrackerMinerUserguides, tracker_miner_userguides, TRACKER_TYPE_MINER_FS,
@@ -218,6 +234,10 @@ miner_userguides_check_file (TrackerMinerFS *fs,
retval = TRUE;
}
+ g_debug ("Checking FILE '%s', returning %s", basename, retval ? "TRUE" : "FALSE");
+
+ /* FIXME: Do we check the mime type is 'application/x-userguide-html' */
+
g_free (basename);
return retval;
@@ -241,6 +261,8 @@ miner_userguides_check_directory (TrackerMinerFS *fs,
retval = FALSE;
}
+ g_debug ("Checking DIR '%s', returning %s", basename, retval ? "TRUE" : "FALSE");
+
g_free (basename);
return retval;
@@ -306,7 +328,87 @@ process_userguide_file (ProcessUserguideData *data,
GFileInfo *file_info,
GError **error)
{
- /* TODO: Insert SPARQL per user guide */
+ TrackerSparqlBuilder *sparql;
+ gchar *uri;
+ gchar *path;
+ gchar *filename;
+ gchar *content = NULL;
+ gchar *title = NULL;
+ const gchar *parent_urn;
+
+ sparql = data->sparql;
+ uri = g_file_get_uri (data->file);
+
+ g_message ("Processing '%s'", uri);
+
+ /* FIXME: We didn't use a graph before AFAICS. */
+ /* tracker_sparql_builder_insert_silent_open (sparql, TRACKER_MINER_FS_GRAPH_URN); */
+ tracker_sparql_builder_insert_open (sparql, NULL);
+
+ tracker_sparql_builder_subject (sparql, "_:file");
+ tracker_sparql_builder_predicate (sparql, "a");
+ tracker_sparql_builder_object (sparql, "nfo:FileDataObject");
+ tracker_sparql_builder_object (sparql, "nie:DataObject");
+ tracker_sparql_builder_object (sparql, "nfo:HelpDocument");
+
+ /* tracker_sparql_builder_object (sparql, "nfo:Document"); */
+
+ /* FIXME: Do we need these, they're all new:
+ * nie:dataSource, nfo:fileName, nie:url, nfo:fileLastModified, tracker:available
+ */
+
+ /* tracker_sparql_builder_predicate (sparql, "nie:dataSource"); */
+ /* tracker_sparql_builder_object_iri (sparql, APPLET_DATASOURCE_URN); */
+
+ tracker_sparql_builder_predicate (sparql, "tracker:available");
+ tracker_sparql_builder_object_boolean (sparql, TRUE);
+
+ path = g_file_get_path (data->file);
+ filename = g_filename_display_basename (path);
+ tracker_sparql_builder_predicate (sparql, "nfo:fileName");
+ tracker_sparql_builder_object_string (sparql, filename);
+ g_free (filename);
+ g_free (path);
+
+ tracker_sparql_builder_predicate (sparql, "nie:url");
+ tracker_sparql_builder_object_string (sparql, uri);
+
+ if (file_info) {
+ guint64 time;
+
+ time = g_file_info_get_attribute_uint64 (file_info, G_FILE_ATTRIBUTE_TIME_MODIFIED);
+ tracker_sparql_builder_predicate (sparql, "nfo:fileLastModified");
+ tracker_sparql_builder_object_date (sparql, (time_t *) &time);
+ }
+
+ parent_urn = tracker_miner_fs_get_parent_urn (TRACKER_MINER_FS (data->miner), data->file);
+
+ if (parent_urn) {
+ tracker_sparql_builder_predicate (sparql, "nfo:belongsToContainer");
+ tracker_sparql_builder_object_iri (sparql, parent_urn);
+ }
+
+ /* Get content */
+ parser_get_file_content (uri, MAX_EXTRACT_SIZE, &content, &title);
+
+ g_message (" Title: '%s'", title);
+ /* g_debug (" Content:\n\"\"\"\n%s\n\"\"\"\n", content); */
+
+ if (title && title[0]) {
+ tracker_sparql_builder_predicate (sparql, "nie:title");
+ tracker_sparql_builder_object_unvalidated (sparql, title);
+ }
+
+ if (content) {
+ tracker_sparql_builder_predicate (sparql, "nie:plainTextContent");
+ tracker_sparql_builder_object_unvalidated (sparql, content);
+ }
+
+ tracker_sparql_builder_insert_close (sparql);
+
+ g_free (content);
+ g_free (title);
+ g_free (uri);
}
static void
@@ -409,11 +511,148 @@ miner_userguides_process_file_attributes (TrackerMinerFS *fs,
return FALSE;
}
+static void
+parser_start_element (void *ctx,
+ const xmlChar *name,
+ const xmlChar **atts G_GNUC_UNUSED)
+{
+ const gchar *elem = (const gchar *) name;
+ ParserContext *pctx = ctx;
+
+ if (g_strcmp0 (elem, "title") == 0) {
+ pctx->in_title = TRUE;
+ return;
+ }
+
+ if (g_strcmp0 (elem, "body") == 0) {
+ pctx->in_text = TRUE;
+ return;
+ }
+}
+
+static void
+parser_end_element (void *ctx,
+ const xmlChar *name)
+{
+ const gchar *elem = (const gchar *) name;
+ ParserContext *pctx = ctx;
+
+ if (g_strcmp0 (elem, "title") == 0) {
+ pctx->in_title = FALSE;
+ }
+}
+
+static void
+parser_characters (void *ctx,
+ const xmlChar *ch,
+ int len G_GNUC_UNUSED)
+{
+ ParserContext *pctx = ctx;
+ gchar *str;
+ int len_to_append;
+
+ if (pctx->in_title) {
+ gchar *title = g_strdup ((const gchar *) ch);
+
+ if (title[0]) {
+ g_string_append_len (pctx->title,
+ title,
+ MIN(strlen (title), MAX_TITLE_LENGTH));
+ }
+
+ g_free(title);
+ }
+
+ if (!pctx->in_text) {
+ return;
+ }
+
+ if ((gssize) pctx->plain_text->len >= pctx->max_length) {
+ return;
+ }
+
+ str = g_strdup ((const gchar *) ch);
+
+ if (!str[0]) {
+ g_free (str);
+ return;
+ }
+
+ len_to_append = strlen (str);
+
+ if ((gssize) pctx->plain_text->len + len_to_append > pctx->max_length) {
+ len_to_append = pctx->max_length - pctx->plain_text->len;
+ }
+
+ g_string_append_len (pctx->plain_text, str, len_to_append);
+ g_free (str);
+}
+
+static void
+parser_error (void *ctx,
+ const char *msg,
+ ...)
+{
+ ParserContext *pctx = ctx;
+
+ g_critical ("Could not parse file '%s': %s", pctx->uri, msg);
+}
+
+static void
+parser_get_file_content (const gchar *uri,
+ gssize max_extract_size,
+ gchar **content,
+ gchar **title)
+{
+ GError *error = NULL;
+ gchar *filename;
+ ParserContext parser_ctx;
+ htmlSAXHandler sax_handler = { 0 };
+ htmlDocPtr doc;
+
+ /* TODO: utf8 sanitization */
+
+ filename = g_filename_from_uri (uri, NULL, &error);
+
+ if (error) {
+ g_message ("Could not open '%s': %s", uri, error->message);
+ g_error_free (error);
+ g_free (filename);
+ return;
+ }
+
+ parser_ctx.uri = g_strdup (uri);
+ parser_ctx.title = g_string_new (NULL);
+ parser_ctx.in_text = FALSE;
+ parser_ctx.in_title = FALSE;
+ parser_ctx.plain_text = g_string_new (NULL);
+ /* leave space for terminating 0 char */
+ parser_ctx.max_length = max_extract_size - 1;
+
+ sax_handler.startElement = parser_start_element;
+ sax_handler.endElement = parser_end_element;
+ sax_handler.characters = parser_characters;
+ sax_handler.error = parser_error;
+
+ doc = htmlSAXParseFile (filename, "utf-8", &sax_handler, &parser_ctx);
+ g_free (filename);
+
+ if (doc) {
+ xmlFreeDoc (doc);
+ }
+
+ g_free (parser_ctx.uri);
+
+ *title = g_string_free (parser_ctx.title, FALSE);
+ g_strstrip (*title);
+
+ *content = g_string_free (parser_ctx.plain_text, FALSE);
+ g_strstrip (*content);
+}
+
/* If a reset is requested, we will remove from the store all items previously
* inserted by the tracker-miner-userguides, this is:
- * (a) ... FIXME: What needs doing here?
- * (b) ... FIXME: What needs doing here?
- * (c) ... FIXME: What needs doing here?
+ * (a) Remove all resources which are an nfo:HelpDocument
*/
static void
miner_userguides_reset (TrackerMiner *miner)
@@ -423,7 +662,17 @@ miner_userguides_reset (TrackerMiner *miner)
sparql = tracker_sparql_builder_new_update ();
- /* FIXME: Add necessary SPARQL to clean up */
+ tracker_sparql_builder_delete_open (sparql, TRACKER_MINER_FS_GRAPH_URN);
+ tracker_sparql_builder_subject_variable (sparql, "userguide");
+ tracker_sparql_builder_predicate (sparql, "a");
+ tracker_sparql_builder_object (sparql, "rdfs:Resource");
+ tracker_sparql_builder_delete_close (sparql);
+
+ tracker_sparql_builder_where_open (sparql);
+ tracker_sparql_builder_subject_variable (sparql, "userguide");
+ tracker_sparql_builder_predicate (sparql, "a");
+ tracker_sparql_builder_object (sparql, "nfo:HelpDocument");
+ tracker_sparql_builder_where_close (sparql);
/* Execute a sync update, we don't want the userguides miner to start before
* we finish this. */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]