[tracker/crash-aware-extractor: 2/2] tracker-extract: Implement "failsafe" extraction



commit 0b87154f9550defe1c40680ed45fc88b5716f188
Author: Carlos Garnacho <carlosg gnome org>
Date:   Thu Jul 24 00:30:06 2014 +0200

    tracker-extract: Implement "failsafe" extraction
    
    The extractor now stores, in a per-user directory under /tmp, a symbolic
    link to each file currently being processed; the link name also encodes
    the number of extraction attempts made so far. If the extractor crashes,
    these links are used on the next respawn to resume operation.
    
    If extraction crashes on a given file too many times, the file is
    discarded and tagged with an "extractor-failure-data-source"
    nie:dataSource, in addition to the dataSource that makes tracker-extract
    treat the file as already indexed in future runs.

 src/tracker-extract/Makefile.am                   |    2 +
 src/tracker-extract/tracker-extract-decorator.c   |   89 +++++++-
 src/tracker-extract/tracker-extract-persistence.c |  274 +++++++++++++++++++++
 src/tracker-extract/tracker-extract-persistence.h |   64 +++++
 4 files changed, 428 insertions(+), 1 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 112a8e0..6007968 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -521,6 +521,8 @@ tracker_extract_SOURCES = \
        tracker-extract-controller.h \
        tracker-extract-decorator.c \
        tracker-extract-decorator.h \
+       tracker-extract-persistence.c \
+       tracker-extract-persistence.h \
        tracker-extract-priority-dbus.c \
        tracker-extract-priority-dbus.h \
        tracker-read.c \
diff --git a/src/tracker-extract/tracker-extract-decorator.c b/src/tracker-extract/tracker-extract-decorator.c
index 9c49d47..7d0f2d2 100644
--- a/src/tracker-extract/tracker-extract-decorator.c
+++ b/src/tracker-extract/tracker-extract-decorator.c
@@ -22,6 +22,7 @@
 #include <libtracker-extract/tracker-extract.h>
 #include <libtracker-common/tracker-ontologies.h>
 #include "tracker-extract-decorator.h"
+#include "tracker-extract-persistence.h"
 #include "tracker-extract-priority-dbus.h"
 
 enum {
@@ -29,6 +30,7 @@ enum {
 };
 
 #define TRACKER_EXTRACT_DATA_SOURCE TRACKER_TRACKER_PREFIX "extractor-data-source"
+#define TRACKER_EXTRACT_FAILURE_DATA_SOURCE TRACKER_TRACKER_PREFIX "extractor-failure-data-source"
 #define MAX_EXTRACTING_FILES 1
 
 #define TRACKER_EXTRACT_DECORATOR_GET_PRIVATE(o) (G_TYPE_INSTANCE_GET_PRIVATE ((o), TRACKER_TYPE_EXTRACT_DECORATOR, TrackerExtractDecoratorPrivate))
@@ -39,6 +41,7 @@ typedef struct _ExtractData ExtractData;
 struct _ExtractData {
        TrackerDecorator *decorator;
        TrackerDecoratorInfo *decorator_info;
+       GFile *file; /* Owned reference to the GFile registered in the persistence store for this extraction */
 };
 
 struct _TrackerExtractDecoratorPrivate {
@@ -46,6 +49,9 @@ struct _TrackerExtractDecoratorPrivate {
        GTimer *timer;
        guint n_extracting_files;
 
+       TrackerExtractPersistence *persistence;
+       GHashTable *recovery_files;
+
        /* DBus name -> AppData */
        GHashTable *apps;
        TrackerExtractDBusPriority *iface;
@@ -127,6 +133,7 @@ tracker_extract_decorator_finalize (GObject *object)
 
        g_object_unref (priv->iface);
        g_hash_table_unref (priv->apps);
+       g_hash_table_unref (priv->recovery_files);
 
        G_OBJECT_CLASS (tracker_extract_decorator_parent_class)->finalize (object);
 }
@@ -200,6 +207,9 @@ get_metadata_cb (TrackerExtract *extract,
        task = tracker_decorator_info_get_task (data->decorator_info);
        info = g_simple_async_result_get_op_res_gpointer (G_SIMPLE_ASYNC_RESULT (result));
 
+       tracker_extract_persistence_remove_file (priv->persistence, data->file);
+       g_hash_table_remove (priv->recovery_files, tracker_decorator_info_get_url (data->decorator_info));
+
        if (!info) {
                GError *error = NULL;
 
@@ -216,9 +226,30 @@ get_metadata_cb (TrackerExtract *extract,
        decorator_get_next_file (data->decorator);
 
        tracker_decorator_info_unref (data->decorator_info);
+       g_object_unref (data->file);
        g_free (data);
 }
 
+/*
+ * Returns the GFile to use for extracting @info, as a new reference
+ * owned by the caller. A file previously registered in the recovery
+ * table under the same URL is reused (so any data attached to that
+ * GFile instance is kept); otherwise a fresh GFile is built from the URL.
+ */
+static GFile *
+decorator_get_recovery_file (TrackerExtractDecorator *decorator,
+                             TrackerDecoratorInfo    *info)
+{
+       GHashTable *pending = decorator->priv->recovery_files;
+       GFile *recovered;
+
+       recovered = g_hash_table_lookup (pending,
+                                        tracker_decorator_info_get_url (info));
+
+       /* Nothing recorded for this URL: hand out a brand new GFile */
+       if (!recovered)
+               return g_file_new_for_uri (tracker_decorator_info_get_url (info));
+
+       return g_object_ref (recovered);
+}
+
+
 static void
 decorator_next_item_cb (TrackerDecorator *decorator,
                         GAsyncResult     *result,
@@ -256,10 +287,13 @@ decorator_next_item_cb (TrackerDecorator *decorator,
        data = g_new0 (ExtractData, 1);
        data->decorator = decorator;
        data->decorator_info = info;
+       data->file = decorator_get_recovery_file (TRACKER_EXTRACT_DECORATOR (decorator), info);
        task = tracker_decorator_info_get_task (info);
 
        g_message ("Extracting metadata for '%s'", tracker_decorator_info_get_url (info));
 
+       tracker_extract_persistence_add_file (priv->persistence, data->file);
+
        tracker_extract_file (priv->extractor,
                              tracker_decorator_info_get_url (info),
                              tracker_decorator_info_get_mimetype (info),
@@ -535,9 +569,62 @@ tracker_extract_decorator_class_init (TrackerExtractDecoratorClass *klass)
 }
 
 static void
+decorator_retry_file (GFile    *file,
+                      gpointer  user_data)
+{
+       /*
+        * Recovery callback for a file whose extraction should be retried:
+        * the file is remembered in the recovery table, keyed by its URI,
+        * and pushed to the front of the decorator queue.
+        *
+        * @file is only borrowed: the caller drops its reference once this
+        * callback returns.
+        */
+       TrackerExtractDecorator *decorator = user_data;
+       TrackerExtractDecoratorPrivate *priv = decorator->priv;
+       gchar *uri;
+
+       uri = g_file_get_uri (file);
+
+       /* The table frees its keys and unrefs its values, so it takes
+        * ownership of @uri and must be given its own reference on @file.
+        */
+       g_hash_table_insert (priv->recovery_files, uri, g_object_ref (file));
+       tracker_decorator_fs_prepend_file (TRACKER_DECORATOR_FS (decorator), file);
+}
+
+
+/*
+ * Recovery callback for a file that should no longer be retried: the
+ * resource whose nie:url matches the file URI is tagged with both the
+ * regular extractor data source and the failure data source. A failed
+ * update is only logged.
+ */
+static void
+decorator_ignore_file (GFile    *file,
+                       gpointer  user_data)
+{
+       TrackerExtractDecorator *decorator = user_data;
+       TrackerSparqlConnection *conn;
+       GError *error = NULL;
+       gchar *uri, *query;
+
+       uri = g_file_get_uri (file);
+       g_message ("Extraction on file '%s' has been attempted too many times, ignoring", uri);
+
+       conn = tracker_miner_get_connection (TRACKER_MINER (decorator));
+
+       /* Both the GRAPH block and the INSERT template must be closed
+        * before the WHERE clause starts.
+        */
+       query = g_strdup_printf ("INSERT { GRAPH <" TRACKER_MINER_FS_GRAPH_URN "> {"
+                                "  ?urn nie:dataSource <" TRACKER_EXTRACT_DATA_SOURCE ">;"
+                                "       nie:dataSource <" TRACKER_EXTRACT_FAILURE_DATA_SOURCE ">."
+                                "} } WHERE {"
+                                "  ?urn nie:url \"%s\""
+                                "}", uri);
+
+       tracker_sparql_connection_update (conn, query, G_PRIORITY_DEFAULT, NULL, &error);
+
+       if (error) {
+               g_warning ("Failed to update ignored file '%s': %s",
+                          uri, error->message);
+               g_error_free (error);
+       }
+
+       g_free (query);
+       g_free (uri);
+}
+
+
+static void
 tracker_extract_decorator_init (TrackerExtractDecorator *decorator)
 {
-       decorator->priv = TRACKER_EXTRACT_DECORATOR_GET_PRIVATE (decorator);
+       TrackerExtractDecoratorPrivate *priv;
+
+       decorator->priv = priv = TRACKER_EXTRACT_DECORATOR_GET_PRIVATE (decorator);
+
+       /* URI -> GFile. The table must exist before the persistence store
+        * is initialized: initialization synchronously reports leftover
+        * files through decorator_retry_file(), which inserts into it.
+        */
+       priv->recovery_files = g_hash_table_new_full (g_str_hash, g_str_equal,
+                                                     (GDestroyNotify) g_free,
+                                                     (GDestroyNotify) g_object_unref);
+       priv->persistence = tracker_extract_persistence_initialize (decorator_retry_file,
+                                                                   decorator_ignore_file,
+                                                                   decorator);
 }
 
 static gboolean
diff --git a/src/tracker-extract/tracker-extract-persistence.c b/src/tracker-extract/tracker-extract-persistence.c
new file mode 100644
index 0000000..38bdd35
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-persistence.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (C) 2014 Carlos Garnacho <carlosg gnome org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.         See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include "tracker-extract-persistence.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <glib/gstdio.h>
+
+#define MAX_RETRIES 3
+
+typedef struct _TrackerExtractPersistencePrivate TrackerExtractPersistencePrivate;
+
+struct _TrackerExtractPersistencePrivate
+{
+       GFile *tmp_dir; /* Directory under which the per-file symlinks are created and enumerated */
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE (TrackerExtractPersistence, tracker_extract_persistence, G_TYPE_OBJECT)
+
+static GQuark n_retries_quark = 0;
+
+static void
+tracker_extract_persistence_class_init (TrackerExtractPersistenceClass *klass)
+{
+       n_retries_quark = g_quark_from_static_string ("tracker-extract-n-retries-quark"); /* qdata key under which each GFile carries its retry count */
+}
+
+/*
+ * Instance initializer: makes sure the per-user symlink directory
+ * "<tmpdir>/tracker-extract-files.<uid>" exists and stores it in the
+ * private data.
+ *
+ * The directory name is predictable and the temporary directory is
+ * usually shared, so an already existing entry is only accepted if it
+ * is a real directory (not a symlink), owned by the current user and
+ * not accessible by group/others. Any failure is treated as fatal.
+ */
+static void
+tracker_extract_persistence_init (TrackerExtractPersistence *persistence)
+{
+       TrackerExtractPersistencePrivate *priv;
+       gchar *dirname, *tmp_path;
+       GStatBuf st;
+
+       priv = tracker_extract_persistence_get_instance_private (persistence);
+
+       /* uid_t is unsigned, format it through a matching type */
+       dirname = g_strdup_printf ("tracker-extract-files.%lu", (gulong) getuid ());
+       tmp_path = g_build_filename (g_get_tmp_dir (), dirname, NULL);
+       g_free (dirname);
+
+       /* g_mkdir_with_parents() also succeeds when the path already
+        * exists, so the ownership/mode checks are what actually back the
+        * "wrong permissions" part of the message below.
+        */
+       if (g_mkdir_with_parents (tmp_path, 0700) != 0 ||
+           g_lstat (tmp_path, &st) != 0 ||
+           !S_ISDIR (st.st_mode) ||
+           st.st_uid != getuid () ||
+           (st.st_mode & (S_IRWXG | S_IRWXO)) != 0) {
+               g_critical ("The directory %s could not be created, or has the wrong permissions",
+                           tmp_path);
+               g_assert_not_reached ();
+       }
+
+       priv->tmp_dir = g_file_new_for_path (tmp_path);
+       g_free (tmp_path);
+}
+
+
+/*
+ * Adds one to the retry counter kept as qdata on @file. A file without
+ * the qdata counts as zero retries, so it ends up with a counter of 1.
+ */
+static void
+increment_n_retries (GFile *file)
+{
+       GObject *object = G_OBJECT (file);
+       gpointer stored;
+       guint count;
+
+       stored = g_object_get_qdata (object, n_retries_quark);
+       count = GPOINTER_TO_UINT (stored) + 1;
+       g_object_set_qdata (object, n_retries_quark, GUINT_TO_POINTER (count));
+}
+
+
+/*
+ * Builds the GFile of the symlink that stands for @file inside the
+ * persistence directory. The link is named "<n_retries>-<md5 of path>",
+ * where n_retries is the counter kept as qdata on @file. Nothing is
+ * created on disk here.
+ *
+ * Returns: a new GFile reference, to be released by the caller.
+ */
+static GFile *
+persistence_create_symlink_file (TrackerExtractPersistence *persistence,
+                                 GFile                     *file)
+{
+       TrackerExtractPersistencePrivate *priv;
+       guint n_retries = GPOINTER_TO_UINT (g_object_get_qdata (G_OBJECT (file), n_retries_quark));
+       gchar *link_name, *path, *md5;
+       GFile *link_file;
+
+       priv = tracker_extract_persistence_get_instance_private (persistence);
+       path = g_file_get_path (file);
+       md5 = g_compute_checksum_for_string (G_CHECKSUM_MD5, path, -1);
+
+       /* The counter is unsigned, use the matching conversion */
+       link_name = g_strdup_printf ("%u-%s", n_retries, md5);
+       link_file = g_file_get_child (priv->tmp_dir, link_name);
+
+       g_free (link_name);
+       g_free (path);
+       g_free (md5);
+
+       return link_file;
+}
+
+
+/*
+ * Converts the GFileInfo of an entry in the persistence directory back
+ * into the GFile it points to. The entry must be a symlink with an
+ * absolute target, named "<n_retries>-<md5 of target path>"; the parsed
+ * retry counter is attached to the returned file as qdata.
+ *
+ * Returns: a new GFile reference, or NULL (after logging) if the entry
+ * is not a well-formed persistence symlink.
+ */
+static GFile *
+persistence_symlink_get_file (GFileInfo *info)
+{
+       const gchar *symlink_name, *symlink_target;
+       gchar *md5, **items;
+       GFile *file = NULL;
+       guint n_retries;
+
+       /* Fetch the name first, every message below refers to it */
+       symlink_name = g_file_info_get_name (info);
+       symlink_target = g_file_info_get_symlink_target (info);
+
+       if (!symlink_target) {
+               /* Not a symlink, or the target could not be read */
+               g_critical ("'%s' has no symlink target", symlink_name);
+               return NULL;
+       }
+
+       if (!g_path_is_absolute (symlink_target)) {
+               g_critical ("Symlink paths must be absolute, '%s' points to '%s'",
+                           symlink_name, symlink_target);
+               return NULL;
+       }
+
+       md5 = g_compute_checksum_for_string (G_CHECKSUM_MD5, symlink_target, -1);
+       items = g_strsplit (symlink_name, "-", 2);
+
+       /* items[1] is NULL when the name has no '-'; g_strcmp0() copes */
+       if (items[0] && g_strcmp0 (items[1], md5) == 0) {
+               /* The counter is a plain decimal integer */
+               n_retries = (guint) g_ascii_strtoull (items[0], NULL, 10);
+               file = g_file_new_for_path (symlink_target);
+               g_object_set_qdata (G_OBJECT (file), n_retries_quark,
+                                   GUINT_TO_POINTER (n_retries));
+       } else {
+               g_critical ("path MD5 for '%s' doesn't match with symlink '%s'",
+                           symlink_target, symlink_name);
+       }
+
+       g_strfreev (items);
+       g_free (md5);
+
+       return file;
+}
+
+
+/*
+ * Records @file in the persistence directory: its retry counter is
+ * incremented and a symlink named after the new counter is created,
+ * pointing to the file's path.
+ *
+ * Returns: TRUE if the symlink was created, FALSE (after logging a
+ * warning) otherwise.
+ */
+static gboolean
+persistence_store_file (TrackerExtractPersistence *persistence,
+                        GFile                     *file)
+{
+       GError *error = NULL;
+       gboolean success;
+       GFile *link_file;
+       gchar *path;
+
+       path = g_file_get_path (file);
+
+       if (!path) {
+               /* Without a local path there is nothing a symlink could
+                * point to, and the calls below would fail without
+                * filling in the GError.
+                */
+               gchar *uri = g_file_get_uri (file);
+
+               g_warning ("Could not save '%s' into failsafe persistence store: "
+                          "file has no local path", uri);
+               g_free (uri);
+               return FALSE;
+       }
+
+       /* The counter must be bumped first: it is part of the link name */
+       increment_n_retries (file);
+       link_file = persistence_create_symlink_file (persistence, file);
+
+       success = g_file_make_symbolic_link (link_file, path, NULL, &error);
+
+       if (!success) {
+               g_warning ("Could not save '%s' into failsafe persistence store: %s",
+                          path, error->message);
+               g_error_free (error);
+       }
+
+       g_object_unref (link_file);
+       g_free (path);
+
+       return success;
+}
+
+
+/*
+ * Deletes the symlink that matches @file and its current retry counter
+ * from the persistence directory.
+ *
+ * Returns: TRUE if the symlink was deleted, FALSE (after logging a
+ * warning) otherwise.
+ */
+static gboolean
+persistence_remove_file (TrackerExtractPersistence *persistence,
+                         GFile                     *file)
+{
+       GError *error = NULL;
+       GFile *link_file;
+       gboolean success;
+
+       link_file = persistence_create_symlink_file (persistence, file);
+       success = g_file_delete (link_file, NULL, &error);
+
+       if (!success) {
+               gchar *path = g_file_get_path (file);
+
+               g_warning ("Could not delete '%s' from failsafe persistence store: %s",
+                          path, error->message);
+               g_free (path);
+               /* The error is ours to release */
+               g_error_free (error);
+       }
+
+       g_object_unref (link_file);
+
+       return success;
+}
+
+
+/*
+ * Walks the persistence directory and reports every leftover symlink.
+ * Each well-formed symlink is deleted and its target handed to
+ * @ignore_func when its retry counter reached MAX_RETRIES, or to
+ * @retry_func otherwise. Malformed entries are logged and left in place.
+ *
+ * The GFile passed to the callbacks is unreferenced right after the
+ * callback returns, so callbacks must take their own reference to keep
+ * it. Nothing is reported if the directory cannot be enumerated.
+ */
+static void
+persistence_retrieve_files (TrackerExtractPersistence *persistence,
+                            TrackerFileRecoveryFunc    retry_func,
+                            TrackerFileRecoveryFunc    ignore_func,
+                            gpointer                   user_data)
+{
+       TrackerExtractPersistencePrivate *priv;
+       GFileEnumerator *enumerator;
+       GFileInfo *info;
+
+       priv = tracker_extract_persistence_get_instance_private (persistence);
+       enumerator = g_file_enumerate_children (priv->tmp_dir,
+                                               G_FILE_ATTRIBUTE_STANDARD_NAME ","
+                                               G_FILE_ATTRIBUTE_STANDARD_SYMLINK_TARGET,
+                                               G_FILE_QUERY_INFO_NOFOLLOW_SYMLINKS,
+                                               NULL, NULL);
+       if (!enumerator)
+               return;
+
+       while ((info = g_file_enumerator_next_file (enumerator, NULL, NULL)) != NULL) {
+               GFile *file, *symlink_file;
+               guint n_retries;
+
+               symlink_file = g_file_enumerator_get_child (enumerator, info);
+               file = persistence_symlink_get_file (info);
+
+               if (!file) {
+                       /* Name the offending entry as found in the directory */
+                       g_critical ("Symlink has bad format: %s\n",
+                                   g_file_info_get_name (info));
+                       g_object_unref (symlink_file);
+                       g_object_unref (info);
+                       continue;
+               }
+
+               /* Delete the symlink, it will get probably added back soon after,
+                * and n_retries incremented.
+                */
+               g_file_delete (symlink_file, NULL, NULL);
+               g_object_unref (symlink_file);
+
+               n_retries = GPOINTER_TO_UINT (g_object_get_qdata (G_OBJECT (file), n_retries_quark));
+
+               /* Trigger retry/ignore func for the symlink target */
+               if (n_retries >= MAX_RETRIES) {
+                       ignore_func (file, user_data);
+               } else {
+                       retry_func (file, user_data);
+               }
+
+               g_object_unref (file);
+               g_object_unref (info);
+       }
+
+       g_file_enumerator_close (enumerator, NULL, NULL);
+       g_object_unref (enumerator);
+}
+
+
+/*
+ * Returns the process-wide persistence object, creating it on first
+ * use. Only that first call scans the persistence directory and invokes
+ * @retry_func / @ignore_func (with @user_data) for the files found;
+ * later calls return the same object and ignore their arguments.
+ */
+TrackerExtractPersistence *
+tracker_extract_persistence_initialize (TrackerFileRecoveryFunc retry_func,
+                                        TrackerFileRecoveryFunc ignore_func,
+                                        gpointer                user_data)
+{
+       static TrackerExtractPersistence *singleton = NULL;
+
+       /* Already set up by an earlier call */
+       if (singleton)
+               return singleton;
+
+       singleton = g_object_new (TRACKER_TYPE_EXTRACT_PERSISTENCE, NULL);
+       persistence_retrieve_files (singleton, retry_func, ignore_func, user_data);
+
+       return singleton;
+}
+
+
+void
+tracker_extract_persistence_add_file (TrackerExtractPersistence *persistence,
+                                      GFile                     *file)
+{
+       g_return_if_fail (TRACKER_IS_EXTRACT_PERSISTENCE (persistence));
+       g_return_if_fail (G_IS_FILE (file));
+
+       persistence_store_file (persistence, file); /* Bumps the file's retry count and creates its symlink; the gboolean result is discarded */
+}
+
+void
+tracker_extract_persistence_remove_file (TrackerExtractPersistence *persistence,
+                                         GFile                     *file)
+{
+       g_return_if_fail (TRACKER_IS_EXTRACT_PERSISTENCE (persistence));
+       g_return_if_fail (G_IS_FILE (file));
+
+       persistence_remove_file (persistence, file); /* Deletes the symlink matching the file's current retry count; the gboolean result is discarded */
+}
diff --git a/src/tracker-extract/tracker-extract-persistence.h b/src/tracker-extract/tracker-extract-persistence.h
new file mode 100644
index 0000000..d5474af
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-persistence.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2014 Carlos Garnacho <carlosg gnome org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.         See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __TRACKER_EXTRACT_PERSISTENCE_H__
+#define __TRACKER_EXTRACT_PERSISTENCE_H__
+
+#include <gio/gio.h>
+
+G_BEGIN_DECLS
+
+#define TRACKER_TYPE_EXTRACT_PERSISTENCE         (tracker_extract_persistence_get_type ())
+#define TRACKER_EXTRACT_PERSISTENCE(o)           (G_TYPE_CHECK_INSTANCE_CAST ((o), TRACKER_TYPE_EXTRACT_PERSISTENCE, TrackerExtractPersistence))
+#define TRACKER_EXTRACT_PERSISTENCE_CLASS(c)     (G_TYPE_CHECK_CLASS_CAST ((c), TRACKER_TYPE_EXTRACT_PERSISTENCE, TrackerExtractPersistenceClass))
+#define TRACKER_IS_EXTRACT_PERSISTENCE(o)        (G_TYPE_CHECK_INSTANCE_TYPE ((o), TRACKER_TYPE_EXTRACT_PERSISTENCE))
+#define TRACKER_IS_EXTRACT_PERSISTENCE_CLASS(c)  (G_TYPE_CHECK_CLASS_TYPE ((c), TRACKER_TYPE_EXTRACT_PERSISTENCE))
+#define TRACKER_EXTRACT_PERSISTENCE_GET_CLASS(o) (G_TYPE_INSTANCE_GET_CLASS ((o), TRACKER_TYPE_EXTRACT_PERSISTENCE, TrackerExtractPersistenceClass))
+
+typedef struct _TrackerExtractPersistence TrackerExtractPersistence;
+typedef struct _TrackerExtractPersistenceClass TrackerExtractPersistenceClass;
+
+typedef void (* TrackerFileRecoveryFunc) (GFile    *file,       /* File recovered from a leftover symlink; the caller drops its reference after the callback returns */
+                                          gpointer  user_data); /* Pointer passed through from tracker_extract_persistence_initialize() */
+
+struct _TrackerExtractPersistence
+{
+       GObject parent_instance; /* Only member; no other public fields are declared */
+};
+
+struct _TrackerExtractPersistenceClass
+{
+       GObjectClass parent_class; /* Only member; the class declares no virtual methods of its own */
+};
+
+GType tracker_extract_persistence_get_type (void) G_GNUC_CONST;
+
+TrackerExtractPersistence * /* Shared instance; the callbacks are only invoked during the first call */
+     tracker_extract_persistence_initialize (TrackerFileRecoveryFunc     retry_func,  /* Called for leftover files still below the retry limit */
+                                             TrackerFileRecoveryFunc     ignore_func, /* Called for leftover files that reached the retry limit */
+                                             gpointer                    user_data);  /* Passed unchanged to both callbacks */
+
+void tracker_extract_persistence_add_file    (TrackerExtractPersistence *persistence,
+                                              GFile                     *file); /* Registers the file before extraction: retry count incremented, symlink created */
+void tracker_extract_persistence_remove_file (TrackerExtractPersistence *persistence,
+                                              GFile                     *file); /* Unregisters the file: its symlink is deleted */
+
+G_END_DECLS
+
+#endif /* __TRACKER_EXTRACT_PERSISTENCE_H__ */


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]