[tracker-miners/wip/carlosg/perf-squeeze: 16/16] libtracker-miner: Try to process as many files as possible on every batch



commit e42257074200cf4f6daafa7b44f8e34092be9a5a
Author: Carlos Garnacho <carlosg gnome org>
Date:   Sun Jul 5 15:32:37 2020 +0200

    libtracker-miner: Try to process as many files as possible on every batch
    
    We currently block the processing queue if the parent is seen in any stage
    of processing. The situation is unblocked by flushing early, so that
    processing can resume after the SPARQL updates have been performed.
    
    This may lead to suboptimal buffer occupation, ultimately dependent on
    the filesystem layout.
    
    To improve this situation, rely on blank node labels being stable across
    the whole SPARQL update string, and add a blank node labeling scheme that
    allows files within the same SPARQL batch to reference each other through
    these blank node labels instead of IRIs.
    
    This allows maximum buffer occupation regardless of the filesystem layout.
    We still have to wait after a SPARQL update if a file being processed
    references (i.e. through a child/parent relationship) another file added
    in the SPARQL update currently being performed, but that happens once per
    batch instead of once per folder.

 src/libtracker-miner/tracker-miner-fs.c      | 58 +++++++++++-------------
 src/libtracker-miner/tracker-miner-fs.h      |  3 +-
 src/libtracker-miner/tracker-sparql-buffer.c | 22 +++++++++
 src/libtracker-miner/tracker-sparql-buffer.h | 10 ++++
 src/miners/fs/tracker-miner-files.c          | 68 +++++++++++++++++++++-------
 5 files changed, 112 insertions(+), 49 deletions(-)
---
diff --git a/src/libtracker-miner/tracker-miner-fs.c b/src/libtracker-miner/tracker-miner-fs.c
index cd72ab0f3..ffe3ea835 100644
--- a/src/libtracker-miner/tracker-miner-fs.c
+++ b/src/libtracker-miner/tracker-miner-fs.c
@@ -1536,8 +1536,7 @@ should_wait (TrackerMinerFS *fs,
        GFile *parent;
 
        /* Is the item already being processed? */
-       if (tracker_task_pool_find (fs->priv->task_pool, file) ||
-           tracker_task_pool_find (TRACKER_TASK_POOL (fs->priv->sparql_buffer), file)) {
+       if (tracker_sparql_buffer_get_state (fs->priv->sparql_buffer, file) == TRACKER_BUFFER_STATE_FLUSHING) {
                /* Yes, a previous event on same item currently
                 * being processed */
                fs->priv->item_queue_blocker = g_object_ref (file);
@@ -1547,8 +1546,7 @@ should_wait (TrackerMinerFS *fs,
        /* Is the item's parent being processed right now? */
        parent = g_file_get_parent (file);
        if (parent) {
-               if (tracker_task_pool_find (fs->priv->task_pool, parent) ||
-                   tracker_task_pool_find (TRACKER_TASK_POOL (fs->priv->sparql_buffer), parent)) {
+               if (tracker_sparql_buffer_get_state (fs->priv->sparql_buffer, parent) == TRACKER_BUFFER_STATE_FLUSHING) {
                        /* Yes, a previous event on the parent of this item
                         * currently being processed */
                        fs->priv->item_queue_blocker = parent;
@@ -1826,34 +1824,7 @@ miner_handle_next_item (TrackerMinerFS *fs)
        case TRACKER_MINER_FS_EVENT_UPDATED:
                parent = g_file_get_parent (file);
 
-               if (!parent ||
-                   tracker_indexing_tree_file_is_root (fs->priv->indexing_tree, file) ||
-                   !tracker_indexing_tree_get_root (fs->priv->indexing_tree, file, NULL) ||
-                   tracker_file_notifier_get_file_iri (fs->priv->file_notifier, parent, FALSE)) {
-                       keep_processing = item_add_or_update (fs, file, priority, attributes_update);
-               } else {
-                       gchar *uri;
-
-                       /* We got an event on a file that has not its parent indexed
-                        * even though it should. Given item_queue_get_next_file()
-                        * above should return FALSE whenever the parent file is
-                        * being processed, this means the parent is neither
-                        * being processed nor indexed, no good.
-                        *
-                        * Bail out in these cases by removing all queued files
-                        * inside the missing file. Whatever it was, it shall
-                        * hopefully be fixed on next index.
-                        */
-                       uri = g_file_get_uri (parent);
-                       g_warning ("Parent '%s' not indexed yet", uri);
-                       g_free (uri);
-
-                       tracker_priority_queue_foreach_remove (fs->priv->items,
-                                                              (GEqualFunc) queue_event_is_equal_or_descendant,
-                                                              parent,
-                                                              (GDestroyNotify) queue_event_free);
-                       keep_processing = TRUE;
-               }
+               keep_processing = item_add_or_update (fs, file, priority, attributes_update);
 
                if (parent) {
                        g_object_unref (parent);
@@ -2548,3 +2519,26 @@ tracker_miner_fs_get_data_provider (TrackerMinerFS *fs)
 
        return fs->priv->data_provider;
 }
+
+gchar *
+tracker_miner_fs_get_file_bnode (TrackerMinerFS *fs,
+                                 GFile          *file)
+{
+       g_return_val_if_fail (TRACKER_IS_MINER_FS (fs), NULL);
+       g_return_val_if_fail (G_IS_FILE (file), NULL);
+       /* A label is only produced while @file is in the task pool or queued in the SPARQL buffer */
+       if (tracker_task_pool_find (fs->priv->task_pool, file) ||
+           tracker_sparql_buffer_get_state (fs->priv->sparql_buffer, file) == TRACKER_BUFFER_STATE_QUEUED) {
+               gchar *uri, *bnode, *checksum;
+               /* The label is "_:" followed by the MD5 checksum string of the file URI */
+               uri = g_file_get_uri (file);
+               checksum = g_compute_checksum_for_string (G_CHECKSUM_MD5, uri, -1);
+               bnode = g_strdup_printf ("_:%s", checksum);
+               g_free (checksum);
+               g_free (uri);
+               /* Newly allocated string; the callers visible in this patch g_free() it */
+               return bnode;
+       }
+       /* Neither condition holds: no blank node label is returned for this file */
+       return NULL;
+}
diff --git a/src/libtracker-miner/tracker-miner-fs.h b/src/libtracker-miner/tracker-miner-fs.h
index 0ff583b34..b53a00c17 100644
--- a/src/libtracker-miner/tracker-miner-fs.h
+++ b/src/libtracker-miner/tracker-miner-fs.h
@@ -161,7 +161,8 @@ void                  tracker_miner_fs_notify_finish         (TrackerMinerFS  *f
 /* URNs */
 const gchar          *tracker_miner_fs_get_folder_urn        (TrackerMinerFS  *fs,
                                                               GFile           *file);
-
+gchar *               tracker_miner_fs_get_file_bnode        (TrackerMinerFS *fs,
+                                                              GFile          *file);
 
 /* Progress */
 gboolean              tracker_miner_fs_has_items_to_process  (TrackerMinerFS  *fs);
diff --git a/src/libtracker-miner/tracker-sparql-buffer.c b/src/libtracker-miner/tracker-sparql-buffer.c
index 51f7fda5c..ba89c8019 100644
--- a/src/libtracker-miner/tracker-sparql-buffer.c
+++ b/src/libtracker-miner/tracker-sparql-buffer.c
@@ -516,3 +516,25 @@ tracker_sparql_buffer_push_finish (TrackerSparqlBuffer  *buffer,
 
        return task;
 }
+
+TrackerSparqlBufferState
+tracker_sparql_buffer_get_state (TrackerSparqlBuffer *buffer,
+                                 GFile               *file)
+{
+       TrackerSparqlBufferPrivate *priv;
+       TrackerTask *task;
+       /* Reports the state of the task for @file in @buffer; UNKNOWN on invalid arguments */
+       g_return_val_if_fail (TRACKER_IS_SPARQL_BUFFER (buffer), TRACKER_BUFFER_STATE_UNKNOWN);
+       g_return_val_if_fail (G_IS_FILE (file), TRACKER_BUFFER_STATE_UNKNOWN);
+
+       priv = tracker_sparql_buffer_get_instance_private (TRACKER_SPARQL_BUFFER (buffer));
+       /* No task for @file in this task pool: the state is UNKNOWN */
+       task = tracker_task_pool_find (TRACKER_TASK_POOL (buffer), file);
+       if (!task)
+               return TRACKER_BUFFER_STATE_UNKNOWN;
+       /* Task present in priv->tasks: QUEUED (presumably the batch not yet flushed - confirm) */
+       if (priv->tasks && g_ptr_array_find (priv->tasks, task, NULL))
+               return TRACKER_BUFFER_STATE_QUEUED;
+       /* Task is in the pool but not in priv->tasks: reported as FLUSHING */
+       return TRACKER_BUFFER_STATE_FLUSHING;
+}
diff --git a/src/libtracker-miner/tracker-sparql-buffer.h b/src/libtracker-miner/tracker-sparql-buffer.h
index bc3b11035..87b2b0144 100644
--- a/src/libtracker-miner/tracker-sparql-buffer.h
+++ b/src/libtracker-miner/tracker-sparql-buffer.h
@@ -42,6 +42,13 @@ G_BEGIN_DECLS
 typedef struct _TrackerSparqlBuffer TrackerSparqlBuffer;
 typedef struct _TrackerSparqlBufferClass TrackerSparqlBufferClass;
 
+typedef enum
+{
+       TRACKER_BUFFER_STATE_UNKNOWN, /* No task for the file was found in the buffer */
+       TRACKER_BUFFER_STATE_QUEUED, /* Task found, and present in the buffer's "tasks" array */
+       TRACKER_BUFFER_STATE_FLUSHING, /* Task found, but not in the buffer's "tasks" array */
+} TrackerSparqlBufferState;
+
 struct _TrackerSparqlBuffer
 {
        TrackerTaskPool parent_instance;
@@ -70,6 +77,9 @@ TrackerTask *        tracker_sparql_buffer_push_finish (TrackerSparqlBuffer  *bu
                                                         GAsyncResult         *res,
                                                         GError              **error);
 
+TrackerSparqlBufferState tracker_sparql_buffer_get_state (TrackerSparqlBuffer *buffer,
+                                                          GFile               *file);
+
 TrackerTask *        tracker_sparql_task_new_take_sparql_str (GFile                *file,
                                                               gchar                *sparql_str);
 TrackerTask *        tracker_sparql_task_new_with_sparql_str (GFile                *file,
diff --git a/src/miners/fs/tracker-miner-files.c b/src/miners/fs/tracker-miner-files.c
index 06816423c..be4d11774 100644
--- a/src/miners/fs/tracker-miner-files.c
+++ b/src/miners/fs/tracker-miner-files.c
@@ -1937,6 +1937,26 @@ index_applications_changed_cb (GObject    *gobject,
        }
 }
 
+static gchar *
+folder_urn_or_bnode (TrackerMinerFiles *mf,
+                     GFile             *file,
+                     gboolean          *is_iri)
+{
+       const gchar *urn;
+       /* Returns a newly allocated folder URN or blank node label for @file; @is_iri is optional */
+       if (is_iri)
+               *is_iri = FALSE;
+       /* Prefer the folder URN; only in that case is *is_iri set to TRUE */
+       urn = tracker_miner_fs_get_folder_urn (TRACKER_MINER_FS (mf), file);
+       if (urn) {
+               if (is_iri)
+                       *is_iri = TRUE;
+               return g_strdup (urn);
+       }
+       /* No URN: fall back to the blank node label (may be NULL), *is_iri stays FALSE */
+       return tracker_miner_fs_get_file_bnode (TRACKER_MINER_FS (mf), file);
+}
+
 static void
 miner_files_add_to_datasource (TrackerMinerFiles *mf,
                                GFile             *file,
@@ -1950,16 +1970,18 @@ miner_files_add_to_datasource (TrackerMinerFiles *mf,
        if (tracker_indexing_tree_file_is_root (indexing_tree, file)) {
                tracker_resource_set_relation (resource, "nie:dataSource", element_resource);
        } else {
-               const gchar *root_urn = NULL;
+               gchar *identifier = NULL;
                GFile *root;
 
                root = tracker_indexing_tree_get_root (indexing_tree, file, NULL);
 
                if (root)
-                       root_urn = tracker_miner_fs_get_folder_urn (TRACKER_MINER_FS (mf), root);
+                       identifier = folder_urn_or_bnode (mf, root, NULL);
 
-               if (root_urn)
-                       tracker_resource_set_uri (resource, "nie:dataSource", root_urn);
+               if (identifier)
+                       tracker_resource_set_uri (resource, "nie:dataSource", identifier);
+
+               g_free (identifier);
        }
 }
 
@@ -1991,13 +2013,13 @@ miner_files_create_folder_information_element (TrackerMinerFiles *miner,
                                               gboolean           is_directory)
 {
        TrackerResource *resource, *file_resource;
-       const gchar *urn = NULL;
-       gchar *uri;
+       gchar *urn, *uri;
 
        /* Preserve URN for nfo:Folders */
-       urn = tracker_miner_fs_get_folder_urn (TRACKER_MINER_FS (miner), file);
-
+       urn = folder_urn_or_bnode (miner, file, NULL);
        resource = tracker_resource_new (urn);
+       g_free (urn);
+
        tracker_resource_set_string (resource, "nie:mimeType", mime_type);
        tracker_resource_add_uri (resource, "rdf:type", "nie:InformationElement");
 
@@ -2094,7 +2116,7 @@ process_file_cb (GObject      *object,
        TrackerResource *resource, *folder_resource = NULL;
        ProcessFileData *data;
        const gchar *mime_type, *graph;
-       const gchar *parent_urn;
+       gchar *parent_urn;
        gchar *delete_properties_sparql = NULL, *mount_point_sparql;
        GFileInfo *file_info;
        guint64 time_;
@@ -2164,11 +2186,13 @@ process_file_cb (GObject      *object,
        tracker_resource_add_uri (resource, "rdf:type", "nfo:FileDataObject");
 
        parent = g_file_get_parent (file);
-       parent_urn = tracker_miner_fs_get_folder_urn (TRACKER_MINER_FS (data->miner), parent);
+       parent_urn = folder_urn_or_bnode (data->miner, parent, NULL);
        g_object_unref (parent);
 
-       if (parent_urn)
+       if (parent_urn) {
                tracker_resource_set_uri (resource, "nfo:belongsToContainer", parent_urn);
+               g_free (parent_urn);
+       }
 
        tracker_resource_set_string (resource, "nfo:fileName",
                                     g_file_info_get_display_name (file_info));
@@ -2443,7 +2467,6 @@ miner_files_move_file (TrackerMinerFS *fs,
                        gboolean        recursive)
 {
        GString *sparql = g_string_new (NULL);
-       const gchar *new_parent_iri = NULL;
        gchar *uri, *source_uri, *display_name, *container_clause = NULL;
        gchar *path, *basename;
        GFile *new_parent;
@@ -2459,10 +2482,23 @@ miner_files_move_file (TrackerMinerFS *fs,
 
        /* Get new parent information */
        new_parent = g_file_get_parent (file);
-       if (new_parent)
-               new_parent_iri = tracker_miner_fs_get_folder_urn (fs, new_parent);
-       if (new_parent_iri)
-               container_clause = g_strdup_printf ("; nfo:belongsToContainer <%s>", new_parent_iri);
+       if (new_parent) {
+               gchar *new_parent_id;
+               gboolean is_iri;
+
+               new_parent_id = folder_urn_or_bnode (TRACKER_MINER_FILES (fs),
+                                                    new_parent, &is_iri);
+
+               if (new_parent_id) {
+                       container_clause =
+                               g_strdup_printf ("; nfo:belongsToContainer %s%s%s",
+                                                is_iri ? "<" : "",
+                                                new_parent_id,
+                                                is_iri ? ">" : "");
+               }
+
+               g_free (new_parent_id);
+       }
 
        g_string_append_printf (sparql,
                                "DELETE { "


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]