[gegl] operation, gegl-parallel: calculate per-operation thread cost dynamically

From: Ell <ell src gnome org>
To: commits-list gnome org
Cc:
Subject: [gegl] operation, gegl-parallel: calculate per-operation thread cost dynamically
Date: Sat, 20 Jul 2019 21:00:29 +0000 (UTC)
commit 021ac2409db2d671cf3f0615989c70da5f41800f
Author: Ell <ell_se yahoo com>
Date:   Sat Jul 20 23:15:03 2019 +0300

    operation, gegl-parallel: calculate per-operation thread cost dynamically
    
    Currently, the realtive cost of using additional threads for a
    given operation, as per gegl_operation_get_pixels_per_thread(), is
    fixed at 64 * 64 pixels per thread.  This prevents expensive
    operations, most notably certain area filters, from using all
    available threads when processed in small chunks, as done by GIMP
    to keep chunk-processing rate interactive.
    
    This commit dynamically adjusts the thread cost on a per-operation-
    instance basis, by estimating the extra processing time incurred by
    additional threads, and by measuring the pixel processing rate of
    the operation (the thread cost is then the ratio between these two
    values.)  This is a very rough estimate, but it's better than a
    hard-coded constant, and it does allow GIMP to make better use of
    multithreading for expensive operations.

 gegl/gegl-parallel-private.h    |   9 ++-
 gegl/gegl-parallel.c            | 134 +++++++++++++++++++++++++++++++---------
 gegl/operation/gegl-operation.c | 105 +++++++++++++++++++++++++++----
 gegl/operation/gegl-operation.h |   3 +-
 4 files changed, 207 insertions(+), 44 deletions(-)
---
diff --git a/gegl/gegl-parallel-private.h b/gegl/gegl-parallel-private.h
index 3c13ec85d..c7840e9f8 100644
--- a/gegl/gegl-parallel-private.h
+++ b/gegl/gegl-parallel-private.h
@@ -23,8 +23,13 @@
 G_BEGIN_DECLS
 
 
-void   gegl_parallel_init    (void);
-void   gegl_parallel_cleanup (void);
+void      gegl_parallel_init                             (void);
+void      gegl_parallel_cleanup                          (void);
+
+gdouble   gegl_parallel_distribute_get_thread_time       (void);
+
+gint      gegl_parallel_distribute_get_optimal_n_threads (gdouble n_elements,
+                                                          gdouble thread_cost);
 
 
 G_END_DECLS
diff --git a/gegl/gegl-parallel.c b/gegl/gegl-parallel.c
index 2bb82caff..c699633ff 100644
--- a/gegl/gegl-parallel.c
+++ b/gegl/gegl-parallel.c
@@ -19,6 +19,7 @@
 #include "config.h"
 
 #include <math.h>
+#include <stdlib.h>
 
 #include <glib.h>
 
@@ -28,7 +29,8 @@
 #include "gegl-parallel-private.h"
 
 
-#define GEGL_PARALLEL_DISTRIBUTE_MAX_THREADS GEGL_MAX_THREADS
+#define GEGL_PARALLEL_DISTRIBUTE_MAX_THREADS           GEGL_MAX_THREADS
+#define GEGL_PARALLEL_DISTRIBUTE_THREAD_TIME_N_SAMPLES 10
 
 
 typedef struct
@@ -60,9 +62,7 @@ static void          gegl_parallel_set_n_threads                    (gint
 
 static void          gegl_parallel_distribute_set_n_threads         (gint                          
n_threads);
 static gpointer      gegl_parallel_distribute_thread_func           (GeglParallelDistributeThread *thread);
-
-static inline gint   gegl_parallel_distribute_get_optimal_n_threads (gdouble                       
n_elements,
-                                                                     gdouble                       
thread_cost);
+static void          gegl_parallel_distribute_update_thread_time    (void);
 
 
 /*  local variables  */
@@ -75,6 +75,8 @@ static GCond                        gegl_parallel_distribute_completion_cond;
 static volatile gint                gegl_parallel_distribute_completion_counter;
 static volatile gint                gegl_parallel_distribute_busy;
 
+static gdouble                      gegl_parallel_distribute_thread_time;
+
 
 /*  public functions  */
 
@@ -100,6 +102,46 @@ gegl_parallel_cleanup (void)
   gegl_parallel_set_n_threads (0, /* finish_tasks = */ FALSE);
 }
 
+gdouble
+gegl_parallel_distribute_get_thread_time (void)
+{
+  return gegl_parallel_distribute_thread_time;
+}
+
+/* calculates the optimal number of threads, n_threads, to process n_elements
+ * elements, assuming the cost of processing the elements is proportional to
+ * the number of elements to be processed by each thread, and assuming that
+ * each thread additionally incurs a fixed cost of thread_cost, relative to the
+ * cost of processing a single element.
+ *
+ * in other words, the assumption is that the total cost of processing the
+ * elements is proportional to:
+ *
+ *   n_elements / n_threads + thread_cost * n_threads
+ */
+inline gint
+gegl_parallel_distribute_get_optimal_n_threads (gdouble n_elements,
+                                                gdouble thread_cost)
+{
+  gint n_threads;
+
+  if (n_elements > 0 && thread_cost > 0.0)
+    {
+      gdouble n = n_elements;
+      gdouble c = thread_cost;
+
+      n_threads = floor ((c + sqrt (c * (c + 4.0 * n))) / (2.0 * c));
+      n_threads = CLAMP (n_threads, 1, gegl_parallel_distribute_n_threads);
+    }
+  else
+    {
+      n_threads = n_elements;
+      n_threads = CLAMP (n_threads, 0, gegl_parallel_distribute_n_threads);
+    }
+
+  return n_threads;
+}
+
 void
 gegl_parallel_distribute (gint                       max_n,
                           GeglParallelDistributeFunc func,
@@ -393,6 +435,8 @@ gegl_parallel_distribute_set_n_threads (gint n_threads)
   gegl_parallel_distribute_n_threads = n_threads;
 
   g_atomic_int_set (&gegl_parallel_distribute_busy, 0);
+
+  gegl_parallel_distribute_update_thread_time ();
 }
 
 static gpointer
@@ -432,36 +476,70 @@ gegl_parallel_distribute_thread_func (GeglParallelDistributeThread *thread)
   return NULL;
 }
 
-/* calculates the optimal number of threads, n_threads, to process n_elements
- * elements, assuming the cost of processing the elements is proportional to
- * the number of elements to be processed by each thread, and assuming that
- * each thread additionally incurs a fixed cost of thread_cost, relative to the
- * cost of processing a single element.
- *
- * in other words, the assumption is that the total cost of processing the
- * elements is proportional to:
- *
- *   n_elements / n_threads + thread_cost * n_threads
- */
-static inline gint
-gegl_parallel_distribute_get_optimal_n_threads (gdouble n_elements,
-                                                gdouble thread_cost)
+static void
+gegl_parallel_distribute_update_thread_time_func (gint  i,
+                                                  gint  n,
+                                                  gint *n_threads)
 {
-  gint n_threads;
+  if (i == 0)
+    *n_threads = n;
+}
 
-  if (n_elements > 0 && thread_cost > 0.0)
+static gint
+gegl_parallel_distribute_update_thread_time_compare (gconstpointer x,
+                                                     gconstpointer y)
+{
+  return (*(const gdouble *) x > *(const gdouble *) y) -
+         (*(const gdouble *) x < *(const gdouble *) y);
+}
+
+static void
+gegl_parallel_distribute_update_thread_time (void)
+{
+  gint64 samples[GEGL_PARALLEL_DISTRIBUTE_THREAD_TIME_N_SAMPLES];
+  gint   i;
+
+  if (gegl_parallel_distribute_n_threads <= 1)
     {
-      gdouble n = n_elements;
-      gdouble c = thread_cost;
+      gegl_parallel_distribute_thread_time = 0.0;
 
-      n_threads = floor ((c + sqrt (c * (c + 4.0 * n))) / (2.0 * c));
-      n_threads = CLAMP (n_threads, 1, gegl_parallel_distribute_n_threads);
+      return;
     }
-  else
+
+  for (i = 0; i < GEGL_PARALLEL_DISTRIBUTE_THREAD_TIME_N_SAMPLES; i++)
     {
-      n_threads = n_elements;
-      n_threads = CLAMP (n_threads, 0, gegl_parallel_distribute_n_threads);
+      gint   n_threads = 0;
+      gint64 t         = 0;
+
+      while (n_threads != gegl_parallel_distribute_n_threads)
+        {
+          /* to estimate the extra processing time incurred by additional
+           * threads, we simply distribute a NOP function across all threads,
+           * and measure how long it takes.  this measures the impact of
+           * synchronizing work distribution itself, but leaves out the effects
+           * of contention when performing actual work, making this a lower
+           * bound.
+           */
+          t = g_get_monotonic_time ();
+
+          gegl_parallel_distribute (
+            -1,
+            (GeglParallelDistributeFunc)
+              gegl_parallel_distribute_update_thread_time_func,
+            &n_threads);
+
+          t = g_get_monotonic_time () - t;
+        }
+
+      samples[i] = t;
     }
 
-  return n_threads;
+  qsort (samples,
+         GEGL_PARALLEL_DISTRIBUTE_THREAD_TIME_N_SAMPLES, sizeof (gint64),
+         gegl_parallel_distribute_update_thread_time_compare);
+
+  gegl_parallel_distribute_thread_time =
+    (gdouble) samples[GEGL_PARALLEL_DISTRIBUTE_THREAD_TIME_N_SAMPLES / 2] /
+    G_TIME_SPAN_SECOND                                                    /
+    (gegl_parallel_distribute_n_threads - 1);
 }
diff --git a/gegl/operation/gegl-operation.c b/gegl/operation/gegl-operation.c
index 90f68689c..d12858b2e 100644
--- a/gegl/operation/gegl-operation.c
+++ b/gegl/operation/gegl-operation.c
@@ -25,6 +25,7 @@
 #include "gegl.h"
 #include "gegl-config.h"
 #include "gegl-types-internal.h"
+#include "gegl-parallel-private.h"
 #include "gegl-operation.h"
 #include "gegl-operation-private.h"
 #include "gegl-operation-context.h"
@@ -34,18 +35,34 @@
 #include "graph/gegl-pad.h"
 #include "gegl-operations.h"
 
-static void         attach                    (GeglOperation       *self);
 
-static GeglRectangle get_bounding_box          (GeglOperation       *self);
-static GeglRectangle get_invalidated_by_change (GeglOperation       *self,
-                                                const gchar         *input_pad,
-                                                const GeglRectangle *input_region);
-static GeglRectangle get_required_for_output   (GeglOperation       *self,
-                                                const gchar         *input_pad,
-                                                const GeglRectangle *region);
+#define GEGL_OPERATION_MIN_PIXELS_PER_PIXEL_TIME_UPDATE ( 32 *  32)
+#define GEGL_OPERATION_DEFAULT_PIXELS_PER_THREAD        ( 64 *  64)
+#define GEGL_OPERATION_MAX_PIXELS_PER_THREAD            (128 * 128)
 
 
-G_DEFINE_TYPE (GeglOperation, gegl_operation, G_TYPE_OBJECT)
+struct _GeglOperationPrivate
+{
+  gdouble pixel_time;
+};
+
+
+static void            attach                           (GeglOperation       *self);
+
+static GeglRectangle   get_bounding_box                 (GeglOperation       *self);
+static GeglRectangle   get_invalidated_by_change        (GeglOperation       *self,
+                                                         const gchar         *input_pad,
+                                                         const GeglRectangle *input_region);
+static GeglRectangle   get_required_for_output          (GeglOperation       *self,
+                                                         const gchar         *input_pad,
+                                                         const GeglRectangle *region);
+
+static void            gegl_operation_update_pixel_time (GeglOperation       *self,
+                                                         const GeglRectangle *roi,
+                                                         gdouble              t);
+
+
+G_DEFINE_TYPE_WITH_PRIVATE (GeglOperation, gegl_operation, G_TYPE_OBJECT)
 
 
 static void
@@ -72,6 +89,9 @@ gegl_operation_class_init (GeglOperationClass *klass)
 static void
 gegl_operation_init (GeglOperation *self)
 {
+  GeglOperationPrivate *priv = gegl_operation_get_instance_private (self);
+
+  priv->pixel_time = -1.0;
 }
 
 /**
@@ -111,9 +131,14 @@ gegl_operation_process (GeglOperation        *operation,
                         const GeglRectangle  *result,
                         gint                  level)
 {
-  GeglOperationClass  *klass;
+  GeglOperationClass *klass;
+  gint64              t;
+  gint64              n_pixels;
+  gboolean            update_pixel_time;
+  gboolean            success;
 
   g_return_val_if_fail (GEGL_IS_OPERATION (operation), FALSE);
+  g_return_val_if_fail (result != NULL, FALSE);
 
   klass = GEGL_OPERATION_GET_CLASS (operation);
 
@@ -136,9 +161,28 @@ gegl_operation_process (GeglOperation        *operation,
 
   g_return_val_if_fail (klass->process, FALSE);
 
-  return klass->process (operation, context, output_pad, result, level);
+  n_pixels = (gint64) result->width * (gint64) result->height;
+
+  update_pixel_time = n_pixels >=
+                      GEGL_OPERATION_MIN_PIXELS_PER_PIXEL_TIME_UPDATE;
+
+  if (update_pixel_time)
+    t = g_get_monotonic_time ();
+
+  success = klass->process (operation, context, output_pad, result, level);
+
+  if (success && update_pixel_time)
+    {
+      t = g_get_monotonic_time () - t;
+
+      gegl_operation_update_pixel_time (operation, result,
+                                        (gdouble) t / G_TIME_SPAN_SECOND);
+    }
+
+  return success;
 }
 
+
 /* Calls an extending class' get_bound_box method if defined otherwise
  * just returns a zero-initialised bounding box
  */
@@ -827,8 +871,43 @@ gegl_operation_use_threading (GeglOperation *operation,
 gdouble
 gegl_operation_get_pixels_per_thread (GeglOperation *operation)
 {
-  /* FIXME: too arbitrary? */
-  return 64 * 64;
+  GeglOperationPrivate *priv = gegl_operation_get_instance_private (operation);
+
+  if (priv->pixel_time < 0.0)
+    return GEGL_OPERATION_DEFAULT_PIXELS_PER_THREAD;
+  else if (priv->pixel_time == 0.0)
+    return GEGL_OPERATION_MAX_PIXELS_PER_THREAD;
+
+  return MIN (gegl_parallel_distribute_get_thread_time () / priv->pixel_time,
+              GEGL_OPERATION_MAX_PIXELS_PER_THREAD);
+}
+
+static void
+gegl_operation_update_pixel_time (GeglOperation       *self,
+                                  const GeglRectangle *roi,
+                                  gdouble              t)
+{
+  GeglOperationPrivate *priv      = gegl_operation_get_instance_private (self);
+  gdouble               n_pixels;
+  gint                  n_threads = 1;
+
+  n_pixels = (gdouble) roi->width * (gdouble) roi->height;
+
+  if (gegl_operation_use_threading (self, roi))
+    {
+      /* we're assuming the entire processing cost was distributed over the
+       * optimal number of threads, as per the op's thread cost, which might
+       * not always be the case, but should generally be about right.
+       */
+      n_threads = gegl_parallel_distribute_get_optimal_n_threads (
+        n_pixels,
+        gegl_operation_get_pixels_per_thread (self));
+    }
+
+  priv->pixel_time = (t - (n_threads - 1)                              *
+                          gegl_parallel_distribute_get_thread_time ()) *
+                     n_threads / n_pixels;
+  priv->pixel_time = MAX (priv->pixel_time, 0.0);
 }
 
 static guchar *gegl_temp_alloc[GEGL_MAX_THREADS * 4]={NULL,};
diff --git a/gegl/operation/gegl-operation.h b/gegl/operation/gegl-operation.h
index fa100034c..12d3bc62b 100644
--- a/gegl/operation/gegl-operation.h
+++ b/gegl/operation/gegl-operation.h
@@ -34,7 +34,8 @@ G_BEGIN_DECLS
 #define GEGL_OPERATION_GET_CLASS(obj)  (G_TYPE_INSTANCE_GET_CLASS ((obj),  GEGL_TYPE_OPERATION, 
GeglOperationClass))
 /* The rest is in gegl-types.h */
 
-typedef struct _GeglOperationClass GeglOperationClass;
+typedef struct _GeglOperationClass   GeglOperationClass;
+typedef struct _GeglOperationPrivate GeglOperationPrivate;
 
 struct _GeglOperation
 {
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]