[gimp] Bug 699436 - optimize the heal tool

From: Michael Natterer <mitch src gnome org>
To: commits-list gnome org
Cc:
Subject: [gimp] Bug 699436 - optimize the heal tool
Date: Fri, 10 May 2013 20:00:04 +0000 (UTC)
commit 8602fdbecb13fd61b3c2a933a30b44b79377870f
Author: Loren Merritt <pengvado akuvian org>
Date:   Thu Apr 25 16:46:49 2013 +0000

    Bug 699436 - optimize the heal tool
    
    Adjust over-relaxation factor as a function of problem size.  Remove
    the second array, and update in-place.  Factor branches and indexing
    out of the inner loop, instead precompute a list of pixels inside the
    brush mask and what neighbors they have.  Switch from scalar double to
    simd float.
    
    Speedup (of the laplace part, excluding gamma correction): 10x-20x,
    depending on brush size.

 app/paint/gimpheal.c |  321 +++++++++++++++++++++++++-------------------------
 1 files changed, 162 insertions(+), 159 deletions(-)
---
diff --git a/app/paint/gimpheal.c b/app/paint/gimpheal.c
index 82e501e..28acdc1 100644
--- a/app/paint/gimpheal.c
+++ b/app/paint/gimpheal.c
@@ -1,6 +1,10 @@
 /* GIMP - The GNU Image Manipulation Program
  * Copyright (C) 1995 Spencer Kimball and Peter Mattis
  *
+ * gimpheal.c
+ * Copyright (C) Jean-Yves Couleaud <cjyves free fr>
+ * Copyright (C) 2013 Loren Merritt
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 3 of the License, or
@@ -17,6 +21,7 @@
 
 #include "config.h"
 
+#include <stdint.h>
 #include <string.h>
 
 #include <gegl.h>
@@ -43,15 +48,14 @@
 
 /* NOTES
  *
- * The method used here is similar to the lighting invariant correctin
+ * The method used here is similar to the lighting invariant correction
  * method but slightly different: we do not divide the RGB components,
  * but subtract them I2 = I0 - I1, where I0 is the sample image to be
  * corrected, I1 is the reference pattern. Then we solve DeltaI=0
  * (Laplace) with I2 Dirichlet conditions at the borders of the
- * mask. The solver is a unoptimized red/black checker Gauss-Siedel
- * with an over-relaxation factor of 1.8. It can benefit from a
- * multi-grid evaluation of an initial solution before the main
- * iteration loop.
+ * mask. The solver is a red/black checker Gauss-Seidel with over-relaxation.
+ * It could benefit from a multi-grid evaluation of an initial solution
+ * before the main iteration loop.
  *
  * I reduced the convergence criteria to 0.1% (0.001) as we are
  * dealing here with RGB integer components, more is overkill.
@@ -143,7 +147,7 @@ gimp_heal_start (GimpPaintCore     *paint_core,
   return TRUE;
 }
 
-/* Subtract bottom from top and store in result as a double
+/* Subtract bottom from top and store in result as a float
  */
 static void
 gimp_heal_sub (GeglBuffer          *top_buffer,
@@ -171,15 +175,15 @@ gimp_heal_sub (GeglBuffer          *top_buffer,
                             GEGL_BUFFER_READ, GEGL_ABYSS_NONE);
 
   gegl_buffer_iterator_add (iter, result_buffer, result_rect, 0,
-                            babl_format_n (babl_type ("double"), n_components),
+                            babl_format_n (babl_type ("float"), n_components),
                             GEGL_BUFFER_WRITE, GEGL_ABYSS_NONE);
 
   while (gegl_buffer_iterator_next (iter))
     {
-      gfloat  *t      = iter->data[0];
-      gfloat  *b      = iter->data[1];
-      gdouble *r      = iter->data[2];
-      gint     length = iter->length * n_components;
+      gfloat *t      = iter->data[0];
+      gfloat *b      = iter->data[1];
+      gfloat *r      = iter->data[2];
+      gint    length = iter->length * n_components;
 
       while (length--)
         *r++ = *t++ - *b++;
@@ -208,7 +212,7 @@ gimp_heal_add (GeglBuffer          *first_buffer,
     g_return_if_reached ();
 
   iter = gegl_buffer_iterator_new (first_buffer, first_rect, 0,
-                                   babl_format_n (babl_type ("double"),
+                                   babl_format_n (babl_type ("float"),
                                                   n_components),
                                    GEGL_BUFFER_READ, GEGL_ABYSS_NONE);
 
@@ -220,158 +224,168 @@ gimp_heal_add (GeglBuffer          *first_buffer,
 
   while (gegl_buffer_iterator_next (iter))
     {
-      gdouble *f      = iter->data[0];
-      gfloat  *s      = iter->data[1];
-      gfloat  *r      = iter->data[2];
-      gint     length = iter->length * n_components;
+      gfloat *f      = iter->data[0];
+      gfloat *s      = iter->data[1];
+      gfloat *r      = iter->data[2];
+      gint    length = iter->length * n_components;
 
       while (length--)
         *r++ = *f++ + *s++;
     }
 }
 
-/* Perform one iteration of the laplace solver for matrix.  Store the
- * result in solution and return the square of the cummulative error
- * of the solution.
- */
-static gdouble
-gimp_heal_laplace_iteration (gdouble *matrix,
-                             gint     height,
-                             gint     depth,
-                             gint     width,
-                             gdouble *solution,
-                             guchar  *mask)
+#if defined(__SSE__) && defined(__GNUC__) && __GNUC__ >= 4
+static float
+gimp_heal_laplace_iteration_sse (gfloat *pixels,
+                                 gfloat *Adiag,
+                                 gint   *Aidx,
+                                 gfloat  w,
+                                 gint    nmask)
 {
-  const gint    rowstride = width * depth;
-  gint          i, j, k, off, offm, offm0, off0;
-  gdouble       tmp, diff;
-  gdouble       err       = 0.0;
-  const gdouble w         = 1.80 * 0.25; /* Over-relaxation = 1.8 */
+  typedef float v4sf __attribute__((vector_size(16)));
+  gint i;
+  v4sf wv  = { w, w, w, w };
+  v4sf err = { 0, 0, 0, 0 };
+  union { v4sf v; float f[4]; } erru;
 
-  /* we use a red/black checker model of the discretization grid */
+#define Xv(j) (*(v4sf*)&pixels[Aidx[i * 5 + j]])
 
-  /* do reds */
-  for (i = 0; i < height; i++)
+  for (i = 0; i < nmask; i++)
     {
-      off0  = i * rowstride;
-      offm0 = i * width;
+      v4sf a    = { Adiag[i], Adiag[i], Adiag[i], Adiag[i] };
+      v4sf diff = a * Xv(0) - wv * (Xv(1) + Xv(2) + Xv(3) + Xv(4));
 
-      for (j = i % 2; j < width; j += 2)
-        {
-          off  = off0 + j * depth;
-          offm = offm0 + j;
-
-          if ((0 == mask[offm]) ||
-              (i == 0) || (i == (height - 1)) ||
-              (j == 0) || (j == (width - 1)))
-            {
-              /* do nothing at the boundary or outside mask */
-              for (k = 0; k < depth; k++)
-                solution[off + k] = matrix[off + k];
-            }
-          else
-            {
-              /* Use Gauss Siedel to get the correction factor then
-               * over-relax it
-               */
-              for (k = 0; k < depth; k++)
-                {
-                  tmp = solution[off + k];
-                  solution[off + k] = (matrix[off + k] +
-                                       w *
-                                       (matrix[off - depth + k] +     /* west */
-                                        matrix[off + depth + k] +     /* east */
-                                        matrix[off - rowstride + k] + /* north */
-                                        matrix[off + rowstride + k] - 4.0 *
-                                        matrix[off+k]));              /* south */
-
-                  diff = solution[off + k] - tmp;
-                  err += diff * diff;
-                }
-            }
-        }
+      Xv(0) -= diff;
+      err += diff * diff;
     }
 
+  erru.v = err;
 
-  /* Do blacks
-   *
-   * As we've done the reds earlier, we can use them right now to
-   * accelerate the convergence. So we have "solution" in the solver
-   * instead of "matrix" above
-   */
-  for (i = 0; i < height; i++)
-    {
-      off0 =  i * rowstride;
-      offm0 = i * width;
+  return erru.f[0] + erru.f[1] + erru.f[2] + erru.f[3];
+}
+#endif
+
+/* Perform one iteration of Gauss-Seidel, and return the sum squared residual.
+ */
+static float
+gimp_heal_laplace_iteration (gfloat *pixels,
+                             gfloat *Adiag,
+                             gint   *Aidx,
+                             gfloat  w,
+                             gint    nmask,
+                             gint    depth)
+{
+  gint   i, k;
+  gfloat err = 0;
+
+#if defined(__SSE__) && defined(__GNUC__) && __GNUC__ >= 4
+  if (depth == 4)
+    return gimp_heal_laplace_iteration_sse (pixels, Adiag, Aidx, w, nmask);
+#endif
 
-      for (j = (i % 2) ? 0 : 1; j < width; j += 2)
+  for (i = 0; i < nmask; i++)
+    {
+      gint   j0 = Aidx[i * 5 + 0];
+      gint   j1 = Aidx[i * 5 + 1];
+      gint   j2 = Aidx[i * 5 + 2];
+      gint   j3 = Aidx[i * 5 + 3];
+      gint   j4 = Aidx[i * 5 + 4];
+      gfloat a  = Adiag[i];
+
+      for (k = 0; k < depth; k++)
         {
-          off = off0 + j * depth;
-          offm = offm0 + j;
-
-          if ((0 == mask[offm]) ||
-              (i == 0) || (i == (height - 1)) ||
-              (j == 0) || (j == (width - 1)))
-            {
-              /* do nothing at the boundary or outside mask */
-              for (k = 0; k < depth; k++)
-                solution[off + k] = matrix[off + k];
-            }
-          else
-            {
-              /* Use Gauss Siedel to get the correction factor then
-               * over-relax it
-               */
-              for (k = 0; k < depth; k++)
-                {
-                  tmp = solution[off + k];
-                  solution[off + k] = (matrix[off + k] +
-                                       w *
-                                       (solution[off - depth + k] +     /* west */
-                                        solution[off + depth + k] +     /* east */
-                                        solution[off - rowstride + k] + /* north */
-                                        solution[off + rowstride + k] - 4.0 *
-                                        matrix[off+k]));                /* south */
-
-                  diff = solution[off + k] - tmp;
-                  err += diff*diff;
-                }
-            }
+          gfloat diff = (a * pixels[j0 + k] -
+                         w * (pixels[j1 + k] +
+                              pixels[j2 + k] +
+                              pixels[j3 + k] +
+                              pixels[j4 + k]));
+
+          pixels[j0 + k] -= diff;
+          err += diff * diff;
         }
     }
 
   return err;
 }
 
-/* Solve the laplace equation for matrix and store the result in solution.
+/* Solve the laplace equation for pixels and store the result in-place.
  */
 static void
-gimp_heal_laplace_loop (gdouble *matrix,
-                        gint     height,
-                        gint     depth,
-                        gint     width,
-                        gdouble *solution,
-                        guchar  *mask)
+gimp_heal_laplace_loop (gfloat *pixels,
+                        gint    height,
+                        gint    depth,
+                        gint    width,
+                        guchar *mask)
 {
-#define EPSILON   1e-8
-#define MAX_ITER  500
-  gint i;
-
-  /* repeat until convergence or max iterations */
-  for (i = 0; i < MAX_ITER; i++)
-    {
-      gdouble sqr_err;
-
-      /* do one iteration and store the amount of error */
-      sqr_err = gimp_heal_laplace_iteration (matrix, height, depth, width,
-                                             solution, mask);
+  /* Tolerate a total deviation-from-smoothness of 0.1 LSBs at 8bit depth. */
+#define EPSILON  (0.1/255)
+#define MAX_ITER 500
+
+  gint    i, j, iter, parity, nmask, zero;
+  gfloat *Adiag;
+  gint   *Aidx;
+  gfloat  w;
+
+  Adiag = g_new (gfloat, width * height);
+  Aidx  = g_new (gint, 5 * width * height);
+
+  /* All off-diagonal elements of A are either -1 or 0. We could store it as a
+   * general-purpose sparse matrix, but that adds some unnecessary overhead to
+   * the inner loop. Instead, assume exactly 4 off-diagonal elements in each
+   * row, all of which have value -1. Any row that in fact wants less than 4
+   * coefs can put them in a dummy column to be multiplied by an empty pixel.
+   */
+  zero = depth * width * height;
+  memset (pixels + zero, 0, depth * sizeof (gfloat));
 
-      /* copy solution to matrix */
-      memcpy (matrix, solution, width * height * depth * sizeof (double));
+  /* Construct the system of equations.
+   * Arrange Aidx in checkerboard order, so that a single linear pass over that
+   * array results updating all of the red cells and then all of the black cells.
+   */
+  nmask = 0;
+  for (parity = 0; parity < 2; parity++)
+    for (i = 0; i < height; i++)
+      for (j = (i&1)^parity; j < width; j+=2)
+        if (mask[j + i * width])
+          {
+#define A_NEIGHBOR(o,di,dj) \
+            if ((dj<0 && j==0) || (dj>0 && j==width-1) || (di<0 && i==0) || (di>0 && i==height-1)) \
+              Aidx[o + nmask * 5] = zero; \
+            else                                               \
+              Aidx[o + nmask * 5] = ((i + di) * width + (j + dj)) * depth;
+
+            /* Omit Dirichlet conditions for any neighbors off the
+             * edge of the canvas.
+             */
+            Adiag[nmask] = 4 - (i==0) - (j==0) - (i==height-1) - (j==width-1);
+            A_NEIGHBOR (0,  0,  0);
+            A_NEIGHBOR (1,  0,  1);
+            A_NEIGHBOR (2,  1,  0);
+            A_NEIGHBOR (3,  0, -1);
+            A_NEIGHBOR (4, -1,  0);
+            nmask++;
+          }
+
+  /* Empirically optimal over-relaxation factor. (Benchmarked on
+   * round brushes, at least. I don't know whether aspect ratio
+   * affects it.)
+   */
+  w = 2.0 - 1.0 / (0.1575 * sqrt (nmask) + 0.8);
+  w *= 0.25;
+  for (i = 0; i < nmask; i++)
+    Adiag[i] *= w;
 
-      if (sqr_err < EPSILON)
+  /* Gauss-Seidel with successive over-relaxation */
+  for (iter = 0; iter < MAX_ITER; iter++)
+    {
+      gfloat err = gimp_heal_laplace_iteration (pixels, Adiag, Aidx,
+                                                w, nmask, depth);
+      if (err < EPSILON * EPSILON * w * w)
         break;
     }
+
+  g_free (Adiag);
+  g_free (Aidx);
 }
 
 /* Original Algorithm Design:
@@ -393,10 +407,8 @@ gimp_heal (GeglBuffer          *src_buffer,
   gint        dest_components;
   gint        width;
   gint        height;
-  gdouble    *i_1;
-  gdouble    *i_2;
-  GeglBuffer *i_1_buffer;
-  GeglBuffer *i_2_buffer;
+  gfloat     *diff, *diff_alloc;
+  GeglBuffer *diff_buffer;
   guchar     *mask;
 
   src_format  = gegl_buffer_get_format (src_buffer);
@@ -410,46 +422,37 @@ gimp_heal (GeglBuffer          *src_buffer,
 
   g_return_if_fail (src_components == dest_components);
 
-  i_1  = g_new (gdouble, width * height * src_components);
-  i_2  = g_new (gdouble, width * height * src_components);
+  diff_alloc = g_new (gfloat, 4 + (width * height + 1) * src_components);
+  diff = (gfloat*)(((uintptr_t)diff_alloc + 15) & ~15);
 
-  i_1_buffer =
-    gegl_buffer_linear_new_from_data (i_1,
-                                      babl_format_n (babl_type ("double"),
-                                                     src_components),
-                                      GEGL_RECTANGLE (0, 0, width, height),
-                                      GEGL_AUTO_ROWSTRIDE,
-                                      (GDestroyNotify) g_free, i_1);
-  i_2_buffer =
-    gegl_buffer_linear_new_from_data (i_2,
-                                      babl_format_n (babl_type ("double"),
+  diff_buffer =
+    gegl_buffer_linear_new_from_data (diff,
+                                      babl_format_n (babl_type ("float"),
                                                      src_components),
                                       GEGL_RECTANGLE (0, 0, width, height),
                                       GEGL_AUTO_ROWSTRIDE,
-                                      (GDestroyNotify) g_free, i_2);
+                                      (GDestroyNotify) g_free, diff_alloc);
 
-  /* subtract pattern from image and store the result as a double in i_1 */
+  /* subtract pattern from image and store the result as a float in diff */
   gimp_heal_sub (dest_buffer, dest_rect,
                  src_buffer, src_rect,
-                 i_1_buffer, GEGL_RECTANGLE (0, 0, width, height));
+                 diff_buffer, GEGL_RECTANGLE (0, 0, width, height));
 
   mask = g_new (guchar, mask_rect->width * mask_rect->height);
 
   gegl_buffer_get (mask_buffer, mask_rect, 1.0, babl_format ("Y u8"),
                    mask, GEGL_AUTO_ROWSTRIDE, GEGL_ABYSS_NONE);
 
-  /* FIXME: is a faster implementation needed? */
-  gimp_heal_laplace_loop (i_1, height, src_components, width, i_2, mask);
+  gimp_heal_laplace_loop (diff, height, src_components, width, mask);
 
   g_free (mask);
 
   /* add solution to original image and store in dest */
-  gimp_heal_add (i_2_buffer, GEGL_RECTANGLE (0, 0, width, height),
+  gimp_heal_add (diff_buffer, GEGL_RECTANGLE (0, 0, width, height),
                  src_buffer, src_rect,
                  dest_buffer, dest_rect);
 
-  g_object_unref (i_1_buffer);
-  g_object_unref (i_2_buffer);
+  g_object_unref (diff_buffer);
 }
 
 static void
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]