[gegl] operations/common/gblur-1d: Optimize by exploiting cache lines



commit e7f98682b9bddc7f7236fabe4cbc77e701510aa4
Author: Debarshi Ray <debarshir gnome org>
Date:   Mon Dec 25 02:47:42 2017 +0100

    operations/common/gblur-1d: Optimize by exploiting cache lines
    
    Leapfrogging through the buffer for each colour channel doesn't fully
    utilize the data cache. When a pixel's colour channel is accessed, it
    is very likely that the other channels have also been read into the
    cache because they are on the same cache line. Therefore, computing
    the entire pixel at the same time avoids having to reload the same
    cache line again.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=791994

 operations/common/gblur-1d.c |  436 +++++++++++++++++++++++++++++++++++++-----
 1 files changed, 386 insertions(+), 50 deletions(-)
---
diff --git a/operations/common/gblur-1d.c b/operations/common/gblur-1d.c
index 0af262d..0ce71cd 100644
--- a/operations/common/gblur-1d.c
+++ b/operations/common/gblur-1d.c
@@ -81,6 +81,19 @@ property_boolean (clip_extent, _("Clip to the input extent"), TRUE)
  *
  **********************************************/
 
+typedef void (*IirYoungBlur1dFunc) (gfloat           *buf,
+                                    gdouble          *tmp,
+                                    const gdouble    *b,
+                                    gdouble         (*m)[3],
+                                    const gfloat     *iminus,
+                                    const gfloat     *uplus,
+                                    const gint        len,
+                                    GeglAbyssPolicy   policy);
+
+static const gfloat white[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
+static const gfloat black[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+static const gfloat none[4]  = { 0.0f, 0.0f, 0.0f, 0.0f };
+
 static void
 iir_young_find_constants (gfloat   sigma,
                           gdouble *b,
@@ -122,11 +135,51 @@ iir_young_find_constants (gfloat   sigma,
 }
 
 static void
-fix_right_boundary (gdouble        *buf,
-                    gdouble       (*m)[3],
-                    const gdouble   uplus)
+get_boundaries (GeglAbyssPolicy   policy,
+                gfloat           *buf,
+                gint              len,
+                gint              nc,
+                const gfloat    **out_iminus,
+                const gfloat    **out_uplus)
+{
+  const gfloat *iminus;
+  const gfloat *uplus;
+
+  switch (policy)
+    {
+    case GEGL_ABYSS_CLAMP:
+    default:
+      iminus = &buf[nc * 3]; uplus = &buf[nc * (len + 2)];
+      break;
+
+    case GEGL_ABYSS_NONE:
+      iminus = uplus = &none[0];
+      break;
+
+    case GEGL_ABYSS_WHITE:
+      iminus = uplus = &white[0];
+      break;
+
+    case GEGL_ABYSS_BLACK:
+      iminus = uplus = &black[nc == 2 ? 2 : 0];
+      break;
+    }
+
+  if (out_iminus != NULL)
+    *out_iminus = iminus;
+
+  if (out_uplus != NULL)
+    *out_uplus = uplus;
+}
+
+static inline void
+fix_right_boundary_y (gdouble        *buf,
+                      gdouble       (*m)[3],
+                      const gfloat   *uplus)
 {
-  gdouble u[3] = { buf[-1] - uplus, buf[-2] - uplus, buf[-3] - uplus };
+  gdouble u[3] = { buf[-1] - uplus[0],
+                   buf[-2] - uplus[0],
+                   buf[-3] - uplus[0] };
   gint    i, k;
 
   for (i = 0; i < 3; i++)
@@ -136,69 +189,330 @@ fix_right_boundary (gdouble        *buf,
       for (k = 0; k < 3; k++)
         tmp += m[i][k] * u[k];
 
-      buf[i] = tmp + uplus;
+      buf[i] = tmp + uplus[0];
+    }
+}
+
+static void
+iir_young_blur_1D_y (gfloat        *buf,
+                     gdouble       *tmp,
+                     const gdouble *b,
+                     gdouble      (*m)[3],
+                     const gfloat  *iminus,
+                     const gfloat  *uplus,
+                     const gint     len,
+                     GeglAbyssPolicy policy)
+{
+  gint    i, j;
+
+  for (i = 0; i < 3; i++, tmp++)
+    tmp[0] = iminus[0];
+
+  buf += 3;
+
+  for (i = 3; i < 3 + len; i++, buf++, tmp++)
+    {
+      tmp[0] = b[0] * buf[0];
+
+      for (j = 1; j < 4; ++j)
+        {
+          gint offset = -1 * j;
+          tmp[0] += b[j] * tmp[offset + 0];
+        }
+    }
+
+  fix_right_boundary_y (tmp, m, uplus);
+
+  buf--;
+  tmp--;
+
+  for (i = 3 + len - 1; 3 <= i; i--, buf--, tmp--)
+    {
+      tmp[0] *= b[0];
+
+      for (j = 1; j < 4; ++j)
+        {
+          gint offset = j;
+          tmp[0] += b[j] * tmp[offset + 0];
+        }
+
+      buf[0] = tmp[0];
     }
 }
 
 static inline void
-iir_young_blur_1D (gfloat        *buf,
-                   gdouble       *tmp,
-                   const gdouble *b,
-                   gdouble      (*m)[3],
-                   const gint     len,
-                   const gint     nc,
-                   GeglAbyssPolicy policy)
+fix_right_boundary_yA (gdouble        *buf,
+                       gdouble       (*m)[3],
+                       const gfloat   *uplus)
 {
-  gfloat  white[4] = { 1, 1, 1, 1 };
-  gfloat  black[4] = { 0, 0, 0, 1 };
-  gfloat  none[4]  = { 0, 0, 0, 0 };
-  gfloat *uplus, *iminus;
-  gint    i, j, c;
+  gdouble u[6] = { buf[-2] - uplus[0], buf[-1] - uplus[1],
+                   buf[-4] - uplus[0], buf[-3] - uplus[1],
+                   buf[-6] - uplus[0], buf[-5] - uplus[1] };
+  gint    i, k;
 
-  switch (policy)
+  for (i = 0; i < 3; i++)
     {
-    case GEGL_ABYSS_CLAMP: default:
-      iminus = &buf[nc * 3]; uplus = &buf[nc * (len + 2)]; break;
+      gdouble tmp[2] = { 0.0, 0.0 };
 
-    case GEGL_ABYSS_NONE:
-      iminus = uplus = &none[0]; break;
+      for (k = 0; k < 3; k++)
+        {
+          tmp[0] += m[i][k] * u[k * 2 + 0];
+          tmp[1] += m[i][k] * u[k * 2 + 1];
+        }
 
-    case GEGL_ABYSS_WHITE:
-      iminus = uplus = &white[0]; break;
+      buf[2 * i + 0] = tmp[0] + uplus[0];
+      buf[2 * i + 1] = tmp[1] + uplus[1];
+    }
+}
 
-    case GEGL_ABYSS_BLACK:
-      iminus = uplus = &black[nc == 2 ? 2 : 0]; break;
+static void
+iir_young_blur_1D_yA (gfloat           *buf,
+                      gdouble          *tmp,
+                      const gdouble    *b,
+                      gdouble         (*m)[3],
+                      const gfloat     *iminus,
+                      const gfloat     *uplus,
+                      const gint        len,
+                      GeglAbyssPolicy   policy)
+{
+  gint    i, j;
+
+  for (i = 0; i < 3; i++, tmp += 2)
+    {
+      tmp[0] = iminus[0];
+      tmp[1] = iminus[1];
     }
 
-  for (c = 0; c < nc; ++c)
+  buf += 6;
+
+  for (i = 3; i < 3 + len; i++, buf += 2, tmp += 2)
     {
-      for (i = 0; i < 3; ++i)
-        tmp[i] = iminus[c];
+      tmp[0] = b[0] * buf[0];
+      tmp[1] = b[0] * buf[1];
 
-      for (i = 3; i < 3 + len; ++i)
+      for (j = 1; j < 4; ++j)
         {
-          tmp[i] = buf[nc * i + c] * b[0];
+          gint offset = -2 * j;
 
-          for (j = 1; j < 4; ++j)
-            tmp[i] += b[j] * tmp[i - j];
+          tmp[0] += b[j] * tmp[offset + 0];
+          tmp[1] += b[j] * tmp[offset + 1];
         }
+    }
+
+  fix_right_boundary_yA (tmp, m, uplus);
+
+  buf -= 2;
+  tmp -= 2;
 
-      fix_right_boundary (&tmp[3 + len], m, uplus[c]);
+  for (i = 3 + len - 1; 3 <= i; i--, buf -= 2, tmp -= 2)
+    {
+      tmp[0] *= b[0];
+      tmp[1] *= b[0];
 
-      for (i = 3 + len - 1; 3 <= i; --i)
+      for (j = 1; j < 4; ++j)
         {
-          gdouble tt = 0.;
+          gint offset = 2 * j;
+
+          tmp[0] += b[j] * tmp[offset + 0];
+          tmp[1] += b[j] * tmp[offset + 1];
+        }
 
-          for (j = 0; j < 4; ++j)
-            tt += b[j] * tmp[i + j];
+      buf[0] = tmp[0];
+      buf[1] = tmp[1];
+    }
+}
 
-          buf[nc * i + c] = tmp[i] = tt;
+static inline void
+fix_right_boundary_rgb (gdouble        *buf,
+                        gdouble       (*m)[3],
+                        const gfloat   *uplus)
+{
+  gdouble u[9] = { buf[-3] - uplus[0], buf[-2] - uplus[1], buf[-1] - uplus[2],
+                   buf[-6] - uplus[0], buf[-5] - uplus[1], buf[-4] - uplus[2],
+                   buf[-9] - uplus[0], buf[-8] - uplus[1], buf[-7] - uplus[2] };
+  gint    i, k;
+
+  for (i = 0; i < 3; i++)
+    {
+      gdouble tmp[3] = { 0.0, 0.0, 0.0 };
+
+      for (k = 0; k < 3; k++)
+        {
+          tmp[0] += m[i][k] * u[k * 3 + 0];
+          tmp[1] += m[i][k] * u[k * 3 + 1];
+          tmp[2] += m[i][k] * u[k * 3 + 2];
         }
+
+      buf[3 * i + 0] = tmp[0] + uplus[0];
+      buf[3 * i + 1] = tmp[1] + uplus[1];
+      buf[3 * i + 2] = tmp[2] + uplus[2];
     }
 }
 
 static void
-iir_young_hor_blur (GeglBuffer          *src,
+iir_young_blur_1D_rgb (gfloat           *buf,
+                       gdouble          *tmp,
+                       const gdouble    *b,
+                       gdouble         (*m)[3],
+                       const gfloat     *iminus,
+                       const gfloat     *uplus,
+                       const gint        len,
+                       GeglAbyssPolicy   policy)
+{
+  gint    i, j;
+
+  for (i = 0; i < 3; i++, tmp += 3)
+    {
+      tmp[0] = iminus[0];
+      tmp[1] = iminus[1];
+      tmp[2] = iminus[2];
+    }
+
+  buf += 9;
+
+  for (i = 3; i < 3 + len; i++, buf += 3, tmp += 3)
+    {
+      tmp[0] = b[0] * buf[0];
+      tmp[1] = b[0] * buf[1];
+      tmp[2] = b[0] * buf[2];
+
+      for (j = 1; j < 4; ++j)
+        {
+          gint offset = -3 * j;
+
+          tmp[0] += b[j] * tmp[offset + 0];
+          tmp[1] += b[j] * tmp[offset + 1];
+          tmp[2] += b[j] * tmp[offset + 2];
+        }
+    }
+
+  fix_right_boundary_rgb (tmp, m, uplus);
+
+  buf -= 3;
+  tmp -= 3;
+
+  for (i = 3 + len - 1; 3 <= i; i--, buf -= 3, tmp -= 3)
+    {
+      tmp[0] *= b[0];
+      tmp[1] *= b[0];
+      tmp[2] *= b[0];
+
+      for (j = 1; j < 4; ++j)
+        {
+          gint offset = 3 * j;
+
+          tmp[0] += b[j] * tmp[offset + 0];
+          tmp[1] += b[j] * tmp[offset + 1];
+          tmp[2] += b[j] * tmp[offset + 2];
+        }
+
+      buf[0] = tmp[0];
+      buf[1] = tmp[1];
+      buf[2] = tmp[2];
+    }
+}
+
+static inline void
+fix_right_boundary_rgbA (gdouble        *buf,
+                         gdouble       (*m)[3],
+                         const gfloat   *uplus)
+{
+  gdouble u[12] = { buf[-4] - uplus[0], buf[-3] - uplus[1], buf[-2] - uplus[2], buf[-1] - uplus[3],
+                    buf[-8] - uplus[0], buf[-7] - uplus[1], buf[-6] - uplus[2], buf[-5] - uplus[3],
+                    buf[-12] - uplus[0], buf[-11] - uplus[1], buf[-10] - uplus[2], buf[-9] - uplus[3] };
+  gint    i, k;
+
+  for (i = 0; i < 3; i++)
+    {
+      gdouble tmp[4] = { 0.0, 0.0, 0.0, 0.0 };
+
+      for (k = 0; k < 3; k++)
+        {
+          tmp[0] += m[i][k] * u[k * 4 + 0];
+          tmp[1] += m[i][k] * u[k * 4 + 1];
+          tmp[2] += m[i][k] * u[k * 4 + 2];
+          tmp[3] += m[i][k] * u[k * 4 + 3];
+        }
+
+      buf[4 * i + 0] = tmp[0] + uplus[0];
+      buf[4 * i + 1] = tmp[1] + uplus[1];
+      buf[4 * i + 2] = tmp[2] + uplus[2];
+      buf[4 * i + 3] = tmp[3] + uplus[3];
+    }
+}
+
+static void
+iir_young_blur_1D_rgbA (gfloat           *buf,
+                        gdouble          *tmp,
+                        const gdouble    *b,
+                        gdouble         (*m)[3],
+                        const gfloat     *iminus,
+                        const gfloat     *uplus,
+                        const gint        len,
+                        GeglAbyssPolicy   policy)
+{
+  gint     i, j;
+
+  for (i = 0; i < 3; i++, tmp += 4)
+    {
+      tmp[0] = iminus[0];
+      tmp[1] = iminus[1];
+      tmp[2] = iminus[2];
+      tmp[3] = iminus[3];
+    }
+
+  buf += 12;
+
+  for (i = 0; i < len; i++, buf += 4, tmp += 4)
+    {
+      tmp[0] = b[0] * buf[0];
+      tmp[1] = b[0] * buf[1];
+      tmp[2] = b[0] * buf[2];
+      tmp[3] = b[0] * buf[3];
+
+      for (j = 1; j < 4; ++j)
+        {
+          gint offset = -4 * j;
+
+          tmp[0] += b[j] * tmp[offset + 0];
+          tmp[1] += b[j] * tmp[offset + 1];
+          tmp[2] += b[j] * tmp[offset + 2];
+          tmp[3] += b[j] * tmp[offset + 3];
+        }
+    }
+
+  fix_right_boundary_rgbA (tmp, m, uplus);
+
+  buf -= 4;
+  tmp -= 4;
+
+  for (i = 3 + len - 1; 3 <= i; i--, buf -= 4, tmp -= 4)
+    {
+      tmp[0] *= b[0];
+      tmp[1] *= b[0];
+      tmp[2] *= b[0];
+      tmp[3] *= b[0];
+
+      for (j = 1; j < 4; ++j)
+        {
+          gint offset = 4 * j;
+
+          tmp[0] += b[j] * tmp[offset + 0];
+          tmp[1] += b[j] * tmp[offset + 1];
+          tmp[2] += b[j] * tmp[offset + 2];
+          tmp[3] += b[j] * tmp[offset + 3];
+        }
+
+      buf[0] = tmp[0];
+      buf[1] = tmp[1];
+      buf[2] = tmp[2];
+      buf[3] = tmp[3];
+    }
+}
+
+static void
+iir_young_hor_blur (IirYoungBlur1dFunc   real_blur_1D,
+                    GeglBuffer          *src,
                     const GeglRectangle *rect,
                     GeglBuffer          *dst,
                     const gdouble       *b,
@@ -210,19 +524,23 @@ iir_young_hor_blur (GeglBuffer          *src,
   GeglRectangle  cur_row = *rect;
   const gint     nc = babl_format_get_n_components (format);
   gfloat        *row = g_new (gfloat, (3 + rect->width + 3) * nc);
-  gdouble       *tmp = g_new (gdouble, (3 + rect->width + 3));
+  gdouble       *tmp = g_new (gdouble, (3 + rect->width + 3) * nc);
   gint           v;
 
   cur_row.height = 1;
 
   for (v = 0; v < rect->height; v++)
     {
+      const gfloat *iminus;
+      const gfloat *uplus;
+
       cur_row.y = rect->y + v;
 
       gegl_buffer_get (src, &cur_row, 1.0/(1<<level), format, &row[3 * nc],
                        GEGL_AUTO_ROWSTRIDE, GEGL_ABYSS_NONE);
 
-      iir_young_blur_1D (row, tmp, b, m, rect->width, nc, policy);
+      get_boundaries (policy, row, rect->width, nc, &iminus, &uplus);
+      real_blur_1D (row, tmp, b, m, iminus, uplus, rect->width, policy);
 
       gegl_buffer_set (dst, &cur_row, level, format, &row[3 * nc],
                        GEGL_AUTO_ROWSTRIDE);
@@ -233,7 +551,8 @@ iir_young_hor_blur (GeglBuffer          *src,
 }
 
 static void
-iir_young_ver_blur (GeglBuffer          *src,
+iir_young_ver_blur (IirYoungBlur1dFunc   real_blur_1D,
+                    GeglBuffer          *src,
                     const GeglRectangle *rect,
                     GeglBuffer          *dst,
                     const gdouble       *b,
@@ -245,19 +564,23 @@ iir_young_ver_blur (GeglBuffer          *src,
   GeglRectangle  cur_col = *rect;
   const gint     nc = babl_format_get_n_components (format);
   gfloat        *col = g_new (gfloat, (3 + rect->height + 3) * nc);
-  gdouble       *tmp = g_new (gdouble, (3 + rect->height + 3));
+  gdouble       *tmp = g_new (gdouble, (3 + rect->height + 3) * nc);
   gint           i;
 
   cur_col.width = 1;
 
   for (i = 0; i < rect->width; i++)
     {
+      const gfloat *iminus;
+      const gfloat *uplus;
+
       cur_col.x = rect->x + i;
 
       gegl_buffer_get (src, &cur_col, 1.0/(1<<level), format, &col[3 * nc],
                        GEGL_AUTO_ROWSTRIDE, GEGL_ABYSS_NONE);
 
-      iir_young_blur_1D (col, tmp, b, m, rect->height, nc, policy);
+      get_boundaries (policy, col, rect->height, nc, &iminus, &uplus);
+      real_blur_1D (col, tmp, b, m, iminus, uplus, rect->height, policy);
 
       gegl_buffer_set (dst, &cur_col, level, format, &col[3 * nc],
                        GEGL_AUTO_ROWSTRIDE);
@@ -610,9 +933,12 @@ filter_disambiguation (GeglGblur1dFilter filter,
 static void
 gegl_gblur_1d_prepare (GeglOperation *operation)
 {
+  GeglProperties *o = GEGL_PROPERTIES (operation);
   const Babl *src_format = gegl_operation_get_source_format (operation, "input");
   const char *format     = "RaGaBaA float";
 
+  o->user_data = iir_young_blur_1D_rgbA;
+
   /*
    * FIXME: when the abyss policy is _NONE, the behavior at the edge
    *        depends on input format (with or without an alpha component)
@@ -622,12 +948,21 @@ gegl_gblur_1d_prepare (GeglOperation *operation)
       const Babl *model = babl_format_get_model (src_format);
 
       if (model == babl_model ("RGB") || model == babl_model ("R'G'B'"))
-        format = "RGB float";
+        {
+          format = "RGB float";
+          o->user_data = iir_young_blur_1D_rgb;
+        }
       else if (model == babl_model ("Y") || model == babl_model ("Y'"))
-        format = "Y float";
+        {
+          format = "Y float";
+          o->user_data = iir_young_blur_1D_y;
+        }
       else if (model == babl_model ("YA") || model == babl_model ("Y'A") ||
                model == babl_model ("YaA") || model == babl_model ("Y'aA"))
-        format = "YaA float";
+        {
+          format = "YaA float";
+          o->user_data = iir_young_blur_1D_yA;
+        }
     }
 
   gegl_operation_set_format (operation, "input", babl_format (format));
@@ -843,14 +1178,15 @@ gegl_gblur_1d_process (GeglOperation       *operation,
 
   if (filter == GEGL_GBLUR_1D_IIR)
     {
+      IirYoungBlur1dFunc real_blur_1D = (IirYoungBlur1dFunc) o->user_data;
       gdouble b[4], m[3][3];
 
       iir_young_find_constants (std_dev, b, m);
 
       if (o->orientation == GEGL_ORIENTATION_HORIZONTAL)
-        iir_young_hor_blur (input, result, output, b, m, abyss_policy, format, level);
+        iir_young_hor_blur (real_blur_1D, input, result, output, b, m, abyss_policy, format, level);
       else
-        iir_young_ver_blur (input, result, output, b, m, abyss_policy, format, level);
+        iir_young_ver_blur (real_blur_1D, input, result, output, b, m, abyss_policy, format, level);
     }
   else
     {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]