[gegl] Performance improvements in oilfy and opencl support.



commit 295863957b16472de192638b2bf3ea487edd2f36
Author: RPG <roshin scriptumplus ru>
Date:   Sat Dec 29 20:08:31 2012 -0200

    Performance improvements in oilfy and opencl support.

 operations/common/oilify.c |  458 +++++++++++++++++++++++++++++++++++---------
 1 files changed, 365 insertions(+), 93 deletions(-)
---
diff --git a/operations/common/oilify.c b/operations/common/oilify.c
index e5cac3c..305cfb4 100644
--- a/operations/common/oilify.c
+++ b/operations/common/oilify.c
@@ -27,9 +27,12 @@
 gegl_chant_int (mask_radius, _("Mask Radius"), 1, 25, 4,
                 _("Radius of circle around pixel"))
 
-gegl_chant_double (exponent, _("Exponent"), 1.0, 20.0, 8.0,
+gegl_chant_int (exponent, _("Exponent"), 1, 20, 8,
                    _("Exponent"))
 
+gegl_chant_int (intensities, _("Number of intensities"), 8, 256, 128,
+                    _("Histogram size"))
+
 gegl_chant_boolean (use_inten, _("Intensity Mode"), TRUE,
                     _("Use pixel luminance values"))
 
@@ -58,7 +61,6 @@ get_pixel (gint    x,
       dst[b] = src[b];
     }
 }
-
 static void
 get_pixel_inten (gint    x,
                  gint    y,
@@ -68,45 +70,35 @@ get_pixel_inten (gint    x,
 {
   *dst = *(inten_begin + (x + buf_width*y));
 }
-
 static void
-oilify_pixel (gint           x,
-              gint           y,
-              GeglRectangle *whole_rect,
-              gboolean       use_inten,
-              gdouble        radius,
-              gdouble        exponent,
-              gint           buf_width,
-              gfloat        *src_buf,
-              gfloat        *inten_buf,
-              gfloat        *dst_pixel)
+oilify_pixel_inten (gint     x,
+                    gint     y,
+                    gdouble  radius,
+                    gint     exponent,
+                    gint     intensities,
+                    gint     buf_width,
+                    gfloat  *src_buf,
+                    gfloat  *inten_buf,
+                    gfloat  *dst_pixel)
 {
-  gint   hist[4][NUM_INTENSITIES];
   gfloat cumulative_rgb[4][NUM_INTENSITIES];
   gint   hist_inten[NUM_INTENSITIES];
-  gfloat mult_inten[NUM_INTENSITIES];
+  gfloat mult_inten;
   gfloat temp_pixel[4];
-  gfloat temp_inten_pixel;
   gint   ceil_radius = ceil (radius);
   gdouble radius_sq = radius*radius;
   gint i, j, b;
-  gint hist_max[4];
   gint inten_max;
   gint intensity;
-  gfloat sum;
-  gfloat ratio;
+  gfloat ratio, temp_inten_pixel;
   gfloat weight;
-  gfloat color;
-  gfloat result;
+  gfloat color[4];
   gfloat div;
-  for (i = 0; i < NUM_INTENSITIES; i++)
+  for (i = 0; i < intensities; i++)
     {
       hist_inten[i] = 0;
       for (b = 0; b < 4; b++)
-        {
-          hist[b][i] = 0;
           cumulative_rgb[b][i] = 0.0;
-        }
     }
 
   /* calculate histograms */
@@ -121,28 +113,16 @@ oilify_pixel (gint           x,
                          buf_width,
                          src_buf,
                          temp_pixel);
-
-              if (use_inten)
-                {
-                  get_pixel_inten (x + i,
-                                   y + j,
-                                   buf_width,
-                                   inten_buf,
-                                   &temp_inten_pixel);
-                  intensity = temp_inten_pixel * NUM_INTENSITIES;
-                  hist_inten[intensity]++;
-                  for (b = 0; b < 4; b++)
-                    {
-                      cumulative_rgb[b][intensity] += temp_pixel[b];
-                    }
-                }
-              else
+              get_pixel_inten (x + i,
+                               y + j,
+                               buf_width,
+                               inten_buf,
+                               &temp_inten_pixel);
+              intensity = temp_inten_pixel * (intensities - 1);
+              hist_inten[intensity]++;
+              for (b = 0; b < 4; b++)
                 {
-                  for (b = 0; b < 4; b++)
-                    {
-                      intensity = temp_pixel[b] * NUM_INTENSITIES;
-                      hist[b][intensity]++;
-                    }
+                  cumulative_rgb[b][intensity] += temp_pixel[b];
                 }
             }
         }
@@ -151,57 +131,128 @@ oilify_pixel (gint           x,
   inten_max = 1;
 
   /* calculated maximums */
-  for (i = 0; i < NUM_INTENSITIES; i++) {
+  for (i = 0; i < intensities; i++) {
     inten_max = MAX (inten_max, hist_inten[i]);
   }
 
+  /* calculate weight and use it to set the pixel */
+  div = 0.0;
+
   for (b = 0; b < 4; b++)
+    color[b] = 0.0;
+  for (i = 0; i < intensities; i++)
     {
-      hist_max[b] = 1;
-      for (i = 0; i < NUM_INTENSITIES; i++) {
-        hist_max[b] = MAX (hist_max[b], hist[b][i]);
+      if (hist_inten[i] > 0)
+      {
+        ratio = (gfloat) hist_inten[i] / (gfloat) inten_max;
+
+        /* using this instead of pow function gives HUGE performance improvement
+           but we cannot use floating point exponent... */
+        weight = 1.;
+        for(j = 0; j < exponent; j++)
+          weight *= ratio;
+        /* weight = powf(ratio, exponent); */
+        mult_inten = weight / (gfloat) hist_inten[i];
+
+        div += weight;
+        for (b = 0; b < 4; b++)
+          color[b] += mult_inten * cumulative_rgb[b][i];
       }
     }
+  for (b = 0; b < 4; b++)
+    dst_pixel[b] = color[b]/div;
+}
 
-  /* calculate weight and use it to set the pixel */
-  div = 0.0;
-  for (i = 0; i < NUM_INTENSITIES; i++)
+static void
+oilify_pixel (gint           x,
+              gint           y,
+              gdouble        radius,
+              gint           exponent,
+              gint           intensities,
+              gint           buf_width,
+              gfloat        *src_buf,
+              gfloat        *dst_pixel)
+{
+  gint   hist[4][NUM_INTENSITIES];
+  gfloat temp_pixel[4];
+  gint   ceil_radius = ceil (radius);
+  gdouble radius_sq = radius*radius;
+  gint i, j, b;
+  gint hist_max[4];
+  gint intensity;
+  gfloat sum[4];
+  gfloat ratio;
+  gfloat weight;
+  gfloat result[4];
+  gfloat div[4];
+  for (i = 0; i < intensities; i++)
     {
-      if (use_inten && hist_inten[i] > 0)
+      for (b = 0; b < 4; b++)
         {
-          ratio = (gfloat) hist_inten[i] / (gfloat) inten_max;
-          weight = pow (ratio, exponent);
-          mult_inten[i] = weight / (gfloat) hist_inten[i];
-          div += weight;
+          hist[b][i] = 0;
         }
     }
-  for (b = 0; b < 4; b++)
+
+  /* calculate histograms */
+  for (i = -ceil_radius; i <= ceil_radius; i++)
     {
-      sum = 0.0;
-      color = 0.0;
-      if (use_inten)
-        {
-          for (i = 0; i < NUM_INTENSITIES; i++)
-            {
-              if (hist_inten[i] > 0)
-                color += mult_inten[i] * cumulative_rgb[b][i];
-            }
-          dst_pixel[b] = color/div;
-        }
-      else
+      for (j = -ceil_radius; j <= ceil_radius; j++)
         {
-          div = 0.0;
-          for (i = 0; i < NUM_INTENSITIES; i++)
+          if (i*i + j*j <= radius_sq)
             {
-              ratio = (gfloat) hist[b][i] / (gfloat) hist_max[b];
-              weight = pow (ratio, exponent);
-              sum += weight * (gfloat) i;
-              div += weight;
+              get_pixel (x + i,
+                         y + j,
+                         buf_width,
+                         src_buf,
+                         temp_pixel);
+              for (b = 0; b < 4; b++)
+                {
+                  intensity = temp_pixel[b] * (intensities - 1);
+                  hist[b][intensity]++;
+                }
             }
-          result = sum / (gfloat) NUM_INTENSITIES;
-          dst_pixel[b] = result/div;
         }
     }
+    for (b = 0; b < 4; b++)
+      hist_max[b] = 1;
+    for (i = 0; i < intensities; i++) {
+      for (b = 0; b < 4; b++)
+        if(hist_max[b] < hist[b][i]) /* MAX macros too slow here */
+          hist_max[b] = hist[b][i];
+    }
+
+  /* calculate weight and use it to set the pixel */
+
+    for (b = 0; b < 4; b++)
+      {
+        sum[b] = 0.0;
+        div[b] = 0.0;
+      }
+    for (i = 0; i < intensities; i++)
+      {
+        /* UNROLL this bottleneck loop, up to 50% faster */
+        #define DO_HIST_STEP(b) if(hist[b][i] > 0)                          \
+          {                                                                 \
+            ratio = (gfloat) hist[b][i] / (gfloat) hist_max[b];             \
+            weight = 1.;                                                    \
+            for(j = 0; j < exponent; j++)                                   \
+              weight *= ratio;                                              \
+            sum[b] += weight * (gfloat) i;                                  \
+            div[b] += weight;                                               \
+          }
+
+        DO_HIST_STEP(0)
+        DO_HIST_STEP(1)
+        DO_HIST_STEP(2)
+        DO_HIST_STEP(3)
+        #undef DO_HIST_STEP
+      }
+
+    for (b = 0; b < 4; b++)
+      {
+        result[b] = sum[b] / (gfloat) (intensities - 1);
+        dst_pixel[b] = result[b]/div[b];
+      }
 }
 
 static void
@@ -214,9 +265,9 @@ prepare (GeglOperation *operation)
   o       = GEGL_CHANT_PROPERTIES (operation);
 
   op_area->left   =
-    op_area->right  =
-    op_area->top    =
-    op_area->bottom = o->mask_radius;
+  op_area->right  =
+  op_area->top    =
+  op_area->bottom = o->mask_radius;
 
   gegl_operation_set_format (operation, "input",
                              babl_format ("RGBA float"));
@@ -224,6 +275,218 @@ prepare (GeglOperation *operation)
                              babl_format ("RGBA float"));
 }
 
+#include "opencl/gegl-cl.h"
+#include "buffer/gegl-buffer-cl-iterator.h"
+
+/* two small different kernels are better than one big */
+static const char* kernel_source =
+"#define NUM_INTENSITIES 256                                                \n"
+"kernel void kernel_oilify(global float4 *in,                               \n"
+"                             global float4 *out,                           \n"
+"                             const int mask_radius,                        \n"
+"                             const int intensities,                        \n"
+"                             const float exponent)                         \n"
+"{                                                                          \n"
+"  int gidx = get_global_id(0);                                             \n"
+"  int gidy = get_global_id(1);                                             \n"
+"  int x = gidx + mask_radius;                                              \n"
+"  int y = gidy + mask_radius;                                              \n"
+"  int dst_width = get_global_size(0);                                      \n"
+"  int src_width = dst_width + mask_radius * 2;                             \n"
+"  float4 hist[NUM_INTENSITIES];                                            \n"
+"  float4 hist_max = 1.0;                                                   \n"
+"  int i, j, intensity;                                                     \n"
+"  int radius_sq = mask_radius * mask_radius;                               \n"
+"  float4 temp_pixel;                                                       \n"
+"  for (i = 0; i < intensities; i++)                                        \n"
+"    hist[i] = 0.0;                                                         \n"
+"                                                                           \n"
+"  for (i = -mask_radius; i <= mask_radius; i++)                            \n"
+"  {                                                                        \n"
+"    for (j = -mask_radius; j <= mask_radius; j++)                          \n"
+"      {                                                                    \n"
+"        if (i*i + j*j <= radius_sq)                                        \n"
+"          {                                                                \n"
+"            temp_pixel = in[x + i + (y + j) * src_width];                  \n"
+"            hist[(int)(temp_pixel.x * (intensities - 1))].x+=1;            \n"
+"            hist[(int)(temp_pixel.y * (intensities - 1))].y+=1;            \n"
+"            hist[(int)(temp_pixel.z * (intensities - 1))].z+=1;            \n"
+"            hist[(int)(temp_pixel.w * (intensities - 1))].w+=1;            \n"
+"          }                                                                \n"
+"      }                                                                    \n"
+"  }                                                                        \n"
+"                                                                           \n"
+"  for (i = 0; i < intensities; i++) {                                      \n"
+"    if(hist_max.x < hist[i].x)                                             \n"
+"      hist_max.x = hist[i].x;                                              \n"
+"    if(hist_max.y < hist[i].y)                                             \n"
+"      hist_max.y = hist[i].y;                                              \n"
+"    if(hist_max.z < hist[i].z)                                             \n"
+"      hist_max.z = hist[i].z;                                              \n"
+"    if(hist_max.w < hist[i].w)                                             \n"
+"      hist_max.w = hist[i].w;                                              \n"
+"  }                                                                        \n"
+"  float4 div = 0.0;                                                        \n"
+"  float4 sum = 0.0;                                                        \n"
+"  float4 ratio, weight;                                                    \n"
+"  for (i = 0; i < intensities; i++)                                        \n"
+"  {                                                                        \n"
+"    ratio = hist[i] / hist_max;                                            \n"
+"    weight = pow(ratio, (float4)exponent);                                 \n"
+"    sum += weight * (float4)i;                                             \n"
+"    div += weight;                                                         \n"
+"  }                                                                        \n"
+"  out[gidx + gidy * dst_width] = sum / div / (float)(intensities - 1);     \n"
+"}                                                                          \n"
+"                                                                           \n"
+"kernel void kernel_oilify_inten(global float4 *in,                         \n"
+"                             global float4 *out,                           \n"
+"                             const int mask_radius,                        \n"
+"                             const int intensities,                        \n"
+"                             const float exponent)                         \n"
+"{                                                                          \n"
+"  int gidx = get_global_id(0);                                             \n"
+"  int gidy = get_global_id(1);                                             \n"
+"  int x = gidx + mask_radius;                                              \n"
+"  int y = gidy + mask_radius;                                              \n"
+"  int dst_width = get_global_size(0);                                      \n"
+"  int src_width = dst_width + mask_radius * 2;                             \n"
+"  float4 cumulative_rgb[NUM_INTENSITIES];                                  \n"
+"  int hist_inten[NUM_INTENSITIES], inten_max;                              \n"
+"  int i, j, intensity;                                                     \n"
+"  int radius_sq = mask_radius * mask_radius;                               \n"
+"  float4 temp_pixel;                                                       \n"
+"  for (i = 0; i < intensities; i++)                                        \n"
+"  {                                                                        \n"
+"    hist_inten[i] = 0;                                                     \n"
+"    cumulative_rgb[i] = 0.0;                                               \n"
+"  }                                                                        \n"
+"  for (i = -mask_radius; i <= mask_radius; i++)                            \n"
+"  {                                                                        \n"
+"    for (j = -mask_radius; j <= mask_radius; j++)                          \n"
+"      {                                                                    \n"
+"        if (i*i + j*j <= radius_sq)                                        \n"
+"          {                                                                \n"
+"            temp_pixel = in[x + i + (y + j) * src_width];                  \n"
+"            /*Calculate intensity on the fly, GPU does it fast*/           \n"
+"            intensity = (int)((0.299 * temp_pixel.x                        \n"
+"                      +0.587 * temp_pixel.y                                \n"
+"                      +0.114 * temp_pixel.z) * (float)(intensities-1));    \n"
+"            hist_inten[intensity] += 1;                                    \n"
+"            cumulative_rgb[intensity] += temp_pixel;                       \n"
+"          }                                                                \n"
+"      }                                                                    \n"
+"  }                                                                        \n"
+"  inten_max = 1;                                                           \n"
+"                                                                           \n"
+"  /* calculated maximums */                                                \n"
+"  for (i = 0; i < intensities; i++) {                                      \n"
+"    if(hist_inten[i] > inten_max)                                          \n"
+"      inten_max = hist_inten[i];                                           \n"
+"  }                                                                        \n"
+"  float div = 0.0;                                                         \n"
+"  float ratio, weight, mult_inten;                                         \n"
+"                                                                           \n"
+"  float4 color = 0.0;                                                      \n"
+"  for (i = 0; i < intensities; i++)                                        \n"
+"  {                                                                        \n"
+"    if (hist_inten[i] > 0)                                                 \n"
+"    {                                                                      \n"
+"      ratio = (float)(hist_inten[i]) / (float)(inten_max);                 \n"
+"      weight = pow(ratio, exponent);                                       \n"
+"      mult_inten = weight / (float)(hist_inten[i]);                        \n"
+"                                                                           \n"
+"      div += weight;                                                       \n"
+"      color += mult_inten * cumulative_rgb[i];                             \n"
+"    }                                                                      \n"
+"  }                                                                        \n"
+"  out[gidx + gidy * dst_width] = color/div;                                \n"
+"}                                                                          \n";
+
+
+static GeglClRunData *cl_data = NULL;
+
+static cl_int
+cl_oilify (cl_mem              in_tex,
+           cl_mem              out_tex,
+           size_t              global_worksize,
+           const GeglRectangle *roi,
+           gint                mask_radius,
+           gint                number_of_intensities,
+           gint                exponent,
+           gboolean            use_inten)
+{
+  if (!cl_data)
+    {
+      const char *kernel_name[] = {"kernel_oilify", "kernel_oilify_inten", NULL};
+      cl_data = gegl_cl_compile_and_build(kernel_source, kernel_name);
+    }
+  if (!cl_data)  return 0;
+
+  const size_t gbl_size[2] = {roi->width,roi->height};
+  cl_int radius      = mask_radius;
+  cl_int intensities = number_of_intensities;
+  cl_float exp       = (gfloat)exponent;
+  cl_int cl_err      = 0;
+  gint arg = 0;
+
+  /* simple hack: select suitable kernel using boolean, 0 - no intensity mode, 1 - intensity mode */
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[use_inten], arg++, sizeof(cl_mem), (void*)&in_tex);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[use_inten], arg++, sizeof(cl_mem), (void*)&out_tex);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[use_inten], arg++, sizeof(cl_int), (void*)&radius);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[use_inten], arg++, sizeof(cl_int), (void*)&intensities);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[use_inten], arg++, sizeof(cl_float), (void*)&exp);
+  if (cl_err != CL_SUCCESS) return cl_err;
+
+  cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(),
+                                       cl_data->kernel[use_inten], 2,
+                                       NULL, gbl_size, NULL,
+                                       0, NULL, NULL);
+  if (cl_err != CL_SUCCESS) return cl_err;
+
+  return CL_SUCCESS;
+}
+
+static gboolean
+cl_process (GeglOperation       *operation,
+            GeglBuffer          *input,
+            GeglBuffer          *output,
+            const GeglRectangle *result)
+{
+  const Babl *in_format  = gegl_operation_get_format (operation, "input");
+  const Babl *out_format = gegl_operation_get_format (operation, "output");
+  gint err;
+  gint j;
+  cl_int cl_err;
+
+  GeglOperationAreaFilter *op_area = GEGL_OPERATION_AREA_FILTER (operation);
+  GeglChantO *o = GEGL_CHANT_PROPERTIES (operation);
+
+  GeglBufferClIterator *i = gegl_buffer_cl_iterator_new (output,result, out_format, GEGL_CL_BUFFER_WRITE, GEGL_ABYSS_NONE);
+                gint read = gegl_buffer_cl_iterator_add_2 (i, input, result, in_format, GEGL_CL_BUFFER_READ,
+                                                           o->mask_radius, o->mask_radius, o->mask_radius, o->mask_radius, GEGL_ABYSS_NONE);
+  while (gegl_buffer_cl_iterator_next (i, &err))
+  {
+    if (err) return FALSE;
+    for (j=0; j < i->n; j++)
+    {
+      cl_err = cl_oilify(i->tex[read][j],
+                         i->tex[0][j],
+                         i->size[0][j],&i->roi[0][j],
+                         o->mask_radius,
+                         o->intensities,
+                         o->exponent,
+                         o->use_inten);
+      if (cl_err != CL_SUCCESS)
+      {
+        g_warning("[OpenCL] Error in gegl:oilify: %s", gegl_cl_errstring(cl_err));
+        return FALSE;
+      }
+    }
+  }
+  return TRUE;
+}
+
 static gboolean
 process (GeglOperation       *operation,
          GeglBuffer          *input,
@@ -234,7 +497,6 @@ process (GeglOperation       *operation,
   GeglChantO *o                    = GEGL_CHANT_PROPERTIES (operation);
   GeglOperationAreaFilter *op_area = GEGL_OPERATION_AREA_FILTER (operation);
 
-  GeglRectangle* whole_rect;
   gint x = o->mask_radius; /* initial x                   */
   gint y = o->mask_radius; /*           and y coordinates */
   gfloat *src_buf;
@@ -245,7 +507,9 @@ process (GeglOperation       *operation,
   GeglRectangle src_rect;
   gint total_pixels;
 
-  whole_rect = gegl_operation_source_get_bounding_box (operation, "input");
+  if (gegl_cl_is_accelerated ())
+    if(cl_process(operation, input, output, result))
+      return TRUE;
 
   src_rect.x      = result->x - op_area->left;
   src_rect.width  = result->width + op_area->left + op_area->right;
@@ -255,21 +519,27 @@ process (GeglOperation       *operation,
   total_pixels = src_rect.width * src_rect.height;
 
   src_buf = g_slice_alloc (4 * total_pixels * sizeof (gfloat));
-  dst_buf = g_slice_alloc (4 * total_pixels * sizeof (gfloat));
-  inten_buf = g_slice_alloc (total_pixels * sizeof (gfloat));
+  dst_buf = g_slice_alloc (4 * n_pixels * sizeof (gfloat));
+  if(o->use_inten)
+    inten_buf = g_slice_alloc (total_pixels * sizeof (gfloat));
 
   gegl_buffer_get (input, &src_rect, 1.0, babl_format ("RGBA float"),
-                   src_buf, GEGL_AUTO_ROWSTRIDE, GEGL_ABYSS_NONE);
+                   src_buf, GEGL_AUTO_ROWSTRIDE, GEGL_ABYSS_CLAMP);
 
-  gegl_buffer_get (input, &src_rect, 1.0, babl_format ("Y float"),
-                   inten_buf, GEGL_AUTO_ROWSTRIDE, GEGL_ABYSS_NONE);
+  if(o->use_inten)
+    gegl_buffer_get (input, &src_rect, 1.0, babl_format ("Y float"),
+                   inten_buf, GEGL_AUTO_ROWSTRIDE, GEGL_ABYSS_CLAMP);
 
   out_pixel = dst_buf;
+
   while (n_pixels--)
     {
-      oilify_pixel (x, y, whole_rect, o->use_inten, o->mask_radius, o->exponent,
+      if(o->use_inten)
+        oilify_pixel_inten (x, y, o->mask_radius, o->exponent, o->intensities,
                    src_rect.width, src_buf, inten_buf, out_pixel);
-
+      else
+        oilify_pixel (x, y, o->mask_radius, o->exponent, o->intensities,
+                   src_rect.width, src_buf, out_pixel);
       out_pixel += 4;
 
       /* update x and y coordinates */
@@ -281,12 +551,14 @@ process (GeglOperation       *operation,
         }
     }
 
+
   gegl_buffer_set (output, result, 0,
                    babl_format ("RGBA float"),
                    dst_buf, GEGL_AUTO_ROWSTRIDE);
   g_slice_free1 (4 * total_pixels * sizeof (gfloat), src_buf);
-  g_slice_free1 (4 * total_pixels * sizeof (gfloat), dst_buf);
-  g_slice_free1 (total_pixels * sizeof (gfloat), inten_buf);
+  g_slice_free1 (4 * n_pixels * sizeof (gfloat), dst_buf);
+  if(o->use_inten)
+    g_slice_free1 (total_pixels * sizeof (gfloat), inten_buf);
 
   return  TRUE;
 }



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]