[gegl] Operations: Add OpenCL support to stretch-contrast



commit cab389d42b1067d2c518ce1baa9f3ec8c0d637c3
Author: Carlos Zubieta <czubieta dev gmail com>
Date:   Mon Sep 23 00:39:50 2013 -0500

    Operations: Add OpenCL support to stretch-contrast

 opencl/stretch-contrast.cl           |  136 +++++++++++++++++
 opencl/stretch-contrast.cl.h         |  138 +++++++++++++++++
 operations/common/stretch-contrast.c |  274 ++++++++++++++++++++++++++++++++++
 3 files changed, 548 insertions(+), 0 deletions(-)
---
diff --git a/opencl/stretch-contrast.cl b/opencl/stretch-contrast.cl
new file mode 100644
index 0000000..83f7ff9
--- /dev/null
+++ b/opencl/stretch-contrast.cl
@@ -0,0 +1,136 @@
+/* This file is an image processing operation for GEGL
+ *
+ * GEGL is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * GEGL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GEGL; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright 2013 Carlos Zubieta <czubieta dev gmail com>
+ */
+
+
+__kernel void init_stretch (__global float *out_min,
+                            __global float *out_max)
+{
+  int gid = get_global_id (0);
+
+  out_min[gid] =  FLT_MAX;
+  out_max[gid] = -FLT_MAX;
+}
+
+__kernel void two_stages_local_min_max_reduce (__global const float4 *in,
+                                               __global       float  *out_min,
+                                               __global       float  *out_max,
+                                               __local        float  *aux_min,
+                                               __local        float  *aux_max,
+                                                              int    n_pixels)
+{
+  int    gid   = get_global_id(0);
+  int    gsize = get_global_size(0);
+  int    lid   = get_local_id(0);
+  int    lsize = get_local_size(0);
+  float4 min_v = (float4)( FLT_MAX);
+  float4 max_v = (float4)(-FLT_MAX);
+  float4 in_v;
+  float  aux0, aux1;
+  int    it;
+
+  /* Loop sequentially over chunks of input vector */
+  for (it = gid; it < n_pixels; it += gsize)
+    {
+      in_v  =  in[it];
+      min_v =  fmin (min_v, in_v);
+      max_v =  fmax (max_v, in_v);
+    }
+
+  /* Perform parallel reduction */
+  aux_min[lid] = min (min (min_v.x, min_v.y), min_v.z);
+  aux_max[lid] = max (max (max_v.x, max_v.y), max_v.z);
+
+  barrier (CLK_LOCAL_MEM_FENCE);
+
+  for(it = lsize / 2; it > 0; it >>= 1)
+    {
+      if (lid < it)
+        {
+          aux0         = aux_min[lid + it];
+          aux1         = aux_min[lid];
+          aux_min[lid] = fmin (aux0, aux1);
+
+          aux0         = aux_max[lid + it];
+          aux1         = aux_max[lid];
+          aux_max[lid] = fmax (aux0, aux1);
+        }
+      barrier (CLK_LOCAL_MEM_FENCE);
+  }
+  if (lid == 0)
+    {
+      out_min[get_group_id(0)] = aux_min[0];
+      out_max[get_group_id(0)] = aux_max[0];
+    }
+
+  /* the work-group size is the size of the buffer.
+   * Make sure it's fully initialized */
+  if (gid == 0)
+    {
+      /* No special case handling, gsize is a multiple of lsize */
+      int nb_wg = gsize / lsize;
+      for (it = nb_wg; it < lsize; it++)
+        {
+          out_min[it] =  FLT_MAX;
+          out_max[it] = -FLT_MAX;
+        }
+    }
+}
+
+__kernel void global_min_max_reduce (__global float *in_min,
+                                     __global float *in_max,
+                                     __global float *out_min_max)
+{
+  int   gid   = get_global_id(0);
+  int   lid   = get_local_id(0);
+  int   lsize = get_local_size(0);
+  float aux0, aux1;
+  int   it;
+
+  /* Perform parallel reduction */
+  for (it = lsize / 2; it > 0; it >>= 1)
+    {
+      if (lid < it)
+        {
+          aux0        = in_min[lid + it];
+          aux1        = in_min[lid];
+          in_min[gid] = fmin (aux0, aux1);
+
+          aux0        = in_max[lid + it];
+          aux1        = in_max[lid];
+          in_max[gid] = fmax (aux0, aux1);
+        }
+      barrier (CLK_GLOBAL_MEM_FENCE);
+  }
+  if (lid == 0)
+    {
+      out_min_max[0] = in_min[gid];
+      out_min_max[1] = in_max[gid];
+    }
+}
+
+__kernel void cl_stretch_contrast (__global const float4 *in,
+                                   __global       float4 *out,
+                                                  float   min,
+                                                  float   diff)
+{
+  int    gid  = get_global_id(0);
+  float4 in_v = in[gid];
+
+  in_v.xyz = (in_v.xyz - min) / diff;
+  out[gid] = in_v;
+}
diff --git a/opencl/stretch-contrast.cl.h b/opencl/stretch-contrast.cl.h
new file mode 100644
index 0000000..c4bd39e
--- /dev/null
+++ b/opencl/stretch-contrast.cl.h
@@ -0,0 +1,138 @@
+static const char* stretch_contrast_cl_source =
+"/* This file is an image processing operation for GEGL                        \n"
+" *                                                                            \n"
+" * GEGL is free software; you can redistribute it and/or                      \n"
+" * modify it under the terms of the GNU Lesser General Public                 \n"
+" * License as published by the Free Software Foundation; either               \n"
+" * version 3 of the License, or (at your option) any later version.           \n"
+" *                                                                            \n"
+" * GEGL is distributed in the hope that it will be useful,                    \n"
+" * but WITHOUT ANY WARRANTY; without even the implied warranty of             \n"
+" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU          \n"
+" * Lesser General Public License for more details.                            \n"
+" *                                                                            \n"
+" * You should have received a copy of the GNU Lesser General Public           \n"
+" * License along with GEGL; if not, see <http://www.gnu.org/licenses/>.       \n"
+" *                                                                            \n"
+" * Copyright 2013 Carlos Zubieta <czubieta dev gmail com>                     \n"
+" */                                                                           \n"
+"                                                                              \n"
+"                                                                              \n"
+"__kernel void init_stretch (__global float *out_min,                          \n"
+"                            __global float *out_max)                          \n"
+"{                                                                             \n"
+"  int gid = get_global_id (0);                                                \n"
+"                                                                              \n"
+"  out_min[gid] =  FLT_MAX;                                                    \n"
+"  out_max[gid] = -FLT_MAX;                                                    \n"
+"}                                                                             \n"
+"                                                                              \n"
+"__kernel void two_stages_local_min_max_reduce (__global const float4 *in,     \n"
+"                                               __global       float  *out_min,\n"
+"                                               __global       float  *out_max,\n"
+"                                               __local        float  *aux_min,\n"
+"                                               __local        float  *aux_max,\n"
+"                                                              int    n_pixels)\n"
+"{                                                                             \n"
+"  int    gid   = get_global_id(0);                                            \n"
+"  int    gsize = get_global_size(0);                                          \n"
+"  int    lid   = get_local_id(0);                                             \n"
+"  int    lsize = get_local_size(0);                                           \n"
+"  float4 min_v = (float4)( FLT_MAX);                                          \n"
+"  float4 max_v = (float4)(-FLT_MAX);                                          \n"
+"  float4 in_v;                                                                \n"
+"  float  aux0, aux1;                                                          \n"
+"  int    it;                                                                  \n"
+"                                                                              \n"
+"  /* Loop sequentially over chunks of input vector */                         \n"
+"  for (it = gid; it < n_pixels; it += gsize)                                  \n"
+"    {                                                                         \n"
+"      in_v  =  in[it];                                                        \n"
+"      min_v =  fmin (min_v, in_v);                                            \n"
+"      max_v =  fmax (max_v, in_v);                                            \n"
+"    }                                                                         \n"
+"                                                                              \n"
+"  /* Perform parallel reduction */                                            \n"
+"  aux_min[lid] = min (min (min_v.x, min_v.y), min_v.z);                       \n"
+"  aux_max[lid] = max (max (max_v.x, max_v.y), max_v.z);                       \n"
+"                                                                              \n"
+"  barrier (CLK_LOCAL_MEM_FENCE);                                              \n"
+"                                                                              \n"
+"  for(it = lsize / 2; it > 0; it >>= 1)                                       \n"
+"    {                                                                         \n"
+"      if (lid < it)                                                           \n"
+"        {                                                                     \n"
+"          aux0         = aux_min[lid + it];                                   \n"
+"          aux1         = aux_min[lid];                                        \n"
+"          aux_min[lid] = fmin (aux0, aux1);                                   \n"
+"                                                                              \n"
+"          aux0         = aux_max[lid + it];                                   \n"
+"          aux1         = aux_max[lid];                                        \n"
+"          aux_max[lid] = fmax (aux0, aux1);                                   \n"
+"        }                                                                     \n"
+"      barrier (CLK_LOCAL_MEM_FENCE);                                          \n"
+"  }                                                                           \n"
+"  if (lid == 0)                                                               \n"
+"    {                                                                         \n"
+"      out_min[get_group_id(0)] = aux_min[0];                                  \n"
+"      out_max[get_group_id(0)] = aux_max[0];                                  \n"
+"    }                                                                         \n"
+"                                                                              \n"
+"  /* the work-group size is the size of the buffer.                           \n"
+"   * Make sure it's fully initialized */                                      \n"
+"  if (gid == 0)                                                               \n"
+"    {                                                                         \n"
+"      /* No special case handling, gsize is a multiple of lsize */            \n"
+"      int nb_wg = gsize / lsize;                                              \n"
+"      for (it = nb_wg; it < lsize; it++)                                      \n"
+"        {                                                                     \n"
+"          out_min[it] =  FLT_MAX;                                             \n"
+"          out_max[it] = -FLT_MAX;                                             \n"
+"        }                                                                     \n"
+"    }                                                                         \n"
+"}                                                                             \n"
+"                                                                              \n"
+"__kernel void global_min_max_reduce (__global float *in_min,                  \n"
+"                                     __global float *in_max,                  \n"
+"                                     __global float *out_min_max)             \n"
+"{                                                                             \n"
+"  int   gid   = get_global_id(0);                                             \n"
+"  int   lid   = get_local_id(0);                                              \n"
+"  int   lsize = get_local_size(0);                                            \n"
+"  float aux0, aux1;                                                           \n"
+"  int   it;                                                                   \n"
+"                                                                              \n"
+"  /* Perform parallel reduction */                                            \n"
+"  for (it = lsize / 2; it > 0; it >>= 1)                                      \n"
+"    {                                                                         \n"
+"      if (lid < it)                                                           \n"
+"        {                                                                     \n"
+"          aux0        = in_min[lid + it];                                     \n"
+"          aux1        = in_min[lid];                                          \n"
+"          in_min[gid] = fmin (aux0, aux1);                                    \n"
+"                                                                              \n"
+"          aux0        = in_max[lid + it];                                     \n"
+"          aux1        = in_max[lid];                                          \n"
+"          in_max[gid] = fmax (aux0, aux1);                                    \n"
+"        }                                                                     \n"
+"      barrier (CLK_GLOBAL_MEM_FENCE);                                         \n"
+"  }                                                                           \n"
+"  if (lid == 0)                                                               \n"
+"    {                                                                         \n"
+"      out_min_max[0] = in_min[gid];                                           \n"
+"      out_min_max[1] = in_max[gid];                                           \n"
+"    }                                                                         \n"
+"}                                                                             \n"
+"                                                                              \n"
+"__kernel void cl_stretch_contrast (__global const float4 *in,                 \n"
+"                                   __global       float4 *out,                \n"
+"                                                  float   min,                \n"
+"                                                  float   diff)               \n"
+"{                                                                             \n"
+"  int    gid  = get_global_id(0);                                             \n"
+"  float4 in_v = in[gid];                                                      \n"
+"                                                                              \n"
+"  in_v.xyz = (in_v.xyz - min) / diff;                                         \n"
+"  out[gid] = in_v;                                                            \n"
+"}                                                                             \n"
+;
diff --git a/operations/common/stretch-contrast.c b/operations/common/stretch-contrast.c
index e442424..3798909 100644
--- a/operations/common/stretch-contrast.c
+++ b/operations/common/stretch-contrast.c
@@ -86,6 +86,275 @@ get_cached_region (GeglOperation       *operation,
   return result;
 }
 
+#include "opencl/gegl-cl.h"
+#include "buffer/gegl-buffer-cl-iterator.h"
+#include "opencl/stretch-contrast.cl.h"
+
+GEGL_CL_STATIC
+
+static gboolean
+cl_build_kernels( void )
+{
+  GEGL_CL_BUILD( stretch_contrast,
+                 "two_stages_local_min_max_reduce",
+                 "global_min_max_reduce",
+                 "cl_stretch_contrast",
+                 "init_stretch")
+  return FALSE;
+}
+
+static gboolean
+cl_buffer_get_min_max (cl_mem               in_tex,
+                       size_t               global_worksize,
+                       const GeglRectangle *roi,
+                       gfloat              *min,
+                       gfloat              *max)
+{
+  cl_int   cl_err      = 0;
+  size_t   local_ws, max_local_ws;
+  size_t   work_groups;
+  size_t   global_ws;
+  cl_mem   cl_aux_min  = NULL;
+  cl_mem   cl_aux_max  = NULL;
+  cl_mem   cl_min_max  = NULL;
+  cl_int   n_pixels    = (cl_int)global_worksize;
+  cl_float min_max_buf[2];
+
+  if (global_worksize < 1)
+    {
+      *min = G_MAXFLOAT;
+      *max = G_MINFLOAT;
+      return FALSE;
+    }
+
+  cl_err = gegl_clGetDeviceInfo (gegl_cl_get_device (),
+                                 CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                                 sizeof (size_t), &max_local_ws, NULL);
+  CL_CHECK;
+
+  /* Needs to be a power of two */
+  local_ws = 256;
+  while (local_ws > max_local_ws)
+    local_ws /= 2;
+
+  work_groups = MIN ((global_worksize + local_ws - 1) / local_ws, local_ws);
+  global_ws = work_groups * local_ws;
+
+
+  cl_aux_min = gegl_clCreateBuffer(gegl_cl_get_context(),
+                                   CL_MEM_READ_WRITE,
+                                   local_ws * sizeof(cl_float),
+                                   NULL, &cl_err);
+  CL_CHECK;
+  cl_aux_max = gegl_clCreateBuffer(gegl_cl_get_context(),
+                                   CL_MEM_READ_WRITE,
+                                   local_ws * sizeof(cl_float),
+                                   NULL, &cl_err);
+  CL_CHECK;
+  cl_min_max = gegl_clCreateBuffer(gegl_cl_get_context(),
+                                   CL_MEM_WRITE_ONLY,
+                                   2 * sizeof(cl_float),
+                                   NULL, &cl_err);
+  CL_CHECK;
+
+  /* The full initialization is done in the two_stages_local_min_max_reduce
+     kernel */
+#if 0
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[3], 0, sizeof(cl_mem),
+                               (void*)&cl_aux_min);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[3], 1, sizeof(cl_mem),
+                               (void*)&cl_aux_max);
+  CL_CHECK;
+
+  cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+                                       cl_data->kernel[3], 1,
+                                       NULL, &local_ws, &local_ws,
+                                       0, NULL, NULL);
+  CL_CHECK;
+#endif
+
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[0], 0, sizeof(cl_mem),
+                               (void*)&in_tex);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[0], 1, sizeof(cl_mem),
+                               (void*)&cl_aux_min);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[0], 2, sizeof(cl_mem),
+                               (void*)&cl_aux_max);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[0], 3,
+                               sizeof(cl_float) * local_ws, NULL);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[0], 4,
+                               sizeof(cl_float) * local_ws, NULL);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[0], 5, sizeof(cl_int),
+                               (void*)&n_pixels);
+  CL_CHECK;
+
+  cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+                                       cl_data->kernel[0], 1,
+                                       NULL, &global_ws, &local_ws,
+                                       0, NULL, NULL);
+  CL_CHECK;
+
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[1], 0, sizeof(cl_mem),
+                               (void*)&cl_aux_min);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[1], 1, sizeof(cl_mem),
+                               (void*)&cl_aux_max);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[1], 2, sizeof(cl_mem),
+                               (void*)&cl_min_max);
+  CL_CHECK;
+
+  /* Only one work group */
+  cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
+                                        cl_data->kernel[1], 1,
+                                        NULL, &local_ws, &local_ws,
+                                        0, NULL, NULL);
+  CL_CHECK;
+
+  /* Read the memory buffer, probably better to keep it in GPU memory */
+  cl_err = gegl_clEnqueueReadBuffer (gegl_cl_get_command_queue(),
+                                     cl_min_max, CL_TRUE, 0,
+                                     2 * sizeof (cl_float), &min_max_buf, 0,
+                                     NULL, NULL);
+  CL_CHECK;
+
+  *min = min_max_buf[0];
+  *max = min_max_buf[1];
+
+  GEGL_CL_RELEASE(cl_aux_min)
+  GEGL_CL_RELEASE(cl_aux_max)
+  GEGL_CL_RELEASE(cl_min_max)
+
+  return FALSE;
+
+error:
+  if(cl_aux_min)
+    GEGL_CL_RELEASE(cl_aux_min)
+  if(cl_aux_max)
+    GEGL_CL_RELEASE(cl_aux_max)
+  if(cl_min_max)
+    GEGL_CL_RELEASE(cl_min_max)
+  return TRUE;
+}
+
+static gboolean
+cl_stretch_contrast (cl_mem               in_tex,
+                     cl_mem               out_tex,
+                     size_t               global_worksize,
+                     const GeglRectangle *roi,
+                     gfloat               min,
+                     gfloat               diff)
+{
+  cl_int   cl_err  = 0;
+  cl_float cl_min  = min;
+  cl_float cl_diff = diff;
+
+  {
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[2], 0, sizeof(cl_mem),
+                               (void*)&in_tex);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[2], 1, sizeof(cl_mem),
+                               (void*)&out_tex);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[2], 2, sizeof(cl_float),
+                               (void*)&cl_min);
+  CL_CHECK;
+  cl_err = gegl_clSetKernelArg(cl_data->kernel[2], 3, sizeof(cl_float),
+                               (void*)&cl_diff);
+  CL_CHECK;
+
+  cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+                                         cl_data->kernel[2], 1,
+                                         NULL, &global_worksize, NULL,
+                                         0, NULL, NULL);
+  CL_CHECK;
+
+  }
+  return FALSE;
+
+error:
+  return TRUE;
+}
+
+static gboolean
+cl_process (GeglOperation       *operation,
+            GeglBuffer          *input,
+            GeglBuffer          *output,
+            const GeglRectangle *result)
+{
+  const Babl *in_format  = gegl_operation_get_format (operation, "input");
+  const Babl *out_format = gegl_operation_get_format (operation, "output");
+
+  gfloat min = 1.0f;
+  gfloat max = 0.0f;
+  gfloat i_min, i_max, diff;
+  gint   err, read;
+  GeglBufferClIterator *i;
+
+  if(cl_build_kernels())
+    return FALSE;
+
+  i = gegl_buffer_cl_iterator_new (input,
+                                   result,
+                                   in_format,
+                                   GEGL_CL_BUFFER_READ);
+
+  while (gegl_buffer_cl_iterator_next (i, &err))
+    {
+      if (err) return FALSE;
+
+      err = cl_buffer_get_min_max(i->tex[0],
+                                  i->size[0],
+                                  &i->roi[0],
+                                  &i_min,
+                                  &i_max);
+      if (err) return FALSE;
+
+      if(i_min < min)
+        min = i_min;
+      if(i_max > max)
+        max = i_max;
+    }
+
+  diff = max-min;
+
+  i = gegl_buffer_cl_iterator_new (output,
+                                   result,
+                                   out_format,
+                                   GEGL_CL_BUFFER_WRITE);
+
+  read = gegl_buffer_cl_iterator_add_2 (i,
+                                        input,
+                                        result,
+                                        in_format,
+                                        GEGL_CL_BUFFER_READ,
+                                        0,
+                                        0,
+                                        0,
+                                        0,
+                                        GEGL_ABYSS_NONE);
+
+  while (gegl_buffer_cl_iterator_next (i, &err))
+    {
+      if (err) return FALSE;
+
+      err = cl_stretch_contrast(i->tex[read],
+                                i->tex[0],
+                                i->size[0],
+                                &i->roi[0],
+                                min,
+                                diff);
+      if (err) return FALSE;
+    }
+
+  return TRUE;
+}
+
 static gboolean
 process (GeglOperation       *operation,
          GeglBuffer          *input,
@@ -96,6 +365,10 @@ process (GeglOperation       *operation,
   gdouble  min, max, diff;
   GeglBufferIterator *gi;
 
+  if (gegl_cl_is_accelerated ())
+    if (cl_process (operation, input, output, result))
+      return TRUE;
+
   buffer_get_min_max (input, &min, &max);
   diff = max - min;
 
@@ -150,6 +423,7 @@ gegl_chant_class_init (GeglChantClass *klass)
   operation_class->prepare = prepare;
   operation_class->get_required_for_output = get_required_for_output;
   operation_class->get_cached_region = get_cached_region;
+  operation_class->opencl_support = TRUE;
 
   gegl_operation_class_set_keys (operation_class,
     "name"       , "gegl:stretch-contrast",


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]