[gegl] box-blur using auxiliary cl-iterator to keep intermediate results



commit 671b66ed43bc62a03bbcdab1032f6e6043a076b7
Author: Victor Oliveira <victormatheus gmail com>
Date:   Mon Mar 5 16:41:37 2012 -0300

    box-blur using auxiliary cl-iterator to keep intermediate results

 gegl/opencl/gegl-cl-init.c   |    8 ++++
 gegl/opencl/gegl-cl-init.h   |    5 ++-
 operations/common/box-blur.c |   92 +++++++++++++++++++++++-------------------
 3 files changed, 62 insertions(+), 43 deletions(-)
---
diff --git a/gegl/opencl/gegl-cl-init.c b/gegl/opencl/gegl-cl-init.c
index 7a4bc90..2f846d6 100644
--- a/gegl/opencl/gegl-cl-init.c
+++ b/gegl/opencl/gegl-cl-init.c
@@ -111,6 +111,12 @@ gegl_cl_get_command_queue (void)
   return cl_state.cq;
 }
 
+cl_ulong
+gegl_cl_get_local_mem_size (void)
+{
+  return cl_state.local_mem_size;
+}
+
 #ifdef G_OS_WIN32
 
 #include <windows.h>
@@ -232,6 +238,7 @@ gegl_cl_init (GError **error)
 
       gegl_clGetDeviceInfo (cl_state.device, CL_DEVICE_IMAGE_SUPPORT,      sizeof(cl_bool),  &cl_state.image_support,    NULL);
       gegl_clGetDeviceInfo (cl_state.device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &cl_state.max_mem_alloc,    NULL);
+      gegl_clGetDeviceInfo (cl_state.device, CL_DEVICE_LOCAL_MEM_SIZE,     sizeof(cl_ulong), &cl_state.local_mem_size,   NULL);
 
       cl_state.max_image_width  = 4096;
       cl_state.max_image_height = 4096;
@@ -241,6 +248,7 @@ gegl_cl_init (GError **error)
       g_printf("[OpenCL] Extensions:%s\n",          cl_state.platform_ext);
       g_printf("[OpenCL] Default Device Name:%s\n", cl_state.device_name);
       g_printf("[OpenCL] Max Alloc: %lu bytes\n",   cl_state.max_mem_alloc);
+      g_printf("[OpenCL] Local Mem: %lu bytes\n",   cl_state.local_mem_size);
 
       while (cl_state.max_image_width * cl_state.max_image_height * 16 > cl_state.max_mem_alloc)
         {
diff --git a/gegl/opencl/gegl-cl-init.h b/gegl/opencl/gegl-cl-init.h
index 583e76a..f77984b 100644
--- a/gegl/opencl/gegl-cl-init.h
+++ b/gegl/opencl/gegl-cl-init.h
@@ -22,6 +22,7 @@ typedef struct
     size_t max_image_height;
     size_t max_image_width;
     cl_ulong max_mem_alloc;
+    cl_ulong local_mem_size;
 
     char platform_name   [1024];
     char platform_version[1024];
@@ -44,6 +45,8 @@ cl_context gegl_cl_get_context (void);
 
 cl_command_queue gegl_cl_get_command_queue (void);
 
+cl_ulong gegl_cl_get_local_mem_size (void);
+
 typedef struct
 {
   cl_program program;
@@ -55,7 +58,7 @@ gegl_cl_run_data *gegl_cl_compile_and_build (const char *program_source,
 
 #ifdef __GEGL_CL_INIT_MAIN__
 
-gegl_cl_state cl_state = {FALSE, NULL, NULL, NULL, NULL, FALSE, 0, 0, 0, "", "", "", ""};
+gegl_cl_state cl_state = {FALSE, NULL, NULL, NULL, NULL, FALSE, 0, 0, 0, 0, "", "", "", ""};
 GHashTable *cl_program_hash = NULL;
 
 t_clGetPlatformIDs  gegl_clGetPlatformIDs  = NULL;
diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c
index 7f26cdf..8a0e4d2 100644
--- a/operations/common/box-blur.c
+++ b/operations/common/box-blur.c
@@ -226,89 +226,97 @@ static void prepare (GeglOperation *operation)
 #include "buffer/gegl-buffer-cl-iterator.h"
 
 static const char* kernel_source =
-"__kernel void kernel_blur(__global const float4     *in,                                           \n"
-"                          __global       float4     *out,                                          \n"
-"                          __local        float4     *shared_roi,                                   \n"
-"                          int width, int radius)                                                   \n"
+"__kernel void kernel_blur_hor (__global const float4     *in,                                      \n"
+"                               __global       float4     *aux,                                     \n"
+"                               int width, int radius)                                              \n"
 "{                                                                                                  \n"
+"  const int in_index = get_global_id(0) * (width + 2 * radius)                                     \n"
+"                       + (radius + get_global_id (1));                                             \n"
 "                                                                                                   \n"
-"  const int out_index    = get_global_id(0) * width + get_global_id(1);                            \n"
-"  const int in_top_index = (get_group_id (0) * get_local_size (0)) * (width + 2 * radius)          \n"
-"                            + (get_group_id (1) * get_local_size (1));                             \n"
-"                                                                                                   \n"
-"  const int local_width = (2 * radius + get_local_size (1));                                       \n"
-"  const int local_index = (radius + get_local_id (0)) * local_width + (radius + get_local_id (1)); \n"
-"  int i, x, y;                                                                                     \n"
-"                                                                                                   \n"
+"  const int aux_index = get_global_id(0) * width + get_global_id (1);                              \n"
+"  int i;                                                                                           \n"
 "  float4 mean;                                                                                     \n"
 "                                                                                                   \n"
-"  for (y = get_local_id (0); y < get_local_size (0) + 2 * radius; y += get_local_size (0))         \n"
-"    for (x = get_local_id (1); x < get_local_size (1) + 2 * radius; x += get_local_size (1))       \n"
-"      shared_roi[y*local_width+x] = in[in_top_index + y * (width + 2 * radius) + x];               \n"
-"                                                                                                   \n"
-"  barrier(CLK_LOCAL_MEM_FENCE);                                                                    \n"
-"                                                                                                   \n"
 "  mean = (float4)(0.0f);                                                                           \n"
 "                                                                                                   \n"
 "  for (i=-radius; i <= radius; i++)                                                                \n"
 "   {                                                                                               \n"
-"     mean += shared_roi[local_index + i];                                                          \n"
+"     mean += in[in_index + i];                                                                     \n"
 "   }                                                                                               \n"
 "                                                                                                   \n"
-"  shared_roi[local_index] = mean / (2 * radius + 1);                                               \n"
+"  aux[aux_index] = mean / (2 * radius + 1);                                                        \n"
+"}                                                                                                  \n"
+
+"__kernel void kernel_blur_ver (__global const float4     *aux,                                     \n"
+"                               __global       float4     *out,                                     \n"
+"                               int width, int radius)                                              \n"
+"{                                                                                                  \n"
+"  const int aux_index = (radius + get_global_id(0)) * width + get_global_id (1);                   \n"
 "                                                                                                   \n"
-"  barrier(CLK_LOCAL_MEM_FENCE);                                                                    \n"
+"  const int out_index = get_global_id(0) * width + get_global_id (1);                              \n"
+"  int i;                                                                                           \n"
+"  float4 mean;                                                                                     \n"
 "                                                                                                   \n"
 "  mean = (float4)(0.0f);                                                                           \n"
 "                                                                                                   \n"
 "  for (i=-radius; i <= radius; i++)                                                                \n"
 "   {                                                                                               \n"
-"     mean += shared_roi[local_index + i * local_width];                                            \n"
+"     mean += aux[aux_index + i * width];                                                           \n"
 "   }                                                                                               \n"
 "                                                                                                   \n"
-"  shared_roi[local_index] = mean / (2 * radius + 1);                                               \n"
-"                                                                                                   \n"
-"  barrier(CLK_LOCAL_MEM_FENCE);                                                                    \n"
-"                                                                                                   \n"
-"  out[out_index] = shared_roi[local_index];                                                        \n"
+"  out[out_index] = mean / (2 * radius + 1);                                                        \n"
 "}                                                                                                  \n";
 
 static gegl_cl_run_data *cl_data = NULL;
 
 static cl_int
 cl_box_blur (cl_mem                in_tex,
+             cl_mem                aux_tex,
              cl_mem                out_tex,
              size_t                global_worksize,
              const GeglRectangle  *roi,
              gint                  radius)
 {
   cl_int cl_err = 0;
-  size_t local_ws[2], global_ws[2], local_mem_size;
+  size_t global_ws_hor[2], global_ws_ver[2];
 
   if (!cl_data)
     {
-      const char *kernel_name[] = {"kernel_blur", NULL};
+      const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", NULL};
       cl_data = gegl_cl_compile_and_build (kernel_source, kernel_name);
     }
 
   if (!cl_data) return 1;
 
-  local_ws[0] = 16;
-  local_ws[1] = 16;
-  global_ws[0] = roi->height;
-  global_ws[1] = roi->width;
-  local_mem_size = sizeof(cl_float4) * (local_ws[0] + 2 * radius) * (local_ws[1] + 2 * radius);
+  global_ws_hor[0] = roi->height + 2 * radius;
+  global_ws_hor[1] = roi->width;
+
+  global_ws_ver[0] = roi->height;
+  global_ws_ver[1] = roi->width;
 
   cl_err |= gegl_clSetKernelArg(cl_data->kernel[0], 0, sizeof(cl_mem),   (void*)&in_tex);
-  cl_err |= gegl_clSetKernelArg(cl_data->kernel[0], 1, sizeof(cl_mem),   (void*)&out_tex);
-  cl_err |= gegl_clSetKernelArg(cl_data->kernel[0], 2, local_mem_size,   NULL);
-  cl_err |= gegl_clSetKernelArg(cl_data->kernel[0], 3, sizeof(cl_int),   (void*)&roi->width);
-  cl_err |= gegl_clSetKernelArg(cl_data->kernel[0], 4, sizeof(cl_int),   (void*)&radius);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[0], 1, sizeof(cl_mem),   (void*)&aux_tex);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[0], 2, sizeof(cl_int),   (void*)&roi->width);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[0], 3, sizeof(cl_int),   (void*)&radius);
   if (cl_err != CL_SUCCESS) return cl_err;
 
   cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
                                         cl_data->kernel[0], 2,
-                                        NULL, global_ws, local_ws,
+                                        NULL, global_ws_hor, NULL,
+                                        0, NULL, NULL);
+  if (cl_err != CL_SUCCESS) return cl_err;
+
+  gegl_clEnqueueBarrier (gegl_cl_get_command_queue ());
+
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[1], 0, sizeof(cl_mem),   (void*)&aux_tex);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[1], 1, sizeof(cl_mem),   (void*)&out_tex);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[1], 2, sizeof(cl_int),   (void*)&roi->width);
+  cl_err |= gegl_clSetKernelArg(cl_data->kernel[1], 3, sizeof(cl_int),   (void*)&radius);
+  if (cl_err != CL_SUCCESS) return cl_err;
+
+  cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+                                        cl_data->kernel[1], 2,
+                                        NULL, global_ws_ver, NULL,
                                         0, NULL, NULL);
   if (cl_err != CL_SUCCESS) return cl_err;
 
@@ -332,16 +340,16 @@ cl_process (GeglOperation       *operation,
 
   GeglBufferClIterator *i = gegl_buffer_cl_iterator_new (output,   result, out_format, GEGL_CL_BUFFER_WRITE);
                 gint read = gegl_buffer_cl_iterator_add_2 (i, input, result, in_format,  GEGL_CL_BUFFER_READ, op_area->left, op_area->right, op_area->top, op_area->bottom);
+                gint aux  = gegl_buffer_cl_iterator_add_2 (i, NULL, result, in_format,  GEGL_CL_BUFFER_AUX, 0, 0, op_area->top, op_area->bottom);
   while (gegl_buffer_cl_iterator_next (i, &err))
     {
       if (err) return FALSE;
       for (j=0; j < i->n; j++)
         {
-          cl_err = cl_box_blur(i->tex[read][j], i->tex[0][j], i->size[0][j], &i->roi[0][j], o->radius);
+          cl_err = cl_box_blur(i->tex[read][j], i->tex[aux][j], i->tex[0][j], i->size[0][j], &i->roi[0][j], ceil (o->radius));
           if (cl_err != CL_SUCCESS)
             {
-              g_warning("[OpenCL] Error in %s [GeglOperationPointFilter] Kernel\n",
-                        GEGL_OPERATION_CLASS (operation)->name);
+              g_warning("[OpenCL] Error in box-blur: %s\n", gegl_cl_errstring(cl_err));
               return FALSE;
             }
         }



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]