Re: [Gegl-developer] [PATCH] Optimize operation box-blur opencl kernel



On Wed, Jan 22, 2014 at 04:21:42PM +0800, Yongjia Zhang wrote:
From: Yongjia Zhang <Zhang_Yong_jia 126 com>

This is a better way to accomplish the box-blur cl operation by using ocl's
local memory from the opencv source code. It use the local shared memory to
reduce global memory access, which significantly reduces the kernel's processing
time by 70 percent compared to the original one. Because of the barriers and
local worksize limitation, processing with a radius larger than 110 becomes
slower than original algorithm, so I keep the original kernels in order to deal
with box-blur with radius larger than 110.
All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU.

Signed-off-by: Yongjia Zhang <yongjia zhang intel com>
---
 opencl/box-blur.cl           |  66 +++++++++++++++++++++++++
 opencl/box-blur.cl.h         |  66 +++++++++++++++++++++++++
 operations/common/box-blur.c | 115 ++++++++++++++++++++++++++-----------------
 3 files changed, 201 insertions(+), 46 deletions(-)

diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl
index e99bea4..a1da9de 100644
--- a/opencl/box-blur.cl
+++ b/opencl/box-blur.cl
@@ -43,3 +43,69 @@ __kernel void kernel_blur_ver (__global const float4     *aux,
       out[out_index] = mean / (float)(2 * radius + 1);
     }
 }
+
+__kernel void kernel_box_blur_fast(const __global float4 *in,                       
+                                   __global float4 *out,                           
+                                   __local float4 *column_sum,                      
+                                   const int width,                         
+                                   const int height,                        
+                                   const int radius,                        
+                                   const int size)                          
+{                                                                           
+     const int local_id0 = get_local_id(0);                                 
+     const int twice_radius = 2 * radius;                                   
+     const int in_width = twice_radius + width;                             
+     const int in_height = twice_radius + height;                           
+     const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) );   
+     int column_index_start,column_index_end;                               
+     int y = get_global_id(1) * size;                                       
+     const int out_x = get_group_id(0)                                      
+               * ( get_local_size(0) - twice_radius ) + local_id0 - radius;   
+     const int in_x = out_x + radius;                                       
+     int tmp_size = size;                                                   
+     int tmp_index = 0;                                                     
+     float4 tmp_sum = (float4)0.0f;                                         
+     float4 total_sum = (float4)0.0f;                                       
+     if( in_x < in_width )                                                  
+     {                                                                      
+             column_index_start = y;                                        
+             column_index_end = y + twice_radius;                           
+             for( int i=0; i<twice_radius+1; ++i )                          
+                     tmp_sum+=in[(y+i)*in_width+in_x];                      
+             column_sum[local_id0] = tmp_sum;                               
+     }
+
+     barrier( CLK_LOCAL_MEM_FENCE );                                        
+                                                                            
+     while(1)                                                               
+     {                                                                      
+             if( out_x < width )                                            
+             {                                                              
+                     if( local_id0 >= radius                                
+                                 && local_id0 < get_local_size(0) - radius )  
+                     {                                                     
+                             total_sum = (float4)0.0f;                      
+                             for( int i=0; i<twice_radius+1; ++i )          
+                                total_sum += column_sum[local_id0-radius+i];
+                             out[y*width+out_x] = total_sum/area;           
+                     }                                                      
+             }                                                              
+             if( --tmp_size ==0 || y == height - 1 )                        
+                     break;
+
+             barrier( CLK_LOCAL_MEM_FENCE );

Is this barrier call guaranteed to be executed by all threads?  If not,
then this will produce undefined behavior.

-Tom

+
+             ++y;                                                           
+             if( in_x < in_width )                                          
+             {                                                              
+                     tmp_sum = column_sum[local_id0];                       
+                     tmp_sum -= in[(column_index_start)*in_width+in_x];     
+                     tmp_sum += in[(column_index_end+1)*in_width+in_x];     
+                     ++column_index_start;                                  
+                     ++column_index_end;                                    
+                     column_sum[local_id0] = tmp_sum;                       
+             }
+
+             barrier( CLK_LOCAL_MEM_FENCE );                                
+     }                                                                      
+}                                                                           
diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h
index bfed601..8f6aa81 100644
--- a/opencl/box-blur.cl.h
+++ b/opencl/box-blur.cl.h
@@ -44,4 +44,70 @@ static const char* box_blur_cl_source =
 "      out[out_index] = mean / (float)(2 * radius + 1);                        \n"
 "    }                                                                         \n"
 "}                                                                             \n"
+"                                                                              \n"
+"__kernel void kernel_box_blur_fast(const __global float4 *in,                 \n"
+"                                   __global float4 *out,                      \n"
+"                                   __local float4 *column_sum,                \n"
+"                                   const int width,                           \n"
+"                                   const int height,                          \n"
+"                                   const int radius,                          \n"
+"                                   const int size)                            \n"
+"{                                                                             \n"
+"    const int local_id0 = get_local_id(0);                                    \n"
+"    const int twice_radius = 2 * radius;                                      \n"
+"    const int in_width = twice_radius + width;                                \n"
+"    const int in_height = twice_radius + height;                              \n"
+"    const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) );      \n"
+"    int column_index_start,column_index_end;                                  \n"
+"    int y = get_global_id(1) * size;                                          \n"
+"    const int out_x = get_group_id(0)                                         \n"
+"               * ( get_local_size(0) - twice_radius ) + local_id0 - radius;   \n"
+"    const int in_x = out_x + radius;                                          \n"
+"    int tmp_size = size;                                                      \n"
+"    int tmp_index = 0;                                                        \n"
+"    float4 tmp_sum = (float4)0.0f;                                            \n"
+"    float4 total_sum = (float4)0.0f;                                          \n"
+"    if( in_x < in_width )                                                     \n"
+"    {                                                                         \n"
+"        column_index_start = y;                                               \n"
+"        column_index_end = y + twice_radius;                                  \n"
+"        for( int i=0; i<twice_radius+1; ++i )                                 \n"
+"            tmp_sum+=in[(y+i)*in_width+in_x];                                 \n"
+"        column_sum[local_id0] = tmp_sum;                                      \n"
+"    }                                                                         \n"
+"                                                                              \n"
+"    barrier( CLK_LOCAL_MEM_FENCE );                                           \n"
+"                                                                              \n"
+"    while(1)                                                                  \n"
+"    {                                                                         \n"
+"        if( out_x < width )                                                   \n"
+"        {                                                                     \n"
+"            if( local_id0 >= radius                                           \n"
+"                                 && local_id0 < get_local_size(0) - radius )  \n"
+"            {                                                                 \n"
+"                total_sum = (float4)0.0f;                                     \n"
+"                for( int i=0; i<twice_radius+1; ++i )                         \n"
+"                   total_sum += column_sum[local_id0-radius+i];               \n"
+"                out[y*width+out_x] = total_sum/area;                          \n"
+"            }                                                                 \n"
+"        }                                                                     \n"
+"        if( --tmp_size ==0 || y == height - 1 )                               \n"
+"            break;                                                            \n"
+"                                                                              \n"
+"        barrier( CLK_LOCAL_MEM_FENCE );                                       \n"
+"                                                                              \n"
+"        ++y;                                                                  \n"
+"        if( in_x < in_width )                                                 \n"
+"        {                                                                     \n"
+"            tmp_sum = column_sum[local_id0];                                  \n"
+"            tmp_sum -= in[(column_index_start)*in_width+in_x];                \n"
+"            tmp_sum += in[(column_index_end+1)*in_width+in_x];                \n"
+"            ++column_index_start;                                             \n"
+"            ++column_index_end;                                               \n"
+"            column_sum[local_id0] = tmp_sum;                                  \n"
+"        }                                                                     \n"
+"                                                                              \n"
+"        barrier( CLK_LOCAL_MEM_FENCE );                                       \n"
+"    }                                                                         \n"
+"}                                                                             \n"
 ;
diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c
index afc19ea..cb77ec0 100644
--- a/operations/common/box-blur.c
+++ b/operations/common/box-blur.c
@@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation)
 #include "buffer/gegl-buffer-cl-iterator.h"
 
 #include "opencl/box-blur.cl.h"
-
 static GeglClRunData *cl_data = NULL;
-
 static gboolean
 cl_box_blur (cl_mem                in_tex,
              cl_mem                aux_tex,
@@ -192,57 +190,82 @@ cl_box_blur (cl_mem                in_tex,
              gint                  radius)
 {
   cl_int cl_err = 0;
-  size_t global_ws_hor[2], global_ws_ver[2];
-  size_t local_ws_hor[2], local_ws_ver[2];
-
+  size_t global_ws_hor[2], global_ws_ver[2], global_ws[2];
+  size_t local_ws_hor[2], local_ws_ver[2], local_ws[2];
+  size_t step_size ;
   if (!cl_data)
     {
-      const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", NULL};
+      const char *kernel_name[] = { "kernel_blur_hor", "kernel_blur_ver","kernel_box_blur_fast", NULL};
       cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name);
     }
 
   if (!cl_data)
     return TRUE;
-
-  local_ws_hor[0] = 1;
-  local_ws_hor[1] = 256;
-  global_ws_hor[0] = roi->height + 2 * radius;
-  global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1];
-
-  local_ws_ver[0] = 1;
-  local_ws_ver[1] = 256;
-  global_ws_ver[0] = roi->height;
-  global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1];
-
-
-  cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0],
-                                    sizeof(cl_mem), (void*)&in_tex,
-                                    sizeof(cl_mem), (void*)&aux_tex,
-                                    sizeof(cl_int), (void*)&roi->width,
-                                    sizeof(cl_int), (void*)&radius,
-                                    NULL);
-  CL_CHECK;
-
-  cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
-                                        cl_data->kernel[0], 2,
-                                        NULL, global_ws_hor, local_ws_hor,
-                                        0, NULL, NULL);
-  CL_CHECK;
-
-
-  cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1],
-                                    sizeof(cl_mem), (void*)&aux_tex,
-                                    sizeof(cl_mem), (void*)&out_tex,
-                                    sizeof(cl_int), (void*)&roi->width,
-                                    sizeof(cl_int), (void*)&radius,
-                                    NULL);
-  CL_CHECK;
-
-  cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
-                                        cl_data->kernel[1], 2,
-                                        NULL, global_ws_ver, local_ws_ver,
-                                        0, NULL, NULL);
-  CL_CHECK;
+  step_size = 64;
+  local_ws[0]=256;
+  local_ws[1]=1;
+
+
+  if( radius <=110 )
+  {
+    global_ws[0] = (roi->width + local_ws[0] - 2 * radius - 1) / ( local_ws[0] - 2 * radius ) * 
local_ws[0];
+    global_ws[1] = (roi->height + step_size - 1) / step_size;
+    cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2],
+                                     sizeof(cl_mem), (void *)&in_tex,
+                                     sizeof(cl_mem), (void *)&out_tex,
+                                     sizeof(cl_float4)*local_ws[0], (void *)NULL,
+                                     sizeof(cl_int), (void *)&roi->width,
+                                     sizeof(cl_int), (void *)&roi->height,
+                                     sizeof(cl_int), (void *)&radius,
+                                     sizeof(cl_int), (void *)&step_size, NULL);
+    CL_CHECK;
+    cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(),
+                                         cl_data->kernel[2], 2,
+                                         NULL, global_ws, local_ws, 0, NULL, NULL );
+     CL_CHECK;
+
+  }
+  else
+  {
+    local_ws_hor[0] = 1;
+    local_ws_hor[1] = 256;
+    global_ws_hor[0] = roi->height + 2 * radius;
+    global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1];
+
+    local_ws_ver[0] = 1;
+    local_ws_ver[1] = 256;
+    global_ws_ver[0] = roi->height;
+    global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1];
+
+
+    cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0],
+                                      sizeof(cl_mem), (void*)&in_tex,
+                                      sizeof(cl_mem), (void*)&aux_tex,
+                                      sizeof(cl_int), (void*)&roi->width,
+                                      sizeof(cl_int), (void*)&radius,
+                                      NULL);
+    CL_CHECK;
+    cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
+                                          cl_data->kernel[0], 2,
+                                          NULL, global_ws_hor, local_ws_hor,
+                                          0, NULL, NULL);
+    CL_CHECK;
+
+
+    cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1],
+                                      sizeof(cl_mem), (void*)&aux_tex,
+                                      sizeof(cl_mem), (void*)&out_tex,
+                                      sizeof(cl_int), (void*)&roi->width,
+                                      sizeof(cl_int), (void*)&radius,
+                                      NULL);
+    CL_CHECK;
+
+    cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
+                                          cl_data->kernel[1], 2,
+                                          NULL, global_ws_ver, local_ws_ver,
+                                          0, NULL, NULL);
+    CL_CHECK;
+  }
 
   return FALSE;
 
-- 
1.8.3.2


_______________________________________________
gegl-developer-list mailing list
List address:    gegl-developer-list gnome org
List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]