Yongjia Zhang
At 2014-01-23 03:15:54,"Victor Oliveira" <victormatheus gmail com> wrote:
>Indeed, you're right. Just now I noticed the break command before the
>barrier, that's irreducible control-flow.
>
>Yongjia, can you change your kernel so all threads execute the barriers?
>
>Victor
>
>On Wed, Jan 22, 2014 at 11:12 AM, Tom Stellard <tom stellard net> wrote:
>> On Wed, Jan 22, 2014 at 04:21:42PM +0800, Yongjia Zhang wrote:
>>> From: Yongjia Zhang <Zhang_Yong_jia 126 com>
>>>
>>> This is a better way to accomplish the box-blur cl operation by using ocl's
>>> local memory from the opencv source code. It use the local shared memory to
>>> reduce global memory access, which significantly reduces the kernel's processing
>>> time by 70 percent compared to the original one. Because of the barriers and
>>> local worksize limitation, processing with a radius larger than 110 becomes
>>> slower than original algorithm, so I keep the original kernels in order to deal
>>> with box-blur with radius larger than 110.
>>> All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU.
>>>
>>> Signed-off-by: Yongjia Zhang <yongjia zhang intel com>
>>> ---
>>> opencl/box-blur.cl | 66 +++++++++++++++++++++++++
>>> opencl/box-blur.cl.h | 66 +++++++++++++++++++++++++
>>> operations/common/box-blur.c | 115 ++++++++++++++++++++++++++-----------------
>>> 3 files changed, 201 insertions(+), 46 deletions(-)
>>>
>>> diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl
>>> index e99bea4..a1da9de 100644
>>> --- a/opencl/box-blur.cl
>>> +++ b/opencl/box-blur.cl
>>> @@ -43,3 +43,69 @@ __kernel void kernel_blur_ver (__global const float4 *aux,
>>> out[out_index] = mean / (float)(2 * radius + 1);
>>> }
>>> }
>>> +
>>> +__kernel void kernel_box_blur_fast(const __global float4 *in,
>>> + __global float4 *out,
>>> + __local float4 *column_sum,
>>> + const int width,
>>> + const int height,
>>> + const int radius,
>>> + const int size)
>>> +{
>>> + const int local_id0 = get_local_id(0);
>>> + const int twice_radius = 2 * radius;
>>> + const int in_width = twice_radius + width;
>>> + const int in_height = twice_radius + height;
>>> + const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) );
>>> + int column_index_start,column_index_end;
>>> + int y = get_global_id(1) * size;
>>> + const int out_x = get_group_id(0)
>>> + * ( get_local_size(0) - twice_radius ) + local_id0 - radius;
>>> + const int in_x = out_x + radius;
>>> + int tmp_size = size;
>>> + int tmp_index = 0;
>>> + float4 tmp_sum = (float4)0.0f;
>>> + float4 total_sum = (float4)0.0f;
>>> + if( in_x < in_width )
>>> + {
>>> + column_index_start = y;
>>> + column_index_end = y + twice_radius;
>>> + for( int i=0; i<twice_radius+1; ++i )
>>> + tmp_sum+=in[(y+i)*in_width+in_x];
>>> + column_sum[local_id0] = tmp_sum;
>>> + }
>>> +
>>> + barrier( CLK_LOCAL_MEM_FENCE );
>>> +
>>> + while(1)
>>> + {
>>> + if( out_x < width )
>>> + {
>>> + if( local_id0 >= radius
>>> + && local_id0 < get_local_size(0) - radius )
>>> + {
>>> + total_sum = (float4)0.0f;
>>> + for( int i=0; i<twice_radius+1; ++i )
>>> + total_sum += column_sum[local_id0-radius+i];
>>> + out[y*width+out_x] = total_sum/area;
>>> + }
>>> + }
>>> + if( --tmp_size ==0 || y == height - 1 )
>>> + break;
>>> +
>>> + barrier( CLK_LOCAL_MEM_FENCE );
>>
>> Is this barrier call guaranteed to be executed by all threads? If not,
>> then this will produce undefined behavior.
>>
>> -Tom
>>
>>> +
>>> + ++y;
>>> + if( in_x < in_width )
>>> + {
>>> + tmp_sum = column_sum[local_id0];
>>> + tmp_sum -= in[(column_index_start)*in_width+in_x];
>>> + tmp_sum += in[(column_index_end+1)*in_width+in_x];
>>> + ++column_index_start;
>>> + ++column_index_end;
>>> + column_sum[local_id0] = tmp_sum;
>>> + }
>>> +
>>> + barrier( CLK_LOCAL_MEM_FENCE );
>>> + }
>>> +}
>>> diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h
>>> index bfed601..8f6aa81 100644
>>> --- a/opencl/box-blur.cl.h
>>> +++ b/opencl/box-blur.cl.h
>>> @@ -44,4 +44,70 @@ static const char* box_blur_cl_source =
>>> " out[out_index] = mean / (float)(2 * radius + 1); \n"
>>> " } \n"
>>> "} \n"
>>> +" \n"
>>> +"__kernel void kernel_box_blur_fast(const __global float4 *in, \n"
>>> +" __global float4 *out, \n"
>>> +" __local float4 *column_sum, \n"
>>> +" const int width, \n"
>>> +" const int height, \n"
>>> +" const int radius, \n"
>>> +" const int size) \n"
>>> +"{ \n"
>>> +" const int local_id0 = get_local_id(0); \n"
>>> +" const int twice_radius = 2 * radius; \n"
>>> +" const int in_width = twice_radius + width; \n"
>>> +" const int in_height = twice_radius + height; \n"
>>> +" const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); \n"
>>> +" int column_index_start,column_index_end; \n"
>>> +" int y = get_global_id(1) * size; \n"
>>> +" const int out_x = get_group_id(0) \n"
>>> +" * ( get_local_size(0) - twice_radius ) + local_id0 - radius; \n"
>>> +" const int in_x = out_x + radius; \n"
>>> +" int tmp_size = size; \n"
>>> +" int tmp_index = 0; \n"
>>> +" float4 tmp_sum = (float4)0.0f; \n"
>>> +" float4 total_sum = (float4)0.0f; \n"
>>> +" if( in_x < in_width ) \n"
>>> +" { \n"
>>> +" column_index_start = y; \n"
>>> +" column_index_end = y + twice_radius; \n"
>>> +" for( int i=0; i<twice_radius+1; ++i ) \n"
>>> +" tmp_sum+=in[(y+i)*in_width+in_x]; \n"
>>> +" column_sum[local_id0] = tmp_sum; \n"
>>> +" } \n"
>>> +" \n"
>>> +" barrier( CLK_LOCAL_MEM_FENCE ); \n"
>>> +" \n"
>>> +" while(1) \n"
>>> +" { \n"
>>> +" if( out_x < width ) \n"
>>> +" { \n"
>>> +" if( local_id0 >= radius \n"
>>> +" && local_id0 < get_local_size(0) - radius ) \n"
>>> +" { \n"
>>> +" total_sum = (float4)0.0f; \n"
>>> +" for( int i=0; i<twice_radius+1; ++i ) \n"
>>> +" total_sum += column_sum[local_id0-radius+i]; \n"
>>> +" out[y*width+out_x] = total_sum/area; \n"
>>> +" } \n"
>>> +" } \n"
>>> +" if( --tmp_size ==0 || y == height - 1 ) \n"
>>> +" break; \n"
>>> +" \n"
>>> +" barrier( CLK_LOCAL_MEM_FENCE ); \n"
>>> +" \n"
>>> +" ++y; \n"
>>> +" if( in_x < in_width ) \n"
>>> +" { \n"
>>> +" tmp_sum = column_sum[local_id0]; \n"
>>> +" tmp_sum -= in[(column_index_start)*in_width+in_x]; \n"
>>> +" tmp_sum += in[(column_index_end+1)*in_width+in_x]; \n"
>>> +" ++column_index_start; \n"
>>> +" ++column_index_end; \n"
>>> +" column_sum[local_id0] = tmp_sum; \n"
>>> +" } \n"
>>> +" \n"
>>> +" barrier( CLK_LOCAL_MEM_FENCE ); \n"
>>> +" } \n"
>>> +"} \n"
>>> ;
>>> diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c
>>> index afc19ea..cb77ec0 100644
>>> --- a/operations/common/box-blur.c
>>> +++ b/operations/common/box-blur.c
>>> @@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation)
>>> #include "buffer/gegl-buffer-cl-iterator.h"
>>>
>>> #include "opencl/box-blur.cl.h"
>>> -
>>> static GeglClRunData *cl_data = NULL;
>>> -
>>> static gboolean
>>> cl_box_blur (cl_mem in_tex,
>>> cl_mem aux_tex,
>>> @@ -192,57 +190,82 @@ cl_box_blur (cl_mem in_tex,
>>> gint radius)
>>> {
>>> cl_int cl_err = 0;
>>> - size_t global_ws_hor[2], global_ws_ver[2];
>>> - size_t local_ws_hor[2], local_ws_ver[2];
>>> -
>>> + size_t global_ws_hor[2], global_ws_ver[2], global_ws[2];
>>> + size_t local_ws_hor[2], local_ws_ver[2], local_ws[2];
>>> + size_t step_size ;
>>> if (!cl_data)
>>> {
>>> - const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", NULL};
>>> + const char *kernel_name[] = { "kernel_blur_hor", "kernel_blur_ver","kernel_box_blur_fast", NULL};
>>> cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name);
>>> }
>>>
>>> if (!cl_data)
>>> return TRUE;
>>> -
>>> - local_ws_hor[0] = 1;
>>> - local_ws_hor[1] = 256;
>>> - global_ws_hor[0] = roi->height + 2 * radius;
>>> - global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1];
>>> -
>>> - local_ws_ver[0] = 1;
>>> - local_ws_ver[1] = 256;
>>> - global_ws_ver[0] = roi->height;
>>> - global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1];
>>> -
>>> -
>>> - cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0],
>>> - sizeof(cl_mem), (void*)&in_tex,
>>> - sizeof(cl_mem), (void*)&aux_tex,
>>> - sizeof(cl_int), (void*)&roi->width,
>>> - sizeof(cl_int), (void*)&radius,
>>> - NULL);
>>> - CL_CHECK;
>>> -
>>> - cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
>>> - cl_data->kernel[0], 2,
>>> - NULL, global_ws_hor, local_ws_hor,
>>> - 0, NULL, NULL);
>>> - CL_CHECK;
>>> -
>>> -
>>> - cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1],
>>> - sizeof(cl_mem), (void*)&aux_tex,
>>> - sizeof(cl_mem), (void*)&out_tex,
>>> - sizeof(cl_int), (void*)&roi->width,
>>> - sizeof(cl_int), (void*)&radius,
>>> - NULL);
>>> - CL_CHECK;
>>> -
>>> - cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
>>> - cl_data->kernel[1], 2,
>>> - NULL, global_ws_ver, local_ws_ver,
>>> - 0, NULL, NULL);
>>> - CL_CHECK;
>>> + step_size = 64;
>>> + local_ws[0]=256;
>>> + local_ws[1]=1;
>>> +
>>> +
>>> + if( radius <=110 )
>>> + {
>>> + global_ws[0] = (roi->width + local_ws[0] - 2 * radius - 1) / ( local_ws[0] - 2 * radius ) * local_ws[0];
>>> + global_ws[1] = (roi->height + step_size - 1) / step_size;
>>> + cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2],
>>> + sizeof(cl_mem), (void *)&in_tex,
>>> + sizeof(cl_mem), (void *)&out_tex,
>>> + sizeof(cl_float4)*local_ws[0], (void *)NULL,
>>> + sizeof(cl_int), (void *)&roi->width,
>>> + sizeof(cl_int), (void *)&roi->height,
>>> + sizeof(cl_int), (void *)&radius,
>>> + sizeof(cl_int), (void *)&step_size, NULL);
>>> + CL_CHECK;
>>> + cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(),
>>> + cl_data->kernel[2], 2,
>>> + NULL, global_ws, local_ws, 0, NULL, NULL );
>>> + CL_CHECK;
>>> +
>>> + }
>>> + else
>>> + {
>>> + local_ws_hor[0] = 1;
>>> + local_ws_hor[1] = 256;
>>> + global_ws_hor[0] = roi->height + 2 * radius;
>>> + global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1];
>>> +
>>> + local_ws_ver[0] = 1;
>>> + local_ws_ver[1] = 256;
>>> + global_ws_ver[0] = roi->height;
>>> + global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1];
>>> +
>>> +
>>> + cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0],
>>> + sizeof(cl_mem), (void*)&in_tex,
>>> + sizeof(cl_mem), (void*)&aux_tex,
>>> + sizeof(cl_int), (void*)&roi->width,
>>> + sizeof(cl_int), (void*)&radius,
>>> + NULL);
>>> + CL_CHECK;
>>> + cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
>>> + cl_data->kernel[0], 2,
>>> + NULL, global_ws_hor, local_ws_hor,
>>> + 0, NULL, NULL);
>>> + CL_CHECK;
>>> +
>>> +
>>> + cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1],
>>> + sizeof(cl_mem), (void*)&aux_tex,
>>> + sizeof(cl_mem), (void*)&out_tex,
>>> + sizeof(cl_int), (void*)&roi->width,
>>> + sizeof(cl_int), (void*)&radius,
>>> + NULL);
>>> + CL_CHECK;
>>> +
>>> + cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
>>> + cl_data->kernel[1], 2,
>>> + NULL, global_ws_ver, local_ws_ver,
>>> + 0, NULL, NULL);
>>> + CL_CHECK;
>>> + }
>>>
>>> return FALSE;
>>>
>>> --
>>> 1.8.3.2
>>>
>>>
>>> _______________________________________________
>>> gegl-developer-list mailing list
>>> List address: gegl-developer-list gnome org
>>> List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list
>>>
>> _______________________________________________
>> gegl-developer-list mailing list
>> List address: gegl-developer-list gnome org
>> List membership:
https://mail.gnome.org/mailman/listinfo/gegl-developer-list
>>