[gegl] opencl: Fast approximation of the bilateral filter in opencl.
- From: Victor Matheus de Araujo Oliveira <vmaolive src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gegl] opencl: Fast approximation of the bilateral filter in opencl.
- Date: Tue, 12 Mar 2013 20:01:18 +0000 (UTC)
commit a34f6fd2393d0fc1b4d936fdfe6da7fce47929b7
Author: Victor Oliveira <victormatheus gmail com>
Date: Tue Mar 12 16:58:24 2013 -0300
opencl: Fast approximation of the bilateral filter in opencl.
gegl/opencl/gegl-cl-init.c | 1 +
gegl/opencl/gegl-cl-types.h | 1 +
gegl/opencl/gegl-cl.h | 46 +++++-
opencl/bilateral-filter-fast.cl | 218 ++++++++++++++++++++++
opencl/bilateral-filter-fast.cl.h | 220 ++++++++++++++++++++++
operations/common/bilateral-filter-fast.c | 285 +++++++++++++++++++++++++---
6 files changed, 738 insertions(+), 33 deletions(-)
---
diff --git a/gegl/opencl/gegl-cl-init.c b/gegl/opencl/gegl-cl-init.c
index f97c9cc..58d04c3 100644
--- a/gegl/opencl/gegl-cl-init.c
+++ b/gegl/opencl/gegl-cl-init.c
@@ -269,6 +269,7 @@ gegl_cl_init (GError **error)
CL_LOAD_FUNCTION (clEnqueueWriteBufferRect)
CL_LOAD_FUNCTION (clEnqueueCopyBufferRect)
CL_LOAD_FUNCTION (clCreateImage2D)
+ CL_LOAD_FUNCTION (clCreateImage3D)
CL_LOAD_FUNCTION (clEnqueueWriteImage)
CL_LOAD_FUNCTION (clEnqueueReadImage)
CL_LOAD_FUNCTION (clEnqueueCopyImage)
diff --git a/gegl/opencl/gegl-cl-types.h b/gegl/opencl/gegl-cl-types.h
index 7617482..ae83aa1 100644
--- a/gegl/opencl/gegl-cl-types.h
+++ b/gegl/opencl/gegl-cl-types.h
@@ -66,6 +66,7 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *t_clEnqueueReadBufferRect
typedef CL_API_ENTRY cl_int (CL_API_CALL *t_clEnqueueWriteBufferRect ) (cl_command_queue, cl_mem,
cl_bool, const size_t [3], const size_t [3], const size_t [3], size_t, size_t, size_t, size_t, void *,
cl_uint, const cl_event *, cl_event *);
typedef CL_API_ENTRY cl_int (CL_API_CALL *t_clEnqueueCopyBufferRect ) (cl_command_queue, cl_mem,
cl_mem, const size_t [3], const size_t [3], const size_t [3], size_t, size_t, size_t, size_t, cl_uint, const
cl_event *, cl_event *);
typedef CL_API_ENTRY cl_mem (CL_API_CALL *t_clCreateImage2D ) (cl_context, cl_mem_flags,
const cl_image_format *, size_t, size_t, size_t, void *, cl_int *);
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *t_clCreateImage3D ) (cl_context, cl_mem_flags,
const cl_image_format *, size_t, size_t, size_t, size_t, size_t, void *, cl_int *);
typedef CL_API_ENTRY cl_int (CL_API_CALL *t_clEnqueueReadImage ) (cl_command_queue, cl_mem,
cl_bool, const size_t [3], const size_t [3], size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
typedef CL_API_ENTRY cl_int (CL_API_CALL *t_clEnqueueWriteImage ) (cl_command_queue, cl_mem,
cl_bool, const size_t [3], const size_t [3], size_t, size_t, const void *, cl_uint, const cl_event *,
cl_event *);
typedef CL_API_ENTRY cl_int (CL_API_CALL *t_clEnqueueCopyImage ) (cl_command_queue, cl_mem,
cl_mem, const size_t [3], const size_t [3], const size_t [3], cl_uint, const cl_event *, cl_event *);
diff --git a/gegl/opencl/gegl-cl.h b/gegl/opencl/gegl-cl.h
index 926f38c..ab67230 100644
--- a/gegl/opencl/gegl-cl.h
+++ b/gegl/opencl/gegl-cl.h
@@ -26,15 +26,57 @@
#ifdef __GEGL_DEBUG_H__
#define CL_ERROR {GEGL_NOTE (GEGL_DEBUG_OPENCL, "Error in %s:%d %s - %s\n", __FILE__, __LINE__, __func__,
gegl_cl_errstring(cl_err)); goto error;}
-#define CL_CHECK {if (cl_err != CL_SUCCESS) CL_ERROR;}
#else
/* public header for gegl ops */
#define CL_ERROR {g_warning("Error in %s:%d %s - %s\n", __FILE__, __LINE__, __func__,
gegl_cl_errstring(cl_err)); goto error;}
-#define CL_CHECK {if (cl_err != CL_SUCCESS) CL_ERROR;}
#endif
+#define CL_CHECK {if (cl_err != CL_SUCCESS) CL_ERROR;}
+
+#define GEGL_CL_ARG_START(KERNEL) \
+ { cl_kernel __mykernel=KERNEL; int __p = 0;
+
+#define GEGL_CL_ARG(TYPE, NAME) \
+ { cl_err = gegl_clSetKernelArg(__mykernel, __p++, sizeof(TYPE), (void*)& NAME); \
+ CL_CHECK; }
+
+#define GEGL_CL_ARG_END \
+ __p = -1; }
+
+#define GEGL_CL_RELEASE(obj) \
+ { cl_err = gegl_clReleaseMemObject(obj); \
+ CL_CHECK; }
+
+#define GEGL_CL_BUFFER_ITERATE_START(I, J, ERR) \
+ while (gegl_buffer_cl_iterator_next (I, & ERR)) \
+ { \
+ if (ERR) return FALSE; \
+ for (J=0; J < I ->n; J++) \
+ {
+
+#define GEGL_CL_BUFFER_ITERATE_END(ERR) \
+ if (ERR) \
+ { \
+ g_warning("[OpenCL] Error"); \
+ return FALSE; \
+ } \
+ } \
+ }
+
+
+#define GEGL_CL_BUILD(NAME, ...) \
+ if (!cl_data) \
+ { \
+ const char *kernel_name[] ={__VA_ARGS__ , NULL}; \
+ cl_data = gegl_cl_compile_and_build(NAME ## _cl_source, kernel_name); \
+ } \
+ if (!cl_data) return TRUE;
+
+#define GEGL_CL_STATIC \
+ static GeglClRunData *cl_data = NULL;
+
#endif
diff --git a/opencl/bilateral-filter-fast.cl b/opencl/bilateral-filter-fast.cl
new file mode 100644
index 0000000..56eca2e
--- /dev/null
+++ b/opencl/bilateral-filter-fast.cl
@@ -0,0 +1,218 @@
+/* This file is part of GEGL
+ *
+ * GEGL is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * GEGL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GEGL; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright 2013 Victor Oliveira (victormatheus gmail com)
+ */
+
+#define GRID(x,y,z) grid[x+sw*(y + z * sh)]
+
+__kernel void bilateral_init(__global float8 *grid,
+ int sw,
+ int sh,
+ int depth)
+{
+ const int gid_x = get_global_id(0);
+ const int gid_y = get_global_id(1);
+
+ for (int d=0; d<depth; d++)
+ {
+ GRID(gid_x,gid_y,d) = (float8)(0.0f);
+ }
+}
+
+__kernel void bilateral_downsample(__global const float4 *input,
+ __global float2 *grid,
+ int width,
+ int height,
+ int sw,
+ int sh,
+ int s_sigma,
+ float r_sigma)
+{
+ const int gid_x = get_global_id(0);
+ const int gid_y = get_global_id(1);
+
+ for (int ry=0; ry < s_sigma; ry++)
+ for (int rx=0; rx < s_sigma; rx++)
+ {
+ const int x = clamp(gid_x * s_sigma - s_sigma/2 + rx, 0, width -1);
+ const int y = clamp(gid_y * s_sigma - s_sigma/2 + ry, 0, height-1);
+
+ const float4 val = input[y * width + x];
+
+ const int4 z = convert_int4(val * (1.0f/r_sigma) + 0.5f);
+
+ grid[4*(gid_x+sw*(gid_y + z.x * sh))+0] += (float2)(val.x, 1.0f);
+ grid[4*(gid_x+sw*(gid_y + z.y * sh))+1] += (float2)(val.y, 1.0f);
+ grid[4*(gid_x+sw*(gid_y + z.z * sh))+2] += (float2)(val.z, 1.0f);
+ grid[4*(gid_x+sw*(gid_y + z.w * sh))+3] += (float2)(val.w, 1.0f);
+
+ barrier (CLK_GLOBAL_MEM_FENCE);
+ }
+}
+
+#define LOCAL_W 16
+#define LOCAL_H 16
+
+__attribute__((reqd_work_group_size(16, 16, 1)))
+__kernel void bilateral_blur(__global const float8 *grid,
+ __global float2 *blurz_r,
+ __global float2 *blurz_g,
+ __global float2 *blurz_b,
+ __global float2 *blurz_a,
+ int sw,
+ int sh,
+ int depth)
+{
+ __local float8 img1[LOCAL_H+2][LOCAL_W+2];
+ __local float8 img2[LOCAL_H+2][LOCAL_W+2];
+
+ const int gid_x = get_global_id(0);
+ const int gid_y = get_global_id(1);
+
+ const int lx = get_local_id(0);
+ const int ly = get_local_id(1);
+
+ float8 vpp = (float8)(0.0f);
+ float8 vp = (float8)(0.0f);
+ float8 v = (float8)(0.0f);
+
+ float8 k;
+
+ int x = clamp(gid_x, 0, sw-1);
+ int y = clamp(gid_y, 0, sw-1);
+
+ for (int d=0; d<depth; d++)
+ {
+ int xp = max(x-1, 0);
+ int xn = min(x+1, sw-1);
+
+ int yp = max(y-1, 0);
+ int yn = min(y+1, sh-1);
+
+ int zp = max(d-1, 0);
+ int zn = min(d+1, depth-1);
+
+ /* the corners are not going to be used, don't bother to load them */
+
+ if (ly == 0) {
+ k = GRID(x, yp, d);
+ img1[0][lx+1] = k;
+ img2[0][lx+1] = k;
+ }
+
+ if (ly == LOCAL_H-1) {
+ k = GRID(x, yn, d);
+ img1[LOCAL_H+1][lx+1] = k;
+ img2[LOCAL_H+1][lx+1] = k;
+ }
+
+ if (lx == 0) {
+ k = GRID(xp, y, d);
+ img1[ly+1][0] = k;
+ img2[ly+1][0] = k;
+ }
+
+ if (lx == LOCAL_W-1) {
+ k = GRID(xn, y, d);
+ img1[ly+1][LOCAL_W+1] = k;
+ img2[ly+1][LOCAL_W+1] = k;
+ }
+
+ img1[ly+1][lx+1] = GRID(x, y, d);
+
+ barrier (CLK_LOCAL_MEM_FENCE);
+
+ /* blur x */
+
+ img2[ly+1][lx+1] = (img1[ly+1][lx] + 2.0f * img1[ly+1][lx+1] + img1[ly+1][lx+2]) / 4.0f;
+
+ barrier (CLK_LOCAL_MEM_FENCE);
+
+ /* blur y */
+
+ v = (img2[ly][lx+1] + 2.0f * img2[ly+1][lx+1] + img2[ly+2][lx+1]) / 4.0f;
+
+ /* last three v values */
+
+ if (d == 0)
+ {
+ /* this is like CLAMP */
+ vpp = img1[ly+1][lx+1];
+ vp = img1[ly+1][lx+1];
+ }
+ else
+ {
+ vpp = vp;
+ vp = v;
+
+ float8 blurred = (vpp + 2.0f * vp + v) / 4.0f;
+
+ /* XXX: OpenCL 1.1 doesn't support writes to 3D textures */
+
+ if (gid_x < sw && gid_y < sh)
+ {
+ blurz_r[x+sw*(y+sh*(d-1))] = blurred.s01;
+ blurz_g[x+sw*(y+sh*(d-1))] = blurred.s23;
+ blurz_b[x+sw*(y+sh*(d-1))] = blurred.s45;
+ blurz_a[x+sw*(y+sh*(d-1))] = blurred.s67;
+ }
+ }
+ }
+
+ /* last z */
+
+ vpp = vp;
+ vp = v;
+
+ float8 blurred = (vpp + 2.0f * vp + v) / 4.0f;
+
+ if (gid_x < sw && gid_y < sh)
+ {
+ blurz_r[x+sw*(y+sh*(depth-1))] = blurred.s01;
+ blurz_g[x+sw*(y+sh*(depth-1))] = blurred.s23;
+ blurz_b[x+sw*(y+sh*(depth-1))] = blurred.s45;
+ blurz_a[x+sw*(y+sh*(depth-1))] = blurred.s67;
+ }
+}
+
+const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+
+__kernel void bilateral_interpolate(__global const float4 *input,
+ __read_only image3d_t blurz_r,
+ __read_only image3d_t blurz_g,
+ __read_only image3d_t blurz_b,
+ __read_only image3d_t blurz_a,
+ __global float4 *smoothed,
+ int width,
+ int s_sigma,
+ float r_sigma)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ const float xf = (float)(x) / s_sigma;
+ const float yf = (float)(y) / s_sigma;
+ const float4 zf = input[y * width + x] / r_sigma;
+
+ float8 val;
+
+ val.s04 = (read_imagef (blurz_r, sampler, (float4)(xf, yf, zf.x, 0.0f))).xy;
+ val.s15 = (read_imagef (blurz_g, sampler, (float4)(xf, yf, zf.y, 0.0f))).xy;
+ val.s26 = (read_imagef (blurz_b, sampler, (float4)(xf, yf, zf.z, 0.0f))).xy;
+ val.s37 = (read_imagef (blurz_a, sampler, (float4)(xf, yf, zf.w, 0.0f))).xy;
+
+ smoothed[y * width + x] = val.s0123/val.s4567;
+}
diff --git a/opencl/bilateral-filter-fast.cl.h b/opencl/bilateral-filter-fast.cl.h
new file mode 100644
index 0000000..7648245
--- /dev/null
+++ b/opencl/bilateral-filter-fast.cl.h
@@ -0,0 +1,220 @@
+static const char* bilateral_filter_fast_cl_source =
+"/* This file is part of GEGL \n"
+" * \n"
+" * GEGL is free software; you can redistribute it and/or \n"
+" * modify it under the terms of the GNU Lesser General Public \n"
+" * License as published by the Free Software Foundation; either \n"
+" * version 3 of the License, or (at your option) any later version. \n"
+" * \n"
+" * GEGL is distributed in the hope that it will be useful, \n"
+" * but WITHOUT ANY WARRANTY; without even the implied warranty of \n"
+" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU \n"
+" * Lesser General Public License for more details. \n"
+" * \n"
+" * You should have received a copy of the GNU Lesser General Public \n"
+" * License along with GEGL; if not, see <http://www.gnu.org/licenses/>. \n"
+" * \n"
+" * Copyright 2013 Victor Oliveira (victormatheus gmail com) \n"
+" */ \n"
+" \n"
+"#define GRID(x,y,z) grid[x+sw*(y + z * sh)] \n"
+" \n"
+"__kernel void bilateral_init(__global float8 *grid, \n"
+" int sw, \n"
+" int sh, \n"
+" int depth) \n"
+"{ \n"
+" const int gid_x = get_global_id(0); \n"
+" const int gid_y = get_global_id(1); \n"
+" \n"
+" for (int d=0; d<depth; d++) \n"
+" { \n"
+" GRID(gid_x,gid_y,d) = (float8)(0.0f); \n"
+" } \n"
+"} \n"
+" \n"
+"__kernel void bilateral_downsample(__global const float4 *input, \n"
+" __global float2 *grid, \n"
+" int width, \n"
+" int height, \n"
+" int sw, \n"
+" int sh, \n"
+" int s_sigma, \n"
+" float r_sigma) \n"
+"{ \n"
+" const int gid_x = get_global_id(0); \n"
+" const int gid_y = get_global_id(1); \n"
+" \n"
+" for (int ry=0; ry < s_sigma; ry++) \n"
+" for (int rx=0; rx < s_sigma; rx++) \n"
+" { \n"
+" const int x = clamp(gid_x * s_sigma - s_sigma/2 + rx, 0, width -1); \n"
+" const int y = clamp(gid_y * s_sigma - s_sigma/2 + ry, 0, height-1); \n"
+" \n"
+" const float4 val = input[y * width + x]; \n"
+" \n"
+" const int4 z = convert_int4(val * (1.0f/r_sigma) + 0.5f); \n"
+" \n"
+" grid[4*(gid_x+sw*(gid_y + z.x * sh))+0] += (float2)(val.x, 1.0f); \n"
+" grid[4*(gid_x+sw*(gid_y + z.y * sh))+1] += (float2)(val.y, 1.0f); \n"
+" grid[4*(gid_x+sw*(gid_y + z.z * sh))+2] += (float2)(val.z, 1.0f); \n"
+" grid[4*(gid_x+sw*(gid_y + z.w * sh))+3] += (float2)(val.w, 1.0f); \n"
+" \n"
+" barrier (CLK_GLOBAL_MEM_FENCE); \n"
+" } \n"
+"} \n"
+" \n"
+"#define LOCAL_W 16 \n"
+"#define LOCAL_H 16 \n"
+" \n"
+"__attribute__((reqd_work_group_size(16, 16, 1))) \n"
+"__kernel void bilateral_blur(__global const float8 *grid, \n"
+" __global float2 *blurz_r, \n"
+" __global float2 *blurz_g, \n"
+" __global float2 *blurz_b, \n"
+" __global float2 *blurz_a, \n"
+" int sw, \n"
+" int sh, \n"
+" int depth) \n"
+"{ \n"
+" __local float8 img1[LOCAL_H+2][LOCAL_W+2]; \n"
+" __local float8 img2[LOCAL_H+2][LOCAL_W+2]; \n"
+" \n"
+" const int gid_x = get_global_id(0); \n"
+" const int gid_y = get_global_id(1); \n"
+" \n"
+" const int lx = get_local_id(0); \n"
+" const int ly = get_local_id(1); \n"
+" \n"
+" float8 vpp = (float8)(0.0f); \n"
+" float8 vp = (float8)(0.0f); \n"
+" float8 v = (float8)(0.0f); \n"
+" \n"
+" float8 k; \n"
+" \n"
+" int x = clamp(gid_x, 0, sw-1); \n"
+" int y = clamp(gid_y, 0, sw-1); \n"
+" \n"
+" for (int d=0; d<depth; d++) \n"
+" { \n"
+" int xp = max(x-1, 0); \n"
+" int xn = min(x+1, sw-1); \n"
+" \n"
+" int yp = max(y-1, 0); \n"
+" int yn = min(y+1, sh-1); \n"
+" \n"
+" int zp = max(d-1, 0); \n"
+" int zn = min(d+1, depth-1); \n"
+" \n"
+" /* the corners are not going to be used, don't bother to load them */ \n"
+" \n"
+" if (ly == 0) { \n"
+" k = GRID(x, yp, d); \n"
+" img1[0][lx+1] = k; \n"
+" img2[0][lx+1] = k; \n"
+" } \n"
+" \n"
+" if (ly == LOCAL_H-1) { \n"
+" k = GRID(x, yn, d); \n"
+" img1[LOCAL_H+1][lx+1] = k; \n"
+" img2[LOCAL_H+1][lx+1] = k; \n"
+" } \n"
+" \n"
+" if (lx == 0) { \n"
+" k = GRID(xp, y, d); \n"
+" img1[ly+1][0] = k; \n"
+" img2[ly+1][0] = k; \n"
+" } \n"
+" \n"
+" if (lx == LOCAL_W-1) { \n"
+" k = GRID(xn, y, d); \n"
+" img1[ly+1][LOCAL_W+1] = k; \n"
+" img2[ly+1][LOCAL_W+1] = k; \n"
+" } \n"
+" \n"
+" img1[ly+1][lx+1] = GRID(x, y, d); \n"
+" \n"
+" barrier (CLK_LOCAL_MEM_FENCE); \n"
+" \n"
+" /* blur x */ \n"
+" \n"
+" img2[ly+1][lx+1] = (img1[ly+1][lx] + 2.0f * img1[ly+1][lx+1] + img1[ly+1][lx+2]) / 4.0f;\n"
+" \n"
+" barrier (CLK_LOCAL_MEM_FENCE); \n"
+" \n"
+" /* blur y */ \n"
+" \n"
+" v = (img2[ly][lx+1] + 2.0f * img2[ly+1][lx+1] + img2[ly+2][lx+1]) / 4.0f;\n"
+" \n"
+" /* last three v values */ \n"
+" \n"
+" if (d == 0) \n"
+" { \n"
+" /* this is like CLAMP */ \n"
+" vpp = img1[ly+1][lx+1]; \n"
+" vp = img1[ly+1][lx+1]; \n"
+" } \n"
+" else \n"
+" { \n"
+" vpp = vp; \n"
+" vp = v; \n"
+" \n"
+" float8 blurred = (vpp + 2.0f * vp + v) / 4.0f; \n"
+" \n"
+" /* XXX: OpenCL 1.1 doesn't support writes to 3D textures */ \n"
+" \n"
+" if (gid_x < sw && gid_y < sh) \n"
+" { \n"
+" blurz_r[x+sw*(y+sh*(d-1))] = blurred.s01; \n"
+" blurz_g[x+sw*(y+sh*(d-1))] = blurred.s23; \n"
+" blurz_b[x+sw*(y+sh*(d-1))] = blurred.s45; \n"
+" blurz_a[x+sw*(y+sh*(d-1))] = blurred.s67; \n"
+" } \n"
+" } \n"
+" } \n"
+" \n"
+" /* last z */ \n"
+" \n"
+" vpp = vp; \n"
+" vp = v; \n"
+" \n"
+" float8 blurred = (vpp + 2.0f * vp + v) / 4.0f; \n"
+" \n"
+" if (gid_x < sw && gid_y < sh) \n"
+" { \n"
+" blurz_r[x+sw*(y+sh*(depth-1))] = blurred.s01; \n"
+" blurz_g[x+sw*(y+sh*(depth-1))] = blurred.s23; \n"
+" blurz_b[x+sw*(y+sh*(depth-1))] = blurred.s45; \n"
+" blurz_a[x+sw*(y+sh*(depth-1))] = blurred.s67; \n"
+" } \n"
+"} \n"
+" \n"
+"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
+" \n"
+"__kernel void bilateral_interpolate(__global const float4 *input, \n"
+" __read_only image3d_t blurz_r, \n"
+" __read_only image3d_t blurz_g, \n"
+" __read_only image3d_t blurz_b, \n"
+" __read_only image3d_t blurz_a, \n"
+" __global float4 *smoothed, \n"
+" int width, \n"
+" int s_sigma, \n"
+" float r_sigma) \n"
+"{ \n"
+" const int x = get_global_id(0); \n"
+" const int y = get_global_id(1); \n"
+" \n"
+" const float xf = (float)(x) / s_sigma; \n"
+" const float yf = (float)(y) / s_sigma; \n"
+" const float4 zf = input[y * width + x] / r_sigma; \n"
+" \n"
+" float8 val; \n"
+" \n"
+" val.s04 = (read_imagef (blurz_r, sampler, (float4)(xf, yf, zf.x, 0.0f))).xy;\n"
+" val.s15 = (read_imagef (blurz_g, sampler, (float4)(xf, yf, zf.y, 0.0f))).xy;\n"
+" val.s26 = (read_imagef (blurz_b, sampler, (float4)(xf, yf, zf.z, 0.0f))).xy;\n"
+" val.s37 = (read_imagef (blurz_a, sampler, (float4)(xf, yf, zf.w, 0.0f))).xy;\n"
+" \n"
+" smoothed[y * width + x] = val.s0123/val.s4567; \n"
+"} \n"
+;
diff --git a/operations/common/bilateral-filter-fast.c b/operations/common/bilateral-filter-fast.c
index 72b6ed7..18c590f 100644
--- a/operations/common/bilateral-filter-fast.c
+++ b/operations/common/bilateral-filter-fast.c
@@ -51,11 +51,6 @@ inline static float lerp(float a, float b, float v)
return (1.0f - v) * a + v * b;
}
-inline static int clamp(int v, int min, int max)
-{
- return MIN(MAX(v, min), max);
-}
-
static void
bilateral_filter (GeglBuffer *src,
const GeglRectangle *src_rect,
@@ -64,12 +59,20 @@ bilateral_filter (GeglBuffer *src,
gint s_sigma,
gfloat r_sigma);
+static gboolean
+bilateral_cl_process (GeglOperation *operation,
+ GeglBuffer *input,
+ GeglBuffer *output,
+ const GeglRectangle *result,
+ gint s_sigma,
+ gfloat r_sigma);
+
#include <stdio.h>
static void bilateral_prepare (GeglOperation *operation)
{
- gegl_operation_set_format (operation, "input", babl_format ("RGB float"));
- gegl_operation_set_format (operation, "output", babl_format ("RGB float"));
+ gegl_operation_set_format (operation, "input", babl_format ("RGBA float"));
+ gegl_operation_set_format (operation, "output", babl_format ("RGBA float"));
}
static GeglRectangle
@@ -99,14 +102,10 @@ bilateral_process (GeglOperation *operation,
{
GeglChantO *o = GEGL_CHANT_PROPERTIES (operation);
- if (o->s_sigma < 1)
- {
- gegl_buffer_copy (input, result, output, result);
- }
- else
- {
- bilateral_filter (input, result, output, result, o->s_sigma, o->r_sigma/100);
- }
+ if (gegl_cl_is_accelerated () && bilateral_cl_process (operation, input, output, result, o->s_sigma,
o->r_sigma/100))
+ return TRUE;
+
+ bilateral_filter (input, result, output, result, o->s_sigma, o->r_sigma/100);
return TRUE;
}
@@ -126,15 +125,12 @@ bilateral_filter (GeglBuffer *src,
const gint width = src_rect->width;
const gint height = src_rect->height;
- const gint channels = 3;
+ const gint channels = 4;
const gint sw = (width -1) / s_sigma + 1 + (2 * padding_xy);
const gint sh = (height-1) / s_sigma + 1 + (2 * padding_xy);
const gint depth = (int)(1.0f / r_sigma) + 1 + (2 * padding_z);
- gfloat input_min[3] = { FLT_MAX, FLT_MAX, FLT_MAX};
- gfloat input_max[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
-
/* down-sampling */
gfloat *grid = g_new0 (gfloat, sw * sh * depth * channels * 2);
@@ -152,7 +148,7 @@ bilateral_filter (GeglBuffer *src,
#define BLURY(x,y,z,c,i) blury[i+2*(c+channels*(x+sw*(y+z*sh)))]
#define BLURZ(x,y,z,c,i) blurz[i+2*(c+channels*(x+sw*(y+z*sh)))]
- gegl_buffer_get (src, src_rect, 1.0, babl_format ("RGB float"), input, GEGL_AUTO_ROWSTRIDE,
+ gegl_buffer_get (src, src_rect, 1.0, babl_format ("RGBA float"), input, GEGL_AUTO_ROWSTRIDE,
GEGL_ABYSS_NONE);
for (k=0; k < (sw * sh * depth * channels * 2); k++)
@@ -166,6 +162,9 @@ bilateral_filter (GeglBuffer *src,
#if 0
/* in case we want to normalize the color space */
+ gfloat input_min[4] = { FLT_MAX, FLT_MAX, FLT_MAX};
+ gfloat input_max[4] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
for(y = 0; y < height; y++)
for(x = 0; x < width; x++)
for(c = 0; c < channels; c++)
@@ -183,9 +182,9 @@ bilateral_filter (GeglBuffer *src,
{
const float z = INPUT(x,y,c); // - input_min[c];
- const int small_x = (int)(x / s_sigma + 0.5f) + padding_xy;
- const int small_y = (int)(y / s_sigma + 0.5f) + padding_xy;
- const int small_z = (int)(z / r_sigma + 0.5f) + padding_z;
+ const int small_x = (int)((float)(x) / s_sigma + 0.5f) + padding_xy;
+ const int small_y = (int)((float)(y) / s_sigma + 0.5f) + padding_xy;
+ const int small_z = (int)((float)(z) / r_sigma + 0.5f) + padding_z;
g_assert(small_x >= 0 && small_x < sw);
g_assert(small_y >= 0 && small_y < sh);
@@ -229,13 +228,13 @@ bilateral_filter (GeglBuffer *src,
float yf = (float)(y) / s_sigma + padding_xy;
float zf = INPUT(x,y,c) / r_sigma + padding_z;
- int x1 = clamp((int)xf, 0, sw-1);
- int y1 = clamp((int)yf, 0, sh-1);
- int z1 = clamp((int)zf, 0, depth-1);
+ int x1 = CLAMP((int)xf, 0, sw-1);
+ int y1 = CLAMP((int)yf, 0, sh-1);
+ int z1 = CLAMP((int)zf, 0, depth-1);
- int x2 = clamp(x1+1, 0, sw-1);
- int y2 = clamp(y1+1, 0, sh-1);
- int z2 = clamp(z1+1, 0, depth-1);
+ int x2 = CLAMP(x1+1, 0, sw-1);
+ int y2 = CLAMP(y1+1, 0, sh-1);
+ int z2 = CLAMP(z1+1, 0, depth-1);
float x_alpha = xf - x1;
float y_alpha = yf - y1;
@@ -254,10 +253,10 @@ bilateral_filter (GeglBuffer *src,
lerp(lerp(BLURZ(x1, y1, z2, c, i), BLURZ(x2, y1, z2, c, i), x_alpha),
lerp(BLURZ(x1, y2, z2, c, i), BLURZ(x2, y2, z2, c, i), x_alpha), y_alpha), z_alpha);
- smoothed[3*(y*width+x)+c] = interpolated[0] / interpolated[1];
+ smoothed[channels*(y*width+x)+c] = interpolated[0] / interpolated[1];
}
- gegl_buffer_set (dst, dst_rect, 0, babl_format ("RGB float"), smoothed,
+ gegl_buffer_set (dst, dst_rect, 0, babl_format ("RGBA float"), smoothed,
GEGL_AUTO_ROWSTRIDE);
g_free (grid);
@@ -268,6 +267,228 @@ bilateral_filter (GeglBuffer *src,
g_free (smoothed);
}
+#include "opencl/gegl-cl.h"
+#include "buffer/gegl-buffer-cl-iterator.h"
+#include "opencl/bilateral-filter-fast.cl.h"
+
+GEGL_CL_STATIC;
+
+static gboolean
+cl_bilateral (cl_mem in_tex,
+ cl_mem out_tex,
+ const GeglRectangle *roi,
+ const GeglRectangle *src_rect,
+ gint s_sigma,
+ gfloat r_sigma)
+{
+ cl_int cl_err = 0;
+
+ gint c;
+
+ const gint width = src_rect->width;
+ const gint height = src_rect->height;
+
+ const gint sw = (width -1) / s_sigma + 1;
+ const gint sh = (height-1) / s_sigma + 1;
+ const gint depth = (int)(1.0f / r_sigma) + 1;
+
+ size_t global_ws[2];
+ size_t local_ws[2];
+
+ cl_mem grid = NULL;
+ cl_mem blur[4] = {NULL, NULL, NULL, NULL};
+ cl_mem blur_tex[4] = {NULL, NULL, NULL, NULL};
+
+ cl_image_format format = {CL_RG, CL_FLOAT};
+
+ GEGL_CL_BUILD(bilateral_filter_fast,
+ "bilateral_init",
+ "bilateral_downsample",
+ "bilateral_blur",
+ "bilateral_interpolate")
+
+ grid = gegl_clCreateBuffer (gegl_cl_get_context (),
+ CL_MEM_READ_WRITE,
+ sw * sh * depth * sizeof(cl_float8),
+ NULL,
+ &cl_err);
+ CL_CHECK;
+
+ for(c = 0; c < 4; c++)
+ {
+ blur[c] = gegl_clCreateBuffer (gegl_cl_get_context (),
+ CL_MEM_WRITE_ONLY,
+ sw * sh * depth * sizeof(cl_float2),
+ NULL, &cl_err);
+ CL_CHECK;
+
+ blur_tex[c] = gegl_clCreateImage3D (gegl_cl_get_context (),
+ CL_MEM_READ_ONLY,
+ &format,
+ sw, sh, depth,
+ 0, 0, NULL, &cl_err);
+ CL_CHECK;
+ }
+
+ {
+ global_ws[0] = sw;
+ global_ws[1] = sh;
+
+ GEGL_CL_ARG_START(cl_data->kernel[0])
+ GEGL_CL_ARG(cl_mem, grid)
+ GEGL_CL_ARG(cl_int, sw)
+ GEGL_CL_ARG(cl_int, sh)
+ GEGL_CL_ARG(cl_int, depth)
+ GEGL_CL_ARG_END
+
+ cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+ cl_data->kernel[0], 2,
+ NULL, global_ws, NULL,
+ 0, NULL, NULL);
+ CL_CHECK;
+ }
+
+
+ {
+ global_ws[0] = sw;
+ global_ws[1] = sh;
+
+ GEGL_CL_ARG_START(cl_data->kernel[1])
+ GEGL_CL_ARG(cl_mem, in_tex)
+ GEGL_CL_ARG(cl_mem, grid)
+ GEGL_CL_ARG(cl_int, width)
+ GEGL_CL_ARG(cl_int, height)
+ GEGL_CL_ARG(cl_int, sw)
+ GEGL_CL_ARG(cl_int, sh)
+ GEGL_CL_ARG(cl_int, s_sigma)
+ GEGL_CL_ARG(cl_float, r_sigma)
+ GEGL_CL_ARG_END
+
+ cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+ cl_data->kernel[1], 2,
+ NULL, global_ws, NULL,
+ 0, NULL, NULL);
+ CL_CHECK;
+ }
+
+ cl_err = gegl_clFinish(gegl_cl_get_command_queue ());
+ CL_CHECK;
+
+ {
+ local_ws[0] = 16;
+ local_ws[1] = 16;
+
+ global_ws[0] = ((sw + 15)/16)*16;
+ global_ws[1] = ((sh + 15)/16)*16;
+
+ GEGL_CL_ARG_START(cl_data->kernel[2])
+ GEGL_CL_ARG(cl_mem, grid)
+ GEGL_CL_ARG(cl_mem, blur[0])
+ GEGL_CL_ARG(cl_mem, blur[1])
+ GEGL_CL_ARG(cl_mem, blur[2])
+ GEGL_CL_ARG(cl_mem, blur[3])
+ GEGL_CL_ARG(cl_int, sw)
+ GEGL_CL_ARG(cl_int, sh)
+ GEGL_CL_ARG(cl_int, depth)
+ GEGL_CL_ARG_END
+
+ cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+ cl_data->kernel[2], 2,
+ NULL, global_ws, local_ws,
+ 0, NULL, NULL);
+ CL_CHECK;
+ }
+
+ cl_err = gegl_clFinish(gegl_cl_get_command_queue ());
+ CL_CHECK;
+
+ for(c = 0; c < 4; c++)
+ {
+ const size_t dst_origin[3] = {0, 0, 0};
+ const size_t dst_region[3] = {sw, sh, depth};
+
+ cl_err = gegl_clEnqueueCopyBufferToImage (gegl_cl_get_command_queue (),
+ blur[c],
+ blur_tex[c],
+ 0, dst_origin, dst_region,
+ 0, NULL, NULL);
+ CL_CHECK;
+ }
+
+ cl_err = gegl_clFinish(gegl_cl_get_command_queue ());
+ CL_CHECK;
+
+ {
+ global_ws[0] = width;
+ global_ws[1] = height;
+
+ GEGL_CL_ARG_START(cl_data->kernel[3])
+ GEGL_CL_ARG(cl_mem, in_tex)
+ GEGL_CL_ARG(cl_mem, blur_tex[0])
+ GEGL_CL_ARG(cl_mem, blur_tex[1])
+ GEGL_CL_ARG(cl_mem, blur_tex[2])
+ GEGL_CL_ARG(cl_mem, blur_tex[3])
+ GEGL_CL_ARG(cl_mem, out_tex)
+ GEGL_CL_ARG(cl_int, width)
+ GEGL_CL_ARG(cl_int, s_sigma)
+ GEGL_CL_ARG(cl_float, r_sigma)
+ GEGL_CL_ARG_END
+
+ cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+ cl_data->kernel[3], 2,
+ NULL, global_ws, NULL,
+ 0, NULL, NULL);
+ CL_CHECK;
+ }
+
+ cl_err = gegl_clFinish(gegl_cl_get_command_queue ());
+ CL_CHECK;
+
+ GEGL_CL_RELEASE(grid);
+
+ for(c = 0; c < 4; c++)
+ {
+ GEGL_CL_RELEASE(blur[c]);
+ GEGL_CL_RELEASE(blur_tex[c]);
+ }
+
+ return FALSE;
+
+error:
+ return TRUE;
+}
+
+static gboolean
+bilateral_cl_process (GeglOperation *operation,
+ GeglBuffer *input,
+ GeglBuffer *output,
+ const GeglRectangle *result,
+ gint s_sigma,
+ gfloat r_sigma)
+{
+ const Babl *in_format = gegl_operation_get_format (operation, "input");
+ const Babl *out_format = gegl_operation_get_format (operation, "output");
+ gint err;
+ gint j;
+ cl_int cl_err;
+
+ GeglBufferClIterator *i = gegl_buffer_cl_iterator_new (output, result, out_format, GEGL_CL_BUFFER_WRITE);
+ gint read = gegl_buffer_cl_iterator_add (i, input, result, in_format, GEGL_CL_BUFFER_READ,
GEGL_ABYSS_NONE);
+
+ GEGL_CL_BUFFER_ITERATE_START(i, j, err)
+ {
+ err = cl_bilateral(i->tex[read][j],
+ i->tex[0][j],
+ &i->roi[0][j],
+ &i->roi[read][j],
+ s_sigma,
+ r_sigma);
+ }
+ GEGL_CL_BUFFER_ITERATE_END(err)
+
+ return TRUE;
+}
+
static void
gegl_chant_class_init (GeglChantClass *klass)
{
@@ -283,6 +504,8 @@ gegl_chant_class_init (GeglChantClass *klass)
operation_class->get_required_for_output = bilateral_get_required_for_output;
operation_class->get_cached_region = bilateral_get_cached_region;
+ operation_class->opencl_support = TRUE;
+
gegl_operation_class_set_keys (operation_class,
"name" , "gegl:bilateral-filter-fast",
"categories" , "tonemapping",
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]