[gegl] opencl: Fast approximation of the bilateral filter in opencl.



commit a34f6fd2393d0fc1b4d936fdfe6da7fce47929b7
Author: Victor Oliveira <victormatheus gmail com>
Date:   Tue Mar 12 16:58:24 2013 -0300

    opencl: Fast approximation of the bilateral filter in opencl.

 gegl/opencl/gegl-cl-init.c                |    1 +
 gegl/opencl/gegl-cl-types.h               |    1 +
 gegl/opencl/gegl-cl.h                     |   46 +++++-
 opencl/bilateral-filter-fast.cl           |  218 ++++++++++++++++++++++
 opencl/bilateral-filter-fast.cl.h         |  220 ++++++++++++++++++++++
 operations/common/bilateral-filter-fast.c |  285 +++++++++++++++++++++++++---
 6 files changed, 738 insertions(+), 33 deletions(-)
---
diff --git a/gegl/opencl/gegl-cl-init.c b/gegl/opencl/gegl-cl-init.c
index f97c9cc..58d04c3 100644
--- a/gegl/opencl/gegl-cl-init.c
+++ b/gegl/opencl/gegl-cl-init.c
@@ -269,6 +269,7 @@ gegl_cl_init (GError **error)
       CL_LOAD_FUNCTION (clEnqueueWriteBufferRect)
       CL_LOAD_FUNCTION (clEnqueueCopyBufferRect)
       CL_LOAD_FUNCTION (clCreateImage2D)
+      CL_LOAD_FUNCTION (clCreateImage3D)
       CL_LOAD_FUNCTION (clEnqueueWriteImage)
       CL_LOAD_FUNCTION (clEnqueueReadImage)
       CL_LOAD_FUNCTION (clEnqueueCopyImage)
diff --git a/gegl/opencl/gegl-cl-types.h b/gegl/opencl/gegl-cl-types.h
index 7617482..ae83aa1 100644
--- a/gegl/opencl/gegl-cl-types.h
+++ b/gegl/opencl/gegl-cl-types.h
@@ -66,6 +66,7 @@ typedef CL_API_ENTRY cl_int            (CL_API_CALL *t_clEnqueueReadBufferRect
 typedef CL_API_ENTRY cl_int            (CL_API_CALL *t_clEnqueueWriteBufferRect ) (cl_command_queue, cl_mem, 
cl_bool, const size_t [3], const size_t [3], const size_t [3], size_t, size_t, size_t, size_t, void *, 
cl_uint, const cl_event *, cl_event *);
 typedef CL_API_ENTRY cl_int            (CL_API_CALL *t_clEnqueueCopyBufferRect  ) (cl_command_queue, cl_mem, 
cl_mem, const size_t [3], const size_t [3], const size_t [3], size_t, size_t, size_t, size_t, cl_uint, const 
cl_event *, cl_event *);
 typedef CL_API_ENTRY cl_mem            (CL_API_CALL *t_clCreateImage2D          ) (cl_context, cl_mem_flags, 
const cl_image_format *, size_t, size_t, size_t, void *, cl_int *);
+typedef CL_API_ENTRY cl_mem            (CL_API_CALL *t_clCreateImage3D          ) (cl_context, cl_mem_flags, 
const cl_image_format *, size_t, size_t, size_t, size_t, size_t, void *, cl_int *);
 typedef CL_API_ENTRY cl_int            (CL_API_CALL *t_clEnqueueReadImage       ) (cl_command_queue, cl_mem, 
cl_bool, const size_t [3], const size_t [3], size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
 typedef CL_API_ENTRY cl_int            (CL_API_CALL *t_clEnqueueWriteImage      ) (cl_command_queue, cl_mem, 
cl_bool, const size_t [3], const size_t [3], size_t, size_t, const void *, cl_uint, const cl_event *, 
cl_event *);
 typedef CL_API_ENTRY cl_int            (CL_API_CALL *t_clEnqueueCopyImage       ) (cl_command_queue, cl_mem, 
cl_mem, const size_t [3], const size_t [3], const size_t [3], cl_uint, const cl_event *, cl_event *);
diff --git a/gegl/opencl/gegl-cl.h b/gegl/opencl/gegl-cl.h
index 926f38c..ab67230 100644
--- a/gegl/opencl/gegl-cl.h
+++ b/gegl/opencl/gegl-cl.h
@@ -26,15 +26,57 @@
 #ifdef __GEGL_DEBUG_H__
 
 #define CL_ERROR {GEGL_NOTE (GEGL_DEBUG_OPENCL, "Error in %s:%d %s - %s\n", __FILE__, __LINE__, __func__, 
gegl_cl_errstring(cl_err)); goto error;}
-#define CL_CHECK {if (cl_err != CL_SUCCESS) CL_ERROR;}
 
 #else
 
 /* public header for gegl ops */
 
 #define CL_ERROR {g_warning("Error in %s:%d %s - %s\n", __FILE__, __LINE__, __func__, 
gegl_cl_errstring(cl_err)); goto error;}
-#define CL_CHECK {if (cl_err != CL_SUCCESS) CL_ERROR;}
 
 #endif
 
+#define CL_CHECK {if (cl_err != CL_SUCCESS) CL_ERROR;}
+
+#define GEGL_CL_ARG_START(KERNEL) \
+  { cl_kernel __mykernel=KERNEL; int __p = 0;
+
+#define GEGL_CL_ARG(TYPE, NAME) \
+  { cl_err = gegl_clSetKernelArg(__mykernel, __p++, sizeof(TYPE), (void*)& NAME); \
+    CL_CHECK; }
+
+#define GEGL_CL_ARG_END \
+  __p = -1; }
+
+#define GEGL_CL_RELEASE(obj)               \
+  { cl_err = gegl_clReleaseMemObject(obj); \
+    CL_CHECK; }
+
+#define GEGL_CL_BUFFER_ITERATE_START(I, J, ERR)      \
+  while (gegl_buffer_cl_iterator_next (I, & ERR)) \
+    {                                                \
+      if (ERR) return FALSE;                         \
+      for (J=0; J < I ->n; J++)                      \
+        {
+
+#define GEGL_CL_BUFFER_ITERATE_END(ERR)   \
+          if (ERR)                        \
+           {                              \
+             g_warning("[OpenCL] Error"); \
+             return FALSE;                \
+           }                              \
+        }                                 \
+    }
+
+
+#define GEGL_CL_BUILD(NAME, ...)                                            \
+  if (!cl_data)                                                             \
+    {                                                                       \
+      const char *kernel_name[] ={__VA_ARGS__ , NULL};                      \
+      cl_data = gegl_cl_compile_and_build(NAME ## _cl_source, kernel_name); \
+    }                                                                       \
+  if (!cl_data) return TRUE;
+
+#define GEGL_CL_STATIC \
+  static GeglClRunData *cl_data = NULL;
+
 #endif
diff --git a/opencl/bilateral-filter-fast.cl b/opencl/bilateral-filter-fast.cl
new file mode 100644
index 0000000..56eca2e
--- /dev/null
+++ b/opencl/bilateral-filter-fast.cl
@@ -0,0 +1,218 @@
+/* This file is part of GEGL
+ *
+ * GEGL is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * GEGL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GEGL; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright 2013 Victor Oliveira (victormatheus gmail com)
+ */
+
+#define GRID(x,y,z) grid[x+sw*(y + z * sh)]
+
+__kernel void bilateral_init(__global float8 *grid,
+                             int sw,
+                             int sh,
+                             int depth)
+{
+  const int gid_x = get_global_id(0);
+  const int gid_y = get_global_id(1);
+
+  for (int d=0; d<depth; d++)
+    {
+      GRID(gid_x,gid_y,d) = (float8)(0.0f);
+    }
+}
+
+__kernel void bilateral_downsample(__global const float4 *input,
+                                   __global       float2 *grid,
+                                   int width,
+                                   int height,
+                                   int sw,
+                                   int sh,
+                                   int   s_sigma,
+                                   float r_sigma)
+{
+  const int gid_x = get_global_id(0);
+  const int gid_y = get_global_id(1);
+
+  for (int ry=0; ry < s_sigma; ry++)
+    for (int rx=0; rx < s_sigma; rx++)
+      {
+        const int x = clamp(gid_x * s_sigma - s_sigma/2 + rx, 0, width -1);
+        const int y = clamp(gid_y * s_sigma - s_sigma/2 + ry, 0, height-1);
+
+        const float4 val = input[y * width + x];
+
+        const int4 z = convert_int4(val * (1.0f/r_sigma) + 0.5f);
+
+        grid[4*(gid_x+sw*(gid_y + z.x * sh))+0] += (float2)(val.x, 1.0f);
+        grid[4*(gid_x+sw*(gid_y + z.y * sh))+1] += (float2)(val.y, 1.0f);
+        grid[4*(gid_x+sw*(gid_y + z.z * sh))+2] += (float2)(val.z, 1.0f);
+        grid[4*(gid_x+sw*(gid_y + z.w * sh))+3] += (float2)(val.w, 1.0f);
+
+        barrier (CLK_GLOBAL_MEM_FENCE);
+      }
+}
+
+#define LOCAL_W 16
+#define LOCAL_H 16
+
+__attribute__((reqd_work_group_size(16, 16, 1)))
+__kernel void bilateral_blur(__global const float8 *grid,
+                             __global       float2 *blurz_r,
+                             __global       float2 *blurz_g,
+                             __global       float2 *blurz_b,
+                             __global       float2 *blurz_a,
+                             int sw,
+                             int sh,
+                             int depth)
+{
+  __local float8 img1[LOCAL_H+2][LOCAL_W+2];
+  __local float8 img2[LOCAL_H+2][LOCAL_W+2];
+
+  const int gid_x = get_global_id(0);
+  const int gid_y = get_global_id(1);
+
+  const int lx = get_local_id(0);
+  const int ly = get_local_id(1);
+
+  float8 vpp = (float8)(0.0f);
+  float8 vp  = (float8)(0.0f);
+  float8 v   = (float8)(0.0f);
+
+  float8 k;
+
+  int x  = clamp(gid_x, 0, sw-1);
+  int y  = clamp(gid_y, 0, sw-1);
+
+  for (int d=0; d<depth; d++)
+    {
+      int xp = max(x-1, 0);
+      int xn = min(x+1, sw-1);
+
+      int yp = max(y-1, 0);
+      int yn = min(y+1, sh-1);
+
+      int zp = max(d-1, 0);
+      int zn = min(d+1, depth-1);
+
+      /* the corners are not going to be used, don't bother to load them */
+
+      if (ly == 0) {
+        k = GRID(x, yp, d);
+        img1[0][lx+1] = k;
+        img2[0][lx+1] = k;
+      }
+
+      if (ly == LOCAL_H-1) {
+        k = GRID(x, yn, d);
+        img1[LOCAL_H+1][lx+1] = k;
+        img2[LOCAL_H+1][lx+1] = k;
+      }
+
+      if (lx == 0) {
+        k = GRID(xp, y, d);
+        img1[ly+1][0] = k;
+        img2[ly+1][0] = k;
+      }
+
+      if (lx == LOCAL_W-1) {
+        k = GRID(xn, y, d);
+        img1[ly+1][LOCAL_W+1] = k;
+        img2[ly+1][LOCAL_W+1] = k;
+      }
+
+      img1[ly+1][lx+1] = GRID(x, y, d);
+
+      barrier (CLK_LOCAL_MEM_FENCE);
+
+      /* blur x */
+
+      img2[ly+1][lx+1] = (img1[ly+1][lx] + 2.0f * img1[ly+1][lx+1] + img1[ly+1][lx+2]) / 4.0f;
+
+      barrier (CLK_LOCAL_MEM_FENCE);
+
+      /* blur y */
+
+      v = (img2[ly][lx+1] + 2.0f * img2[ly+1][lx+1] + img2[ly+2][lx+1]) / 4.0f;
+
+      /* last three v values */
+
+      if (d == 0)
+        {
+          /* this is like CLAMP */
+          vpp = img1[ly+1][lx+1];
+          vp  = img1[ly+1][lx+1];
+        }
+      else
+        {
+          vpp = vp;
+          vp  = v;
+
+          float8 blurred = (vpp + 2.0f * vp + v) / 4.0f;
+
+          /* XXX: OpenCL 1.1 doesn't support writes to 3D textures */
+
+          if (gid_x < sw && gid_y < sh)
+            {
+              blurz_r[x+sw*(y+sh*(d-1))] = blurred.s01;
+              blurz_g[x+sw*(y+sh*(d-1))] = blurred.s23;
+              blurz_b[x+sw*(y+sh*(d-1))] = blurred.s45;
+              blurz_a[x+sw*(y+sh*(d-1))] = blurred.s67;
+            }
+        }
+    }
+
+  /* last z */
+
+  vpp = vp;
+  vp  = v;
+
+  float8 blurred = (vpp + 2.0f * vp + v) / 4.0f;
+
+  if (gid_x < sw && gid_y < sh)
+    {
+      blurz_r[x+sw*(y+sh*(depth-1))] = blurred.s01;
+      blurz_g[x+sw*(y+sh*(depth-1))] = blurred.s23;
+      blurz_b[x+sw*(y+sh*(depth-1))] = blurred.s45;
+      blurz_a[x+sw*(y+sh*(depth-1))] = blurred.s67;
+    }
+}
+
+const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+
+__kernel void bilateral_interpolate(__global    const float4    *input,
+                                    __read_only       image3d_t  blurz_r,
+                                    __read_only       image3d_t  blurz_g,
+                                    __read_only       image3d_t  blurz_b,
+                                    __read_only       image3d_t  blurz_a,
+                                    __global          float4    *smoothed,
+                                    int   width,
+                                    int   s_sigma,
+                                    float r_sigma)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+
+  const float  xf = (float)(x) / s_sigma;
+  const float  yf = (float)(y) / s_sigma;
+  const float4 zf = input[y * width + x] / r_sigma;
+
+  float8 val;
+
+  val.s04 = (read_imagef (blurz_r, sampler, (float4)(xf, yf, zf.x, 0.0f))).xy;
+  val.s15 = (read_imagef (blurz_g, sampler, (float4)(xf, yf, zf.y, 0.0f))).xy;
+  val.s26 = (read_imagef (blurz_b, sampler, (float4)(xf, yf, zf.z, 0.0f))).xy;
+  val.s37 = (read_imagef (blurz_a, sampler, (float4)(xf, yf, zf.w, 0.0f))).xy;
+
+  smoothed[y * width + x] = val.s0123/val.s4567;
+}
diff --git a/opencl/bilateral-filter-fast.cl.h b/opencl/bilateral-filter-fast.cl.h
new file mode 100644
index 0000000..7648245
--- /dev/null
+++ b/opencl/bilateral-filter-fast.cl.h
@@ -0,0 +1,220 @@
+static const char* bilateral_filter_fast_cl_source =
+"/* This file is part of GEGL                                                  \n"
+" *                                                                            \n"
+" * GEGL is free software; you can redistribute it and/or                      \n"
+" * modify it under the terms of the GNU Lesser General Public                 \n"
+" * License as published by the Free Software Foundation; either               \n"
+" * version 3 of the License, or (at your option) any later version.           \n"
+" *                                                                            \n"
+" * GEGL is distributed in the hope that it will be useful,                    \n"
+" * but WITHOUT ANY WARRANTY; without even the implied warranty of             \n"
+" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU          \n"
+" * Lesser General Public License for more details.                            \n"
+" *                                                                            \n"
+" * You should have received a copy of the GNU Lesser General Public           \n"
+" * License along with GEGL; if not, see <http://www.gnu.org/licenses/>.       \n"
+" *                                                                            \n"
+" * Copyright 2013 Victor Oliveira (victormatheus gmail com)                   \n"
+" */                                                                           \n"
+"                                                                              \n"
+"#define GRID(x,y,z) grid[x+sw*(y + z * sh)]                                   \n"
+"                                                                              \n"
+"__kernel void bilateral_init(__global float8 *grid,                           \n"
+"                             int sw,                                          \n"
+"                             int sh,                                          \n"
+"                             int depth)                                       \n"
+"{                                                                             \n"
+"  const int gid_x = get_global_id(0);                                         \n"
+"  const int gid_y = get_global_id(1);                                         \n"
+"                                                                              \n"
+"  for (int d=0; d<depth; d++)                                                 \n"
+"    {                                                                         \n"
+"      GRID(gid_x,gid_y,d) = (float8)(0.0f);                                   \n"
+"    }                                                                         \n"
+"}                                                                             \n"
+"                                                                              \n"
+"__kernel void bilateral_downsample(__global const float4 *input,              \n"
+"                                   __global       float2 *grid,               \n"
+"                                   int width,                                 \n"
+"                                   int height,                                \n"
+"                                   int sw,                                    \n"
+"                                   int sh,                                    \n"
+"                                   int   s_sigma,                             \n"
+"                                   float r_sigma)                             \n"
+"{                                                                             \n"
+"  const int gid_x = get_global_id(0);                                         \n"
+"  const int gid_y = get_global_id(1);                                         \n"
+"                                                                              \n"
+"  for (int ry=0; ry < s_sigma; ry++)                                          \n"
+"    for (int rx=0; rx < s_sigma; rx++)                                        \n"
+"      {                                                                       \n"
+"        const int x = clamp(gid_x * s_sigma - s_sigma/2 + rx, 0, width -1);   \n"
+"        const int y = clamp(gid_y * s_sigma - s_sigma/2 + ry, 0, height-1);   \n"
+"                                                                              \n"
+"        const float4 val = input[y * width + x];                              \n"
+"                                                                              \n"
+"        const int4 z = convert_int4(val * (1.0f/r_sigma) + 0.5f);             \n"
+"                                                                              \n"
+"        grid[4*(gid_x+sw*(gid_y + z.x * sh))+0] += (float2)(val.x, 1.0f);     \n"
+"        grid[4*(gid_x+sw*(gid_y + z.y * sh))+1] += (float2)(val.y, 1.0f);     \n"
+"        grid[4*(gid_x+sw*(gid_y + z.z * sh))+2] += (float2)(val.z, 1.0f);     \n"
+"        grid[4*(gid_x+sw*(gid_y + z.w * sh))+3] += (float2)(val.w, 1.0f);     \n"
+"                                                                              \n"
+"        barrier (CLK_GLOBAL_MEM_FENCE);                                       \n"
+"      }                                                                       \n"
+"}                                                                             \n"
+"                                                                              \n"
+"#define LOCAL_W 16                                                            \n"
+"#define LOCAL_H 16                                                            \n"
+"                                                                              \n"
+"__attribute__((reqd_work_group_size(16, 16, 1)))                              \n"
+"__kernel void bilateral_blur(__global const float8 *grid,                     \n"
+"                             __global       float2 *blurz_r,                  \n"
+"                             __global       float2 *blurz_g,                  \n"
+"                             __global       float2 *blurz_b,                  \n"
+"                             __global       float2 *blurz_a,                  \n"
+"                             int sw,                                          \n"
+"                             int sh,                                          \n"
+"                             int depth)                                       \n"
+"{                                                                             \n"
+"  __local float8 img1[LOCAL_H+2][LOCAL_W+2];                                  \n"
+"  __local float8 img2[LOCAL_H+2][LOCAL_W+2];                                  \n"
+"                                                                              \n"
+"  const int gid_x = get_global_id(0);                                         \n"
+"  const int gid_y = get_global_id(1);                                         \n"
+"                                                                              \n"
+"  const int lx = get_local_id(0);                                             \n"
+"  const int ly = get_local_id(1);                                             \n"
+"                                                                              \n"
+"  float8 vpp = (float8)(0.0f);                                                \n"
+"  float8 vp  = (float8)(0.0f);                                                \n"
+"  float8 v   = (float8)(0.0f);                                                \n"
+"                                                                              \n"
+"  float8 k;                                                                   \n"
+"                                                                              \n"
+"  int x  = clamp(gid_x, 0, sw-1);                                             \n"
+"  int y  = clamp(gid_y, 0, sw-1);                                             \n"
+"                                                                              \n"
+"  for (int d=0; d<depth; d++)                                                 \n"
+"    {                                                                         \n"
+"      int xp = max(x-1, 0);                                                   \n"
+"      int xn = min(x+1, sw-1);                                                \n"
+"                                                                              \n"
+"      int yp = max(y-1, 0);                                                   \n"
+"      int yn = min(y+1, sh-1);                                                \n"
+"                                                                              \n"
+"      int zp = max(d-1, 0);                                                   \n"
+"      int zn = min(d+1, depth-1);                                             \n"
+"                                                                              \n"
+"      /* the corners are not going to be used, don't bother to load them */   \n"
+"                                                                              \n"
+"      if (ly == 0) {                                                          \n"
+"        k = GRID(x, yp, d);                                                   \n"
+"        img1[0][lx+1] = k;                                                    \n"
+"        img2[0][lx+1] = k;                                                    \n"
+"      }                                                                       \n"
+"                                                                              \n"
+"      if (ly == LOCAL_H-1) {                                                  \n"
+"        k = GRID(x, yn, d);                                                   \n"
+"        img1[LOCAL_H+1][lx+1] = k;                                            \n"
+"        img2[LOCAL_H+1][lx+1] = k;                                            \n"
+"      }                                                                       \n"
+"                                                                              \n"
+"      if (lx == 0) {                                                          \n"
+"        k = GRID(xp, y, d);                                                   \n"
+"        img1[ly+1][0] = k;                                                    \n"
+"        img2[ly+1][0] = k;                                                    \n"
+"      }                                                                       \n"
+"                                                                              \n"
+"      if (lx == LOCAL_W-1) {                                                  \n"
+"        k = GRID(xn, y, d);                                                   \n"
+"        img1[ly+1][LOCAL_W+1] = k;                                            \n"
+"        img2[ly+1][LOCAL_W+1] = k;                                            \n"
+"      }                                                                       \n"
+"                                                                              \n"
+"      img1[ly+1][lx+1] = GRID(x, y, d);                                       \n"
+"                                                                              \n"
+"      barrier (CLK_LOCAL_MEM_FENCE);                                          \n"
+"                                                                              \n"
+"      /* blur x */                                                            \n"
+"                                                                              \n"
+"      img2[ly+1][lx+1] = (img1[ly+1][lx] + 2.0f * img1[ly+1][lx+1] + img1[ly+1][lx+2]) / 4.0f;\n"
+"                                                                              \n"
+"      barrier (CLK_LOCAL_MEM_FENCE);                                          \n"
+"                                                                              \n"
+"      /* blur y */                                                            \n"
+"                                                                              \n"
+"      v = (img2[ly][lx+1] + 2.0f * img2[ly+1][lx+1] + img2[ly+2][lx+1]) / 4.0f;\n"
+"                                                                              \n"
+"      /* last three v values */                                               \n"
+"                                                                              \n"
+"      if (d == 0)                                                             \n"
+"        {                                                                     \n"
+"          /* this is like CLAMP */                                            \n"
+"          vpp = img1[ly+1][lx+1];                                             \n"
+"          vp  = img1[ly+1][lx+1];                                             \n"
+"        }                                                                     \n"
+"      else                                                                    \n"
+"        {                                                                     \n"
+"          vpp = vp;                                                           \n"
+"          vp  = v;                                                            \n"
+"                                                                              \n"
+"          float8 blurred = (vpp + 2.0f * vp + v) / 4.0f;                      \n"
+"                                                                              \n"
+"          /* XXX: OpenCL 1.1 doesn't support writes to 3D textures */         \n"
+"                                                                              \n"
+"          if (gid_x < sw && gid_y < sh)                                       \n"
+"            {                                                                 \n"
+"              blurz_r[x+sw*(y+sh*(d-1))] = blurred.s01;                       \n"
+"              blurz_g[x+sw*(y+sh*(d-1))] = blurred.s23;                       \n"
+"              blurz_b[x+sw*(y+sh*(d-1))] = blurred.s45;                       \n"
+"              blurz_a[x+sw*(y+sh*(d-1))] = blurred.s67;                       \n"
+"            }                                                                 \n"
+"        }                                                                     \n"
+"    }                                                                         \n"
+"                                                                              \n"
+"  /* last z */                                                                \n"
+"                                                                              \n"
+"  vpp = vp;                                                                   \n"
+"  vp  = v;                                                                    \n"
+"                                                                              \n"
+"  float8 blurred = (vpp + 2.0f * vp + v) / 4.0f;                              \n"
+"                                                                              \n"
+"  if (gid_x < sw && gid_y < sh)                                               \n"
+"    {                                                                         \n"
+"      blurz_r[x+sw*(y+sh*(depth-1))] = blurred.s01;                           \n"
+"      blurz_g[x+sw*(y+sh*(depth-1))] = blurred.s23;                           \n"
+"      blurz_b[x+sw*(y+sh*(depth-1))] = blurred.s45;                           \n"
+"      blurz_a[x+sw*(y+sh*(depth-1))] = blurred.s67;                           \n"
+"    }                                                                         \n"
+"}                                                                             \n"
+"                                                                              \n"
+"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
+"                                                                              \n"
+"__kernel void bilateral_interpolate(__global    const float4    *input,       \n"
+"                                    __read_only       image3d_t  blurz_r,     \n"
+"                                    __read_only       image3d_t  blurz_g,     \n"
+"                                    __read_only       image3d_t  blurz_b,     \n"
+"                                    __read_only       image3d_t  blurz_a,     \n"
+"                                    __global          float4    *smoothed,    \n"
+"                                    int   width,                              \n"
+"                                    int   s_sigma,                            \n"
+"                                    float r_sigma)                            \n"
+"{                                                                             \n"
+"  const int x = get_global_id(0);                                             \n"
+"  const int y = get_global_id(1);                                             \n"
+"                                                                              \n"
+"  const float  xf = (float)(x) / s_sigma;                                     \n"
+"  const float  yf = (float)(y) / s_sigma;                                     \n"
+"  const float4 zf = input[y * width + x] / r_sigma;                           \n"
+"                                                                              \n"
+"  float8 val;                                                                 \n"
+"                                                                              \n"
+"  val.s04 = (read_imagef (blurz_r, sampler, (float4)(xf, yf, zf.x, 0.0f))).xy;\n"
+"  val.s15 = (read_imagef (blurz_g, sampler, (float4)(xf, yf, zf.y, 0.0f))).xy;\n"
+"  val.s26 = (read_imagef (blurz_b, sampler, (float4)(xf, yf, zf.z, 0.0f))).xy;\n"
+"  val.s37 = (read_imagef (blurz_a, sampler, (float4)(xf, yf, zf.w, 0.0f))).xy;\n"
+"                                                                              \n"
+"  smoothed[y * width + x] = val.s0123/val.s4567;                              \n"
+"}                                                                             \n"
+;
diff --git a/operations/common/bilateral-filter-fast.c b/operations/common/bilateral-filter-fast.c
index 72b6ed7..18c590f 100644
--- a/operations/common/bilateral-filter-fast.c
+++ b/operations/common/bilateral-filter-fast.c
@@ -51,11 +51,6 @@ inline static float lerp(float a, float b, float v)
   return (1.0f - v) * a + v * b;
 }
 
-inline static int clamp(int v, int min, int max)
-{
-  return MIN(MAX(v, min), max);
-}
-
 static void
 bilateral_filter (GeglBuffer          *src,
                   const GeglRectangle *src_rect,
@@ -64,12 +59,20 @@ bilateral_filter (GeglBuffer          *src,
                   gint                 s_sigma,
                   gfloat               r_sigma);
 
+static gboolean
+bilateral_cl_process (GeglOperation       *operation,
+                      GeglBuffer          *input,
+                      GeglBuffer          *output,
+                      const GeglRectangle *result,
+                      gint                 s_sigma,
+                      gfloat               r_sigma);
+
 #include <stdio.h>
 
 static void bilateral_prepare (GeglOperation *operation)
 {
-  gegl_operation_set_format (operation, "input",  babl_format ("RGB float"));
-  gegl_operation_set_format (operation, "output", babl_format ("RGB float"));
+  gegl_operation_set_format (operation, "input",  babl_format ("RGBA float"));
+  gegl_operation_set_format (operation, "output", babl_format ("RGBA float"));
 }
 
 static GeglRectangle
@@ -99,14 +102,10 @@ bilateral_process (GeglOperation       *operation,
 {
   GeglChantO   *o = GEGL_CHANT_PROPERTIES (operation);
 
-  if (o->s_sigma < 1)
-    {
-      gegl_buffer_copy (input, result, output, result);
-    }
-  else
-    {
-      bilateral_filter (input, result, output, result, o->s_sigma, o->r_sigma/100);
-    }
+  if (gegl_cl_is_accelerated () && bilateral_cl_process (operation, input, output, result, o->s_sigma, 
o->r_sigma/100))
+      return TRUE;
+
+  bilateral_filter (input, result, output, result, o->s_sigma, o->r_sigma/100);
 
   return  TRUE;
 }
@@ -126,15 +125,12 @@ bilateral_filter (GeglBuffer          *src,
 
   const gint width    = src_rect->width;
   const gint height   = src_rect->height;
-  const gint channels = 3;
+  const gint channels = 4;
 
   const gint sw = (width -1) / s_sigma + 1 + (2 * padding_xy);
   const gint sh = (height-1) / s_sigma + 1 + (2 * padding_xy);
   const gint depth = (int)(1.0f / r_sigma) + 1 + (2 * padding_z);
 
-  gfloat input_min[3] = { FLT_MAX,  FLT_MAX,  FLT_MAX};
-  gfloat input_max[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
-
   /* down-sampling */
 
   gfloat *grid     = g_new0 (gfloat, sw * sh * depth * channels * 2);
@@ -152,7 +148,7 @@ bilateral_filter (GeglBuffer          *src,
   #define BLURY(x,y,z,c,i) blury[i+2*(c+channels*(x+sw*(y+z*sh)))]
   #define BLURZ(x,y,z,c,i) blurz[i+2*(c+channels*(x+sw*(y+z*sh)))]
 
-  gegl_buffer_get (src, src_rect, 1.0, babl_format ("RGB float"), input, GEGL_AUTO_ROWSTRIDE,
+  gegl_buffer_get (src, src_rect, 1.0, babl_format ("RGBA float"), input, GEGL_AUTO_ROWSTRIDE,
                    GEGL_ABYSS_NONE);
 
   for (k=0; k < (sw * sh * depth * channels * 2); k++)
@@ -166,6 +162,9 @@ bilateral_filter (GeglBuffer          *src,
 #if 0
   /* in case we want to normalize the color space */
 
+  gfloat input_min[4] = { FLT_MAX,  FLT_MAX,  FLT_MAX};
+  gfloat input_max[4] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
   for(y = 0; y < height; y++)
     for(x = 0; x < width; x++)
       for(c = 0; c < channels; c++)
@@ -183,9 +182,9 @@ bilateral_filter (GeglBuffer          *src,
         {
           const float z = INPUT(x,y,c); // - input_min[c];
 
-          const int small_x = (int)(x / s_sigma + 0.5f) + padding_xy;
-          const int small_y = (int)(y / s_sigma + 0.5f) + padding_xy;
-          const int small_z = (int)(z / r_sigma + 0.5f) + padding_z;
+          const int small_x = (int)((float)(x) / s_sigma + 0.5f) + padding_xy;
+          const int small_y = (int)((float)(y) / s_sigma + 0.5f) + padding_xy;
+          const int small_z = (int)((float)(z) / r_sigma + 0.5f) + padding_z;
 
           g_assert(small_x >= 0 && small_x < sw);
           g_assert(small_y >= 0 && small_y < sh);
@@ -229,13 +228,13 @@ bilateral_filter (GeglBuffer          *src,
           float yf = (float)(y) / s_sigma + padding_xy;
           float zf = INPUT(x,y,c) / r_sigma + padding_z;
 
-          int x1 = clamp((int)xf, 0, sw-1);
-          int y1 = clamp((int)yf, 0, sh-1);
-          int z1 = clamp((int)zf, 0, depth-1);
+          int x1 = CLAMP((int)xf, 0, sw-1);
+          int y1 = CLAMP((int)yf, 0, sh-1);
+          int z1 = CLAMP((int)zf, 0, depth-1);
 
-          int x2 = clamp(x1+1, 0, sw-1);
-          int y2 = clamp(y1+1, 0, sh-1);
-          int z2 = clamp(z1+1, 0, depth-1);
+          int x2 = CLAMP(x1+1, 0, sw-1);
+          int y2 = CLAMP(y1+1, 0, sh-1);
+          int z2 = CLAMP(z1+1, 0, depth-1);
 
           float x_alpha = xf - x1;
           float y_alpha = yf - y1;
@@ -254,10 +253,10 @@ bilateral_filter (GeglBuffer          *src,
                    lerp(lerp(BLURZ(x1, y1, z2, c, i), BLURZ(x2, y1, z2, c, i), x_alpha),
                         lerp(BLURZ(x1, y2, z2, c, i), BLURZ(x2, y2, z2, c, i), x_alpha), y_alpha), z_alpha);
 
-          smoothed[3*(y*width+x)+c] = interpolated[0] / interpolated[1];
+          smoothed[channels*(y*width+x)+c] = interpolated[0] / interpolated[1];
         }
 
-  gegl_buffer_set (dst, dst_rect, 0, babl_format ("RGB float"), smoothed,
+  gegl_buffer_set (dst, dst_rect, 0, babl_format ("RGBA float"), smoothed,
                    GEGL_AUTO_ROWSTRIDE);
 
   g_free (grid);
@@ -268,6 +267,228 @@ bilateral_filter (GeglBuffer          *src,
   g_free (smoothed);
 }
 
+#include "opencl/gegl-cl.h"
+#include "buffer/gegl-buffer-cl-iterator.h"
+#include "opencl/bilateral-filter-fast.cl.h"
+
+GEGL_CL_STATIC;
+
+static gboolean
+cl_bilateral (cl_mem                in_tex,
+              cl_mem                out_tex,
+              const GeglRectangle  *roi,
+              const GeglRectangle  *src_rect,
+              gint                  s_sigma,
+              gfloat                r_sigma)
+{
+  cl_int cl_err = 0;
+
+  gint c;
+
+  const gint width    = src_rect->width;
+  const gint height   = src_rect->height;
+
+  const gint sw = (width -1) / s_sigma + 1;
+  const gint sh = (height-1) / s_sigma + 1;
+  const gint depth = (int)(1.0f / r_sigma) + 1;
+
+  size_t global_ws[2];
+  size_t local_ws[2];
+
+  cl_mem grid = NULL;
+  cl_mem blur[4] = {NULL, NULL, NULL, NULL};
+  cl_mem blur_tex[4] = {NULL, NULL, NULL, NULL};
+
+  cl_image_format format = {CL_RG, CL_FLOAT};
+
+  GEGL_CL_BUILD(bilateral_filter_fast,
+                "bilateral_init",
+                "bilateral_downsample",
+                "bilateral_blur",
+                "bilateral_interpolate")
+
+  grid = gegl_clCreateBuffer (gegl_cl_get_context (),
+                              CL_MEM_READ_WRITE,
+                              sw * sh * depth * sizeof(cl_float8),
+                              NULL,
+                              &cl_err);
+  CL_CHECK;
+
+  for(c = 0; c < 4; c++)
+    {
+      blur[c] = gegl_clCreateBuffer (gegl_cl_get_context (),
+                                     CL_MEM_WRITE_ONLY,
+                                     sw * sh * depth * sizeof(cl_float2),
+                                     NULL, &cl_err);
+      CL_CHECK;
+
+      blur_tex[c] = gegl_clCreateImage3D (gegl_cl_get_context (),
+                                          CL_MEM_READ_ONLY,
+                                          &format,
+                                          sw, sh, depth,
+                                          0, 0, NULL, &cl_err);
+      CL_CHECK;
+    }
+
+  {
+  global_ws[0] = sw;
+  global_ws[1] = sh;
+
+  GEGL_CL_ARG_START(cl_data->kernel[0])
+  GEGL_CL_ARG(cl_mem,   grid)
+  GEGL_CL_ARG(cl_int,   sw)
+  GEGL_CL_ARG(cl_int,   sh)
+  GEGL_CL_ARG(cl_int,   depth)
+  GEGL_CL_ARG_END
+
+  cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+                                       cl_data->kernel[0], 2,
+                                       NULL, global_ws, NULL,
+                                       0, NULL, NULL);
+  CL_CHECK;
+  }
+
+
+  {
+  global_ws[0] = sw;
+  global_ws[1] = sh;
+
+  GEGL_CL_ARG_START(cl_data->kernel[1])
+  GEGL_CL_ARG(cl_mem,   in_tex)
+  GEGL_CL_ARG(cl_mem,   grid)
+  GEGL_CL_ARG(cl_int,   width)
+  GEGL_CL_ARG(cl_int,   height)
+  GEGL_CL_ARG(cl_int,   sw)
+  GEGL_CL_ARG(cl_int,   sh)
+  GEGL_CL_ARG(cl_int,   s_sigma)
+  GEGL_CL_ARG(cl_float, r_sigma)
+  GEGL_CL_ARG_END
+
+  cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+                                       cl_data->kernel[1], 2,
+                                       NULL, global_ws, NULL,
+                                       0, NULL, NULL);
+  CL_CHECK;
+  }
+
+  cl_err = gegl_clFinish(gegl_cl_get_command_queue ());
+  CL_CHECK;
+
+  {
+  local_ws[0] = 16;
+  local_ws[1] = 16;
+
+  global_ws[0] = ((sw + 15)/16)*16;
+  global_ws[1] = ((sh + 15)/16)*16;
+
+  GEGL_CL_ARG_START(cl_data->kernel[2])
+  GEGL_CL_ARG(cl_mem, grid)
+  GEGL_CL_ARG(cl_mem, blur[0])
+  GEGL_CL_ARG(cl_mem, blur[1])
+  GEGL_CL_ARG(cl_mem, blur[2])
+  GEGL_CL_ARG(cl_mem, blur[3])
+  GEGL_CL_ARG(cl_int, sw)
+  GEGL_CL_ARG(cl_int, sh)
+  GEGL_CL_ARG(cl_int, depth)
+  GEGL_CL_ARG_END
+
+  cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+                                       cl_data->kernel[2], 2,
+                                       NULL, global_ws, local_ws,
+                                       0, NULL, NULL);
+  CL_CHECK;
+  }
+
+  cl_err = gegl_clFinish(gegl_cl_get_command_queue ());
+  CL_CHECK;
+
+  for(c = 0; c < 4; c++)
+    {
+      const size_t dst_origin[3] = {0, 0, 0};
+      const size_t dst_region[3] = {sw, sh, depth};
+
+      cl_err = gegl_clEnqueueCopyBufferToImage (gegl_cl_get_command_queue (),
+                                                blur[c],
+                                                blur_tex[c],
+                                                0, dst_origin, dst_region,
+                                                0, NULL, NULL);
+      CL_CHECK;
+    }
+
+  cl_err = gegl_clFinish(gegl_cl_get_command_queue ());
+  CL_CHECK;
+
+  {
+  global_ws[0] = width;
+  global_ws[1] = height;
+
+  GEGL_CL_ARG_START(cl_data->kernel[3])
+  GEGL_CL_ARG(cl_mem,   in_tex)
+  GEGL_CL_ARG(cl_mem,   blur_tex[0])
+  GEGL_CL_ARG(cl_mem,   blur_tex[1])
+  GEGL_CL_ARG(cl_mem,   blur_tex[2])
+  GEGL_CL_ARG(cl_mem,   blur_tex[3])
+  GEGL_CL_ARG(cl_mem,   out_tex)
+  GEGL_CL_ARG(cl_int,   width)
+  GEGL_CL_ARG(cl_int,   s_sigma)
+  GEGL_CL_ARG(cl_float, r_sigma)
+  GEGL_CL_ARG_END
+
+  cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue (),
+                                       cl_data->kernel[3], 2,
+                                       NULL, global_ws, NULL,
+                                       0, NULL, NULL);
+  CL_CHECK;
+  }
+
+  cl_err = gegl_clFinish(gegl_cl_get_command_queue ());
+  CL_CHECK;
+
+  GEGL_CL_RELEASE(grid);
+
+  for(c = 0; c < 4; c++)
+    {
+      GEGL_CL_RELEASE(blur[c]);
+      GEGL_CL_RELEASE(blur_tex[c]);
+    }
+
+  return FALSE;
+
+error:
+  return TRUE;
+}
+
+static gboolean
+bilateral_cl_process (GeglOperation       *operation,
+                      GeglBuffer          *input,
+                      GeglBuffer          *output,
+                      const GeglRectangle *result,
+                      gint                 s_sigma,
+                      gfloat               r_sigma)
+{
+  const Babl *in_format  = gegl_operation_get_format (operation, "input");
+  const Babl *out_format = gegl_operation_get_format (operation, "output");
+  gint err;
+  gint j;
+  cl_int cl_err;
+
+  GeglBufferClIterator *i = gegl_buffer_cl_iterator_new (output, result, out_format, GEGL_CL_BUFFER_WRITE);
+  gint read = gegl_buffer_cl_iterator_add (i, input, result, in_format, GEGL_CL_BUFFER_READ, 
GEGL_ABYSS_NONE);
+
+  GEGL_CL_BUFFER_ITERATE_START(i, j, err)
+    {
+       err = cl_bilateral(i->tex[read][j],
+                          i->tex[0][j],
+                          &i->roi[0][j],
+                          &i->roi[read][j],
+                          s_sigma,
+                          r_sigma);
+    }
+  GEGL_CL_BUFFER_ITERATE_END(err)
+
+  return TRUE;
+}
+
 static void
 gegl_chant_class_init (GeglChantClass *klass)
 {
@@ -283,6 +504,8 @@ gegl_chant_class_init (GeglChantClass *klass)
   operation_class->get_required_for_output = bilateral_get_required_for_output;
   operation_class->get_cached_region       = bilateral_get_cached_region;
 
+  operation_class->opencl_support = TRUE;
+
   gegl_operation_class_set_keys (operation_class,
   "name"       , "gegl:bilateral-filter-fast",
   "categories" , "tonemapping",


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]