[gegl] gegl: implement u8 fast paths for boxfilter/2x2 downscale

From: Øyvind Kolås <ok src gnome org>
To: commits-list gnome org
Cc:
Subject: [gegl] gegl: implement u8 fast paths for boxfilter/2x2 downscale
Date: Sat, 20 Jan 2018 02:57:40 +0000 (UTC)
commit 5f40ca579be3569f08be53ffbb365f233af8dae8
Author: Øyvind Kolås <pippin gimp org>
Date:   Sat Jan 20 02:52:30 2018 +0100

    gegl: implement u8 fast paths for boxfilter/2x2 downscale
    
    Bringing back the performance lost when first moving to always rendering in
    linear, we're now at about 80% of the scaled fetch speed for u8 buffers before
    the linear correctness fix.

 gegl/gegl-algorithms.c     |  622 ++++++++++++++++++++++++++++++++++++++++----
 gegl/gegl-init.c           |    4 +
 gegl/gegl-types-internal.h |    3 +
 3 files changed, 572 insertions(+), 57 deletions(-)
---
diff --git a/gegl/gegl-algorithms.c b/gegl/gegl-algorithms.c
index 5bb9d4e..66201ce 100644
--- a/gegl/gegl-algorithms.c
+++ b/gegl/gegl-algorithms.c
@@ -13,14 +13,14 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  *
- * Copyright 2006,2007,2015 Øyvind Kolås <pippin gimp org>
+ * Copyright 2006,2007,2015,2018 Øyvind Kolås <pippin gimp org>
  *           2013 Daniel Sabo
  */
 
 #include "config.h"
 
 #include <string.h>
-
+#include <stdint.h>
 #include <glib-object.h>
 
 #include <babl/babl.h>
@@ -32,47 +32,6 @@
 
 #include <math.h>
 
-static void
-gegl_downscale_2x2_generic (const Babl *format,
-                            gint        src_width,
-                            gint        src_height,
-                            guchar     *src_data,
-                            gint        src_rowstride,
-                            guchar     *dst_data,
-                            gint        dst_rowstride);
-
-
-GeglDownscale2x2Fun gegl_downscale_2x2_get_fun (const Babl *format)
-{
-  const Babl *comp_type = babl_format_get_type (format, 0);
-  const Babl *model     = babl_format_get_model (format);
-
-  if (gegl_babl_model_is_linear (model))
-  {
-    if (comp_type == gegl_babl_float())
-    {
-      return gegl_downscale_2x2_float;
-    }
-    else if (comp_type == gegl_babl_u8())
-    {
-      return gegl_downscale_2x2_u8;
-    }
-    else if (comp_type == gegl_babl_u16())
-    {
-      return gegl_downscale_2x2_u16;
-    }
-    else if (comp_type == gegl_babl_u32())
-    {
-      return gegl_downscale_2x2_u32;
-    }
-    else if (comp_type == gegl_babl_double())
-    {
-      return gegl_downscale_2x2_double;
-    }
-  }
-  return gegl_downscale_2x2_generic;
-}
-
 void gegl_downscale_2x2 (const Babl *format,
                          gint        src_width,
                          gint        src_height,
@@ -87,7 +46,6 @@ void gegl_downscale_2x2 (const Babl *format,
 }
 
 #include <stdio.h>
-#define ALLOCA_THRESHOLD  8192 * 4   /* maybe this needs to be reduced for win32? */
 
 static void
 gegl_downscale_2x2_generic (const Babl *format,
@@ -111,7 +69,7 @@ gegl_downscale_2x2_generic (const Babl *format,
   void *in_tmp;
   void *out_tmp;
 
-  if (src_height * in_tmp_rowstride + dst_height * out_tmp_rowstride < ALLOCA_THRESHOLD)
+  if (src_height * in_tmp_rowstride + dst_height * out_tmp_rowstride < GEGL_ALLOCA_THRESHOLD)
   {
     in_tmp = alloca (src_height * in_tmp_rowstride);
     out_tmp = alloca (dst_height * out_tmp_rowstride);
@@ -142,6 +100,503 @@ gegl_downscale_2x2_generic (const Babl *format,
    }
 }
 
+static uint16_t lut_u8_to_u16[256];
+static uint8_t lut_u16_to_u8[65537];
+
+void _gegl_init_u8_lut (void);
+
+/* Fill the two file-scope lookup tables used by the u8 fast paths:
+ * lut_u8_to_u16 maps "Y' u8" samples to "Y u16" and lut_u16_to_u8 maps
+ * "Y u16" samples back to "Y' u8".  Both tables are produced by babl, so
+ * they follow whatever curve babl applies between those two formats.
+ *
+ * Only the first call does any work; later calls return immediately.  The
+ * guard is a plain static int without locking.
+ */
+void _gegl_init_u8_lut (void)
+{
+  static int  lut_inited = 0;
+  const Babl *u8_to_u16_fish;
+  const Babl *u16_to_u8_fish;
+  uint8_t     u8_ramp[256];
+  int         i;
+
+  if (lut_inited)
+    return;
+
+  /* resolve each conversion once instead of once per sample */
+  u8_to_u16_fish = babl_fish (babl_format ("Y' u8"), babl_format ("Y u16"));
+  u16_to_u8_fish = babl_fish (babl_format ("Y u16"), babl_format ("Y' u8"));
+
+  for (i = 0; i < 256; i++)
+    u8_ramp[i] = i;
+  babl_process (u8_to_u16_fish, &u8_ramp[0], &lut_u8_to_u16[0], 256);
+
+  /* workaround for bug, doing this conversion sample by sample; a single
+   * local sample replaces the former 128 KiB on-stack ramp
+   */
+  for (i = 0; i < 65536; i++)
+    {
+      uint16_t sample = i;
+
+      babl_process (u16_to_u8_fish, &sample, &lut_u16_to_u8[i], 1);
+    }
+
+  /* The table has one slot past the u16 range; make it saturate instead of
+   * leaving it at the zero it was statically initialised with.
+   */
+  lut_u16_to_u8[65536] = lut_u16_to_u8[65535];
+
+  lut_inited = 1;
+}
+
+/* Expand a block of u8 samples to u16 through lut_u8_to_u16.
+ *
+ * components     samples per pixel
+ * source_buf     first u8 row
+ * source_stride  distance between u8 rows, in bytes
+ * dest_buf       first u16 row
+ * dest_stride    distance between u16 rows, in bytes (halved to step the
+ *                uint16_t pointer)
+ * n              pixels per row
+ * rows           number of rows
+ */
+static inline void
+u8_to_u16_rows (int            components,
+                const uint8_t *source_buf,
+                int            source_stride,
+                uint16_t      *dest_buf,
+                int            dest_stride,
+                int            n,
+                int            rows)
+{
+  const int samples_per_row = n * components;
+
+  for (int row = 0; row < rows; row++)
+    {
+      for (int s = 0; s < samples_per_row; s++)
+        dest_buf[s] = lut_u8_to_u16[source_buf[s]];
+
+      source_buf += source_stride;
+      dest_buf   += dest_stride / 2;
+    }
+}
+
+/* Reduce a block of u16 samples to u8 through lut_u16_to_u8.
+ *
+ * components     samples per pixel
+ * source_buf     first u16 row
+ * source_stride  distance between u16 rows, in bytes (halved to step the
+ *                uint16_t pointer)
+ * dest_buf       first u8 row
+ * dest_stride    distance between u8 rows, in bytes
+ * n              pixels per row
+ * rows           number of rows
+ */
+static inline void
+u16_to_u8_rows (int             components,
+                const uint16_t *source_buf,
+                int             source_stride,
+                uint8_t        *dest_buf,
+                int             dest_stride,
+                int             n,
+                int             rows)
+{
+  const int samples_per_row = n * components;
+
+  for (int row = 0; row < rows; row++)
+    {
+      for (int s = 0; s < samples_per_row; s++)
+        dest_buf[s] = lut_u16_to_u8[source_buf[s]];
+
+      source_buf += source_stride / 2;
+      dest_buf   += dest_stride;
+    }
+}
+
+/* Floor of x as an int: the cast truncates toward zero, which lands one
+ * too high for negative non-integers, so step down in exactly that case.
+ */
+static inline int int_floorf (float x)
+{
+  const int truncated = (int)x;
+
+  if (truncated > x)
+    return truncated - 1;
+  return truncated;
+}
+
+
+/* Point taps[0..8] at the 3x3 block of source pixels centred on the pixel
+ * that starts `offset` bytes into the row at src_base; `step` is the size
+ * of one pixel in bytes.  Layout: 0 1 2 = row above, 3 4 5 = centre row,
+ * 6 7 8 = row below, each left to right.
+ */
+static inline void
+boxfilter_u8_nl_taps (const uint8_t **taps,
+                      const guchar   *src_base,
+                      gint            s_rowstride,
+                      gint            offset,
+                      gint            step)
+{
+  taps[4] = (const uint8_t*)src_base + offset;
+  taps[1] = (const uint8_t*)(src_base - s_rowstride) + offset;
+  taps[7] = (const uint8_t*)(src_base + s_rowstride) + offset;
+  taps[2] = taps[1] + step;
+  taps[5] = taps[4] + step;
+  taps[8] = taps[7] + step;
+  taps[0] = taps[1] - step;
+  taps[3] = taps[4] - step;
+  taps[6] = taps[7] - step;
+}
+
+/* Weighted sum of component i over the nine taps.  Every u8 sample is first
+ * expanded through lut_u8_to_u16, the sum is rounded to the nearest integer
+ * and mapped back to u8 through lut_u16_to_u8.  l/c/r weight the left,
+ * centre and right columns; t/m/b weight the top, middle and bottom rows.
+ */
+static inline uint8_t
+boxfilter_u8_nl_channel (const uint8_t **taps,
+                         gint            i,
+                         gfloat          l,
+                         gfloat          c,
+                         gfloat          r,
+                         gfloat          t,
+                         gfloat          m,
+                         gfloat          b)
+{
+  return lut_u16_to_u8[(int)((
+    (lut_u8_to_u16[taps[0][i]] * t +
+     lut_u8_to_u16[taps[3][i]] * m +
+     lut_u8_to_u16[taps[6][i]] * b) * l +
+    (lut_u8_to_u16[taps[1][i]] * t +
+     lut_u8_to_u16[taps[4][i]] * m +
+     lut_u8_to_u16[taps[7][i]] * b) * c +
+    (lut_u8_to_u16[taps[2][i]] * t +
+     lut_u8_to_u16[taps[5][i]] * m +
+     lut_u8_to_u16[taps[8][i]] * b) * r) + 0.5)];
+}
+
+/* Box-filter resample of u8 data whose samples are run through the
+ * lut_u8_to_u16 / lut_u16_to_u8 tables, so the weighted sum is taken on the
+ * u16 values rather than on the stored u8 values.
+ *
+ * dest_buf / d_rowstride    destination pixels and their row pitch in bytes
+ * source_buf / s_rowstride  source pixels and their row pitch in bytes
+ * dst_rect, src_rect        destination / source rectangles; dst_rect also
+ *                           gives the number of pixels written
+ * scale                     destination / source scale factor
+ * bpp                       bytes per pixel, which for u8 data is also the
+ *                           number of components
+ *
+ * For every destination pixel the code reads the source pixel under its
+ * centre plus the eight surrounding ones, i.e. one row above and below and
+ * one pixel left and right of the sampled position.
+ * NOTE(review): that margin has to be addressable in source_buf; confirm
+ * the callers guarantee it.
+ *
+ * The three weight arrays and jj are variable-length arrays sized by
+ * dst_rect->width.
+ */
+static void
+gegl_boxfilter_u8_nl (guchar              *dest_buf,
+                      const guchar        *source_buf,
+                      const GeglRectangle *dst_rect,
+                      const GeglRectangle *src_rect,
+                      const gint           s_rowstride,
+                      const gdouble        scale,
+                      const gint           bpp,
+                      const gint           d_rowstride)
+{
+  const uint8_t *src[9];
+  gint  components = bpp / sizeof(uint8_t);
+
+  gfloat left_weight[dst_rect->width];
+  gfloat center_weight[dst_rect->width];
+  gfloat right_weight[dst_rect->width];
+
+  gint   jj[dst_rect->width];
+
+  /* The horizontal weights and source offsets depend on x only, so they
+   * are computed once and reused for every row.
+   */
+  for (gint x = 0; x < dst_rect->width; x++)
+  {
+    gfloat sx  = (dst_rect->x + x + .5) / scale - src_rect->x;
+    jj[x]  = int_floorf (sx);
+
+    left_weight[x]   = .5 - scale * (sx - jj[x]);
+    left_weight[x]   = MAX (0.0, left_weight[x]);
+    right_weight[x]  = .5 - scale * ((jj[x] + 1) - sx);
+    right_weight[x]  = MAX (0.0, right_weight[x]);
+    center_weight[x] = 1. - left_weight[x] - right_weight[x];
+
+    /* from here on jj[x] is a byte offset into the source row */
+    jj[x] *= components;
+  }
+
+  for (gint y = 0; y < dst_rect->height; y++)
+    {
+      gfloat top_weight, middle_weight, bottom_weight;
+      const gfloat sy = (dst_rect->y + y + .5) / scale - src_rect->y;
+      const gint     ii = int_floorf (sy);
+      uint8_t             *dst = (uint8_t*)(dest_buf + y * d_rowstride);
+      const guchar  *src_base = source_buf + ii * s_rowstride;
+
+      top_weight    = .5 - scale * (sy - ii);
+      top_weight    = MAX (0., top_weight);
+      bottom_weight = .5 - scale * ((ii + 1 ) - sy);
+      bottom_weight = MAX (0., bottom_weight);
+      middle_weight = 1. - top_weight - bottom_weight;
+
+      /* Pixel sizes 1..4 get their own loop with a literal step and trip
+       * count; any other size takes the generic loop at the end.
+       */
+      switch (components)
+      {
+        case 4:
+          for (gint x = 0; x < dst_rect->width; x++)
+            {
+              boxfilter_u8_nl_taps (src, src_base, s_rowstride, jj[x], 4);
+
+              /* XXX: it would be even better to not call this at all for
+               * the abyss...
+               *
+               * Component 3 is treated as alpha here.  All nine taps must
+               * be tested: leaving out src[8] zeroes pixels whose only
+               * non-transparent neighbour is the bottom-right one.
+               */
+              if (src[0][3] == 0 &&
+                  src[1][3] == 0 &&
+                  src[2][3] == 0 &&
+                  src[3][3] == 0 &&
+                  src[4][3] == 0 &&
+                  src[5][3] == 0 &&
+                  src[6][3] == 0 &&
+                  src[7][3] == 0 &&
+                  src[8][3] == 0)
+                {
+                  dst[0] = dst[1] = dst[2] = dst[3] = 0;
+                }
+              else
+                {
+                  for (gint i = 0; i < 4; i++)
+                    dst[i] = boxfilter_u8_nl_channel (src, i,
+                                                      left_weight[x],
+                                                      center_weight[x],
+                                                      right_weight[x],
+                                                      top_weight,
+                                                      middle_weight,
+                                                      bottom_weight);
+                }
+              dst += 4;
+            }
+          break;
+        case 3:
+          for (gint x = 0; x < dst_rect->width; x++)
+            {
+              boxfilter_u8_nl_taps (src, src_base, s_rowstride, jj[x], 3);
+
+              for (gint i = 0; i < 3; i++)
+                dst[i] = boxfilter_u8_nl_channel (src, i,
+                                                  left_weight[x],
+                                                  center_weight[x],
+                                                  right_weight[x],
+                                                  top_weight,
+                                                  middle_weight,
+                                                  bottom_weight);
+              dst += 3;
+            }
+          break;
+        case 2:
+          for (gint x = 0; x < dst_rect->width; x++)
+            {
+              boxfilter_u8_nl_taps (src, src_base, s_rowstride, jj[x], 2);
+
+              for (gint i = 0; i < 2; i++)
+                dst[i] = boxfilter_u8_nl_channel (src, i,
+                                                  left_weight[x],
+                                                  center_weight[x],
+                                                  right_weight[x],
+                                                  top_weight,
+                                                  middle_weight,
+                                                  bottom_weight);
+              dst += 2;
+            }
+          break;
+        case 1:
+          for (gint x = 0; x < dst_rect->width; x++)
+            {
+              boxfilter_u8_nl_taps (src, src_base, s_rowstride, jj[x], 1);
+
+              dst[0] = boxfilter_u8_nl_channel (src, 0,
+                                                left_weight[x],
+                                                center_weight[x],
+                                                right_weight[x],
+                                                top_weight,
+                                                middle_weight,
+                                                bottom_weight);
+              dst += 1;
+            }
+          break;
+        default:
+          for (gint x = 0; x < dst_rect->width; x++)
+            {
+              boxfilter_u8_nl_taps (src, src_base, s_rowstride, jj[x],
+                                    components);
+
+              for (gint i = 0; i < components; ++i)
+                dst[i] = boxfilter_u8_nl_channel (src, i,
+                                                  left_weight[x],
+                                                  center_weight[x],
+                                                  right_weight[x],
+                                                  top_weight,
+                                                  middle_weight,
+                                                  bottom_weight);
+              dst += components;
+            }
+          break;
+      }
+    }
+}
+
+/* Average component i over one 2x2 block of u8 pixels.  `quad` points at the
+ * top-left pixel; `right`, `below` and `diag` are the byte offsets from it
+ * to the other three pixels.  The four samples are expanded through
+ * lut_u8_to_u16, summed, divided by four with a right shift (no rounding)
+ * and mapped back through lut_u16_to_u8.
+ */
+static inline uint8_t
+downscale_2x2_u8_nl_avg (const uint8_t *quad,
+                         gint           right,
+                         gint           below,
+                         gint           diag,
+                         gint           i)
+{
+  return lut_u16_to_u8[ (lut_u8_to_u16[quad[i]] +
+                         lut_u8_to_u16[quad[right + i]] +
+                         lut_u8_to_u16[quad[below + i]] +
+                         lut_u8_to_u16[quad[diag + i]])>>2 ];
+}
+
+/* Halve a u8 image in both directions: each destination pixel is the
+ * per-component average of a 2x2 block of source pixels, taken on the
+ * values the samples map to in lut_u8_to_u16.
+ *
+ * format                    only its bytes-per-pixel is used
+ * src_width, src_height     source size in pixels; an odd trailing row or
+ *                           column is dropped by the integer division
+ * src_data, src_rowstride   source pixels and row pitch in bytes
+ * dst_data, dst_rowstride   destination pixels and row pitch in bytes
+ *
+ * Nothing is written when either buffer pointer is NULL.
+ */
+static void
+gegl_downscale_2x2_u8_nl (const Babl *format,
+                          gint        src_width,
+                          gint        src_height,
+                          guchar     *src_data,
+                          gint        src_rowstride,
+                          guchar     *dst_data,
+                          gint        dst_rowstride)
+{
+  const gint bpp        = babl_format_get_bytes_per_pixel (format);
+  const gint components = bpp / sizeof(uint8_t);
+  const gint diag       = src_rowstride + bpp;
+  const gint out_width  = src_width / 2;
+  const gint out_height = src_height / 2;
+
+  if (!src_data || !dst_data)
+    return;
+
+  for (gint row = 0; row < out_height; row++)
+    {
+      const uint8_t *in  = (const uint8_t *)(src_data + src_rowstride * row * 2);
+      uint8_t       *out = (uint8_t *)(dst_data + dst_rowstride * row);
+      gint           col;
+
+      /* one loop per common pixel size, plus a generic fallback */
+      switch (components)
+        {
+        case 1:
+          for (col = 0; col < out_width; col++, in += bpp * 2, out += bpp)
+            {
+              out[0] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 0);
+            }
+          break;
+        case 2:
+          for (col = 0; col < out_width; col++, in += bpp * 2, out += bpp)
+            {
+              out[0] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 0);
+              out[1] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 1);
+            }
+          break;
+        case 3:
+          for (col = 0; col < out_width; col++, in += bpp * 2, out += bpp)
+            {
+              out[0] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 0);
+              out[1] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 1);
+              out[2] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 2);
+            }
+          break;
+        case 4:
+          for (col = 0; col < out_width; col++, in += bpp * 2, out += bpp)
+            {
+              out[0] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 0);
+              out[1] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 1);
+              out[2] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 2);
+              out[3] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, 3);
+            }
+          break;
+        default:
+          for (col = 0; col < out_width; col++, in += bpp * 2, out += bpp)
+            {
+              for (gint i = 0; i < components; i++)
+                out[i] = downscale_2x2_u8_nl_avg (in, bpp, src_rowstride, diag, i);
+            }
+          break;
+        }
+    }
+}
+
+
+
+/* Choose the 2x2 downscale routine for `format`.
+ *
+ * When gegl_babl_model_is_linear() accepts the format's model, the routine
+ * is picked by the type of component 0 (float, u8, u16, u32 or double).
+ * Otherwise, or when none of those types matched, u8 data goes to the
+ * LUT-based gegl_downscale_2x2_u8_nl and everything else to
+ * gegl_downscale_2x2_generic.
+ */
+GeglDownscale2x2Fun gegl_downscale_2x2_get_fun (const Babl *format)
+{
+  const Babl *comp_type = babl_format_get_type (format, 0);
+  const Babl *model     = babl_format_get_model (format);
+
+  if (gegl_babl_model_is_linear (model))
+    {
+      if (comp_type == gegl_babl_float())
+        return gegl_downscale_2x2_float;
+      if (comp_type == gegl_babl_u8())
+        return gegl_downscale_2x2_u8;
+      if (comp_type == gegl_babl_u16())
+        return gegl_downscale_2x2_u16;
+      if (comp_type == gegl_babl_u32())
+        return gegl_downscale_2x2_u32;
+      if (comp_type == gegl_babl_double())
+        return gegl_downscale_2x2_double;
+    }
+
+  return comp_type == gegl_babl_u8() ? gegl_downscale_2x2_u8_nl
+                                     : gegl_downscale_2x2_generic;
+}
+
 void
 gegl_downscale_2x2_nearest (const Babl *format,
                             gint        src_width,
@@ -173,6 +628,57 @@ gegl_downscale_2x2_nearest (const Babl *format,
 }
 
 static void
+/* Box-filter resample of u8 data by way of a temporary u16 copy:
+ *   1. expand the source rows to u16 with u8_to_u16_rows()
+ *   2. run gegl_resample_boxfilter_u16() on the u16 copy
+ *   3. reduce the result to u8 with u16_to_u8_rows()
+ *
+ * The two scratch images live on the stack when their combined size is
+ * below GEGL_ALLOCA_THRESHOLD and come from gegl_malloc() otherwise.
+ *
+ * dest_buf / d_rowstride    destination u8 pixels and row pitch in bytes
+ * source_buf / s_rowstride  source u8 pixels and row pitch in bytes
+ * dst_rect, src_rect        destination / source rectangles
+ * scale                     forwarded unchanged to the u16 box filter
+ * format                    only its component count is used
+ */
+gegl_resample_boxfilter_generic_u16 (guchar       *dest_buf,
+                                 const guchar *source_buf,
+                                 const GeglRectangle *dst_rect,
+                                 const GeglRectangle *src_rect,
+                                 gint  s_rowstride,
+                                 gdouble scale,
+                                 const Babl *format,
+                                 gint d_rowstride)
+{
+  gint components = babl_format_get_n_components (format);
+  /* The row converters pack `components` u16 samples per pixel, so the
+   * scratch pixel size has to follow the component count; a fixed 4 * 2
+   * only matches 4-component formats.
+   */
+  const gint tmp_bpp     = components * sizeof (uint16_t);
+  gint in_tmp_rowstride  = src_rect->width * tmp_bpp;
+  gint out_tmp_rowstride = dst_rect->width * tmp_bpp;
+  gint do_free = 0;
+
+  guchar *in_tmp, *out_tmp;
+
+  if (src_rect->height * in_tmp_rowstride + dst_rect->height * out_tmp_rowstride < GEGL_ALLOCA_THRESHOLD)
+  {
+    in_tmp = alloca (src_rect->height * in_tmp_rowstride);
+    out_tmp = alloca (dst_rect->height * out_tmp_rowstride);
+  }
+  else
+  {
+    in_tmp  = gegl_malloc (src_rect->height * in_tmp_rowstride);
+    out_tmp = gegl_malloc (dst_rect->height * out_tmp_rowstride);
+    do_free = 1;
+  }
+
+  u8_to_u16_rows (components,
+                  source_buf, s_rowstride,
+                  (void*)in_tmp, in_tmp_rowstride,
+                  src_rect->width, src_rect->height);
+
+  gegl_resample_boxfilter_u16 (out_tmp, in_tmp, dst_rect, src_rect,
+                               in_tmp_rowstride, scale, tmp_bpp, out_tmp_rowstride);
+
+  u16_to_u8_rows (components,
+                  (void*)out_tmp,  out_tmp_rowstride,
+                  dest_buf, d_rowstride,
+                  dst_rect->width, dst_rect->height);
+
+  /* only the heap variant needs releasing */
+  if (do_free)
+    {
+      gegl_free (in_tmp);
+      gegl_free (out_tmp);
+    }
+}
+
+
+static void
 gegl_resample_boxfilter_generic (guchar       *dest_buf,
                                  const guchar *source_buf,
                                  const GeglRectangle *dst_rect,
@@ -193,7 +699,7 @@ gegl_resample_boxfilter_generic (guchar       *dest_buf,
 
   guchar *in_tmp, *out_tmp;
 
-  if (src_rect->height * in_tmp_rowstride + dst_rect->height * out_tmp_rowstride < ALLOCA_THRESHOLD)
+  if (src_rect->height * in_tmp_rowstride + dst_rect->height * out_tmp_rowstride < GEGL_ALLOCA_THRESHOLD)
   {
     in_tmp = alloca (src_rect->height * in_tmp_rowstride);
     out_tmp = alloca (dst_rect->height * out_tmp_rowstride);
@@ -235,11 +741,11 @@ void gegl_resample_boxfilter (guchar              *dest_buf,
                               gint                 d_rowstride)
 {
   const Babl *model     = babl_format_get_model (format);
+  const Babl *comp_type  = babl_format_get_type (format, 0);
+  const gint bpp = babl_format_get_bytes_per_pixel (format);
 
   if (gegl_babl_model_is_linear (model))
   {
-    const Babl *comp_type  = babl_format_get_type (format, 0);
-    const gint bpp = babl_format_get_bytes_per_pixel (format);
 
     if (comp_type == gegl_babl_float())
       gegl_resample_boxfilter_float (dest_buf, source_buf, dst_rect, src_rect,
@@ -262,8 +768,16 @@ void gegl_resample_boxfilter (guchar              *dest_buf,
     }
   else
     {
-      gegl_resample_boxfilter_generic (dest_buf, source_buf, dst_rect, src_rect,
-                                       s_rowstride, scale, format, d_rowstride);
+      if (comp_type == gegl_babl_u8())
+        gegl_boxfilter_u8_nl (dest_buf, source_buf, dst_rect, src_rect,
+                              s_rowstride, scale, bpp, d_rowstride);
+#if 0
+        gegl_resample_boxfilter_generic_u16 (dest_buf, source_buf, dst_rect, src_rect,
+                                         s_rowstride, scale, format, d_rowstride);
+#endif
+      else
+        gegl_resample_boxfilter_generic (dest_buf, source_buf, dst_rect, src_rect,
+                                         s_rowstride, scale, format, d_rowstride);
     }
 }
 
@@ -289,7 +803,7 @@ gegl_resample_bilinear_generic (guchar              *dest_buf,
 
   guchar *in_tmp, *out_tmp;
 
-  if (src_rect->height * in_tmp_rowstride + dst_rect->height * out_tmp_rowstride < ALLOCA_THRESHOLD)
+  if (src_rect->height * in_tmp_rowstride + dst_rect->height * out_tmp_rowstride < GEGL_ALLOCA_THRESHOLD)
   {
     in_tmp = alloca (src_rect->height * in_tmp_rowstride);
     out_tmp = alloca (dst_rect->height * out_tmp_rowstride);
@@ -363,12 +877,6 @@ void gegl_resample_bilinear (guchar              *dest_buf,
     }
 }
 
-static inline int int_floorf (float x)
-{
-  int i = (int)x; /* truncate */
-  return i - ( i > x ); /* convert trunc to floor */
-}
-
 void
 gegl_resample_nearest (guchar              *dst,
                        const guchar        *src,
diff --git a/gegl/gegl-init.c b/gegl/gegl-init.c
index f45d0d0..b85d3f7 100644
--- a/gegl/gegl-init.c
+++ b/gegl/gegl-init.c
@@ -238,6 +238,8 @@ gboolean gegl_is_main_thread (void)
   return g_thread_self () == main_thread;
 }
 
+void _gegl_init_u8_lut (void);
+
 void
 gegl_init (gint    *argc,
            gchar ***argv)
@@ -250,6 +252,7 @@ gegl_init (gint    *argc,
     return;
 
 
+
   initialized = TRUE;
 
   context = g_option_context_new (NULL);
@@ -650,6 +653,7 @@ gegl_post_parse_hook (GOptionContext *context,
   gegl_config_parse_env (config);
 
   babl_init ();
+  _gegl_init_u8_lut ();
 
 #ifdef GEGL_ENABLE_DEBUG
   {
diff --git a/gegl/gegl-types-internal.h b/gegl/gegl-types-internal.h
index 992ea1f..299fed3 100644
--- a/gegl/gegl-types-internal.h
+++ b/gegl/gegl-types-internal.h
@@ -98,6 +98,7 @@ static inline gboolean gegl_babl_model_is_linear (const Babl *babl)
 GEGL_CACHED_BABL(format, rgba_float, "R'G'B'A float")
 GEGL_CACHED_BABL(format, rgbA_float, "R'aG'aB'aA float")
 GEGL_CACHED_BABL(format, rgba_linear_float, "RGBA float")
+GEGL_CACHED_BABL(format, rgba_linear_u16, "RGBA u16")
 GEGL_CACHED_BABL(format, rgbA_linear_float, "RaGaBaA float")
 GEGL_CACHED_BABL(format, ya_float, "Y'A float")
 GEGL_CACHED_BABL(format, yA_float, "Y'aA float")
@@ -106,4 +107,6 @@ GEGL_CACHED_BABL(format, yA_linear_float, "YaA float")
 
 G_END_DECLS
 
+#define GEGL_ALLOCA_THRESHOLD  (8192 * 8)   /* bytes; scratch buffers below this are alloca()ed. maybe this needs to be reduced for win32? */
+
 #endif /* __GEGL_TYPES_INTERNAL_H__ */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]