[babl/wip/rishi/cie-simd: 2/2] SIMD-fy CIE



commit e01c0d0aa9355d685330d0b32cbe89582212463c
Author: Debarshi Ray <debarshir gnome org>
Date:   Sun Apr 29 01:15:46 2018 +0200

    SIMD-fy CIE

 extensions/CIE.c        |  196 +++++++++++++++++++++++++++++++++++-
 extensions/Makefile.am  |    3 +
 extensions/meson.build  |    1 +
 extensions/sse3-float.c |  262 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 460 insertions(+), 2 deletions(-)
---
diff --git a/extensions/CIE.c b/extensions/CIE.c
index cb70d0f..fe9b645 100644
--- a/extensions/CIE.c
+++ b/extensions/CIE.c
@@ -21,8 +21,13 @@
 
 #include "config.h"
 #include <math.h>
+#include <stdint.h>
 #include <string.h>
 
+#if defined(USE_SSE3)
+#include <pmmintrin.h>
+#endif /* defined(USE_SSE3) */
+
 #include "babl-internal.h"
 #include "extensions/util.h"
 
@@ -557,8 +562,6 @@ lchaba_to_rgba (const Babl *conversion,char *src,
  * Return cube root of x
  */
 
-#include <stdint.h>
-
 static inline float
 _cbrtf (float x)
 {
@@ -1034,6 +1037,181 @@ Lchabaf_to_Labaf (const Babl *conversion,float *src,
     }
 }
 
+#if defined(USE_SSE3)
+
+/* This is an SSE2 version of Halley's method for approximating the
+ * cube root of an IEEE float implementation.
+ *
+ * The scalar version seems to have originated from
+ * http://metamerist.com/cbrt/cbrt.htm but that's not accessible
+ * anymore. At present there's a copy in CubeRoot.cpp in the Skia
+ * sources that's licensed under a BSD-style license. There's some
+ * discussion on the implementation at
+ * http://www.voidcn.com/article/p-gpwztojr-wt.html.
+ *
+ * Note that Darktable also has an SSE2 version of the same algorithm,
+ * but uses only a single iteration of Halley's method, which is too
+ * coarse, and isn't as careful about avoiding divisions by zero.
+ */
+/* Return cube roots of the lower three single-precision floating point
+ * elements of x.
+ */
+static inline __m128
+_cbrtf_ps_sse2 (__m128 x)
+{
+  const __m128i magic = _mm_set_epi32 (0, 709921077, 709921077, 709921077);
+
+  __m128i xi = _mm_castps_si128 (x);
+  __m128 xi_3 = _mm_div_ps (_mm_cvtepi32_ps (xi), _mm_set1_ps (3.0f));
+  __m128i ai = _mm_add_epi32 (_mm_cvtps_epi32 (xi_3), magic);
+  __m128 a = _mm_castsi128_ps (ai);
+
+  __m128 a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  __m128 divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  divisor = _mm_shuffle_ps (divisor, divisor, _MM_SHUFFLE (0, 2, 1, 0));
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  divisor = _mm_shuffle_ps (divisor, divisor, _MM_SHUFFLE (0, 2, 1, 0));
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  return a;
+}
+
+static inline __m128
+lab_r_to_f_sse2 (__m128 r)
+{
+  const __m128 epsilon = _mm_set1_ps (LAB_EPSILON);
+  const __m128 kappa = _mm_set1_ps (LAB_KAPPA);
+
+  const __m128 f_big = _cbrtf_ps_sse2 (r);
+
+  const __m128 f_small = _mm_div_ps (_mm_add_ps (_mm_mul_ps (kappa, r), _mm_set1_ps (16.0f)),
+                                     _mm_set1_ps (116.0f));
+
+  const __m128 mask = _mm_cmpgt_ps (r, epsilon);
+  const __m128 f = _mm_or_ps (_mm_and_ps (mask, f_big), _mm_andnot_ps (mask, f_small));
+  return f;
+}
+
+static inline __m128
+hsum_ps_sse3 (__m128 v, __m128 mask)
+{
+  __m128 shuf = _mm_movehdup_ps (v);
+  __m128 sums = _mm_add_ps (v, shuf);
+
+  shuf = _mm_movehl_ps (shuf, sums);
+  sums = _mm_add_ss (sums, shuf);
+  sums = _mm_shuffle_ps (sums, sums, _MM_SHUFFLE (0, 0, 0, 0));
+  sums = _mm_and_ps (sums, mask);
+  return sums;
+}
+
+static void
+rgbaf_to_Labaf_sse3 (const Babl *conversion, const float *src, float *dst, long samples)
+{
+  const Babl *space = babl_conversion_get_source_space (conversion);
+  const float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
+  const float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
+  const float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
+  const float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
+  const float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
+  const float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t) src % 16) + ((uintptr_t) dst % 16) == 0)
+    {
+      const long    n = samples;
+      const __m128 *s = (const __m128 *) src;
+            __m128 *d = (__m128 *) dst;
+
+      const __m128 coef = _mm_set_ps (0.0f, 200.0f, 500.0f, 116.0f);
+      const __m128 m_0 = _mm_set_ps (0.0f, m_0_2, m_0_1, m_0_0);
+      const __m128 m_1 = _mm_set_ps (0.0f, m_1_2, m_1_1, m_1_0);
+      const __m128 m_2 = _mm_set_ps (0.0f, m_2_2, m_2_1, m_2_0);
+
+      for ( ; i < n; i++)
+        {
+          __m128 mask;
+          __m128 rgba = *s++;
+          __m128 xr = _mm_mul_ps (rgba, m_0);
+          __m128 yr = _mm_mul_ps (rgba, m_1);
+          __m128 zr = _mm_mul_ps (rgba, m_2);
+          __m128 r;
+          __m128 f;
+          __m128 Laba;
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0, 0, 0, 0xffffffff));
+          xr = hsum_ps_sse3 (xr, mask);
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0, 0, 0xffffffff, 0));
+          yr = hsum_ps_sse3 (yr, mask);
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0, 0xffffffff, 0, 0));
+          zr = hsum_ps_sse3 (zr, mask);
+
+          r = _mm_or_ps (xr, yr);
+          r = _mm_or_ps (r, zr);
+
+          /* Note: since r[3] == 0.0f, f[3] == 16.0f / 116.0f. */
+          f = lab_r_to_f_sse2 (r);
+
+          /* Note: L = 116.0f * f[1] - 16.0f
+           *   or, L = 116.0f * (f[1] - 16.0f / 116.0f)
+           *   or, L = 116.0f * (f[1] - f[3])
+           */
+          Laba = _mm_mul_ps (coef,
+                             _mm_sub_ps (_mm_shuffle_ps (f, f, _MM_SHUFFLE (3, 1, 0, 1)),
+                                         _mm_shuffle_ps (f, f, _MM_SHUFFLE (3, 2, 1, 3))));
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0xffffffff, 0, 0, 0));
+          rgba = _mm_and_ps (mask, rgba);
+
+          Laba = _mm_or_ps (rgba, Laba);
+
+          *d++ = Laba;
+        }
+    }
+
+  dst += i * 4;
+  src += i * 4;
+  remainder = samples - i;
+  while (remainder--)
+    {
+      float r = src[0];
+      float g = src[1];
+      float b = src[2];
+      float a = src[3];
+
+      float xr = m_0_0 * r + m_0_1 * g + m_0_2 * b;
+      float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
+      float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
+
+      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+
+      float L = 116.0f * fy - 16.0f;
+      float A = 500.0f * (fx - fy);
+      float B = 200.0f * (fy - fz);
+
+      dst[0] = L;
+      dst[1] = A;
+      dst[2] = B;
+      dst[3] = a;
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+#endif /* defined(USE_SSE3) */
+
 static void
 conversions (void)
 {
@@ -1200,6 +1378,20 @@ conversions (void)
     NULL
   );
 
+#if defined(USE_SSE3)
+
+  if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE3)
+    {
+      babl_conversion_new (
+        babl_format ("RGBA float"),
+        babl_format ("CIE Lab alpha float"),
+        "linear", rgbaf_to_Labaf_sse3,
+        NULL
+      );
+    }
+
+#endif /* defined(USE_SSE3) */
+
   rgbcie_init ();
 }
 
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index a2a2943..769bf1d 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -61,6 +61,7 @@ HSV_la_SOURCES = HSV.c
 sse2_float_la_SOURCES = sse2-float.c
 sse2_int8_la_SOURCES = sse2-int8.c
 sse2_int16_la_SOURCES = sse2-int16.c
+#sse3_float_la_SOURCES = sse3-float.c
 sse4_int8_la_SOURCES = sse4-int8.c
 sse_half_la_SOURCES = sse-half.c
 two_table_la_SOURCES = two-table.c two-table-tables.h
@@ -71,8 +72,10 @@ fast_float_la_SOURCES = fast-float.c
 LIBS =  $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la \
        $(MATH_LIB) $(THREAD_LIB)
 
+CIE_la_CFLAGS = $(SSE3_EXTRA_CFLAGS)
 sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
+#sse3_float_la_CFLAGS = $(SSE3_EXTRA_CFLAGS)
 sse4_int8_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS)
 sse_half_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS) $(F16C_EXTRA_CFLAGS)
diff --git a/extensions/meson.build b/extensions/meson.build
index afc960d..12ab12c 100644
--- a/extensions/meson.build
+++ b/extensions/meson.build
@@ -23,6 +23,7 @@ extension_names = [
   'sse2-float',
   'sse2-int16',
   'sse2-int8',
+  'sse3-float',
   'sse4-int8',
   'two-table',
   'ycbcr',
diff --git a/extensions/sse3-float.c b/extensions/sse3-float.c
new file mode 100644
index 0000000..56f3cae
--- /dev/null
+++ b/extensions/sse3-float.c
@@ -0,0 +1,262 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE3)
+
+/* SSE 3 */
+#include <pmmintrin.h>
+
+#include <stdint.h>
+//#include <stdlib.h>
+
+#include "babl-internal.h"
+#include "babl-cpuaccel.h"
+
+/* See extensions/CIE.c */
+#define LAB_EPSILON       (216.0f / 24389.0f)
+#define LAB_KAPPA         (24389.0f / 27.0f)
+#define D50_WHITE_REF_X   0.964202880f
+#define D50_WHITE_REF_Y   1.000000000f
+#define D50_WHITE_REF_Z   0.824905400f
+
+/*********************************************************************/
+
+/* origin: http://www.hackersdelight.org/hdcodetxt/acbrt.c.txt
+ * permissions: http://www.hackersdelight.org/permissions.htm
+ */
+
+static inline float
+_cbrtf (float x)
+{
+  union { float f; uint32_t i; } u = { x };
+
+  u.i = u.i / 4 + u.i / 16;
+  u.i = u.i + u.i / 16;
+  u.i = u.i + u.i / 256;
+  u.i = 0x2a5137a0 + u.i;
+  u.f = 0.33333333f * (2.0f * u.f + x / (u.f * u.f));
+  u.f = 0.33333333f * (2.0f * u.f + x / (u.f * u.f));
+
+  return u.f;
+}
+
+static inline __m128
+_cbrtf_ps_sse2 (__m128 x)
+{
+  __m128 uf;
+  __m128i ui = _mm_castps_si128 (x);
+
+  ui = _mm_add_epi32 (_mm_srl_epi32 (ui, _mm_set_epi32 (2, 2, 2, 2)),
+                      _mm_srl_epi32 (ui, _mm_set_epi32 (4, 4, 4, 4)));
+
+  ui = _mm_add_epi32 (ui, _mm_srl_epi32 (ui, _mm_set_epi32 (4, 4, 4, 4)));
+  ui = _mm_add_epi32 (ui, _mm_srl_epi32 (ui, _mm_set_epi32 (8, 8, 8, 8)));
+
+  ui = _mm_add_epi32 (_mm_set_epi32 (0x2a5137a0, 0x2a5137a0, 0x2a5137a0, 0x2a5137a0), ui);
+
+  uf = _mm_castsi128_ps (ui);
+
+  uf = _mm_mul_ps (_mm_set1_ps (0.33333333f),
+                   _mm_add_ps (_mm_mul_ps (_mm_set1_ps (2.0f), uf), _mm_div_ps (x, _mm_mul_ps (uf, uf))));
+
+  uf = _mm_mul_ps (_mm_set1_ps (0.33333333f),
+                   _mm_add_ps (_mm_mul_ps (_mm_set1_ps (2.0f), uf), _mm_div_ps (x, _mm_mul_ps (uf, uf))));
+
+  return uf;
+}
+
+/*********************************************************************/
+
+static inline __m128
+hsum_ps_sse3 (__m128 v, __m128 mask)
+{
+  __m128 shuf = _mm_movehdup_ps (v);
+  __m128 sums = _mm_add_ps (v, shuf);
+  //float hsum;
+
+  shuf = _mm_movehl_ps (shuf, sums);
+  sums = _mm_add_ss (sums, shuf);
+  sums = _mm_shuffle_ps (sums, sums, _MM_SHUFFLE (0, 0, 0, 0));
+  sums = _mm_and_ps (sums, mask);
+  //hsum = _mm_cvtss_f32 (sums);
+  return sums;
+}
+
+static inline __m128
+lab_r_to_f_sse2 (__m128 r)
+{
+  const __m128 epsilon = _mm_set1_ps (LAB_EPSILON);
+  const __m128 kappa = _mm_set1_ps (LAB_KAPPA);
+
+  const __m128 f_big = _cbrtf_ps_sse2 (r);
+
+  const __m128 f_small = _mm_div_ps (_mm_add_ps (_mm_mul_ps (kappa, r), _mm_set1_ps (16.0f)),
+                                     _mm_set1_ps (116.0f));
+
+  const __m128 mask = _mm_cmpgt_ps (r, epsilon);
+  const __m128 f = _mm_or_ps (_mm_and_ps (mask, f_big), _mm_andnot_ps (mask, f_small));
+  return f;
+}
+
+/* static void */
+/* conv_LabaF_rgbaF_linear (const Babl *conversion, const float *src, float *dst, long samples) */
+/* { */
+/* } */
+
+static void
+conv_rgbaF_linear_LabaF (const Babl *conversion, const float *src, float *dst, long samples)
+{
+  const Babl *space = babl_conversion_get_source_space (conversion);
+  const float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
+  const float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
+  const float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
+  const float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
+  const float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
+  const float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t) src % 16) + ((uintptr_t) dst % 16) == 0)
+    {
+      const long    n = (samples / 2) * 2;
+      const __m128 *s = (const __m128 *) src;
+            __m128 *d = (__m128 *) dst;
+
+      const __m128 coef = _mm_set_ps (0.0f, 200.0f, 500.0f, 116.0f);
+      const __m128 m_0 = _mm_set_ps (0.0f, m_0_2, m_0_1, m_0_0);
+      const __m128 m_1 = _mm_set_ps (0.0f, m_1_2, m_1_1, m_1_0);
+      const __m128 m_2 = _mm_set_ps (0.0f, m_2_2, m_2_1, m_2_0);
+
+      for ( ; i < n; i ++)
+        {
+          __m128 mask;
+          __m128 rgba = *s++;
+          __m128 xr = _mm_mul_ps (rgba, m_0);
+          __m128 yr = _mm_mul_ps (rgba, m_1);
+          __m128 zr = _mm_mul_ps (rgba, m_2);
+          __m128 r;
+          __m128 f;
+          __m128 Laba;
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0, 0, 0, 0xffffffff));
+          xr = hsum_ps_sse3 (xr, mask);
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0, 0, 0xffffffff, 0));
+          yr = hsum_ps_sse3 (yr, mask);
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0, 0xffffffff, 0, 0));
+          zr = hsum_ps_sse3 (zr, mask);
+
+          r = _mm_or_ps (xr, yr);
+          r = _mm_or_ps (r, zr);
+
+          /* Note: since r[3] == 0.0f, f[3] == 16.0f / 116.0f. */
+          f = lab_r_to_f_sse2 (r);
+
+          /* Note: L = 116.0f * f[1] - 16.0f
+           *   or, L = 116.0f * (f[1] - 16.0f / 116.0f)
+           *   or, L = 116.0f * (f[1] - f[3])
+           */
+          Laba = _mm_mul_ps (coef,
+                             _mm_sub_ps (_mm_shuffle_ps (f, f, _MM_SHUFFLE (3, 1, 0, 1)),
+                                         _mm_shuffle_ps (f, f, _MM_SHUFFLE (3, 2, 1, 3))));
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0xffffffff, 0, 0, 0));
+          rgba = _mm_add_ps (mask, rgba);
+          Laba = _mm_or_ps (rgba, Laba);
+
+          *d++ = Laba;
+        }
+    }
+
+  dst += i * 4;
+  src += i * 4;
+  remainder = samples - i;
+  while (remainder--)
+    {
+      float r = src[0];
+      float g = src[1];
+      float b = src[2];
+      float a = src[3];
+
+      float xr = m_0_0 * r + m_0_1 * g + m_0_2 * b;
+      float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
+      float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
+
+      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+
+      float L = 116.0f * fy - 16.0f;
+      float A = 500.0f * (fx - fy);
+      float B = 200.0f * (fy - fz);
+
+      dst[0] = L;
+      dst[1] = A;
+      dst[2] = B;
+      dst[3] = a;
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+#endif /* defined(USE_SSE3) */
+
+#define o(src, dst) \
+  babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
+
+int init (void);
+
+int
+init (void)
+{
+#if defined(USE_SSE3)
+
+  const Babl *LabaF = babl_format_new (
+    babl_model ("CIE Lab alpha"),
+    babl_type ("float"),
+    babl_component ("CIE L"),
+    babl_component ("CIE a"),
+    babl_component ("CIE b"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+
+  if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE3)
+    {
+      /* o (LabaF, rgbaF_linear); */
+      o (rgbaF_linear, LabaF);
+    }
+
+#endif /* defined(USE_SSE3) */
+
+  return 0;
+}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]