[babl/wip/rishi/cie-simd: 2/2] CIE: Add an SSE3 version of "RGBA float" to "CIE Lab alpha float"



commit d7cb138adbe85881e8d5f387448067a5749f5d68
Author: Debarshi Ray <debarshir gnome org>
Date:   Sun Apr 29 01:15:46 2018 +0200

    CIE: Add an SSE3 version of "RGBA float" to "CIE Lab alpha float"
    
    This uses the SSE3 _mm_movehdup_ps instruction for a slightly more
    efficient way of horizontally summing the components of a
    single-precision floating point vector. It's based on the
    implementation from:
    https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86/35270026
    
    SSE(x) doesn't have integer multiplication or division operations, and
    using bit shifts to implement integer divisions by powers of 2 seems to
    introduce errors. Therefore, it was problematic to use the cube root
    approximation from Hacker's Delight, which uses quite a few integer
    divisions to make the initial guess. Instead, Halley's method of
    approximating the cube root seems more SSEx friendly because the
    initial guess requires only one integer division, which we can manage
    by jumping through a relatively small number of hoops.
    
    The scalar version of Halley's method seems to have originated from
    http://metamerist.com/cbrt/cbrt.htm but that's not accessible anymore.
    At present there's a copy in CubeRoot.cpp in the Skia sources that's
    licensed under a BSD-style license. There's some discussion on the
    implementation at http://www.voidcn.com/article/p-gpwztojr-wt.html.
    
    Note that Darktable also has an SSE2 version of the same algorithm,
    but uses only a single iteration of Halley's method, which is too
    coarse, and isn't as careful about avoiding divisions by zero.
    
    Here's some more discussion on the cube root approximation algorithms:
    https://bugzilla.gnome.org/show_bug.cgi?id=791837
    
    https://bugzilla.gnome.org/show_bug.cgi?id=795686

 extensions/CIE.c       |  234 +++++++++++++++++++++++++++++++++++++++++++++++-
 extensions/Makefile.am |    1 +
 2 files changed, 232 insertions(+), 3 deletions(-)
---
diff --git a/extensions/CIE.c b/extensions/CIE.c
index cb70d0f..38724a4 100644
--- a/extensions/CIE.c
+++ b/extensions/CIE.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2014 Øyvind Kolås.
  * Copyright (C) 2009, Martin Nordholts
  * Copyright (C) 2014, Elle Stone
- * Copyright (C) 2017, Red Hat, Inc.
+ * Copyright (C) 2017, 2018 Red Hat, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -21,8 +21,13 @@
 
 #include "config.h"
 #include <math.h>
+#include <stdint.h>
 #include <string.h>
 
+#if defined(USE_SSE3)
+#include <pmmintrin.h>
+#endif /* defined(USE_SSE3) */
+
 #include "babl-internal.h"
 #include "extensions/util.h"
 
@@ -557,8 +562,6 @@ lchaba_to_rgba (const Babl *conversion,char *src,
  * Return cube root of x
  */
 
-#include <stdint.h>
-
 static inline float
 _cbrtf (float x)
 {
@@ -1034,6 +1037,217 @@ Lchabaf_to_Labaf (const Babl *conversion,float *src,
     }
 }
 
+#if defined(USE_SSE3)
+
+/* This is an SSE2 implementation of Halley's method for
+ * approximating the cube root of an IEEE single-precision float.
+ *
+ * The scalar version is as follows:
+ *
+ * static inline float
+ * _cbrt_5f (float x)
+ * {
+ *   union { float f; uint32_t i; } u = { x };
+ *
+ *   u.i = u.i / 3 + 709921077;
+ *   return u.f;
+ * }
+ *
+ * static inline float
+ * _cbrta_halleyf (float a, float R)
+ * {
+ *   float a3 = a * a * a;
+ *   float b = a * (a3 + R + R) / (a3 + a3 + R);
+ *   return b;
+ * }
+ *
+ * static inline float
+ * _cbrtf (float x)
+ * {
+ *   float a;
+ *
+ *   a = _cbrt_5f (x);
+ *   a = _cbrta_halleyf (a, x);
+ *   a = _cbrta_halleyf (a, x);
+ *   return a;
+ * }
+ *
+ * The above scalar version seems to have originated from
+ * http://metamerist.com/cbrt/cbrt.htm but that's not accessible
+ * anymore. At present there's a copy in CubeRoot.cpp in the Skia
+ * sources that's licensed under a BSD-style license. There's some
+ * discussion on the implementation at
+ * http://www.voidcn.com/article/p-gpwztojr-wt.html.
+ *
+ * Note that Darktable also has an SSE2 version of the same algorithm,
+ * but uses only a single iteration of Halley's method, which is too
+ * coarse, and isn't as careful about avoiding divisions by zero.
+ */
+/* Return cube roots of the lower three single-precision floating point
+ * elements of x.
+ */
+static inline __m128
+_cbrtf_ps_sse2 (__m128 x)
+{
+  const __m128i magic = _mm_set_epi32 (0, 709921077, 709921077, 709921077);
+
+  __m128i xi = _mm_castps_si128 (x);
+  __m128 xi_3 = _mm_div_ps (_mm_cvtepi32_ps (xi), _mm_set1_ps (3.0f));
+  __m128i ai = _mm_add_epi32 (_mm_cvtps_epi32 (xi_3), magic);
+  __m128 a = _mm_castsi128_ps (ai);
+
+  __m128 a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  __m128 divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  divisor = _mm_shuffle_ps (divisor, divisor, _MM_SHUFFLE (0, 2, 1, 0));
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  divisor = _mm_shuffle_ps (divisor, divisor, _MM_SHUFFLE (0, 2, 1, 0));
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  return a;
+}
+
+static inline __m128
+lab_r_to_f_sse2 (__m128 r)
+{
+  const __m128 epsilon = _mm_set1_ps (LAB_EPSILON);
+  const __m128 kappa = _mm_set1_ps (LAB_KAPPA);
+
+  const __m128 f_big = _cbrtf_ps_sse2 (r);
+
+  const __m128 f_small = _mm_div_ps (_mm_add_ps (_mm_mul_ps (kappa, r), _mm_set1_ps (16.0f)),
+                                     _mm_set1_ps (116.0f));
+
+  const __m128 mask = _mm_cmpgt_ps (r, epsilon);
+  const __m128 f = _mm_or_ps (_mm_and_ps (mask, f_big), _mm_andnot_ps (mask, f_small));
+  return f;
+}
+
+/* Horizontally adds all the components of v, and places the sum in
+ * the position(s) denoted by mask.
+ *
+ * Based on:
+ * https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86/35270026
+ */
+static inline __m128
+hsum_ps_sse3 (__m128 v, __m128 mask)
+{
+  __m128 shuf = _mm_movehdup_ps (v);
+  __m128 sums = _mm_add_ps (v, shuf);
+
+  shuf = _mm_movehl_ps (shuf, sums);
+  sums = _mm_add_ss (sums, shuf);
+  sums = _mm_shuffle_ps (sums, sums, _MM_SHUFFLE (0, 0, 0, 0));
+  sums = _mm_and_ps (sums, mask);
+  return sums;
+}
+
+static void
+rgbaf_to_Labaf_sse3 (const Babl *conversion, const float *src, float *dst, long samples)
+{
+  const Babl *space = babl_conversion_get_source_space (conversion);
+  const float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
+  const float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
+  const float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
+  const float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
+  const float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
+  const float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t) src % 16) + ((uintptr_t) dst % 16) == 0)
+    {
+      const long    n = samples;
+      const __m128 *s = (const __m128 *) src;
+            __m128 *d = (__m128 *) dst;
+
+      const __m128 coef = _mm_set_ps (0.0f, 200.0f, 500.0f, 116.0f);
+      const __m128 m_0 = _mm_set_ps (0.0f, m_0_2, m_0_1, m_0_0);
+      const __m128 m_1 = _mm_set_ps (0.0f, m_1_2, m_1_1, m_1_0);
+      const __m128 m_2 = _mm_set_ps (0.0f, m_2_2, m_2_1, m_2_0);
+
+      for ( ; i < n; i++)
+        {
+          __m128 mask;
+          __m128 rgba = *s++;
+          __m128 xr = _mm_mul_ps (rgba, m_0);
+          __m128 yr = _mm_mul_ps (rgba, m_1);
+          __m128 zr = _mm_mul_ps (rgba, m_2);
+          __m128 r;
+          __m128 f;
+          __m128 Laba;
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0, 0, 0, 0xffffffff));
+          xr = hsum_ps_sse3 (xr, mask);
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0, 0, 0xffffffff, 0));
+          yr = hsum_ps_sse3 (yr, mask);
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0, 0xffffffff, 0, 0));
+          zr = hsum_ps_sse3 (zr, mask);
+
+          r = _mm_or_ps (xr, yr);
+          r = _mm_or_ps (r, zr);
+
+          /* Note: since r[3] == 0.0f, f[3] == 16.0f / 116.0f. */
+          f = lab_r_to_f_sse2 (r);
+
+          /* Note: L = 116.0f * f[1] - 16.0f
+           *   or, L = 116.0f * (f[1] - 16.0f / 116.0f)
+           *   or, L = 116.0f * (f[1] - f[3])
+           */
+          Laba = _mm_mul_ps (coef,
+                             _mm_sub_ps (_mm_shuffle_ps (f, f, _MM_SHUFFLE (3, 1, 0, 1)),
+                                         _mm_shuffle_ps (f, f, _MM_SHUFFLE (3, 2, 1, 3))));
+
+          mask = _mm_castsi128_ps (_mm_set_epi32 (0xffffffff, 0, 0, 0));
+          rgba = _mm_and_ps (mask, rgba);
+
+          Laba = _mm_or_ps (rgba, Laba);
+
+          *d++ = Laba;
+        }
+    }
+
+  dst += i * 4;
+  src += i * 4;
+  remainder = samples - i;
+  while (remainder--)
+    {
+      float r = src[0];
+      float g = src[1];
+      float b = src[2];
+      float a = src[3];
+
+      float xr = m_0_0 * r + m_0_1 * g + m_0_2 * b;
+      float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
+      float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
+
+      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+
+      float L = 116.0f * fy - 16.0f;
+      float A = 500.0f * (fx - fy);
+      float B = 200.0f * (fy - fz);
+
+      dst[0] = L;
+      dst[1] = A;
+      dst[2] = B;
+      dst[3] = a;
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+#endif /* defined(USE_SSE3) */
+
 static void
 conversions (void)
 {
@@ -1200,6 +1414,20 @@ conversions (void)
     NULL
   );
 
+#if defined(USE_SSE3)
+
+  if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE3)
+    {
+      babl_conversion_new (
+        babl_format ("RGBA float"),
+        babl_format ("CIE Lab alpha float"),
+        "linear", rgbaf_to_Labaf_sse3,
+        NULL
+      );
+    }
+
+#endif /* defined(USE_SSE3) */
+
   rgbcie_init ();
 }
 
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index a2a2943..5727313 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -71,6 +71,7 @@ fast_float_la_SOURCES = fast-float.c
 LIBS =  $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la \
        $(MATH_LIB) $(THREAD_LIB)
 
+CIE_la_CFLAGS = $(SSE3_EXTRA_CFLAGS)
 sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]