[babl: 2/3] CIE: Add an SSE2 version of "RGBA float" to "CIE Lab alpha float"

From: Debarshi Ray <debarshir src gnome org>
To: commits-list gnome org
Cc:
Subject: [babl: 2/3] CIE: Add an SSE2 version of "RGBA float" to "CIE Lab alpha float"
Date: Mon, 14 May 2018 13:15:48 +0000 (UTC)
commit b65deef160efa97af06ce4282f72b169a441a6a9
Author: Debarshi Ray <debarshir gnome org>
Date:   Sun Apr 29 01:15:46 2018 +0200

    CIE: Add an SSE2 version of "RGBA float" to "CIE Lab alpha float"
    
    On an Intel i7 Haswell, it now takes 0.13s to convert a 15 megapixel
    buffer from "RGBA float" to "CIE Lab alpha float" instead of the
    earlier 0.27s.
    
    SSEx doesn't have integer multiplication or division operations, and
    using bit shifts to implement integer divisions by powers of 2 seems to
    introduce errors. Therefore, it was problematic to use the cube root
    approximation from Hacker's Delight, which uses quite a few integer
    divisions to make the initial guess. Instead, Halley's method of
    approximating the cube root seems more SSEx friendly because the
    initial guess requires only one integer division, which we can manage
    by jumping through a relatively small number of hoops.
    
    The scalar version of Halley's method seems to have originated from
    http://metamerist.com/cbrt/cbrt.htm but that's not accessible anymore.
    At present there's a copy in CubeRoot.cpp in the Skia sources that's
    licensed under a BSD-style license. There's some discussion on the
    implementation at http://www.voidcn.com/article/p-gpwztojr-wt.html.
    
    Note that Darktable also has an SSE2 version of the same algorithm,
    but uses only a single iteration of Halley's method, which is too
    coarse.
    
    Here's some more discussion on the cube root approximation algorithms:
    https://bugzilla.gnome.org/show_bug.cgi?id=791837
    
    https://bugzilla.gnome.org/show_bug.cgi?id=795686

 extensions/CIE.c       |  221 +++++++++++++++++++++++++++++++++++++++++++++++-
 extensions/Makefile.am |    1 +
 2 files changed, 219 insertions(+), 3 deletions(-)
---
diff --git a/extensions/CIE.c b/extensions/CIE.c
index cb70d0f..c8a79b9 100644
--- a/extensions/CIE.c
+++ b/extensions/CIE.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2014 Øyvind Kolås.
  * Copyright (C) 2009, Martin Nordholts
  * Copyright (C) 2014, Elle Stone
- * Copyright (C) 2017, Red Hat, Inc.
+ * Copyright (C) 2017, 2018 Red Hat, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -21,8 +21,13 @@
 
 #include "config.h"
 #include <math.h>
+#include <stdint.h>
 #include <string.h>
 
+#if defined(USE_SSE2)
+#include <emmintrin.h>
+#endif /* defined(USE_SSE2) */
+
 #include "babl-internal.h"
 #include "extensions/util.h"
 
@@ -557,8 +562,6 @@ lchaba_to_rgba (const Babl *conversion,char *src,
  * Return cube root of x
  */
 
-#include <stdint.h>
-
 static inline float
 _cbrtf (float x)
 {
@@ -1034,6 +1037,204 @@ Lchabaf_to_Labaf (const Babl *conversion,float *src,
     }
 }
 
+#if defined(USE_SSE2)
+
+/* This is an SSE2 version of Halley's method for approximating the
+ * cube root of an IEEE float implementation.
+ *
+ * The scalar version is as follows:
+ *
+ * static inline float
+ * _cbrt_5f (float x)
+ * {
+ *   union { float f; uint32_t i; } u = { x };
+ *
+ *   u.i = u.i / 3 + 709921077;
+ *   return u.f;
+ * }
+ *
+ * static inline float
+ * _cbrta_halleyf (float a, float R)
+ * {
+ *   float a3 = a * a * a;
+ *   float b = a * (a3 + R + R) / (a3 + a3 + R);
+ *   return b;
+ * }
+ *
+ * static inline float
+ * _cbrtf (float x)
+ * {
+ *   float a;
+ *
+ *   a = _cbrt_5f (x);
+ *   a = _cbrta_halleyf (a, x);
+ *   a = _cbrta_halleyf (a, x);
+ *   return a;
+ * }
+ *
+ * The above scalar version seems to have originated from
+ * http://metamerist.com/cbrt/cbrt.htm but that's not accessible
+ * anymore. At present there's a copy in CubeRoot.cpp in the Skia
+ * sources that's licensed under a BSD-style license. There's some
+ * discussion on the implementation at
+ * http://www.voidcn.com/article/p-gpwztojr-wt.html.
+ *
+ * Note that Darktable also has an SSE2 version of the same algorithm,
+ * but uses only a single iteration of Halley's method, which is too
+ * coarse.
+ */
+/* Return cube roots of the four single-precision floating point
+ * components of x.
+ */
+static inline __m128
+_cbrtf_ps_sse2 (__m128 x)
+{
+  const __m128i magic = _mm_set1_epi32 (709921077);
+
+  __m128i xi = _mm_castps_si128 (x);
+  __m128 xi_3 = _mm_div_ps (_mm_cvtepi32_ps (xi), _mm_set1_ps (3.0f));
+  __m128i ai = _mm_add_epi32 (_mm_cvtps_epi32 (xi_3), magic);
+  __m128 a = _mm_castsi128_ps (ai);
+
+  __m128 a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  __m128 divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  divisor = _mm_add_ps (_mm_add_ps (a3, a3), x);
+  a = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (x, x))), divisor);
+
+  return a;
+}
+
+static inline __m128
+lab_r_to_f_sse2 (__m128 r)
+{
+  const __m128 epsilon = _mm_set1_ps (LAB_EPSILON);
+  const __m128 kappa = _mm_set1_ps (LAB_KAPPA);
+
+  const __m128 f_big = _cbrtf_ps_sse2 (r);
+
+  const __m128 f_small = _mm_div_ps (_mm_add_ps (_mm_mul_ps (kappa, r), _mm_set1_ps (16.0f)),
+                                     _mm_set1_ps (116.0f));
+
+  const __m128 mask = _mm_cmpgt_ps (r, epsilon);
+  const __m128 f = _mm_or_ps (_mm_and_ps (mask, f_big), _mm_andnot_ps (mask, f_small));
+  return f;
+}
+
+static void
+rgbaf_to_Labaf_sse2 (const Babl *conversion, const float *src, float *dst, long samples)
+{
+  const Babl *space = babl_conversion_get_source_space (conversion);
+  const float m_0_0 = space->space.RGBtoXYZf[0] / D50_WHITE_REF_X;
+  const float m_0_1 = space->space.RGBtoXYZf[1] / D50_WHITE_REF_X;
+  const float m_0_2 = space->space.RGBtoXYZf[2] / D50_WHITE_REF_X;
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y;
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y;
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y;
+  const float m_2_0 = space->space.RGBtoXYZf[6] / D50_WHITE_REF_Z;
+  const float m_2_1 = space->space.RGBtoXYZf[7] / D50_WHITE_REF_Z;
+  const float m_2_2 = space->space.RGBtoXYZf[8] / D50_WHITE_REF_Z;
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t) src % 16) + ((uintptr_t) dst % 16) == 0)
+    {
+      const long    n = (samples / 4) * 4;
+      const __m128 m_0_0_v = _mm_set1_ps (m_0_0);
+      const __m128 m_0_1_v = _mm_set1_ps (m_0_1);
+      const __m128 m_0_2_v = _mm_set1_ps (m_0_2);
+      const __m128 m_1_0_v = _mm_set1_ps (m_1_0);
+      const __m128 m_1_1_v = _mm_set1_ps (m_1_1);
+      const __m128 m_1_2_v = _mm_set1_ps (m_1_2);
+      const __m128 m_2_0_v = _mm_set1_ps (m_2_0);
+      const __m128 m_2_1_v = _mm_set1_ps (m_2_1);
+      const __m128 m_2_2_v = _mm_set1_ps (m_2_2);
+
+      for ( ; i < n; i += 4)
+        {
+          __m128 Laba0;
+          __m128 Laba1;
+          __m128 Laba2;
+          __m128 Laba3;
+
+          __m128 rgba0 = _mm_load_ps (src);
+          __m128 rgba1 = _mm_load_ps (src + 4);
+          __m128 rgba2 = _mm_load_ps (src + 8);
+          __m128 rgba3 = _mm_load_ps (src + 12);
+
+          __m128 r = rgba0;
+          __m128 g = rgba1;
+          __m128 b = rgba2;
+          __m128 a = rgba3;
+          _MM_TRANSPOSE4_PS (r, g, b, a);
+
+          {
+            __m128 xr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_0_0_v, r), _mm_mul_ps (m_0_1_v, g)),
+                                    _mm_mul_ps (m_0_2_v, b));
+            __m128 yr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_1_0_v, r), _mm_mul_ps (m_1_1_v, g)),
+                                    _mm_mul_ps (m_1_2_v, b));
+            __m128 zr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_2_0_v, r), _mm_mul_ps (m_2_1_v, g)),
+                                    _mm_mul_ps (m_2_2_v, b));
+
+            __m128 fx = lab_r_to_f_sse2 (xr);
+            __m128 fy = lab_r_to_f_sse2 (yr);
+            __m128 fz = lab_r_to_f_sse2 (zr);
+
+            __m128 L = _mm_sub_ps (_mm_mul_ps (_mm_set1_ps (116.0f), fy), _mm_set1_ps (16.0f));
+            __m128 A = _mm_mul_ps (_mm_set1_ps (500.0f), _mm_sub_ps (fx, fy));
+            __m128 B = _mm_mul_ps (_mm_set1_ps (200.0f), _mm_sub_ps (fy, fz));
+
+            Laba0 = L;
+            Laba1 = A;
+            Laba2 = B;
+            Laba3 = a;
+            _MM_TRANSPOSE4_PS (Laba0, Laba1, Laba2, Laba3);
+          }
+
+          _mm_store_ps (dst, Laba0);
+          _mm_store_ps (dst + 4, Laba1);
+          _mm_store_ps (dst + 8, Laba2);
+          _mm_store_ps (dst + 12, Laba3);
+
+          src += 16;
+          dst += 16;
+        }
+    }
+
+  remainder = samples - i;
+  while (remainder--)
+    {
+      float r = src[0];
+      float g = src[1];
+      float b = src[2];
+      float a = src[3];
+
+      float xr = m_0_0 * r + m_0_1 * g + m_0_2 * b;
+      float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
+      float zr = m_2_0 * r + m_2_1 * g + m_2_2 * b;
+
+      float fx = xr > LAB_EPSILON ? _cbrtf (xr) : (LAB_KAPPA * xr + 16.0f) / 116.0f;
+      float fy = yr > LAB_EPSILON ? _cbrtf (yr) : (LAB_KAPPA * yr + 16.0f) / 116.0f;
+      float fz = zr > LAB_EPSILON ? _cbrtf (zr) : (LAB_KAPPA * zr + 16.0f) / 116.0f;
+
+      float L = 116.0f * fy - 16.0f;
+      float A = 500.0f * (fx - fy);
+      float B = 200.0f * (fy - fz);
+
+      dst[0] = L;
+      dst[1] = A;
+      dst[2] = B;
+      dst[3] = a;
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+#endif /* defined(USE_SSE2) */
+
 static void
 conversions (void)
 {
@@ -1200,6 +1401,20 @@ conversions (void)
     NULL
   );
 
+#if defined(USE_SSE2)
+
+  if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2)
+    {
+      babl_conversion_new (
+        babl_format ("RGBA float"),
+        babl_format ("CIE Lab alpha float"),
+        "linear", rgbaf_to_Labaf_sse2,
+        NULL
+      );
+    }
+
+#endif /* defined(USE_SSE2) */
+
   rgbcie_init ();
 }
 
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index a2a2943..a066e8d 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -71,6 +71,7 @@ fast_float_la_SOURCES = fast-float.c
 LIBS =  $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la \
        $(MATH_LIB) $(THREAD_LIB)
 
+CIE_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]