[babl/sse-conversions-2013: 5/8] Float, Int16 SSE conversions, draft 1



commit 61511730f20b5aa5d5c4477c7f59a06824c9066e
Author: Daniel Sabo <DanielSabo gmail com>
Date:   Sat Mar 30 08:48:52 2013 -0700

    Float, Int16 SSE conversions, draft 1

 extensions/Makefile.am |   14 +++-
 extensions/sse-float.c |  206 ++++++++++++++++++++++++++++++++++++++++++++++++
 extensions/sse-int16.c |  195 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 411 insertions(+), 4 deletions(-)
---
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index 23ffbab..333705a 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -20,12 +20,14 @@ ext_LTLIBRARIES = \
        gegl-fixups.la  \
        gggl.la         \
        gimp-8bit.la    \
-       float.la    \
-       fast-float.la    \
+       float.la        \
+       fast-float.la   \
        naive-CMYK.la   \
-       HSV.la   \
+       HSV.la          \
        simple.la       \
-       sse-fixups.la
+       sse-fixups.la   \
+       sse-float.la    \
+       sse-int16.la
 
 cairo_la_SOURCES = cairo.c
 CIE_la_SOURCES = CIE.c
@@ -36,9 +38,13 @@ gimp_8bit_la_SOURCES = gimp-8bit.c
 naive_CMYK_la_SOURCES = naive-CMYK.c
 HSV_la_SOURCES = HSV.c
 sse_fixups_la_SOURCES = sse-fixups.c
+sse_float_la_SOURCES = sse-float.c
+sse_int16_la_SOURCES = sse-int16.c
 float_la_SOURCES = float.c
 fast_float_la_SOURCES = fast-float.c
 
 LIBS = $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la $(MATH_LIB)
 
 sse_fixups_la_CFLAGS = $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS)
+sse_float_la_CFLAGS = $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS)
+sse_int16_la_CFLAGS = $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS)
diff --git a/extensions/sse-float.c b/extensions/sse-float.c
new file mode 100644
index 0000000..f74a33a
--- /dev/null
+++ b/extensions/sse-float.c
@@ -0,0 +1,206 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2013 Massimo Valentini
+ * Copyright (C) 2013 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX)
+
+#include <xmmintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+#define Q(a) { a, a, a, a } /* four-element initializer repeating one value; not referenced by the code this patch adds to sse-float.c */
+
+/* Converts `samples` RGBA float pixels read from src into premultiplied
+ * alpha (RaGaBaA) float pixels written to dst: every colour channel is
+ * multiplied by the pixel's alpha and alpha itself is copied through.
+ *
+ * When both pointers are 16-byte aligned, pixels are processed four at a
+ * time: the four pixels are transposed into one vector per channel, the
+ * colour vectors are multiplied by the alpha vector, and the result is
+ * transposed back into interleaved order.  Any remaining pixels (or all
+ * of them, for unaligned buffers) go through the scalar loop.
+ *
+ * Returns samples.
+ */
+static long
+conv_rgbaF_linear_rgbAF_linear (const float *src, float *dst, long samples)
+{
+  long done = 0;
+  long left;
+
+  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+    {
+      const long    vec_pixels = (samples / 4) * 4;
+      const __v4sf *in         = (const __v4sf*) src;
+            __v4sf *out        = (__v4sf*) dst;
+
+      while (done < vec_pixels)
+        {
+          /* One vector per pixel: { r, g, b, a } */
+          const __v4sf p0 = in[done + 0];
+          const __v4sf p1 = in[done + 1];
+          const __v4sf p2 = in[done + 2];
+          const __v4sf p3 = in[done + 3];
+
+          /* First interleave step: pair up pixels 0/1 and 2/3 */
+          const __v4sf rg_01 = _mm_unpacklo_ps (p0, p1);
+          const __v4sf ba_01 = _mm_unpackhi_ps (p0, p1);
+          const __v4sf rg_23 = _mm_unpacklo_ps (p2, p3);
+          const __v4sf ba_23 = _mm_unpackhi_ps (p2, p3);
+
+          /* Second step: one channel per vector, pixel order 0, 2, 1, 3 */
+          const __v4sf red   = _mm_unpacklo_ps (rg_01, rg_23);
+          const __v4sf green = _mm_unpackhi_ps (rg_01, rg_23);
+          const __v4sf blue  = _mm_unpacklo_ps (ba_01, ba_23);
+          const __v4sf alpha = _mm_unpackhi_ps (ba_01, ba_23);
+
+          /* Premultiply */
+          const __v4sf red_a   = red   * alpha;
+          const __v4sf green_a = green * alpha;
+          const __v4sf blue_a  = blue  * alpha;
+
+          /* Interleave back towards { R, G, B, a } per pixel */
+          const __v4sf rb_02 = _mm_unpacklo_ps (red_a, blue_a);
+          const __v4sf rb_13 = _mm_unpackhi_ps (red_a, blue_a);
+          const __v4sf ga_02 = _mm_unpacklo_ps (green_a, alpha);
+          const __v4sf ga_13 = _mm_unpackhi_ps (green_a, alpha);
+
+          out[done + 0] = _mm_unpacklo_ps (rb_02, ga_02);
+          out[done + 1] = _mm_unpacklo_ps (rb_13, ga_13);
+          out[done + 2] = _mm_unpackhi_ps (rb_02, ga_02);
+          out[done + 3] = _mm_unpackhi_ps (rb_13, ga_13);
+
+          done += 4;
+        }
+      _mm_empty ();
+    }
+
+  /* Scalar path for whatever the vector loop did not cover */
+  src += done * 4;
+  dst += done * 4;
+  for (left = samples - done; left != 0; left--)
+    {
+      const float alpha = src[3];
+
+      dst[0] = src[0] * alpha;
+      dst[1] = src[1] * alpha;
+      dst[2] = src[2] * alpha;
+      dst[3] = alpha;
+
+      src += 4;
+      dst += 4;
+    }
+
+  return samples;
+}
+
+/* Converts `samples` premultiplied-alpha (RaGaBaA) float pixels read from
+ * src into straight-alpha RGBA float pixels written to dst: every colour
+ * channel is divided by the pixel's alpha and alpha itself is copied
+ * through.
+ *
+ * A pixel whose alpha is exactly zero gets zero colour channels instead
+ * of the NaN / infinity a division by zero would produce.
+ *
+ * When both pointers are 16-byte aligned, pixels are processed four at a
+ * time: the four pixels are transposed into one vector per channel, the
+ * colour vectors are divided by the alpha vector, and the result is
+ * transposed back into interleaved order.  Any remaining pixels (or all
+ * of them, for unaligned buffers) go through the scalar loop.
+ *
+ * Returns samples.
+ */
+static long
+conv_rgbAF_linear_rgbaF_linear (const float *src, float *dst, long samples)
+{
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+    {
+      const long    n     = (samples / 4) * 4;
+      const __v4sf *s     = (const __v4sf*) src;
+            __v4sf *d     = (__v4sf*)dst;
+      const __v4sf  zeros = _mm_setzero_ps ();
+
+      for ( ; i < n; i += 4)
+        {
+          const __v4sf s0 = s[i + 0];
+          const __v4sf s1 = s[i + 1];
+          const __v4sf s2 = s[i + 2];
+          const __v4sf s3 = s[i + 3];
+
+          /* Shuffle the pixels into a planar layout */
+          const __v4sf rg01 = _mm_unpacklo_ps (s0, s1);
+          const __v4sf ba01 = _mm_unpackhi_ps (s0, s1);
+          const __v4sf rg23 = _mm_unpacklo_ps (s2, s3);
+          const __v4sf ba23 = _mm_unpackhi_ps (s2, s3);
+
+          const __v4sf r0213 = _mm_unpacklo_ps (rg01, rg23);
+          const __v4sf g0213 = _mm_unpackhi_ps (rg01, rg23);
+          const __v4sf b0213 = _mm_unpacklo_ps (ba01, ba23);
+          const __v4sf a0213 = _mm_unpackhi_ps (ba01, ba23);
+
+          /* All bits set in lanes whose alpha is not zero, all bits
+           * clear otherwise; and-ing with it turns the result of a
+           * division by zero into +0.0 and leaves other lanes as is.
+           */
+          const __v4sf nonzero = _mm_cmpneq_ps (a0213, zeros);
+
+          const __v4sf R0213 = _mm_and_ps (r0213 / a0213, nonzero);
+          const __v4sf G0213 = _mm_and_ps (g0213 / a0213, nonzero);
+          const __v4sf B0213 = _mm_and_ps (b0213 / a0213, nonzero);
+
+          /* Shuffle back into interleaved RGBA order */
+          const __v4sf RB02 = _mm_unpacklo_ps (R0213, B0213);
+          const __v4sf RB13 = _mm_unpackhi_ps (R0213, B0213);
+          const __v4sf Ga02 = _mm_unpacklo_ps (G0213, a0213);
+          const __v4sf Ga13 = _mm_unpackhi_ps (G0213, a0213);
+
+          d[i + 0] = _mm_unpacklo_ps (RB02, Ga02);
+          d[i + 1] = _mm_unpacklo_ps (RB13, Ga13);
+          d[i + 2] = _mm_unpackhi_ps (RB02, Ga02);
+          d[i + 3] = _mm_unpackhi_ps (RB13, Ga13);
+        }
+      _mm_empty ();
+    }
+
+  /* Scalar path for whatever the vector loop did not cover */
+  dst += i * 4;
+  src += i * 4;
+  remainder = samples - i;
+  while (remainder--)
+  {
+    const float a = src[3];
+    /* Same zero-alpha rule as the vector path: colour becomes 0 */
+    const float a_term = (a != 0.0f) ? 1.0f / a : 0.0f;
+    dst[0] = src[0] * a_term;
+    dst[1] = src[1] * a_term;
+    dst[2] = src[2] * a_term;
+    dst[3] = a;
+
+    src += 4;
+    dst += 4;
+  }
+
+  return samples;
+}
+
+#endif /* defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX) */
+
+/* Expands to a babl_conversion_new () call for the format pair src, dst,
+ * passing the function named conv_<src>_<dst> under the "linear" key.
+ */
+#define o(src, dst) \
+  babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
+
+int init (void);
+
+/* Extension entry point.
+ *
+ * When the file is built with the GCC / USE_SSE / USE_MMX guard enabled,
+ * this creates the float RGBA and float RaGaBaA formats and, if the
+ * running CPU reports both MMX and SSE support, hands the two float
+ * conversions defined in this file to babl_conversion_new ().  With the
+ * guard disabled the function does nothing.
+ *
+ * Always returns 0.
+ */
+int
+init (void)
+{
+#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX)
+
+  const Babl *rgbaF_linear;
+  const Babl *rgbAF_linear;
+
+  /* Straight-alpha float RGBA */
+  rgbaF_linear = babl_format_new (babl_model ("RGBA"),
+                                  babl_type ("float"),
+                                  babl_component ("R"),
+                                  babl_component ("G"),
+                                  babl_component ("B"),
+                                  babl_component ("A"),
+                                  NULL);
+
+  /* Premultiplied-alpha float RGBA */
+  rgbAF_linear = babl_format_new (babl_model ("RaGaBaA"),
+                                  babl_type ("float"),
+                                  babl_component ("Ra"),
+                                  babl_component ("Ga"),
+                                  babl_component ("Ba"),
+                                  babl_component ("A"),
+                                  NULL);
+
+  /* Only offer the vector code when the CPU reports both feature bits */
+  if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_MMX)
+    {
+      if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE)
+        {
+          o (rgbaF_linear, rgbAF_linear);
+          o (rgbAF_linear, rgbaF_linear);
+        }
+    }
+
+#endif
+
+  return 0;
+}
+
diff --git a/extensions/sse-int16.c b/extensions/sse-int16.c
new file mode 100644
index 0000000..c8ba934
--- /dev/null
+++ b/extensions/sse-int16.c
@@ -0,0 +1,195 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2013 Massimo Valentini
+ * Copyright (C) 2013 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX)
+
+#include <xmmintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+#define Q(a) { a, a, a, a } /* four-element initializer repeating one value */
+static const __v4sf  u16_float = Q (1.f / 65535); /* per-lane factor 1/65535, multiplied into converted u16 samples below */
+static const __m128i zero = { 0 }; /* all-zero vector, interleaved with the u16 samples to widen them to 32 bits */
+
+
+/* Converts `samples` RGBA pixels with 16-bit unsigned components read
+ * from src into RGBA float pixels written to dst, scaling every component
+ * by 1/65535.
+ *
+ * When both pointers are 16-byte aligned, each 128-bit load covers two
+ * pixels (eight u16 components): the components are widened to 32-bit
+ * integers by interleaving with zero, converted to float and scaled.
+ * Components not covered by the vector loop are converted one at a time.
+ *
+ * Returns samples.
+ */
+static long
+conv_rgba16_linear_rgbaF_linear (const uint16_t *src, float *dst, long samples)
+{
+  long vec = 0;   /* number of 128-bit input vectors already converted */
+  long k;
+
+  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+    {
+      const long     even_pixels = (samples / 2) * 2;
+      const __m128i *in          = (const __m128i*) src;
+            __v4sf  *out         = (__v4sf*) dst;
+
+      while (vec < even_pixels / 2)
+        {
+          const __m128i packed = in[vec];
+
+          /* Widen the low and high four shorts to 32-bit integers */
+          const __m128i lo32 = _mm_unpacklo_epi16 (packed, zero);
+          const __m128i hi32 = _mm_unpackhi_epi16 (packed, zero);
+
+          /* Integer -> float */
+          const __m128  lo_f = _mm_cvtepi32_ps (lo32);
+          const __m128  hi_f = _mm_cvtepi32_ps (hi32);
+
+          out[2 * vec + 0] = lo_f * u16_float;
+          out[2 * vec + 1] = hi_f * u16_float;
+
+          vec++;
+        }
+      _mm_empty();
+    }
+
+  /* Each input vector held 2 pixels * 4 components */
+  k = vec * (2 * 4);
+  while (k != 4 * samples)
+    {
+      dst[k] = src[k] * (1.f / 65535);
+      k++;
+    }
+
+  return samples;
+}
+
+/* Converts `samples` RGBA pixels with 16-bit unsigned components read
+ * from src into premultiplied-alpha (RaGaBaA) float pixels written to
+ * dst: every component is scaled by 1/65535 and the colour channels are
+ * then multiplied by the scaled alpha; alpha itself is stored unmodified.
+ *
+ * When both pointers are 16-byte aligned, each 128-bit load covers two
+ * pixels; pixels not covered by the vector loop go through the scalar
+ * loop.
+ *
+ * Returns samples.
+ */
+static long
+conv_rgba16_linear_rgbAF_linear (const uint16_t *src, float *dst, long samples)
+{
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+    {
+      long           n  = (samples / 2) * 2;
+      const __m128i *s  = (const __m128i*) src;
+            __v4sf  *d  = (__v4sf*) dst;
+
+      /* max () against this leaves the three colour lanes alone (alpha
+       * is never negative here) and lifts the alpha lane to 1.0
+       */
+      const __v4sf  max_mask = { 0.0f, 0.0f, 0.0f, 1.0f };
+
+      for (; i < n / 2; i++)
+        {
+          /* Expand shorts to ints by loading zero in the high bits */
+          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], zero);
+          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], zero);
+
+          /* Convert to float */
+          const __m128  u0 = _mm_cvtepi32_ps (t0);
+          const __m128  u1 = _mm_cvtepi32_ps (t1);
+
+          /* Multiply by 1 / 65535 so that 65535 maps to 1.0 */
+          __v4sf rgba0 = u0 * u16_float;
+          __v4sf rgba1 = u1 * u16_float;
+
+          /* Broadcast alpha to all four lanes; the float shuffle keeps
+           * the operands in their own vector type, no cast required
+           */
+          __v4sf aaaa0 = _mm_shuffle_ps (rgba0, rgba0, _MM_SHUFFLE (3, 3, 3, 3));
+          __v4sf aaaa1 = _mm_shuffle_ps (rgba1, rgba1, _MM_SHUFFLE (3, 3, 3, 3));
+
+          /* Set the multiplier in the alpha slot to 1 so that the
+           * multiplication below stores alpha unchanged
+           */
+          aaaa0 = _mm_max_ps (aaaa0, max_mask);
+          aaaa1 = _mm_max_ps (aaaa1, max_mask);
+
+          /* Premultiply */
+          rgba0 = rgba0 * aaaa0;
+          rgba1 = rgba1 * aaaa1;
+
+          d[2 * i + 0] = rgba0;
+          d[2 * i + 1] = rgba1;
+        }
+      _mm_empty();
+    }
+
+  /* Each vector iteration consumed 2 pixels * 4 components */
+  dst += i * 2 * 4;
+  src += i * 2 * 4;
+  remainder = samples - (i * 2);
+  while (remainder--)
+  {
+    const float a = src[3] / 65535.0f;
+    /* a_term folds the 1/65535 colour scale and the alpha multiply */
+    const float a_term = a / 65535.0f;
+    dst[0] = src[0] * a_term;
+    dst[1] = src[1] * a_term;
+    dst[2] = src[2] * a_term;
+    dst[3] = a;
+
+    src += 4;
+    dst += 4;
+  }
+
+  return samples;
+}
+
+#endif
+
+/* Expands to a babl_conversion_new () call for the format pair src, dst,
+ * passing the function named conv_<src>_<dst> under the "linear" key.
+ */
+#define o(src, dst) \
+  babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
+
+int init (void);
+
+/* Extension entry point.
+ *
+ * When the file is built with the GCC / USE_SSE / USE_MMX guard enabled,
+ * this creates the float RGBA, float RaGaBaA and u16 RGBA formats and, if
+ * the running CPU supports the instructions the conversions use, hands
+ * the two u16 -> float conversions defined in this file to
+ * babl_conversion_new ().  With the guard disabled the function does
+ * nothing.
+ *
+ * Always returns 0.
+ */
+int
+init (void)
+{
+#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX)
+
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbAF_linear = babl_format_new (
+    babl_model ("RaGaBaA"),
+    babl_type ("float"),
+    babl_component ("Ra"),
+    babl_component ("Ga"),
+    babl_component ("Ba"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgba16_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("u16"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+
+  /* The conversions widen and convert integers with
+   * _mm_unpacklo_epi16 / _mm_unpackhi_epi16 / _mm_cvtepi32_ps, which are
+   * SSE2 instructions, so SSE2 has to be present at run time in addition
+   * to MMX and SSE; otherwise an SSE-only CPU would be handed code it
+   * cannot execute.
+   */
+  if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_MMX) &&
+      (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) &&
+      (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
+    {
+      o (rgba16_linear, rgbaF_linear);
+      o (rgba16_linear, rgbAF_linear);
+    }
+
+#endif
+
+  return 0;
+}
+


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]