[babl] Add SSE2 conversions
- From: Daniel Sabo <daniels src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl] Add SSE2 conversions
- Date: Sat, 6 Apr 2013 03:58:47 +0000 (UTC)
commit 4054de8477b3ba4addb69cf9783dfb611f60faa6
Author: Daniel Sabo <DanielSabo gmail com>
Date: Sat Mar 30 08:48:52 2013 -0700
Add SSE2 conversions
This patch includes two conversions for RaGaBaA -> RGBA. Depending
on the CPU either spin or shuffle is significantly faster. Unless
I can find a consistently fast version I'm going to let them fight
it out in the babl startup benchmarks.
configure.ac | 29 +++++-
extensions/Makefile.am | 18 ++-
extensions/sse2-float.c | 299 +++++++++++++++++++++++++++++++++++++++++++++++
extensions/sse2-int16.c | 186 +++++++++++++++++++++++++++++
4 files changed, 525 insertions(+), 7 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index ce5a872..296ec27 100644
--- a/configure.ac
+++ b/configure.ac
@@ -294,9 +294,14 @@ AC_ARG_ENABLE(sse,
[ --enable-sse enable SSE support (default=auto)],,
enable_sse=$enable_mmx)
+AC_ARG_ENABLE(sse2,
+ [ --enable-sse2 enable SSE2 support (default=auto)],,
+ enable_sse2=$enable_sse)
+
if test "x$enable_mmx" = xyes; then
BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx')
SSE_EXTRA_CFLAGS=
+ SSE2_EXTRA_CFLAGS=
AC_MSG_CHECKING(whether we can compile MMX code)
@@ -309,8 +314,11 @@ if test "x$enable_mmx" = xyes; then
AC_MSG_RESULT(yes)
if test "x$enable_sse" = xyes; then
+ BABL_DETECT_CFLAGS(fpmath_flag, '-mfpmath=sse')
+ SSE_EXTRA_CFLAGS="$MMX_EXTRA_CFLAGS $fpmath_flag"
+
BABL_DETECT_CFLAGS(sse_flag, '-msse')
- SSE_EXTRA_CFLAGS="$MMX_EXTRA_CFLAGS $sse_flag"
+ SSE_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse_flag"
AC_MSG_CHECKING(whether we can compile SSE code)
@@ -325,6 +333,24 @@ if test "x$enable_mmx" = xyes; then
AC_MSG_WARN([The assembler does not support the SSE command set.])
)
+ if test "x$enable_sse2" = xyes; then
+ BABL_DETECT_CFLAGS(sse2_flag, '-msse2')
+ SSE2_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse2_flag"
+
+ AC_MSG_CHECKING(whether we can compile SSE2 code)
+
+ CFLAGS="$CFLAGS $sse2_flag"
+
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("punpckhwd %xmm0,%xmm1");])],
+ AC_DEFINE(USE_SSE2, 1, [Define to 1 if SSE2 assembly is available.])
+ AC_MSG_RESULT(yes)
+ ,
+ enable_sse2=no
+ AC_MSG_RESULT(no)
+ AC_MSG_WARN([The assembler does not support the SSE2 command set.])
+ )
+ fi
+
fi
,
enable_mmx=no
@@ -336,6 +362,7 @@ if test "x$enable_mmx" = xyes; then
AC_SUBST(MMX_EXTRA_CFLAGS)
AC_SUBST(SSE_EXTRA_CFLAGS)
+ AC_SUBST(SSE2_EXTRA_CFLAGS)
fi
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index 2636f17..30ac8c5 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -21,16 +21,18 @@ ext_LTLIBRARIES = \
gggl-lies.la \
gggl.la \
gimp-8bit.la \
- float.la \
- fast-float.la \
+ float.la \
+ fast-float.la \
naive-CMYK.la \
- HSV.la \
+ HSV.la \
simple.la \
- sse-fixups.la
+ sse-fixups.la \
+ sse2-float.la \
+ sse2-int16.la
cairo_la_SOURCES = cairo.c
CIE_la_SOURCES = CIE.c
-expar_la_SOURCES = expar.c
+simple_la_SOURCES = simple.c
gegl_fixups_la_SOURCES = gegl-fixups.c
gggl_lies_la_SOURCES = gggl-lies.c
gggl_la_SOURCES = gggl.c
@@ -38,9 +40,13 @@ gimp_8bit_la_SOURCES = gimp-8bit.c
naive_CMYK_la_SOURCES = naive-CMYK.c
HSV_la_SOURCES = HSV.c
sse_fixups_la_SOURCES = sse-fixups.c
+sse2_float_la_SOURCES = sse2-float.c
+sse2_int16_la_SOURCES = sse2-int16.c
float_la_SOURCES = float.c
fast_float_la_SOURCES = fast-float.c
LIBS = $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la $(MATH_LIB)
-sse_fixups_la_CFLAGS = $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS)
+sse_fixups_la_CFLAGS = $(SSE_EXTRA_CFLAGS)
+sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
+sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c
new file mode 100644
index 0000000..954e359
--- /dev/null
+++ b/extensions/sse2-float.c
@@ -0,0 +1,299 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2013 Massimo Valentini
+ * Copyright (C) 2013 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE2)
+
+/* SSE 2 */
+#include <emmintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "base/util.h"
+#include "extensions/util.h"
+/* Q(a): brace initializer that repeats the same value four times.
+ * NOTE(review): not referenced anywhere else in this file. */
+#define Q(a) { a, a, a, a }
+/* BABL_ALPHA_THRESHOLD converted to float once.
+ * NOTE(review): BABL_ALPHA_THRESHOLD comes from one of the headers included
+ * above, and this constant is not referenced anywhere else in this file. */
+static const float BABL_ALPHA_THRESHOLD_FLOAT = (float)BABL_ALPHA_THRESHOLD;
+/* Premultiply alpha: for every sample of four floats, scale the first three
+ * components by the fourth and pass the fourth through unchanged.
+ *
+ * src:     input, 4 floats per sample (registered in init() as RGBA float)
+ * dst:     output, 4 floats per sample (registered in init() as RaGaBaA float)
+ * samples: number of 4-float samples to convert
+ *
+ * Returns samples.
+ *
+ * The vector loop is taken only when both pointers are 16-byte aligned and
+ * consumes samples in pairs; the scalar loop after it converts whatever is
+ * left (every sample when the vector loop was skipped).
+ */
+static long
+conv_rgbaF_linear_rgbAF_linear (const float *src, float *dst, long samples)
+{
+ long i = 0; /* number of samples already converted */
+ long remainder;
+
+ if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+ {
+ const long n = (samples / 2) * 2; /* largest even count <= samples */
+ const __v4sf *s = (const __v4sf*) src;
+ __v4sf *d = (__v4sf*)dst;
+
+ for ( ; i < n; i += 2)
+ {
+ __v4sf rbaa0, rbaa1;
+
+ __v4sf rgba0 = *s++;
+ __v4sf rgba1 = *s++;
+
+ /* Expand alpha: broadcast lane 3 into all four lanes */
+ __v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3));
+ __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3));
+
+ /* Premultiply (this also squares lane 3, which is repaired below) */
+ rgba0 = rgba0 * aaaa0;
+ rgba1 = rgba1 * aaaa1;
+
+ /* Shuffle the original alpha value back in.
+ * Step 1 builds { product[0], product[2], A, A }. */
+ rbaa0 = _mm_shuffle_ps(rgba0, aaaa0, _MM_SHUFFLE(0, 0, 2, 0));
+ rbaa1 = _mm_shuffle_ps(rgba1, aaaa1, _MM_SHUFFLE(0, 0, 2, 0));
+ /* Step 2 keeps lanes 0-1 of the product and appends lanes 1-2 of the
+ * temporary, giving { product[0], product[1], product[2], A }. */
+ rgba0 = _mm_shuffle_ps(rgba0, rbaa0, _MM_SHUFFLE(2, 1, 1, 0));
+ rgba1 = _mm_shuffle_ps(rgba1, rbaa1, _MM_SHUFFLE(2, 1, 1, 0));
+
+ *d++ = rgba0;
+ *d++ = rgba1;
+ }
+ _mm_empty (); /* NOTE(review): _mm_empty() is the MMX EMMS intrinsic; nothing above appears to use MMX registers, so this may be unnecessary -- confirm */
+ }
+ /* Scalar tail: skip what the vector loop produced, convert the rest. */
+ dst += i * 4;
+ src += i * 4;
+ remainder = samples - i;
+ while (remainder--)
+ {
+ const float a = src[3];
+ dst[0] = src[0] * a;
+ dst[1] = src[1] * a;
+ dst[2] = src[2] * a;
+ dst[3] = a;
+
+ src += 4;
+ dst += 4;
+ }
+
+ return samples;
+}
+/* Un-premultiply alpha ("shuffle" variant): for every sample of four floats,
+ * multiply the first three components by 1/alpha and pass alpha (the fourth
+ * float) through unchanged.  When alpha <= 0 the first three outputs are 0.
+ *
+ * src:     input, 4 floats per sample (registered in init() as RaGaBaA float)
+ * dst:     output, 4 floats per sample (registered in init() as RGBA float)
+ * samples: number of 4-float samples to convert
+ *
+ * Returns samples.
+ *
+ * When both pointers are 16-byte aligned the vector loop converts every
+ * sample, one per iteration, and the scalar loop has nothing left to do;
+ * otherwise the scalar loop converts everything.
+ */
+static long
+conv_rgbAF_linear_rgbaF_linear_shuffle (const float *src, float *dst, long samples)
+{
+ long i = 0; /* number of samples already converted */
+ long remainder;
+
+ if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+ {
+ const long n = samples;
+ const __v4sf *s = (const __v4sf*) src;
+ __v4sf *d = (__v4sf*)dst;
+
+ for ( ; i < n; i += 1)
+ {
+ __v4sf pre_rgba0, rgba0, rbaa0, raaaa0;
+
+ float alpha0 = ((float *)s)[3]; /* scalar read of this sample's alpha */
+ pre_rgba0 = *s;
+
+ if (alpha0 <= 0.0f)
+ {
+ /* Zero RGB */
+ rgba0 = _mm_setzero_ps();
+ }
+ else
+ {
+ float recip0 = 1.0f/alpha0;
+
+ /* Expand reciprocal: broadcast 1/alpha into all four lanes */
+ raaaa0 = _mm_load1_ps(&recip0);
+
+ /* Un-Premultiply (lane 3 becomes alpha * 1/alpha and is replaced below) */
+ rgba0 = pre_rgba0 * raaaa0;
+ }
+
+ /* Shuffle the original alpha value back in: first build
+ * { rgba0[0], rgba0[2], A, A } with A taken from the untouched input,
+ * then combine lanes 0-1 of rgba0 with lanes 1-2 of that temporary. */
+ rbaa0 = _mm_shuffle_ps(rgba0, pre_rgba0, _MM_SHUFFLE(3, 3, 2, 0));
+ rgba0 = _mm_shuffle_ps(rgba0, rbaa0, _MM_SHUFFLE(2, 1, 1, 0));
+
+ s++;
+ *d++ = rgba0;
+ }
+ _mm_empty (); /* NOTE(review): _mm_empty() is the MMX EMMS intrinsic; nothing above appears to use MMX registers, so this may be unnecessary -- confirm */
+ }
+ /* Scalar path: skip what the vector loop produced, convert the rest. */
+ dst += i * 4;
+ src += i * 4;
+ remainder = samples - i;
+ while (remainder--)
+ {
+ float alpha = src[3];
+ float recip;
+ if (alpha <= 0.0f)
+ recip = 0.0f; /* same result as the vector path: colour components become 0 */
+ else
+ recip = 1.0f/alpha;
+ dst[0] = src[0] * recip;
+ dst[1] = src[1] * recip;
+ dst[2] = src[2] * recip;
+ dst[3] = alpha;
+
+ src += 4;
+ dst += 4;
+ }
+
+ return samples;
+}
+/* Un-premultiply alpha ("spin" variant): same contract as the shuffle
+ * variant, but the sample is reversed so that alpha sits in lane 0, where
+ * the scalar (_ss) intrinsics can compare, divide and re-insert it, and is
+ * reversed back before being stored.
+ *
+ * src:     input, 4 floats per sample (registered in init() as RaGaBaA float)
+ * dst:     output, 4 floats per sample (registered in init() as RGBA float)
+ * samples: number of 4-float samples to convert
+ *
+ * Returns samples.  When alpha <= 0 the first three outputs are 0.
+ *
+ * When both pointers are 16-byte aligned the vector loop converts every
+ * sample; otherwise the scalar loop converts everything.
+ */
+static long
+conv_rgbAF_linear_rgbaF_linear_spin (const float *src, float *dst, long samples)
+{
+ long i = 0; /* number of samples already converted */
+ long remainder;
+
+ if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+ {
+ const long n = samples;
+ const __v4sf *s = (const __v4sf*) src;
+ __v4sf *d = (__v4sf*)dst;
+ const __v4sf zero = _mm_setzero_ps();
+ const __v4sf one = _mm_set_ss(1.0f); /* 1.0 in lane 0, zeros elsewhere */
+
+ for ( ; i < n; i += 1)
+ {
+ __v4sf pre_abgr0, abgr0, rgba0, raaaa0;
+
+
+ rgba0 = *s;
+ /* Rotate to ABGR: reverse the lane order so alpha lands in lane 0 */
+ pre_abgr0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(0, 1, 2, 3));
+
+ if (_mm_ucomile_ss(pre_abgr0, zero)) /* lane 0 (alpha) <= 0 */
+ {
+ /* Zero RGB */
+ abgr0 = zero;
+ }
+ else
+ {
+ /* Reciprocal of alpha, computed in lane 0 only */
+ raaaa0 = _mm_div_ss(one, pre_abgr0);
+
+ /* Expand reciprocal: broadcast lane 0 into all four lanes */
+ raaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)raaaa0, _MM_SHUFFLE(0, 0, 0, 0));
+
+ /* Un-Premultiply */
+ abgr0 = pre_abgr0 * raaaa0;
+ }
+
+ /* Move the original alpha value back in (replaces lane 0 only) */
+ abgr0 = _mm_move_ss(abgr0, pre_abgr0);
+
+ /* Rotate back to RGBA: reverse the lane order again */
+ rgba0 = (__v4sf)_mm_shuffle_epi32((__m128i)abgr0, _MM_SHUFFLE(0, 1, 2, 3));
+
+ *d++ = rgba0;
+ s++;
+ }
+ _mm_empty (); /* NOTE(review): _mm_empty() is the MMX EMMS intrinsic; nothing above appears to use MMX registers, so this may be unnecessary -- confirm */
+ }
+ /* Scalar path: skip what the vector loop produced, convert the rest. */
+ dst += i * 4;
+ src += i * 4;
+ remainder = samples - i;
+ while (remainder--)
+ {
+ float alpha = src[3];
+ float recip;
+ if (alpha <= 0.0f)
+ recip = 0.0f; /* same result as the vector path: colour components become 0 */
+ else
+ recip = 1.0f/alpha;
+ dst[0] = src[0] * recip;
+ dst[1] = src[1] * recip;
+ dst[2] = src[2] * recip;
+ dst[3] = alpha;
+
+ src += 4;
+ dst += 4;
+ }
+
+ return samples;
+}
+
+#endif /* defined(USE_SSE2) */
+/* o(src, dst): shorthand that token-pastes the two argument names into
+ * conv_<src>_<dst> and passes that function to babl_conversion_new() under
+ * the key "linear".
+ * NOTE(review): init() in this file calls babl_conversion_new() directly, so
+ * this macro is not used here. */
+#define o(src, dst) \
+ babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
+/* init: when USE_SSE2 is defined, creates the float RGBA and float RaGaBaA
+ * formats and, if babl_cpu_accel_get_support() reports both SSE and SSE2,
+ * registers the three conversions defined in this file.  Without USE_SSE2
+ * the body is empty.  Takes no arguments and always returns 0.
+ * NOTE(review): presumably the hook babl calls when it loads this
+ * extension -- confirm against the extension loader. */
+int init (void);
+
+int
+init (void)
+{
+#if defined(USE_SSE2)
+
+ const Babl *rgbaF_linear = babl_format_new (
+ babl_model ("RGBA"),
+ babl_type ("float"),
+ babl_component ("R"),
+ babl_component ("G"),
+ babl_component ("B"),
+ babl_component ("A"),
+ NULL);
+ const Babl *rgbAF_linear = babl_format_new (
+ babl_model ("RaGaBaA"),
+ babl_type ("float"),
+ babl_component ("Ra"),
+ babl_component ("Ga"),
+ babl_component ("Ba"),
+ babl_component ("A"),
+ NULL);
+
+ if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) &&
+ (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
+
+ {
+ babl_conversion_new(rgbaF_linear,
+ rgbAF_linear,
+ "linear",
+ conv_rgbaF_linear_rgbAF_linear,
+ NULL);
+
+ /* Which of these is faster varies by CPU, and the difference
+ * is big enough that it's worthwhile to include both and
+ * let them fight it out in the babl benchmarks.
+ */
+ babl_conversion_new(rgbAF_linear,
+ rgbaF_linear,
+ "linear",
+ conv_rgbAF_linear_rgbaF_linear_shuffle,
+ NULL);
+ babl_conversion_new(rgbAF_linear,
+ rgbaF_linear,
+ "linear",
+ conv_rgbAF_linear_rgbaF_linear_spin,
+ NULL);
+ }
+
+#endif /* defined(USE_SSE2) */
+
+ return 0;
+}
+
diff --git a/extensions/sse2-int16.c b/extensions/sse2-int16.c
new file mode 100644
index 0000000..252d1a7
--- /dev/null
+++ b/extensions/sse2-int16.c
@@ -0,0 +1,186 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2013 Massimo Valentini
+ * Copyright (C) 2013 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE2)
+
+/* SSE 2 */
+#include <emmintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+/* Q(a): brace initializer that repeats the same value four times. */
+#define Q(a) { a, a, a, a }
+static const __v4sf u16_float = Q (1.f / 65535); /* 1/65535 in every lane; used below to scale converted 16-bit values */
+/* Convert 16-bit unsigned components to float by multiplying each value
+ * by 1/65535.
+ *
+ * src:     input, 4 uint16_t per sample (registered in init() as RGBA u16)
+ * dst:     output, 4 floats per sample (registered in init() as RGBA float)
+ * samples: number of 4-component samples to convert
+ *
+ * Returns samples.
+ *
+ * When both pointers are 16-byte aligned, each vector iteration loads one
+ * 128-bit word (8 components, i.e. 2 samples) and stores two float vectors.
+ * Components not covered by the vector loop are converted one at a time by
+ * the final for loop.
+ */
+static long
+conv_rgba16_linear_rgbaF_linear (const uint16_t *src, float *dst, long samples)
+{
+ long i = 0; /* vector iterations done; rescaled to a component index before the tail loop */
+
+ if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+ {
+ long n = (samples / 2) * 2; /* largest even count <= samples */
+ const __m128i *s = (const __m128i*) src;
+ __v4sf *d = (__v4sf*) dst;
+
+ for (; i < n / 2; i++)
+ {
+ /* Expand shorts to ints by loading zero in the high bits
+ * (low four components into t0, high four into t1) */
+ const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
+ const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
+
+ /* Convert to float */
+ const __m128 u0 = _mm_cvtepi32_ps (t0);
+ const __m128 u1 = _mm_cvtepi32_ps (t1);
+ /* Multiply by 1 / 65535 */
+ const __v4sf rgba0 = u0 * u16_float;
+ const __v4sf rgba1 = u1 * u16_float;
+
+ d[2 * i + 0] = rgba0;
+ d[2 * i + 1] = rgba1;
+ }
+ _mm_empty(); /* NOTE(review): _mm_empty() is the MMX EMMS intrinsic; nothing above appears to use MMX registers, so this may be unnecessary -- confirm */
+ }
+ /* Tail: each vector iteration covered 2 * 4 components, so i * 8 is the first unconverted component. */
+ for (i *= 2 * 4; i != 4 * samples; i++)
+ dst[i] = src[i] * (1.f / 65535);
+
+ return samples;
+}
+/* Convert 16-bit unsigned components to float and premultiply by alpha in
+ * one pass: the first three outputs are (component / 65535) * (alpha / 65535)
+ * and the fourth output is alpha / 65535.
+ *
+ * src:     input, 4 uint16_t per sample (registered in init() as RGBA u16)
+ * dst:     output, 4 floats per sample (registered in init() as RaGaBaA float)
+ * samples: number of 4-component samples to convert
+ *
+ * Returns samples.
+ *
+ * When both pointers are 16-byte aligned, each vector iteration converts
+ * two samples; the scalar loop after it converts whatever is left (every
+ * sample when the vector loop was skipped).
+ */
+static long
+conv_rgba16_linear_rgbAF_linear (const uint16_t *src, float *dst, long samples)
+{
+ long i = 0; /* vector iterations done (two samples each) */
+ long remainder;
+
+ if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+ {
+ long n = (samples / 2) * 2; /* largest even count <= samples */
+ const __m128i *s = (const __m128i*) src;
+ __v4sf *d = (__v4sf*) dst;
+
+ const __v4sf max_mask = { 0.0f, 0.0f, 0.0f, 1.0f };
+
+ for (; i < n / 2; i++)
+ {
+ /* Expand shorts to ints by loading zero in the high bits
+ * (first sample into t0, second sample into t1) */
+ const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
+ const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
+
+ /* Convert to float */
+ const __m128 u0 = _mm_cvtepi32_ps (t0);
+ const __m128 u1 = _mm_cvtepi32_ps (t1);
+
+ /* Multiply by 1 / 65535 */
+ __v4sf rgba0 = u0 * u16_float;
+ __v4sf rgba1 = u1 * u16_float;
+
+ /* Expand alpha: broadcast lane 3 into all four lanes */
+ __v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3));
+ __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3));
+
+ /* Set the value in the alpha slot to 1.0, we know max is sufficient because alpha was a short
+ * (so the scaled alpha is in [0, 1]: max with 0 keeps lanes 0-2, max with 1 forces lane 3 to 1) */
+ aaaa0 = _mm_max_ps(aaaa0, max_mask);
+ aaaa1 = _mm_max_ps(aaaa1, max_mask);
+
+ /* Premultiply (lane 3 is multiplied by 1.0, so alpha is preserved) */
+ rgba0 = rgba0 * aaaa0;
+ rgba1 = rgba1 * aaaa1;
+
+ d[2 * i + 0] = rgba0;
+ d[2 * i + 1] = rgba1;
+ }
+ _mm_empty(); /* NOTE(review): _mm_empty() is the MMX EMMS intrinsic; nothing above appears to use MMX registers, so this may be unnecessary -- confirm */
+ }
+ /* Scalar tail: each vector iteration covered 2 samples of 4 components. */
+ dst += i * 2 * 4;
+ src += i * 2 * 4;
+ remainder = samples - (i * 2);
+ while (remainder--)
+ {
+ const float a = src[3] / 65535.0f;
+ const float a_term = a / 65535.0f; /* folds the component's own 1/65535 scale into the alpha factor */
+ dst[0] = src[0] * a_term;
+ dst[1] = src[1] * a_term;
+ dst[2] = src[2] * a_term;
+ dst[3] = a;
+
+ src += 4;
+ dst += 4;
+ }
+
+ return samples;
+}
+
+#endif /* defined(USE_SSE2) */
+/* o(src, dst): shorthand that token-pastes the two argument names into
+ * conv_<src>_<dst> and passes that function to babl_conversion_new() under
+ * the key "linear".  The arguments must therefore be the variable names
+ * used in init(), since they also form the conversion function's name. */
+#define o(src, dst) \
+ babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
+/* init: when USE_SSE2 is defined, creates the float RGBA, float RaGaBaA and
+ * u16 RGBA formats and, if babl_cpu_accel_get_support() reports both SSE and
+ * SSE2, registers the two u16 -> float conversions defined in this file.
+ * Without USE_SSE2 the body is empty.  Takes no arguments and always
+ * returns 0.
+ * NOTE(review): presumably the hook babl calls when it loads this
+ * extension -- confirm against the extension loader. */
+int init (void);
+
+int
+init (void)
+{
+#if defined(USE_SSE2)
+
+ const Babl *rgbaF_linear = babl_format_new (
+ babl_model ("RGBA"),
+ babl_type ("float"),
+ babl_component ("R"),
+ babl_component ("G"),
+ babl_component ("B"),
+ babl_component ("A"),
+ NULL);
+ const Babl *rgbAF_linear = babl_format_new (
+ babl_model ("RaGaBaA"),
+ babl_type ("float"),
+ babl_component ("Ra"),
+ babl_component ("Ga"),
+ babl_component ("Ba"),
+ babl_component ("A"),
+ NULL);
+ const Babl *rgba16_linear = babl_format_new (
+ babl_model ("RGBA"),
+ babl_type ("u16"),
+ babl_component ("R"),
+ babl_component ("G"),
+ babl_component ("B"),
+ babl_component ("A"),
+ NULL);
+
+ if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) &&
+ (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
+ {
+ o (rgba16_linear, rgbaF_linear); /* registers conv_rgba16_linear_rgbaF_linear */
+ o (rgba16_linear, rgbAF_linear); /* registers conv_rgba16_linear_rgbAF_linear */
+ }
+
+#endif /* defined(USE_SSE2) */
+
+ return 0;
+}
+
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]