[babl/sse-conversions-2013: 5/8] Float, Int16 SSE conversions, draft 1
- From: Daniel Sabo <daniels src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl/sse-conversions-2013: 5/8] Float, Int16 SSE conversions, draft 1
- Date: Mon, 1 Apr 2013 00:07:39 +0000 (UTC)
commit 61511730f20b5aa5d5c4477c7f59a06824c9066e
Author: Daniel Sabo <DanielSabo gmail com>
Date: Sat Mar 30 08:48:52 2013 -0700
Float, Int16 SSE conversions, draft 1
extensions/Makefile.am | 14 +++-
extensions/sse-float.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++
extensions/sse-int16.c | 195 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 411 insertions(+), 4 deletions(-)
---
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index 23ffbab..333705a 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -20,12 +20,14 @@ ext_LTLIBRARIES = \
gegl-fixups.la \
gggl.la \
gimp-8bit.la \
- float.la \
- fast-float.la \
+ float.la \
+ fast-float.la \
naive-CMYK.la \
- HSV.la \
+ HSV.la \
simple.la \
- sse-fixups.la
+ sse-fixups.la \
+ sse-float.la \
+ sse-int16.la
cairo_la_SOURCES = cairo.c
CIE_la_SOURCES = CIE.c
@@ -36,9 +38,13 @@ gimp_8bit_la_SOURCES = gimp-8bit.c
naive_CMYK_la_SOURCES = naive-CMYK.c
HSV_la_SOURCES = HSV.c
sse_fixups_la_SOURCES = sse-fixups.c
+sse_float_la_SOURCES = sse-float.c
+sse_int16_la_SOURCES = sse-int16.c
float_la_SOURCES = float.c
fast_float_la_SOURCES = fast-float.c
LIBS = $(top_builddir)/babl/libbabl- BABL_API_VERSION@.la $(MATH_LIB)
sse_fixups_la_CFLAGS = $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS)
+sse_float_la_CFLAGS = $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS)
+sse_int16_la_CFLAGS = $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS)
diff --git a/extensions/sse-float.c b/extensions/sse-float.c
new file mode 100644
index 0000000..f74a33a
--- /dev/null
+++ b/extensions/sse-float.c
@@ -0,0 +1,206 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2013 Massimo Valentini
+ * Copyright (C) 2013 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX)
+
+#include <xmmintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+/* Brace initialiser that repeats one value in all four lanes of a 4-element
+ * vector.  Nothing in this file references it. */
+#define Q(a) { a, a, a, a }
+
+/* Convert straight-alpha RGBA float pixels to premultiplied RaGaBaA float.
+ *
+ * src and dst each hold `samples` pixels of four floats (R, G, B, A).
+ * When both pointers are 16-byte aligned, whole groups of four pixels are
+ * converted with SSE; any pixels left over -- or all of them when a pointer
+ * is unaligned -- go through the scalar loop at the end.
+ *
+ * Returns samples.
+ */
+static long
+conv_rgbaF_linear_rgbAF_linear (const float *src, float *dst, long samples)
+{
+  long done = 0;   /* pixels already written by the vector loop */
+  long left;
+
+  if ((((uintptr_t) src | (uintptr_t) dst) & 15) == 0)
+    {
+      const long    vec_end = samples - (samples % 4);
+      const __v4sf *in      = (const __v4sf *) src;
+      __v4sf       *out     = (__v4sf *) dst;
+
+      while (done < vec_end)
+        {
+          /* One vector per pixel: { R, G, B, A } */
+          const __v4sf p0 = in[done + 0];
+          const __v4sf p1 = in[done + 1];
+          const __v4sf p2 = in[done + 2];
+          const __v4sf p3 = in[done + 3];
+
+          /* Transpose, step 1: interleave pixel pairs 0/1 and 2/3 */
+          const __v4sf rg_01 = _mm_unpacklo_ps (p0, p1);
+          const __v4sf ba_01 = _mm_unpackhi_ps (p0, p1);
+          const __v4sf rg_23 = _mm_unpacklo_ps (p2, p3);
+          const __v4sf ba_23 = _mm_unpackhi_ps (p2, p3);
+
+          /* Transpose, step 2: one vector per channel, with the lanes
+           * holding pixels 0, 2, 1, 3 in that order */
+          const __v4sf red   = _mm_unpacklo_ps (rg_01, rg_23);
+          const __v4sf green = _mm_unpackhi_ps (rg_01, rg_23);
+          const __v4sf blue  = _mm_unpacklo_ps (ba_01, ba_23);
+          const __v4sf alpha = _mm_unpackhi_ps (ba_01, ba_23);
+
+          /* Premultiply the colour channels; alpha passes through */
+          const __v4sf red_pre   = _mm_mul_ps (red,   alpha);
+          const __v4sf green_pre = _mm_mul_ps (green, alpha);
+          const __v4sf blue_pre  = _mm_mul_ps (blue,  alpha);
+
+          /* Interleave back to one { R, G, B, A } vector per pixel */
+          const __v4sf rb_02 = _mm_unpacklo_ps (red_pre,   blue_pre);
+          const __v4sf rb_13 = _mm_unpackhi_ps (red_pre,   blue_pre);
+          const __v4sf ga_02 = _mm_unpacklo_ps (green_pre, alpha);
+          const __v4sf ga_13 = _mm_unpackhi_ps (green_pre, alpha);
+
+          out[done + 0] = _mm_unpacklo_ps (rb_02, ga_02);
+          out[done + 1] = _mm_unpacklo_ps (rb_13, ga_13);
+          out[done + 2] = _mm_unpackhi_ps (rb_02, ga_02);
+          out[done + 3] = _mm_unpackhi_ps (rb_13, ga_13);
+
+          done += 4;
+        }
+      _mm_empty ();
+    }
+
+  /* Scalar path for whatever the vector loop did not cover */
+  src += done * 4;
+  dst += done * 4;
+  for (left = samples - done; left != 0; left--)
+    {
+      const float alpha = src[3];
+
+      dst[0] = src[0] * alpha;
+      dst[1] = src[1] * alpha;
+      dst[2] = src[2] * alpha;
+      dst[3] = alpha;
+
+      src += 4;
+      dst += 4;
+    }
+
+  return samples;
+}
+
+/* Convert premultiplied RaGaBaA float pixels to straight-alpha RGBA float.
+ *
+ * src and dst each hold `samples` pixels of four floats.  When both
+ * pointers are 16-byte aligned, whole groups of four pixels are converted
+ * with SSE; the rest (or everything, if unaligned) is handled by the scalar
+ * loop at the end.
+ *
+ * A pixel whose alpha is exactly zero has no recoverable colour; it is
+ * written with R = G = B = 0 and A = 0 rather than the NaN/Inf that an
+ * unguarded division by alpha would produce.
+ *
+ * Returns samples.
+ */
+static long
+conv_rgbAF_linear_rgbaF_linear (const float *src, float *dst, long samples)
+{
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+    {
+      const long    n = (samples / 4) * 4;
+      const __v4sf *s = (const __v4sf*) src;
+            __v4sf *d = (__v4sf*)dst;
+
+      for ( ; i < n; i += 4)
+        {
+          const __v4sf s0 = s[i + 0];
+          const __v4sf s1 = s[i + 1];
+          const __v4sf s2 = s[i + 2];
+          const __v4sf s3 = s[i + 3];
+
+          /* Shuffle the pixels into a planar layout */
+          const __v4sf rg01 = _mm_unpacklo_ps (s0, s1);
+          const __v4sf ba01 = _mm_unpackhi_ps (s0, s1);
+          const __v4sf rg23 = _mm_unpacklo_ps (s2, s3);
+          const __v4sf ba23 = _mm_unpackhi_ps (s2, s3);
+
+          const __v4sf r0213 = _mm_unpacklo_ps (rg01, rg23);
+          const __v4sf g0213 = _mm_unpackhi_ps (rg01, rg23);
+          const __v4sf b0213 = _mm_unpacklo_ps (ba01, ba23);
+          const __v4sf a0213 = _mm_unpackhi_ps (ba01, ba23);
+
+          /* All bits set in lanes where alpha != 0, all clear where it is 0 */
+          const __v4sf nonzero = _mm_cmpneq_ps (a0213, _mm_setzero_ps ());
+
+          /* Divide by alpha, then AND with the mask so that the Inf/NaN
+           * produced in the alpha == 0 lanes is replaced by +0.0 */
+          const __v4sf R0213 = _mm_and_ps (r0213 / a0213, nonzero);
+          const __v4sf G0213 = _mm_and_ps (g0213 / a0213, nonzero);
+          const __v4sf B0213 = _mm_and_ps (b0213 / a0213, nonzero);
+
+          /* Interleave back to one RGBA vector per pixel */
+          const __v4sf RB02 = _mm_unpacklo_ps (R0213, B0213);
+          const __v4sf RB13 = _mm_unpackhi_ps (R0213, B0213);
+          const __v4sf Ga02 = _mm_unpacklo_ps (G0213, a0213);
+          const __v4sf Ga13 = _mm_unpackhi_ps (G0213, a0213);
+
+          d[i + 0] = _mm_unpacklo_ps (RB02, Ga02);
+          d[i + 1] = _mm_unpacklo_ps (RB13, Ga13);
+          d[i + 2] = _mm_unpackhi_ps (RB02, Ga02);
+          d[i + 3] = _mm_unpackhi_ps (RB13, Ga13);
+        }
+      _mm_empty ();
+    }
+
+  dst += i * 4;
+  src += i * 4;
+  remainder = samples - i;
+  while (remainder--)
+    {
+      const float a      = src[3];
+      /* Same zero-alpha rule as the vector path: colour becomes 0 */
+      const float a_term = (a != 0.0f) ? 1.0f / a : 0.0f;
+      dst[0] = src[0] * a_term;
+      dst[1] = src[1] * a_term;
+      dst[2] = src[2] * a_term;
+      dst[3] = a;
+
+      src += 4;
+      dst += 4;
+    }
+
+  return samples;
+}
+
+#endif /* defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX) */
+
+/* Register a "linear" conversion between two formats.  The function name is
+ * built by token-pasting the two argument names (conv_<src>_<dst>), so the
+ * arguments must be variables named exactly like the halves of the
+ * conversion function's name. */
+#define o(src, dst) \
+ babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
+
+/* Prototype for the entry point defined just below. */
+int init (void);
+
+/* Entry point of this extension.
+ *
+ * When the file is built with the GCC/SSE/MMX guard enabled, this creates
+ * the float RGBA and float RaGaBaA formats and, if the running CPU reports
+ * both MMX and SSE support, registers the two conversions between them.
+ * Otherwise it does nothing.  Always returns 0.
+ */
+int
+init (void)
+{
+#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX)
+
+  /* Straight-alpha RGBA, float components */
+  const Babl *rgbaF_linear =
+    babl_format_new (babl_model ("RGBA"),
+                     babl_type ("float"),
+                     babl_component ("R"),
+                     babl_component ("G"),
+                     babl_component ("B"),
+                     babl_component ("A"),
+                     NULL);
+
+  /* Premultiplied-alpha RaGaBaA, float components */
+  const Babl *rgbAF_linear =
+    babl_format_new (babl_model ("RaGaBaA"),
+                     babl_type ("float"),
+                     babl_component ("Ra"),
+                     babl_component ("Ga"),
+                     babl_component ("Ba"),
+                     babl_component ("A"),
+                     NULL);
+
+  /* Register nothing unless the CPU reports both MMX and SSE */
+  if (!(babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_MMX) ||
+      !(babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE))
+    return 0;
+
+  o (rgbaF_linear, rgbAF_linear);
+  o (rgbAF_linear, rgbaF_linear);
+
+#endif
+
+  return 0;
+}
+
diff --git a/extensions/sse-int16.c b/extensions/sse-int16.c
new file mode 100644
index 0000000..c8ba934
--- /dev/null
+++ b/extensions/sse-int16.c
@@ -0,0 +1,195 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2013 Massimo Valentini
+ * Copyright (C) 2013 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX)
+
+#include <xmmintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+/* Brace initialiser that repeats one value in all four vector lanes. */
+#define Q(a) { a, a, a, a }
+/* 1/65535 in every lane: multiplying a converted 16-bit sample by this
+ * yields sample / 65535. */
+static const __v4sf u16_float = Q (1.f / 65535);
+/* All-zero 128-bit integer vector; the conversions below interleave it with
+ * packed 16-bit samples to zero-extend them to 32-bit lanes. */
+static const __m128i zero = { 0 };
+
+
+/* Convert 16-bit unsigned RGBA pixels to float RGBA by scaling every
+ * component by 1/65535.
+ *
+ * src holds `samples` pixels of four uint16_t, dst receives `samples`
+ * pixels of four floats.  When both pointers are 16-byte aligned, pixels
+ * are converted two at a time (one 128-bit load of eight samples) with SSE;
+ * the remaining components are converted one by one.
+ *
+ * Returns samples.
+ */
+static long
+conv_rgba16_linear_rgbaF_linear (const uint16_t *src, float *dst, long samples)
+{
+  long pair = 0;   /* index of the current two-pixel group */
+  long k;
+
+  if ((((uintptr_t) src | (uintptr_t) dst) & 15) == 0)
+    {
+      const long     pairs = samples / 2;
+      const __m128i *in    = (const __m128i *) src;
+      __v4sf        *out   = (__v4sf *) dst;
+
+      while (pair < pairs)
+        {
+          /* Zero-extend the eight 16-bit samples to two vectors of four
+           * 32-bit integers: low half = first pixel, high half = second */
+          const __m128i first32  = _mm_unpacklo_epi16 (in[pair], zero);
+          const __m128i second32 = _mm_unpackhi_epi16 (in[pair], zero);
+
+          /* Integer -> float, then scale by 1/65535 */
+          out[2 * pair + 0] = _mm_mul_ps (_mm_cvtepi32_ps (first32),  u16_float);
+          out[2 * pair + 1] = _mm_mul_ps (_mm_cvtepi32_ps (second32), u16_float);
+
+          pair++;
+        }
+      _mm_empty ();
+    }
+
+  /* Each pair covered 2 pixels * 4 components; finish component-wise */
+  for (k = pair * 2 * 4; k != 4 * samples; k++)
+    dst[k] = src[k] * (1.f / 65535);
+
+  return samples;
+}
+
+/* Convert 16-bit unsigned straight-alpha RGBA pixels to premultiplied
+ * float RaGaBaA.
+ *
+ * Every component is first scaled by 1/65535, then R, G and B are
+ * multiplied by the scaled alpha; alpha itself is stored unchanged.
+ * When src and dst are both 16-byte aligned, pixels are converted two at a
+ * time with SSE; the rest (or everything, if unaligned) goes through the
+ * scalar loop at the end.
+ *
+ * Returns samples.
+ */
+static long
+conv_rgba16_linear_rgbAF_linear (const uint16_t *src, float *dst, long samples)
+{
+  long i = 0;
+  long remainder;
+
+  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+    {
+      long           n = (samples / 2) * 2;
+      const __m128i *s = (const __m128i*) src;
+            __v4sf  *d = (__v4sf*) dst;
+
+      /* max() against this keeps the broadcast alpha in the three colour
+       * lanes (alpha >= 0) and puts 1.0 in the alpha lane (alpha <= 1), so
+       * the multiply below scales R, G, B by alpha and A by 1. */
+      const __v4sf max_mask = { 0.0f, 0.0f, 0.0f, 1.0f };
+
+      for (; i < n / 2; i++)
+        {
+          /* Expand shorts to ints by loading zero in the high bits */
+          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], zero);
+          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], zero);
+
+          /* Convert to float */
+          const __m128 u0 = _mm_cvtepi32_ps (t0);
+          const __m128 u1 = _mm_cvtepi32_ps (t1);
+
+          /* Multiply by 1 / 65535, i.e. 1 / (2^16 - 1) */
+          __v4sf rgba0 = u0 * u16_float;
+          __v4sf rgba1 = u1 * u16_float;
+
+          /* Broadcast alpha (lane 3) to all four lanes.  The float shuffle
+           * keeps the value in the float domain, so no integer/float
+           * vector cast is needed. */
+          __v4sf aaaa0 = _mm_shuffle_ps (rgba0, rgba0, _MM_SHUFFLE (3, 3, 3, 3));
+          __v4sf aaaa1 = _mm_shuffle_ps (rgba1, rgba1, _MM_SHUFFLE (3, 3, 3, 3));
+
+          /* Set the value in the alpha slot to 1 */
+          aaaa0 = _mm_max_ps (aaaa0, max_mask);
+          aaaa1 = _mm_max_ps (aaaa1, max_mask);
+
+          /* Premultiply; the alpha lane is multiplied by 1 and so keeps
+           * its value */
+          rgba0 = rgba0 * aaaa0;
+          rgba1 = rgba1 * aaaa1;
+
+          d[2 * i + 0] = rgba0;
+          d[2 * i + 1] = rgba1;
+        }
+      _mm_empty ();
+    }
+
+  /* Each vector iteration consumed 2 pixels of 4 components */
+  dst += i * 2 * 4;
+  src += i * 2 * 4;
+  remainder = samples - (i * 2);
+  while (remainder--)
+    {
+      const float a      = src[3] / 65535.0f;
+      /* alpha / 65535: folds the u16 -> float scale into the premultiply */
+      const float a_term = a / 65535.0f;
+      dst[0] = src[0] * a_term;
+      dst[1] = src[1] * a_term;
+      dst[2] = src[2] * a_term;
+      dst[3] = a;
+
+      src += 4;
+      dst += 4;
+    }
+
+  return samples;
+}
+
+#endif
+
+/* Register a "linear" conversion between two formats.  The function name is
+ * built by token-pasting the two argument names (conv_<src>_<dst>), so the
+ * arguments must be variables named exactly like the halves of the
+ * conversion function's name. */
+#define o(src, dst) \
+ babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
+
+/* Prototype for the entry point defined just below. */
+int init (void);
+
+/* Entry point of this extension.
+ *
+ * When the file is built with the GCC/SSE/MMX guard enabled, this creates
+ * the float RGBA, float RaGaBaA and u16 RGBA formats and registers the
+ * u16 -> float conversions, provided the running CPU supports them.
+ * Otherwise it does nothing.  Always returns 0.
+ */
+int
+init (void)
+{
+#if defined(__GNUC__) && (__GNUC__ >= 4) && defined(USE_SSE) && defined(USE_MMX)
+
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbAF_linear = babl_format_new (
+    babl_model ("RaGaBaA"),
+    babl_type ("float"),
+    babl_component ("Ra"),
+    babl_component ("Ga"),
+    babl_component ("Ba"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgba16_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("u16"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+
+  /* The conversion routines in this file use 128-bit integer intrinsics
+   * (_mm_unpacklo_epi16, _mm_unpackhi_epi16, _mm_cvtepi32_ps), which are
+   * SSE2 instructions, so SSE2 must be present at run time as well as
+   * MMX and SSE.
+   * NOTE(review): the compile-time guard still tests only USE_SSE/USE_MMX
+   * and the build flags must enable SSE2 code generation -- confirm in
+   * configure/Makefile.am. */
+  if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_MMX) &&
+      (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) &&
+      (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
+    {
+      o (rgba16_linear, rgbaF_linear);
+      o (rgba16_linear, rgbAF_linear);
+    }
+
+#endif
+
+  return 0;
+}
+
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]