[babl/wip/rishi/cie-simd] SIMD-fy CIE
- From: Debarshi Ray <debarshir src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl/wip/rishi/cie-simd] SIMD-fy CIE
- Date: Sun, 29 Apr 2018 11:52:53 +0000 (UTC)
commit d298456a3526794bca67478fcc1c35d8d0f24690
Author: Debarshi Ray <debarshir gnome org>
Date: Sun Apr 29 01:15:46 2018 +0200
SIMD-fy CIE
extensions/Makefile.am | 2 +
extensions/meson.build | 1 +
extensions/sse3-float.c | 232 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 235 insertions(+), 0 deletions(-)
---
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index a2a2943..3a13627 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -38,6 +38,7 @@ ext_LTLIBRARIES = \
sse2-float.la \
sse2-int8.la \
sse2-int16.la \
+ sse3-float.la \
sse4-int8.la \
sse-half.la \
two-table.la \
@@ -61,6 +62,7 @@ HSV_la_SOURCES = HSV.c
sse2_float_la_SOURCES = sse2-float.c
sse2_int8_la_SOURCES = sse2-int8.c
sse2_int16_la_SOURCES = sse2-int16.c
+sse3_float_la_SOURCES = sse3-float.c
sse4_int8_la_SOURCES = sse4-int8.c
sse_half_la_SOURCES = sse-half.c
two_table_la_SOURCES = two-table.c two-table-tables.h
diff --git a/extensions/meson.build b/extensions/meson.build
index afc960d..12ab12c 100644
--- a/extensions/meson.build
+++ b/extensions/meson.build
@@ -23,6 +23,7 @@ extension_names = [
'sse2-float',
'sse2-int16',
'sse2-int8',
+ 'sse3-float',
'sse4-int8',
'two-table',
'ycbcr',
diff --git a/extensions/sse3-float.c b/extensions/sse3-float.c
new file mode 100644
index 0000000..38208d6
--- /dev/null
+++ b/extensions/sse3-float.c
@@ -0,0 +1,232 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE3)
+
+/* SSE 3 */
+#include <pmmintrin.h>
+
+#include <stdint.h>
+//#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+
+/* See extensions/CIE.c */
+#define LAB_EPSILON (216.0f / 24389.0f)
+#define LAB_KAPPA (24389.0f / 27.0f)
+#define D50_WHITE_REF_X 0.964202880f
+#define D50_WHITE_REF_Y 1.000000000f
+#define D50_WHITE_REF_Z 0.824905400f
+
+/* Approximates the cube root of each of the four floats in x.
+ *
+ * The initial guess is built with integer arithmetic on the raw float
+ * bits: ui/4 + ui/16, then two "add a shifted copy" steps, which together
+ * come to roughly ui/3.  Adding 0x2a5137a0 turns that into a float close
+ * to cbrt (x) (presumably by restoring the exponent bias).  Two
+ * Newton-Raphson steps, f = (2f + x / f^2) / 3, then refine the guess.
+ *
+ * Returns the refined approximation, one result per lane.
+ */
+static inline __m128
+_cbrtf_ps (__m128 x)
+{
+  const __m128 one_third = _mm_set1_ps (0.33333333f);
+  __m128i ui = _mm_castps_si128 (x);
+  __m128 f;
+  int step;
+
+  /* Immediate-count shifts are required here: _mm_srl_epi32 reads a single
+   * 64-bit count from the low half of its second operand, so a vector of
+   * per-lane counts would be seen as one huge count and zero every lane.
+   */
+  ui = _mm_add_epi32 (_mm_srli_epi32 (ui, 2), _mm_srli_epi32 (ui, 4));
+  ui = _mm_add_epi32 (ui, _mm_srli_epi32 (ui, 4));
+  ui = _mm_add_epi32 (ui, _mm_srli_epi32 (ui, 8));
+  ui = _mm_add_epi32 (_mm_set1_epi32 (0x2a5137a0), ui);
+
+  f = _mm_castsi128_ps (ui);
+
+  for (step = 0; step < 2; step++)
+    {
+      const __m128 two_f = _mm_add_ps (f, f);
+      const __m128 x_over_f2 = _mm_div_ps (x, _mm_mul_ps (f, f));
+
+      f = _mm_mul_ps (one_third, _mm_add_ps (two_f, x_over_f2));
+    }
+
+  return f;
+}
+
+/* Adds the four lanes of v together, broadcasts the total to every lane
+ * and returns it ANDed with mask, so the caller picks which lanes keep
+ * the sum (all other lanes come back as zero bits).
+ */
+static inline __m128
+hsum_ps_sse3 (__m128 v, __m128 mask)
+{
+  /* (v1, v1, v3, v3): odd lanes duplicated into their even neighbours. */
+  const __m128 odd_dup = _mm_movehdup_ps (v);
+  /* Lane 0 holds v0 + v1, lane 2 holds v2 + v3. */
+  const __m128 pair_sums = _mm_add_ps (v, odd_dup);
+  /* Bring v2 + v3 down to lane 0 so it can be added to v0 + v1. */
+  const __m128 upper_pair = _mm_movehl_ps (odd_dup, pair_sums);
+  const __m128 total = _mm_add_ss (pair_sums, upper_pair);
+  const __m128 splat = _mm_shuffle_ps (total, total, _MM_SHUFFLE (0, 0, 0, 0));
+
+  return _mm_and_ps (splat, mask);
+}
+
+/* Evaluates the CIE Lab companding function on every lane of r:
+ *
+ *   f (r) = cbrt (r)                      when r > LAB_EPSILON
+ *   f (r) = (LAB_KAPPA * r + 16) / 116    otherwise
+ *
+ * Both branches are computed for all four lanes and blended with a
+ * comparison mask, so there is no per-lane branching.  The cube root is
+ * an approximation (bit-level seed plus one refinement step).
+ */
+static inline __m128
+lab_r_to_f_sse2 (__m128 r)
+{
+  const __m128 epsilon = _mm_set1_ps (LAB_EPSILON);
+  const __m128 kappa = _mm_set1_ps (LAB_KAPPA);
+
+  /* Cube-root seed: a third of the raw float bits plus a fixed offset. */
+  const __m128 bits = _mm_cvtepi32_ps (_mm_castps_si128 (r));
+  const __m128i third = _mm_cvtps_epi32 (_mm_div_ps (bits, _mm_set1_ps (3.0f)));
+  const __m128 a = _mm_castsi128_ps (_mm_add_epi32 (third, _mm_set1_epi32 (709921077)));
+
+  /* One Halley step towards cbrt (r): a * (a^3 + 2r) / (2a^3 + r). */
+  const __m128 a3 = _mm_mul_ps (_mm_mul_ps (a, a), a);
+  const __m128 f_big = _mm_div_ps (_mm_mul_ps (a, _mm_add_ps (a3, _mm_add_ps (r, r))),
+                                   _mm_add_ps (_mm_add_ps (a3, a3), r));
+
+  /* Linear segment used at and below epsilon. */
+  const __m128 f_small = _mm_div_ps (_mm_add_ps (_mm_mul_ps (kappa, r), _mm_set1_ps (16.0f)),
+                                     _mm_set1_ps (116.0f));
+
+  /* All-ones in lanes where r > epsilon, all-zeros elsewhere. */
+  const __m128 mask = _mm_cmpgt_ps (r, epsilon);
+
+  return _mm_or_ps (_mm_and_ps (mask, f_big), _mm_andnot_ps (mask, f_small));
+}
+
+static void
+conv_LabaF_rgbaF_linear (const Babl *conversion, const float *src, float *dst, long samples)
+{ /* Stub: the body is empty, so src is never read and dst is never written; its registration in init () is commented out. */
+}
+
+/* Converts linear RGBA float pixels to CIE Lab + alpha float pixels.
+ *
+ * conversion: used only to look up the source space, whose RGBtoXYZf
+ *             matrix supplies the RGB -> XYZ coefficients.
+ * src:        samples pixels of four floats each (R, G, B, A).
+ * dst:        samples pixels of four floats each (L, a, b, A); alpha is
+ *             copied through unchanged.
+ * samples:    number of pixels to convert.
+ *
+ * Each matrix row is pre-divided by the matching D50 white reference
+ * component, so one multiply plus one horizontal sum per row yields
+ * X/Xn, Y/Yn and Z/Zn directly.  Loads and stores are unaligned, so no
+ * alignment is required of src or dst and no scalar tail loop is needed.
+ *
+ * NOTE(review): space->space.RGBtoXYZf needs the complete Babl type;
+ * confirm that the headers included by this file provide it.
+ */
+static void
+conv_rgbaF_linear_LabaF (const Babl *conversion, const float *src, float *dst, long samples)
+{
+  const Babl *space = babl_conversion_get_source_space (conversion);
+
+  /* Matrix rows scaled by 1/Xn, 1/Yn, 1/Zn; lane 3 is zero so that alpha
+   * never contributes to the horizontal sums.
+   */
+  const __m128 m_0 = _mm_mul_ps (_mm_set_ps (0.0f,
+                                             space->space.RGBtoXYZf[2],
+                                             space->space.RGBtoXYZf[1],
+                                             space->space.RGBtoXYZf[0]),
+                                 _mm_set1_ps (1.0f / D50_WHITE_REF_X));
+  const __m128 m_1 = _mm_mul_ps (_mm_set_ps (0.0f,
+                                             space->space.RGBtoXYZf[5],
+                                             space->space.RGBtoXYZf[4],
+                                             space->space.RGBtoXYZf[3]),
+                                 _mm_set1_ps (1.0f / D50_WHITE_REF_Y));
+  const __m128 m_2 = _mm_mul_ps (_mm_set_ps (0.0f,
+                                             space->space.RGBtoXYZf[8],
+                                             space->space.RGBtoXYZf[7],
+                                             space->space.RGBtoXYZf[6]),
+                                 _mm_set1_ps (1.0f / D50_WHITE_REF_Z));
+
+  /* Single-lane selectors (lane 0 is the first float of a pixel). */
+  const __m128 lane_0 = _mm_castsi128_ps (_mm_set_epi32 (0, 0, 0, -1));
+  const __m128 lane_1 = _mm_castsi128_ps (_mm_set_epi32 (0, 0, -1, 0));
+  const __m128 lane_2 = _mm_castsi128_ps (_mm_set_epi32 (0, -1, 0, 0));
+  const __m128 lane_3 = _mm_castsi128_ps (_mm_set_epi32 (-1, 0, 0, 0));
+  const __m128 lanes_1_2 = _mm_or_ps (lane_1, lane_2);
+
+  /* L = 116 * fy - 16,  a = 500 * (fx - fy),  b = 200 * (fy - fz) */
+  const __m128 lab_scale = _mm_set_ps (0.0f, 200.0f, 500.0f, 116.0f);
+  const __m128 lab_offset = _mm_set_ps (0.0f, 0.0f, 0.0f, 16.0f);
+  long i;
+
+  for (i = 0; i < samples; i++)
+    {
+      const __m128 rgba = _mm_loadu_ps (src);
+      __m128 r;
+      __m128 f;
+      __m128 minuend;
+      __m128 subtrahend;
+      __m128 lab;
+
+      /* r = (X/Xn, Y/Yn, Z/Zn, 0) */
+      r = hsum_ps_sse3 (_mm_mul_ps (rgba, m_0), lane_0);
+      r = _mm_or_ps (r, hsum_ps_sse3 (_mm_mul_ps (rgba, m_1), lane_1));
+      r = _mm_or_ps (r, hsum_ps_sse3 (_mm_mul_ps (rgba, m_2), lane_2));
+
+      /* f = (fx, fy, fz, unused) */
+      f = lab_r_to_f_sse2 (r);
+
+      /* (fy, fx, fy, .) - (0, fy, fz, 0), scaled and offset per lane. */
+      minuend = _mm_shuffle_ps (f, f, _MM_SHUFFLE (3, 1, 0, 1));
+      subtrahend = _mm_and_ps (f, lanes_1_2);
+      lab = _mm_sub_ps (_mm_mul_ps (lab_scale, _mm_sub_ps (minuend, subtrahend)),
+                        lab_offset);
+
+      /* Replace lane 3 with the untouched source alpha. */
+      lab = _mm_or_ps (_mm_andnot_ps (lane_3, lab), _mm_and_ps (lane_3, rgba));
+
+      _mm_storeu_ps (dst, lab);
+
+      src += 4;
+      dst += 4;
+    }
+}
+
+#endif /* defined(USE_SSE3) */
+
+#define o(src, dst) \
+ babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL)
+/* o (src, dst) pastes its arguments into the name conv_<src>_<dst>, so the format variables in init () must be named after the converters. */
+int init (void);
+/* init: creates the two float formats when USE_SSE3 is defined; always returns 0. */
+int
+init (void)
+{
+#if defined(USE_SSE3)
+ /* The two formats the converters in this file are named after: "CIE Lab alpha" and "RGBA", both with float components. */
+ const Babl *LabaF = babl_format_new (
+ babl_model ("CIE Lab alpha"),
+ babl_type ("float"),
+ babl_component ("CIE L"),
+ babl_component ("CIE a"),
+ babl_component ("CIE b"),
+ babl_component ("A"),
+ NULL);
+ const Babl *rgbaF_linear = babl_format_new (
+ babl_model ("RGBA"),
+ babl_type ("float"),
+ babl_component ("R"),
+ babl_component ("G"),
+ babl_component ("B"),
+ babl_component ("A"),
+ NULL);
+ /* Run-time gate: the branch is taken only when the CPU acceleration flags include SSE3. */
+ if (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE3)
+ {
+ /* o (LabaF, rgbaF_linear); */
+ /* o (rgbaF_linear, LabaF); */
+ }
+ /* Both registrations above are commented out, so no conversion is registered and LabaF / rgbaF_linear are otherwise unused. */
+#endif /* defined(USE_SSE3) */
+
+ return 0;
+}
+
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]