[babl/wip/msvc: 17/20] build: Enable MMX/SSE/SSE2 for Visual Studio builds
- From: Chun-wei Fan <fanchunwei src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl/wip/msvc: 17/20] build: Enable MMX/SSE/SSE2 for Visual Studio builds
- Date: Wed, 22 Jan 2020 08:10:59 +0000 (UTC)
commit 4533dde993be70bcaeecafb2b254dbd0482f88c7
Author: Chun-wei Fan <fanchunwei src gnome org>
Date: Tue Jan 21 17:40:17 2020 +0800
build: Enable MMX/SSE/SSE2 for Visual Studio builds
Enable these unconditionally on x86 and x86_64, since the Visual Studio versions
we currently support have supported these intrinsics for a very long time. We do,
however, need to use the portable intrinsics syntax for the code to build with
Visual Studio.
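The general pattern is illustrated by the minimal sketch below (premul4 is a
hypothetical helper, not part of this patch): GCC and Clang allow arithmetic
operators directly on the __v4sf vector type, while MSVC only accepts the
intrinsic calls, which all of these compilers understand.
#include <emmintrin.h>
/* GCC/Clang only: vector-extension operators on __v4sf
 *   __v4sf premul4 (__v4sf rgba, __v4sf alpha) { return rgba * alpha; }
 */
/* Portable: the same operation spelled with SSE intrinsics (illustrative sketch) */
static __m128
premul4 (__m128 rgba, __m128 alpha)
{
  return _mm_mul_ps (rgba, alpha);
}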
babl/babl-space.c | 23 +++--
extensions/sse2-float.c | 259 +++++++++++++++++++++++++++---------------------
extensions/sse2-int16.c | 42 ++++----
extensions/sse2-int8.c | 34 +++----
meson.build | 15 +++
5 files changed, 217 insertions(+), 156 deletions(-)
---
diff --git a/babl/babl-space.c b/babl/babl-space.c
index 586ebcea4..34892b8db 100644
--- a/babl/babl-space.c
+++ b/babl/babl-space.c
@@ -957,21 +957,28 @@ static inline void babl_matrix_mul_vectorff_buf4_sse2 (const float *mat,
float *v_out,
int samples)
{
- const __v4sf m___0 = {m(mat, 0, 0), m(mat, 1, 0), m(mat, 2, 0), 0};
- const __v4sf m___1 = {m(mat, 0, 1), m(mat, 1, 1), m(mat, 2, 1), 0};
- const __v4sf m___2 = {m(mat, 0, 2), m(mat, 1, 2), m(mat, 2, 2), 1};
+ const __m128 m___0 = {m(mat, 0, 0), m(mat, 1, 0), m(mat, 2, 0), 0};
+ const __m128 m___1 = {m(mat, 0, 1), m(mat, 1, 1), m(mat, 2, 1), 0};
+ const __m128 m___2 = {m(mat, 0, 2), m(mat, 1, 2), m(mat, 2, 2), 1};
int i;
for (i = 0; i < samples; i ++)
{
- __v4sf a, b, c = _mm_load_ps(&v_in[0]);
- a = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(0,0,0,0));
- b = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(1,1,1,1));
- c = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(3,2,2,2));
- _mm_store_ps (v_out, m___0 * a + m___1 * b + m___2 * c);
+ __m128 a, b, c = _mm_load_ps(&v_in[0]);
+ __m128 a_m0, b_m1, c_m2;
+ a = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(c), _MM_SHUFFLE(0,0,0,0)));
+ b = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(c), _MM_SHUFFLE(1,1,1,1)));
+ c = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(c), _MM_SHUFFLE(3,2,2,2)));
+ a_m0 = _mm_mul_ps(m___0, a);
+ b_m1 = _mm_mul_ps(m___1, b);
+ c_m2 = _mm_mul_ps(m___2, c);
+ _mm_store_ps (v_out, _mm_add_ps(_mm_add_ps (a_m0, b_m1), c_m2));
v_out += 4;
v_in += 4;
}
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined (_M_AMD64))
_mm_empty ();
+#endif
}
#undef m
diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c
index 3757ffe09..3e4fad298 100644
--- a/extensions/sse2-float.c
+++ b/extensions/sse2-float.c
@@ -49,8 +49,8 @@ conv_rgbaF_linear_rgbAF_linear (const Babl *conversion,
if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
{
const long n = (samples / 2) * 2;
- const __v4sf *s = (const __v4sf*) src;
- __v4sf *d = (__v4sf*)dst;
+ const __m128 *s = (const __m128*) src;
+ __m128 *d = (__m128*)dst;
for ( ; i < n; i += 2)
{
@@ -60,22 +60,22 @@ conv_rgbaF_linear_rgbAF_linear (const Babl *conversion,
float used_alpha1 = babl_epsilon_for_zero_float (alpha1);
{
- __v4sf rbaa0, rbaa1;
+ __m128 rbaa0, rbaa1;
- __v4sf rgba0 = *s++;
- __v4sf rgba1 = *s++;
+ __m128 rgba0 = *s++;
+ __m128 rgba1 = *s++;
/* Expand alpha */
- __v4sf aaaa0 = (__v4sf)_mm_set1_ps(used_alpha0);
- __v4sf aaaa1 = (__v4sf)_mm_set1_ps(used_alpha1);
+ __m128 aaaa0 = _mm_set1_ps(used_alpha0);
+ __m128 aaaa1 = _mm_set1_ps(used_alpha1);
/* Premultiply */
- rgba0 = rgba0 * aaaa0;
- rgba1 = rgba1 * aaaa1;
+ rgba0 = _mm_mul_ps(rgba0, aaaa0);
+ rgba1 = _mm_mul_ps(rgba1, aaaa1);
- aaaa0 = (__v4sf)_mm_set1_ps(alpha0);
- aaaa1 = (__v4sf)_mm_set1_ps(alpha1);
+ aaaa0 = _mm_set1_ps(alpha0);
+ aaaa1 = _mm_set1_ps(alpha1);
/* Shuffle the original alpha value back in */
rbaa0 = _mm_shuffle_ps(rgba0, aaaa0, _MM_SHUFFLE(0, 0, 2, 0));
@@ -88,7 +88,10 @@ conv_rgbaF_linear_rgbAF_linear (const Babl *conversion,
*d++ = rgba1;
}
}
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined (_M_AMD64))
_mm_empty ();
+#endif
}
dst += i * 4;
@@ -120,12 +123,12 @@ conv_rgbAF_linear_rgbaF_linear_shuffle (const Babl *conversion,
if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
{
const long n = samples;
- const __v4sf *s = (const __v4sf*) src;
- __v4sf *d = (__v4sf*)dst;
+ const __m128 *s = (const __m128*) src;
+ __m128 *d = (__m128*)dst;
for ( ; i < n; i += 1)
{
- __v4sf pre_rgba0, rgba0, rbaa0, raaaa0;
+ __m128 pre_rgba0, rgba0, rbaa0, raaaa0;
float alpha0 = ((float *)s)[3];
float used_alpha0 = babl_epsilon_for_zero_float (alpha0);
@@ -138,7 +141,7 @@ conv_rgbAF_linear_rgbaF_linear_shuffle (const Babl *conversion,
raaaa0 = _mm_load1_ps(&recip0);
/* Un-Premultiply */
- rgba0 = pre_rgba0 * raaaa0;
+ rgba0 = _mm_mul_ps(pre_rgba0, raaaa0);
}
/* Shuffle the original alpha value back in */
@@ -148,7 +151,10 @@ conv_rgbAF_linear_rgbaF_linear_shuffle (const Babl *conversion,
s++;
*d++ = rgba0;
}
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined (_M_AMD64))
_mm_empty ();
+#endif
}
dst += i * 4;
@@ -184,19 +190,19 @@ conv_rgbAF_linear_rgbaF_linear_spin (const Babl *conversion,
if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
{
const long n = samples;
- const __v4sf *s = (const __v4sf*) src;
- __v4sf *d = (__v4sf*)dst;
- const __v4sf zero = _mm_set_ss (BABL_ALPHA_FLOOR_FLOAT);
- const __v4sf one = _mm_set_ss(1.0f);
+ const __m128 *s = (const __m128*) src;
+ __m128 *d = (__m128*)dst;
+ const __m128 zero = _mm_set_ss (BABL_ALPHA_FLOOR_FLOAT);
+ const __m128 one = _mm_set_ss(1.0f);
for ( ; i < n; i += 1)
{
- __v4sf pre_abgr0, abgr0, rgba0, raaaa0;
+ __m128 pre_abgr0, abgr0, rgba0, raaaa0;
rgba0 = *s;
/* Rotate to ABGR */
- pre_abgr0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(0, 1, 2, 3));
+ pre_abgr0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(rgba0),_MM_SHUFFLE(0, 1, 2, 3)));
if (_mm_ucomile_ss(pre_abgr0, zero))
{
@@ -209,22 +215,25 @@ conv_rgbAF_linear_rgbaF_linear_spin (const Babl *conversion,
raaaa0 = _mm_div_ss(one, pre_abgr0);
/* Expand reciprocal */
- raaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)raaaa0, _MM_SHUFFLE(0, 0, 0, 0));
+ raaaa0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(raaaa0), _MM_SHUFFLE(0, 0, 0, 0)));
/* Un-Premultiply */
- abgr0 = pre_abgr0 * raaaa0;
+ abgr0 = _mm_mul_ps(pre_abgr0, raaaa0);
}
/* Move the original alpha value back in */
abgr0 = _mm_move_ss(abgr0, pre_abgr0);
/* Rotate to ABGR */
- rgba0 = (__v4sf)_mm_shuffle_epi32((__m128i)abgr0, _MM_SHUFFLE(0, 1, 2, 3));
+ rgba0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(abgr0), _MM_SHUFFLE(0, 1, 2, 3)));
*d++ = rgba0;
s++;
}
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined (_M_AMD64))
_mm_empty ();
+#endif
}
dst += i * 4;
@@ -248,93 +257,109 @@ conv_rgbAF_linear_rgbaF_linear_spin (const Babl *conversion,
}
}
-#define splat4f(x) ((__v4sf){x,x,x,x})
-#define splat4i(x) ((__v4si){x,x,x,x})
+#define splat4f(x) _mm_set1_ps(x)
+#define splat4i(x) _mm_set1_epi32(x)
#define FLT_ONE 0x3f800000 // ((union {float f; int i;}){1.0f}).i
#define FLT_MANTISSA (1<<23)
+typedef union {
+ __m128 v; // SSE 4 x float vector
+ float a[4]; // scalar array of 4 floats
+} __m128_u;
+
static inline float
-sse_max_component (__v4sf x) {
- __v4sf s;
- __v4sf m;
+sse_max_component (__m128 x) {
+
+ __m128 s;
+ __m128_u m;
/* m = [max (x[3], x[1]), max (x[2], x[0])] */
- s = (__v4sf) _mm_shuffle_epi32 ((__m128i) x, _MM_SHUFFLE(0, 0, 3, 2));
- m = _mm_max_ps (x, s);
+ s = _mm_castsi128_ps( _mm_shuffle_epi32 (_mm_castps_si128(x), _MM_SHUFFLE(0, 0, 3, 2)));
+ m.v = _mm_max_ps (x, s);
/* m = [max (m[1], m[0])] = [max (max (x[3], x[1]), max (x[2], x[0]))] */
- s = (__v4sf) _mm_shuffle_epi32 ((__m128i) m, _MM_SHUFFLE(0, 0, 0, 1));
- m = _mm_max_ps (m, s);
+ s = _mm_castsi128_ps( _mm_shuffle_epi32 (_mm_castps_si128(m.v), _MM_SHUFFLE(0, 0, 0, 1)));
+ m.v = _mm_max_ps (m.v, s);
- return m[0];
+ return m.a[0];
}
-static inline __v4sf
-sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
+static inline __m128
+sse_init_newton (__m128 x, double exponent, double c0, double c1, double c2)
{
double norm = exponent*M_LN2/FLT_MANTISSA;
- __v4sf y = _mm_cvtepi32_ps((__m128i)((__v4si)x - splat4i(FLT_ONE)));
- return splat4f(c0) + splat4f(c1*norm)*y + splat4f(c2*norm*norm)*y*y;
+ __m128 y = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_castps_si128(x), splat4i(FLT_ONE)));
+ return _mm_add_ps(_mm_add_ps(splat4f(c0), _mm_mul_ps(splat4f(c1*norm), y)),
+ _mm_mul_ps(_mm_mul_ps(splat4f(c2*norm*norm), y), y));
}
-static inline __v4sf
-sse_pow_1_24 (__v4sf x)
+#define _mm_square_ps(x) _mm_mul_ps(x, x)
+#define _mm_cube_ps(x) _mm_mul_ps(_mm_mul_ps(x, x), x)
+#define _mm_6pow_ps(x) _mm_mul_ps(_mm_cube_ps(x), _mm_cube_ps(x))
+#define _mm_7pow_ps(x) _mm_mul_ps(_mm_mul_ps(_mm_square_ps(x), _mm_square_ps(x)), _mm_cube_ps(x))
+
+static inline __m128
+sse_pow_1_24 (__m128 x)
{
- __v4sf y, z;
- if (sse_max_component (x) > 1024.0f) {
+ __m128 y, z;
+ __m128_u u;
+ u.v = x;
+ if (sse_max_component (u.v) > 1024.0f) {
/* for large values, fall back to a slower but more accurate version */
- return _mm_set_ps (expf (logf (x[3]) * (1.0f / 2.4f)),
- expf (logf (x[2]) * (1.0f / 2.4f)),
- expf (logf (x[1]) * (1.0f / 2.4f)),
- expf (logf (x[0]) * (1.0f / 2.4f)));
+ return _mm_set_ps (expf (logf (u.a[3]) * (1.0f / 2.4f)),
+ expf (logf (u.a[2]) * (1.0f / 2.4f)),
+ expf (logf (u.a[1]) * (1.0f / 2.4f)),
+ expf (logf (u.a[0]) * (1.0f / 2.4f)));
}
- y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
- x = _mm_sqrt_ps (x);
+ y = sse_init_newton (u.v, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+ u.v = _mm_sqrt_ps (u.v);
/* newton's method for x^(-1/6) */
- z = splat4f (1.f/6.f) * x;
- y = splat4f (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
- y = splat4f (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
- return x*y;
+ z = _mm_mul_ps(splat4f (1.f/6.f), u.v);
+ y = _mm_sub_ps(_mm_mul_ps(splat4f (7.f/6.f), y), _mm_mul_ps(z, _mm_7pow_ps(y)));
+ y = _mm_sub_ps(_mm_mul_ps(splat4f (7.f/6.f), y), _mm_mul_ps(z, _mm_7pow_ps(y)));
+ return _mm_mul_ps(u.v, y);
}
-static inline __v4sf
-sse_pow_24 (__v4sf x)
+static inline __m128
+sse_pow_24 (__m128 x)
{
- __v4sf y, z;
- if (sse_max_component (x) > 16.0f) {
+ __m128 y, z;
+ __m128_u u;
+ u.v = x;
+ if (sse_max_component (u.v) > 16.0f) {
/* for large values, fall back to a slower but more accurate version */
- return _mm_set_ps (expf (logf (x[3]) * 2.4f),
- expf (logf (x[2]) * 2.4f),
- expf (logf (x[1]) * 2.4f),
- expf (logf (x[0]) * 2.4f));
+ return _mm_set_ps (expf (logf (u.a[3]) * 2.4f),
+ expf (logf (u.a[2]) * 2.4f),
+ expf (logf (u.a[1]) * 2.4f),
+ expf (logf (u.a[0]) * 2.4f));
}
- y = sse_init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+ y = sse_init_newton (u.v, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
/* newton's method for x^(-1/5) */
- z = splat4f (1.f/5.f) * x;
- y = splat4f (6.f/5.f) * y - z * ((y*y*y)*(y*y*y));
- y = splat4f (6.f/5.f) * y - z * ((y*y*y)*(y*y*y));
- x *= y;
- return x*x*x;
+ z = _mm_mul_ps(splat4f (1.f/5.f), x);
+ y = _mm_sub_ps(_mm_mul_ps(splat4f (6.f/5.f), y), _mm_mul_ps(z, _mm_6pow_ps(y)));
+ y = _mm_sub_ps(_mm_mul_ps(splat4f (6.f/5.f), y), _mm_mul_ps(z, _mm_6pow_ps(y)));
+ x = _mm_mul_ps(x, y);
+ return _mm_cube_ps(x);
}
-static inline __v4sf
-linear_to_gamma_2_2_sse2 (__v4sf x)
+static inline __m128
+linear_to_gamma_2_2_sse2 (__m128 x)
{
- __v4sf curve = sse_pow_1_24 (x) * splat4f (1.055f) -
- splat4f (0.055f -
- 3.0f / (float) (1 << 24));
- /* ^ offset the result such that 1 maps to 1 */
- __v4sf line = x * splat4f (12.92f);
- __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f));
+ __m128 curve = _mm_sub_ps(_mm_mul_ps(sse_pow_1_24 (x), splat4f (1.055f)),
+ splat4f (0.055f -
+ 3.0f / (float) (1 << 24)));
+ /* ^ offset the result such that 1 maps to 1 */
+ __m128 line = _mm_mul_ps(x, splat4f (12.92f));
+ __m128 mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f));
return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
}
-static inline __v4sf
-gamma_2_2_to_linear_sse2 (__v4sf x)
+static inline __m128
+gamma_2_2_to_linear_sse2 (__m128 x)
{
- __v4sf curve = sse_pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f));
- __v4sf line = x * splat4f (1/12.92f);
- __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.04045f));
+ __m128 curve = sse_pow_24 (_mm_mul_ps(_mm_add_ps(x, splat4f (0.055f)), splat4f (1/1.055f)));
+ __m128 line = _mm_mul_ps(x, splat4f (1/12.92f));
+ __m128 mask = _mm_cmpgt_ps (x, splat4f (0.04045f));
return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
}
@@ -348,16 +373,16 @@ func (const Babl *conversion,const float *src, float *dst, long samples)\
for (; i > 3; i -= 4, src += 16, dst += 16)\
{\
/* Pack the rgb components from 4 pixels into 3 vectors, gammafy, unpack. */\
- __v4sf x0 = _mm_load_ps (src);\
- __v4sf x1 = _mm_load_ps (src+4);\
- __v4sf x2 = _mm_load_ps (src+8);\
- __v4sf x3 = _mm_load_ps (src+12);\
- __v4sf y0 = _mm_movelh_ps (x0, x1);\
- __v4sf y1 = _mm_movelh_ps (x2, x3);\
- __v4sf z0 = _mm_unpackhi_ps (x0, x1);\
- __v4sf z1 = _mm_unpackhi_ps (x2, x3);\
- __v4sf y2 = _mm_movelh_ps (z0, z1);\
- __v4sf y3 = _mm_movehl_ps (z1, z0);\
+ __m128 x0 = _mm_load_ps (src);\
+ __m128 x1 = _mm_load_ps (src+4);\
+ __m128 x2 = _mm_load_ps (src+8);\
+ __m128 x3 = _mm_load_ps (src+12);\
+ __m128 y0 = _mm_movelh_ps (x0, x1);\
+ __m128 y1 = _mm_movelh_ps (x2, x3);\
+ __m128 z0 = _mm_unpackhi_ps (x0, x1);\
+ __m128 z1 = _mm_unpackhi_ps (x2, x3);\
+ __m128 y2 = _mm_movelh_ps (z0, z1);\
+ __m128 y3 = _mm_movehl_ps (z1, z0);\
y0 = munge (y0);\
_mm_storel_pi ((__m64*)(dst), y0);\
_mm_storeh_pi ((__m64*)(dst+4), y0);\
@@ -374,7 +399,7 @@ func (const Babl *conversion,const float *src, float *dst, long samples)\
}\
for (; i > 0; i--, src += 4, dst += 4)\
{\
- __v4sf x = munge (_mm_load_ps (src));\
+ __m128 x = munge (_mm_load_ps (src));\
float a = src[3];\
_mm_store_ps (dst, x);\
dst[3] = a;\
@@ -384,7 +409,7 @@ func (const Babl *conversion,const float *src, float *dst, long samples)\
{\
for (; i > 0; i--, src += 4, dst += 4)\
{\
- __v4sf x = munge (_mm_loadu_ps (src));\
+ __m128 x = munge (_mm_loadu_ps (src));\
float a = src[3];\
_mm_storeu_ps (dst, x);\
dst[3] = a;\
@@ -407,17 +432,17 @@ static void conv_rgbaF_linear_rgbAF_gamma (const Babl *conversion,
#define YA_APPLY(load, store, convert) \
{ \
- __v4sf yyaa0, yyaa1; \
- __v4sf yaya0 = load ((float *)s++); \
- __v4sf yaya1 = load ((float *)s++); \
- __v4sf yyyy01 = _mm_shuffle_ps (yaya0, yaya1, _MM_SHUFFLE(0, 2, 0, 2)); \
+ __m128 yyaa0, yyaa1; \
+ __m128 yaya0 = load ((float *)s++); \
+ __m128 yaya1 = load ((float *)s++); \
+ __m128 yyyy01 = _mm_shuffle_ps (yaya0, yaya1, _MM_SHUFFLE(0, 2, 0, 2)); \
\
yyyy01 = convert (yyyy01); \
\
yyaa0 = _mm_shuffle_ps (yyyy01, yaya0, _MM_SHUFFLE(3, 1, 0, 1)); \
- yaya0 = (__v4sf)_mm_shuffle_epi32((__m128i)yyaa0, _MM_SHUFFLE(3, 1, 2, 0)); \
+ yaya0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(yyaa0), _MM_SHUFFLE(3, 1, 2, 0))); \
yyaa1 = _mm_shuffle_ps (yyyy01, yaya1, _MM_SHUFFLE(3, 1, 2, 3)); \
- yaya1 = (__v4sf)_mm_shuffle_epi32((__m128i)yyaa1, _MM_SHUFFLE(3, 1, 2, 0)); \
+ yaya1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(yyaa1), _MM_SHUFFLE(3, 1, 2, 0))); \
\
store ((float *)d++, yaya0); \
store ((float *)d++, yaya1); \
@@ -429,8 +454,8 @@ conv_yaF_linear_yaF_gamma (const Babl *conversion,
float *dst,
long samples)
{
- const __v4sf *s = (const __v4sf*)src;
- __v4sf *d = (__v4sf*)dst;
+ const __m128 *s = (const __m128*)src;
+ __m128 *d = (__m128*)dst;
if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
{
@@ -455,7 +480,9 @@ conv_yaF_linear_yaF_gamma (const Babl *conversion,
while (samples--)
{
float y = *src++;
- *dst++ = linear_to_gamma_2_2_sse2 (splat4f (y))[0];
+ __m128_u u;
+ u.v = linear_to_gamma_2_2_sse2 (splat4f (y));
+ *dst++ = u.a[0];
*dst++ = *src++;
}
}
@@ -467,8 +494,8 @@ conv_yaF_gamma_yaF_linear (const Babl *conversion,
float *dst,
long samples)
{
- const __v4sf *s = (const __v4sf*)src;
- __v4sf *d = (__v4sf*)dst;
+ const __m128 *s = (const __m128*)src;
+ __m128 *d = (__m128*)dst;
if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
{
@@ -493,7 +520,9 @@ conv_yaF_gamma_yaF_linear (const Babl *conversion,
while (samples--)
{
float y = *src++;
- *dst++ = gamma_2_2_to_linear_sse2 (splat4f (y))[0];
+ __m128_u u;
+ u.v = gamma_2_2_to_linear_sse2 (splat4f (y));
+ *dst++ = u.a[0];
*dst++ = *src++;
}
}
@@ -504,14 +533,14 @@ conv_yF_linear_yF_gamma (const Babl *conversion,
float *dst,
long samples)
{
- const __v4sf *s = (const __v4sf*)src;
- __v4sf *d = (__v4sf*)dst;
+ const __m128 *s = (const __m128*)src;
+ __m128 *d = (__m128*)dst;
if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
{
while (samples > 4)
{
- __v4sf rgba0 = _mm_load_ps ((float *)s++);
+ __m128 rgba0 = _mm_load_ps ((float *)s++);
rgba0 = linear_to_gamma_2_2_sse2 (rgba0);
_mm_store_ps ((float *)d++, rgba0);
samples -= 4;
@@ -521,7 +550,7 @@ conv_yF_linear_yF_gamma (const Babl *conversion,
{
while (samples > 4)
{
- __v4sf rgba0 = _mm_loadu_ps ((float *)s++);
+ __m128 rgba0 = _mm_loadu_ps ((float *)s++);
rgba0 = linear_to_gamma_2_2_sse2 (rgba0);
_mm_storeu_ps ((float *)d++, rgba0);
samples -= 4;
@@ -534,7 +563,9 @@ conv_yF_linear_yF_gamma (const Babl *conversion,
while (samples--)
{
float y = *src++;
- *dst++ = linear_to_gamma_2_2_sse2 (splat4f (y))[0];
+ __m128_u u;
+ u.v = linear_to_gamma_2_2_sse2 (splat4f (y));
+ *dst++ = u.a[0];
}
}
@@ -544,14 +575,14 @@ conv_yF_gamma_yF_linear (const Babl *conversion,
float *dst,
long samples)
{
- const __v4sf *s = (const __v4sf*)src;
- __v4sf *d = (__v4sf*)dst;
+ const __m128 *s = (const __m128*)src;
+ __m128 *d = (__m128*)dst;
if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
{
while (samples > 4)
{
- __v4sf rgba0 = _mm_load_ps ((float *)s++);
+ __m128 rgba0 = _mm_load_ps ((float *)s++);
rgba0 = gamma_2_2_to_linear_sse2 (rgba0);
_mm_store_ps ((float *)d++, rgba0);
samples -= 4;
@@ -561,7 +592,7 @@ conv_yF_gamma_yF_linear (const Babl *conversion,
{
while (samples > 4)
{
- __v4sf rgba0 = _mm_loadu_ps ((float *)s++);
+ __m128 rgba0 = _mm_loadu_ps ((float *)s++);
rgba0 = gamma_2_2_to_linear_sse2 (rgba0);
_mm_storeu_ps ((float *)d++, rgba0);
samples -= 4;
@@ -574,7 +605,9 @@ conv_yF_gamma_yF_linear (const Babl *conversion,
while (samples--)
{
float y = *src++;
- *dst++ = gamma_2_2_to_linear_sse2 (splat4f (y))[0];
+ __m128_u u;
+ u.v = gamma_2_2_to_linear_sse2 (splat4f (y));
+ *dst++ = u.a[0];
}
}
diff --git a/extensions/sse2-int16.c b/extensions/sse2-int16.c
index 52ca01411..f1de8f6d9 100644
--- a/extensions/sse2-int16.c
+++ b/extensions/sse2-int16.c
@@ -32,7 +32,7 @@
#include "extensions/util.h"
#define Q(a) { a, a, a, a }
-static const __v4sf u16_float = Q (1.f / 65535);
+static const __m128 u16_float = Q (1.f / 65535);
static void
conv_rgba16_rgbaF (const Babl *conversion,
@@ -46,25 +46,28 @@ conv_rgba16_rgbaF (const Babl *conversion,
{
long n = (samples / 2) * 2;
const __m128i *s = (const __m128i*) src;
- __v4sf *d = (__v4sf*) dst;
+ __m128 *d = (__m128*) dst;
for (; i < n / 2; i++)
{
/* Expand shorts to ints by loading zero in the high bits */
- const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
- const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
+ const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], _mm_castps_si128(_mm_setzero_ps()));
+ const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], _mm_castps_si128(_mm_setzero_ps()));
/* Convert to float */
const __m128 u0 = _mm_cvtepi32_ps (t0);
const __m128 u1 = _mm_cvtepi32_ps (t1);
- const __v4sf rgba0 = u0 * u16_float;
- const __v4sf rgba1 = u1 * u16_float;
+ const __m128 rgba0 = _mm_mul_ps(u0, u16_float);
+ const __m128 rgba1 = _mm_mul_ps(u1, u16_float);
d[2 * i + 0] = rgba0;
d[2 * i + 1] = rgba1;
}
- _mm_empty();
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined (_M_AMD64))
+ _mm_empty ();
+#endif
}
for (i *= 2 * 4; i != 4 * samples; i++)
@@ -84,40 +87,43 @@ conv_rgba16_rgbAF (const Babl *conversion,
{
long n = (samples / 2) * 2;
const __m128i *s = (const __m128i*) src;
- __v4sf *d = (__v4sf*) dst;
+ __m128 *d = (__m128*) dst;
- const __v4sf max_mask = { 0.0f, 0.0f, 0.0f, 1.0f };
+ const __m128 max_mask = { 0.0f, 0.0f, 0.0f, 1.0f };
for (; i < n / 2; i++)
{
/* Expand shorts to ints by loading zero in the high bits */
- const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
- const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
+ const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], _mm_castps_si128(_mm_setzero_ps()));
+ const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], _mm_castps_si128(_mm_setzero_ps()));
/* Convert to float */
const __m128 u0 = _mm_cvtepi32_ps (t0);
const __m128 u1 = _mm_cvtepi32_ps (t1);
/* Multiply by 1 / 65535 */
- __v4sf rgba0 = u0 * u16_float;
- __v4sf rgba1 = u1 * u16_float;
+ __m128 rgba0 = _mm_mul_ps(u0, u16_float);
+ __m128 rgba1 = _mm_mul_ps(u1, u16_float);
/* Expand alpha */
- __v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3));
- __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3));
+ __m128 aaaa0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(rgba0), _MM_SHUFFLE(3, 3, 3, 3)));
+ __m128 aaaa1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(rgba1), _MM_SHUFFLE(3, 3, 3, 3)));
/* Set the value in the alpha slot to 1.0, we know max is sufficent because alpha was a short */
aaaa0 = _mm_max_ps(aaaa0, max_mask);
aaaa1 = _mm_max_ps(aaaa1, max_mask);
/* Premultiply */
- rgba0 = rgba0 * aaaa0;
- rgba1 = rgba1 * aaaa1;
+ rgba0 = _mm_mul_ps(rgba0, aaaa0);
+ rgba1 = _mm_mul_ps(rgba1, aaaa1);
d[2 * i + 0] = rgba0;
d[2 * i + 1] = rgba1;
}
- _mm_empty();
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined (_M_AMD64))
+ _mm_empty ();
+#endif
}
dst += i * 2 * 4;
diff --git a/extensions/sse2-int8.c b/extensions/sse2-int8.c
index 6da1b5b7e..6ccdfbd0d 100644
--- a/extensions/sse2-int8.c
+++ b/extensions/sse2-int8.c
@@ -36,14 +36,14 @@ conv_yF_y8 (const Babl *conversion,
uint8_t *dst,
long samples)
{
- const __v4sf *s_vec;
+ const __m128 *s_vec;
__m128i *d_vec;
uint32_t *d_int;
long n = samples;
- const __v4sf byte_fill = _mm_set_ps1(255.0f);
- const __v4sf half = _mm_set_ps1(0.5);
+ const __m128 byte_fill = _mm_set_ps1(255.0f);
+ const __m128 half = _mm_set_ps1(0.5);
while (((uintptr_t)src % 16) && n > 0)
{
@@ -54,14 +54,14 @@ conv_yF_y8 (const Babl *conversion,
n -= 1;
}
- s_vec = (__v4sf *)src;
+ s_vec = (__m128 *)src;
d_vec = (__m128i *)dst;
/* Aligned chunks */
while (n > 16)
{
- __v4sf yyyy0, yyyy1, yyyy2, yyyy3;
+ __m128 yyyy0, yyyy1, yyyy2, yyyy3;
__m128i i32_0, i32_1, i32_2, i32_3;
__m128i i16_01, i16_23;
__m128i mm_ints;
@@ -72,23 +72,23 @@ conv_yF_y8 (const Babl *conversion,
* signed saturation, the unsigned version wasn't added
* until SSE4.
*/
- yyyy0 = *s_vec++ * byte_fill + half;
+ yyyy0 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
yyyy0 = _mm_min_ps(yyyy0, byte_fill);
- i32_0 = _mm_cvttps_epi32 ((__m128)yyyy0);
+ i32_0 = _mm_cvttps_epi32 (yyyy0);
- yyyy1 = *s_vec++ * byte_fill + half;
+ yyyy1 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
yyyy1 = _mm_min_ps(yyyy1, byte_fill);
- i32_1 = _mm_cvttps_epi32 ((__m128)yyyy1);
+ i32_1 = _mm_cvttps_epi32 (yyyy1);
i16_01 = _mm_packs_epi32 (i32_0, i32_1);
- yyyy2 = *s_vec++ * byte_fill + half;
+ yyyy2 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
yyyy2 = _mm_min_ps(yyyy2, byte_fill);
- i32_2 = _mm_cvttps_epi32 ((__m128)yyyy2);
+ i32_2 = _mm_cvttps_epi32 (yyyy2);
- yyyy3 = *s_vec++ * byte_fill + half;
+ yyyy3 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
yyyy3 = _mm_min_ps(yyyy3, byte_fill);
- i32_3 = _mm_cvttps_epi32 ((__m128)yyyy3);
+ i32_3 = _mm_cvttps_epi32 (yyyy3);
i16_23 = _mm_packs_epi32 (i32_2, i32_3);
@@ -103,15 +103,15 @@ conv_yF_y8 (const Babl *conversion,
while (n > 4)
{
- __v4sf yyyy0;
+ __m128 yyyy0;
__m128i mm_ints;
- yyyy0 = *s_vec++ * byte_fill + half;
+ yyyy0 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
yyyy0 = _mm_min_ps(yyyy0, byte_fill);
- mm_ints = _mm_cvttps_epi32 ((__m128)yyyy0);
+ mm_ints = _mm_cvttps_epi32 (yyyy0);
mm_ints = _mm_packs_epi32 (mm_ints, mm_ints);
mm_ints = _mm_packus_epi16 (mm_ints, mm_ints);
- _mm_store_ss ((float *)d_int++, (__v4sf)mm_ints);
+ _mm_store_ss ((float *)d_int++, _mm_castsi128_ps(mm_ints));
n -= 4;
}
diff --git a/meson.build b/meson.build
index 901807381..e99cffba6 100644
--- a/meson.build
+++ b/meson.build
@@ -278,6 +278,21 @@ if cc.get_id() != 'msvc' and cc.has_argument('-mmmx') and get_option('enable-mmx
endif
endif
+if cc.get_id() == 'msvc' and have_x86
+ if get_option('enable-mmx')
+ conf.set('USE_MMX', 1, description:
+ 'Define to 1 if MMX assembly are available.')
+ if get_option('enable-sse')
+ conf.set('USE_SSE', 1, description:
+ 'Define to 1 if SSE assembly are available.')
+ if get_option('enable-sse2')
+ conf.set('USE_SSE2', 1, description:
+ 'Define to 1 if sse2 assembly are available.')
+ endif
+ endif
+ endif
+endif
+
gcc_atomic_exchg_test = '''
int a = 0;
__atomic_exchange_n (&a, 1, __ATOMIC_ACQ_REL);