[babl/wip/msvc: 3/6] build: Enable MMX/SSE/SSE2 for Visual Studio builds



commit 0b60f5cbd94bfbe2e0746ebf297bf173961af18c
Author: Chun-wei Fan <fanchunwei src gnome org>
Date:   Tue Jan 21 17:40:17 2020 +0800

    build: Enable MMX/SSE/SSE2 for Visual Studio builds
    
    Enable these unconditionally on x86 and x86_64, since the Visual Studio
    versions we currently support have supported these intrinsics for a very
    long time.  We do, however, need to use portable intrinsics syntax for
    the code to build on Visual Studio.
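
    For illustration only (not part of this commit), the kind of rewrite
    applied throughout the patch looks roughly like the sketch below; it
    assumes only <emmintrin.h>, and the helper name is made up here:

        #include <emmintrin.h>

        /* Broadcast lane 0 of v, then compute s * m + a without GNU C vector
         * syntax, so the same code compiles with GCC, Clang and MSVC.
         * The GNU-C-only form this replaces would be:
         *   __v4sf s = (__v4sf) _mm_shuffle_epi32 ((__m128i) v, _MM_SHUFFLE (0, 0, 0, 0));
         *   return s * m + a;
         */
        static inline __m128
        example_madd_splat (__m128 v, __m128 m, __m128 a)
        {
          __m128 s = _mm_castsi128_ps (_mm_shuffle_epi32 (_mm_castps_si128 (v),
                                                          _MM_SHUFFLE (0, 0, 0, 0)));
          return _mm_add_ps (_mm_mul_ps (s, m), a);
        }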

 babl/babl-space.c       |  23 +++--
 extensions/sse2-float.c | 259 +++++++++++++++++++++++++++---------------------
 extensions/sse2-int16.c |  42 ++++----
 extensions/sse2-int8.c  |  34 +++----
 meson.build             |  15 +++
 5 files changed, 217 insertions(+), 156 deletions(-)
---
diff --git a/babl/babl-space.c b/babl/babl-space.c
index 586ebcea4..34892b8db 100644
--- a/babl/babl-space.c
+++ b/babl/babl-space.c
@@ -957,21 +957,28 @@ static inline void babl_matrix_mul_vectorff_buf4_sse2 (const float *mat,
                                                        float       *v_out,
                                                        int          samples)
 {
-  const __v4sf m___0 = {m(mat, 0, 0), m(mat, 1, 0), m(mat, 2, 0), 0};
-  const __v4sf m___1 = {m(mat, 0, 1), m(mat, 1, 1), m(mat, 2, 1), 0};
-  const __v4sf m___2 = {m(mat, 0, 2), m(mat, 1, 2), m(mat, 2, 2), 1};
+  const __m128 m___0 = {m(mat, 0, 0), m(mat, 1, 0), m(mat, 2, 0), 0};
+  const __m128 m___1 = {m(mat, 0, 1), m(mat, 1, 1), m(mat, 2, 1), 0};
+  const __m128 m___2 = {m(mat, 0, 2), m(mat, 1, 2), m(mat, 2, 2), 1};
   int i;
   for (i = 0; i < samples; i ++)
   {
-    __v4sf a, b, c = _mm_load_ps(&v_in[0]);
-    a = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(0,0,0,0));
-    b = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(1,1,1,1));
-    c = (__v4sf) _mm_shuffle_epi32((__m128i)c, _MM_SHUFFLE(3,2,2,2));
-    _mm_store_ps (v_out, m___0 * a + m___1 * b + m___2 * c);
+    __m128 a, b, c = _mm_load_ps(&v_in[0]);
+    __m128 a_m0, b_m1, c_m2;
+    a = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(c), _MM_SHUFFLE(0,0,0,0)));
+    b = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(c), _MM_SHUFFLE(1,1,1,1)));
+    c = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(c), _MM_SHUFFLE(3,2,2,2)));
+    a_m0 = _mm_mul_ps(m___0, a);
+    b_m1 = _mm_mul_ps(m___1, b);
+    c_m2 = _mm_mul_ps(m___2, c);
+    _mm_store_ps (v_out, _mm_add_ps(_mm_add_ps (a_m0, b_m1), c_m2));
     v_out += 4;
     v_in  += 4;
   }
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined  (_M_AMD64))
   _mm_empty ();
+#endif
 }
 
 #undef m
diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c
index 3757ffe09..3e4fad298 100644
--- a/extensions/sse2-float.c
+++ b/extensions/sse2-float.c
@@ -49,8 +49,8 @@ conv_rgbaF_linear_rgbAF_linear (const Babl  *conversion,
   if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
     {
       const long    n = (samples / 2) * 2;
-      const __v4sf *s = (const __v4sf*) src;
-            __v4sf *d = (__v4sf*)dst;
+      const __m128 *s = (const __m128*) src;
+            __m128 *d = (__m128*)dst;
 
       for ( ; i < n; i += 2)
         {
@@ -60,22 +60,22 @@ conv_rgbaF_linear_rgbAF_linear (const Babl  *conversion,
           float used_alpha1 = babl_epsilon_for_zero_float (alpha1);
 
          {
-          __v4sf rbaa0, rbaa1;
+          __m128 rbaa0, rbaa1;
         
-          __v4sf rgba0 = *s++;
-          __v4sf rgba1 = *s++;
+          __m128 rgba0 = *s++;
+          __m128 rgba1 = *s++;
 
 
           /* Expand alpha */
-          __v4sf aaaa0 = (__v4sf)_mm_set1_ps(used_alpha0);
-          __v4sf aaaa1 = (__v4sf)_mm_set1_ps(used_alpha1);
+          __m128 aaaa0 = _mm_set1_ps(used_alpha0);
+          __m128 aaaa1 = _mm_set1_ps(used_alpha1);
           
           /* Premultiply */
-          rgba0 = rgba0 * aaaa0;
-          rgba1 = rgba1 * aaaa1;
+          rgba0 = _mm_mul_ps(rgba0, aaaa0);
+          rgba1 = _mm_mul_ps(rgba1, aaaa1);
     
-          aaaa0 = (__v4sf)_mm_set1_ps(alpha0);
-          aaaa1 = (__v4sf)_mm_set1_ps(alpha1);
+          aaaa0 = _mm_set1_ps(alpha0);
+          aaaa1 = _mm_set1_ps(alpha1);
 
           /* Shuffle the original alpha value back in */
           rbaa0 = _mm_shuffle_ps(rgba0, aaaa0, _MM_SHUFFLE(0, 0, 2, 0));
@@ -88,7 +88,10 @@ conv_rgbaF_linear_rgbAF_linear (const Babl  *conversion,
           *d++ = rgba1;
          }
         }
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined  (_M_AMD64))
       _mm_empty ();
+#endif
     }
 
   dst += i * 4;
@@ -120,12 +123,12 @@ conv_rgbAF_linear_rgbaF_linear_shuffle (const Babl  *conversion,
   if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
     {
       const long    n = samples;
-      const __v4sf *s = (const __v4sf*) src;
-            __v4sf *d = (__v4sf*)dst;
+      const __m128 *s = (const __m128*) src;
+            __m128 *d = (__m128*)dst;
 
       for ( ; i < n; i += 1)
         {
-          __v4sf pre_rgba0, rgba0, rbaa0, raaaa0;
+          __m128 pre_rgba0, rgba0, rbaa0, raaaa0;
           
           float alpha0 = ((float *)s)[3];
           float used_alpha0 = babl_epsilon_for_zero_float (alpha0);
@@ -138,7 +141,7 @@ conv_rgbAF_linear_rgbaF_linear_shuffle (const Babl  *conversion,
             raaaa0 = _mm_load1_ps(&recip0);
             
             /* Un-Premultiply */
-            rgba0 = pre_rgba0 * raaaa0;
+            rgba0 = _mm_mul_ps(pre_rgba0, raaaa0);
           }
             
           /* Shuffle the original alpha value back in */
@@ -148,7 +151,10 @@ conv_rgbAF_linear_rgbaF_linear_shuffle (const Babl  *conversion,
           s++;
           *d++ = rgba0;
         }
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined  (_M_AMD64))
       _mm_empty ();
+#endif
     }
 
   dst += i * 4;
@@ -184,19 +190,19 @@ conv_rgbAF_linear_rgbaF_linear_spin (const Babl  *conversion,
   if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
     {
       const long    n = samples;
-      const __v4sf *s = (const __v4sf*) src;
-            __v4sf *d = (__v4sf*)dst;
-      const __v4sf zero = _mm_set_ss (BABL_ALPHA_FLOOR_FLOAT);
-      const __v4sf one = _mm_set_ss(1.0f);
+      const __m128 *s = (const __m128*) src;
+            __m128 *d = (__m128*)dst;
+      const __m128 zero = _mm_set_ss (BABL_ALPHA_FLOOR_FLOAT);
+      const __m128 one = _mm_set_ss(1.0f);
 
       for ( ; i < n; i += 1)
         {
-          __v4sf pre_abgr0, abgr0, rgba0, raaaa0;
+          __m128 pre_abgr0, abgr0, rgba0, raaaa0;
           
           
           rgba0 = *s;
           /* Rotate to ABGR */
-          pre_abgr0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(0, 1, 2, 3));
+          pre_abgr0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(rgba0),_MM_SHUFFLE(0, 1, 2, 3)));
           
           if (_mm_ucomile_ss(pre_abgr0, zero))
           {
@@ -209,22 +215,25 @@ conv_rgbAF_linear_rgbaF_linear_spin (const Babl  *conversion,
             raaaa0 = _mm_div_ss(one, pre_abgr0);
             
             /* Expand reciprocal */
-            raaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)raaaa0, _MM_SHUFFLE(0, 0, 0, 0));
+            raaaa0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(raaaa0), _MM_SHUFFLE(0, 0, 0, 0)));
             
             /* Un-Premultiply */
-            abgr0 = pre_abgr0 * raaaa0;
+            abgr0 = _mm_mul_ps(pre_abgr0, raaaa0);
           }
           
           /* Move the original alpha value back in */
           abgr0 = _mm_move_ss(abgr0, pre_abgr0);
           
           /* Rotate to ABGR */
-          rgba0 = (__v4sf)_mm_shuffle_epi32((__m128i)abgr0, _MM_SHUFFLE(0, 1, 2, 3));
+          rgba0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(abgr0), _MM_SHUFFLE(0, 1, 2, 3)));
           
           *d++ = rgba0;
           s++;
         }
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined  (_M_AMD64))
       _mm_empty ();
+#endif
     }
 
   dst += i * 4;
@@ -248,93 +257,109 @@ conv_rgbAF_linear_rgbaF_linear_spin (const Babl  *conversion,
     }
 }
 
-#define splat4f(x) ((__v4sf){x,x,x,x})
-#define splat4i(x) ((__v4si){x,x,x,x})
+#define splat4f(x) ((__m128){x,x,x,x})
+#define splat4i(x) _mm_set1_epi32(x)
 #define FLT_ONE 0x3f800000 // ((union {float f; int i;}){1.0f}).i
 #define FLT_MANTISSA (1<<23)
 
+typedef union {
+    __m128 v;    // SSE 4 x float vector
+    float a[4];  // scalar array of 4 floats
+} __m128_u;
+
 static inline float
-sse_max_component (__v4sf x) {
-  __v4sf s;
-  __v4sf m;
+sse_max_component (__m128 x) {
+
+  __m128 s;
+  __m128_u m;
 
   /* m = [max (x[3], x[1]), max (x[2], x[0])] */
-  s = (__v4sf) _mm_shuffle_epi32 ((__m128i) x, _MM_SHUFFLE(0, 0, 3, 2));
-  m = _mm_max_ps (x, s);
+  s = _mm_castsi128_ps( _mm_shuffle_epi32 (_mm_castps_si128(x), _MM_SHUFFLE(0, 0, 3, 2)));
+  m.v = _mm_max_ps (x, s);
 
   /* m = [max (m[1], m[0])] = [max (max (x[3], x[1]), max (x[2], x[0]))] */
-  s = (__v4sf) _mm_shuffle_epi32 ((__m128i) m, _MM_SHUFFLE(0, 0, 0, 1));
-  m = _mm_max_ps (m, s);
+  s = _mm_castsi128_ps( _mm_shuffle_epi32 (_mm_castps_si128(m.v), _MM_SHUFFLE(0, 0, 0, 1)));
+  m.v = _mm_max_ps (m.v, s);
 
-  return m[0];
+  return m.a[0];
 }
 
-static inline __v4sf
-sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
+static inline __m128
+sse_init_newton (__m128 x, double exponent, double c0, double c1, double c2)
 {
     double norm = exponent*M_LN2/FLT_MANTISSA;
-    __v4sf y = _mm_cvtepi32_ps((__m128i)((__v4si)x - splat4i(FLT_ONE)));
-    return splat4f(c0) + splat4f(c1*norm)*y + splat4f(c2*norm*norm)*y*y;
+    __m128 y = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_castps_si128(x), splat4i(FLT_ONE)));
+    return _mm_add_ps(_mm_add_ps(splat4f(c0), _mm_mul_ps(splat4f(c1*norm), y)),
+                      _mm_mul_ps(_mm_mul_ps(splat4f(c2*norm*norm), y), y));
 }
 
-static inline __v4sf
-sse_pow_1_24 (__v4sf x)
+#define _mm_square_ps(x) _mm_mul_ps(x, x)
+#define _mm_cube_ps(x) _mm_mul_ps(_mm_mul_ps(x, x), x)
+#define _mm_6pow_ps(x) _mm_mul_ps(_mm_cube_ps(x), _mm_cube_ps(x))
+#define _mm_7pow_ps(x) _mm_mul_ps(_mm_mul_ps(_mm_square_ps(x), _mm_square_ps(x)), _mm_cube_ps(x))
+
+static inline __m128
+sse_pow_1_24 (__m128 x)
 {
-  __v4sf y, z;
-  if (sse_max_component (x) > 1024.0f) {
+  __m128 y, z;
+  __m128_u u;
+  u.v = x;
+  if (sse_max_component (u.v) > 1024.0f) {
     /* for large values, fall back to a slower but more accurate version */
-    return _mm_set_ps (expf (logf (x[3]) * (1.0f / 2.4f)),
-                       expf (logf (x[2]) * (1.0f / 2.4f)),
-                       expf (logf (x[1]) * (1.0f / 2.4f)),
-                       expf (logf (x[0]) * (1.0f / 2.4f)));
+    return _mm_set_ps (expf (logf (u.a[3]) * (1.0f / 2.4f)),
+                       expf (logf (u.a[2]) * (1.0f / 2.4f)),
+                       expf (logf (u.a[1]) * (1.0f / 2.4f)),
+                       expf (logf (u.a[0]) * (1.0f / 2.4f)));
   }
-  y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
-  x = _mm_sqrt_ps (x);
+  y = sse_init_newton (u.v, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+  u.v = _mm_sqrt_ps (u.v);
   /* newton's method for x^(-1/6) */
-  z = splat4f (1.f/6.f) * x;
-  y = splat4f (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
-  y = splat4f (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
-  return x*y;
+  z = _mm_mul_ps(splat4f (1.f/6.f), u.v);
+  y = _mm_sub_ps(_mm_mul_ps(splat4f (7.f/6.f), y), _mm_mul_ps(z, _mm_7pow_ps(y)));
+  y = _mm_sub_ps(_mm_mul_ps(splat4f (7.f/6.f), y), _mm_mul_ps(z, _mm_7pow_ps(y)));
+  return _mm_mul_ps(u.v, y);
 }
 
-static inline __v4sf
-sse_pow_24 (__v4sf x)
+static inline __m128
+sse_pow_24 (__m128 x)
 {
-  __v4sf y, z;
-  if (sse_max_component (x) > 16.0f) {
+  __m128 y, z;
+  __m128_u u;
+  u.v = x;
+  if (sse_max_component (u.v) > 16.0f) {
     /* for large values, fall back to a slower but more accurate version */
-    return _mm_set_ps (expf (logf (x[3]) * 2.4f),
-                       expf (logf (x[2]) * 2.4f),
-                       expf (logf (x[1]) * 2.4f),
-                       expf (logf (x[0]) * 2.4f));
+    return _mm_set_ps (expf (logf (u.a[3]) * 2.4f),
+                       expf (logf (u.a[2]) * 2.4f),
+                       expf (logf (u.a[1]) * 2.4f),
+                       expf (logf (u.a[0]) * 2.4f));
   }
-  y = sse_init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+  y = sse_init_newton (u.v, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
   /* newton's method for x^(-1/5) */
-  z = splat4f (1.f/5.f) * x;
-  y = splat4f (6.f/5.f) * y - z * ((y*y*y)*(y*y*y));
-  y = splat4f (6.f/5.f) * y - z * ((y*y*y)*(y*y*y));
-  x *= y;
-  return x*x*x;
+  z = _mm_mul_ps(splat4f (1.f/5.f), x);
+  y = _mm_sub_ps(_mm_mul_ps(splat4f (6.f/5.f), y), _mm_mul_ps(z, _mm_6pow_ps(y)));
+  y = _mm_sub_ps(_mm_mul_ps(splat4f (6.f/5.f), y), _mm_mul_ps(z, _mm_6pow_ps(y)));
+  x = _mm_mul_ps(x, y);
+  return _mm_cube_ps(x);
 }
 
-static inline __v4sf
-linear_to_gamma_2_2_sse2 (__v4sf x)
+static inline __m128
+linear_to_gamma_2_2_sse2 (__m128 x)
 {
-  __v4sf curve = sse_pow_1_24 (x) * splat4f (1.055f) -
-                 splat4f (0.055f                     -
-                          3.0f / (float) (1 << 24));
-                          /* ^ offset the result such that 1 maps to 1 */
-  __v4sf line = x * splat4f (12.92f);
-  __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f));
+  __m128 curve = _mm_sub_ps(_mm_mul_ps(sse_pow_1_24 (x), splat4f (1.055f)),
+                            splat4f (0.055f                     -
+                            3.0f / (float) (1 << 24)));
+                            /* ^ offset the result such that 1 maps to 1 */
+  __m128 line = _mm_mul_ps(x, splat4f (12.92f));
+  __m128 mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f));
   return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
 }
 
-static inline __v4sf
-gamma_2_2_to_linear_sse2 (__v4sf x)
+static inline __m128
+gamma_2_2_to_linear_sse2 (__m128 x)
 {
-  __v4sf curve = sse_pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f));
-  __v4sf line = x * splat4f (1/12.92f);
-  __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.04045f));
+  __m128 curve = sse_pow_24 (_mm_mul_ps(_mm_add_ps(x, splat4f (0.055f)), splat4f (1/1.055f)));
+  __m128 line = _mm_mul_ps(x, splat4f (1/12.92f));
+  __m128 mask = _mm_cmpgt_ps (x, splat4f (0.04045f));
   return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
 }
 
@@ -348,16 +373,16 @@ func (const Babl *conversion,const float *src, float *dst, long samples)\
       for (; i > 3; i -= 4, src += 16, dst += 16)\
         {\
           /* Pack the rgb components from 4 pixels into 3 vectors, gammafy, unpack. */\
-          __v4sf x0 = _mm_load_ps (src);\
-          __v4sf x1 = _mm_load_ps (src+4);\
-          __v4sf x2 = _mm_load_ps (src+8);\
-          __v4sf x3 = _mm_load_ps (src+12);\
-          __v4sf y0 = _mm_movelh_ps (x0, x1);\
-          __v4sf y1 = _mm_movelh_ps (x2, x3);\
-          __v4sf z0 = _mm_unpackhi_ps (x0, x1);\
-          __v4sf z1 = _mm_unpackhi_ps (x2, x3);\
-          __v4sf y2 = _mm_movelh_ps (z0, z1);\
-          __v4sf y3 = _mm_movehl_ps (z1, z0);\
+          __m128 x0 = _mm_load_ps (src);\
+          __m128 x1 = _mm_load_ps (src+4);\
+          __m128 x2 = _mm_load_ps (src+8);\
+          __m128 x3 = _mm_load_ps (src+12);\
+          __m128 y0 = _mm_movelh_ps (x0, x1);\
+          __m128 y1 = _mm_movelh_ps (x2, x3);\
+          __m128 z0 = _mm_unpackhi_ps (x0, x1);\
+          __m128 z1 = _mm_unpackhi_ps (x2, x3);\
+          __m128 y2 = _mm_movelh_ps (z0, z1);\
+          __m128 y3 = _mm_movehl_ps (z1, z0);\
           y0 = munge (y0);\
           _mm_storel_pi ((__m64*)(dst), y0);\
           _mm_storeh_pi ((__m64*)(dst+4), y0);\
@@ -374,7 +399,7 @@ func (const Babl *conversion,const float *src, float *dst, long samples)\
         }\
       for (; i > 0; i--, src += 4, dst += 4)\
         {\
-          __v4sf x = munge (_mm_load_ps (src));\
+          __m128 x = munge (_mm_load_ps (src));\
           float a = src[3];\
           _mm_store_ps (dst, x);\
           dst[3] = a;\
@@ -384,7 +409,7 @@ func (const Babl *conversion,const float *src, float *dst, long samples)\
     {\
       for (; i > 0; i--, src += 4, dst += 4)\
         {\
-          __v4sf x = munge (_mm_loadu_ps (src));\
+          __m128 x = munge (_mm_loadu_ps (src));\
           float a = src[3];\
           _mm_storeu_ps (dst, x);\
           dst[3] = a;\
@@ -407,17 +432,17 @@ static void conv_rgbaF_linear_rgbAF_gamma (const Babl  *conversion,
 
 #define YA_APPLY(load, store, convert) \
 { \
-  __v4sf yyaa0, yyaa1; \
-  __v4sf yaya0  = load ((float *)s++); \
-  __v4sf yaya1  = load ((float *)s++); \
-  __v4sf yyyy01 = _mm_shuffle_ps (yaya0, yaya1, _MM_SHUFFLE(0, 2, 0, 2)); \
+  __m128 yyaa0, yyaa1; \
+  __m128 yaya0  = load ((float *)s++); \
+  __m128 yaya1  = load ((float *)s++); \
+  __m128 yyyy01 = _mm_shuffle_ps (yaya0, yaya1, _MM_SHUFFLE(0, 2, 0, 2)); \
 \
   yyyy01 = convert (yyyy01); \
 \
   yyaa0 = _mm_shuffle_ps (yyyy01, yaya0, _MM_SHUFFLE(3, 1, 0, 1)); \
-  yaya0 = (__v4sf)_mm_shuffle_epi32((__m128i)yyaa0, _MM_SHUFFLE(3, 1, 2, 0)); \
+  yaya0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(yyaa0), _MM_SHUFFLE(3, 1, 2, 0))); \
   yyaa1 = _mm_shuffle_ps (yyyy01, yaya1, _MM_SHUFFLE(3, 1, 2, 3)); \
-  yaya1 = (__v4sf)_mm_shuffle_epi32((__m128i)yyaa1, _MM_SHUFFLE(3, 1, 2, 0)); \
+  yaya1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(yyaa1), _MM_SHUFFLE(3, 1, 2, 0))); \
 \
   store ((float *)d++, yaya0); \
   store ((float *)d++, yaya1); \
@@ -429,8 +454,8 @@ conv_yaF_linear_yaF_gamma (const Babl  *conversion,
                            float       *dst, 
                            long         samples)
 {
-  const __v4sf *s = (const __v4sf*)src;
-        __v4sf *d = (__v4sf*)dst;
+  const __m128 *s = (const __m128*)src;
+        __m128 *d = (__m128*)dst;
 
   if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
     {
@@ -455,7 +480,9 @@ conv_yaF_linear_yaF_gamma (const Babl  *conversion,
   while (samples--)
     {
       float y = *src++;
-      *dst++ = linear_to_gamma_2_2_sse2 (splat4f (y))[0];
+      __m128_u u;
+      u.v = linear_to_gamma_2_2_sse2 (splat4f (y));
+      *dst++ = u.a[0];
       *dst++ = *src++;
     }
 }
@@ -467,8 +494,8 @@ conv_yaF_gamma_yaF_linear (const Babl  *conversion,
                            float       *dst, 
                            long         samples)
 {
-  const __v4sf *s = (const __v4sf*)src;
-        __v4sf *d = (__v4sf*)dst;
+  const __m128 *s = (const __m128*)src;
+        __m128 *d = (__m128*)dst;
 
   if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
     {
@@ -493,7 +520,9 @@ conv_yaF_gamma_yaF_linear (const Babl  *conversion,
   while (samples--)
     {
       float y = *src++;
-      *dst++ = gamma_2_2_to_linear_sse2 (splat4f (y))[0];
+      __m128_u u;
+      u.v = gamma_2_2_to_linear_sse2 (splat4f (y));
+      *dst++ = u.a[0];
       *dst++ = *src++;
     }
 }
@@ -504,14 +533,14 @@ conv_yF_linear_yF_gamma (const Babl  *conversion,
                          float       *dst, 
                          long         samples)
 {
-  const __v4sf *s = (const __v4sf*)src;
-        __v4sf *d = (__v4sf*)dst;
+  const __m128 *s = (const __m128*)src;
+        __m128 *d = (__m128*)dst;
 
   if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
     {
       while (samples > 4)
         {
-          __v4sf rgba0 = _mm_load_ps ((float *)s++);
+          __m128 rgba0 = _mm_load_ps ((float *)s++);
           rgba0 = linear_to_gamma_2_2_sse2 (rgba0);
           _mm_store_ps ((float *)d++, rgba0);
           samples -= 4;
@@ -521,7 +550,7 @@ conv_yF_linear_yF_gamma (const Babl  *conversion,
     {
       while (samples > 4)
         {
-          __v4sf rgba0 = _mm_loadu_ps ((float *)s++);
+          __m128 rgba0 = _mm_loadu_ps ((float *)s++);
           rgba0 = linear_to_gamma_2_2_sse2 (rgba0);
           _mm_storeu_ps ((float *)d++, rgba0);
           samples -= 4;
@@ -534,7 +563,9 @@ conv_yF_linear_yF_gamma (const Babl  *conversion,
   while (samples--)
     {
       float y = *src++;
-      *dst++ = linear_to_gamma_2_2_sse2 (splat4f (y))[0];
+      __m128_u u;
+      u.v = linear_to_gamma_2_2_sse2 (splat4f (y));
+      *dst++ = u.a[0];
     }
 }
 
@@ -544,14 +575,14 @@ conv_yF_gamma_yF_linear (const Babl  *conversion,
                          float       *dst, 
                          long         samples)
 {
-  const __v4sf *s = (const __v4sf*)src;
-        __v4sf *d = (__v4sf*)dst;
+  const __m128 *s = (const __m128*)src;
+        __m128 *d = (__m128*)dst;
 
   if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
     {
       while (samples > 4)
         {
-          __v4sf rgba0 = _mm_load_ps ((float *)s++);
+          __m128 rgba0 = _mm_load_ps ((float *)s++);
           rgba0 = gamma_2_2_to_linear_sse2 (rgba0);
           _mm_store_ps ((float *)d++, rgba0);
           samples -= 4;
@@ -561,7 +592,7 @@ conv_yF_gamma_yF_linear (const Babl  *conversion,
     {
       while (samples > 4)
         {
-          __v4sf rgba0 = _mm_loadu_ps ((float *)s++);
+          __m128 rgba0 = _mm_loadu_ps ((float *)s++);
           rgba0 = gamma_2_2_to_linear_sse2 (rgba0);
           _mm_storeu_ps ((float *)d++, rgba0);
           samples -= 4;
@@ -574,7 +605,9 @@ conv_yF_gamma_yF_linear (const Babl  *conversion,
   while (samples--)
     {
       float y = *src++;
-      *dst++ = gamma_2_2_to_linear_sse2 (splat4f (y))[0];
+      __m128_u u;
+      u.v = gamma_2_2_to_linear_sse2 (splat4f (y));
+      *dst++ = u.a[0];
     }
 }
 
diff --git a/extensions/sse2-int16.c b/extensions/sse2-int16.c
index 52ca01411..f1de8f6d9 100644
--- a/extensions/sse2-int16.c
+++ b/extensions/sse2-int16.c
@@ -32,7 +32,7 @@
 #include "extensions/util.h"
 
 #define Q(a) { a, a, a, a }
-static const __v4sf  u16_float = Q (1.f / 65535);
+static const __m128  u16_float = Q (1.f / 65535);
 
 static void
 conv_rgba16_rgbaF (const Babl     *conversion,
@@ -46,25 +46,28 @@ conv_rgba16_rgbaF (const Babl     *conversion,
     {
       long           n  = (samples / 2) * 2;
       const __m128i *s  = (const __m128i*) src;
-            __v4sf  *d  = (__v4sf*) dst;
+            __m128  *d  = (__m128*) dst;
 
       for (; i < n / 2; i++)
         {
           /* Expand shorts to ints by loading zero in the high bits */
-          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
-          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
+          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], _mm_castps_si128(_mm_setzero_ps()));
+          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], _mm_castps_si128(_mm_setzero_ps()));
 
           /* Convert to float */
           const __m128  u0 = _mm_cvtepi32_ps (t0);
           const __m128  u1 = _mm_cvtepi32_ps (t1);
 
-          const __v4sf rgba0 = u0 * u16_float;
-          const __v4sf rgba1 = u1 * u16_float;
+          const __m128 rgba0 = _mm_mul_ps(u0, u16_float);
+          const __m128 rgba1 = _mm_mul_ps(u1, u16_float);
 
           d[2 * i + 0] = rgba0;
           d[2 * i + 1] = rgba1;
         }
-      _mm_empty();
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined  (_M_AMD64))
+      _mm_empty ();
+#endif
     }
 
   for (i *= 2 * 4; i != 4 * samples; i++)
@@ -84,40 +87,43 @@ conv_rgba16_rgbAF (const Babl     *conversion,
     {
       long           n  = (samples / 2) * 2;
       const __m128i *s  = (const __m128i*) src;
-            __v4sf  *d  = (__v4sf*) dst;
+            __m128  *d  = (__m128*) dst;
 
-      const __v4sf  max_mask = { 0.0f, 0.0f, 0.0f, 1.0f };
+      const __m128  max_mask = { 0.0f, 0.0f, 0.0f, 1.0f };
 
       for (; i < n / 2; i++)
         {
           /* Expand shorts to ints by loading zero in the high bits */
-          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
-          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
+          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], _mm_castps_si128(_mm_setzero_ps()));
+          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], _mm_castps_si128(_mm_setzero_ps()));
 
           /* Convert to float */
           const __m128  u0 = _mm_cvtepi32_ps (t0);
           const __m128  u1 = _mm_cvtepi32_ps (t1);
 
           /* Multiply by 1 / 65535 */
-          __v4sf rgba0 = u0 * u16_float;
-          __v4sf rgba1 = u1 * u16_float;
+          __m128 rgba0 = _mm_mul_ps(u0, u16_float);
+          __m128 rgba1 = _mm_mul_ps(u1, u16_float);
           
           /* Expand alpha */
-          __v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3));
-          __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3));
+          __m128 aaaa0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(rgba0), _MM_SHUFFLE(3, 3, 3, 3)));
+          __m128 aaaa1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(rgba1), _MM_SHUFFLE(3, 3, 3, 3)));
           
           /* Set the value in the alpha slot to 1.0, we know max is sufficent because alpha was a short */
           aaaa0 = _mm_max_ps(aaaa0, max_mask);
           aaaa1 = _mm_max_ps(aaaa1, max_mask);
           
           /* Premultiply */
-          rgba0 = rgba0 * aaaa0;
-          rgba1 = rgba1 * aaaa1;
+          rgba0 = _mm_mul_ps(rgba0, aaaa0);
+          rgba1 = _mm_mul_ps(rgba1, aaaa1);
           
           d[2 * i + 0] = rgba0;
           d[2 * i + 1] = rgba1;
         }
-      _mm_empty();
+
+#if !defined (_MSC_VER) || (!defined (_M_X64) && !defined  (_M_AMD64))
+      _mm_empty ();
+#endif
     }
 
   dst += i * 2 * 4;
diff --git a/extensions/sse2-int8.c b/extensions/sse2-int8.c
index 6da1b5b7e..6ccdfbd0d 100644
--- a/extensions/sse2-int8.c
+++ b/extensions/sse2-int8.c
@@ -36,14 +36,14 @@ conv_yF_y8 (const Babl  *conversion,
             uint8_t     *dst, 
             long         samples)
 {
-  const __v4sf *s_vec;
+  const __m128 *s_vec;
   __m128i      *d_vec;
   uint32_t     *d_int;
 
   long n = samples;
 
-  const __v4sf byte_fill = _mm_set_ps1(255.0f);
-  const __v4sf half      = _mm_set_ps1(0.5);
+  const __m128 byte_fill = _mm_set_ps1(255.0f);
+  const __m128 half      = _mm_set_ps1(0.5);
 
   while (((uintptr_t)src % 16) && n > 0)
     {
@@ -54,14 +54,14 @@ conv_yF_y8 (const Babl  *conversion,
       n -= 1;
     }
 
-  s_vec = (__v4sf *)src;
+  s_vec = (__m128 *)src;
   d_vec = (__m128i *)dst;
 
   /* Aligned chunks */
 
   while (n > 16)
     {
-      __v4sf  yyyy0, yyyy1, yyyy2, yyyy3;
+      __m128  yyyy0, yyyy1, yyyy2, yyyy3;
       __m128i i32_0, i32_1, i32_2, i32_3;
       __m128i i16_01, i16_23;
       __m128i mm_ints;
@@ -72,23 +72,23 @@ conv_yF_y8 (const Babl  *conversion,
        * signed saturation, the unsigned version wasn't added
        * until SSE4.
        */
-      yyyy0 = *s_vec++ * byte_fill + half;
+      yyyy0 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
       yyyy0 = _mm_min_ps(yyyy0, byte_fill);
-      i32_0 = _mm_cvttps_epi32 ((__m128)yyyy0);
+      i32_0 = _mm_cvttps_epi32 (yyyy0);
 
-      yyyy1 = *s_vec++ * byte_fill + half;
+      yyyy1 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
       yyyy1 = _mm_min_ps(yyyy1, byte_fill);
-      i32_1 = _mm_cvttps_epi32 ((__m128)yyyy1);
+      i32_1 = _mm_cvttps_epi32 (yyyy1);
 
       i16_01 = _mm_packs_epi32 (i32_0, i32_1);
 
-      yyyy2 = *s_vec++ * byte_fill + half;
+      yyyy2 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
       yyyy2 = _mm_min_ps(yyyy2, byte_fill);
-      i32_2 = _mm_cvttps_epi32 ((__m128)yyyy2);
+      i32_2 = _mm_cvttps_epi32 (yyyy2);
 
-      yyyy3 = *s_vec++ * byte_fill + half;
+      yyyy3 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
       yyyy3 = _mm_min_ps(yyyy3, byte_fill);
-      i32_3 = _mm_cvttps_epi32 ((__m128)yyyy3);
+      i32_3 = _mm_cvttps_epi32 (yyyy3);
 
       i16_23 = _mm_packs_epi32 (i32_2, i32_3);
 
@@ -103,15 +103,15 @@ conv_yF_y8 (const Babl  *conversion,
 
   while (n > 4)
     {
-      __v4sf  yyyy0;
+      __m128  yyyy0;
       __m128i mm_ints;
 
-      yyyy0 = *s_vec++ * byte_fill + half;
+      yyyy0 = _mm_add_ps(_mm_mul_ps(*s_vec++, byte_fill), half);
       yyyy0 = _mm_min_ps(yyyy0, byte_fill);
-      mm_ints = _mm_cvttps_epi32 ((__m128)yyyy0);
+      mm_ints = _mm_cvttps_epi32 (yyyy0);
       mm_ints = _mm_packs_epi32 (mm_ints, mm_ints);
       mm_ints = _mm_packus_epi16 (mm_ints, mm_ints);
-      _mm_store_ss ((float *)d_int++, (__v4sf)mm_ints);
+      _mm_store_ss ((float *)d_int++, _mm_castsi128_ps(mm_ints));
 
       n -= 4;
     }
diff --git a/meson.build b/meson.build
index 58825e05f..84028aee9 100644
--- a/meson.build
+++ b/meson.build
@@ -277,6 +277,21 @@ if cc.get_id() != 'msvc' and cc.has_argument('-mmmx') and get_option('enable-mmx
   endif
 endif
 
+if cc.get_id() == 'msvc' and have_x86
+  if get_option('enable-mmx')
+    conf.set('USE_MMX', 1, description:
+      'Define to 1 if MMX assembly is available.')
+    if get_option('enable-sse')
+      conf.set('USE_SSE', 1, description:
+        'Define to 1 if SSE assembly is available.')
+      if get_option('enable-sse2')
+        conf.set('USE_SSE2', 1, description:
+          'Define to 1 if SSE2 assembly is available.')
+      endif
+    endif
+  endif
+endif
+
 gcc_atomic_exchg_test = '''
   int a = 0;
   __atomic_exchange_n (&a, 1, __ATOMIC_ACQ_REL);


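A configuration sketch for trying this out (assuming enable-mmx, enable-sse and
enable-sse2 are boolean options in meson_options.txt, as the get_option() calls
above imply; the build directory name is arbitrary):

  meson setup _build -Denable-mmx=true -Denable-sse=true -Denable-sse2=true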