[babl/sse-conversions-2013] SSE Float: draft 5



commit 389120e9993cc6c99beff17eaa13b85d09994631
Author: Daniel Sabo <DanielSabo gmail com>
Date:   Mon Apr 1 21:50:13 2013 -0700

    SSE Float: draft 5
    
    Include an alternate conversion for RaGaBaA -> RGBA. Depending on
    the CPU, either the spin or the shuffle variant is significantly
    faster. Unless I can find a consistently fast version, I'm going to
    let them fight it out in the babl startup benchmarks.

 extensions/sse-float.c |   99 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 96 insertions(+), 3 deletions(-)
---
diff --git a/extensions/sse-float.c b/extensions/sse-float.c
index 96695d4..954e359 100644
--- a/extensions/sse-float.c
+++ b/extensions/sse-float.c
@@ -95,7 +95,7 @@ conv_rgbaF_linear_rgbAF_linear (const float *src, float *dst, long samples)
 }
 
 static long
-conv_rgbAF_linear_rgbaF_linear (const float *src, float *dst, long samples)
+conv_rgbAF_linear_rgbaF_linear_shuffle (const float *src, float *dst, long samples)
 {
   long i = 0;
   long remainder;
@@ -162,6 +162,81 @@ conv_rgbAF_linear_rgbaF_linear (const float *src, float *dst, long samples)
   return samples;
 }
 
+static long
+conv_rgbAF_linear_rgbaF_linear_spin (const float *src, float *dst, long samples)
+{
+  long i = 0;
+  long remainder;
+  /* Un-premultiply `samples` RGBA float pixels from src into dst; returns samples. SSE path requires both pointers 16-byte aligned. */
+  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
+    {
+      const long    n = samples;
+      const __v4sf *s = (const __v4sf*) src;
+            __v4sf *d = (__v4sf*)dst;
+      const __v4sf zero = _mm_setzero_ps();
+      const __v4sf one = _mm_set_ss(1.0f);
+
+      for ( ; i < n; i += 1)
+        {
+          __v4sf pre_abgr0, abgr0, rgba0, raaaa0;
+          /* One pixel (four floats) is handled per iteration */
+          
+          rgba0 = *s;
+          /* Reverse the lanes to ABGR so alpha sits in the low lane used by the scalar (_ss) operations */
+          pre_abgr0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(0, 1, 2, 3));
+          
+          if (_mm_ucomile_ss(pre_abgr0, zero))
+          {
+            /* Alpha <= 0: zero all lanes; the original alpha is restored below */
+            abgr0 = zero;
+          }
+          else
+          {
+            /* Compute 1/alpha in the low lane */
+            raaaa0 = _mm_div_ss(one, pre_abgr0);
+            
+            /* Broadcast the reciprocal to all four lanes */
+            raaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)raaaa0, _MM_SHUFFLE(0, 0, 0, 0));
+            
+            /* Un-premultiply: scale every lane by 1/alpha */
+            abgr0 = pre_abgr0 * raaaa0;
+          }
+          
+          /* Move the original alpha value back into the low lane */
+          abgr0 = _mm_move_ss(abgr0, pre_abgr0);
+          
+          /* Reverse the lanes back to RGBA */
+          rgba0 = (__v4sf)_mm_shuffle_epi32((__m128i)abgr0, _MM_SHUFFLE(0, 1, 2, 3));
+          
+          *d++ = rgba0;
+          s++;
+        }
+      _mm_empty (); /* NOTE(review): EMMS is normally only needed after MMX code and no MMX intrinsics appear in this loop - confirm whether this call is required */
+    }
+  /* Scalar path: processes every pixel when the SSE loop was skipped (i == 0), none when it ran to completion */
+  dst += i * 4;
+  src += i * 4;
+  remainder = samples - i;
+  while (remainder--)
+    {
+      float alpha = src[3];
+      float recip;
+      if (alpha <= 0.0f)
+        recip = 0.0f;
+      else
+        recip = 1.0f/alpha;
+      dst[0] = src[0] * recip;
+      dst[1] = src[1] * recip;
+      dst[2] = src[2] * recip;
+      dst[3] = alpha;
+      
+      src   += 4;
+      dst   += 4;
+    }
+
+  return samples;
+}
+
 #endif /* defined(USE_SSE2) */
 
 #define o(src, dst) \
@@ -195,8 +270,26 @@ init (void)
       (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
       
     {
-      o (rgbaF_linear, rgbAF_linear);
-      o (rgbAF_linear, rgbaF_linear);
+      babl_conversion_new(rgbaF_linear, 
+                          rgbAF_linear,
+                          "linear",
+                          conv_rgbaF_linear_rgbAF_linear,
+                          NULL);
+                          
+      /* Which of these is faster varies by CPU, and the difference
+       * is big enough that it's worthwhile to include both and
+       * let them fight it out in the babl benchmarks.
+       */
+      babl_conversion_new(rgbAF_linear, 
+                          rgbaF_linear,
+                          "linear",
+                          conv_rgbAF_linear_rgbaF_linear_shuffle,
+                          NULL);
+      babl_conversion_new(rgbAF_linear, 
+                          rgbaF_linear,
+                          "linear",
+                          conv_rgbAF_linear_rgbaF_linear_spin,
+                          NULL);
     }
 
 #endif /* defined(USE_SSE2) */


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]