[babl] make pow implementations inlineable - gaining some fixed amount of blitting performance
- From: Øyvind Kolås <ok src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl] make pow implementations inlineable - gaining some fixed amount of blitting performance
- Date: Sun, 20 Nov 2016 09:36:36 +0000 (UTC)
commit 1d1c34f3c83e12d65a3f18b0b748ddd9b0040ddf
Author: Øyvind Kolås <pippin gimp org>
Date: Sat Nov 19 18:27:06 2016 +0100
make pow implementations inlineable - gaining some fixed amount of blitting performance
babl/base/pow-24.c | 3 +-
babl/base/pow-24.h | 118 +++++++++++++++++++++++++++++++++++++++++++++--
extensions/sse2-float.c | 14 +++---
3 files changed, 123 insertions(+), 12 deletions(-)
---
diff --git a/babl/base/pow-24.c b/babl/base/pow-24.c
index 03fca43..dbc4071 100644
--- a/babl/base/pow-24.c
+++ b/babl/base/pow-24.c
@@ -18,7 +18,7 @@
#include <math.h>
#include "util.h"
-
+#if 0
/* a^b = exp(b*log(a))
*
* Extracting the exponent from a float gives us an approximate log.
@@ -125,3 +125,4 @@ babl_pow_1_24f (float x)
y = (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
return x*y;
}
+#endif
diff --git a/babl/base/pow-24.h b/babl/base/pow-24.h
index 2fb338f..cbd6fd4 100644
--- a/babl/base/pow-24.h
+++ b/babl/base/pow-24.h
@@ -19,9 +19,119 @@
#ifndef _BASE_POW_24_H
#define _BASE_POW_24_H
-extern double babl_pow_1_24 (double x);
-extern double babl_pow_24 (double x);
-extern float babl_pow_1_24f (float x);
-extern float babl_pow_24f (float x);
+static double babl_pow_1_24 (double x);
+static double babl_pow_24 (double x);
+static float babl_pow_1_24f (float x);
+static float babl_pow_24f (float x);
+
+
+/* a^b = exp(b*log(a))
+ *
+ * Extracting the exponent from a float gives us an approximate log.
+ * Or better yet, reinterpret the bitpattern of the whole float as an int.
+ *
+ * However, the output values of 12throot vary by less than a factor of 2
+ * over the domain we care about, so we only get log() that way, not exp().
+ *
+ * Approximate exp() with a low-degree polynomial; not exactly equal to the
+ * Taylor series since we're minimizing maximum error over a certain finite
+ * domain. It's not worthwhile to use lots of terms, since Newton's method
+ * has a better convergence rate once you get reasonably close to the answer.
+ */
+static inline double
+init_newton (double x, double exponent, double c0, double c1, double c2)
+{
+ int iexp;
+ double y = frexp(x, &iexp);
+ y = 2*y+(iexp-2);
+ c1 *= M_LN2*exponent;
+ c2 *= M_LN2*M_LN2*exponent*exponent;
+ return y = c0 + c1*y + c2*y*y;
+}
+
+/* Returns x^2.4 == (x*(x^(-1/5)))^3, using Newton's method for x^(-1/5).
+ */
+static inline double
+babl_pow_24 (double x)
+{
+ double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+ int i;
+ for (i = 0; i < 3; i++)
+ y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
+ x *= y;
+ return x*x*x;
+}
+
+/* Returns x^(1/2.4) == x*((x^(-1/6))^(1/2))^7, using Newton's method for x^(-1/6).
+ */
+static inline double
+babl_pow_1_24 (double x)
+{
+ double y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+ int i;
+ double z;
+ x = sqrt (x);
+ /* newton's method for x^(-1/6) */
+ z = (1./6.) * x;
+ for (i = 0; i < 3; i++)
+ y = (7./6.) * y - z * ((y*y)*(y*y)*(y*y*y));
+ return x*y;
+}
+
+//////////////////////////////////////////////
+/* a^b = exp(b*log(a))
+ *
+ * Extracting the exponent from a float gives us an approximate log.
+ * Or better yet, reinterpret the bitpattern of the whole float as an int.
+ *
+ * However, the output values of 12throot vary by less than a factor of 2
+ * over the domain we care about, so we only get log() that way, not exp().
+ *
+ * Approximate exp() with a low-degree polynomial; not exactly equal to the
+ * Taylor series since we're minimizing maximum error over a certain finite
+ * domain. It's not worthwhile to use lots of terms, since Newton's method
+ * has a better convergence rate once you get reasonably close to the answer.
+ */
+static inline float
+init_newtonf (float x, float exponent, float c0, float c1, float c2)
+{
+ int iexp;
+ float y = frexpf(x, &iexp);
+ y = 2*y+(iexp-2);
+ c1 *= M_LN2*exponent;
+ c2 *= M_LN2*M_LN2*exponent*exponent;
+ return y = c0 + c1*y + c2*y*y;
+}
+
+/* Returns x^2.4 == (x*(x^(-1/5)))^3, using Newton's method for x^(-1/5).
+ */
+static inline float
+babl_pow_24f (float x)
+{
+ float y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
+ int i;
+ for (i = 0; i < 3; i++)
+ y = (1.f+1.f/5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
+ x *= y;
+ return x*x*x;
+}
+
+/* Returns x^(1/2.4) == x*((x^(-1/6))^(1/2))^7, using Newton's method for x^(-1/6).
+ */
+static inline float
+babl_pow_1_24f (float x)
+{
+ float y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
+ int i;
+ float z;
+ x = sqrtf (x);
+ /* newton's method for x^(-1/6) */
+ z = (1.f/6.f) * x;
+ for (i = 0; i < 3; i++)
+ y = (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
+ return x*y;
+}
+
+
#endif
diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c
index 8148bf2..72463ee 100644
--- a/extensions/sse2-float.c
+++ b/extensions/sse2-float.c
@@ -244,7 +244,7 @@ conv_rgbAF_linear_rgbaF_linear_spin (const float *src, float *dst, long samples)
#define FLT_MANTISSA (1<<23)
static inline __v4sf
-init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
+sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
{
double norm = exponent*M_LN2/FLT_MANTISSA;
__v4sf y = _mm_cvtepi32_ps((__m128i)((__v4si)x - splat4i(FLT_ONE)));
@@ -252,10 +252,10 @@ init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
}
static inline __v4sf
-pow_1_24 (__v4sf x)
+sse_pow_1_24 (__v4sf x)
{
__v4sf y, z;
- y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+ y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
x = _mm_sqrt_ps (x);
/* newton's method for x^(-1/6) */
z = splat4f (1.f/6.f) * x;
@@ -265,10 +265,10 @@ pow_1_24 (__v4sf x)
}
static inline __v4sf
-pow_24 (__v4sf x)
+sse_pow_24 (__v4sf x)
{
__v4sf y, z;
- y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+ y = sse_init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
/* newton's method for x^(-1/5) */
z = splat4f (1.f/5.f) * x;
y = splat4f (6.f/5.f) * y - z * ((y*y*y)*(y*y*y));
@@ -280,7 +280,7 @@ pow_24 (__v4sf x)
static inline __v4sf
linear_to_gamma_2_2_sse2 (__v4sf x)
{
- __v4sf curve = pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f);
+ __v4sf curve = sse_pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f);
__v4sf line = x * splat4f (12.92f);
__v4sf mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f));
return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
@@ -289,7 +289,7 @@ linear_to_gamma_2_2_sse2 (__v4sf x)
static inline __v4sf
gamma_2_2_to_linear_sse2 (__v4sf x)
{
- __v4sf curve = pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f));
+ __v4sf curve = sse_pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f));
__v4sf line = x * splat4f (1/12.92f);
__v4sf mask = _mm_cmpgt_ps (x, splat4f (0.04045f));
return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]