[babl] babl, sse2-float: fall back to slow, accurate path for large pow-2.4 inputs
- From: N/A <ell src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl] babl, sse2-float: fall back to slow, accurate path for large pow-2.4 inputs
- Date: Tue, 7 Nov 2017 21:08:01 +0000 (UTC)
commit 9aed0e5eb66c57e4352a5bc293d157eadba77abd
Author: Ell <ell_se yahoo com>
Date: Tue Nov 7 15:44:39 2017 -0500
babl, sse2-float: fall back to slow, accurate path for large pow-2.4 inputs
The approximations we use for pow_24() and pow_1_24() diverge from
the actual function for large-enough input values. This can lead
not just to inaccurate results, but also to infinities and NaNs,
especially when multiple conversions are strung in a row.
When the input value is large enough to produce notable divergence
(the difference between the approximate and actual values is ~1% at
the chosen limits,) fall back to a slower, but more accurate
version.
For the SSE2 float conversions, this results in an increase of ~5%
in conversion time, when all values are below the limit. When most
values are above the limit, performance can be 10x slower or worse.
babl/base/pow-24.c | 28 ++++++++++++++++++++++++----
babl/base/pow-24.h | 28 ++++++++++++++++++++++++----
extensions/sse2-float.c | 30 ++++++++++++++++++++++++++++++
3 files changed, 78 insertions(+), 8 deletions(-)
---
diff --git a/babl/base/pow-24.c b/babl/base/pow-24.c
index dbc4071..5a4f0ad 100644
--- a/babl/base/pow-24.c
+++ b/babl/base/pow-24.c
@@ -48,8 +48,13 @@ init_newton (double x, double exponent, double c0, double c1, double c2)
double
babl_pow_24 (double x)
{
- double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+ double y;
int i;
+ if (x > 16.0) {
+ /* for large values, fall back to a slower but more accurate version */
+ return exp (log (x) * 2.4);
+ }
+ y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
for (i = 0; i < 3; i++)
y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
x *= y;
@@ -61,9 +66,14 @@ babl_pow_24 (double x)
double
babl_pow_1_24 (double x)
{
- double y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+ double y;
int i;
double z;
+ if (x > 1024.0) {
+ /* for large values, fall back to a slower but more accurate version */
+ return exp (log (x) * (1.0 / 2.4));
+ }
+ y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
x = sqrt (x);
/* newton's method for x^(-1/6) */
z = (1./6.) * x;
@@ -102,8 +112,13 @@ init_newtonf (float x, float exponent, float c0, float c1, float c2)
float
babl_pow_24f (float x)
{
- float y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
+ float y;
int i;
+ if (x > 16.0f) {
+ /* for large values, fall back to a slower but more accurate version */
+ return expf (logf (x) * 2.4f);
+ }
+ y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
for (i = 0; i < 3; i++)
y = (1.f+1.f/5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
x *= y;
@@ -115,9 +130,14 @@ babl_pow_24f (float x)
float
babl_pow_1_24f (float x)
{
- float y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
+ float y;
int i;
float z;
+ if (x > 1024.0f) {
+ /* for large values, fall back to a slower but more accurate version */
+ return expf (logf (x) * (1.0f / 2.4f));
+ }
+ y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
x = sqrtf (x);
/* newton's method for x^(-1/6) */
z = (1.f/6.f) * x;
diff --git a/babl/base/pow-24.h b/babl/base/pow-24.h
index a55c029..ed42cfe 100644
--- a/babl/base/pow-24.h
+++ b/babl/base/pow-24.h
@@ -54,8 +54,13 @@ init_newton (double x, double exponent, double c0, double c1, double c2)
static inline double
babl_pow_24 (double x)
{
- double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+ double y;
int i;
+ if (x > 16.0) {
+ /* for large values, fall back to a slower but more accurate version */
+ return exp (log (x) * 2.4);
+ }
+ y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
for (i = 0; i < 3; i++)
y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
x *= y;
@@ -67,9 +72,14 @@ babl_pow_24 (double x)
static inline double
babl_pow_1_24 (double x)
{
- double y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+ double y;
int i;
double z;
+ if (x > 1024.0) {
+ /* for large values, fall back to a slower but more accurate version */
+ return exp (log (x) * (1.0 / 2.4));
+ }
+ y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
x = sqrt (x);
/* newton's method for x^(-1/6) */
z = (1./6.) * x;
@@ -133,8 +143,13 @@ init_newtonf (float x, float exponent, float c0, float c1, float c2)
static inline float
babl_pow_24f (float x)
{
- float y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
+ float y;
int i;
+ if (x > 16.0f) {
+ /* for large values, fall back to a slower but more accurate version */
+ return expf (logf (x) * 2.4f);
+ }
+ y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
for (i = 0; i < 3; i++)
y = (1.f+1.f/5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
x *= y;
@@ -146,9 +161,14 @@ babl_pow_24f (float x)
static inline float
babl_pow_1_24f (float x)
{
- float y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
+ float y;
int i;
float z;
+ if (x > 1024.0f) {
+ /* for large values, fall back to a slower but more accurate version */
+ return expf (logf (x) * (1.0f / 2.4f));
+ }
+ y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
x = sqrtf (x);
/* newton's method for x^(-1/6) */
z = (1.f/6.f) * x;
diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c
index d26073c..223f85c 100644
--- a/extensions/sse2-float.c
+++ b/extensions/sse2-float.c
@@ -237,6 +237,22 @@ conv_rgbAF_linear_rgbaF_linear_spin (const Babl *conversion,const float *src, fl
#define FLT_ONE 0x3f800000 // ((union {float f; int i;}){1.0f}).i
#define FLT_MANTISSA (1<<23)
+static inline float
+sse_max_component (__v4sf x) {
+ __v4sf s;
+ __v4sf m;
+
+ /* m = [max (x[3], x[1]), max (x[2], x[0])] */
+ s = (__v4sf) _mm_shuffle_epi32 ((__m128i) x, _MM_SHUFFLE(0, 0, 3, 2));
+ m = _mm_max_ps (x, s);
+
+ /* m = [max (m[1], m[0])] = [max (max (x[3], x[1]), max (x[2], x[0]))] */
+ s = (__v4sf) _mm_shuffle_epi32 ((__m128i) m, _MM_SHUFFLE(0, 0, 0, 1));
+ m = _mm_max_ps (m, s);
+
+ return m[0];
+}
+
static inline __v4sf
sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
{
@@ -249,6 +265,13 @@ static inline __v4sf
sse_pow_1_24 (__v4sf x)
{
__v4sf y, z;
+ if (sse_max_component (x) > 1024.0f) {
+ /* for large values, fall back to a slower but more accurate version */
+ return _mm_set_ps (expf (logf (x[3]) * (1.0f / 2.4f)),
+ expf (logf (x[2]) * (1.0f / 2.4f)),
+ expf (logf (x[1]) * (1.0f / 2.4f)),
+ expf (logf (x[0]) * (1.0f / 2.4f)));
+ }
y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
x = _mm_sqrt_ps (x);
/* newton's method for x^(-1/6) */
@@ -262,6 +285,13 @@ static inline __v4sf
sse_pow_24 (__v4sf x)
{
__v4sf y, z;
+ if (sse_max_component (x) > 16.0f) {
+ /* for large values, fall back to a slower but more accurate version */
+ return _mm_set_ps (expf (logf (x[3]) * 2.4f),
+ expf (logf (x[2]) * 2.4f),
+ expf (logf (x[1]) * 2.4f),
+ expf (logf (x[0]) * 2.4f));
+ }
y = sse_init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
/* newton's method for x^(-1/5) */
z = splat4f (1.f/5.f) * x;
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]