[babl] babl: add sse2 acceleration for color transform
- From: Øyvind Kolås <ok src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [babl] babl: add sse2 acceleration for color transform
- Date: Fri, 1 Sep 2017 16:38:12 +0000 (UTC)
commit 3a777a36c73a55e2d61f80e5022a68672fc3fe71
Author: Øyvind Kolås <pippin gimp org>
Date: Fri Sep 1 18:34:13 2017 +0200
babl: add sse2 acceleration for color transform
Many of the code paths are now duplicated; the duplication should decrease as
the SSE2/SIMD versions evolve further and stop resembling the code they were
cloned from.
babl/Makefile.am | 3 +
babl/babl-fish-path.c | 351 ++++++++++++++++++++++++++++++++++++-------------
2 files changed, 264 insertions(+), 90 deletions(-)
---
diff --git a/babl/Makefile.am b/babl/Makefile.am
index 7324328..9fddfbf 100644
--- a/babl/Makefile.am
+++ b/babl/Makefile.am
@@ -88,6 +88,9 @@ AM_CPPFLAGS = \
-I$(top_srcdir)/babl/base
lib_LTLIBRARIES= libbabl-@BABL_API_VERSION@.la
+
+libbabl_@BABL_API_VERSION@_la_CFLAGS= $(SSE2_EXTRA_CFLAGS)
+
libbabl_@BABL_API_VERSION@_la_SOURCES= $(h_sources) $(c_sources)
libbabl_@BABL_API_VERSION@_la_LIBADD= \
base/libbase.la \
diff --git a/babl/babl-fish-path.c b/babl/babl-fish-path.c
index 202778a..7f69d5e 100644
--- a/babl/babl-fish-path.c
+++ b/babl/babl-fish-path.c
@@ -300,15 +300,6 @@ show_item (Babl *babl,
return 0;
}
-static inline int
-show_fmt (Babl *babl,
- void *user_data)
-{
- fprintf (stderr, "[[%s\n", babl_get_name (babl));
- return 0;
-}
-
-
static int
alias_conversion (Babl *babl,
void *user_data)
@@ -521,10 +512,10 @@ universal_nonlinear_rgb_linear_converter (const Babl *conversion,unsigned char *
for (i = 0; i < samples; i++)
{
- rgba_out[i*4]=to_linear_red(to_trc_red, rgba_in[0]);
- rgba_out[i*4+1]=to_linear_green(to_trc_green, rgba_in[1]);
- rgba_out[i*4+2]=to_linear_blue(to_trc_blue, rgba_in[2]);
- rgba_out[i*4+3]=rgba_in[3];
+ rgba_out[i*4] = to_linear_red (to_trc_red, rgba_in[0]);
+ rgba_out[i*4+1] = to_linear_green(to_trc_green, rgba_in[1]);
+ rgba_out[i*4+2] = to_linear_blue (to_trc_blue, rgba_in[2]);
+ rgba_out[i*4+3] = rgba_in[3];
rgba_in += 4;
}
@@ -545,29 +536,26 @@ universal_nonlinear_rgba_u8_converter (const Babl *conversion,unsigned char *src
uint8_t *rgba_in_u8 = (void*)src_char;
uint8_t *rgba_out_u8 = (void*)dst_char;
- float *rgb = alloca (sizeof(float) * 3 * samples);
+ float *rgb = aligned_alloc (16, sizeof(float) * 4 * samples);
for (i = 0; i < samples; i++)
{
- rgb[i*3+0]=in_trc_lut[rgba_in_u8[i*4+0]];
- rgb[i*3+1]=in_trc_lut[rgba_in_u8[i*4+1]];
- rgb[i*3+2]=in_trc_lut[rgba_in_u8[i*4+2]];
+ rgb[i*4+0]=in_trc_lut[rgba_in_u8[i*4+0]];
+ rgb[i*4+1]=in_trc_lut[rgba_in_u8[i*4+1]];
+ rgb[i*4+2]=in_trc_lut[rgba_in_u8[i*4+2]];
}
- babl_matrix_mul_vectorff_buf3 (matrixf, rgb, rgb, samples);
+ babl_matrix_mul_vectorff_buf4 (matrixf, rgb, rgb, samples);
{
const Babl *from_trc_red = (void*)destination_space->space.trc[0];
const Babl *from_trc_green = (void*)destination_space->space.trc[1];
const Babl *from_trc_blue = (void*)destination_space->space.trc[2];
- float (*from_linear_red) (const Babl *trc, float value) = from_trc_red->trc.fun_from_linear;
- float (*from_linear_green) (const Babl *trc, float value) = from_trc_green->trc.fun_from_linear;
- float (*from_linear_blue) (const Babl *trc, float value) = from_trc_blue->trc.fun_from_linear;
for (i = 0; i < samples; i++)
{
- rgba_out_u8[0] = from_linear_red (from_trc_red, rgb[i*3+0]) * 255.5f;
- rgba_out_u8[1] = from_linear_green (from_trc_green, rgb[i*3+1]) * 255.5f;
- rgba_out_u8[2] = from_linear_blue (from_trc_blue , rgb[i*3+2]) * 255.5f;
+ rgba_out_u8[0] = babl_trc_from_linear (from_trc_red, rgb[i*4+0]) * 255.5f;
+ rgba_out_u8[1] = babl_trc_from_linear (from_trc_green, rgb[i*4+1]) * 255.5f;
+ rgba_out_u8[2] = babl_trc_from_linear (from_trc_blue, rgb[i*4+2]) * 255.5f;
rgba_out_u8[3] = rgba_in_u8[3];
rgba_in_u8 += 4;
rgba_out_u8 += 4;
@@ -579,50 +567,216 @@ universal_nonlinear_rgba_u8_converter (const Babl *conversion,unsigned char *src
static inline long
-universal_nonlinear_rgba_u8_float_converter (const Babl *conversion,unsigned char *src_char, unsigned char
*dst_char, long samples)
+universal_rgba_converter (const Babl *conversion,unsigned char *src_char, unsigned char *dst_char, long
samples)
{
+ float *matrixf = conversion->conversion.data;
+ float *rgba_in = (void*)src_char;
+ float *rgba_out = (void*)dst_char;
+
+ babl_matrix_mul_vectorff_buf4 (matrixf, rgba_in, rgba_out, samples);
+
+ return samples;
+}
+
+static inline long
+universal_rgb_converter (const Babl *conversion,unsigned char *src_char, unsigned char *dst_char, long
samples)
+{
+ float *matrixf = conversion->conversion.data;
+ float *rgb_in = (void*)src_char;
+ float *rgb_out = (void*)dst_char;
+
+ babl_matrix_mul_vectorff_buf3 (matrixf, rgb_in, rgb_out, samples);
+
+ return samples;
+}
+
+#if defined(USE_SSE2)
+
+#define m(matr, j, i) matr[j*3+i]
+
+#include <emmintrin.h>
+
+static inline void babl_matrix_mul_vectorff_buf4_sse2 (const float *mat,
+ const float *v_in,
+ float *v_out,
+ int samples)
+{
+ const __v4sf m___0 = {m(mat, 0, 0), m(mat, 1, 0), m(mat, 2, 0), 0};
+ const __v4sf m___1 = {m(mat, 0, 1), m(mat, 1, 1), m(mat, 2, 1), 0};
+ const __v4sf m___2 = {m(mat, 0, 2), m(mat, 1, 2), m(mat, 2, 2), 0};
+ int i;
+ for (i = 0; i < samples; i ++)
+ {
+ const __v4sf a = _mm_load1_ps(&v_in[0]);
+ const __v4sf b = _mm_load1_ps(&v_in[1]);
+ const __v4sf c = _mm_load1_ps(&v_in[2]);
+ __v4sf out; // = m___0 * a + m___1 * b + m___2 * c;
+ out = _mm_mul_ps (m___0, a);
+ out = _mm_add_ps (out, _mm_mul_ps (m___1, b));
+ out = _mm_add_ps (out, _mm_mul_ps (m___2, c));
+ _mm_store_ps (v_out, out);
+ v_out[3] = v_in[3];
+ v_out += 4;
+ v_in += 4;
+ }
+ _mm_empty ();
+}
+
+#undef m
+
+static inline long
+universal_nonlinear_rgb_converter_sse2 (const Babl *conversion,unsigned char *src_char, unsigned char
*dst_char, long samples)
+{
+ const Babl *source_space = babl_conversion_get_source_space (conversion);
+ const Babl *destination_space = babl_conversion_get_destination_space (conversion);
+
+ void *to_trc_red;
+ void *to_trc_green;
+ void *to_trc_blue;
+ float (*to_linear_red) (void *trc, float value);
+ float (*to_linear_green) (void *trc, float value);
+ float (*to_linear_blue) (void *trc, float value);
+ void *from_trc_red;
+ void *from_trc_green;
+ void *from_trc_blue;
+ float (*from_linear_red) (void *trc, float value);
+ float (*from_linear_green) (void *trc, float value);
+ float (*from_linear_blue) (void *trc, float value);
+
float * matrixf = conversion->conversion.data;
- float * in_trc_lut = matrixf + 9;
int i;
- uint8_t *rgba_in_u8 = (void*)src_char;
- float *rgba_out = (void*)dst_char;
+ float *rgba_in = (void*)src_char;
+ float *rgba_out = (void*)dst_char;
+
+ to_linear_red = (void*)source_space->space.trc[0]->trc.fun_to_linear;
+ to_trc_red = (void*)source_space->space.trc[0];
+ from_linear_red = (void*)destination_space->space.trc[0]->trc.fun_from_linear;
+ from_trc_red = (void*)destination_space->space.trc[0];
+
+ to_linear_green = (void*)source_space->space.trc[1]->trc.fun_to_linear;
+ to_trc_green = (void*)source_space->space.trc[1];
+ from_linear_green= (void*)destination_space->space.trc[1]->trc.fun_from_linear;
+ from_trc_green = (void*)destination_space->space.trc[1];
+
+ to_linear_blue = (void*)source_space->space.trc[2]->trc.fun_to_linear;
+ to_trc_blue = (void*)source_space->space.trc[2];
+ from_linear_blue= (void*)destination_space->space.trc[2]->trc.fun_from_linear;
+ from_trc_blue = (void*)destination_space->space.trc[2];
for (i = 0; i < samples; i++)
{
- rgba_out[i*3+0]=in_trc_lut[rgba_in_u8[i*4+0]];
- rgba_out[i*3+1]=in_trc_lut[rgba_in_u8[i*4+1]];
- rgba_out[i*3+2]=in_trc_lut[rgba_in_u8[i*4+2]];
- rgba_out[i*3+2]=rgba_in_u8[i*4+3] / 255.0;
+ rgba_out[i*4] =to_linear_red(to_trc_red, rgba_in[i*4]);
+ rgba_out[i*4+1]=to_linear_green(to_trc_green, rgba_in[i*4+1]);
+ rgba_out[i*4+2]=to_linear_blue(to_trc_blue, rgba_in[i*4+1]);
+ rgba_out[i*4+3]=rgba_in[3];
+ }
+
+ babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
+
+ for (i = 0; i < samples; i++)
+ {
+ rgba_out[0] = from_linear_red(from_trc_red, rgba_out[0]);
+ rgba_out[1] = from_linear_green(from_trc_green, rgba_out[1]);
+ rgba_out[2] = from_linear_blue(from_trc_blue, rgba_out[2]);
+ rgba_in += 4;
+ rgba_out += 4;
}
- babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
return samples;
}
+
static inline long
-universal_rgba_converter (const Babl *conversion,unsigned char *src_char, unsigned char *dst_char, long
samples)
+universal_rgba_converter_sse2 (const Babl *conversion,unsigned char *src_char, unsigned char *dst_char, long
samples)
{
float *matrixf = conversion->conversion.data;
float *rgba_in = (void*)src_char;
float *rgba_out = (void*)dst_char;
- babl_matrix_mul_vectorff_buf4 (matrixf, rgba_in, rgba_out, samples);
+ babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_in, rgba_out, samples);
return samples;
}
+static inline long
+universal_nonlinear_rgba_u8_converter_sse2 (const Babl *conversion,unsigned char *src_char, unsigned char
*dst_char, long samples)
+{
+ const Babl *destination_space = conversion->conversion.destination->format.space;
+
+ float * matrixf = conversion->conversion.data;
+ float * in_trc_lut = matrixf + 9;
+ int i;
+ uint8_t *rgba_in_u8 = (void*)src_char;
+ uint8_t *rgba_out_u8 = (void*)dst_char;
+
+ float *rgb = aligned_alloc (16, sizeof(float) * 4 * samples);
+
+ for (i = 0; i < samples; i++)
+ {
+ rgb[i*4+0]=in_trc_lut[rgba_in_u8[i*4+0]];
+ rgb[i*4+1]=in_trc_lut[rgba_in_u8[i*4+1]];
+ rgb[i*4+2]=in_trc_lut[rgba_in_u8[i*4+2]];
+ }
+
+ babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgb, rgb, samples);
+
+ {
+ const Babl *from_trc_red = (void*)destination_space->space.trc[0];
+ const Babl *from_trc_green = (void*)destination_space->space.trc[1];
+ const Babl *from_trc_blue = (void*)destination_space->space.trc[2];
+ for (i = 0; i < samples; i++)
+ {
+ rgba_out_u8[0] = babl_trc_from_linear (from_trc_red, rgb[i*4+0]) * 255.5f;
+ rgba_out_u8[1] = babl_trc_from_linear (from_trc_green, rgb[i*4+1]) * 255.5f;
+ rgba_out_u8[2] = babl_trc_from_linear (from_trc_blue, rgb[i*4+2]) * 255.5f;
+ rgba_out_u8[3] = rgba_in_u8[3];
+ rgba_in_u8 += 4;
+ rgba_out_u8 += 4;
+ }
+ }
+
+ return samples;
+}
static inline long
-universal_rgb_converter (const Babl *conversion,unsigned char *src_char, unsigned char *dst_char, long
samples)
+universal_nonlinear_rgb_linear_converter_sse2 (const Babl *conversion,unsigned char *src_char, unsigned char
*dst_char, long samples)
{
- float *matrixf = conversion->conversion.data;
- float *rgb_in = (void*)src_char;
- float *rgb_out = (void*)dst_char;
+ const Babl *source_space = babl_conversion_get_source_space (conversion);
- babl_matrix_mul_vectorff_buf3 (matrixf, rgb_in, rgb_out, samples);
+ void *to_trc_red;
+ void *to_trc_green;
+ void *to_trc_blue;
+ float (*to_linear_red) (void *trc, float value);
+ float (*to_linear_green) (void *trc, float value);
+ float (*to_linear_blue) (void *trc, float value);
+
+ float * matrixf = conversion->conversion.data;
+ int i;
+ float *rgba_in = (void*)src_char;
+ float *rgba_out = (void*)dst_char;
+
+ to_linear_red = (void*)source_space->space.trc[0]->trc.fun_to_linear;
+ to_trc_red = (void*)source_space->space.trc[0];
+ to_linear_green = (void*)source_space->space.trc[1]->trc.fun_to_linear;
+ to_trc_green = (void*)source_space->space.trc[1];
+ to_linear_blue = (void*)source_space->space.trc[2]->trc.fun_to_linear;
+ to_trc_blue = (void*)source_space->space.trc[2];
+
+ for (i = 0; i < samples; i++)
+ {
+ rgba_out[i*4] = to_linear_red (to_trc_red, rgba_in[0]);
+ rgba_out[i*4+1] = to_linear_green(to_trc_green, rgba_in[1]);
+ rgba_out[i*4+2] = to_linear_blue (to_trc_blue, rgba_in[2]);
+ rgba_out[i*4+3] = rgba_in[3];
+ rgba_in += 4;
+ }
+
+ babl_matrix_mul_vectorff_buf4_sse2 (matrixf, rgba_out, rgba_out, samples);
return samples;
}
+#endif
+
static int
add_rgb_adapter (Babl *babl,
@@ -630,54 +784,73 @@ add_rgb_adapter (Babl *babl,
{
if (babl != space)
{
- prep_conversion(babl_conversion_new(babl_format_with_space("RGBA float", space),
- babl_format_with_space("RGBA float", babl),
- "linear", universal_rgba_converter,
- NULL));
- prep_conversion(babl_conversion_new(babl_format_with_space("RGBA float", babl),
- babl_format_with_space("RGBA float", space),
- "linear", universal_rgba_converter,
- NULL));
+
+#if defined(USE_SSE2)
+ if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) &&
+ (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
+ {
+ prep_conversion(babl_conversion_new(babl_format_with_space("RGBA float", space),
+ babl_format_with_space("RGBA float", babl),
+ "linear", universal_rgba_converter_sse2,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("RGBA float", babl),
+ babl_format_with_space("RGBA float", space),
+ "linear", universal_rgba_converter_sse2,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A float", space),
+ babl_format_with_space("R'G'B'A float", babl),
+ "linear", universal_nonlinear_rgb_converter_sse2,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A float", babl),
+ babl_format_with_space("R'G'B'A float", space),
+ "linear", universal_nonlinear_rgb_converter_sse2,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A u8", space),
+ babl_format_with_space("R'G'B'A u8", babl),
+ "linear", universal_nonlinear_rgba_u8_converter_sse2,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A u8", babl),
+ babl_format_with_space("R'G'B'A u8", space),
+ "linear", universal_nonlinear_rgba_u8_converter_sse2,
+ NULL));
+ }
+ else
+#endif
+ {
+ prep_conversion(babl_conversion_new(babl_format_with_space("RGBA float", space),
+ babl_format_with_space("RGBA float", babl),
+ "linear", universal_rgba_converter,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("RGBA float", babl),
+ babl_format_with_space("RGBA float", space),
+ "linear", universal_rgba_converter,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A float", space),
+ babl_format_with_space("R'G'B'A float", babl),
+ "linear", universal_nonlinear_rgb_converter,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A float", babl),
+ babl_format_with_space("R'G'B'A float", space),
+ "linear", universal_nonlinear_rgb_converter,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A u8", space),
+ babl_format_with_space("R'G'B'A u8", babl),
+ "linear", universal_nonlinear_rgba_u8_converter,
+ NULL));
+ prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A u8", babl),
+ babl_format_with_space("R'G'B'A u8", space),
+ "linear", universal_nonlinear_rgba_u8_converter,
+ NULL));
+ }
+
prep_conversion(babl_conversion_new(babl_format_with_space("RGB float", space),
- babl_format_with_space("RGB float", babl),
- "linear", universal_rgb_converter,
- NULL));
+ babl_format_with_space("RGB float", babl),
+ "linear", universal_rgb_converter,
+ NULL));
prep_conversion(babl_conversion_new(babl_format_with_space("RGB float", babl),
- babl_format_with_space("RGB float", space),
- "linear", universal_rgb_converter,
- NULL));
- prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A float", space),
- babl_format_with_space("R'G'B'A float", babl),
- "linear", universal_nonlinear_rgb_converter,
- NULL));
- prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A float", babl),
- babl_format_with_space("R'G'B'A float", space),
- "linear", universal_nonlinear_rgb_converter,
- NULL));
- prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A float", space),
- babl_format_with_space("RGBA float", babl),
- "linear", universal_nonlinear_rgb_linear_converter,
- NULL));
- prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A float", babl),
- babl_format_with_space("RGBA float", space),
- "linear", universal_nonlinear_rgb_linear_converter,
- NULL));
- prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A u8", space),
- babl_format_with_space("R'G'B'A u8", babl),
- "linear", universal_nonlinear_rgba_u8_converter,
- NULL));
- prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A u8", babl),
- babl_format_with_space("R'G'B'A u8", space),
- "linear", universal_nonlinear_rgba_u8_converter,
- NULL));
- prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A u8", space),
- babl_format_with_space("RGBA float", babl),
- "linear", universal_nonlinear_rgba_u8_float_converter,
- NULL));
- prep_conversion(babl_conversion_new(babl_format_with_space("R'G'B'A u8", babl),
- babl_format_with_space("RGBA float", space),
- "linear", universal_nonlinear_rgba_u8_float_converter,
- NULL));
+ babl_format_with_space("RGB float", space),
+ "linear", universal_rgb_converter,
+ NULL));
}
return 0;
}
@@ -736,11 +909,9 @@ babl_fish_path (const Babl *source,
add_universal_rgb (destination->format.space);
}
- if (!done)
+ if (!done && 0)
{
- if(0)babl_conversion_class_for_each (show_item, (void*)source->format.space);
- //babl_format_class_for_each (show_fmt, NULL);
- //babl_model_class_for_each (show_fmt, NULL);
+ babl_conversion_class_for_each (show_item, (void*)source->format.space);
}
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]