[babl] make trampoline for lut processing



commit b3e884edf3b5c58fb4c2cede1346bd8a9d9c4a1e
Author: Øyvind Kolås <pippin gimp org>
Date:   Mon Jan 24 07:05:12 2022 +0100

    make trampoline for lut processing

 babl/babl-fish-path.c          | 137 +++++++----------------------------------
 babl/babl-internal.h           |  13 ++++
 babl/babl.c                    |  41 ++++++++++--
 babl/base/babl-rgb-converter.c | 122 ++++++++++++++++++++++++++++++++++++
 4 files changed, 193 insertions(+), 120 deletions(-)
---
diff --git a/babl/babl-fish-path.c b/babl/babl-fish-path.c
index f709c3fd1..7278ec2a1 100644
--- a/babl/babl-fish-path.c
+++ b/babl/babl-fish-path.c
@@ -76,7 +76,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi,
 
 
 static inline void
-process_conversion_path (BablList   *path,
+_babl_process_conversion_path (BablList   *path,
                          const void *source_buffer,
                          int         source_bpp,
                          void       *destination_buffer,
@@ -756,118 +756,6 @@ babl_gc_fishes (void)
   //  is responsibility of higher layers
 }
 
-static int babl_fish_lut_process_maybe (const Babl *babl,
-                                        const char *source,
-                                        const char *destination,
-                                        long        n,
-                                        void       *data)
-{
-     uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut;
-     ((Babl*)babl)->fish.pixels += n;
-
-
-     if (!lut && babl->fish.pixels > 256 * 128)
-     {
-#if 0
-       fprintf (stderr, "building LUT for %s to %s\n",
-                        babl_get_name (babl->conversion.source),
-                        babl_get_name (babl->conversion.destination));
-#endif
-       lut = malloc (256 * 256 * 256 * 4);
-       if (babl->fish_path.source_bpp == 8)
-       {
-          uint64_t *lut_in = malloc (256 * 256  * 256 * 8);
-          for (int o = 0; o < 256 * 256 * 256; o++)
-          {
-            uint64_t v = o;
-            uint64_t v0 =       v & 0xff;
-            uint64_t v1 =   (v & 0xff00) >> 8;
-            uint64_t v2 = (v & 0xff0000) >> 16;
-
-#if 1
-            // gives same results... but purer white is better?
-            v0 = (v0 <<  8) | (((v0&1)?0xff:0)<<0);
-            v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16);
-            v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32);
-#else
-            v0 = (v0 <<  8);
-            v1 = (v1 << 24);
-            v2 = (v2 << 40);
-#endif
-            lut_in[o] = v;
-          }
-
-          process_conversion_path (babl->fish_path.conversion_list,
-                                   lut_in,
-                                   babl->fish_path.source_bpp,
-                                   lut,
-                                   babl->fish_path.dest_bpp,
-                                   256*256*256);
-          free (lut_in);
-       }
-       else
-       {
-       for (int o = 0; o < 256 * 256 * 256; o++)
-         lut[o] = o;
-       process_conversion_path (babl->fish_path.conversion_list,
-                                lut,
-                                babl->fish_path.source_bpp,
-                                lut,
-                                babl->fish_path.dest_bpp,
-                                256*256*256);
-       }
-       // XXX : there is still a micro race, if lost we should only
-       // leak a LUT not produce wrong results.
-       if (babl->fish_path.u8_lut == NULL)
-       {
-         (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut;
-
-       }
-       else
-       {
-         free (lut);
-         lut = (uint32_t*)babl->fish_path.u8_lut;
-       }
-     }
-     if (lut)
-     {
-        if (babl->fish_path.source_bpp == 8) // 16 bit, not working yet
-        {                                    // half and u16 need their
-                                             // own separate handling
-          uint32_t *src = (uint32_t*)source;
-          uint32_t *dst = (uint32_t*)destination;
-          lut = (uint32_t*)babl->fish_path.u8_lut;
-          while (n--)
-          {
-             uint32_t col_a = *src++;
-             uint32_t col_b = *src++;
-             uint32_t col;
-
-             uint32_t c_ar = ((col_a & 0xff000000)|
-                             ((col_a & 0x0000ff00) << 8));
-             uint32_t c_gb = ((col_b & 0xff000000)|
-                             ((col_b & 0x0000ff00) << 8))>>16;
-             col = c_ar|c_gb;
-
-             *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
-          }
-        }
-        else
-        {
-          uint32_t *src = (uint32_t*)source;
-          uint32_t *dst = (uint32_t*)destination;
-          lut = (uint32_t*)babl->fish_path.u8_lut;
-          while (n--)
-          {
-             uint32_t col = *src++;
-             *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
-          }
-        }
-        BABL(babl)->fish_path.last_lut_use = babl_ticks ();
-        return 1;
-     }
-     return 0;
-}
 
 static void
 babl_fish_path_process (const Babl *babl,
@@ -895,7 +783,7 @@ babl_fish_path_process (const Babl *babl,
       conv_counter = 0;
     }
   }
-  process_conversion_path (babl->fish_path.conversion_list,
+  _babl_process_conversion_path (babl->fish_path.conversion_list,
                            source,
                            babl->fish_path.source_bpp,
                            destination,
@@ -1037,7 +925,7 @@ static void inline *align_16 (unsigned char *ret)
 }
 
 static inline void
-process_conversion_path (BablList   *path,
+_babl_process_conversion_path (BablList   *path,
                          const void *source_buffer,
                          int         source_bpp,
                          void       *destination_buffer,
@@ -1109,6 +997,23 @@ process_conversion_path (BablList   *path,
   }
 }
 
+void
+babl_process_conversion_path (BablList   *path,
+                         const void *source_buffer,
+                         int         source_bpp,
+                         void       *destination_buffer,
+                         int         dest_bpp,
+                         long        n)
+{ /* non-static entry point: forwards every argument unchanged to the static inline _babl_process_conversion_path () */
+  _babl_process_conversion_path (path,
+                          source_buffer,
+                          source_bpp,
+                          destination_buffer,
+                          dest_bpp,
+                          n);
+}
+
+
 static void
 init_path_instrumentation (FishPathInstrumentation *fpi,
                            Babl                    *fmt_source,
@@ -1244,7 +1149,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi,
   /* calculate this path's view of what the result should be */
   ticks_start = babl_ticks ();
   for (int i = 0; i < BABL_TEST_ITER; i ++)
-  process_conversion_path (path, fpi->source, source_bpp, fpi->destination,
+  _babl_process_conversion_path (path, fpi->source, source_bpp, fpi->destination,
                            dest_bpp, fpi->num_test_pixels);
   ticks_end = babl_ticks ();
   *path_cost = (ticks_end - ticks_start);
diff --git a/babl/babl-internal.h b/babl/babl-internal.h
index ec6008b6d..4377ec379 100644
--- a/babl/babl-internal.h
+++ b/babl/babl-internal.h
@@ -373,6 +373,12 @@ extern const Babl *
 extern const Babl *
 (*babl_trc_lookup_by_name) (const char *name);
 
+extern int (*babl_fish_lut_process_maybe) (const Babl *babl,
+                                        const char *source,
+                                        const char *destination,
+                                        long        n,
+                                        void       *data);
+
 void babl_space_to_xyz   (const Babl *space, const double *rgb, double *xyz);
 void babl_space_from_xyz (const Babl *space, const double *xyz, double *rgb);
 
@@ -473,5 +479,12 @@ _babl_space_for_lcms (const char *icc_data, int icc_length); // XXX pass profile
 
 void
 babl_trc_class_init (void);
+void
+babl_process_conversion_path (BablList   *path,
+                         const void *source_buffer,
+                         int         source_bpp,
+                         void       *destination_buffer,
+                         int         dest_bpp,
+                         long        n);
 
 #endif
diff --git a/babl/babl.c b/babl/babl.c
index 515fa09b0..7bfe60f6a 100644
--- a/babl/babl.c
+++ b/babl/babl.c
@@ -200,6 +200,19 @@ void (*babl_base_init)  (void) = babl_base_init_generic;
 
 const Babl * babl_trc_lookup_by_name_generic (const char *name);
 
+int babl_fish_lut_process_maybe_generic (const Babl *babl,
+                                        const char *source,
+                                        const char *destination,
+                                        long        n,
+                                        void       *data);
+/* dispatch pointer: starts at the generic build; simd_init () may repoint it to an arch-specific one */
+int (*babl_fish_lut_process_maybe) (const Babl *babl,
+                                    const char *source,
+                                    const char *destination,
+                                    long        n,
+                                    void       *data) =
+         babl_fish_lut_process_maybe_generic;
+
 
 const Babl *
 babl_trc_new_generic (const char *name,
@@ -222,15 +235,25 @@ const Babl *
               float      *lut) = babl_trc_new_generic;
 
 #ifdef ARCH_X86_64
+
+int babl_fish_lut_process_maybe_x86_64_v2 (const Babl *babl,
+                                           const char *source,
+                                           const char *destination,
+                                           long        n,
+                                           void       *data);
+int babl_fish_lut_process_maybe_x86_64_v3 (const Babl *babl,
+                                           const char *source,
+                                           const char *destination,
+                                           long        n,
+                                           void       *data);
+
 void babl_base_init_x86_64_v2 (void);
 void babl_base_init_x86_64_v3 (void);
 void _babl_space_add_universal_rgb_x86_64_v2 (const Babl *space);
 void _babl_space_add_universal_rgb_x86_64_v3 (const Babl *space);
 
-const Babl *
-babl_trc_lookup_by_name_x86_64_v2 (const char *name);
-const Babl *
-babl_trc_lookup_by_name_x86_64_v3 (const char *name);
+const Babl * babl_trc_lookup_by_name_x86_64_v2 (const char *name);
+const Babl * babl_trc_lookup_by_name_x86_64_v3 (const char *name);
 
 const Babl *
 babl_trc_new_x86_64_v2 (const char *name,
@@ -247,6 +270,13 @@ babl_trc_new_x86_64_v3 (const char *name,
 
 #endif
 #ifdef ARCH_ARM
+
+int babl_fish_lut_process_maybe_arm_neon (const Babl *babl,
+                                          const char *source,
+                                          const char *destination,
+                                          long        n,
+                                          void       *data);
+
 void babl_base_init_arm_neon (void);
 void _babl_space_add_universal_rgb_arm_neon (const Babl *space);
 
@@ -268,6 +298,7 @@ static void simd_init (void)
   BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
   if ((accel & BABL_CPU_ACCEL_X86_64_V3) == BABL_CPU_ACCEL_X86_64_V3)
   {
+    babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v3;
     babl_base_init = babl_base_init_x86_64_v2; /// !!
                                                // this is correct,
                                                // it performs better
@@ -278,6 +309,7 @@ static void simd_init (void)
   }
   else if ((accel & BABL_CPU_ACCEL_X86_64_V2) == BABL_CPU_ACCEL_X86_64_V2)
   {
+    babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v2;
     babl_base_init = babl_base_init_x86_64_v2;
     babl_trc_new = babl_trc_new_x86_64_v2;
     babl_trc_lookup_by_name = babl_trc_lookup_by_name_x86_64_v2;
@@ -288,6 +320,7 @@ static void simd_init (void)
   BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
   if ((accel & BABL_CPU_ACCEL_ARM_NEON) == BABL_CPU_ACCEL_ARM_NEON)
   {
+    babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_arm_neon;
     babl_base_init = babl_base_init_arm_neon;
     babl_trc_new = babl_trc_new_arm_neon;
     babl_trc_lookup_by_name = babl_trc_lookup_by_name_arm_neon;
diff --git a/babl/base/babl-rgb-converter.c b/babl/base/babl-rgb-converter.c
index 3f4da04d3..5c3d2ca08 100644
--- a/babl/base/babl-rgb-converter.c
+++ b/babl/base/babl-rgb-converter.c
@@ -533,3 +533,125 @@ BABL_SIMD_SUFFIX(_babl_space_add_universal_rgb) (const Babl *space)
 {
   babl_space_class_for_each (add_rgb_adapter, (void*)space);
 }
+
+void
+babl_process_conversion_path (BablList   *path,
+                         const void *source_buffer,
+                         int         source_bpp,
+                         void       *destination_buffer,
+                         int         dest_bpp,
+                         long        n);
+
+int BABL_SIMD_SUFFIX(babl_fish_lut_process_maybe) (const Babl *babl,
+                                         const char *source,
+                                         const char *destination,
+                                         long        n,
+                                         void       *data)
+{
+     uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut;
+     ((Babl*)babl)->fish.pixels += n; // running total across calls; gates LUT creation below
+     // Returns 1 when the n pixels were converted through the cached LUT, 0 when
+     // no LUT exists yet (destination is left untouched). `data` is unused here.
+     if (!lut && babl->fish.pixels > 256 * 128) // no LUT yet and more than 32768 pixels seen
+     {
+#if 0
+       fprintf (stderr, "building LUT for %s to %s\n",
+                        babl_get_name (babl->conversion.source),
+                        babl_get_name (babl->conversion.destination));
+#endif
+       lut = malloc (256 * 256 * 256 * 4); // one uint32_t per 24-bit index (64 MiB); NOTE(review): result is not checked
+       if (babl->fish_path.source_bpp == 8)
+       {
+          uint64_t *lut_in = malloc (256 * 256  * 256 * 8); // temporary 8-byte-per-entry input (128 MiB); NOTE(review): not checked
+          for (int o = 0; o < 256 * 256 * 256; o++)
+          {
+            uint64_t v = o;
+            uint64_t v0 =       v & 0xff;
+            uint64_t v1 =   (v & 0xff00) >> 8;
+            uint64_t v2 = (v & 0xff0000) >> 16;
+
+#if 1
+            // gives same results... but purer white is better?
+            v0 = (v0 <<  8) | (((v0&1)?0xff:0)<<0);
+            v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16);
+            v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32);
+#else
+            v0 = (v0 <<  8);
+            v1 = (v1 << 24);
+            v2 = (v2 << 40);
+#endif
+            lut_in[o] = v; // NOTE(review): v0/v1/v2 computed above are never used; only the raw index is stored
+          }
+
+          babl_process_conversion_path (babl->fish_path.conversion_list,
+                                   lut_in,
+                                   babl->fish_path.source_bpp,
+                                   lut,
+                                   babl->fish_path.dest_bpp,
+                                   256*256*256);
+          free (lut_in);
+       }
+       else
+       {
+       for (int o = 0; o < 256 * 256 * 256; o++)
+         lut[o] = o; // seed with every 24-bit value; the call below converts the table in place
+       babl_process_conversion_path (babl->fish_path.conversion_list,
+                                lut,
+                                babl->fish_path.source_bpp,
+                                lut,
+                                babl->fish_path.dest_bpp,
+                                256*256*256);
+       }
+       // XXX: there is still a small race here; if it is lost we should only
+       // leak a LUT, not produce wrong results.
+       if (babl->fish_path.u8_lut == NULL)
+       {
+         (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut; // publish the freshly built table
+
+       }
+       else
+       {
+         free (lut); // a table was stored in the meantime: drop ours and use that one
+         lut = (uint32_t*)babl->fish_path.u8_lut;
+       }
+     }
+     if (lut)
+     {
+        if (babl->fish_path.source_bpp == 8) // 16 bit, not working yet
+        {                                    // half and u16 need their
+                                             // own separate handling
+          uint32_t *src = (uint32_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          lut = (uint32_t*)babl->fish_path.u8_lut;
+          while (n--)
+          {
+             uint32_t col_a = *src++; // each source pixel is read as two 32-bit words
+             uint32_t col_b = *src++;
+             uint32_t col;
+
+             uint32_t c_ar = ((col_a & 0xff000000)|
+                             ((col_a & 0x0000ff00) << 8)); // high byte of each 16-bit half, packed into bits 16-31
+             uint32_t c_gb = ((col_b & 0xff000000)|
+                             ((col_b & 0x0000ff00) << 8))>>16; // same for the second word, moved down to bits 0-15
+             col = c_ar|c_gb;
+
+             *dst++ = lut[col & 0xffffff] | (col & 0xff000000); // low 24 bits index the LUT; top byte of col is OR-ed in
+          }
+        }
+        else
+        {
+          uint32_t *src = (uint32_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          lut = (uint32_t*)babl->fish_path.u8_lut;
+          while (n--)
+          {
+             uint32_t col = *src++; // one 32-bit word per pixel
+             *dst++ = lut[col & 0xffffff] | (col & 0xff000000); // low 24 bits index the LUT; top byte of col is OR-ed in
+          }
+        }
+        BABL(babl)->fish_path.last_lut_use = babl_ticks (); // record when the LUT was last used
+        return 1;
+     }
+     return 0;
+}
+


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]