[babl] Revert "make trampoline for lut processing"



commit 22bd31c3666bfc9a07033a62ff7e94fae9b7af6d
Author: Øyvind Kolås <pippin gimp org>
Date:   Mon Jan 24 07:39:06 2022 +0100

    Revert "make trampoline for lut processing"
    
    This reverts commit b3e884edf3b5c58fb4c2cede1346bd8a9d9c4a1e.
    
    Benchmarking on x86_64 showed no effect and, surprisingly, on ARM
    the reverted change pessimises performance.

 babl/babl-fish-path.c          | 137 ++++++++++++++++++++++++++++++++++-------
 babl/babl-internal.h           |  13 ----
 babl/babl.c                    |  41 ++----------
 babl/base/babl-rgb-converter.c | 122 ------------------------------------
 4 files changed, 120 insertions(+), 193 deletions(-)
---
diff --git a/babl/babl-fish-path.c b/babl/babl-fish-path.c
index 7278ec2a1..f709c3fd1 100644
--- a/babl/babl-fish-path.c
+++ b/babl/babl-fish-path.c
@@ -76,7 +76,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi,
 
 
 static inline void
-_babl_process_conversion_path (BablList   *path,
+process_conversion_path (BablList   *path,
                          const void *source_buffer,
                          int         source_bpp,
                          void       *destination_buffer,
@@ -756,6 +756,118 @@ babl_gc_fishes (void)
   //  is responsibility of higher layers
 }
 
+static int babl_fish_lut_process_maybe (const Babl *babl,
+                                        const char *source,
+                                        const char *destination,
+                                        long        n,
+                                        void       *data)
+{
+     uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut;
+     ((Babl*)babl)->fish.pixels += n;
+
+
+     if (!lut && babl->fish.pixels > 256 * 128)
+     {
+#if 0
+       fprintf (stderr, "building LUT for %s to %s\n",
+                        babl_get_name (babl->conversion.source),
+                        babl_get_name (babl->conversion.destination));
+#endif
+       lut = malloc (256 * 256 * 256 * 4);
+       if (babl->fish_path.source_bpp == 8)
+       {
+          uint64_t *lut_in = malloc (256 * 256  * 256 * 8);
+          for (int o = 0; o < 256 * 256 * 256; o++)
+          {
+            uint64_t v = o;
+            uint64_t v0 =       v & 0xff;
+            uint64_t v1 =   (v & 0xff00) >> 8;
+            uint64_t v2 = (v & 0xff0000) >> 16;
+
+#if 1
+            // gives same results... but purer white is better?
+            v0 = (v0 <<  8) | (((v0&1)?0xff:0)<<0);
+            v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16);
+            v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32);
+#else
+            v0 = (v0 <<  8);
+            v1 = (v1 << 24);
+            v2 = (v2 << 40);
+#endif
+            lut_in[o] = v;
+          }
+
+          process_conversion_path (babl->fish_path.conversion_list,
+                                   lut_in,
+                                   babl->fish_path.source_bpp,
+                                   lut,
+                                   babl->fish_path.dest_bpp,
+                                   256*256*256);
+          free (lut_in);
+       }
+       else
+       {
+       for (int o = 0; o < 256 * 256 * 256; o++)
+         lut[o] = o;
+       process_conversion_path (babl->fish_path.conversion_list,
+                                lut,
+                                babl->fish_path.source_bpp,
+                                lut,
+                                babl->fish_path.dest_bpp,
+                                256*256*256);
+       }
+       // XXX : there is still a micro race, if lost we should only
+       // leak a LUT not produce wrong results.
+       if (babl->fish_path.u8_lut == NULL)
+       {
+         (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut;
+
+       }
+       else
+       {
+         free (lut);
+         lut = (uint32_t*)babl->fish_path.u8_lut;
+       }
+     }
+     if (lut)
+     {
+        if (babl->fish_path.source_bpp == 8) // 16 bit, not working yet
+        {                                    // half and u16 need their
+                                             // own separate handling
+          uint32_t *src = (uint32_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          lut = (uint32_t*)babl->fish_path.u8_lut;
+          while (n--)
+          {
+             uint32_t col_a = *src++;
+             uint32_t col_b = *src++;
+             uint32_t col;
+
+             uint32_t c_ar = ((col_a & 0xff000000)|
+                             ((col_a & 0x0000ff00) << 8));
+             uint32_t c_gb = ((col_b & 0xff000000)|
+                             ((col_b & 0x0000ff00) << 8))>>16;
+             col = c_ar|c_gb;
+
+             *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
+          }
+        }
+        else
+        {
+          uint32_t *src = (uint32_t*)source;
+          uint32_t *dst = (uint32_t*)destination;
+          lut = (uint32_t*)babl->fish_path.u8_lut;
+          while (n--)
+          {
+             uint32_t col = *src++;
+             *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
+          }
+        }
+        BABL(babl)->fish_path.last_lut_use = babl_ticks ();
+        return 1;
+     }
+     return 0;
+}
 
 static void
 babl_fish_path_process (const Babl *babl,
@@ -783,7 +895,7 @@ babl_fish_path_process (const Babl *babl,
       conv_counter = 0;
     }
   }
-  _babl_process_conversion_path (babl->fish_path.conversion_list,
+  process_conversion_path (babl->fish_path.conversion_list,
                            source,
                            babl->fish_path.source_bpp,
                            destination,
@@ -925,7 +1037,7 @@ static void inline *align_16 (unsigned char *ret)
 }
 
 static inline void
-_babl_process_conversion_path (BablList   *path,
+process_conversion_path (BablList   *path,
                          const void *source_buffer,
                          int         source_bpp,
                          void       *destination_buffer,
@@ -997,23 +1109,6 @@ _babl_process_conversion_path (BablList   *path,
   }
 }
 
-void
-babl_process_conversion_path (BablList   *path,
-                         const void *source_buffer,
-                         int         source_bpp,
-                         void       *destination_buffer,
-                         int         dest_bpp,
-                         long        n)
-{
-  _babl_process_conversion_path (path,
-                          source_buffer,
-                          source_bpp,
-                          destination_buffer,
-                          dest_bpp,
-                          n);
-}
-
-
 static void
 init_path_instrumentation (FishPathInstrumentation *fpi,
                            Babl                    *fmt_source,
@@ -1149,7 +1244,7 @@ get_path_instrumentation (FishPathInstrumentation *fpi,
   /* calculate this path's view of what the result should be */
   ticks_start = babl_ticks ();
   for (int i = 0; i < BABL_TEST_ITER; i ++)
-  _babl_process_conversion_path (path, fpi->source, source_bpp, fpi->destination,
+  process_conversion_path (path, fpi->source, source_bpp, fpi->destination,
                            dest_bpp, fpi->num_test_pixels);
   ticks_end = babl_ticks ();
   *path_cost = (ticks_end - ticks_start);
diff --git a/babl/babl-internal.h b/babl/babl-internal.h
index 4377ec379..ec6008b6d 100644
--- a/babl/babl-internal.h
+++ b/babl/babl-internal.h
@@ -373,12 +373,6 @@ extern const Babl *
 extern const Babl *
 (*babl_trc_lookup_by_name) (const char *name);
 
-extern int (*babl_fish_lut_process_maybe) (const Babl *babl,
-                                        const char *source,
-                                        const char *destination,
-                                        long        n,
-                                        void       *data);
-
 void babl_space_to_xyz   (const Babl *space, const double *rgb, double *xyz);
 void babl_space_from_xyz (const Babl *space, const double *xyz, double *rgb);
 
@@ -479,12 +473,5 @@ _babl_space_for_lcms (const char *icc_data, int icc_length); // XXX pass profile
 
 void
 babl_trc_class_init (void);
-void
-babl_process_conversion_path (BablList   *path,
-                         const void *source_buffer,
-                         int         source_bpp,
-                         void       *destination_buffer,
-                         int         dest_bpp,
-                         long        n);
 
 #endif
diff --git a/babl/babl.c b/babl/babl.c
index 7bfe60f6a..515fa09b0 100644
--- a/babl/babl.c
+++ b/babl/babl.c
@@ -200,19 +200,6 @@ void (*babl_base_init)  (void) = babl_base_init_generic;
 
 const Babl * babl_trc_lookup_by_name_generic (const char *name);
 
-int babl_fish_lut_process_maybe_generic (const Babl *babl,
-                                        const char *source,
-                                        const char *destination,
-                                        long        n,
-                                        void       *data);
-
-int (*babl_fish_lut_process_maybe) (const Babl *babl,
-                                    const char *source,
-                                    const char *destination,
-                                    long        n,
-                                    void       *data) =
-         babl_fish_lut_process_maybe_generic;
-
 
 const Babl *
 babl_trc_new_generic (const char *name,
@@ -235,25 +222,15 @@ const Babl *
               float      *lut) = babl_trc_new_generic;
 
 #ifdef ARCH_X86_64
-
-int babl_fish_lut_process_maybe_x86_64_v2 (const Babl *babl,
-                                           const char *source,
-                                           const char *destination,
-                                           long        n,
-                                           void       *data);
-int babl_fish_lut_process_maybe_x86_64_v3 (const Babl *babl,
-                                           const char *source,
-                                           const char *destination,
-                                           long        n,
-                                           void       *data);
-
 void babl_base_init_x86_64_v2 (void);
 void babl_base_init_x86_64_v3 (void);
 void _babl_space_add_universal_rgb_x86_64_v2 (const Babl *space);
 void _babl_space_add_universal_rgb_x86_64_v3 (const Babl *space);
 
-const Babl * babl_trc_lookup_by_name_x86_64_v2 (const char *name);
-const Babl * babl_trc_lookup_by_name_x86_64_v3 (const char *name);
+const Babl *
+babl_trc_lookup_by_name_x86_64_v2 (const char *name);
+const Babl *
+babl_trc_lookup_by_name_x86_64_v3 (const char *name);
 
 const Babl *
 babl_trc_new_x86_64_v2 (const char *name,
@@ -270,13 +247,6 @@ babl_trc_new_x86_64_v3 (const char *name,
 
 #endif
 #ifdef ARCH_ARM
-
-int babl_fish_lut_process_maybe_arm_neon (const Babl *babl,
-                                          const char *source,
-                                          const char *destination,
-                                          long        n,
-                                          void       *data);
-
 void babl_base_init_arm_neon (void);
 void _babl_space_add_universal_rgb_arm_neon (const Babl *space);
 
@@ -298,7 +268,6 @@ static void simd_init (void)
   BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
   if ((accel & BABL_CPU_ACCEL_X86_64_V3) == BABL_CPU_ACCEL_X86_64_V3)
   {
-    babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v3;
     babl_base_init = babl_base_init_x86_64_v2; /// !!
                                                // this is correct,
                                                // it performs better
@@ -309,7 +278,6 @@ static void simd_init (void)
   }
   else if ((accel & BABL_CPU_ACCEL_X86_64_V2) == BABL_CPU_ACCEL_X86_64_V2)
   {
-    babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_x86_64_v2;
     babl_base_init = babl_base_init_x86_64_v2;
     babl_trc_new = babl_trc_new_x86_64_v2;
     babl_trc_lookup_by_name = babl_trc_lookup_by_name_x86_64_v2;
@@ -320,7 +288,6 @@ static void simd_init (void)
   BablCpuAccelFlags accel = babl_cpu_accel_get_support ();
   if ((accel & BABL_CPU_ACCEL_ARM_NEON) == BABL_CPU_ACCEL_ARM_NEON)
   {
-    babl_fish_lut_process_maybe = babl_fish_lut_process_maybe_arm_neon;
     babl_base_init = babl_base_init_arm_neon;
     babl_trc_new = babl_trc_new_arm_neon;
     babl_trc_lookup_by_name = babl_trc_lookup_by_name_arm_neon;
diff --git a/babl/base/babl-rgb-converter.c b/babl/base/babl-rgb-converter.c
index 5c3d2ca08..3f4da04d3 100644
--- a/babl/base/babl-rgb-converter.c
+++ b/babl/base/babl-rgb-converter.c
@@ -533,125 +533,3 @@ BABL_SIMD_SUFFIX(_babl_space_add_universal_rgb) (const Babl *space)
 {
   babl_space_class_for_each (add_rgb_adapter, (void*)space);
 }
-
-void
-babl_process_conversion_path (BablList   *path,
-                         const void *source_buffer,
-                         int         source_bpp,
-                         void       *destination_buffer,
-                         int         dest_bpp,
-                         long        n);
-
-int BABL_SIMD_SUFFIX(babl_fish_lut_process_maybe) (const Babl *babl,
-                                         const char *source,
-                                         const char *destination,
-                                         long        n,
-                                         void       *data)
-{
-     uint32_t *lut = (uint32_t*)babl->fish_path.u8_lut;
-     ((Babl*)babl)->fish.pixels += n;
-
-
-     if (!lut && babl->fish.pixels > 256 * 128)
-     {
-#if 0
-       fprintf (stderr, "building LUT for %s to %s\n",
-                        babl_get_name (babl->conversion.source),
-                        babl_get_name (babl->conversion.destination));
-#endif
-       lut = malloc (256 * 256 * 256 * 4);
-       if (babl->fish_path.source_bpp == 8)
-       {
-          uint64_t *lut_in = malloc (256 * 256  * 256 * 8);
-          for (int o = 0; o < 256 * 256 * 256; o++)
-          {
-            uint64_t v = o;
-            uint64_t v0 =       v & 0xff;
-            uint64_t v1 =   (v & 0xff00) >> 8;
-            uint64_t v2 = (v & 0xff0000) >> 16;
-
-#if 1
-            // gives same results... but purer white is better?
-            v0 = (v0 <<  8) | (((v0&1)?0xff:0)<<0);
-            v1 = (v1 << 24) | (((v1&1)?(uint64_t)0xff:0)<<16);
-            v2 = (v2 << 40) | (((v2&1)?(uint64_t)0xff:0)<<32);
-#else
-            v0 = (v0 <<  8);
-            v1 = (v1 << 24);
-            v2 = (v2 << 40);
-#endif
-            lut_in[o] = v;
-          }
-
-          babl_process_conversion_path (babl->fish_path.conversion_list,
-                                   lut_in,
-                                   babl->fish_path.source_bpp,
-                                   lut,
-                                   babl->fish_path.dest_bpp,
-                                   256*256*256);
-          free (lut_in);
-       }
-       else
-       {
-       for (int o = 0; o < 256 * 256 * 256; o++)
-         lut[o] = o;
-       babl_process_conversion_path (babl->fish_path.conversion_list,
-                                lut,
-                                babl->fish_path.source_bpp,
-                                lut,
-                                babl->fish_path.dest_bpp,
-                                256*256*256);
-       }
-       // XXX : there is still a micro race, if lost we should only
-       // leak a LUT not produce wrong results.
-       if (babl->fish_path.u8_lut == NULL)
-       {
-         (((Babl*)babl)->fish_path.u8_lut) = (uint8_t*)lut;
-
-       }
-       else
-       {
-         free (lut);
-         lut = (uint32_t*)babl->fish_path.u8_lut;
-       }
-     }
-     if (lut)
-     {
-        if (babl->fish_path.source_bpp == 8) // 16 bit, not working yet
-        {                                    // half and u16 need their
-                                             // own separate handling
-          uint32_t *src = (uint32_t*)source;
-          uint32_t *dst = (uint32_t*)destination;
-          lut = (uint32_t*)babl->fish_path.u8_lut;
-          while (n--)
-          {
-             uint32_t col_a = *src++;
-             uint32_t col_b = *src++;
-             uint32_t col;
-
-             uint32_t c_ar = ((col_a & 0xff000000)|
-                             ((col_a & 0x0000ff00) << 8));
-             uint32_t c_gb = ((col_b & 0xff000000)|
-                             ((col_b & 0x0000ff00) << 8))>>16;
-             col = c_ar|c_gb;
-
-             *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
-          }
-        }
-        else
-        {
-          uint32_t *src = (uint32_t*)source;
-          uint32_t *dst = (uint32_t*)destination;
-          lut = (uint32_t*)babl->fish_path.u8_lut;
-          while (n--)
-          {
-             uint32_t col = *src++;
-             *dst++ = lut[col & 0xffffff] | (col & 0xff000000);
-          }
-        }
-        BABL(babl)->fish_path.last_lut_use = babl_ticks ();
-        return 1;
-     }
-     return 0;
-}
-


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]