[gegl] utils: improve performance of gegl_memset_pattern()



commit d195c300f164921d74f24ee06eb99d8d22e8ba2f
Author: Ell <ell_se yahoo com>
Date:   Mon Feb 6 08:00:32 2017 -0500

    utils: improve performance of gegl_memset_pattern()
    
    Copy the pattern using exponentially increasing block sizes, using
    the already-copied data as the source, yielding a logarithmic number
    of memcpy()s, instead of linear.  The block size is not increased
    indefinitely, so as not to saturate the cache; for large copies the
    number of memcpy()s is therefore still linear, but with a block size
    big enough to make that irrelevant.
    
    I actually had a benchmark for this, but it died with the old hdd :P
    IIRC, in some cases we care about, this roughly doubles performance.
    In other cases, it's far less significant, but it never (in realistic
    terms) seems to be a pessimization.

 gegl/gegl-utils.c |   81 ++++++++++++++++++++++------------------------------
 1 files changed, 34 insertions(+), 47 deletions(-)
---
diff --git a/gegl/gegl-utils.c b/gegl/gegl-utils.c
index ffe0650..4c3e491 100644
--- a/gegl/gegl-utils.c
+++ b/gegl/gegl-utils.c
@@ -276,64 +276,51 @@ gegl_free (gpointer buf)
   g_free (*((gpointer*)buf -1));
 }
 
-#define MAKE_COPY_CASE(typesize)\
-case typesize: \
-  while (count--) \
-    { \
-      memcpy (dst, src, typesize); \
-      dst += typesize; \
-    } \
-  return;
-
 void
 gegl_memset_pattern (void * restrict       dst_ptr,
                      const void * restrict src_ptr,
                      gint                  pattern_size,
                      gint                  count)
 {
-  guchar *dst = dst_ptr;
+  guchar       *dst = dst_ptr;
   const guchar *src = src_ptr;
 
-  switch (pattern_size)
-  {
-    case 1: /* Y u8 */
+  /* g_assert (pattern_size > 0 && count >= 0); */
+
+  if (pattern_size == 1 || count == 0)
+    {
       memset (dst, *src, count);
-      return;
-MAKE_COPY_CASE(2) /* YA u8 */
-MAKE_COPY_CASE(3) /* RGB u8 */
-#ifdef ARCH_X86_64
-    case 4: /* RGBA u8 */
-      if (count >= 2)
-        {
-          guint64 pat2 = *(guint32 *)src_ptr;
-          pat2 = pat2 | pat2 << 32;
-          do {
-            memcpy (dst, &pat2, 8);
-            dst += 8;
-            count -= 2;
-          } while (count >= 2);
-        }
-      if (count)
-        {
-          memcpy (dst, src, 4);
-          dst += 4;
-        }
-      return;
-#else
-MAKE_COPY_CASE(4) /* RGBA u8 */
-#endif /* ARCH_X86_64 */
-MAKE_COPY_CASE(6) /* RGB u16 */
-MAKE_COPY_CASE(8) /* RGBA u16 */
-MAKE_COPY_CASE(12) /* RGB float */
-MAKE_COPY_CASE(16) /* RGBA float */
-    default:
-      while (count--)
+    }
+  else
+    {
+      gsize block_size;
+      gsize remaining_size;
+
+      block_size = pattern_size,
+
+      memcpy (dst, src, block_size);
+      src  = dst;
+      dst += block_size;
+
+      remaining_size = (count - 1) * block_size;
+
+      while (block_size < remaining_size)
         {
-          memcpy (dst, src, pattern_size);
-          dst += pattern_size;
+          memcpy (dst, src, block_size);
+          dst += block_size;
+
+          remaining_size -= block_size;
+
+          /* limit the block size, so that we don't saturate the cache.
+           * 
+           * FIXME: optimal limit could use more benchmarking.
+           */
+          if (block_size <= 2048)
+            block_size *= 2;
         }
-      return;
-  }
+
+      memcpy (dst, src, remaining_size);
+    }
 }
 
 #undef MAKE_COPY_CASE


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]