[gegl] gegl/buffer: some more micro-optimizations for column get/set



commit bacba9796ec0e914d790c3aa9aafa815c17b978e
Author: Øyvind Kolås <pippin gimp org>
Date:   Thu Dec 28 16:18:40 2017 +0100

    gegl/buffer: some more micro-optimizations for column get/set
    
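    Hoist the loop-invariant lskip * px_size multiplication out of the per-row
    copy loops in gegl_buffer_iterate_write, and add unrolled copy paths for
    more sizes: 40, 48, 56 and 64 bytes in gegl_buffer_iterate_write, and 3, 6,
    40, 48, 56 and 64 bytes in gegl_buffer_iterate_read_simple.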

 gegl/buffer/gegl-buffer-access.c |  258 ++++++++++++++++++++++++++++++++------
 1 files changed, 222 insertions(+), 36 deletions(-)
---
diff --git a/gegl/buffer/gegl-buffer-access.c b/gegl/buffer/gegl-buffer-access.c
index 40b751c..be484f0 100644
--- a/gegl/buffer/gegl-buffer-access.c
+++ b/gegl/buffer/gegl-buffer-access.c
@@ -478,6 +478,7 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
             }
           else
             {
+              int lskip_offset = lskip * px_size;
               switch (pixels * px_size)
                 {
                   case 1:
@@ -488,7 +489,7 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          tp[lskip * px_size] = bp[lskip * px_size];
+                          tp[lskip_offset] = bp[lskip_offset];
                         }
                       tp += tile_stride;
                       bp += buf_stride;
@@ -502,8 +503,8 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          ((uint16_t*)(&tp[lskip * px_size]))[0] =
-                          ((uint16_t*)(&bp[lskip * px_size]))[0];
+                          ((uint16_t*)(&tp[lskip_offset]))[0] =
+                          ((uint16_t*)(&bp[lskip_offset]))[0];
                         }
                       tp += tile_stride;
                       bp += buf_stride;
@@ -517,9 +518,9 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          tp[lskip * px_size] = bp[lskip * px_size];
-                          tp[lskip * px_size+1] = bp[lskip * px_size+1];
-                          tp[lskip * px_size+2] = bp[lskip * px_size+2];
+                          tp[lskip_offset] = bp[lskip_offset];
+                          tp[lskip_offset+1] = bp[lskip_offset+1];
+                          tp[lskip_offset+2] = bp[lskip_offset+2];
                         }
                       tp += tile_stride;
                       bp += buf_stride;
@@ -533,8 +534,8 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          ((uint32_t*)(&tp[lskip * px_size]))[0] =
-                          ((uint32_t*)(&bp[lskip * px_size]))[0];
+                          ((uint32_t*)(&tp[lskip_offset]))[0] =
+                          ((uint32_t*)(&bp[lskip_offset]))[0];
                         }
                       tp += tile_stride;
                       bp += buf_stride;
@@ -548,8 +549,8 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          ((uint64_t*)(&tp[lskip * px_size]))[0] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[0];
+                          ((uint64_t*)(&tp[lskip_offset]))[0] =
+                          ((uint64_t*)(&bp[lskip_offset]))[0];
                         }
                       tp += tile_stride;
                       bp += buf_stride;
@@ -563,12 +564,12 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          ((uint32_t*)(&tp[lskip * px_size]))[0] =
-                          ((uint32_t*)(&bp[lskip * px_size]))[0];
-                          ((uint32_t*)(&tp[lskip * px_size]))[1] =
-                          ((uint32_t*)(&bp[lskip * px_size]))[1];
-                          ((uint32_t*)(&tp[lskip * px_size]))[2] =
-                          ((uint32_t*)(&bp[lskip * px_size]))[2];
+                          ((uint32_t*)(&tp[lskip_offset]))[0] =
+                          ((uint32_t*)(&bp[lskip_offset]))[0];
+                          ((uint32_t*)(&tp[lskip_offset]))[1] =
+                          ((uint32_t*)(&bp[lskip_offset]))[1];
+                          ((uint32_t*)(&tp[lskip_offset]))[2] =
+                          ((uint32_t*)(&bp[lskip_offset]))[2];
                         }
                       tp += tile_stride;
                       bp += buf_stride;
@@ -582,10 +583,10 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          ((uint64_t*)(&tp[lskip * px_size]))[0] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[0];
-                          ((uint64_t*)(&tp[lskip * px_size]))[1] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[1];
+                          ((uint64_t*)(&tp[lskip_offset]))[0] =
+                          ((uint64_t*)(&bp[lskip_offset]))[0];
+                          ((uint64_t*)(&tp[lskip_offset]))[1] =
+                          ((uint64_t*)(&bp[lskip_offset]))[1];
                         }
                       tp += tile_stride;
                       bp += buf_stride;
@@ -599,12 +600,12 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          ((uint64_t*)(&tp[lskip * px_size]))[0] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[0];
-                          ((uint64_t*)(&tp[lskip * px_size]))[1] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[1];
-                          ((uint64_t*)(&tp[lskip * px_size]))[2] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[2];
+                          ((uint64_t*)(&tp[lskip_offset]))[0] =
+                          ((uint64_t*)(&bp[lskip_offset]))[0];
+                          ((uint64_t*)(&tp[lskip_offset]))[1] =
+                          ((uint64_t*)(&bp[lskip_offset]))[1];
+                          ((uint64_t*)(&tp[lskip_offset]))[2] =
+                          ((uint64_t*)(&bp[lskip_offset]))[2];
                         }
                       tp += tile_stride;
                       bp += buf_stride;
@@ -618,14 +619,118 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          ((uint64_t*)(&tp[lskip * px_size]))[0] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[0];
-                          ((uint64_t*)(&tp[lskip * px_size]))[1] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[1];
-                          ((uint64_t*)(&tp[lskip * px_size]))[2] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[2];
-                          ((uint64_t*)(&tp[lskip * px_size]))[3] =
-                          ((uint64_t*)(&bp[lskip * px_size]))[3];
+                          ((uint64_t*)(&tp[lskip_offset]))[0] =
+                          ((uint64_t*)(&bp[lskip_offset]))[0];
+                          ((uint64_t*)(&tp[lskip_offset]))[1] =
+                          ((uint64_t*)(&bp[lskip_offset]))[1];
+                          ((uint64_t*)(&tp[lskip_offset]))[2] =
+                          ((uint64_t*)(&bp[lskip_offset]))[2];
+                          ((uint64_t*)(&tp[lskip_offset]))[3] =
+                          ((uint64_t*)(&bp[lskip_offset]))[3];
+                        }
+                      tp += tile_stride;
+                      bp += buf_stride;
+                    }
+                    break;
+                  case 40:
+                    for (row = offsety;
+                         row < tile_height && y < height;
+                         row++, y++)
+                    {
+                      if (buffer_y + y >= buffer_abyss_y &&
+                          buffer_y + y < abyss_y_total)
+                        {
+                          ((uint64_t*)(&tp[lskip_offset]))[0] =
+                          ((uint64_t*)(&bp[lskip_offset]))[0];
+                          ((uint64_t*)(&tp[lskip_offset]))[1] =
+                          ((uint64_t*)(&bp[lskip_offset]))[1];
+                          ((uint64_t*)(&tp[lskip_offset]))[2] =
+                          ((uint64_t*)(&bp[lskip_offset]))[2];
+                          ((uint64_t*)(&tp[lskip_offset]))[3] =
+                          ((uint64_t*)(&bp[lskip_offset]))[3];
+                          ((uint64_t*)(&tp[lskip_offset]))[4] =
+                          ((uint64_t*)(&bp[lskip_offset]))[4];
+                        }
+                      tp += tile_stride;
+                      bp += buf_stride;
+                    }
+                    break;
+                  case 48:
+                    for (row = offsety;
+                         row < tile_height && y < height;
+                         row++, y++)
+                    {
+                      if (buffer_y + y >= buffer_abyss_y &&
+                          buffer_y + y < abyss_y_total)
+                        {
+                          ((uint64_t*)(&tp[lskip_offset]))[0] =
+                          ((uint64_t*)(&bp[lskip_offset]))[0];
+                          ((uint64_t*)(&tp[lskip_offset]))[1] =
+                          ((uint64_t*)(&bp[lskip_offset]))[1];
+                          ((uint64_t*)(&tp[lskip_offset]))[2] =
+                          ((uint64_t*)(&bp[lskip_offset]))[2];
+                          ((uint64_t*)(&tp[lskip_offset]))[3] =
+                          ((uint64_t*)(&bp[lskip_offset]))[3];
+                          ((uint64_t*)(&tp[lskip_offset]))[4] =
+                          ((uint64_t*)(&bp[lskip_offset]))[4];
+                          ((uint64_t*)(&tp[lskip_offset]))[5] =
+                          ((uint64_t*)(&bp[lskip_offset]))[5];
+                        }
+                      tp += tile_stride;
+                      bp += buf_stride;
+                    }
+                    break;
+                  case 56:
+                    for (row = offsety;
+                         row < tile_height && y < height;
+                         row++, y++)
+                    {
+                      if (buffer_y + y >= buffer_abyss_y &&
+                          buffer_y + y < abyss_y_total)
+                        {
+                          ((uint64_t*)(&tp[lskip_offset]))[0] =
+                          ((uint64_t*)(&bp[lskip_offset]))[0];
+                          ((uint64_t*)(&tp[lskip_offset]))[1] =
+                          ((uint64_t*)(&bp[lskip_offset]))[1];
+                          ((uint64_t*)(&tp[lskip_offset]))[2] =
+                          ((uint64_t*)(&bp[lskip_offset]))[2];
+                          ((uint64_t*)(&tp[lskip_offset]))[3] =
+                          ((uint64_t*)(&bp[lskip_offset]))[3];
+                          ((uint64_t*)(&tp[lskip_offset]))[4] =
+                          ((uint64_t*)(&bp[lskip_offset]))[4];
+                          ((uint64_t*)(&tp[lskip_offset]))[5] =
+                          ((uint64_t*)(&bp[lskip_offset]))[5];
+                          ((uint64_t*)(&tp[lskip_offset]))[6] =
+                          ((uint64_t*)(&bp[lskip_offset]))[6];
+                        }
+                      tp += tile_stride;
+                      bp += buf_stride;
+                    }
+                    break;
+                  case 64:
+                    for (row = offsety;
+                         row < tile_height && y < height;
+                         row++, y++)
+                    {
+                      if (buffer_y + y >= buffer_abyss_y &&
+                          buffer_y + y < abyss_y_total)
+                        {
+                          ((uint64_t*)(&tp[lskip_offset]))[0] =
+                          ((uint64_t*)(&bp[lskip_offset]))[0];
+                          ((uint64_t*)(&tp[lskip_offset]))[1] =
+                          ((uint64_t*)(&bp[lskip_offset]))[1];
+                          ((uint64_t*)(&tp[lskip_offset]))[2] =
+                          ((uint64_t*)(&bp[lskip_offset]))[2];
+                          ((uint64_t*)(&tp[lskip_offset]))[3] =
+                          ((uint64_t*)(&bp[lskip_offset]))[3];
+                          ((uint64_t*)(&tp[lskip_offset]))[4] =
+                          ((uint64_t*)(&bp[lskip_offset]))[4];
+                          ((uint64_t*)(&tp[lskip_offset]))[5] =
+                          ((uint64_t*)(&bp[lskip_offset]))[5];
+                          ((uint64_t*)(&tp[lskip_offset]))[6] =
+                          ((uint64_t*)(&bp[lskip_offset]))[6];
+                          ((uint64_t*)(&tp[lskip_offset]))[7] =
+                          ((uint64_t*)(&bp[lskip_offset]))[7];
                         }
                       tp += tile_stride;
                       bp += buf_stride;
@@ -639,8 +744,8 @@ gegl_buffer_iterate_write (GeglBuffer          *buffer,
                       if (buffer_y + y >= buffer_abyss_y &&
                           buffer_y + y < abyss_y_total)
                         {
-                          memcpy (tp + lskip * px_size,
-                                  bp + lskip * px_size,
+                          memcpy (tp + lskip_offset,
+                                  bp + lskip_offset,
                                   pixels * px_size);
                         }
                       tp += tile_stride;
@@ -830,6 +935,17 @@ gegl_buffer_iterate_read_simple (GeglBuffer          *buffer,
                          bp += buf_stride;
                       }
                     break;
+                  case 3:
+                    for (row = offsety; row < tile_height && y < height;
+                         row++, y++)
+                      {
+                         bp[0] = tp[0];
+                         bp[1] = tp[1];
+                         bp[2] = tp[2];
+                         tp += tile_stride;
+                         bp += buf_stride;
+                      }
+                    break;
                   case 4:
                     for (row = offsety; row < tile_height && y < height;
                          row++, y++)
@@ -839,6 +955,17 @@ gegl_buffer_iterate_read_simple (GeglBuffer          *buffer,
                          bp += buf_stride;
                       }
                     break;
+                  case 6:
+                    for (row = offsety; row < tile_height && y < height;
+                         row++, y++)
+                      {
+                         ((uint16_t*)bp)[0] = ((uint16_t*)tp)[0];
+                         ((uint16_t*)bp)[1] = ((uint16_t*)tp)[1];
+                         ((uint16_t*)bp)[2] = ((uint16_t*)tp)[2];
+                         tp += tile_stride;
+                         bp += buf_stride;
+                      }
+                    break;
                   case 8:
                     for (row = offsety; row < tile_height && y < height;
                          row++, y++)
@@ -892,6 +1019,65 @@ gegl_buffer_iterate_read_simple (GeglBuffer          *buffer,
                          bp += buf_stride;
                       }
                     break;
+                  case 40:
+                    for (row = offsety; row < tile_height && y < height;
+                         row++, y++)
+                      {
+                         ((uint64_t*)bp)[0] = ((uint64_t*)tp)[0];
+                         ((uint64_t*)bp)[1] = ((uint64_t*)tp)[1];
+                         ((uint64_t*)bp)[2] = ((uint64_t*)tp)[2];
+                         ((uint64_t*)bp)[3] = ((uint64_t*)tp)[3];
+                         ((uint64_t*)bp)[4] = ((uint64_t*)tp)[4];
+                         tp += tile_stride;
+                         bp += buf_stride;
+                      }
+                    break;
+                  case 48:
+                    for (row = offsety; row < tile_height && y < height;
+                         row++, y++)
+                      {
+                         ((uint64_t*)bp)[0] = ((uint64_t*)tp)[0];
+                         ((uint64_t*)bp)[1] = ((uint64_t*)tp)[1];
+                         ((uint64_t*)bp)[2] = ((uint64_t*)tp)[2];
+                         ((uint64_t*)bp)[3] = ((uint64_t*)tp)[3];
+                         ((uint64_t*)bp)[4] = ((uint64_t*)tp)[4];
+                         ((uint64_t*)bp)[5] = ((uint64_t*)tp)[5];
+                         tp += tile_stride;
+                         bp += buf_stride;
+                      }
+                    break;
+                  case 56:
+                    for (row = offsety; row < tile_height && y < height;
+                         row++, y++)
+                      {
+                         ((uint64_t*)bp)[0] = ((uint64_t*)tp)[0];
+                         ((uint64_t*)bp)[1] = ((uint64_t*)tp)[1];
+                         ((uint64_t*)bp)[2] = ((uint64_t*)tp)[2];
+                         ((uint64_t*)bp)[3] = ((uint64_t*)tp)[3];
+                         ((uint64_t*)bp)[4] = ((uint64_t*)tp)[4];
+                         ((uint64_t*)bp)[5] = ((uint64_t*)tp)[5];
+                         ((uint64_t*)bp)[6] = ((uint64_t*)tp)[6];
+                         tp += tile_stride;
+                         bp += buf_stride;
+                      }
+                    break;
+                  case 64:
+                    for (row = offsety; row < tile_height && y < height;
+                         row++, y++)
+                      {
+                         ((uint64_t*)bp)[0] = ((uint64_t*)tp)[0];
+                         ((uint64_t*)bp)[1] = ((uint64_t*)tp)[1];
+                         ((uint64_t*)bp)[2] = ((uint64_t*)tp)[2];
+                         ((uint64_t*)bp)[3] = ((uint64_t*)tp)[3];
+                         ((uint64_t*)bp)[4] = ((uint64_t*)tp)[4];
+                         ((uint64_t*)bp)[5] = ((uint64_t*)tp)[5];
+                         ((uint64_t*)bp)[6] = ((uint64_t*)tp)[6];
+                         ((uint64_t*)bp)[7] = ((uint64_t*)tp)[7];
+                         tp += tile_stride;
+                         bp += buf_stride;
+                      }
+                    break;
+
                   default:
                     for (row = offsety;
                          row < tile_height && y < height;


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]