[gtk/wip/chergert/glproto: 457/493] implement basic framebuffer sorting




commit 4758f50d35f8188d09ad05ba28e7bf77a5325089
Author: Christian Hergert <chergert redhat com>
Date:   Fri Feb 12 18:55:06 2021 -0800

    implement basic framebuffer sorting
    
    This sorts batches by framebuffer, moving batches later in the pipeline
    to be near other batches with the same framebuffer. The goal here is to
    reduce how often we change render targets, which is quite expensive.
    
    This is still disabled, however, because we are not yet capturing the whole
    uniform state when appending batches.
    
    Changing uniforms is much faster than changing render targets, so capturing
    the whole state should still be a net win.

 gsk/next/gskglcommandqueue.c        | 179 +++++++++++++++++++++++++++++++++++-
 gsk/next/gskglcommandqueueprivate.h |  45 ++++++---
 2 files changed, 206 insertions(+), 18 deletions(-)
---
diff --git a/gsk/next/gskglcommandqueue.c b/gsk/next/gskglcommandqueue.c
index 0581dc182b..d1bea6cc46 100644
--- a/gsk/next/gskglcommandqueue.c
+++ b/gsk/next/gskglcommandqueue.c
@@ -312,6 +312,7 @@ begin_next_batch (GskGLCommandQueue *self)
 
   batch = &self->batches[self->n_batches++];
   batch->any.next_batch_index = -1;
+  batch->any.prev_batch_index = self->tail_batch_index;
 
   return batch;
 }
@@ -326,6 +327,9 @@ enqueue_batch (GskGLCommandQueue *self)
 
   index = self->n_batches - 1;
 
+  if (self->head_batch_index == -1)
+    self->head_batch_index = index;
+
   if (self->tail_batch_index != -1)
     {
       GskGLCommandBatch *prev = &self->batches[self->tail_batch_index];
@@ -356,6 +360,9 @@ gsk_gl_command_queue_begin_draw (GskGLCommandQueue     *self,
   g_assert (self->in_draw == FALSE);
   g_assert (viewport != NULL);
 
+  if (self->n_batches == G_MAXINT16)
+    return;
+
   self->program_info = program;
 
   batch = begin_next_batch (self);
@@ -372,6 +379,8 @@ gsk_gl_command_queue_begin_draw (GskGLCommandQueue     *self,
   batch->draw.vbo_count = 0;
   batch->draw.vbo_offset = gsk_gl_buffer_get_offset (&self->vertices);
 
+  self->fbo_max = MAX (self->fbo_max, batch->draw.framebuffer);
+
   self->in_draw = TRUE;
 }
 
@@ -401,10 +410,13 @@ gsk_gl_command_queue_end_draw (GskGLCommandQueue *self)
 
   g_assert (GSK_IS_GL_COMMAND_QUEUE (self));
   g_assert (self->n_batches > 0);
-  g_assert (self->in_draw == TRUE);
+
+  if (self->n_batches == G_MAXINT16)
+    return;
 
   batch = &self->batches[self->n_batches - 1];
 
+  g_assert (self->in_draw == TRUE);
   g_assert (batch->any.kind == GSK_GL_COMMAND_KIND_DRAW);
 
   if G_UNLIKELY (batch->draw.vbo_count == 0)
@@ -417,6 +429,7 @@ gsk_gl_command_queue_end_draw (GskGLCommandQueue *self)
   /* Track the destination framebuffer in case it changed */
   batch->draw.framebuffer = self->attachments->fbo.id;
   self->attachments->fbo.changed = FALSE;
+  self->fbo_max = MAX (self->fbo_max, self->attachments->fbo.id);
 
   /* To avoid many g_array_set_size() calls, we first resize the
    * array to be large enough for all our changes. Then we just ++
@@ -550,7 +563,9 @@ gsk_gl_command_queue_clear (GskGLCommandQueue     *self,
 
   g_assert (GSK_IS_GL_COMMAND_QUEUE (self));
   g_assert (self->in_draw == FALSE);
-  g_assert (self->n_batches < G_MAXINT);
+
+  if (self->n_batches == G_MAXINT16)
+    return;
 
   if (clear_bits == 0)
     clear_bits = GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT;
@@ -564,6 +579,8 @@ gsk_gl_command_queue_clear (GskGLCommandQueue     *self,
   batch->any.next_batch_index = -1;
   batch->any.program = 0;
 
+  self->fbo_max = MAX (self->fbo_max, batch->clear.framebuffer);
+
   enqueue_batch (self);
 
   self->attachments->fbo.changed = FALSE;
@@ -578,7 +595,9 @@ gsk_gl_command_queue_push_debug_group (GskGLCommandQueue *self,
 
   g_assert (GSK_IS_GL_COMMAND_QUEUE (self));
   g_assert (self->in_draw == FALSE);
-  g_assert (self->n_batches < G_MAXINT);
+
+  if (self->n_batches == G_MAXINT16)
+    return;
 
   batch = begin_next_batch (self);
   batch->any.kind = GSK_GL_COMMAND_KIND_PUSH_DEBUG_GROUP;
@@ -598,7 +617,9 @@ gsk_gl_command_queue_pop_debug_group (GskGLCommandQueue *self)
 
   g_assert (GSK_IS_GL_COMMAND_QUEUE (self));
   g_assert (self->in_draw == FALSE);
-  g_assert (self->n_batches < G_MAXINT);
+
+  if (self->n_batches == G_MAXINT16)
+    return;
 
   batch = begin_next_batch (self);
   batch->any.kind = GSK_GL_COMMAND_KIND_POP_DEBUG_GROUP;
@@ -782,6 +803,144 @@ apply_framebuffer (int *framebuffer,
   return FALSE;
 }
 
+static inline void
+gsk_gl_command_queue_unlink (GskGLCommandQueue *self,
+                             GskGLCommandBatch *batch)
+{
+  if (batch->any.prev_batch_index == -1)
+    self->head_batch_index = batch->any.next_batch_index;
+  else
+    self->batches[batch->any.prev_batch_index].any.next_batch_index = batch->any.next_batch_index;
+
+  if (batch->any.next_batch_index == -1)
+    self->tail_batch_index = batch->any.prev_batch_index;
+  else
+    self->batches[batch->any.next_batch_index].any.prev_batch_index = batch->any.prev_batch_index;
+
+  batch->any.prev_batch_index = -1;
+  batch->any.next_batch_index = -1;
+}
+
+static inline void
+gsk_gl_command_queue_insert_before (GskGLCommandQueue *self,
+                                    GskGLCommandBatch *batch,
+                                    GskGLCommandBatch *sibling)
+{
+  int sibling_index = sibling - self->batches;
+  int index = batch - self->batches;
+
+  g_assert (batch >= self->batches);
+  g_assert (batch < &self->batches[self->n_batches]);
+  g_assert (sibling >= self->batches);
+  g_assert (sibling < &self->batches[self->n_batches]);
+
+  batch->any.next_batch_index = sibling_index;
+  batch->any.prev_batch_index = sibling->any.prev_batch_index;
+
+  if (batch->any.prev_batch_index > -1)
+    self->batches[batch->any.prev_batch_index].any.next_batch_index = index;
+
+  sibling->any.prev_batch_index = index;
+
+  if (batch->any.prev_batch_index == -1)
+    self->head_batch_index = index;
+}
+
+static void
+gsk_gl_command_queue_sort_batches (GskGLCommandQueue *self)
+{
+  int *seen;
+  int *seen_free = NULL;
+  int index;
+
+  g_assert (GSK_IS_GL_COMMAND_QUEUE (self));
+  g_assert (self->tail_batch_index >= 0);
+  g_assert (self->fbo_max >= 0);
+
+  /* Create our seen list with most recent index set to -1,
+   * meaning we haven't yet seen that framebuffer.
+   */
+  if (self->fbo_max < 1024)
+    seen = g_alloca (sizeof (int) * (self->fbo_max + 1));
+  else
+    seen = seen_free = g_new0 (int, (self->fbo_max + 1));
+  for (int i = 0; i <= self->fbo_max; i++)
+    seen[i] = -1;
+
+  /* Walk in reverse, and if we've seen that framebuffer before,
+   * we want to delay this operation until right before the last
+   * batch we saw for that framebuffer.
+   */
+  index = self->tail_batch_index;
+
+  while (index >= 0)
+    {
+      GskGLCommandBatch *batch = &self->batches[index];
+      int cur_index = index;
+      int fbo = -1;
+
+      g_assert (index > -1);
+      g_assert (index < self->n_batches);
+
+      switch (batch->any.kind)
+        {
+        case GSK_GL_COMMAND_KIND_POP_DEBUG_GROUP:
+        case GSK_GL_COMMAND_KIND_PUSH_DEBUG_GROUP:
+          break;
+
+        case GSK_GL_COMMAND_KIND_DRAW:
+          fbo = batch->draw.framebuffer;
+          break;
+
+        case GSK_GL_COMMAND_KIND_CLEAR:
+          fbo = batch->clear.framebuffer;
+          break;
+
+        default:
+          g_assert_not_reached ();
+        }
+
+      index = batch->any.prev_batch_index;
+
+      g_assert (index >= -1);
+      g_assert (index < (int)self->n_batches);
+      g_assert (fbo >= -1);
+
+      if (fbo == -1)
+        continue;
+
+      g_assert (fbo <= self->fbo_max);
+      g_assert (seen[fbo] >= -1);
+      g_assert (seen[fbo] < (int)self->n_batches);
+
+      if (seen[fbo] != -1)
+        {
+          int mru_index = seen[fbo];
+          GskGLCommandBatch *mru = &self->batches[mru_index];
+
+          g_assert (mru_index > -1);
+
+          gsk_gl_command_queue_unlink (self, batch);
+
+          g_assert (batch->any.prev_batch_index == -1);
+          g_assert (batch->any.next_batch_index == -1);
+
+          gsk_gl_command_queue_insert_before (self, batch, mru);
+
+          g_assert (batch->any.prev_batch_index > -1 ||
+                    self->head_batch_index == cur_index);
+          g_assert (batch->any.next_batch_index == seen[fbo]);
+        }
+
+      g_assert (cur_index > -1);
+      g_assert (seen[fbo] >= -1);
+
+      seen[fbo] = cur_index;
+    }
+
+  g_free (seen_free);
+}
+
 /**
  * gsk_gl_command_queue_execute:
  * @self: a #GskGLCommandQueue
@@ -819,6 +978,14 @@ gsk_gl_command_queue_execute (GskGLCommandQueue    *self,
   if (self->n_batches == 0)
     return;
 
+#if 0
+  /* TODO: For batch sorting to work, we need to snapshot the whole uniform
+   * state with each batch so that we can re-apply all uniforms since what
+   * we captured may not reflect the current state.
+   */
+  gsk_gl_command_queue_sort_batches (self);
+#endif
+
   gsk_gl_command_queue_make_current (self);
 
 #ifdef G_ENABLE_DEBUG
@@ -867,7 +1034,7 @@ gsk_gl_command_queue_execute (GskGLCommandQueue    *self,
 
   apply_scissor (&scissor_state, framebuffer, &scissor_test, has_scissor);
 
-  next_batch_index = 0;
+  next_batch_index = self->head_batch_index;
 
   while (next_batch_index >= 0)
     {
@@ -1016,7 +1183,9 @@ gsk_gl_command_queue_begin_frame (GskGLCommandQueue *self)
 
   gsk_gl_command_queue_make_current (self);
 
+  self->fbo_max = 0;
   self->tail_batch_index = -1;
+  self->head_batch_index = -1;
   self->in_frame = TRUE;
 }
 
diff --git a/gsk/next/gskglcommandqueueprivate.h b/gsk/next/gskglcommandqueueprivate.h
index 92968321e0..98d2f03e56 100644
--- a/gsk/next/gskglcommandqueueprivate.h
+++ b/gsk/next/gskglcommandqueueprivate.h
@@ -81,7 +81,12 @@ typedef struct _GskGLCommandBatchAny
    * as a sort of integer-based linked list to simplify out-of-order
    * batching without moving memory around. -1 indicates last batch.
    */
-  int next_batch_index;
+  gint16 next_batch_index;
+
+  /* Same but for reverse direction as we sort in reverse to get the
+   * batches ordered by framebuffer.
+   */
+  gint16 prev_batch_index;
 
   /* The viewport size of the batch. We check this as we process
    * batches to determine if we need to resize the viewport.
@@ -140,6 +145,23 @@ typedef struct _GskGLCommandDraw
 
 G_STATIC_ASSERT (sizeof (GskGLCommandDraw) == 32);
 
+typedef struct _GskGLCommandDebug
+{
+  GskGLCommandBatchAny any;
+  const char *debug_group;
+} GskGLCommandDebug;
+
+G_STATIC_ASSERT (sizeof (GskGLCommandDebug) == 24);
+
+typedef struct _GskGLCommandClear
+{
+  GskGLCommandBatchAny  any;
+  guint                 bits;
+  guint                 framebuffer;
+} GskGLCommandClear;
+
+G_STATIC_ASSERT (sizeof (GskGLCommandClear) == 20);
+
 typedef struct _GskGLCommandUniform
 {
   GskGLUniformInfo info;
@@ -150,17 +172,10 @@ G_STATIC_ASSERT (sizeof (GskGLCommandUniform) == 8);
 
 typedef union _GskGLCommandBatch
 {
-  GskGLCommandBatchAny    any;
-  GskGLCommandDraw        draw;
-  struct {
-    GskGLCommandBatchAny  any;
-    const char           *debug_group;
-  } debug_group;
-  struct {
-    GskGLCommandBatchAny  any;
-    guint                 bits;
-    guint                 framebuffer;
-  } clear;
+  GskGLCommandBatchAny any;
+  GskGLCommandDraw     draw;
+  GskGLCommandDebug    debug_group;
+  GskGLCommandClear    clear;
 } GskGLCommandBatch;
 
 G_STATIC_ASSERT (sizeof (GskGLCommandBatch) == 32);
@@ -241,7 +256,11 @@ struct _GskGLCommandQueue
    * at the end of the array, as batches can be reordered. This is used to
    * update the "next" index when adding a new batch.
    */
-  int tail_batch_index;
+  gint16 tail_batch_index;
+  gint16 head_batch_index;
+
+  /* Max framebuffer we used, so we can sort items faster */
+  guint fbo_max;
 
   /* Various GSK and GDK metric counter ids */
   struct {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]