[ostree] deltas: Compute rollsum targets



commit 3df8be0d92a48b2d121664097c6ece73c6904a2f
Author: Colin Walters <walters verbum org>
Date:   Thu Jan 29 17:39:34 2015 -0500

    deltas: Compute rollsum targets

 Makefile-libostree.am                              |    2 +
 .../ostree-repo-static-delta-compilation.c         |  224 +++++++++++++++++++-
 2 files changed, 220 insertions(+), 6 deletions(-)
---
diff --git a/Makefile-libostree.am b/Makefile-libostree.am
index 7985f0a..9f59642 100644
--- a/Makefile-libostree.am
+++ b/Makefile-libostree.am
@@ -49,6 +49,8 @@ libostree_1_la_SOURCES = \
        src/libostree/ostree-lzma-compressor.h \
        src/libostree/ostree-lzma-decompressor.c \
        src/libostree/ostree-lzma-decompressor.h \
+       src/libostree/bupsplit.h \
+       src/libostree/bupsplit.c \
        src/libostree/ostree-varint.h \
        src/libostree/ostree-varint.c \
        src/libostree/ostree-linuxfsutil.h \
diff --git a/src/libostree/ostree-repo-static-delta-compilation.c 
b/src/libostree/ostree-repo-static-delta-compilation.c
index 6dbd3e1..5db5fa4 100644
--- a/src/libostree/ostree-repo-static-delta-compilation.c
+++ b/src/libostree/ostree-repo-static-delta-compilation.c
@@ -21,6 +21,7 @@
 #include "config.h"
 
 #include <string.h>
+#include <zlib.h>
 
 #include "ostree-core-private.h"
 #include "ostree-repo-private.h"
@@ -379,6 +380,185 @@ process_one_object (OstreeRepo                       *repo,
   return ret;
 }
 
+typedef struct {
+  GPtrArray *keys;
+  GHashTable *values;
+} OrderedRollsums;
+
+static void
+ordered_rollsums_free (OrderedRollsums  *ohash)
+{
+  g_ptr_array_unref (ohash->keys);
+  g_hash_table_unref (ohash->values);
+  g_free (ohash);
+}
+
+static gboolean
+rollsum_chunks_crc32 (GInputStream     *istream,
+                      OrderedRollsums **out_rollsums,
+                      GCancellable     *cancellable,
+                      GError          **error)
+{
+  gboolean ret = FALSE;
+  gsize start = 0;
+  gboolean rollsum_end = FALSE;
+  OrderedRollsums *ret_rollsums = g_new0 (OrderedRollsums, 1);
+  gs_unref_object GBufferedInputStream *bufinput =
+    (GBufferedInputStream*) g_buffered_input_stream_new_sized (istream, ROLLSUM_BLOB_MAX);
+
+  ret_rollsums->keys = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
+  ret_rollsums->values = g_hash_table_new (NULL, NULL);
+
+  while (TRUE)
+    {
+      gssize bytes_read;
+      const guint8 *buf;
+      gsize bufsize;
+      int offset, bits;
+
+      bytes_read = g_buffered_input_stream_fill (bufinput, -1, cancellable, error);
+      if (bytes_read == -1)
+        goto out;
+      if (bytes_read == 0)
+        break;
+
+      buf = g_buffered_input_stream_peek_buffer (bufinput, &bufsize);
+
+      if (!rollsum_end)
+        {
+          offset = bupsplit_find_ofs (buf, MIN(G_MAXINT32, bufsize), &bits); 
+          if (offset == 0)
+            {
+              rollsum_end = TRUE;
+              offset = MIN(ROLLSUM_BLOB_MAX, bufsize);
+            }
+          else if (offset > ROLLSUM_BLOB_MAX)
+            offset = ROLLSUM_BLOB_MAX;
+        }
+      else
+        offset = MIN(ROLLSUM_BLOB_MAX, bufsize);
+
+      if (!g_input_stream_skip ((GInputStream*)bufinput, bufsize, cancellable, error))
+        goto out;
+
+      /* Use zlib's crc32 */
+      { guint32 crc = crc32 (0L, NULL, 0);
+        GVariant *val;
+
+        crc = crc32 (crc, buf, offset);
+
+        val = g_variant_ref_sink (g_variant_new ("(utt)", crc, (guint64) start, (guint64)offset));
+        g_ptr_array_add (ret_rollsums->keys, val);
+        g_hash_table_insert (ret_rollsums->values, GUINT_TO_POINTER (crc), val);
+      }
+
+      start += offset;
+    }
+
+  ret = TRUE;
+  gs_transfer_out_value (out_rollsums, &ret_rollsums);
+ out:
+  if (ret_rollsums)
+    ordered_rollsums_free (ret_rollsums);
+  return ret;
+}
+
+typedef struct {
+  char *from_checksum;
+  OrderedRollsums *from_rollsums;
+  OrderedRollsums *to_rollsums;
+  guint match_ratio;
+} ContentRollsum;
+
+static void
+content_rollsums_free (ContentRollsum  *rollsum)
+{
+  g_free (rollsum->from_checksum);
+  ordered_rollsums_free (rollsum->from_rollsums);
+  ordered_rollsums_free (rollsum->to_rollsums);
+  g_free (rollsum);
+}
+
+static gboolean
+try_content_rollsum (OstreeRepo                       *repo,
+                     const char                       *from,
+                     const char                       *to,
+                     ContentRollsum                  **out_rollsum,
+                     GCancellable                     *cancellable,
+                     GError                          **error)
+{
+  gboolean ret = FALSE;
+  OrderedRollsums *from_rollsum = NULL;
+  OrderedRollsums *to_rollsum = NULL;
+  gs_unref_object GInputStream *from_istream = NULL;
+  gs_unref_object GFileInfo *from_finfo = NULL;
+  gs_unref_object GInputStream *to_istream = NULL;
+  gs_unref_object GFileInfo *to_finfo = NULL;
+  ContentRollsum *ret_rollsum = NULL;
+  guint total = 0;
+  guint matches = 0;
+  guint match_ratio = 0;
+  gpointer hkey, hvalue;
+  GHashTableIter hiter;
+
+  *out_rollsum = NULL;
+
+  if (!ostree_repo_load_file (repo, from, &from_istream, &from_finfo, NULL,
+                              cancellable, error))
+    goto out;
+  if (!ostree_repo_load_file (repo, to, &to_istream, &to_finfo, NULL,
+                              cancellable, error))
+    goto out;
+
+  /* Only try to rollsum regular files obviously */ 
+  if (!(g_file_info_get_file_type (from_finfo) == G_FILE_TYPE_REGULAR
+        && g_file_info_get_file_type (to_finfo) == G_FILE_TYPE_REGULAR))
+    {
+      ret = TRUE;
+      goto out;
+    }
+
+  g_assert (from_istream && to_istream);
+
+  if (!rollsum_chunks_crc32 (from_istream, &from_rollsum, cancellable, error))
+    goto out;
+  if (!rollsum_chunks_crc32 (to_istream, &to_rollsum, cancellable, error))
+    goto out;
+
+  g_clear_object (&from_istream);
+  g_clear_object (&to_istream);
+
+  g_hash_table_iter_init (&hiter, to_rollsum->values);
+  while (g_hash_table_iter_next (&hiter, &hkey, &hvalue))
+    {
+      if (g_hash_table_contains (from_rollsum->values, hkey))
+        matches++;
+      total++;
+    }
+
+  match_ratio = (matches*100)/total;
+
+  /* Only proceed if the file contains (arbitrary) more than 25% of
+   * the previous chunks.
+   */
+  if (match_ratio < 25)
+    {
+      ret = TRUE;
+      goto out;
+    }
+
+  ret_rollsum = g_new0 (ContentRollsum, 1);
+  ret_rollsum->match_ratio = match_ratio;
+  ret_rollsum->from_checksum = g_strdup (from);
+  ret_rollsum->from_rollsums = from_rollsum; from_rollsum = NULL;
+  ret_rollsum->to_rollsums = to_rollsum; to_rollsum = NULL;
+  
+  ret = TRUE;
+  gs_transfer_out_value (out_rollsum, &ret_rollsum);
+ out:
+  return ret;
+}
+
 static gboolean 
 generate_delta_lowlatency (OstreeRepo                       *repo,
                            const char                       *from,
@@ -402,6 +582,7 @@ generate_delta_lowlatency (OstreeRepo                       *repo,
   gs_unref_hashtable GHashTable *new_reachable_metadata = NULL;
   gs_unref_hashtable GHashTable *new_reachable_content = NULL;
   gs_unref_hashtable GHashTable *modified_content_objects = NULL;
+  gs_unref_hashtable GHashTable *rollsum_optimized_content_objects = NULL;
   gs_unref_hashtable GHashTable *content_object_to_size = NULL;
 
   if (from != NULL)
@@ -424,15 +605,21 @@ generate_delta_lowlatency (OstreeRepo                       *repo,
                          cancellable, error))
     goto out;
 
-  modified_content_objects = g_hash_table_new_full (ostree_hash_object_name, g_variant_equal,
-                                                    NULL,
-                                                    (GDestroyNotify) g_variant_unref);
+  modified_content_objects = g_hash_table_new_full (g_str_hash, g_str_equal,
+                                                    g_free, g_free);
   for (i = 0; i < modified->len; i++)
     {
       OstreeDiffItem *diffitem = modified->pdata[i];
-      GVariant *objname = ostree_object_name_serialize (diffitem->target_checksum,
-                                                        OSTREE_OBJECT_TYPE_FILE);
-      g_hash_table_add (modified_content_objects, objname);
+      /* Theoretically, a target file could replace multiple source
+       * files.  That could happen if say a project changed from having
+       * multiple binaries to one binary.
+       *
+       * In that case, we have last one wins behavior.  For ELF rollsum
+       * tends to be useless unless there's a large static data blob.
+       */
+      g_hash_table_replace (modified_content_objects,
+                            g_strdup (diffitem->target_checksum),
+                            g_strdup (diffitem->src_checksum));
     }
 
   if (from)
@@ -478,6 +665,31 @@ generate_delta_lowlatency (OstreeRepo                       *repo,
   g_hash_table_remove (new_reachable_metadata,
                        ostree_object_name_serialize (to, OSTREE_OBJECT_TYPE_COMMIT));
 
+  rollsum_optimized_content_objects = g_hash_table_new_full (g_str_hash, g_str_equal,
+                                                             g_free,
+                                                             (GDestroyNotify) content_rollsums_free);
+
+  g_hash_table_iter_init (&hashiter, modified_content_objects);
+  while (g_hash_table_iter_next (&hashiter, &key, &value))
+    {
+      const char *to_checksum = key;
+      const char *from_checksum = value;
+      ContentRollsum *rollsum;
+
+      if (!try_content_rollsum (repo, from_checksum, to_checksum,
+                                &rollsum, cancellable, error))
+        goto out;
+
+      if (!rollsum)
+        continue;
+
+      g_hash_table_insert (rollsum_optimized_content_objects, g_strdup (to_checksum), rollsum);
+    }
+
+  g_printerr ("rollsum for %u/%u modified\n",
+              g_hash_table_size (rollsum_optimized_content_objects),
+              g_hash_table_size (modified_content_objects));
+
   /* Scan for large objects, so we can fall back to plain HTTP-based
    * fetch.  In the future this should come after an rsync-style
    * rolling delta check for modified files.


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]