[ostree] deltas: Implement rollsums



commit 9aa7e30b38f33794fc0fda12fa8b0ef50d9fbab5
Author: Colin Walters <walters verbum org>
Date:   Fri Jan 30 10:13:07 2015 -0500

    deltas: Implement rollsums
    
    This does an rsync-style prepared delta basically.  On my test data,
    it shaves ~6MB of uncompressed data.  Not a huge amount, but I expect
    this to be more useful for things like binaries which embed data, etc.

 Makefile-libostree.am                              |    2 +
 Makefile-tests.am                                  |    8 +-
 .../ostree-repo-static-delta-compilation.c         |  474 ++++++++++++--------
 src/libostree/ostree-repo-static-delta-private.h   |    3 +-
 .../ostree-repo-static-delta-processing.c          |   59 ++-
 src/libostree/ostree-rollsum.c                     |  201 +++++++++
 src/libostree/ostree-rollsum.h                     |   44 ++
 tests/test-rollsum.c                               |  145 ++-----
 8 files changed, 620 insertions(+), 316 deletions(-)
---
diff --git a/Makefile-libostree.am b/Makefile-libostree.am
index 9f59642..f77a36a 100644
--- a/Makefile-libostree.am
+++ b/Makefile-libostree.am
@@ -51,6 +51,8 @@ libostree_1_la_SOURCES = \
        src/libostree/ostree-lzma-decompressor.h \
        src/libostree/bupsplit.h \
        src/libostree/bupsplit.c \
+       src/libostree/ostree-rollsum.h \
+       src/libostree/ostree-rollsum.c \
        src/libostree/ostree-varint.h \
        src/libostree/ostree-varint.c \
        src/libostree/ostree-linuxfsutil.h \
diff --git a/Makefile-tests.am b/Makefile-tests.am
index 8571fa1..42d1be7 100644
--- a/Makefile-tests.am
+++ b/Makefile-tests.am
@@ -87,12 +87,6 @@ INSTALL_DATA_HOOKS += install-gpg-data-hook
         echo 'Output=TAP' >> $  tmp; \
         mv $  tmp $@)
 
-%.test: tests/%.js Makefile
-       $(AM_V_GEN) (echo '[Test]' > $  tmp; \
-        echo 'Exec=env TESTDATADIR=$(pkglibexecdir)/installed-tests 
$(pkglibexecdir)/installed-tests/$(notdir $<)' >> $  tmp; \
-        echo 'Type=session' >> $  tmp; \
-        mv $  tmp $@)
-
 if BUILDOPT_GJS
 insttest_SCRIPTS += tests/test-core.js \
        tests/test-sizes.js \
@@ -109,7 +103,7 @@ check_PROGRAMS = tests/test-rollsum tests/test-varint tests/test-ot-unix-utils
 tests_test_ot_unix_utils_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS)
 tests_test_ot_unix_utils_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS)
 
-tests_test_rollsum_SOURCES = src/libostree/bupsplit.c tests/test-rollsum.c
+tests_test_rollsum_SOURCES = src/libostree/bupsplit.c src/libostree/ostree-rollsum.c tests/test-rollsum.c
 tests_test_rollsum_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS)
 tests_test_rollsum_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS)
 
diff --git a/src/libostree/ostree-repo-static-delta-compilation.c 
b/src/libostree/ostree-repo-static-delta-compilation.c
index ad995d0..594b432 100644
--- a/src/libostree/ostree-repo-static-delta-compilation.c
+++ b/src/libostree/ostree-repo-static-delta-compilation.c
@@ -21,18 +21,16 @@
 #include "config.h"
 
 #include <string.h>
-#include <zlib.h>
+#include <gio/gunixoutputstream.h>
 
 #include "ostree-core-private.h"
 #include "ostree-repo-private.h"
 #include "ostree-lzma-compressor.h"
 #include "ostree-repo-static-delta-private.h"
 #include "ostree-diff.h"
+#include "ostree-rollsum.h"
 #include "otutil.h"
 #include "ostree-varint.h"
-#include "bupsplit.h"
-
-#define ROLLSUM_BLOB_MAX (8192*4)
 
 typedef struct {
   guint64 uncompressed_size;
@@ -260,6 +258,36 @@ splice_stream_to_payload (OstreeStaticDeltaPartBuilder  *current_part,
   return ret;
 }
 
+static void
+write_content_mode_xattrs (OstreeRepo                       *repo,
+                           OstreeStaticDeltaPartBuilder     *current_part,
+                           GFileInfo                        *content_finfo,
+                           GVariant                         *content_xattrs,
+                           gsize                            *out_mode_offset,
+                           gsize                            *out_xattr_offset)
+{
+  guint32 uid =
+    g_file_info_get_attribute_uint32 (content_finfo, "unix::uid");
+  guint32 gid =
+    g_file_info_get_attribute_uint32 (content_finfo, "unix::gid");
+  guint32 mode =
+    g_file_info_get_attribute_uint32 (content_finfo, "unix::mode");
+  gs_unref_variant GVariant *modev
+    = g_variant_ref_sink (g_variant_new ("(uuu)", 
+                                         GUINT32_TO_BE (uid),
+                                         GUINT32_TO_BE (gid),
+                                         GUINT32_TO_BE (mode)));
+
+  *out_mode_offset = write_unique_variant_chunk (current_part,
+                                                 current_part->mode_set,
+                                                 current_part->modes,
+                                                 modev);
+  *out_xattr_offset = write_unique_variant_chunk (current_part,
+                                                  current_part->xattr_set,
+                                                  current_part->xattrs,
+                                                  content_xattrs);
+}
+
 static gboolean
 process_one_object (OstreeRepo                       *repo,
                     OstreeStaticDeltaBuilder         *builder,
@@ -327,26 +355,12 @@ process_one_object (OstreeRepo                       *repo,
   else
     {
       gsize mode_offset, xattr_offset, content_offset;
-      guint32 uid =
-        g_file_info_get_attribute_uint32 (content_finfo, "unix::uid");
-      guint32 gid =
-        g_file_info_get_attribute_uint32 (content_finfo, "unix::gid");
-      guint32 mode =
-        g_file_info_get_attribute_uint32 (content_finfo, "unix::mode");
-      gs_unref_variant GVariant *modev
-        = g_variant_ref_sink (g_variant_new ("(uuu)", 
-                                             GUINT32_TO_BE (uid),
-                                             GUINT32_TO_BE (gid),
-                                             GUINT32_TO_BE (mode)));
-
-      mode_offset = write_unique_variant_chunk (current_part,
-                                                current_part->mode_set,
-                                                current_part->modes,
-                                                modev);
-      xattr_offset = write_unique_variant_chunk (current_part,
-                                                 current_part->xattr_set,
-                                                 current_part->xattrs,
-                                                 content_xattrs);
+      guint32 mode;
+
+      mode = g_file_info_get_attribute_uint32 (content_finfo, "unix::mode");
+
+      write_content_mode_xattrs (repo, current_part, content_finfo, content_xattrs,
+                                 &mode_offset, &xattr_offset);
 
       if (S_ISLNK (mode))
         {
@@ -382,105 +396,74 @@ process_one_object (OstreeRepo                       *repo,
 }
 
 typedef struct {
-  GPtrArray *keys;
-  GHashTable *values;
-} OrderedRollsums;
+  char *from_checksum;
+  OstreeRollsumMatches *matches;
+  GBytes *tmp_to;
+} ContentRollsum;
 
 static void
-ordered_rollsums_free (OrderedRollsums  *ohash)
+content_rollsums_free (ContentRollsum  *rollsum)
 {
-  g_ptr_array_unref (ohash->keys);
-  g_hash_table_unref (ohash->values);
-  g_free (ohash);
+  g_free (rollsum->from_checksum);
+  _ostree_rollsum_matches_free (rollsum->matches);
+  g_bytes_unref (rollsum->tmp_to);
+  g_free (rollsum);
 }
 
+/* Load a content object, uncompressing it to an unlinked tmpfile
+   that's mmap()'d and suitable for seeking.
+ */
 static gboolean
-rollsum_chunks_crc32 (GInputStream     *istream,
-                      OrderedRollsums **out_rollsums,
-                      GCancellable     *cancellable,
-                      GError          **error)
+get_unpacked_unlinked_content (OstreeRepo       *repo,
+                               const char       *checksum,
+                               GBytes          **out_content,
+                               GFileInfo       **out_finfo,
+                               GCancellable     *cancellable,
+                               GError          **error)
 {
   gboolean ret = FALSE;
-  gsize start = 0;
-  gboolean rollsum_end = FALSE;
-  OrderedRollsums *ret_rollsums = g_new0 (OrderedRollsums, 1);
-  gs_unref_object GBufferedInputStream *bufinput =
-    (GBufferedInputStream*) g_buffered_input_stream_new_sized (istream, ROLLSUM_BLOB_MAX);
-
-  ret_rollsums->keys = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
-  ret_rollsums->values = g_hash_table_new (NULL, NULL);
-
-  while (TRUE)
+  gs_free char *tmpname = g_strdup ("tmpostree-deltaobj-XXXXXX");
+  gs_fd_close int fd = -1;
+  gs_unref_bytes GBytes *ret_content = NULL;
+  gs_unref_object GInputStream *istream = NULL;
+  gs_unref_object GFileInfo *ret_finfo = NULL;
+  gs_unref_object GOutputStream *out = NULL;
+
+  fd = g_mkstemp (tmpname);
+  if (fd == -1)
     {
-      gssize bytes_read;
-      const guint8 *buf;
-      gsize bufsize;
-      int offset, bits;
-
-      bytes_read = g_buffered_input_stream_fill (bufinput, -1, cancellable, error);
-      if (bytes_read == -1)
-        goto out;
-      if (bytes_read == 0)
-        break;
-
-      buf = g_buffered_input_stream_peek_buffer (bufinput, &bufsize);
-
-      if (!rollsum_end)
-        {
-          offset = bupsplit_find_ofs (buf, MIN(G_MAXINT32, bufsize), &bits); 
-          if (offset == 0)
-            {
-              rollsum_end = TRUE;
-              offset = MIN(ROLLSUM_BLOB_MAX, bufsize);
-            }
-          else if (offset > ROLLSUM_BLOB_MAX)
-            offset = ROLLSUM_BLOB_MAX;
-        }
-      else
-        offset = MIN(ROLLSUM_BLOB_MAX, bufsize);
-
-      if (!g_input_stream_skip ((GInputStream*)bufinput, bufsize, cancellable, error))
-        goto out;
-
-      /* Use zlib's crc32 */
-      { guint32 crc = crc32 (0L, NULL, 0);
-        GVariant *val;
-
-        crc = crc32 (crc, buf, offset);
+      gs_set_error_from_errno (error, errno);
+      goto out;
+    }
+  /* Doesn't need a name */
+  (void) unlink (tmpname);
 
-        val = g_variant_ref_sink (g_variant_new ("(utt)", crc, (guint64) start, (guint64)offset));
-        g_ptr_array_add (ret_rollsums->keys, val);
-        g_hash_table_insert (ret_rollsums->values, GUINT_TO_POINTER (crc), val);
-      }
+  if (!ostree_repo_load_file (repo, checksum, &istream, &ret_finfo, NULL,
+                              cancellable, error))
+    goto out;
 
-      start += offset;
+  if (g_file_info_get_file_type (ret_finfo) != G_FILE_TYPE_REGULAR)
+    {
+      ret = TRUE;
+      goto out;
     }
+  
+  out = g_unix_output_stream_new (fd, FALSE);
+  if (g_output_stream_splice (out, istream, G_OUTPUT_STREAM_SPLICE_CLOSE_TARGET,
+                              cancellable, error) < 0)
+    goto out;
+  
+  { GMappedFile *mfile = g_mapped_file_new_from_fd (fd, FALSE, error);
+    ret_content = g_mapped_file_get_bytes (mfile);
+    g_mapped_file_unref (mfile);
+  }
 
   ret = TRUE;
-  gs_transfer_out_value (out_rollsums, &ret_rollsums);
+  gs_transfer_out_value (out_content, &ret_content);
  out:
-  if (ret_rollsums)
-    ordered_rollsums_free (ret_rollsums);
   return ret;
 }
 
-typedef struct {
-  char *from_checksum;
-  OrderedRollsums *from_rollsums;
-  OrderedRollsums *to_rollsums;
-  guint match_ratio;
-  guint64 match_size;
-} ContentRollsum;
-
-static void
-content_rollsums_free (ContentRollsum  *rollsum)
-{
-  g_free (rollsum->from_checksum);
-  ordered_rollsums_free (rollsum->from_rollsums);
-  ordered_rollsums_free (rollsum->to_rollsums);
-  g_free (rollsum);
-}
-
 static gboolean
 try_content_rollsum (OstreeRepo                       *repo,
                      const char                       *from,
@@ -490,85 +473,203 @@ try_content_rollsum (OstreeRepo                       *repo,
                      GError                          **error)
 {
   gboolean ret = FALSE;
-  OrderedRollsums *from_rollsum = NULL;
-  OrderedRollsums *to_rollsum = NULL;
-  gs_unref_object GInputStream *from_istream = NULL;
+  gs_unref_hashtable GHashTable *from_rollsum = NULL;
+  gs_unref_hashtable GHashTable *to_rollsum = NULL;
+  gs_unref_bytes GBytes *tmp_from = NULL;
+  gs_unref_bytes GBytes *tmp_to = NULL;
   gs_unref_object GFileInfo *from_finfo = NULL;
-  gs_unref_object GInputStream *to_istream = NULL;
   gs_unref_object GFileInfo *to_finfo = NULL;
+  OstreeRollsumMatches *matches;
   ContentRollsum *ret_rollsum = NULL;
-  guint total = 0;
-  guint matches = 0;
-  guint match_ratio = 0;
-  guint64 match_size = 0;
-  gpointer hkey, hvalue;
-  GHashTableIter hiter;
 
   *out_rollsum = NULL;
 
-  if (!ostree_repo_load_file (repo, from, &from_istream, &from_finfo, NULL,
-                              cancellable, error))
+  /* Load the content objects, splice them to uncompressed temporary files that
+   * we can just mmap() and seek around in conveniently.
+   */
+  if (!get_unpacked_unlinked_content (repo, from, &tmp_from, &from_finfo,
+                                      cancellable, error))
     goto out;
-  if (!ostree_repo_load_file (repo, to, &to_istream, &to_finfo, NULL,
-                              cancellable, error))
+  if (!get_unpacked_unlinked_content (repo, to, &tmp_to, &to_finfo,
+                                      cancellable, error))
     goto out;
 
   /* Only try to rollsum regular files obviously */ 
-  if (!(g_file_info_get_file_type (from_finfo) == G_FILE_TYPE_REGULAR
-        && g_file_info_get_file_type (to_finfo) == G_FILE_TYPE_REGULAR))
+  if (!(tmp_from && tmp_to))
     {
       ret = TRUE;
       goto out;
     }
 
-  g_assert (from_istream && to_istream);
+  matches = _ostree_compute_rollsum_matches (tmp_from, tmp_to);
 
-  if (!rollsum_chunks_crc32 (from_istream, &from_rollsum, cancellable, error))
-    goto out;
-  if (!rollsum_chunks_crc32 (to_istream, &to_rollsum, cancellable, error))
-    goto out;
+  { guint match_ratio = (matches->bufmatches*100)/matches->total;
 
-  g_clear_object (&from_istream);
-  g_clear_object (&to_istream);
+    /* Only proceed if the file contains (arbitrary) more than 25% of
+     * the previous chunks.
+     */
+    if (match_ratio < 25)
+      {
+        ret = TRUE;
+        goto out;
+      }
+  }
+
+  g_printerr ("rollsum for %s; crcs=%u bufs=%u total=%u matchsize=%llu\n",
+              to, matches->crcmatches,
+              matches->bufmatches,
+              matches->total, (unsigned long long)matches->match_size);
+
+  ret_rollsum = g_new0 (ContentRollsum, 1);
+  ret_rollsum->from_checksum = g_strdup (from);
+  ret_rollsum->matches = matches; matches = NULL;
+  ret_rollsum->tmp_to = tmp_to; tmp_to = NULL;
+  
+  ret = TRUE;
+  gs_transfer_out_value (out_rollsum, &ret_rollsum);
+ out:
+  if (matches)
+    _ostree_rollsum_matches_free (matches);
+  return ret;
+}
+
+static void
+append_payload_chunk_and_write (OstreeStaticDeltaPartBuilder    *current_part,
+                                const guint8                    *buf,
+                                guint64                          offset)
+{
+  guint64 payload_start;
+
+  payload_start = current_part->payload->len;
+  g_string_append_len (current_part->payload, (char*)buf, offset);
+  g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_WRITE);
+  _ostree_write_varuint64 (current_part->operations, offset);
+  _ostree_write_varuint64 (current_part->operations, payload_start);
+}
+
+static gboolean
+process_one_rollsum (OstreeRepo                       *repo,
+                     OstreeStaticDeltaBuilder         *builder,
+                     OstreeStaticDeltaPartBuilder    **current_part_val,
+                     const char                       *to_checksum,
+                     ContentRollsum                   *rollsum,
+                     GCancellable                     *cancellable,
+                     GError                          **error)
+{
+  gboolean ret = FALSE;
+  guint64 content_size;
+  gs_unref_object GInputStream *content_stream = NULL;
+  gs_unref_object GFileInfo *content_finfo = NULL;
+  gs_unref_variant GVariant *content_xattrs = NULL;
+  OstreeStaticDeltaPartBuilder *current_part = *current_part_val;
+  const guint8 *tmp_to_buf;
+  gsize tmp_to_len;
 
-  g_hash_table_iter_init (&hiter, to_rollsum->values);
-  while (g_hash_table_iter_next (&hiter, &hkey, &hvalue))
+  /* Check to see if this delta has gone over maximum size */
+  if (current_part->objects->len > 0 &&
+      current_part->payload->len > builder->max_chunk_size_bytes)
     {
-      GVariant *chunk = hvalue;
-      if (g_hash_table_contains (from_rollsum->values, hkey))
+      *current_part_val = current_part = allocate_part (builder);
+    } 
+
+  tmp_to_buf = g_bytes_get_data (rollsum->tmp_to, &tmp_to_len);
+
+  if (!ostree_repo_load_file (repo, to_checksum, &content_stream,
+                              &content_finfo, &content_xattrs,
+                              cancellable, error))
+    goto out;
+  content_size = g_file_info_get_size (content_finfo);
+  g_assert_cmpint (tmp_to_len, ==, content_size);
+
+  current_part->uncompressed_size += content_size;
+
+  g_ptr_array_add (current_part->objects, ostree_object_name_serialize (to_checksum, 
OSTREE_OBJECT_TYPE_FILE));
+
+  { gsize mode_offset, xattr_offset, from_csum_offset;
+    gboolean reading_payload = TRUE;
+    guchar source_csum[32];
+    guint i;
+
+    write_content_mode_xattrs (repo, current_part, content_finfo, content_xattrs,
+                               &mode_offset, &xattr_offset);
+
+    /* Write the origin checksum */
+    ostree_checksum_inplace_to_bytes (rollsum->from_checksum, source_csum);
+    from_csum_offset = current_part->payload->len;
+    g_string_append_len (current_part->payload, (char*)source_csum, sizeof (source_csum));
+
+    g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_OPEN);
+    _ostree_write_varuint64 (current_part->operations, mode_offset);
+    _ostree_write_varuint64 (current_part->operations, xattr_offset);
+    _ostree_write_varuint64 (current_part->operations, content_size);
+
+    { guint64 writing_offset = 0;
+      guint64 offset = 0, to_start = 0, from_start = 0;
+      GPtrArray *matchlist = rollsum->matches->matches;
+
+      g_assert (matchlist->len > 0);
+      for (i = 0; i < matchlist->len; i++)
         {
-          guint64 offset;
-          g_variant_get (chunk, "(utt)", NULL, NULL, &offset);
-          matches++;
-          match_size += offset;
+          GVariant *match = matchlist->pdata[i];
+          guint32 crc;
+          guint64 prefix;
+          
+          g_variant_get (match, "(uttt)", &crc, &offset, &to_start, &from_start);
+
+          prefix = to_start - writing_offset;
+
+          if (prefix > 0)
+            {
+              if (!reading_payload)
+                {
+                  g_string_append_c (current_part->operations, 
(gchar)OSTREE_STATIC_DELTA_OP_UNSET_READ_SOURCE);
+                  reading_payload = TRUE;
+                }
+              
+              g_assert_cmpint (writing_offset + prefix, <=, tmp_to_len);
+              append_payload_chunk_and_write (current_part, tmp_to_buf + writing_offset, prefix);
+              writing_offset += prefix;
+            }
+
+          if (reading_payload)
+            {
+              g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_SET_READ_SOURCE);
+              _ostree_write_varuint64 (current_part->operations, from_csum_offset);
+              reading_payload = FALSE;
+            }
+
+          g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_WRITE);
+          _ostree_write_varuint64 (current_part->operations, offset);
+          _ostree_write_varuint64 (current_part->operations, from_start);
+          writing_offset += offset;
         }
-      total++;
-    }
 
-  match_ratio = (matches*100)/total;
+      if (!reading_payload)
+        {
+          g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_UNSET_READ_SOURCE);
+          reading_payload = TRUE;
+        }
+      
+      { guint64 remainder = tmp_to_len - writing_offset;
+        if (remainder > 0)
+          append_payload_chunk_and_write (current_part, tmp_to_buf + writing_offset, remainder);
+        writing_offset += remainder;
+        g_assert_cmpint (writing_offset, ==, tmp_to_len);
+      }
 
-  /* Only proceed if the file contains (arbitrary) more than 25% of
-   * the previous chunks.
-   */
-  if (match_ratio < 25)
-    {
-      ret = TRUE;
-      goto out;
+      g_assert_cmpint (writing_offset, ==, content_size);
     }
 
-  ret_rollsum = g_new0 (ContentRollsum, 1);
-  ret_rollsum->match_ratio = match_ratio;
-  ret_rollsum->match_size = match_size;
-  ret_rollsum->from_checksum = g_strdup (from);
-  ret_rollsum->from_rollsums = from_rollsum; from_rollsum = NULL;
-  ret_rollsum->to_rollsums = to_rollsum; to_rollsum = NULL;
-  
+
+    g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_CLOSE);
+  }
+
   ret = TRUE;
-  gs_transfer_out_value (out_rollsum, &ret_rollsum);
  out:
   return ret;
 }
 
+
 static gboolean 
 generate_delta_lowlatency (OstreeRepo                       *repo,
                            const char                       *from,
@@ -694,16 +795,47 @@ generate_delta_lowlatency (OstreeRepo                       *repo,
         continue;
 
       g_hash_table_insert (rollsum_optimized_content_objects, g_strdup (to_checksum), rollsum);
-      builder->rollsum_size += rollsum->match_size;
+      builder->rollsum_size += rollsum->matches->match_size;
     }
 
   g_printerr ("rollsum for %u/%u modified\n",
               g_hash_table_size (rollsum_optimized_content_objects),
               g_hash_table_size (modified_content_objects));
 
+  current_part = allocate_part (builder);
+
+  /* Pack the metadata first */
+  g_hash_table_iter_init (&hashiter, new_reachable_metadata);
+  while (g_hash_table_iter_next (&hashiter, &key, &value))
+    {
+      GVariant *serialized_key = key;
+      const char *checksum;
+      OstreeObjectType objtype;
+
+      ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
+
+      if (!process_one_object (repo, builder, &current_part,
+                               checksum, objtype,
+                               cancellable, error))
+        goto out;
+    }
+
+  /* Now do rollsummed objects */
+
+  g_hash_table_iter_init (&hashiter, rollsum_optimized_content_objects);
+  while (g_hash_table_iter_next (&hashiter, &key, &value))
+    {
+      const char *checksum = key;
+      ContentRollsum *rollsum = value;
+
+      if (!process_one_rollsum (repo, builder, &current_part,
+                               checksum, rollsum,
+                               cancellable, error))
+        goto out;
+    }
+
   /* Scan for large objects, so we can fall back to plain HTTP-based
-   * fetch.  In the future this should come after an rsync-style
-   * rolling delta check for modified files.
+   * fetch.
    */
   g_hash_table_iter_init (&hashiter, new_reachable_content);
   while (g_hash_table_iter_next (&hashiter, &key, &value))
@@ -716,6 +848,10 @@ generate_delta_lowlatency (OstreeRepo                       *repo,
 
       ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
 
+      /* Skip content objects we rollsum'd */
+      if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
+        continue;
+
       if (!ostree_repo_load_object_stream (repo, objtype, checksum,
                                            NULL, &uncompressed_size,
                                            cancellable, error))
@@ -734,25 +870,7 @@ generate_delta_lowlatency (OstreeRepo                       *repo,
         }
     }
 
-  current_part = allocate_part (builder);
-
-  /* Pack the metadata first */
-  g_hash_table_iter_init (&hashiter, new_reachable_metadata);
-  while (g_hash_table_iter_next (&hashiter, &key, &value))
-    {
-      GVariant *serialized_key = key;
-      const char *checksum;
-      OstreeObjectType objtype;
-
-      ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
-
-      if (!process_one_object (repo, builder, &current_part,
-                               checksum, objtype,
-                               cancellable, error))
-        goto out;
-    }
-
-  /* Now content */
+  /* Now non-rollsummed content */
   g_hash_table_iter_init (&hashiter, new_reachable_content);
   while (g_hash_table_iter_next (&hashiter, &key, &value))
     {
@@ -762,6 +880,10 @@ generate_delta_lowlatency (OstreeRepo                       *repo,
 
       ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
 
+      /* Skip content objects we rollsum'd */
+      if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
+        continue;
+
       if (!process_one_object (repo, builder, &current_part,
                                checksum, objtype,
                                cancellable, error))
diff --git a/src/libostree/ostree-repo-static-delta-private.h 
b/src/libostree/ostree-repo-static-delta-private.h
index 09d4898..2478f16 100644
--- a/src/libostree/ostree-repo-static-delta-private.h
+++ b/src/libostree/ostree-repo-static-delta-private.h
@@ -135,7 +135,8 @@ typedef enum {
   OSTREE_STATIC_DELTA_OP_OPEN_SPLICE_AND_CLOSE = 'S',
   OSTREE_STATIC_DELTA_OP_OPEN = 'o',
   OSTREE_STATIC_DELTA_OP_WRITE = 'w',
-  OSTREE_STATIC_DELTA_OP_SET_READ_SOURCE = 'R',
+  OSTREE_STATIC_DELTA_OP_SET_READ_SOURCE = 'r',
+  OSTREE_STATIC_DELTA_OP_UNSET_READ_SOURCE = 'R',
   OSTREE_STATIC_DELTA_OP_CLOSE = 'c'
 } OstreeStaticDeltaOpCode;
 
diff --git a/src/libostree/ostree-repo-static-delta-processing.c 
b/src/libostree/ostree-repo-static-delta-processing.c
index 1cb66de..b268fb2 100644
--- a/src/libostree/ostree-repo-static-delta-processing.c
+++ b/src/libostree/ostree-repo-static-delta-processing.c
@@ -98,6 +98,7 @@ OPPROTO(open_splice_and_close)
 OPPROTO(open)
 OPPROTO(write)
 OPPROTO(set_read_source)
+OPPROTO(unset_read_source)
 OPPROTO(close)
 #undef OPPROTO
 
@@ -250,6 +251,10 @@ _ostree_static_delta_part_execute_raw (OstreeRepo      *repo,
           if (!dispatch_set_read_source (repo, state, cancellable, error))
             goto out;
           break;
+        case OSTREE_STATIC_DELTA_OP_UNSET_READ_SOURCE:
+          if (!dispatch_unset_read_source (repo, state, cancellable, error))
+            goto out;
+          break;
         case OSTREE_STATIC_DELTA_OP_CLOSE:
           if (!dispatch_close (repo, state, cancellable, error))
             goto out;
@@ -491,7 +496,7 @@ dispatch_open_splice_and_close (OstreeRepo                 *repo,
 
   if (!open_output_target (state, cancellable, error))
     goto out;
-  
+
   if (OSTREE_OBJECT_TYPE_IS_META (state->output_objtype))
     {
       gs_unref_variant GVariant *metadata = NULL;
@@ -519,7 +524,6 @@ dispatch_open_splice_and_close (OstreeRepo                 *repo,
     {
       guint64 content_offset;
       guint64 objlen;
-      guint64 content_size;
       gsize bytes_written;
       gs_unref_object GInputStream *object_input = NULL;
       gs_unref_object GInputStream *memin = NULL;
@@ -527,11 +531,11 @@ dispatch_open_splice_and_close (OstreeRepo                 *repo,
       if (!do_content_open_generic (repo, state, cancellable, error))
         goto out;
 
-      if (!read_varuint64 (state, &content_size, error))
+      if (!read_varuint64 (state, &state->content_size, error))
         goto out;
       if (!read_varuint64 (state, &content_offset, error))
         goto out;
-      if (!validate_ofs (state, content_offset, content_size, error))
+      if (!validate_ofs (state, content_offset, state->content_size, error))
         goto out;
       
       /* Fast path for regular files to bare repositories */
@@ -551,7 +555,7 @@ dispatch_open_splice_and_close (OstreeRepo                 *repo,
             {
               if (!g_output_stream_write_all (state->content_out,
                                               state->payload_data + content_offset,
-                                              content_size,
+                                              state->content_size,
                                               &bytes_written,
                                               cancellable, error))
                 goto out;
@@ -567,14 +571,14 @@ dispatch_open_splice_and_close (OstreeRepo                 *repo,
           if (S_ISLNK (state->mode))
             {
               gs_free char *nulterminated_target =
-                g_strndup ((char*)state->payload_data + content_offset, content_size);
+                g_strndup ((char*)state->payload_data + content_offset, state->content_size);
               g_file_info_set_symlink_target (finfo, nulterminated_target);
             }
           else
             {
               g_assert (S_ISREG (state->mode));
-              g_file_info_set_size (finfo, content_size);
-              memin = g_memory_input_stream_new_from_data (state->payload_data + content_offset, 
content_size, NULL);
+              g_file_info_set_size (finfo, state->content_size);
+              memin = g_memory_input_stream_new_from_data (state->payload_data + content_offset, 
state->content_size, NULL);
             }
 
           if (!ostree_raw_file_to_content_stream (memin, finfo, state->xattrs,
@@ -621,6 +625,9 @@ dispatch_open (OstreeRepo                 *repo,
   if (!do_content_open_generic (repo, state, cancellable, error))
     goto out;
 
+  if (!read_varuint64 (state, &state->content_size, error))
+    goto out;
+
   if (!_ostree_repo_open_trusted_content_bare (repo, state->checksum,
                                                state->content_size,
                                                &state->barecommitstate,
@@ -651,8 +658,6 @@ dispatch_write (OstreeRepo                 *repo,
     goto out;
   if (!read_varuint64 (state, &content_offset, error))
     goto out;
-  if (!validate_ofs (state, content_offset, content_size, error))
-    goto out;
 
   if (!state->have_obj)
     {
@@ -695,6 +700,9 @@ dispatch_write (OstreeRepo                 *repo,
         }
       else
         {
+          if (!validate_ofs (state, content_offset, content_size, error))
+            goto out;
+
           if (!g_output_stream_write_all (state->content_out,
                                           state->payload_data + content_offset,
                                           content_size,
@@ -746,6 +754,29 @@ dispatch_set_read_source (OstreeRepo                 *repo,
 }
 
 static gboolean
+dispatch_unset_read_source (OstreeRepo                 *repo,
+                            StaticDeltaExecutionState  *state,
+                            GCancellable               *cancellable,  
+                            GError                    **error)
+{
+  gboolean ret = FALSE;
+
+  if (state->read_source_fd)
+    {
+      (void) close (state->read_source_fd);
+      state->read_source_fd = -1;
+    }
+
+  g_clear_pointer (&state->read_source_object, g_free);
+  
+  ret = TRUE;
+  /* out: */
+  if (!ret)
+    g_prefix_error (error, "opcode unset-read-source: ");
+  return ret;
+}
+
+static gboolean
 dispatch_close (OstreeRepo                 *repo,
                 StaticDeltaExecutionState  *state,
                 GCancellable               *cancellable,  
@@ -765,14 +796,10 @@ dispatch_close (OstreeRepo                 *repo,
         goto out;
     }
 
-  if (state->read_source_fd)
-    {
-      (void) close (state->read_source_fd);
-      state->read_source_fd = -1;
-    }
+  if (!dispatch_unset_read_source (repo, state, cancellable, error))
+    goto out;
       
   g_clear_pointer (&state->xattrs, g_variant_unref);
-  g_clear_pointer (&state->read_source_object, g_free);
   g_clear_object (&state->content_out);
   
   state->checksum_index++;
diff --git a/src/libostree/ostree-rollsum.c b/src/libostree/ostree-rollsum.c
new file mode 100644
index 0000000..5a57e1c
--- /dev/null
+++ b/src/libostree/ostree-rollsum.c
@@ -0,0 +1,201 @@
+/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*-
+ *
+ * Copyright (C) 2015 Colin Walters <walters verbum org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <zlib.h>
+
+#include "ostree-rollsum.h"
+#include "libgsystem.h"
+#include "bupsplit.h"
+
+#define ROLLSUM_BLOB_MAX (8192*4)
+
+static GHashTable *
+rollsum_chunks_crc32 (GBytes           *bytes)
+{
+  gsize start = 0;
+  gboolean rollsum_end = FALSE;
+  GHashTable *ret_rollsums = NULL;
+  const guint8 *buf;
+  gsize buflen;
+  gsize remaining;
+
+  ret_rollsums = g_hash_table_new_full (NULL, NULL, NULL, (GDestroyNotify)g_ptr_array_unref);
+
+  buf = g_bytes_get_data (bytes, &buflen);
+
+  remaining = buflen;
+  while (remaining > 0)
+    {
+      int offset, bits;
+
+      if (!rollsum_end)
+        {
+          offset = bupsplit_find_ofs (buf + start, MIN(G_MAXINT32, remaining), &bits); 
+          if (offset == 0)
+            {
+              rollsum_end = TRUE;
+              offset = MIN(ROLLSUM_BLOB_MAX, remaining);
+            }
+          else if (offset > ROLLSUM_BLOB_MAX)
+            offset = ROLLSUM_BLOB_MAX;
+        }
+      else
+        offset = MIN(ROLLSUM_BLOB_MAX, remaining);
+
+      /* Use zlib's crc32 */
+      { guint32 crc = crc32 (0L, NULL, 0);
+        GVariant *val;
+        GPtrArray *matches;
+
+        crc = crc32 (crc, buf, offset);
+
+        val = g_variant_ref_sink (g_variant_new ("(utt)", crc, (guint64) start, (guint64)offset));
+        matches = g_hash_table_lookup (ret_rollsums, GUINT_TO_POINTER (crc));
+        if (!matches)
+          {
+            matches = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
+            g_hash_table_insert (ret_rollsums, GUINT_TO_POINTER (crc), matches);
+          }
+        g_ptr_array_add (matches, val);
+      }
+
+      start += offset;
+      remaining -= offset;
+    }
+
+  return ret_rollsums;
+}
+
+static gint
+compare_matches (const void *app,
+                 const void *bpp)
+{
+  GVariant **avpp = (GVariant**)app;
+  GVariant *a = *avpp;
+  GVariant **bvpp = (GVariant**)bpp;
+  GVariant *b = *bvpp;
+  guint64 a_start, b_start;
+  
+  g_variant_get_child (a, 2, "t", &a_start);
+  g_variant_get_child (b, 2, "t", &b_start);
+
+  g_assert_cmpint (a_start, !=, b_start);
+
+  if (a_start < b_start)
+    return -1;
+  return 1;
+}
+
+OstreeRollsumMatches *
+_ostree_compute_rollsum_matches (GBytes                           *from,
+                                 GBytes                           *to)
+{
+  OstreeRollsumMatches *ret_rollsum = NULL;
+  gs_unref_hashtable GHashTable *from_rollsum = NULL;
+  gs_unref_hashtable GHashTable *to_rollsum = NULL;
+  gs_unref_ptrarray GPtrArray *matches = NULL;
+  const guint8 *from_buf;
+  gsize from_len;
+  const guint8 *to_buf;
+  gsize to_len;
+  gpointer hkey, hvalue;
+  GHashTableIter hiter;
+
+  ret_rollsum = g_new0 (OstreeRollsumMatches, 1);
+
+  matches = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
+
+  from_buf = g_bytes_get_data (from, &from_len);
+  to_buf = g_bytes_get_data (to, &to_len);
+
+  from_rollsum = rollsum_chunks_crc32 (from);
+  to_rollsum = rollsum_chunks_crc32 (to);
+
+  g_hash_table_iter_init (&hiter, to_rollsum);
+  while (g_hash_table_iter_next (&hiter, &hkey, &hvalue))
+    {
+      GPtrArray *to_chunks = hvalue;
+      GPtrArray *from_chunks;
+
+      from_chunks = g_hash_table_lookup (from_rollsum, hkey);
+      if (from_chunks != NULL)
+        {
+          guint i;
+
+          ret_rollsum->crcmatches++;
+
+          for (i = 0; i < to_chunks->len; i++)
+            {
+              GVariant *to_chunk = to_chunks->pdata[i];
+              guint64 to_start, to_offset;
+              guint32 tocrc;
+              guint j;
+
+              g_variant_get (to_chunk, "(utt)", &tocrc, &to_start, &to_offset);
+
+              for (j = 0; j < from_chunks->len; j++)
+                {
+                  GVariant *from_chunk = from_chunks->pdata[j];
+                  guint32 fromcrc;
+                  guint64 from_start, from_offset;
+
+                  g_variant_get (from_chunk, "(utt)", &fromcrc, &from_start, &from_offset);
+
+                  g_assert (fromcrc == tocrc);
+                  g_assert (to_offset == from_offset);
+                  
+                  /* Rsync uses a cryptographic checksum, but let's be
+                   * very conservative here and just memcmp.
+                   */
+                  if (memcmp (from_buf + from_start, to_buf + to_start, to_offset) == 0)
+                    {
+                      GVariant *match = g_variant_new ("(uttt)", fromcrc, to_offset, to_start, from_start);
+                      ret_rollsum->bufmatches++;
+                      ret_rollsum->match_size += to_offset;
+                      g_ptr_array_add (matches, g_variant_ref_sink (match));
+                      break; /* Don't need any more matches */
+                    } 
+                }
+            }
+        }
+
+      ret_rollsum->total += to_chunks->len;
+    }
+
+  g_ptr_array_sort (matches, compare_matches);
+
+  ret_rollsum->from_rollsums = from_rollsum; from_rollsum = NULL;
+  ret_rollsum->to_rollsums = to_rollsum; to_rollsum = NULL;
+  ret_rollsum->matches = matches; matches = NULL;
+
+  return ret_rollsum;
+}
+
+void
+_ostree_rollsum_matches_free (OstreeRollsumMatches *rollsum)
+{
+  g_hash_table_unref (rollsum->to_rollsums);
+  g_hash_table_unref (rollsum->from_rollsums);
+  g_ptr_array_unref (rollsum->matches);
+  g_free (rollsum);
+}
diff --git a/src/libostree/ostree-rollsum.h b/src/libostree/ostree-rollsum.h
new file mode 100644
index 0000000..37003d8
--- /dev/null
+++ b/src/libostree/ostree-rollsum.h
@@ -0,0 +1,44 @@
+/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*-
+ *
+ * Copyright (C) 2015 Colin Walters <walters verbum org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#pragma once
+
+#include <gio/gio.h>
+
+G_BEGIN_DECLS
+
+typedef struct {
+  GHashTable *from_rollsums;
+  GHashTable *to_rollsums;
+  guint crcmatches;
+  guint bufmatches;
+  guint total;
+  guint64 match_size;
+  GPtrArray *matches;
+} OstreeRollsumMatches;
+
+OstreeRollsumMatches *
+_ostree_compute_rollsum_matches (GBytes                           *from,
+                                 GBytes                           *to);
+
+void _ostree_rollsum_matches_free (OstreeRollsumMatches *rollsum);
+
+G_END_DECLS
+
diff --git a/tests/test-rollsum.c b/tests/test-rollsum.c
index 1b9174c..e79c895 100644
--- a/tests/test-rollsum.c
+++ b/tests/test-rollsum.c
@@ -20,138 +20,51 @@
 
 #include "config.h"
 
-#include "libgsystem.h"
-
-#include "bupsplit.h"
-
-#define BLOB_MAX (8192*4)
-
-static GPtrArray *
-rollsum_checksums_for_data (GBytes     *bytes)
-{
-  const guint8 *start;
-  gsize len;
-  gboolean rollsum_end = FALSE;
-  GPtrArray *ret = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
-
-  start = g_bytes_get_data (bytes, &len);
-  while (len > 0)
-    {
-      int offset, bits;
-      if (!rollsum_end)
-        {
-          offset = bupsplit_find_ofs (start, MIN(G_MAXINT32, len), &bits); 
-          if (offset == 0)
-            {
-              rollsum_end = TRUE;
-              offset = MIN(BLOB_MAX, len);
-            }
-          else if (offset > BLOB_MAX)
-            offset = BLOB_MAX;
-        }
-      else
-        offset = MIN(BLOB_MAX, len);
-
-      {
-        gs_free char *blobcsum =
-          g_compute_checksum_for_data (G_CHECKSUM_SHA256,
-                                       start, offset);
-        g_ptr_array_add (ret, g_variant_ref_sink (g_variant_new ("(st)",
-                                                                 blobcsum, (guint64)offset)));
-      }
-      start += offset;
-      len -= offset;
-    }
-  return ret;
-}
+#include "ostree-rollsum.h"
+#include <unistd.h>
+#include <stdlib.h>
 
-static void
-print_rollsums (GPtrArray  *rollsums)
-{
-  guint i;
-  for (i = 0; i < rollsums->len; i++)
-    {
-      GVariant *sum = rollsums->pdata[i];
-      const char *csum;
-      guint64 val;
-      g_variant_get (sum, "(&st)", &csum, &val);
-      g_print ("chunk %s %" G_GUINT64_FORMAT "\n", csum, val);
-    }
-}
+#include "libgsystem.h"
 
 int
 main (int argc, char **argv)
 {
-  GCancellable *cancellable = NULL;
   GError *local_error = NULL;
   GError **error = &local_error;
-  gs_unref_object GFile *path = NULL;
-  GBytes *bytes = NULL;
+  GBytes *from_bytes = NULL;
+  GBytes *to_bytes = NULL;
+  const char *from_path;
+  const char *to_path;
+  OstreeRollsumMatches *matches;
+  GMappedFile *mfile;
 
   g_setenv ("GIO_USE_VFS", "local", TRUE);
 
-  if (argc == 2)
-    {
-      gs_unref_ptrarray GPtrArray *rollsums = NULL;
+  if (argc < 3)
+    exit (1);
 
-      path = g_file_new_for_path (argv[1]);
-      bytes = gs_file_map_readonly (path, cancellable, error);
-      if (!bytes)
-       goto out;
+  from_path = argv[1];
+  to_path = argv[2];
 
-      rollsums = rollsum_checksums_for_data (bytes);
-      print_rollsums (rollsums);
-    }
-  else if (argc > 2)
-    {
-      guint i;
-      gs_unref_hashtable GHashTable *sums = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
-      guint64 input_size = 0;
-      guint64 rollsum_size = 0;
+  mfile = g_mapped_file_new (from_path, FALSE, error);
+  if (!mfile)
+    goto out;
+  from_bytes = g_mapped_file_get_bytes (mfile);
+  g_mapped_file_unref (mfile);
+  mfile = g_mapped_file_new (to_path, FALSE, error);
+  if (!mfile)
+    goto out;
+  to_bytes = g_mapped_file_get_bytes (mfile);
+  g_mapped_file_unref (mfile);
 
-      for (i = 1; i < argc; i++)
-        {
-          guint j;
-          gs_unref_ptrarray GPtrArray *rollsums = NULL;
-          guint64 this_rollsum_size = 0;
+  matches = _ostree_compute_rollsum_matches (from_bytes, to_bytes);
 
-          path = g_file_new_for_path (argv[i]);
-          bytes = gs_file_map_readonly (path, cancellable, error);
-          if (!bytes)
-            goto out;
-
-          input_size += g_bytes_get_size (bytes);
-          
-          g_print ("input: %s size: %" G_GUINT64_FORMAT "\n", argv[i], g_bytes_get_size (bytes));
-
-          rollsums = rollsum_checksums_for_data (bytes);
-          print_rollsums (rollsums);
-          for (j = 0; j < rollsums->len; j++)
-            {
-              GVariant *sum = rollsums->pdata[j];
-              const char *csum;
-              guint64 ofs;
-              g_variant_get (sum, "(&st)", &csum, &ofs);
-              if (!g_hash_table_contains (sums, csum))
-                {
-                  g_hash_table_add (sums, g_strdup (csum));
-                  rollsum_size += ofs;
-                }
-              this_rollsum_size += ofs;
-            }
-          g_print ("input: rollsum size: %" G_GUINT64_FORMAT "\n", this_rollsum_size);
-        }
-      g_print ("rollsum total:%u input:%" G_GUINT64_FORMAT " output: %" G_GUINT64_FORMAT " speedup:%f\n",
-               g_hash_table_size (sums), input_size, rollsum_size,
-               (((double)(input_size+1)) / ((double) rollsum_size + 1)));
-    }
-  else
-    {
-      bupsplit_selftest ();
-    }
+  g_printerr ("rollsum crcs=%u bufs=%u total=%u matchsize=%llu\n",
+              matches->crcmatches,
+              matches->bufmatches,
+              matches->total, (unsigned long long)matches->match_size);
 
  out:
-  g_clear_pointer (&bytes, g_bytes_unref);
   if (local_error)
     {
       g_printerr ("%s\n", local_error->message);



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]