[ostree/wip/delta: 28/29] core: Import bup's "rollsum" code, add a test case



commit f5d005ba56075f8f7ac641aa1c9e1cf0566898bc
Author: Colin Walters <walters verbum org>
Date:   Thu Aug 15 09:14:26 2013 -0400

    core: Import bup's "rollsum" code, add a test case
    
    For static deltas, one strategy that will be employed is to split each
    object into chunks, and only include changed chunks in the deltas.

 Makefile-tests.am        |    5 ++
 src/libostree/bupsplit.c |  152 ++++++++++++++++++++++++++++++++++++++++++++++
 src/libostree/bupsplit.h |   49 +++++++++++++++
 tests/test-rollsum.c     |   79 ++++++++++++++++++++++++
 4 files changed, 285 insertions(+), 0 deletions(-)
---
diff --git a/Makefile-tests.am b/Makefile-tests.am
index e991dab..07b5909 100644
--- a/Makefile-tests.am
+++ b/Makefile-tests.am
@@ -64,4 +64,9 @@ test_varint_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS)
 test_varint_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS)
 testmeta_DATA += test-varint.test
 
+insttest_PROGRAMS += test-rollsum
+test_rollsum_SOURCES = src/libostree/bupsplit.c tests/test-rollsum.c
+test_rollsum_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS)
+test_rollsum_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS)
+
 endif
diff --git a/src/libostree/bupsplit.c b/src/libostree/bupsplit.c
new file mode 100644
index 0000000..067b2e9
--- /dev/null
+++ b/src/libostree/bupsplit.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2011 Avery Pennarun. All rights reserved.
+ * 
+ * (This license applies to bupsplit.c and bupsplit.h only.)
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *    1. Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ * 
+ *    2. Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY AVERY PENNARUN ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "bupsplit.h"
+#include <stdint.h>
+#include <memory.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+// According to librsync/rollsum.h:
+// "We should make this something other than zero to improve the
+// checksum algorithm: tridge suggests a prime number."
+// apenwarr: I unscientifically tried 0 and 7919, and they both ended up
+// slightly worse than the librsync value of 31 for my arbitrary test data.
+#define ROLLSUM_CHAR_OFFSET 31
+
+typedef struct {
+    unsigned s1, s2;
+    uint8_t window[BUP_WINDOWSIZE];
+    int wofs;
+} Rollsum;
+
+
+// These formulas are based on rollsum.h in the librsync project.
+static void rollsum_add(Rollsum *r, uint8_t drop, uint8_t add)
+{
+    r->s1 += add - drop;
+    r->s2 += r->s1 - (BUP_WINDOWSIZE * (drop + ROLLSUM_CHAR_OFFSET));
+}
+
+
+static void rollsum_init(Rollsum *r)
+{
+    r->s1 = BUP_WINDOWSIZE * ROLLSUM_CHAR_OFFSET;
+    r->s2 = BUP_WINDOWSIZE * (BUP_WINDOWSIZE-1) * ROLLSUM_CHAR_OFFSET;
+    r->wofs = 0;
+    memset(r->window, 0, BUP_WINDOWSIZE);
+}
+
+
+// For some reason, gcc 4.3 (at least) optimizes badly if find_ofs()
+// is static and rollsum_roll is an inline function.  Let's use a macro
+// here instead to help out the optimizer.
+#define rollsum_roll(r, ch) do { \
+    rollsum_add((r), (r)->window[(r)->wofs], (ch)); \
+    (r)->window[(r)->wofs] = (ch); \
+    (r)->wofs = ((r)->wofs + 1) % BUP_WINDOWSIZE; \
+} while (0)
+
+
+static uint32_t rollsum_digest(Rollsum *r)
+{
+    return (r->s1 << 16) | (r->s2 & 0xffff);
+}
+
+
+static uint32_t rollsum_sum(uint8_t *buf, size_t ofs, size_t len)
+{
+    size_t count;
+    Rollsum r;
+    rollsum_init(&r);
+    for (count = ofs; count < len; count++)
+       rollsum_roll(&r, buf[count]);
+    return rollsum_digest(&r);
+}
+
+
+int bupsplit_find_ofs(const unsigned char *buf, int len, int *bits)
+{
+    Rollsum r;
+    int count;
+    
+    rollsum_init(&r);
+    for (count = 0; count < len; count++)
+    {
+       rollsum_roll(&r, buf[count]);
+       if ((r.s2 & (BUP_BLOBSIZE-1)) == ((~0) & (BUP_BLOBSIZE-1)))
+       {
+           if (bits)
+           {
+               unsigned rsum = rollsum_digest(&r);
+               *bits = BUP_BLOBBITS;
+               rsum >>= BUP_BLOBBITS;
+               for (*bits = BUP_BLOBBITS; (rsum >>= 1) & 1; (*bits)++)
+                   ;
+           }
+           return count+1;
+       }
+    }
+    return 0;
+}
+
+
+#ifndef BUP_NO_SELFTEST
+#define BUP_SELFTEST_SIZE 100000
+
+int bupsplit_selftest()
+{
+    uint8_t *buf = malloc(BUP_SELFTEST_SIZE);
+    uint32_t sum1a, sum1b, sum2a, sum2b, sum3a, sum3b;
+    unsigned count;
+    
+    srandom(1);
+    for (count = 0; count < BUP_SELFTEST_SIZE; count++)
+       buf[count] = random();
+    
+    sum1a = rollsum_sum(buf, 0, BUP_SELFTEST_SIZE);
+    sum1b = rollsum_sum(buf, 1, BUP_SELFTEST_SIZE);
+    sum2a = rollsum_sum(buf, BUP_SELFTEST_SIZE - BUP_WINDOWSIZE*5/2,
+                       BUP_SELFTEST_SIZE - BUP_WINDOWSIZE);
+    sum2b = rollsum_sum(buf, 0, BUP_SELFTEST_SIZE - BUP_WINDOWSIZE);
+    sum3a = rollsum_sum(buf, 0, BUP_WINDOWSIZE+3);
+    sum3b = rollsum_sum(buf, 3, BUP_WINDOWSIZE+3);
+    
+    fprintf(stderr, "sum1a = 0x%08x\n", sum1a);
+    fprintf(stderr, "sum1b = 0x%08x\n", sum1b);
+    fprintf(stderr, "sum2a = 0x%08x\n", sum2a);
+    fprintf(stderr, "sum2b = 0x%08x\n", sum2b);
+    fprintf(stderr, "sum3a = 0x%08x\n", sum3a);
+    fprintf(stderr, "sum3b = 0x%08x\n", sum3b);
+    
+    free(buf);
+    return sum1a!=sum1b || sum2a!=sum2b || sum3a!=sum3b;
+}
+
+#endif // !BUP_NO_SELFTEST
diff --git a/src/libostree/bupsplit.h b/src/libostree/bupsplit.h
new file mode 100644
index 0000000..e37a56a
--- /dev/null
+++ b/src/libostree/bupsplit.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011 Avery Pennarun. All rights reserved.
+ * 
+ * (This license applies to bupsplit.c and bupsplit.h only.)
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *    1. Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ * 
+ *    2. Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY AVERY PENNARUN ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __BUPSPLIT_H
+#define __BUPSPLIT_H
+
+#define BUP_BLOBBITS (13)
+#define BUP_BLOBSIZE (1<<BUP_BLOBBITS)
+#define BUP_WINDOWBITS (7)
+#define BUP_WINDOWSIZE (1<<(BUP_WINDOWBITS-1))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+    
+int bupsplit_find_ofs(const unsigned char *buf, int len, int *bits);
+int bupsplit_selftest(void);
+
+#ifdef __cplusplus
+}
+#endif
+    
+#endif /* __BUPSPLIT_H */
diff --git a/tests/test-rollsum.c b/tests/test-rollsum.c
new file mode 100644
index 0000000..4d7f50e
--- /dev/null
+++ b/tests/test-rollsum.c
@@ -0,0 +1,79 @@
+/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*-
+ *
+ * Copyright (C) 2013 Colin Walters <walters verbum org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+
+#include "libgsystem.h"
+
+#include "bupsplit.h"
+
+#define BLOB_MAX (8192*4)
+#define BLOB_READ_SIZE (1024*1024)
+
+int
+main (int argc, char **argv)
+{
+  GCancellable *cancellable = NULL;
+  GError *local_error = NULL;
+  GError **error = &local_error;
+  gs_unref_object GFile *path = NULL;
+  GBytes *bytes = NULL;
+
+  g_setenv ("GIO_USE_VFS", "local", TRUE);
+
+  if (argc > 1)
+    {
+      const guint8 *start;
+      gsize len;
+
+      path = g_file_new_for_path (argv[1]);
+      bytes = gs_file_map_readonly (path, cancellable, error);
+      if (!bytes)
+       goto out;
+
+      start = g_bytes_get_data (bytes, &len);
+      while (TRUE)
+       {
+         int offset, bits;
+         offset = bupsplit_find_ofs (start, MIN(G_MAXINT32, len), &bits); 
+         if (offset == 0)
+           break;
+         if (offset > BLOB_MAX)
+           offset = BLOB_MAX;
+          g_print ("%" G_GUINT64_FORMAT "\n", (guint64)offset);
+         start += offset;
+          len -= offset;
+        }
+    }
+  else
+    {
+      bupsplit_selftest ();
+    }
+
+ out:
+  g_clear_pointer (&bytes, g_bytes_unref);
+  if (local_error)
+    {
+      g_printerr ("%s\n", local_error->message);
+      g_error_free (local_error);
+      return 1;
+    }
+  return 0;
+}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]