浏览代码

Switch to a buzhash (cyclic polynomial) chunkify implementation

Jonas Borgström 12 年之前
父节点
当前提交
5b28d428b7
共有 3 个文件被更改,包括 173 次插入46 次删除
  1. 142 41
      darc/_speedups.c
  2. 5 5
      darc/archive.py
  3. 26 0
      darc/test.py

+ 142 - 41
darc/_speedups.c

@@ -1,37 +1,84 @@
 #include <Python.h>
 #include <structmember.h>
+#include <stdint.h>
 
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define ABS(X) ((X) < 0 ? (-(X)) : (X))
+/* Cyclic polynomial / buzhash: https://en.wikipedia.org/wiki/Rolling_hash */
 
-static unsigned long int
-checksum(const unsigned char *data, int len, unsigned long int sum)
+/* Base substitution table for the buzhash: 256 fixed 32-bit values, one per
+ * possible byte value.  It is only ever read; init_buzhash_table() XORs each
+ * entry with the seed to build the table that is actually used for hashing. */
+static const uint32_t table_base[] =
 {
-    unsigned long int s1, s2, i;
-    s1 = sum & 0xffff;
-    s2 = sum >> 16;
-    for(i=0; i < len; i++)
+    0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4,
+    0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436,
+    0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7,
+    0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a,
+    0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613,
+    0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0,
+    0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205,
+    0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7,
+    0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035,
+    0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d,
+    0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355,
+    0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f,
+    0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424,
+    0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86,
+    0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f,
+    0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf,
+    0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9,
+    0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186,
+    0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0,
+    0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35,
+    0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd,
+    0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4,
+    0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f,
+    0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec,
+    0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889,
+    0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4,
+    0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512,
+    0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009,
+    0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2,
+    0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c,
+    0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9,
+    0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b
+};
+
+/* Rotate the 32-bit unsigned value v left by (shift mod 32) bits.  The right
+ * shift count is masked as well, so that a rotation by a multiple of 32 never
+ * shifts by the full width of the type (which is undefined behaviour in C);
+ * in that case both shift counts are 0 and the result is simply v. */
+#define BARREL_SHIFT(v, shift) ( ((v) << ((shift) & 0x1f)) | ((v) >> ((32 - ((shift) & 0x1f)) & 0x1f)) )
+
+
+/*
+ * Build a seeded buzhash lookup table.
+ *
+ * Allocates a 256-entry table in which entry i is table_base[i] XOR seed.
+ * Returns the new table, which the caller must release with free(), or NULL
+ * if the allocation fails.
+ */
+static uint32_t *
+init_buzhash_table(uint32_t seed)
+{
+    int i;
+    /* One uint32_t per possible byte value. */
+    uint32_t *table = malloc(256 * sizeof *table);
+    if(table == NULL)
+    {
+        return NULL;
+    }
+    for(i = 0; i < 256; i++)
+    {
+        table[i] = table_base[i] ^ seed;
+    }
+    return table;
+}
+
+/*
+ * Compute the buzhash of the len bytes starting at data, using lookup table h.
+ *
+ * The byte at offset k contributes h[data[k]] rotated left by (len - 1 - k)
+ * bits; all contributions are XORed together.  An empty or negative-length
+ * input hashes to 0 and data is not read at all in that case.
+ */
+static uint32_t
+buzhash(const unsigned char *data, int len, const uint32_t *h)
+{
+    int i;
+    uint32_t sum = 0;
+    if(len <= 0)
+    {
+        return 0;
+    }
+    for(i = len - 1; i > 0; i--)
     {
-        s1 += data[i] + 1;
-        s2 += s1;
+        sum ^= BARREL_SHIFT(h[*data], i);
+        data++;
     }
-    return ((s2 & 0xffff) << 16) | (s1 & 0xffff);
+    /* The last byte is rotated by 0 bits, i.e. used as-is. */
+    return sum ^ h[*data];
 }
 
-static unsigned long int
-roll_checksum(unsigned long int sum, unsigned char remove, unsigned char add, int len)
+/*
+ * Roll a buzhash forward by one byte.
+ *
+ * sum is the hash of the current len-byte window, remove is the byte leaving
+ * the window and add is the byte entering it.  The old hash is rotated left
+ * by one bit, the table value of remove rotated by len is XORed in to cancel
+ * its contribution, and the table value of add is XORed in unrotated.
+ */
+static uint32_t
+buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, int len, const uint32_t *h)
 {
-    unsigned long int s1, s2;
-    s1 = sum & 0xffff;
-    s2 = sum >> 16;
-    s1 -= remove - add;
-    s2 -= len * (remove + 1) - s1;
-    return ((s2 & 0xffff) << 16) | (s1 & 0xffff);
+    return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], len) ^ h[add];
 }
 
 typedef struct {
     PyObject_HEAD
-    int chunk_size, window_size, last, done, buf_size, seed, remaining, position;
+    /* Iterator state created by chunkify(): window_size, chunk_mask and
+     * min_size are the caller-supplied parameters; position and remaining
+     * track the unread data inside the buffer `data`. */
+    int window_size, last, done, buf_size, remaining, position, chunk_mask, min_size;
+    /* Running totals: bytes copied into the buffer by ChunkifyIter_fill() and
+     * bytes handed out as chunks by ChunkifyIter_iternext().  They are
+     * compared at the end of iteration as a consistency check. */
+    size_t bytes_read, bytes_yielded;
+    /* Seeded buzhash table from init_buzhash_table(); freed in dealloc. */
+    uint32_t *h;
     PyObject *chunks, *fd;
     unsigned char *data;
 } ChunkifyIter;
@@ -44,6 +91,8 @@ ChunkifyIter_iter(PyObject *self)
     c->position = 0;
     c->done = 0;
     c->last = 0;
+    c->bytes_read = 0;
+    c->bytes_yielded = 0;
     Py_INCREF(self);
     return self;
 }
@@ -54,6 +103,7 @@ ChunkifyIter_dealloc(PyObject *self)
     ChunkifyIter *c = (ChunkifyIter *)self;
     Py_DECREF(c->fd);
     free(c->data);
+    free(c->h);
     self->ob_type->tp_free(self);
 }
 
@@ -68,6 +118,7 @@ ChunkifyIter_fill(PyObject *self)
     int n = PyString_Size(data);
     memcpy(c->data + c->position + c->remaining, PyString_AsString(data), n);
     c->remaining += n;
+    c->bytes_read += n;
     Py_DECREF(data);
 }
 
@@ -75,41 +126,54 @@ static PyObject*
 ChunkifyIter_iternext(PyObject *self)
 {
+    /* Return the next chunk as a buffer object over c->data, or NULL with an
+     * exception set: StopIteration at a clean end of input, Exception if the
+     * read and yielded byte counters disagree.
+     * NOTE(review): window_size and min_size are uint32_t copies of int
+     * fields, so their comparisons with the int counters below are done
+     * unsigned -- confirm negative arguments are never passed in. */
     ChunkifyIter *c = (ChunkifyIter *)self;
-    unsigned long int sum;
+    uint32_t sum, chunk_mask = c->chunk_mask, min_size = c->min_size, window_size = c->window_size;
+    int n = 0;
 
     if(c->done) {
-        PyErr_SetNone(PyExc_StopIteration);
+        if(c->bytes_read == c->bytes_yielded)
+            PyErr_SetNone(PyExc_StopIteration);
+        else
+            PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
         return NULL;
     }
-    if(c->remaining <= c->window_size) {
+    /* Refill once at most one window of data is left in the buffer. */
+    if(c->remaining <= window_size) {
         ChunkifyIter_fill(self);
     }
-    if(c->remaining < c->window_size) {
+    /* Still less than a full window after refilling: emit whatever is left
+     * as the final chunk and mark the iterator as done. */
+    if(c->remaining < window_size) {
         c->done = 1;
         if(c->remaining) {
+            c->bytes_yielded += c->remaining;
             return PyBuffer_FromMemory(c->data + c->position, c->remaining);
         }
         else {
-            PyErr_SetNone(PyExc_StopIteration);
+            if(c->bytes_read == c->bytes_yielded)
+                PyErr_SetNone(PyExc_StopIteration);
+            else
+                PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
             return NULL;
         }
     }
-    sum = checksum(c->data + c->position, c->window_size, 0);
-    c->remaining -= c->window_size;
-    c->position += c->window_size;
-    while(c->remaining && (sum & 0xffff) != c->seed) {
-        sum = roll_checksum(sum, c->data[c->position - c->window_size],
-                            c->data[c->position],
-                            c->window_size);
+    /* Hash the window at the current position, then slide it one byte at a
+     * time.  The loop stops once the masked hash is zero and at least
+     * min_size steps have been taken, or when less than a full window of
+     * data remains. */
+    sum = buzhash(c->data + c->position, window_size, c->h);
+    while(c->remaining >= c->window_size && ((sum & chunk_mask) || n < min_size)) {
+        sum = buzhash_update(sum, c->data[c->position],
+                             c->data[c->position + window_size],
+                             window_size, c->h);
         c->position++;
         c->remaining--;
-        if(c->remaining == 0) {
+        n++;
+        if(c->remaining <= window_size) {
             ChunkifyIter_fill(self);
         }
     }
+    /* At most one window left: fold the tail into the chunk being emitted. */
+    if(c->remaining <= window_size) {
+        c->position += c->remaining;
+        c->remaining = 0;
+    }
     int old_last = c->last;
     c->last = c->position;
-    return PyBuffer_FromMemory(c->data + old_last, c->last - old_last);
+    /* n is reused here as the length of the chunk being returned. */
+    n = c->last - old_last;
+    c->bytes_yielded += n;
+    return PyBuffer_FromMemory(c->data + old_last, n);
 }
 
 static PyTypeObject ChunkifyIterType = {
@@ -149,10 +213,10 @@ static PyObject *
 chunkify(PyObject *self, PyObject *args)
 {
     PyObject *fd;
-    int chunk_size, window_size, seed;
+    int seed, window_size, chunk_mask, min_size;
     ChunkifyIter *c;
 
-    if (!PyArg_ParseTuple(args, "Oiii", &fd, &chunk_size, &window_size, &seed))
+    if (!PyArg_ParseTuple(args, "Oiiii", &fd, &window_size, &chunk_mask, &min_size, &seed))
     {
         return NULL;
     }
@@ -163,27 +227,64 @@ chunkify(PyObject *self, PyObject *args)
     PyObject_Init((PyObject *)c, &ChunkifyIterType);
     c->buf_size = 10 * 1024 * 1024;
     c->data = malloc(c->buf_size);
+    c->h = init_buzhash_table(seed & 0xffffffff);
     c->fd = fd;
-    c->chunk_size = chunk_size;
     c->window_size = window_size;
-    c->seed = seed % chunk_size;
+    c->chunk_mask = chunk_mask;
+    c->min_size = min_size;
     Py_INCREF(fd);
     return (PyObject *)c;
 }
 
+/*
+ * Python entry point buzhash(data, seed).
+ *
+ * Builds a temporary lookup table from the low 32 bits of seed, hashes the
+ * whole string with buzhash() and returns the result as a Python long.
+ * Returns NULL with an exception set if argument parsing fails or the table
+ * cannot be allocated.
+ */
+static PyObject *
+do_buzhash(PyObject *self, PyObject *args)
+{
+    unsigned char *data;
+    int len;
+    unsigned long int seed, sum;
+    uint32_t *h;
+
+    if (!PyArg_ParseTuple(args, "s#k", &data, &len, &seed))
+    {
+        return NULL;
+    }
+    h = init_buzhash_table(seed & 0xffffffff);
+    if (h == NULL)
+    {
+        return PyErr_NoMemory();
+    }
+    sum = buzhash(data, len, h);
+    free(h);
+    return PyLong_FromUnsignedLong(sum);
+}
+
+/*
+ * Python entry point buzhash_update(sum, remove, add, len, seed).
+ *
+ * Builds a temporary lookup table from the low 32 bits of seed, rolls the
+ * hash `sum` of a len-byte window forward by one byte with buzhash_update()
+ * and returns the new hash as a Python long.  Returns NULL with an exception
+ * set if argument parsing fails or the table cannot be allocated.
+ */
+static PyObject *
+do_buzhash_update(PyObject *self, PyObject *args)
+{
+    unsigned long int sum, seed;
+    unsigned char remove, add;
+    uint32_t *h;
+    int len;
+
+    if (!PyArg_ParseTuple(args, "kbbik", &sum, &remove, &add, &len, &seed))
+    {
+        return NULL;
+    }
+    h = init_buzhash_table(seed & 0xffffffff);
+    if (h == NULL)
+    {
+        return PyErr_NoMemory();
+    }
+    sum = buzhash_update(sum, remove, add, len, h);
+    free(h);
+    return PyLong_FromUnsignedLong(sum);
+}
+
 
 static PyMethodDef ChunkifierMethods[] = {
+    /* Module-level functions: Python-visible name, C implementation,
+     * calling convention and docstring (left empty for all of them). */
     {"chunkify",  chunkify, METH_VARARGS, ""},
+    {"buzhash",   do_buzhash, METH_VARARGS, ""},
+    {"buzhash_update",   do_buzhash_update, METH_VARARGS, ""},
     {NULL, NULL, 0, NULL}        /* Sentinel */
 };
 
 PyMODINIT_FUNC
 init_speedups(void)
 {
-  PyObject* m;
-
+  /* Module initialisation: finish setting up ChunkifyIterType, then register
+   * the functions in ChunkifierMethods under the module name "_speedups".
+   * The module object returned by Py_InitModule() is not used. */
   ChunkifyIterType.tp_new = PyType_GenericNew;
   if (PyType_Ready(&ChunkifyIterType) < 0)  return;
 
-  m = Py_InitModule("_speedups", ChunkifierMethods);
+  Py_InitModule("_speedups", ChunkifierMethods);
 }

+ 5 - 5
darc/archive.py

@@ -16,8 +16,9 @@ from .helpers import uid2user, user2uid, gid2group, group2gid, \
     encode_filename, Statistics
 
 ITEMS_BUFFER = 1024 * 1024
-CHUNK_SIZE = 64 * 1024
-WINDOW_SIZE = 4096
+CHUNK_MIN = 1024
+WINDOW_SIZE = 0xfff
+CHUNK_MASK = 0xffff
 
 have_lchmod = hasattr(os, 'lchmod')
 linux = sys.platform == 'linux2'
@@ -158,7 +159,7 @@ class Archive(object):
         if self.items.tell() == 0:
             return
         self.items.seek(0)
-        chunks = list(str(s) for s in chunkify(self.items, CHUNK_SIZE, WINDOW_SIZE, self.key.chunk_seed))
+        chunks = list(str(s) for s in chunkify(self.items, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed))
         self.items.seek(0)
         self.items.truncate()
         for chunk in chunks[:-1]:
@@ -399,8 +400,7 @@ class Archive(object):
         if chunks is None:
             with open(path, 'rb') as fd:
                 chunks = []
-                for chunk in chunkify(fd, CHUNK_SIZE, WINDOW_SIZE,
-                                      self.key.chunk_seed):
+                for chunk in chunkify(fd, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed):
                     chunks.append(cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats))
             ids = [id for id, _, _ in chunks]
             cache.memorize_file(path_hash, st, ids)

+ 26 - 0
darc/test.py

@@ -11,6 +11,7 @@ import unittest
 from xattr import xattr, XATTR_NOFOLLOW
 
 from . import helpers, lrucache
+from ._speedups import buzhash, buzhash_update, chunkify
 from .archiver import Archiver
 from .key import suite as KeySuite
 from .store import Store, suite as StoreSuite
@@ -186,12 +187,37 @@ class Test(unittest.TestCase):
         assert 'test2' in output
 
 
+class ChunkTest(unittest.TestCase):
+    """Tests for the chunkify iterator and the buzhash helpers from _speedups."""
+
+    def test_chunkify(self):
+        """Check chunk boundaries produced by chunkify.
+
+        The positional arguments after the file object are window size,
+        chunk mask, minimum size and seed.  Covers a large input, an empty
+        input, and a short repeating input under several parameter sets.
+        """
+        data = '0' * 1024 * 1024 * 15 + 'Y'
+        parts = [str(c) for c in chunkify(StringIO(data), 2, 0x3, 2, 0)]
+        self.assertEqual(len(parts), 2)
+        # Joining the chunks must reproduce the input exactly.
+        self.assertEqual(''.join(parts), data)
+        self.assertEqual([str(c) for c in chunkify(StringIO(''), 2, 0x3, 2, 0)], [])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 2, 0x3, 2, 0)], ['fooba', 'rboobaz', 'fooba', 'rboobaz', 'fooba', 'rboobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 2, 0x3, 2, 1)], ['fo', 'obarb', 'oob', 'azf', 'oobarb', 'oob', 'azf', 'oobarb', 'oobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 2, 0x3, 2, 2)], ['foob', 'ar', 'boobazfoob', 'ar', 'boobazfoob', 'ar', 'boobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 3, 0)], ['foobarboobaz' * 3])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 3, 1)], ['foobar', 'boo', 'bazfo', 'obar', 'boo', 'bazfo', 'obar', 'boobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 3, 2)], ['foo', 'barboobaz', 'foo', 'barboobaz', 'foo', 'barboobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 4, 0)], ['foobarboobaz' * 3])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 4, 1)], ['foobar', 'boobazfo', 'obar', 'boobazfo', 'obar', 'boobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 4, 2)], ['foob', 'arboobaz', 'foob', 'arboobaz', 'foob', 'arboobaz'])
+
+    def test_buzhash(self):
+        """Check fixed hash values for seeds 0 and 1, and that rolling a window
+        forward with buzhash_update matches hashing the shifted data directly."""
+        self.assertEqual(buzhash('abcdefghijklmnop', 0), 3795437769L)
+        self.assertEqual(buzhash('abcdefghijklmnop', 1), 3795400502L)
+        self.assertEqual(buzhash('abcdefghijklmnop', 1), buzhash_update(buzhash('Xabcdefghijklmno', 1), ord('X'), ord('p'), 16, 1))
+
+
 class RemoteTest(Test):
     prefix = 'localhost:'
 
 
 def suite():
     suite = unittest.TestSuite()
+    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(ChunkTest))
     suite.addTest(unittest.TestLoader().loadTestsFromTestCase(Test))
     suite.addTest(unittest.TestLoader().loadTestsFromTestCase(RemoteTest))
     suite.addTest(KeySuite())