Просмотр исходного кода

Switch to a buzhash (cyclic polynomial) chunkify implementation

Jonas Borgström 12 лет назад
Родитель
Коммит
5b28d428b7
3 изменённых файла: 173 добавлено и 46 удалено
  1. 142 41
      darc/_speedups.c
  2. 5 5
      darc/archive.py
  3. 26 0
      darc/test.py

+ 142 - 41
darc/_speedups.c

@@ -1,37 +1,84 @@
 #include <Python.h>
 #include <Python.h>
 #include <structmember.h>
 #include <structmember.h>
+#include <stdint.h>
 
 
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define ABS(X) ((X) < 0 ? (-(X)) : (X))
+/* Cyclic polynomial / buzhash: https://en.wikipedia.org/wiki/Rolling_hash */
 
 
-static unsigned long int
-checksum(const unsigned char *data, int len, unsigned long int sum)
+static uint32_t table_base[] =
 {
 {
-    unsigned long int s1, s2, i;
-    s1 = sum & 0xffff;
-    s2 = sum >> 16;
-    for(i=0; i < len; i++)
+    0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4,
+    0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436,
+    0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7,
+    0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a,
+    0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613,
+    0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0,
+    0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205,
+    0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7,
+    0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035,
+    0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d,
+    0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355,
+    0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f,
+    0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424,
+    0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86,
+    0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f,
+    0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf,
+    0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9,
+    0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186,
+    0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0,
+    0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35, 
+    0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd,
+    0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4,
+    0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f,
+    0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec,
+    0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889,
+    0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4,
+    0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512,
+    0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009,
+    0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2,
+    0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c,
+    0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9,
+    0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b
+};
+
+#define BARREL_SHIFT(v, shift) ( ((v) << ((shift) & 0x1f)) | ((v) >> (32 - ((shift) & 0x1f))) )
+
+
+static uint32_t *
+init_buzhash_table(uint32_t seed)
+{
+    int i;
+    uint32_t *table = malloc(1024);
+    for(i = 0; i < 256; i++)
+    {
+        table[i] = table_base[i] ^ seed;
+    }
+    return table;
+}
+
+static uint32_t
+buzhash(const unsigned char *data, int len, const uint32_t *h)
+{
+    int i;
+    uint32_t sum = 0;
+    for(i = len - 1; i > 0; i--)
     {
     {
-        s1 += data[i] + 1;
-        s2 += s1;
+        sum ^= BARREL_SHIFT(h[*data], i);
+        data++;
     }
     }
-    return ((s2 & 0xffff) << 16) | (s1 & 0xffff);
+    return sum ^ h[*data];
 }
 }
 
 
-static unsigned long int
-roll_checksum(unsigned long int sum, unsigned char remove, unsigned char add, int len)
+static uint32_t
+buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, int len, const uint32_t *h)
 {
 {
-    unsigned long int s1, s2;
-    s1 = sum & 0xffff;
-    s2 = sum >> 16;
-    s1 -= remove - add;
-    s2 -= len * (remove + 1) - s1;
-    return ((s2 & 0xffff) << 16) | (s1 & 0xffff);
+    return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], len) ^ h[add];
 }
 }
 
 
 typedef struct {
 typedef struct {
     PyObject_HEAD
     PyObject_HEAD
-    int chunk_size, window_size, last, done, buf_size, seed, remaining, position;
+    int window_size, last, done, buf_size, remaining, position, chunk_mask, min_size;
+    size_t bytes_read, bytes_yielded;
+    uint32_t *h;
     PyObject *chunks, *fd;
     PyObject *chunks, *fd;
     unsigned char *data;
     unsigned char *data;
 } ChunkifyIter;
 } ChunkifyIter;
@@ -44,6 +91,8 @@ ChunkifyIter_iter(PyObject *self)
     c->position = 0;
     c->position = 0;
     c->done = 0;
     c->done = 0;
     c->last = 0;
     c->last = 0;
+    c->bytes_read = 0;
+    c->bytes_yielded = 0;
     Py_INCREF(self);
     Py_INCREF(self);
     return self;
     return self;
 }
 }
@@ -54,6 +103,7 @@ ChunkifyIter_dealloc(PyObject *self)
     ChunkifyIter *c = (ChunkifyIter *)self;
     ChunkifyIter *c = (ChunkifyIter *)self;
     Py_DECREF(c->fd);
     Py_DECREF(c->fd);
     free(c->data);
     free(c->data);
+    free(c->h);
     self->ob_type->tp_free(self);
     self->ob_type->tp_free(self);
 }
 }
 
 
@@ -68,6 +118,7 @@ ChunkifyIter_fill(PyObject *self)
     int n = PyString_Size(data);
     int n = PyString_Size(data);
     memcpy(c->data + c->position + c->remaining, PyString_AsString(data), n);
     memcpy(c->data + c->position + c->remaining, PyString_AsString(data), n);
     c->remaining += n;
     c->remaining += n;
+    c->bytes_read += n;
     Py_DECREF(data);
     Py_DECREF(data);
 }
 }
 
 
@@ -75,41 +126,54 @@ static PyObject*
 ChunkifyIter_iternext(PyObject *self)
 ChunkifyIter_iternext(PyObject *self)
 {
 {
     ChunkifyIter *c = (ChunkifyIter *)self;
     ChunkifyIter *c = (ChunkifyIter *)self;
-    unsigned long int sum;
+    uint32_t sum, chunk_mask = c->chunk_mask, min_size = c->min_size, window_size = c->window_size;
+    int n = 0;
 
 
     if(c->done) {
     if(c->done) {
-        PyErr_SetNone(PyExc_StopIteration);
+        if(c->bytes_read == c->bytes_yielded)
+            PyErr_SetNone(PyExc_StopIteration);
+        else
+            PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
         return NULL;
         return NULL;
     }
     }
-    if(c->remaining <= c->window_size) {
+    if(c->remaining <= window_size) {
         ChunkifyIter_fill(self);
         ChunkifyIter_fill(self);
     }
     }
-    if(c->remaining < c->window_size) {
+    if(c->remaining < window_size) {
         c->done = 1;
         c->done = 1;
         if(c->remaining) {
         if(c->remaining) {
+            c->bytes_yielded += c->remaining;
             return PyBuffer_FromMemory(c->data + c->position, c->remaining);
             return PyBuffer_FromMemory(c->data + c->position, c->remaining);
         }
         }
         else {
         else {
-            PyErr_SetNone(PyExc_StopIteration);
+            if(c->bytes_read == c->bytes_yielded)
+                PyErr_SetNone(PyExc_StopIteration);
+            else
+                PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
             return NULL;
             return NULL;
         }
         }
     }
     }
-    sum = checksum(c->data + c->position, c->window_size, 0);
-    c->remaining -= c->window_size;
-    c->position += c->window_size;
-    while(c->remaining && (sum & 0xffff) != c->seed) {
-        sum = roll_checksum(sum, c->data[c->position - c->window_size],
-                            c->data[c->position],
-                            c->window_size);
+    sum = buzhash(c->data + c->position, window_size, c->h);
+    while(c->remaining >= c->window_size && ((sum & chunk_mask) || n < min_size)) {
+        sum = buzhash_update(sum, c->data[c->position],
+                             c->data[c->position + window_size],
+                             window_size, c->h);
         c->position++;
         c->position++;
         c->remaining--;
         c->remaining--;
-        if(c->remaining == 0) {
+        n++;
+        if(c->remaining <= window_size) {
             ChunkifyIter_fill(self);
             ChunkifyIter_fill(self);
         }
         }
     }
     }
+    if(c->remaining <= window_size) {
+        c->position += c->remaining;
+        c->remaining = 0;
+    }
     int old_last = c->last;
     int old_last = c->last;
     c->last = c->position;
     c->last = c->position;
-    return PyBuffer_FromMemory(c->data + old_last, c->last - old_last);
+    n = c->last - old_last;
+    c->bytes_yielded += n;
+    return PyBuffer_FromMemory(c->data + old_last, n);
 }
 }
 
 
 static PyTypeObject ChunkifyIterType = {
 static PyTypeObject ChunkifyIterType = {
@@ -149,10 +213,10 @@ static PyObject *
 chunkify(PyObject *self, PyObject *args)
 chunkify(PyObject *self, PyObject *args)
 {
 {
     PyObject *fd;
     PyObject *fd;
-    int chunk_size, window_size, seed;
+    int seed, window_size, chunk_mask, min_size;
     ChunkifyIter *c;
     ChunkifyIter *c;
 
 
-    if (!PyArg_ParseTuple(args, "Oiii", &fd, &chunk_size, &window_size, &seed))
+    if (!PyArg_ParseTuple(args, "Oiiii", &fd, &window_size, &chunk_mask, &min_size, &seed))
     {
     {
         return NULL;
         return NULL;
     }
     }
@@ -163,27 +227,64 @@ chunkify(PyObject *self, PyObject *args)
     PyObject_Init((PyObject *)c, &ChunkifyIterType);
     PyObject_Init((PyObject *)c, &ChunkifyIterType);
     c->buf_size = 10 * 1024 * 1024;
     c->buf_size = 10 * 1024 * 1024;
     c->data = malloc(c->buf_size);
     c->data = malloc(c->buf_size);
+    c->h = init_buzhash_table(seed & 0xffffffff);
     c->fd = fd;
     c->fd = fd;
-    c->chunk_size = chunk_size;
     c->window_size = window_size;
     c->window_size = window_size;
-    c->seed = seed % chunk_size;
+    c->chunk_mask = chunk_mask;
+    c->min_size = min_size;
     Py_INCREF(fd);
     Py_INCREF(fd);
     return (PyObject *)c;
     return (PyObject *)c;
 }
 }
 
 
+static PyObject *
+do_buzhash(PyObject *self, PyObject *args)
+{
+    unsigned char *data;
+    int len;
+    unsigned long int seed, sum;
+    uint32_t *h;
+
+    if (!PyArg_ParseTuple(args, "s#k", &data, &len, &seed))
+    {
+        return NULL;
+    }
+    h = init_buzhash_table(seed & 0xffffffff);
+    sum = buzhash(data, len, h);
+    free(h);
+    return PyLong_FromUnsignedLong(sum);
+}
+
+static PyObject *
+do_buzhash_update(PyObject *self, PyObject *args)
+{
+    unsigned long int sum, seed;
+    unsigned char remove, add;
+    uint32_t *h;
+    int len;
+
+    if (!PyArg_ParseTuple(args, "kbbik", &sum, &remove, &add, &len, &seed))
+    {
+        return NULL;
+    }
+    h = init_buzhash_table(seed & 0xffffffff);
+    sum = buzhash_update(sum, remove, add, len, h);
+    free(h);
+    return PyLong_FromUnsignedLong(sum);
+}
+
 
 
 static PyMethodDef ChunkifierMethods[] = {
 static PyMethodDef ChunkifierMethods[] = {
     {"chunkify",  chunkify, METH_VARARGS, ""},
     {"chunkify",  chunkify, METH_VARARGS, ""},
+    {"buzhash",   do_buzhash, METH_VARARGS, ""},
+    {"buzhash_update",   do_buzhash_update, METH_VARARGS, ""},
     {NULL, NULL, 0, NULL}        /* Sentinel */
     {NULL, NULL, 0, NULL}        /* Sentinel */
 };
 };
 
 
 PyMODINIT_FUNC
 PyMODINIT_FUNC
 init_speedups(void)
 init_speedups(void)
 {
 {
-  PyObject* m;
-
   ChunkifyIterType.tp_new = PyType_GenericNew;
   ChunkifyIterType.tp_new = PyType_GenericNew;
   if (PyType_Ready(&ChunkifyIterType) < 0)  return;
   if (PyType_Ready(&ChunkifyIterType) < 0)  return;
 
 
-  m = Py_InitModule("_speedups", ChunkifierMethods);
+  Py_InitModule("_speedups", ChunkifierMethods);
 }
 }

+ 5 - 5
darc/archive.py

@@ -16,8 +16,9 @@ from .helpers import uid2user, user2uid, gid2group, group2gid, \
     encode_filename, Statistics
     encode_filename, Statistics
 
 
 ITEMS_BUFFER = 1024 * 1024
 ITEMS_BUFFER = 1024 * 1024
-CHUNK_SIZE = 64 * 1024
-WINDOW_SIZE = 4096
+CHUNK_MIN = 1024
+WINDOW_SIZE = 0xfff
+CHUNK_MASK = 0xffff
 
 
 have_lchmod = hasattr(os, 'lchmod')
 have_lchmod = hasattr(os, 'lchmod')
 linux = sys.platform == 'linux2'
 linux = sys.platform == 'linux2'
@@ -158,7 +159,7 @@ class Archive(object):
         if self.items.tell() == 0:
         if self.items.tell() == 0:
             return
             return
         self.items.seek(0)
         self.items.seek(0)
-        chunks = list(str(s) for s in chunkify(self.items, CHUNK_SIZE, WINDOW_SIZE, self.key.chunk_seed))
+        chunks = list(str(s) for s in chunkify(self.items, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed))
         self.items.seek(0)
         self.items.seek(0)
         self.items.truncate()
         self.items.truncate()
         for chunk in chunks[:-1]:
         for chunk in chunks[:-1]:
@@ -399,8 +400,7 @@ class Archive(object):
         if chunks is None:
         if chunks is None:
             with open(path, 'rb') as fd:
             with open(path, 'rb') as fd:
                 chunks = []
                 chunks = []
-                for chunk in chunkify(fd, CHUNK_SIZE, WINDOW_SIZE,
-                                      self.key.chunk_seed):
+                for chunk in chunkify(fd, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed):
                     chunks.append(cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats))
                     chunks.append(cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats))
             ids = [id for id, _, _ in chunks]
             ids = [id for id, _, _ in chunks]
             cache.memorize_file(path_hash, st, ids)
             cache.memorize_file(path_hash, st, ids)

+ 26 - 0
darc/test.py

@@ -11,6 +11,7 @@ import unittest
 from xattr import xattr, XATTR_NOFOLLOW
 from xattr import xattr, XATTR_NOFOLLOW
 
 
 from . import helpers, lrucache
 from . import helpers, lrucache
+from ._speedups import buzhash, buzhash_update, chunkify
 from .archiver import Archiver
 from .archiver import Archiver
 from .key import suite as KeySuite
 from .key import suite as KeySuite
 from .store import Store, suite as StoreSuite
 from .store import Store, suite as StoreSuite
@@ -186,12 +187,37 @@ class Test(unittest.TestCase):
         assert 'test2' in output
         assert 'test2' in output
 
 
 
 
+class ChunkTest(unittest.TestCase):
+
+    def test_chunkify(self):
+        data = '0' * 1024 * 1024 * 15 + 'Y'
+        parts = [str(c) for c in chunkify(StringIO(data), 2, 0x3, 2, 0)]
+        self.assertEqual(len(parts), 2)
+        self.assertEqual(''.join(parts), data)
+        self.assertEqual([str(c) for c in chunkify(StringIO(''), 2, 0x3, 2, 0)], [])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 2, 0x3, 2, 0)], ['fooba', 'rboobaz', 'fooba', 'rboobaz', 'fooba', 'rboobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 2, 0x3, 2, 1)], ['fo', 'obarb', 'oob', 'azf', 'oobarb', 'oob', 'azf', 'oobarb', 'oobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 2, 0x3, 2, 2)], ['foob', 'ar', 'boobazfoob', 'ar', 'boobazfoob', 'ar', 'boobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 3, 0)], ['foobarboobaz' * 3])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 3, 1)], ['foobar', 'boo', 'bazfo', 'obar', 'boo', 'bazfo', 'obar', 'boobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 3, 2)], ['foo', 'barboobaz', 'foo', 'barboobaz', 'foo', 'barboobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 4, 0)], ['foobarboobaz' * 3])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 4, 1)], ['foobar', 'boobazfo', 'obar', 'boobazfo', 'obar', 'boobaz'])
+        self.assertEqual([str(c) for c in chunkify(StringIO('foobarboobaz' * 3), 3, 0x3, 4, 2)], ['foob', 'arboobaz', 'foob', 'arboobaz', 'foob', 'arboobaz'])
+
+    def test_buzhash(self):
+        self.assertEqual(buzhash('abcdefghijklmnop', 0), 3795437769L)
+        self.assertEqual(buzhash('abcdefghijklmnop', 1), 3795400502L)
+        self.assertEqual(buzhash('abcdefghijklmnop', 1), buzhash_update(buzhash('Xabcdefghijklmno', 1), ord('X'), ord('p'), 16, 1))
+
+
 class RemoteTest(Test):
 class RemoteTest(Test):
     prefix = 'localhost:'
     prefix = 'localhost:'
 
 
 
 
 def suite():
 def suite():
     suite = unittest.TestSuite()
     suite = unittest.TestSuite()
+    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(ChunkTest))
     suite.addTest(unittest.TestLoader().loadTestsFromTestCase(Test))
     suite.addTest(unittest.TestLoader().loadTestsFromTestCase(Test))
     suite.addTest(unittest.TestLoader().loadTestsFromTestCase(RemoteTest))
     suite.addTest(unittest.TestLoader().loadTestsFromTestCase(RemoteTest))
     suite.addTest(KeySuite())
     suite.addTest(KeySuite())