Răsfoiți Sursa

Chunkifier improvements, including tin-foil-hat-compliant seeding.

Jonas Borgström 14 ani în urmă
părinte
commit
330315ba0d
4 fișiere modificate, cu 34 adăugiri și 143 ștergeri
  1. 26 18
      darc/_speedups.c
  2. 5 3
      darc/archive.py
  3. 0 122
      darc/chunkifier.py
  4. 3 0
      darc/keychain.py

+ 26 - 18
darc/_speedups.c

@@ -1,6 +1,9 @@
 #include <Python.h>
 #include <structmember.h>
 
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define ABS(X) ((X) < 0 ? (-(X)) : (X))
+
 static unsigned long int
 checksum(const unsigned char *data, int len, unsigned long int sum)
 {
@@ -28,7 +31,7 @@ roll_checksum(unsigned long int sum, unsigned char remove, unsigned char add, in
 
 typedef struct {
     PyObject_HEAD
-    int chunk_size, window_size, i, last, eof, done, buf_size, data_len, initial;
+    int chunk_size, window_size, i, last, eof, done, buf_size, data_len, seed;
     PyObject *chunks, *fd;
     unsigned long int sum;
     unsigned char *data, add, remove;
@@ -43,8 +46,7 @@ ChunkifyIter_iter(PyObject *self)
     c->eof = 0;
     c->i = 0;
     c->sum = 0;
-    c->last = -1;
-    c->initial = c->window_size;
+    c->last = 0;
     Py_INCREF(self);
     return self;
 }
@@ -62,6 +64,8 @@ static PyObject*
 ChunkifyIter_iternext(PyObject *self)
 {
     ChunkifyIter *c = (ChunkifyIter *)self;
+    int initial = c->window_size;
+
     if(c->done)
     {
         PyErr_SetNone(PyExc_StopIteration);
@@ -72,6 +76,7 @@ ChunkifyIter_iternext(PyObject *self)
         if(c->i == c->buf_size)
         {
             int diff = c->last + 1 - c->window_size;
+            assert(diff >= 0);
             memmove(c->data, c->data + diff, c->buf_size - diff);
             c->i -= diff;
             c->last -= diff;
@@ -90,18 +95,20 @@ ChunkifyIter_iternext(PyObject *self)
         }
         if(c->i == c->data_len)
         {
-            if(c->last < c->i - 1) {
+            if(c->last < c->i) {
                 c->done = 1;
-                return PyString_FromStringAndSize((char *)(c->data + c->last + 1),
-                                                  c->data_len - c->last - 1);
+                return PyString_FromStringAndSize((char *)(c->data + c->last),
+                                                  c->data_len - c->last);
             }
             PyErr_SetNone(PyExc_StopIteration);
             return NULL;
         }
-        if(c->initial)
+        if(initial)
         {
-            c->initial--;
-            c->sum = checksum(c->data + c->i, 1, c->sum);
+            int bytes = MIN(initial, c->data_len - c->i);
+            initial -= bytes;
+            c->sum = checksum(c->data + c->i, bytes, 0);
+            c->i += bytes;
         }
         else
         {
@@ -109,20 +116,20 @@ ChunkifyIter_iternext(PyObject *self)
                                    c->data[c->i - c->window_size],
                                    c->data[c->i],
                                    c->window_size);
+            c->i++;
         }
-        c->i++;
-        if(c->i == c->buf_size && c->last == -1)
+        if((c->sum % c->chunk_size) == c->seed)
         {
             int old_last = c->last;
-            c->last = c->i - 1;
-            return PyString_FromStringAndSize((char *)(c->data + old_last + 1),
+            c->last = c->i;
+            return PyString_FromStringAndSize((char *)(c->data + old_last),
                                               c->last - old_last);
         }
-        else if((c->sum % c->chunk_size) == 0)
+        if(c->i == c->buf_size && c->last <= c->window_size)
         {
             int old_last = c->last;
-            c->last = c->i - 1;
-            return PyString_FromStringAndSize((char *)(c->data + old_last + 1),
+            c->last = c->i;
+            return PyString_FromStringAndSize((char *)(c->data + old_last),
                                               c->last - old_last);
         }
     }
@@ -167,10 +174,10 @@ static PyObject *
 chunkify(PyObject *self, PyObject *args)
 {
     PyObject *fd;
-    long int chunk_size, window_size;
+    int chunk_size, window_size, seed;
     ChunkifyIter *c;
 
-    if (!PyArg_ParseTuple(args, "Oii", &fd, &chunk_size, &window_size))
+    if (!PyArg_ParseTuple(args, "Oiii", &fd, &chunk_size, &window_size, &seed))
     {
         return NULL;
     }
@@ -184,6 +191,7 @@ chunkify(PyObject *self, PyObject *args)
     c->fd = fd;
     c->chunk_size = chunk_size;
     c->window_size = window_size;
+    c->seed = seed % chunk_size;
     Py_INCREF(fd);
     return (PyObject *)c;
 }

+ 5 - 3
darc/archive.py

@@ -8,10 +8,11 @@ import sys
 from xattr import xattr, XATTR_NOFOLLOW
 
 from . import NS_ARCHIVE_METADATA, NS_ARCHIVE_ITEMS, NS_ARCHIVE_CHUNKS, NS_CHUNK
-from .chunkifier import chunkify
+from ._speedups import chunkify
 from .helpers import uid2user, user2uid, gid2group, group2gid, IntegrityError
 
-CHUNK_SIZE = 55001
+CHUNK_SIZE = 64 * 1024
+WINDOW_SIZE = 4096
 
 have_lchmod = hasattr(os, 'lchmod')
 linux = sys.platform == 'linux2'
@@ -253,7 +254,8 @@ class Archive(object):
                 size = 0
                 ids = []
                 chunks = []
-                for chunk in chunkify(fd, CHUNK_SIZE, 30):
+                for chunk in chunkify(fd, CHUNK_SIZE, WINDOW_SIZE,
+                                      self.keychain.get_chunkify_seed()):
                     id = self.keychain.id_hash(chunk)
                     ids.append(id)
                     try:

+ 0 - 122
darc/chunkifier.py

@@ -1,122 +0,0 @@
-def checksum(data, sum=0):
-    """Simple but fast checksum that can be updated at either end.
-
-    >>> checksum('FOOBAR')
-    102367679
-    >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO'))
-    True
-    """
-    s1 = sum & 0xffff
-    s2 = sum >> 16
-    for c in data:
-        s1 += ord(c) + 1
-        s2 += s1
-    return ((s2 & 0xffff) << 16) + (s1 & 0xffff)
-
-
-def roll_checksum(sum, remove, add, len):
-    """
-    >>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR')
-    True
-    """
-    s1 = sum & 0xffff
-    s2 = sum >> 16
-    add = ord(add)
-    remove = ord(remove)
-    s1 -= remove - add
-    s2 -= len * (remove + 1) - s1
-    return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
-
-
-class ChunkifyIter(object):
-
-    def __init__(self, fd, chunk_size, window_size):
-        self.fd = fd
-        self.chunk_size = chunk_size
-        self.window_size = window_size
-        self.buf_size = self.chunk_size * 10
-
-    def __iter__(self):
-        self.data = ''
-        self.done = False
-        self.i = 0
-        self.sum = 0
-        self.last = -1
-        self.initial = self.window_size
-        return self
-
-    def next(self):
-        if self.done:
-            raise StopIteration
-        while True:
-            if self.i == self.buf_size:
-                diff = self.last + 1 - self.window_size
-                if diff < 0:
-                    import ipdb
-                    ipdb.set_trace()
-                self.data = self.data[diff:]
-                self.last -= diff
-                self.i -= diff
-            if self.i == len(self.data):
-                self.data += self.fd.read(self.buf_size - len(self.data))
-            if self.i == len(self.data):
-                if self.last < self.i - 1:
-                    self.done = True
-                    return self.data[self.last + 1:]
-                raise StopIteration
-            if self.initial:
-                self.initial -= 1
-                self.sum = checksum(self.data[self.i], self.sum)
-            else:
-                self.sum = roll_checksum(self.sum,
-                                         self.data[self.i - self.window_size],
-                                         self.data[self.i],
-                                         self.window_size)
-            self.i += 1
-            if self.i == self.buf_size and self.last == -1:
-                old_last = self.last
-                self.last = self.i - 1
-                return self.data[old_last + 1:self.last + 1]
-            elif self.sum % self.chunk_size == 0:
-                old_last = self.last
-                self.last = self.i - 1
-                return self.data[old_last + 1:self.last + 1]
-
-
-def chunkify(fd, chunk_size, chunks):
-    """
-    >>> list(chunkify(StringIO.StringIO(''), 5, 3))
-    []
-    >>> list(chunkify(StringIO.StringIO('A'), 5, 3))
-    ['A']
-    >>> list(chunkify(StringIO.StringIO('AB'), 5, 3))
-    ['AB']
-    >>> list(chunkify(StringIO.StringIO('1B'), 5, 3))
-    ['1', 'B']
-    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMNOPQ'), 5, 3))
-    ['ABCD', 'EFGHI', 'JKLMN', 'OPQ']
-    >>> list(chunkify(StringIO.StringIO('1ABCDEFGHIJKLMNOPQ'), 5, 3))
-    ['1', 'ABCD', 'EFGHI', 'JKLMN', 'OPQ']
-    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQ'), 5, 3))
-    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQ']
-    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
-    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
-    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
-    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
-    """
-    return ChunkifyIter(fd, chunk_size, chunks)
-
-try:
-    import _speedups
-    checksum = _speedups.checksum
-    roll_checksum = _speedups.roll_checksum
-    py_chunkify = chunkify
-    chunkify = _speedups.chunkify
-except ImportError:
-    print 'Failed to load _speedups module, things will be slow'
-
-
-if __name__ == '__main__':
-    import doctest
-    import StringIO
-    doctest.testmod()

+ 3 - 0
darc/keychain.py

@@ -30,6 +30,9 @@ class Keychain(object):
         if path:
             self.open(path)
 
+    def get_chunkify_seed(self):
+        return bytes_to_long(self.aes_id[:4])
+
     def open(self, path):
         print 'Opening keychain "%s"' % path
         with open(path, 'rb') as fd: