2
0
Эх сурвалжийг харах

Partial C implementation of chunkifier

Jonas Borgström 15 жил өмнө
parent
commit
c1a6a4f68c

+ 153 - 74
dedupestore/_speedups.c

@@ -1,35 +1,129 @@
 #include <Python/Python.h>
 #include <Python/structmember.h>
 
+static unsigned long int
+checksum(const unsigned char *data, int len, unsigned long int sum)
+{
+    unsigned long int s1, s2, i;
+    s1 = sum & 0xffff;
+    s2 = sum >> 16;
+    for(i=0; i < len; i++)
+    {
+        s1 += data[i] + 1;
+        s2 += s1;
+    }
+    return ((s2 & 0xffff) << 16) | (s1 & 0xffff);
+}
+
+static unsigned long int
+roll_checksum(unsigned long int sum, unsigned char remove, unsigned char add, int len)
+{
+    unsigned long int s1, s2;
+    s1 = sum & 0xffff;
+    s2 = sum >> 16;
+    s1 -= remove - add;
+    s2 -= len * (remove + 1) - s1;
+    return ((s2 & 0xffff) << 16) | (s1 & 0xffff);
+}
+
 typedef struct {
-  PyObject_HEAD
-  long int m;
-  long int i;
-  long int chunk_size;
-  unsigned long int sum;
-  PyObject *data;
-  PyObject *fd;
-  PyObject *chunks;
+    PyObject_HEAD
+    int chunk_size, i, full_sum, done, buf_size, data_len;
+    PyObject *chunks, *fd, *extra;
+    unsigned long sum;
+    unsigned char *data, add, remove;
 } ChunkifyIter;
 
-PyObject* ChunkifyIter_iter(PyObject *self)
+static PyObject*
+ChunkifyIter_iter(PyObject *self)
+{
+    Py_INCREF(self);
+    return self;
+}
+
+static void
+ChunkifyIter_dealloc(PyObject *self)
 {
-  Py_INCREF(self);
-  return self;
+    ChunkifyIter *c = (ChunkifyIter *)self;
+    Py_DECREF(c->fd);
+    Py_DECREF(c->chunks);
+    free(c->data);
+    self->ob_type->tp_free(self);
 }
 
-PyObject* ChunkifyIter_iternext(PyObject *self)
+static PyObject*
+ChunkifyIter_iternext(PyObject *self)
 {
-  ChunkifyIter *p = (ChunkifyIter *)self;
-  if (p->i < p->m) {
-    PyObject *tmp = Py_BuildValue("l", p->i);
-    (p->i)++;
-    return tmp;
-  } else {
-    /* Raising of standard StopIteration exception with empty value. */
+    ChunkifyIter *c = (ChunkifyIter *)self;
+    int o = 0;
+    if(c->done)
+    {
+        PyErr_SetNone(PyExc_StopIteration);
+        return NULL;
+    }
+    if(c->extra)
+    {
+        c->done = 1;
+        return c->extra;
+    }
+    for(;;)
+    {
+        if(c->i > c->buf_size - c->chunk_size)
+        {
+            memmove(c->data, c->data + c->i - o, c->data_len - c->i + o);
+            c->data_len -= c->i - o;
+            c->i = o;
+        }
+        if(c->data_len - c->i < c->chunk_size)
+        {
+            PyObject *data = PyObject_CallMethod(c->fd, "read", "i", c->buf_size - c->data_len);
+            int n = PyString_Size(data);
+            memcpy(c->data + c->data_len, PyString_AsString(data), n);
+            c->data_len += n;
+            Py_DECREF(data);
+        }
+        if(c->i == c->data_len)
+        {
+            PyErr_SetNone(PyExc_StopIteration);
+            return NULL;
+        }
+        if(c->data_len - c->i < c->chunk_size) /* EOF ? */
+        {
+            if(o == 1)
+            {
+                c->done = 1;
+                return PyString_FromStringAndSize((char *)(c->data + c->i - 1), c->data_len - c->i + 1);
+            }
+            else if(o > 1)
+            {
+                c->extra = PyString_FromStringAndSize((char *)(c->data + c->i - 1), c->chunk_size);
+                return PyString_FromStringAndSize((char *)(c->data + c->i - o), o - 1);
+            }
+            else
+            {
+                c->done = 1;
+                return PyString_FromStringAndSize((char *)(c->data + c->i), c->data_len - c->i);
+            }
+        }
+        if(o == c->chunk_size)
+        {
+            return PyString_FromStringAndSize((char *)(c->data + c->i - c->chunk_size), c->chunk_size);
+        }
+        if(c->full_sum || c->i + c->chunk_size > c->data_len)
+        {
+            c->full_sum = 0;
+            c->sum = checksum(c->data + c->i, c->data_len - c->i, 0);
+        }
+        else
+        {
+            c->sum = roll_checksum(c->sum, c->remove, c->data[c->i + c->chunk_size], c->chunk_size);
+        }
+        c->remove = c->data[c->i];
+        o++;
+        c->i++;
+    }
     PyErr_SetNone(PyExc_StopIteration);
     return NULL;
-  }
 }
 
 static PyTypeObject ChunkifyIterType = {
@@ -38,7 +132,7 @@ static PyTypeObject ChunkifyIterType = {
     "_chunkifier._ChunkifyIter",       /*tp_name*/
     sizeof(ChunkifyIter),       /*tp_basicsize*/
     0,                         /*tp_itemsize*/
-    0,                         /*tp_dealloc*/
+    ChunkifyIter_dealloc,      /*tp_dealloc*/
     0,                         /*tp_print*/
     0,                         /*tp_getattr*/
     0,                         /*tp_setattr*/
@@ -68,86 +162,71 @@ static PyTypeObject ChunkifyIterType = {
 static PyObject *
 chunkify(PyObject *self, PyObject *args)
 {
-  PyObject *fd;
-  PyObject *chunks;
-  long int chunk_size;
-  ChunkifyIter *p;
-
-  if (!PyArg_ParseTuple(args, "OlO", &fd, &chunk_size, &chunks))  return NULL;
-
-  /* I don't need python callable __init__() method for this iterator,
-     so I'll simply allocate it as PyObject and initialize it by hand. */
-
-  p = PyObject_New(ChunkifyIter, &ChunkifyIterType);
-  if (!p) return NULL;
-
-  /* I'm not sure if it's strictly necessary. */
-  if (!PyObject_Init((PyObject *)p, &ChunkifyIterType)) {
-    Py_DECREF(p);
-    return NULL;
-  }
+    PyObject *fd, *chunks;
+    long int chunk_size;
+    ChunkifyIter *c;
 
-    p->m = 10;
-    p->i = 0;
-    p->fd = fd;
-    p->chunk_size = chunk_size;
-    p->chunks = chunks;
-    return (PyObject *)p;
-  }
+    if (!PyArg_ParseTuple(args, "OiO", &fd, &chunk_size, &chunks))
+    {
+        return NULL;
+    }
+    if (!(c = PyObject_New(ChunkifyIter, &ChunkifyIterType)))
+    {
+        return NULL;
+    }
+    PyObject_Init((PyObject *)c, &ChunkifyIterType);
+    c->buf_size = chunk_size * 10;
+    c->data = malloc(c->buf_size);
+    c->data_len = 0;
+    c->i = 0;
+    c->full_sum = 1;
+    c->done = 0;
+    c->extra = NULL;
+    c->fd = fd;
+    c->chunk_size = chunk_size;
+    c->chunks = chunks;
+    Py_INCREF(fd);
+    Py_INCREF(chunks);
+    return (PyObject *)c;
+}
 
 static PyObject *
-checksum(PyObject *self, PyObject *args)
+py_checksum(PyObject *self, PyObject *args)
 {
-    unsigned long int sum = 0, s1, s2;
     PyObject *data;
-    Py_ssize_t i, len;
-    const char *ptr;
-    if(!PyArg_ParseTuple(args, "O|l", &data, &sum))  return NULL;
+    unsigned long int sum = 0;
+    if(!PyArg_ParseTuple(args, "O|k", &data, &sum))  return NULL;
     if(!PyString_Check(data))
     {
         PyErr_SetNone(PyExc_TypeError);
-        Py_INCREF(data);
         return NULL;
     }
-    len = PyString_Size(data);
-    ptr = PyString_AsString(data);
-    s1 = sum & 0xffff;
-    s2 = sum >> 16;
-    for(i=0; i < len; i++)
-    {
-        s1 += ptr[i] + 1;
-        s2 += s1;
-    }
-    return PyInt_FromLong(((s2 & 0xffff) << 16) | (s1 & 0xffff));
+    return PyInt_FromLong(checksum((unsigned char *)PyString_AsString(data),
+                                   PyString_Size(data), sum));
 }
 
 static PyObject *
-roll_checksum(PyObject *self, PyObject *args)
+py_roll_checksum(PyObject *self, PyObject *args)
 {
-    unsigned long int sum = 0, len, s1, s2, a, r;
+    unsigned long int sum = 0, len, a, r;
     PyObject *add, *remove;
-    if (!PyArg_ParseTuple(args, "lOOl", &sum, &remove, &add, &len))  return NULL;
+    if (!PyArg_ParseTuple(args, "kOOk", &sum, &remove, &add, &len))  return NULL;
     if(!PyString_Check(remove) || !PyString_Check(add) || 
         PyString_Size(remove) != 1 || PyString_Size(add) != 1)
     {
         PyErr_SetNone(PyExc_TypeError);
-        Py_INCREF(remove);
-        Py_INCREF(add);
         return NULL;
     }
     a = *((const unsigned char *)PyString_AsString(add));
     r = *((const unsigned char *)PyString_AsString(remove));
-    s1 = sum & 0xffff;
-    s2 = sum >> 16;
-    s1 -= r - a;
-    s2 -= len * (r + 1) - s1;
-    return PyInt_FromLong(((s2 & 0xffff) << 16) | (s1 & 0xffff));
+    return PyInt_FromLong(roll_checksum(sum, r, a, len));
 }
 
+
 static PyMethodDef ChunkifierMethods[] = {
     {"chunkify",  chunkify, METH_VARARGS, ""},
-    {"checksum",  checksum, METH_VARARGS, ""},
-    {"roll_checksum",  roll_checksum, METH_VARARGS, ""},
+    {"checksum",  py_checksum, METH_VARARGS, ""},
+    {"roll_checksum",  py_roll_checksum, METH_VARARGS, ""},
     {NULL, NULL, 0, NULL}        /* Sentinel */
 };
 

+ 38 - 18
dedupestore/chunkifier.py

@@ -58,16 +58,11 @@ class ChunkifyIter(object):
                 self.data += self.fd.read(self.buf_size - len(self.data))
             if len(self.data) == self.i:
                 raise StopIteration
-            if self.full_sum or len(self.data) - self.i < self.chunk_size:
-                self.sum = checksum(self.data[self.i:self.i + self.chunk_size])
-                self.full_sum = False
-                self.remove = self.data[self.i]
-            else:
-                self.sum = roll_checksum(self.sum, self.remove, self.data[self.i + self.chunk_size - 1], 
-                                         self.chunk_size)
-                self.remove = self.data[self.i]
             if len(self.data) - self.i < self.chunk_size:  # EOF?
-                if o > 0:
+                if o == 1:
+                    self.done = True
+                    return self.data[self.i - 1:]
+                elif o > 1:
                     self.extra = self.data[-self.chunk_size:]
                     return self.data[-self.chunk_size - o + 1:-self.chunk_size]
                 else:
@@ -75,7 +70,15 @@ class ChunkifyIter(object):
                     return self.data[self.i:]
             elif o == self.chunk_size:
                 return self.data[self.i-self.chunk_size:self.i]
-            elif self.sum in self.chunks:
+            if self.full_sum or len(self.data) - self.i < self.chunk_size:
+                self.sum = checksum(self.data[self.i:self.i + self.chunk_size])
+                self.full_sum = False
+                self.remove = self.data[self.i]
+            else:
+                self.sum = roll_checksum(self.sum, self.remove, self.data[self.i + self.chunk_size - 1], 
+                                         self.chunk_size)
+                self.remove = self.data[self.i]
+            if self.sum in self.chunks:
                 if o > 0:
                     chunk = self.data[self.i - o:self.i]
                 else:
@@ -90,23 +93,38 @@ class ChunkifyIter(object):
 
 def chunkify(fd, chunk_size, chunks):
     """
-    >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
-    >>> list(chunkify(fd, 4, {}))
+    >>> list(chunkify(StringIO.StringIO('A'), 4, {}))
+    ['A']
+    >>> list(chunkify(StringIO.StringIO('AB'), 4, {}))
+    ['AB']
+    >>> list(chunkify(StringIO.StringIO('ABC'), 4, {}))
+    ['ABC']
+    >>> list(chunkify(StringIO.StringIO('ABCD'), 4, {}))
+    ['ABCD']
+    >>> list(chunkify(StringIO.StringIO('ABCDE'), 4, {}))
+    ['A', 'BCDE']
+    >>> list(chunkify(StringIO.StringIO('ABCDEF'), 4, {}))
+    ['AB', 'CDEF']
+    >>> list(chunkify(StringIO.StringIO('ABCDEFG'), 4, {}))
+    ['ABC', 'DEFG']
+    >>> list(chunkify(StringIO.StringIO('ABCDEFGH'), 4, {}))
+    ['ABCD', 'EFGH']
+    >>> list(chunkify(StringIO.StringIO('ABCDEFGHI'), 4, {}))
+    ['ABCD', 'E', 'FGHI']
+
+    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, {}))
     ['ABCD', 'EFGH', 'IJ', 'KLMN']
 
-    >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
     >>> chunks = {44564754: True} # 'BCDE'
-    >>> list(chunkify(fd, 4, chunks))
+    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
     ['A', 'BCDE', 'FGHI', 'J', 'KLMN']
 
-    >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
     >>> chunks = {44564754: True, 48496938: True} # 'BCDE', 'HIJK'
-    >>> list(chunkify(fd, 4, chunks))
+    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
     ['A', 'BCDE', 'FG', 'HIJK', 'LMN']
 
-    >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
     >>> chunks = {43909390: True, 50463030: True} # 'ABCD', 'KLMN'
-    >>> list(chunkify(fd, 4, chunks))
+    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
     ['ABCD', 'EFGH', 'IJ', 'KLMN']
     """
     return ChunkifyIter(fd, chunk_size, chunks)
@@ -115,6 +133,8 @@ try:
     import _speedups
     checksum = _speedups.checksum
     roll_checksum = _speedups.roll_checksum
+    py_chunkify = chunkify
+    chunkify = _speedups.chunkify
 except ImportError:
     print 'Failed to load _speedups module, things will be slow'