Browse Source

Use Cython for all native code

Jonas Borgström 12 years ago
parent
commit
3e5433855c
10 changed files with 137 additions and 196 deletions
  1. 1 0
      .gitignore
  2. 48 149
      darc/_chunker.c
  3. 32 12
      darc/_hashindex.c
  4. 1 1
      darc/archive.py
  5. 48 0
      darc/chunker.pyx
  6. 0 28
      darc/hashindex.h
  7. 2 1
      darc/hashindex.pyx
  8. 1 0
      darc/remote.py
  9. 1 1
      darc/test.py
  10. 3 4
      setup.py

+ 1 - 0
.gitignore

@@ -3,6 +3,7 @@ build
 dist
 env
 hashindex.c
+chunker.c
 *.egg-info
 *.pyc
 *.pyo

+ 48 - 149
darc/_speedups.c → darc/_chunker.c

@@ -1,6 +1,4 @@
 #include <Python.h>
-#include <structmember.h>
-#include <stdint.h>
 
 /* Cyclic polynomial / buzhash: https://en.wikipedia.org/wiki/Rolling_hash */
 
@@ -44,7 +42,7 @@ static uint32_t table_base[] =
 
 
 static uint32_t *
-init_buzhash_table(uint32_t seed)
+buzhash_init_table(uint32_t seed)
 {
     int i;
     uint32_t *table = malloc(1024);
@@ -56,9 +54,9 @@ init_buzhash_table(uint32_t seed)
 }
 
 static uint32_t
-buzhash(const unsigned char *data, int len, const uint32_t *h)
+buzhash(const unsigned char *data, size_t len, const uint32_t *h)
 {
-    int i;
+    size_t i;
     uint32_t sum = 0;
     for(i = len - 1; i > 0; i--)
     {
@@ -69,63 +67,72 @@ buzhash(const unsigned char *data, int len, const uint32_t *h)
 }
 
 static uint32_t
-buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, int len, const uint32_t *h)
+buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
 {
     return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], len) ^ h[add];
 }
 
 typedef struct {
-    PyObject_HEAD
-    int window_size, last, done, buf_size, remaining, position, chunk_mask, min_size;
-    size_t bytes_read, bytes_yielded;
-    uint32_t *h;
-    PyObject *chunks, *fd;
-    unsigned char *data;
-} ChunkifyIter;
+    int window_size, chunk_mask, min_size;
+    size_t buf_size;
+    uint32_t *table;
+    uint8_t *data;
+    PyObject *fd;
+    int done;
+    size_t remaining, bytes_read, bytes_yielded, position, last;
+} Chunker;
 
-static PyObject*
-ChunkifyIter_iter(PyObject *self)
+static Chunker *
+chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
 {
-    ChunkifyIter *c = (ChunkifyIter *)self;
-    c->remaining = 0;
-    c->position = 0;
+    Chunker *c = malloc(sizeof(Chunker));
+    c->window_size = window_size;
+    c->chunk_mask = chunk_mask;
+    c->min_size = min_size;
+    c->table = buzhash_init_table(seed);
+    c->buf_size = 10 * 1024 * 1024;
+    c->data = malloc(c->buf_size);
+    c->fd = fd;
+    Py_INCREF(fd);
     c->done = 0;
-    c->last = 0;
+    c->remaining = 0;
     c->bytes_read = 0;
     c->bytes_yielded = 0;
-    Py_INCREF(self);
-    return self;
+    c->position = 0;
+    c->last = 0;
+    return c;
 }
 
 static void
-ChunkifyIter_dealloc(PyObject *self)
+chunker_free(Chunker *c)
 {
-    ChunkifyIter *c = (ChunkifyIter *)self;
     Py_DECREF(c->fd);
+    free(c->table);
     free(c->data);
-    free(c->h);
-    self->ob_type->tp_free(self);
+    free(c);
 }
 
-static void
-ChunkifyIter_fill(PyObject *self)
+static int
+chunker_fill(Chunker *c)
 {
-    ChunkifyIter *c = (ChunkifyIter *)self;
     memmove(c->data, c->data + c->last, c->position + c->remaining - c->last);
     c->position -= c->last;
     c->last = 0;
     PyObject *data = PyObject_CallMethod(c->fd, "read", "i", c->buf_size - c->position - c->remaining);
+    if(!data) {
+        return 0;
+    }
     int n = PyString_Size(data);
     memcpy(c->data + c->position + c->remaining, PyString_AsString(data), n);
     c->remaining += n;
     c->bytes_read += n;
     Py_DECREF(data);
+    return 1;
 }
 
-static PyObject*
-ChunkifyIter_iternext(PyObject *self)
+static PyObject *
+chunker_process(Chunker *c)
 {
-    ChunkifyIter *c = (ChunkifyIter *)self;
     uint32_t sum, chunk_mask = c->chunk_mask, min_size = c->min_size, window_size = c->window_size;
     int n = 0;
 
@@ -137,7 +144,9 @@ ChunkifyIter_iternext(PyObject *self)
         return NULL;
     }
     if(c->remaining <= window_size) {
-        ChunkifyIter_fill(self);
+        if(!chunker_fill(c)) {
+            return NULL;
+        }
     }
     if(c->remaining < window_size) {
         c->done = 1;
@@ -153,16 +162,18 @@ ChunkifyIter_iternext(PyObject *self)
             return NULL;
         }
     }
-    sum = buzhash(c->data + c->position, window_size, c->h);
+    sum = buzhash(c->data + c->position, window_size, c->table);
     while(c->remaining >= c->window_size && ((sum & chunk_mask) || n < min_size)) {
         sum = buzhash_update(sum, c->data[c->position],
                              c->data[c->position + window_size],
-                             window_size, c->h);
+                             window_size, c->table);
         c->position++;
         c->remaining--;
         n++;
         if(c->remaining <= window_size) {
-            ChunkifyIter_fill(self);
+            if(!chunker_fill(c)) {
+                return NULL;
+            }
         }
     }
     if(c->remaining <= window_size) {
@@ -174,117 +185,5 @@ ChunkifyIter_iternext(PyObject *self)
     n = c->last - old_last;
     c->bytes_yielded += n;
     return PyBuffer_FromMemory(c->data + old_last, n);
-}
-
-static PyTypeObject ChunkifyIterType = {
-    PyObject_HEAD_INIT(NULL)
-    0,                         /*ob_size*/
-    "_chunkifier._ChunkifyIter",       /*tp_name*/
-    sizeof(ChunkifyIter),       /*tp_basicsize*/
-    0,                         /*tp_itemsize*/
-    ChunkifyIter_dealloc,      /*tp_dealloc*/
-    0,                         /*tp_print*/
-    0,                         /*tp_getattr*/
-    0,                         /*tp_setattr*/
-    0,                         /*tp_compare*/
-    0,                         /*tp_repr*/
-    0,                         /*tp_as_number*/
-    0,                         /*tp_as_sequence*/
-    0,                         /*tp_as_mapping*/
-    0,                         /*tp_hash */
-    0,                         /*tp_call*/
-    0,                         /*tp_str*/
-    0,                         /*tp_getattro*/
-    0,                         /*tp_setattro*/
-    0,                         /*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER,
-      /* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to
-         use tp_iter and tp_iternext fields. */
-    "",           /* tp_doc */
-    0,  /* tp_traverse */
-    0,  /* tp_clear */
-    0,  /* tp_richcompare */
-    0,  /* tp_weaklistoffset */
-    ChunkifyIter_iter,  /* tp_iter: __iter__() method */
-    ChunkifyIter_iternext  /* tp_iternext: next() method */
-};
-
-static PyObject *
-chunkify(PyObject *self, PyObject *args)
-{
-    PyObject *fd;
-    int seed, window_size, chunk_mask, min_size;
-    ChunkifyIter *c;
-
-    if (!PyArg_ParseTuple(args, "Oiiii", &fd, &window_size, &chunk_mask, &min_size, &seed))
-    {
-        return NULL;
-    }
-    if (!(c = PyObject_New(ChunkifyIter, &ChunkifyIterType)))
-    {
-        return NULL;
-    }
-    PyObject_Init((PyObject *)c, &ChunkifyIterType);
-    c->buf_size = 10 * 1024 * 1024;
-    c->data = malloc(c->buf_size);
-    c->h = init_buzhash_table(seed & 0xffffffff);
-    c->fd = fd;
-    c->window_size = window_size;
-    c->chunk_mask = chunk_mask;
-    c->min_size = min_size;
-    Py_INCREF(fd);
-    return (PyObject *)c;
-}
-
-static PyObject *
-do_buzhash(PyObject *self, PyObject *args)
-{
-    unsigned char *data;
-    int len;
-    unsigned long int seed, sum;
-    uint32_t *h;
-
-    if (!PyArg_ParseTuple(args, "s#k", &data, &len, &seed))
-    {
-        return NULL;
-    }
-    h = init_buzhash_table(seed & 0xffffffff);
-    sum = buzhash(data, len, h);
-    free(h);
-    return PyLong_FromUnsignedLong(sum);
-}
-
-static PyObject *
-do_buzhash_update(PyObject *self, PyObject *args)
-{
-    unsigned long int sum, seed;
-    unsigned char remove, add;
-    uint32_t *h;
-    int len;
-
-    if (!PyArg_ParseTuple(args, "kbbik", &sum, &remove, &add, &len, &seed))
-    {
-        return NULL;
-    }
-    h = init_buzhash_table(seed & 0xffffffff);
-    sum = buzhash_update(sum, remove, add, len, h);
-    free(h);
-    return PyLong_FromUnsignedLong(sum);
-}
-
-
-static PyMethodDef ChunkifierMethods[] = {
-    {"chunkify",  chunkify, METH_VARARGS, ""},
-    {"buzhash",   do_buzhash, METH_VARARGS, ""},
-    {"buzhash_update",   do_buzhash_update, METH_VARARGS, ""},
-    {NULL, NULL, 0, NULL}        /* Sentinel */
-};
-
-PyMODINIT_FUNC
-init_speedups(void)
-{
-  ChunkifyIterType.tp_new = PyType_GenericNew;
-  if (PyType_Ready(&ChunkifyIterType) < 0)  return;
-
-  Py_InitModule("_speedups", ChunkifierMethods);
-}
+    
+}

+ 32 - 12
darc/_hashindex.c

@@ -9,8 +9,6 @@
 #include <unistd.h>
 #include <sys/mman.h>
 
-#include "hashindex.h"
-
 typedef struct {
     char magic[8];
     int32_t num_entries;
@@ -19,6 +17,18 @@ typedef struct {
     int8_t  value_size;
 } __attribute__((__packed__)) HashHeader;
 
+typedef struct {
+    char *path;
+    void *map_addr;
+    off_t map_length;
+    void *buckets;
+    int num_entries;
+    int num_buckets;
+    int key_size;
+    int value_size;
+    int bucket_size;
+    int limit;
+} HashIndex;
 
 #define MAGIC "DARCHASH"
 #define EMPTY ((int32_t)-1)
@@ -33,6 +43,16 @@ typedef struct {
 
 #define BUCKET_MARK_DELETED(index, idx) (*((int32_t *)(BUCKET_ADDR_WRITE(index, idx) + index->key_size)) = DELETED)
 
+static HashIndex *hashindex_open(const char *path);
+static void hashindex_close(HashIndex *index);
+static void hashindex_clear(HashIndex *index);
+static void hashindex_flush(HashIndex *index);
+static HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size);
+static const void *hashindex_get(HashIndex *index, const void *key);
+static void hashindex_set(HashIndex *index, const void *key, const void *value);
+static void hashindex_delete(HashIndex *index, const void *key);
+static void *hashindex_next_key(HashIndex *index, const void *key);
+
 
 /* Private API */
 static int
@@ -97,7 +117,7 @@ hashindex_resize(HashIndex *index, int capacity)
 }
 
 /* Public API */
-HashIndex *
+static HashIndex *
 hashindex_open(const char *path)
 {
     int fd = open(path, O_RDWR);
@@ -127,7 +147,7 @@ hashindex_open(const char *path)
     return index;
 }
 
-HashIndex *
+static HashIndex *
 hashindex_create(const char *path, int capacity, int key_size, int value_size)
 {
     FILE *fd;
@@ -160,7 +180,7 @@ error:
     return NULL;
 }
 
-void
+static void
 hashindex_clear(HashIndex *index)
 {
     int i;
@@ -171,7 +191,7 @@ hashindex_clear(HashIndex *index)
     hashindex_resize(index, 16);
 }
 
-void
+static void
 hashindex_flush(HashIndex *index)
 {
     *((int32_t *)(index->map_addr + 8)) = index->num_entries;
@@ -179,7 +199,7 @@ hashindex_flush(HashIndex *index)
     msync(index->map_addr, index->map_length, MS_SYNC);
 }
 
-void
+static void
 hashindex_close(HashIndex *index)
 {
     hashindex_flush(index);
@@ -188,7 +208,7 @@ hashindex_close(HashIndex *index)
     free(index);
 }
 
-const void *
+static const void *
 hashindex_get(HashIndex *index, const void *key)
 {
     int idx = hashindex_lookup(index, key);
@@ -198,7 +218,7 @@ hashindex_get(HashIndex *index, const void *key)
     return BUCKET_ADDR_READ(index, idx) + index->key_size;
 }
 
-void
+static void
 hashindex_set(HashIndex *index, const void *key, const void *value)
 {
     int idx = hashindex_lookup(index, key);
@@ -223,7 +243,7 @@ hashindex_set(HashIndex *index, const void *key, const void *value)
     }
 }
 
-void
+static void
 hashindex_delete(HashIndex *index, const void *key)
 {
     int idx = hashindex_lookup(index, key);
@@ -234,7 +254,7 @@ hashindex_delete(HashIndex *index, const void *key)
     index->num_entries -= 1;
 }
 
-void *
+static void *
 hashindex_next_key(HashIndex *index, const void *key)
 {
     int idx = 0;
@@ -251,7 +271,7 @@ hashindex_next_key(HashIndex *index, const void *key)
     return BUCKET_ADDR_READ(index, idx);
 }
 
-int
+static int
 hashindex_get_size(HashIndex *index)
 {
     return index->num_entries;

+ 1 - 1
darc/archive.py

@@ -11,7 +11,7 @@ import time
 from cStringIO import StringIO
 from xattr import xattr, XATTR_NOFOLLOW
 
-from ._speedups import chunkify
+from .chunker import chunkify
 from .helpers import uid2user, user2uid, gid2group, group2gid, \
     encode_filename, Statistics
 

+ 48 - 0
darc/chunker.pyx

@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+
+from libc.stdlib cimport free
+
+cdef extern from "_chunker.c":
+    ctypedef int uint32_t
+    ctypedef struct Chunker:
+        pass
+    Chunker *chunker_init(object fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
+    void chunker_free(Chunker *chunker)
+    object chunker_process(Chunker *chunker)
+    uint32_t *buzhash_init_table(uint32_t seed)
+    uint32_t c_buzhash "buzhash"(const unsigned char *data, size_t len, const uint32_t *h)
+    uint32_t c_buzhash_update  "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
+
+
+cdef class chunkify:
+    cdef Chunker *chunker
+
+    def __cinit__(self, fd, window_size, chunk_mask, min_size, seed):
+        self.chunker = chunker_init(fd, window_size, chunk_mask, min_size, seed & 0xffffffff)
+
+    def __dealloc__(self):
+        if self.chunker:
+            chunker_free(self.chunker)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        return chunker_process(self.chunker)
+
+
+def buzhash(unsigned char *data, unsigned long seed):
+    cdef uint32_t *table
+    cdef uint32_t sum
+    table = buzhash_init_table(seed & 0xffffffff)
+    sum = c_buzhash(data, len(data), table)
+    free(table)
+    return sum
+
+
+def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
+    cdef uint32_t *table
+    table = buzhash_init_table(seed & 0xffffffff)
+    sum = c_buzhash_update(sum, remove, add, len, table)
+    free(table)
+    return sum

+ 0 - 28
darc/hashindex.h

@@ -1,28 +0,0 @@
-#ifndef __HASHINDEX_H__
-#define __HASHINDEX_H__
-
-typedef struct {
-    char *path;
-    void *map_addr;
-    off_t map_length;
-    void *buckets;
-    int num_entries;
-    int num_buckets;
-    int key_size;
-    int value_size;
-    int bucket_size;
-    int limit;
-} HashIndex;
-
-HashIndex *hashindex_open(const char *path);
-void hashindex_close(HashIndex *index);
-void hashindex_clear(HashIndex *index);
-void hashindex_flush(HashIndex *index);
-HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size);
-const void *hashindex_get(HashIndex *index, const void *key);
-void hashindex_set(HashIndex *index, const void *key, const void *value);
-void hashindex_delete(HashIndex *index, const void *key);
-void *hashindex_next_key(HashIndex *index, const void *key);
-int hashindex_get_size(HashIndex *index);
-
-#endif

+ 2 - 1
darc/hashindex.pyx

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-cdef extern from "hashindex.h":
+cdef extern from "_hashindex.c":
     ctypedef struct HashIndex:
         pass
 
@@ -15,6 +15,7 @@ cdef extern from "hashindex.h":
     void hashindex_delete(HashIndex *index, void *key)
     void hashindex_set(HashIndex *index, void *key, void *value)
 
+
 _NoDefault = object()
 
 cdef class IndexBase:

+ 1 - 0
darc/remote.py

@@ -67,6 +67,7 @@ class RemoteStore(object):
             self.name = name
 
     def __init__(self, location, create=False):
+        self.p = None
         self.cache = LRUCache(256)
         self.to_send = ''
         self.extra = {}

+ 1 - 1
darc/test.py

@@ -11,7 +11,7 @@ import unittest
 from xattr import xattr, XATTR_NOFOLLOW
 
 from . import helpers, lrucache
-from ._speedups import buzhash, buzhash_update, chunkify
+from .chunker import chunkify, buzhash, buzhash_update
 from .archiver import Archiver
 from .key import suite as KeySuite
 from .store import Store, suite as StoreSuite

+ 3 - 4
setup.py

@@ -23,7 +23,6 @@ except ImportError:
 from distutils.core import setup
 from distutils.extension import Extension
 from distutils.command.sdist import sdist
-hashindex_sources = ['darc/hashindex.pyx', 'darc/_hashindex.c']
 
 try:
     from Cython.Distutils import build_ext
@@ -57,8 +56,8 @@ setup(name='darc',
       packages=['darc'],
       cmdclass={'build_ext': build_ext, 'sdist': Sdist},
       ext_modules=[
-      Extension('darc._speedups', ['darc/_speedups.c']),
-      Extension('darc.hashindex', hashindex_sources)],
-      scripts = ['scripts/darc'],
+      Extension('darc.chunker', ['darc/chunker.pyx']),
+      Extension('darc.hashindex', ['darc/hashindex.pyx'])],
+      scripts=['scripts/darc'],
     )