Browse Source

Use Cython for all native code

Jonas Borgström 12 năm trước
mục cha
commit
3e5433855c
10 tập tin đã thay đổi với 137 bổ sung và 196 xóa
  1. 1 0
      .gitignore
  2. 48 149
      darc/_chunker.c
  3. 32 12
      darc/_hashindex.c
  4. 1 1
      darc/archive.py
  5. 48 0
      darc/chunker.pyx
  6. 0 28
      darc/hashindex.h
  7. 2 1
      darc/hashindex.pyx
  8. 1 0
      darc/remote.py
  9. 1 1
      darc/test.py
  10. 3 4
      setup.py

+ 1 - 0
.gitignore

@@ -3,6 +3,7 @@ build
 dist
 env
 hashindex.c
+chunker.c
 *.egg-info
 *.pyc
 *.pyo

+ 48 - 149
darc/_speedups.c → darc/_chunker.c

@@ -1,6 +1,4 @@
 #include <Python.h>
-#include <structmember.h>
-#include <stdint.h>
 
 /* Cyclic polynomial / buzhash: https://en.wikipedia.org/wiki/Rolling_hash */
 
@@ -44,7 +42,7 @@ static uint32_t table_base[] =
 
 
 static uint32_t *
-init_buzhash_table(uint32_t seed)
+buzhash_init_table(uint32_t seed)
 {
     int i;
     uint32_t *table = malloc(1024);
@@ -56,9 +54,9 @@ init_buzhash_table(uint32_t seed)
 }
 
 static uint32_t
-buzhash(const unsigned char *data, int len, const uint32_t *h)
+buzhash(const unsigned char *data, size_t len, const uint32_t *h)
 {
-    int i;
+    size_t i;
     uint32_t sum = 0;
     for(i = len - 1; i > 0; i--)
     {
@@ -69,63 +67,72 @@ buzhash(const unsigned char *data, int len, const uint32_t *h)
 }
 
 static uint32_t
-buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, int len, const uint32_t *h)
+buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
 {
     return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], len) ^ h[add];
 }
 
 typedef struct {
-    PyObject_HEAD
-    int window_size, last, done, buf_size, remaining, position, chunk_mask, min_size;
-    size_t bytes_read, bytes_yielded;
-    uint32_t *h;
-    PyObject *chunks, *fd;
-    unsigned char *data;
-} ChunkifyIter;
+    int window_size, chunk_mask, min_size;
+    size_t buf_size;
+    uint32_t *table;
+    uint8_t *data;
+    PyObject *fd;
+    int done;
+    size_t remaining, bytes_read, bytes_yielded, position, last;
+} Chunker;
 
-static PyObject*
-ChunkifyIter_iter(PyObject *self)
+static Chunker *
+chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
 {
-    ChunkifyIter *c = (ChunkifyIter *)self;
-    c->remaining = 0;
-    c->position = 0;
+    Chunker *c = malloc(sizeof(Chunker));
+    c->window_size = window_size;
+    c->chunk_mask = chunk_mask;
+    c->min_size = min_size;
+    c->table = buzhash_init_table(seed);
+    c->buf_size = 10 * 1024 * 1024;
+    c->data = malloc(c->buf_size);
+    c->fd = fd;
+    Py_INCREF(fd);
     c->done = 0;
-    c->last = 0;
+    c->remaining = 0;
     c->bytes_read = 0;
     c->bytes_yielded = 0;
-    Py_INCREF(self);
-    return self;
+    c->position = 0;
+    c->last = 0;
+    return c;
 }
 
 static void
-ChunkifyIter_dealloc(PyObject *self)
+chunker_free(Chunker *c)
 {
-    ChunkifyIter *c = (ChunkifyIter *)self;
     Py_DECREF(c->fd);
+    free(c->table);
     free(c->data);
-    free(c->h);
-    self->ob_type->tp_free(self);
+    free(c);
 }
 
-static void
-ChunkifyIter_fill(PyObject *self)
+static int
+chunker_fill(Chunker *c)
 {
-    ChunkifyIter *c = (ChunkifyIter *)self;
     memmove(c->data, c->data + c->last, c->position + c->remaining - c->last);
     c->position -= c->last;
     c->last = 0;
     PyObject *data = PyObject_CallMethod(c->fd, "read", "i", c->buf_size - c->position - c->remaining);
+    if(!data) {
+        return 0;
+    }
     int n = PyString_Size(data);
     memcpy(c->data + c->position + c->remaining, PyString_AsString(data), n);
     c->remaining += n;
     c->bytes_read += n;
     Py_DECREF(data);
+    return 1;
 }
 
-static PyObject*
-ChunkifyIter_iternext(PyObject *self)
+static PyObject *
+chunker_process(Chunker *c)
 {
-    ChunkifyIter *c = (ChunkifyIter *)self;
     uint32_t sum, chunk_mask = c->chunk_mask, min_size = c->min_size, window_size = c->window_size;
     int n = 0;
 
@@ -137,7 +144,9 @@ ChunkifyIter_iternext(PyObject *self)
         return NULL;
     }
     if(c->remaining <= window_size) {
-        ChunkifyIter_fill(self);
+        if(!chunker_fill(c)) {
+            return NULL;
+        }
     }
     if(c->remaining < window_size) {
         c->done = 1;
@@ -153,16 +162,18 @@ ChunkifyIter_iternext(PyObject *self)
             return NULL;
         }
     }
-    sum = buzhash(c->data + c->position, window_size, c->h);
+    sum = buzhash(c->data + c->position, window_size, c->table);
     while(c->remaining >= c->window_size && ((sum & chunk_mask) || n < min_size)) {
         sum = buzhash_update(sum, c->data[c->position],
                              c->data[c->position + window_size],
-                             window_size, c->h);
+                             window_size, c->table);
         c->position++;
         c->remaining--;
         n++;
         if(c->remaining <= window_size) {
-            ChunkifyIter_fill(self);
+            if(!chunker_fill(c)) {
+                return NULL;
+            }
         }
     }
     if(c->remaining <= window_size) {
@@ -174,117 +185,5 @@ ChunkifyIter_iternext(PyObject *self)
     n = c->last - old_last;
     c->bytes_yielded += n;
     return PyBuffer_FromMemory(c->data + old_last, n);
-}
-
-static PyTypeObject ChunkifyIterType = {
-    PyObject_HEAD_INIT(NULL)
-    0,                         /*ob_size*/
-    "_chunkifier._ChunkifyIter",       /*tp_name*/
-    sizeof(ChunkifyIter),       /*tp_basicsize*/
-    0,                         /*tp_itemsize*/
-    ChunkifyIter_dealloc,      /*tp_dealloc*/
-    0,                         /*tp_print*/
-    0,                         /*tp_getattr*/
-    0,                         /*tp_setattr*/
-    0,                         /*tp_compare*/
-    0,                         /*tp_repr*/
-    0,                         /*tp_as_number*/
-    0,                         /*tp_as_sequence*/
-    0,                         /*tp_as_mapping*/
-    0,                         /*tp_hash */
-    0,                         /*tp_call*/
-    0,                         /*tp_str*/
-    0,                         /*tp_getattro*/
-    0,                         /*tp_setattro*/
-    0,                         /*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER,
-      /* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to
-         use tp_iter and tp_iternext fields. */
-    "",           /* tp_doc */
-    0,  /* tp_traverse */
-    0,  /* tp_clear */
-    0,  /* tp_richcompare */
-    0,  /* tp_weaklistoffset */
-    ChunkifyIter_iter,  /* tp_iter: __iter__() method */
-    ChunkifyIter_iternext  /* tp_iternext: next() method */
-};
-
-static PyObject *
-chunkify(PyObject *self, PyObject *args)
-{
-    PyObject *fd;
-    int seed, window_size, chunk_mask, min_size;
-    ChunkifyIter *c;
-
-    if (!PyArg_ParseTuple(args, "Oiiii", &fd, &window_size, &chunk_mask, &min_size, &seed))
-    {
-        return NULL;
-    }
-    if (!(c = PyObject_New(ChunkifyIter, &ChunkifyIterType)))
-    {
-        return NULL;
-    }
-    PyObject_Init((PyObject *)c, &ChunkifyIterType);
-    c->buf_size = 10 * 1024 * 1024;
-    c->data = malloc(c->buf_size);
-    c->h = init_buzhash_table(seed & 0xffffffff);
-    c->fd = fd;
-    c->window_size = window_size;
-    c->chunk_mask = chunk_mask;
-    c->min_size = min_size;
-    Py_INCREF(fd);
-    return (PyObject *)c;
-}
-
-static PyObject *
-do_buzhash(PyObject *self, PyObject *args)
-{
-    unsigned char *data;
-    int len;
-    unsigned long int seed, sum;
-    uint32_t *h;
-
-    if (!PyArg_ParseTuple(args, "s#k", &data, &len, &seed))
-    {
-        return NULL;
-    }
-    h = init_buzhash_table(seed & 0xffffffff);
-    sum = buzhash(data, len, h);
-    free(h);
-    return PyLong_FromUnsignedLong(sum);
-}
-
-static PyObject *
-do_buzhash_update(PyObject *self, PyObject *args)
-{
-    unsigned long int sum, seed;
-    unsigned char remove, add;
-    uint32_t *h;
-    int len;
-
-    if (!PyArg_ParseTuple(args, "kbbik", &sum, &remove, &add, &len, &seed))
-    {
-        return NULL;
-    }
-    h = init_buzhash_table(seed & 0xffffffff);
-    sum = buzhash_update(sum, remove, add, len, h);
-    free(h);
-    return PyLong_FromUnsignedLong(sum);
-}
-
-
-static PyMethodDef ChunkifierMethods[] = {
-    {"chunkify",  chunkify, METH_VARARGS, ""},
-    {"buzhash",   do_buzhash, METH_VARARGS, ""},
-    {"buzhash_update",   do_buzhash_update, METH_VARARGS, ""},
-    {NULL, NULL, 0, NULL}        /* Sentinel */
-};
-
-PyMODINIT_FUNC
-init_speedups(void)
-{
-  ChunkifyIterType.tp_new = PyType_GenericNew;
-  if (PyType_Ready(&ChunkifyIterType) < 0)  return;
-
-  Py_InitModule("_speedups", ChunkifierMethods);
-}
+    
+}

+ 32 - 12
darc/_hashindex.c

@@ -9,8 +9,6 @@
 #include <unistd.h>
 #include <sys/mman.h>
 
-#include "hashindex.h"
-
 typedef struct {
     char magic[8];
     int32_t num_entries;
@@ -19,6 +17,18 @@ typedef struct {
     int8_t  value_size;
 } __attribute__((__packed__)) HashHeader;
 
+typedef struct {
+    char *path;
+    void *map_addr;
+    off_t map_length;
+    void *buckets;
+    int num_entries;
+    int num_buckets;
+    int key_size;
+    int value_size;
+    int bucket_size;
+    int limit;
+} HashIndex;
 
 #define MAGIC "DARCHASH"
 #define EMPTY ((int32_t)-1)
@@ -33,6 +43,16 @@ typedef struct {
 
 #define BUCKET_MARK_DELETED(index, idx) (*((int32_t *)(BUCKET_ADDR_WRITE(index, idx) + index->key_size)) = DELETED)
 
+static HashIndex *hashindex_open(const char *path);
+static void hashindex_close(HashIndex *index);
+static void hashindex_clear(HashIndex *index);
+static void hashindex_flush(HashIndex *index);
+static HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size);
+static const void *hashindex_get(HashIndex *index, const void *key);
+static void hashindex_set(HashIndex *index, const void *key, const void *value);
+static void hashindex_delete(HashIndex *index, const void *key);
+static void *hashindex_next_key(HashIndex *index, const void *key);
+
 
 /* Private API */
 static int
@@ -97,7 +117,7 @@ hashindex_resize(HashIndex *index, int capacity)
 }
 
 /* Public API */
-HashIndex *
+static HashIndex *
 hashindex_open(const char *path)
 {
     int fd = open(path, O_RDWR);
@@ -127,7 +147,7 @@ hashindex_open(const char *path)
     return index;
 }
 
-HashIndex *
+static HashIndex *
 hashindex_create(const char *path, int capacity, int key_size, int value_size)
 {
     FILE *fd;
@@ -160,7 +180,7 @@ error:
     return NULL;
 }
 
-void
+static void
 hashindex_clear(HashIndex *index)
 {
     int i;
@@ -171,7 +191,7 @@ hashindex_clear(HashIndex *index)
     hashindex_resize(index, 16);
 }
 
-void
+static void
 hashindex_flush(HashIndex *index)
 {
     *((int32_t *)(index->map_addr + 8)) = index->num_entries;
@@ -179,7 +199,7 @@ hashindex_flush(HashIndex *index)
     msync(index->map_addr, index->map_length, MS_SYNC);
 }
 
-void
+static void
 hashindex_close(HashIndex *index)
 {
     hashindex_flush(index);
@@ -188,7 +208,7 @@ hashindex_close(HashIndex *index)
     free(index);
 }
 
-const void *
+static const void *
 hashindex_get(HashIndex *index, const void *key)
 {
     int idx = hashindex_lookup(index, key);
@@ -198,7 +218,7 @@ hashindex_get(HashIndex *index, const void *key)
     return BUCKET_ADDR_READ(index, idx) + index->key_size;
 }
 
-void
+static void
 hashindex_set(HashIndex *index, const void *key, const void *value)
 {
     int idx = hashindex_lookup(index, key);
@@ -223,7 +243,7 @@ hashindex_set(HashIndex *index, const void *key, const void *value)
     }
 }
 
-void
+static void
 hashindex_delete(HashIndex *index, const void *key)
 {
     int idx = hashindex_lookup(index, key);
@@ -234,7 +254,7 @@ hashindex_delete(HashIndex *index, const void *key)
     index->num_entries -= 1;
 }
 
-void *
+static void *
 hashindex_next_key(HashIndex *index, const void *key)
 {
     int idx = 0;
@@ -251,7 +271,7 @@ hashindex_next_key(HashIndex *index, const void *key)
     return BUCKET_ADDR_READ(index, idx);
 }
 
-int
+static int
 hashindex_get_size(HashIndex *index)
 {
     return index->num_entries;

+ 1 - 1
darc/archive.py

@@ -11,7 +11,7 @@ import time
 from cStringIO import StringIO
 from xattr import xattr, XATTR_NOFOLLOW
 
-from ._speedups import chunkify
+from .chunker import chunkify
 from .helpers import uid2user, user2uid, gid2group, group2gid, \
     encode_filename, Statistics
 

+ 48 - 0
darc/chunker.pyx

@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+
+from libc.stdlib cimport free
+
+cdef extern from "_chunker.c":
+    ctypedef int uint32_t
+    ctypedef struct Chunker:
+        pass
+    Chunker *chunker_init(object fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
+    void chunker_free(Chunker *chunker)
+    object chunker_process(Chunker *chunker)
+    uint32_t *buzhash_init_table(uint32_t seed)
+    uint32_t c_buzhash "buzhash"(const unsigned char *data, size_t len, const uint32_t *h)
+    uint32_t c_buzhash_update  "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
+
+
+cdef class chunkify:
+    cdef Chunker *chunker
+
+    def __cinit__(self, fd, window_size, chunk_mask, min_size, seed):
+        self.chunker = chunker_init(fd, window_size, chunk_mask, min_size, seed & 0xffffffff)
+
+    def __dealloc__(self):
+        if self.chunker:
+            chunker_free(self.chunker)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        return chunker_process(self.chunker)
+
+
+def buzhash(unsigned char *data, unsigned long seed):
+    cdef uint32_t *table
+    cdef uint32_t sum
+    table = buzhash_init_table(seed & 0xffffffff)
+    sum = c_buzhash(data, len(data), table)
+    free(table)
+    return sum
+
+
+def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
+    cdef uint32_t *table
+    table = buzhash_init_table(seed & 0xffffffff)
+    sum = c_buzhash_update(sum, remove, add, len, table)
+    free(table)
+    return sum

+ 0 - 28
darc/hashindex.h

@@ -1,28 +0,0 @@
-#ifndef __HASHINDEX_H__
-#define __HASHINDEX_H__
-
-typedef struct {
-    char *path;
-    void *map_addr;
-    off_t map_length;
-    void *buckets;
-    int num_entries;
-    int num_buckets;
-    int key_size;
-    int value_size;
-    int bucket_size;
-    int limit;
-} HashIndex;
-
-HashIndex *hashindex_open(const char *path);
-void hashindex_close(HashIndex *index);
-void hashindex_clear(HashIndex *index);
-void hashindex_flush(HashIndex *index);
-HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size);
-const void *hashindex_get(HashIndex *index, const void *key);
-void hashindex_set(HashIndex *index, const void *key, const void *value);
-void hashindex_delete(HashIndex *index, const void *key);
-void *hashindex_next_key(HashIndex *index, const void *key);
-int hashindex_get_size(HashIndex *index);
-
-#endif

+ 2 - 1
darc/hashindex.pyx

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-cdef extern from "hashindex.h":
+cdef extern from "_hashindex.c":
     ctypedef struct HashIndex:
         pass
 
@@ -15,6 +15,7 @@ cdef extern from "hashindex.h":
     void hashindex_delete(HashIndex *index, void *key)
     void hashindex_set(HashIndex *index, void *key, void *value)
 
+
 _NoDefault = object()
 
 cdef class IndexBase:

+ 1 - 0
darc/remote.py

@@ -67,6 +67,7 @@ class RemoteStore(object):
             self.name = name
 
     def __init__(self, location, create=False):
+        self.p = None
         self.cache = LRUCache(256)
         self.to_send = ''
         self.extra = {}

+ 1 - 1
darc/test.py

@@ -11,7 +11,7 @@ import unittest
 from xattr import xattr, XATTR_NOFOLLOW
 
 from . import helpers, lrucache
-from ._speedups import buzhash, buzhash_update, chunkify
+from .chunker import chunkify, buzhash, buzhash_update
 from .archiver import Archiver
 from .key import suite as KeySuite
 from .store import Store, suite as StoreSuite

+ 3 - 4
setup.py

@@ -23,7 +23,6 @@ except ImportError:
 from distutils.core import setup
 from distutils.extension import Extension
 from distutils.command.sdist import sdist
-hashindex_sources = ['darc/hashindex.pyx', 'darc/_hashindex.c']
 
 try:
     from Cython.Distutils import build_ext
@@ -57,8 +56,8 @@ setup(name='darc',
       packages=['darc'],
       cmdclass={'build_ext': build_ext, 'sdist': Sdist},
       ext_modules=[
-      Extension('darc._speedups', ['darc/_speedups.c']),
-      Extension('darc.hashindex', hashindex_sources)],
-      scripts = ['scripts/darc'],
+      Extension('darc.chunker', ['darc/chunker.pyx']),
+      Extension('darc.hashindex', ['darc/hashindex.pyx'])],
+      scripts=['scripts/darc'],
     )