瀏覽代碼

Reuse chunker buffer between files.

Jonas Borgström 11 年之前
父節點
當前提交
9f64e39d9f
共有 5 個文件被更改,包括 43 次插入和 29 次删除
  1. 10 4
      attic/_chunker.c
  2. 5 3
      attic/archive.py
  3. 14 9
      attic/chunker.pyx
  4. 2 1
      attic/helpers.py
  5. 12 12
      attic/testsuite/chunker.py

+ 10 - 4
attic/_chunker.c

@@ -85,15 +85,22 @@ typedef struct {
 } Chunker;
 
 static Chunker *
-chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
+chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
 {
-    Chunker *c = malloc(sizeof(Chunker));
+    Chunker *c = calloc(sizeof(Chunker), 1);
     c->window_size = window_size;
     c->chunk_mask = chunk_mask;
     c->min_size = min_size;
     c->table = buzhash_init_table(seed);
     c->buf_size = 10 * 1024 * 1024;
     c->data = malloc(c->buf_size);
+    return c;
+}
+
+static void
+chunker_set_fd(Chunker *c, PyObject *fd)
+{
+    Py_XDECREF(c->fd);
     c->fd = fd;
     Py_INCREF(fd);
     c->done = 0;
@@ -103,13 +110,12 @@ chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32
     c->position = 0;
     c->last = 0;
     c->eof = 0;
-    return c;
 }
 
 static void
 chunker_free(Chunker *c)
 {
-    Py_DECREF(c->fd);
+    Py_XDECREF(c->fd);
     free(c->table);
     free(c->data);
     free(c);

+ 5 - 3
attic/archive.py

@@ -15,7 +15,7 @@ import time
 from io import BytesIO
 from attic import xattr
 from attic.platform import acl_get, acl_set
-from attic.chunker import chunkify
+from attic.chunker import Chunker
 from attic.hashindex import ChunkIndex
 from attic.helpers import Error, uid2user, user2uid, gid2group, group2gid, \
     Manifest, Statistics, decode_dict, st_mtime_ns, make_path_safe, StableDict, int_to_bigint, bigint_to_int
@@ -65,6 +65,7 @@ class ChunkBuffer:
         self.packer = msgpack.Packer(unicode_errors='surrogateescape')
         self.chunks = []
         self.key = key
+        self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
 
     def add(self, item):
         self.buffer.write(self.packer.pack(StableDict(item)))
@@ -78,7 +79,7 @@ class ChunkBuffer:
         if self.buffer.tell() == 0:
             return
         self.buffer.seek(0)
-        chunks = list(bytes(s) for s in chunkify(self.buffer, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed))
+        chunks = list(bytes(s) for s in self.chunker.chunkify(self.buffer))
         self.buffer.seek(0)
         self.buffer.truncate(0)
         # Leave the last partial chunk in the buffer unless flush is True
@@ -126,6 +127,7 @@ class Archive:
         self.numeric_owner = numeric_owner
         self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
         self.pipeline = DownloadPipeline(self.repository, self.key)
+        self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
         if create:
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
@@ -399,7 +401,7 @@ class Archive:
         if chunks is None:
             with open(path, 'rb') as fd:
                 chunks = []
-                for chunk in chunkify(fd, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed):
+                for chunk in self.chunker.chunkify(fd):
                     chunks.append(cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats))
             cache.memorize_file(path_hash, st, [c[0] for c in chunks])
         item = {b'path': safe_path, b'chunks': chunks}

+ 14 - 9
attic/chunker.pyx

@@ -1,26 +1,31 @@
 # -*- coding: utf-8 -*-
 
-API_VERSION = 1
+API_VERSION = 2
 
 from libc.stdlib cimport free
 
 cdef extern from "_chunker.c":
     ctypedef int uint32_t
-    ctypedef struct Chunker:
+    ctypedef struct _Chunker "Chunker":
         pass
-    Chunker *chunker_init(object fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
-    void chunker_free(Chunker *chunker)
-    object chunker_process(Chunker *chunker)
+    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
+    void chunker_set_fd(_Chunker *chunker, object fd)
+    void chunker_free(_Chunker *chunker)
+    object chunker_process(_Chunker *chunker)
     uint32_t *buzhash_init_table(uint32_t seed)
     uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
     uint32_t c_buzhash_update  "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
 
 
-cdef class chunkify:
-    cdef Chunker *chunker
+cdef class Chunker:
+    cdef _Chunker *chunker
 
-    def __cinit__(self, fd, window_size, chunk_mask, min_size, seed):
-        self.chunker = chunker_init(fd, window_size, chunk_mask, min_size, seed & 0xffffffff)
+    def __cinit__(self, window_size, chunk_mask, min_size, seed):
+        self.chunker = chunker_init(window_size, chunk_mask, min_size, seed & 0xffffffff)
+
+    def chunkify(self, fd):
+        chunker_set_fd(self.chunker, fd)
+        return self
 
     def __dealloc__(self):
         if self.chunker:

+ 2 - 1
attic/helpers.py

@@ -74,7 +74,7 @@ class UpgradableLock:
 def check_extension_modules():
     import attic.platform
     if (attic.hashindex.API_VERSION != 2 or
-        attic.chunker.API_VERSION != 1 or
+        attic.chunker.API_VERSION != 2 or
         attic.crypto.API_VERSION != 2 or
         attic.platform.API_VERSION != 2):
         raise ExtensionModuleError
@@ -577,3 +577,4 @@ def int_to_bigint(value):
     if value.bit_length() > 63:
         return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True)
     return value
+

+ 12 - 12
attic/testsuite/chunker.py

@@ -1,4 +1,4 @@
-from attic.chunker import chunkify, buzhash, buzhash_update
+from attic.chunker import Chunker, buzhash, buzhash_update
 from attic.testsuite import AtticTestCase
 from io import BytesIO
 
@@ -7,19 +7,19 @@ class ChunkerTestCase(AtticTestCase):
 
     def test_chunkify(self):
         data = b'0' * 1024 * 1024 * 15 + b'Y'
-        parts = [bytes(c) for c in chunkify(BytesIO(data), 2, 0x3, 2, 0)]
+        parts = [bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(data))]
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b''), 2, 0x3, 2, 0)], [])
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 0)], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 1)], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 2)], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 0)], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 1)], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 2)], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 0)], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 1)], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 2)], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b''))], [])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
 
     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)