소스 검색

PR #284 - Merge branch 'sparse_files' into merge

Thomas Waldmann 10년 전
부모
커밋
b6ed1c742b
5개의 파일이 변경되었으며, 63줄이 추가되고 21줄이 삭제되었습니다
  1. 2 2
      attic/_chunker.c
  2. 12 3
      attic/archive.py
  3. 3 3
      attic/chunker.pyx
  4. 33 1
      attic/testsuite/archiver.py
  5. 13 12
      attic/testsuite/chunker.py

+ 2 - 2
attic/_chunker.c

@@ -87,14 +87,14 @@ typedef struct {
 } Chunker;
 
 static Chunker *
-chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
+chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
 {
     Chunker *c = calloc(sizeof(Chunker), 1);
     c->window_size = window_size;
     c->chunk_mask = chunk_mask;
     c->min_size = min_size;
     c->table = buzhash_init_table(seed);
-    c->buf_size = 10 * 1024 * 1024;
+    c->buf_size = max_size;
     c->data = malloc(c->buf_size);
     c->read_buf = malloc(c->buf_size);
     return c;

+ 12 - 3
attic/archive.py

@@ -22,9 +22,12 @@ from attic.helpers import Error, uid2user, user2uid, gid2group, group2gid, \
 
 ITEMS_BUFFER = 1024 * 1024
 CHUNK_MIN = 1024
+CHUNK_MAX = 10 * 1024 * 1024
 WINDOW_SIZE = 0xfff
 CHUNK_MASK = 0xffff
 
+ZEROS = b'\0' * CHUNK_MAX
+
 utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
 utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
 has_mtime_ns = sys.version >= '3.3'
@@ -71,7 +74,7 @@ class ChunkBuffer:
         self.packer = msgpack.Packer(unicode_errors='surrogateescape')
         self.chunks = []
         self.key = key
-        self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
+        self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX,self.key.chunk_seed)
 
     def add(self, item):
         self.buffer.write(self.packer.pack(StableDict(item)))
@@ -136,7 +139,7 @@ class Archive:
         self.pipeline = DownloadPipeline(self.repository, self.key)
         if create:
             self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
-            self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
+            self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
             self.last_checkpoint = time.time()
@@ -285,7 +288,13 @@ class Archive:
                 with open(path, 'wb') as fd:
                     ids = [c[0] for c in item[b'chunks']]
                     for data in self.pipeline.fetch_many(ids, is_preloaded=True):
-                        fd.write(data)
+                        if ZEROS.startswith(data):
+                            # all-zero chunk: create a hole in a sparse file
+                            fd.seek(len(data), 1)
+                        else:
+                            fd.write(data)
+                    pos = fd.tell()
+                    fd.truncate(pos)
                     fd.flush()
                     self.restore_attrs(path, item, fd=fd.fileno())
         elif stat.S_ISFIFO(mode):

+ 3 - 3
attic/chunker.pyx

@@ -8,7 +8,7 @@ cdef extern from "_chunker.c":
     ctypedef int uint32_t
     ctypedef struct _Chunker "Chunker":
         pass
-    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
+    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
     void chunker_set_fd(_Chunker *chunker, object f, int fd)
     void chunker_free(_Chunker *chunker)
     object chunker_process(_Chunker *chunker)
@@ -20,8 +20,8 @@ cdef extern from "_chunker.c":
 cdef class Chunker:
     cdef _Chunker *chunker
 
-    def __cinit__(self, window_size, chunk_mask, min_size, seed):
-        self.chunker = chunker_init(window_size, chunk_mask, min_size, seed & 0xffffffff)
+    def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
+        self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
 
     def chunkify(self, fd, fh=-1):
         """

+ 33 - 1
attic/testsuite/archiver.py

@@ -11,7 +11,7 @@ import time
 import unittest
 from hashlib import sha256
 from attic import xattr
-from attic.archive import Archive, ChunkBuffer
+from attic.archive import Archive, ChunkBuffer, CHUNK_MAX
 from attic.archiver import Archiver
 from attic.cache import Cache
 from attic.crypto import bytes_to_long, num_aes_blocks
@@ -206,6 +206,38 @@ class ArchiverTestCase(ArchiverTestCaseBase):
             config.write(fd)
         return Repository(self.repository_path).id
 
+    def test_sparse_file(self):
+        filename = os.path.join(self.input_path, 'sparse')
+        content = b'foobar'
+        hole_size = 5 * CHUNK_MAX  # 5 full chunker buffers
+        with open(filename, 'wb') as fd:
+            # create a file that has a hole at the beginning and end
+            fd.seek(hole_size, 1)
+            fd.write(content)
+            fd.seek(hole_size, 1)
+            pos = fd.tell()
+            fd.truncate(pos)
+        total_len = hole_size + len(content) + hole_size
+        st = os.stat(filename)
+        self.assert_equal(st.st_size, total_len)
+        if hasattr(st, 'st_blocks'):
+            self.assert_true(st.st_blocks * 512 < total_len / 10)  # is input sparse?
+        self.attic('init', self.repository_location)
+        self.attic('create', self.repository_location + '::test', 'input')
+        with changedir('output'):
+            self.attic('extract', self.repository_location + '::test')
+        self.assert_dirs_equal('input', 'output/input')
+        filename = os.path.join(self.output_path, 'input', 'sparse')
+        with open(filename, 'rb') as fd:
+            # check if file contents are as expected
+            self.assert_equal(fd.read(hole_size), b'\0' * hole_size)
+            self.assert_equal(fd.read(len(content)), content)
+            self.assert_equal(fd.read(hole_size), b'\0' * hole_size)
+        st = os.stat(filename)
+        self.assert_equal(st.st_size, total_len)
+        if hasattr(st, 'st_blocks'):
+            self.assert_true(st.st_blocks * 512 < total_len / 10)  # is output sparse?
+
     def test_repository_swap_detection(self):
         self.create_test_files()
         os.environ['ATTIC_PASSPHRASE'] = 'passphrase'

+ 13 - 12
attic/testsuite/chunker.py

@@ -1,25 +1,26 @@
 from attic.chunker import Chunker, buzhash, buzhash_update
 from attic.testsuite import AtticTestCase
+from attic.archive import CHUNK_MAX
 from io import BytesIO
 
 
 class ChunkerTestCase(AtticTestCase):
 
     def test_chunkify(self):
-        data = b'0' * 1024 * 1024 * 15 + b'Y'
-        parts = [bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(data))]
+        data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
+        parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
 
     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)