Selaa lähdekoodia

misc chunker parameter changes

- use power-of-2 sizes and an N-bit hash mask so the parameters are easier to specify
- chunker api: take the seed first, so that *chunker_params can be passed after it
- adapt some tests whose sizes cannot be expressed as 2^N
- make all-zero chunk detection in sparse file extraction work with a variable maximum chunk size
Thomas Waldmann 10 vuotta sitten
vanhempi
sitoutus
54e8dd8419
6 muutettua tiedostoa jossa 35 lisäystä ja 30 poistoa
  1. 10 11
      borg/archive.py
  2. 2 2
      borg/archiver.py
  3. 5 2
      borg/chunker.pyx
  4. 2 2
      borg/testsuite/archiver.py
  5. 13 13
      borg/testsuite/chunker.py
  6. 3 0
      docs/usage.rst

+ 10 - 11
borg/archive.py

@@ -22,13 +22,13 @@ from .helpers import parse_timestamp, Error, uid2user, user2uid, gid2group, grou
 
 ITEMS_BUFFER = 1024 * 1024
 
-CHUNK_MIN = 1024
-CHUNK_MAX = 10 * 1024 * 1024
-WINDOW_SIZE = 0xfff
-CHUNK_MASK = 0xffff
-CHUNKER_PARAMS = (WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX)
+CHUNK_MIN_EXP = 10  # 2**10 == 1kiB
+CHUNK_MAX_EXP = 23  # 2**23 == 8MiB
+HASH_WINDOW_SIZE = 0xfff  # 4095B
+HASH_MASK_BITS = 16  # results in ~64kiB chunks statistically
 
-ZEROS = b'\0' * CHUNK_MAX
+# defaults, use --chunker-params to override
+CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
 
 utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
 utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
@@ -76,8 +76,7 @@ class ChunkBuffer:
         self.packer = msgpack.Packer(unicode_errors='surrogateescape')
         self.chunks = []
         self.key = key
-        chunker_params += (self.key.chunk_seed, )
-        self.chunker = Chunker(*chunker_params)
+        self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
 
     def add(self, item):
         self.buffer.write(self.packer.pack(StableDict(item)))
@@ -147,8 +146,7 @@ class Archive:
         self.pipeline = DownloadPipeline(self.repository, self.key)
         if create:
             self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats, chunker_params)
-            chunker_params += (self.key.chunk_seed, )
-            self.chunker = Chunker(*chunker_params)
+            self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
             self.last_checkpoint = time.time()
@@ -163,6 +161,7 @@ class Archive:
                 raise self.DoesNotExist(name)
             info = self.manifest.archives[name]
             self.load(info[b'id'])
+            self.zeros = b'\0' * (1 << chunker_params[1])
 
     def _load_meta(self, id):
         data = self.key.decrypt(id, self.repository.get(id))
@@ -291,7 +290,7 @@ class Archive:
                 with open(path, 'wb') as fd:
                     ids = [c[0] for c in item[b'chunks']]
                     for data in self.pipeline.fetch_many(ids, is_preloaded=True):
-                        if sparse and ZEROS.startswith(data):
+                        if sparse and self.zeros.startswith(data):
                             # all-zero chunk: create a hole in a sparse file
                             fd.seek(len(data), 1)
                         else:

+ 2 - 2
borg/archiver.py

@@ -628,8 +628,8 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
                                     'alternatively, give a reference file/directory.')
         subparser.add_argument('--chunker-params', dest='chunker_params',
                                type=ChunkerParams, default=CHUNKER_PARAMS,
-                               metavar='WINDOW_SIZE,CHUNK_MASK,CHUNK_MIN,CHUNK_MAX',
-                               help='specify the chunker parameters. default: %r' % (CHUNKER_PARAMS, ))
+                               metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE',
+                               help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
         subparser.add_argument('archive', metavar='ARCHIVE',
                                type=location_validator(archive=True),
                                help='archive to create')

+ 5 - 2
borg/chunker.pyx

@@ -20,8 +20,11 @@ cdef extern from "_chunker.c":
 cdef class Chunker:
     cdef _Chunker *chunker
 
-    def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
-        self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
+    def __cinit__(self, seed, chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size):
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        hash_mask = (1 << hash_mask_bits) - 1
+        self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
 
     def chunkify(self, fd, fh=-1):
         """

+ 2 - 2
borg/testsuite/archiver.py

@@ -12,7 +12,7 @@ import unittest
 from hashlib import sha256
 
 from .. import xattr
-from ..archive import Archive, ChunkBuffer, CHUNK_MAX
+from ..archive import Archive, ChunkBuffer, CHUNK_MAX_EXP
 from ..archiver import Archiver
 from ..cache import Cache
 from ..crypto import bytes_to_long, num_aes_blocks
@@ -213,7 +213,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         sparse_support = sys.platform != 'darwin'
         filename = os.path.join(self.input_path, 'sparse')
         content = b'foobar'
-        hole_size = 5 * CHUNK_MAX  # 5 full chunker buffers
+        hole_size = 5 * (1 << CHUNK_MAX_EXP)  # 5 full chunker buffers
         with open(filename, 'wb') as fd:
             # create a file that has a hole at the beginning and end (if the
             # OS and filesystem supports sparse files)

+ 13 - 13
borg/testsuite/chunker.py

@@ -1,27 +1,27 @@
 from io import BytesIO
 
 from ..chunker import Chunker, buzhash, buzhash_update
-from ..archive import CHUNK_MAX
+from ..archive import CHUNK_MAX_EXP
 from . import BaseTestCase
 
 
 class ChunkerTestCase(BaseTestCase):
 
     def test_chunkify(self):
-        data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
-        parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
+        data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
+        parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
+        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
 
     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)

+ 3 - 0
docs/usage.rst

@@ -50,6 +50,9 @@ Examples
     NAME="root-`date +%Y-%m-%d`"
     $ borg create /mnt/backup::$NAME / --do-not-cross-mountpoints
 
+    # Back up huge files with little chunk management overhead
+    $ borg create --chunker-params 19,23,21,4095 /mnt/backup::VMs /srv/VMs
+
 
 .. include:: usage/extract.rst.inc