
Merge pull request #5620 from ThomasWaldmann/sparse-file-integr2

Sparse file support (integration)
commit 699256edbd

+ 43 - 15
src/borg/archive.py

@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, max_chunk_size
+from .chunker import get_chunker, Chunk
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -41,6 +41,7 @@ from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
 from .helpers import os_open, flags_normal, flags_dir
 from .helpers import msgpack
 from .helpers import sig_int
+from .lrucache import LRUCache
 from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
 from .item import Item, ArchiveItem, ItemDiff
 from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname
@@ -336,7 +337,10 @@ class ChunkBuffer:
         self.buffer.seek(0)
         # The chunker returns a memoryview to its internal buffer,
         # thus a copy is needed before resuming the chunker iterator.
-        chunks = list(bytes(s) for s in self.chunker.chunkify(self.buffer))
+        # note: this is the items metadata stream chunker; we will only get CH_DATA allocation here (because there are
+        #       no all-zero chunks in a metadata stream), thus chunk.data will always be bytes/memoryview and allocation
+        #       is always CH_DATA and never CH_ALLOC/CH_HOLE.
+        chunks = list(bytes(chunk.data) for chunk in self.chunker.chunkify(self.buffer))
         self.buffer.seek(0)
         self.buffer.truncate(0)
         # Leave the last partial chunk in the buffer unless flush is True
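
For orientation (editor's sketch, not part of the commit): after this change the chunkers yield Chunk namedtuples instead of raw buffers, so a consumer accesses chunk.data and copies it before resuming the iterator. A minimal sketch, assuming a chunker instance ck and a file-like object f:

    # illustrative only; in the items metadata stream every chunk is CH_DATA
    pieces = []
    for chunk in ck.chunkify(f):
        assert chunk.meta['allocation'] == CH_DATA
        pieces.append(bytes(chunk.data))  # bytes() copies out of the chunker's internal buffer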
@@ -422,7 +426,6 @@ class Archive:
             if info is None:
                 raise self.DoesNotExist(name)
             self.load(info.id)
-            self.zeros = None
 
     def _load_meta(self, id):
         data = self.key.decrypt(id, self.repository.get(id))
@@ -735,8 +738,6 @@ Utilization of max. archive size: {csize_max:.0%}
                                      hardlink_masters) as hardlink_set:
                 if hardlink_set:
                     return
-                if sparse and self.zeros is None:
-                    self.zeros = b'\0' * max_chunk_size(*self.chunker_params)
                 with backup_io('open'):
                     fd = open(path, 'wb')
                 with fd:
@@ -745,7 +746,7 @@ Utilization of max. archive size: {csize_max:.0%}
                         if pi:
                             pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                         with backup_io('write'):
-                            if sparse and self.zeros.startswith(data):
+                            if sparse and zeros.startswith(data):
                                 # all-zero chunk: create a hole in a sparse file
                                 fd.seek(len(data), 1)
                             else:
@@ -1089,6 +1090,32 @@ class MetadataCollector:
         return attrs
 
 
+# remember a few recently used all-zero chunk hashes in this mapping.
+# (hash_func, chunk_length) -> chunk_hash
+# we play safe and have the hash_func in the mapping key, in case we
+# have different hash_funcs within the same borg run.
+zero_chunk_ids = LRUCache(10, dispose=lambda _: None)
+
+
+def cached_hash(chunk, id_hash):
+    allocation = chunk.meta['allocation']
+    if allocation == CH_DATA:
+        data = chunk.data
+        chunk_id = id_hash(data)
+    elif allocation in (CH_HOLE, CH_ALLOC):
+        size = chunk.meta['size']
+        assert size <= len(zeros)
+        data = memoryview(zeros)[:size]
+        try:
+            chunk_id = zero_chunk_ids[(id_hash, size)]
+        except KeyError:
+            chunk_id = id_hash(data)
+            zero_chunk_ids[(id_hash, size)] = chunk_id
+    else:
+        raise ValueError('unexpected allocation type')
+    return chunk_id, data
+
+
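
A small illustration (editor's sketch, not in the commit) of what the LRU mapping buys: for CH_ALLOC/CH_HOLE chunks the id is computed once per (hash function, size) pair and then served from zero_chunk_ids. Using hashlib.sha256 as a stand-in for borg's keyed id_hash is an assumption made only for this sketch:

    from hashlib import sha256

    def id_hash(data):  # stand-in only; borg's real id_hash is a keyed MAC
        return sha256(data).digest()

    hole = Chunk(None, allocation=CH_HOLE, size=4096)
    id1, data1 = cached_hash(hole, id_hash)  # hashes 4096 zero bytes, caches the id
    id2, data2 = cached_hash(hole, id_hash)  # cache hit in zero_chunk_ids, no re-hashing
    assert id1 == id2 and bytes(data1) == b'\0' * 4096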
 class ChunksProcessor:
     # Processes an iterator of chunks for an Item
 
@@ -1133,8 +1160,9 @@ class ChunksProcessor:
 
     def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None):
         if not chunk_processor:
-            def chunk_processor(data):
-                chunk_entry = cache.add_chunk(self.key.id_hash(data), data, stats, wait=False)
+            def chunk_processor(chunk):
+                chunk_id, data = cached_hash(chunk, self.key.id_hash)
+                chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False)
                 self.cache.repository.async_response(wait=False)
                 return chunk_entry
 
@@ -1145,8 +1173,8 @@ class ChunksProcessor:
             del item.chunks_healthy
         from_chunk = 0
         part_number = 1
-        for data in chunk_iter:
-            item.chunks.append(chunk_processor(data))
+        for chunk in chunk_iter:
+            item.chunks.append(chunk_processor(chunk))
             if show_progress:
                 stats.show_progress(item=item, dt=0.2)
             from_chunk, part_number = self.maybe_checkpoint(item, from_chunk, part_number, forced=False)
@@ -1662,8 +1690,8 @@ class ArchiveChecker:
             If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one.
             """
             def replacement_chunk(size):
-                data = bytes(size)
-                chunk_id = self.key.id_hash(data)
+                chunk = Chunk(None, allocation=CH_ALLOC, size=size)
+                chunk_id, data = cached_hash(chunk, self.key.id_hash)
                 cdata = self.key.encrypt(data)
                 csize = len(cdata)
                 return chunk_id, size, csize, cdata
@@ -1982,8 +2010,8 @@ class ArchiveRecreater:
         chunk_processor = partial(self.chunk_processor, target)
         target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor)
 
-    def chunk_processor(self, target, data):
-        chunk_id = self.key.id_hash(data)
+    def chunk_processor(self, target, chunk):
+        chunk_id, data = cached_hash(chunk, self.key.id_hash)
         if chunk_id in self.seen_chunks:
             return self.cache.chunk_incref(chunk_id, target.stats)
         overwrite = self.recompress
@@ -2007,7 +2035,7 @@ class ArchiveRecreater:
             yield from target.chunker.chunkify(file)
         else:
             for chunk in chunk_iterator:
-                yield chunk
+                yield Chunk(chunk, size=len(chunk), allocation=CH_DATA)
 
     def save(self, archive, target, comment=None, replace_original=True):
         if self.dry_run:

+ 2 - 1
src/borg/archiver.py

@@ -453,9 +453,10 @@ class Archiver:
         def test_files(path, count, size, random):
             path = os.path.join(path, 'borg-test-data')
             os.makedirs(path)
+            z_buff = None if random else memoryview(zeros)[:size] if size <= len(zeros) else b'\0' * size
             for i in range(count):
                 fname = os.path.join(path, 'file_%d' % i)
-                data = b'\0' * size if not random else os.urandom(size)
+                data = z_buff if not random else os.urandom(size)
                 with SyncFile(fname, binary=True) as fd:  # used for posix_fadvise's sake
                     fd.write(data)
             yield path
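
The chained conditional for z_buff is dense; spelled out, the equivalent logic is (editor's sketch):

    if random:
        z_buff = None                      # content will come from os.urandom(size) per file
    elif size <= len(zeros):
        z_buff = memoryview(zeros)[:size]  # zero-copy slice of the shared all-zero buffer
    else:
        z_buff = b'\0' * size              # fall back to a fresh allocation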

+ 49 - 14
src/borg/chunker.pyx

@@ -4,6 +4,9 @@ API_VERSION = '1.2_01'
 
 import errno
 import os
+from collections import namedtuple
+
+from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE, zeros
 
 from libc.stdlib cimport free
 
@@ -26,6 +29,29 @@ cdef extern from "_chunker.c":
 has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
 
 
+_Chunk = namedtuple('_Chunk', 'meta data')
+_Chunk.__doc__ = """\
+    Chunk namedtuple
+
+    meta is always a dictionary, data depends on allocation.
+
+    data chunk read from a DATA range of a file (not from a sparse hole):
+        meta = {'allocation': CH_DATA, 'size': size_of_chunk}
+        data = read_data [bytes or memoryview]
+
+    all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero):
+        meta = {'allocation': CH_ALLOC, 'size': size_of_chunk}
+        data = None
+
+    all-zero chunk from a HOLE range of a file (from a sparse hole):
+        meta = {'allocation': CH_HOLE, 'size': size_of_chunk}
+        data = None
+"""
+
+def Chunk(data, **meta):
+    return _Chunk(meta, data)
+
+
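
For readers skimming the API (editor's sketch, not in the commit): Chunk(...) stores its keyword arguments as the meta dict, so the three shapes documented above are built like this:

    data_chunk  = Chunk(b'payload', size=7, allocation=CH_DATA)  # real data, bytes or memoryview
    alloc_chunk = Chunk(None, size=4096, allocation=CH_ALLOC)    # all-zero, read from a DATA range
    hole_chunk  = Chunk(None, size=4096, allocation=CH_HOLE)     # all-zero, from a sparse hole
    assert data_chunk.meta == {'size': 7, 'allocation': CH_DATA}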
 def dread(offset, size, fd=None, fh=-1):
     use_fh = fh >= 0
     if use_fh:
@@ -124,7 +150,7 @@ class ChunkerFixed:
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
-        self.zeros = memoryview(bytes(block_size))
+        assert block_size <= len(zeros)
 
     def chunkify(self, fd=None, fh=-1, fmap=None):
         """
@@ -178,15 +204,22 @@ class ChunkerFixed:
                 if is_data:
                     # read block from the range
                     data = dread(offset, wanted, fd, fh)
+                    got = len(data)
+                    if zeros.startswith(data):
+                        data = None
+                        allocation = CH_ALLOC
+                    else:
+                        allocation = CH_DATA
                 else:  # hole
                     # seek over block from the range
                     pos = dseek(wanted, os.SEEK_CUR, fd, fh)
-                    data = self.zeros[:pos - offset]  # for now, create zero-bytes here
-                got = len(data)
+                    got = pos - offset
+                    data = None
+                    allocation = CH_HOLE
                 if got > 0:
                     offset += got
                     range_size -= got
-                    yield data  # later, use a better api that tags data vs. hole
+                    yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return
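
End-to-end effect (editor's sketch, not part of the diff): a consumer can rebuild a sparse file from this stream by writing CH_DATA payloads and seeking over the all-zero chunks, which mirrors what Archive.extract_item's sparse path does with fd.seek(len(data), 1):

    # illustrative: reconstruct a file from ChunkerFixed output, keeping it sparse
    with open('in.bin', 'rb') as src, open('out.bin', 'wb') as dst:
        for chunk in ChunkerFixed(4096, sparse=True).chunkify(src):
            if chunk.meta['allocation'] == CH_DATA:
                dst.write(chunk.data)
            else:  # CH_ALLOC or CH_HOLE: no data was materialized
                dst.seek(chunk.meta['size'], 1)  # leave a hole instead of writing zeros
        dst.truncate()  # pin the file size in case it ends in a hole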
@@ -209,6 +242,7 @@ cdef class Chunker:
     def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
         min_size = 1 << chunk_min_exp
         max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
         # see chunker_process, first while loop condition, first term must be able to get True:
         assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
         hash_mask = (1 << hash_mask_bits) - 1
@@ -233,7 +267,17 @@
         return self
 
     def __next__(self):
-        return chunker_process(self.chunker)
+        data = chunker_process(self.chunker)
+        got = len(data)
+        # we do not have SEEK_DATA/SEEK_HOLE support in the chunker_process C code,
+        # but we can just check whether the data is all-zero (whether it came from
+        # a hole or from stored zeros cannot be detected here).
+        if zeros.startswith(data):
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        return Chunk(data, size=got, allocation=allocation)
 
 
 def get_chunker(algo, *params, **kw):
@@ -246,15 +290,6 @@
     raise TypeError('unsupported chunker algo %r' % algo)
 
 
-def max_chunk_size(algo, *params):
-    # see also parseformat.ChunkerParams return values
-    if algo == 'buzhash':
-        return 1 << params[1]
-    if algo == 'fixed':
-        return max(params[0], params[1])
-    raise TypeError('unsupported chunker algo %r' % algo)
-
-
 def buzhash(data, unsigned long seed):
     cdef uint32_t *table
     cdef uint32_t sum

+ 7 - 0
src/borg/constants.py

@@ -45,6 +45,10 @@ assert MAX_OBJECT_SIZE == 20 * 1024 * 1024
 # repo config max_segment_size value must be below this limit to stay within uint32 offsets:
 MAX_SEGMENT_SIZE_LIMIT = 2 ** 32 - MAX_OBJECT_SIZE
 
+# have one all-zero bytes object
+# we use it at all places where we need to detect or create all-zero buffers
+zeros = bytes(MAX_DATA_SIZE)
+
 # borg.remote read() buffer size
 BUFSIZE = 10 * 1024 * 1024
 
@@ -75,6 +79,9 @@ CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH
 # chunker params for the items metadata stream, finer granularity
 ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)
 
+# normal on-disk data, allocated (but not written, all zeros), not allocated hole (all zeros)
+CH_DATA, CH_ALLOC, CH_HOLE = 0, 1, 2
+
 # operating mode of the files cache (for fast skipping of unchanged files)
 DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode'
 DEFAULT_FILES_CACHE_MODE = 'cis'  # == CacheMode(DEFAULT_FILES_CACHE_MODE_UI)
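
A note on the idiom these constants enable (editor's sketch, not in the commit): zeros is MAX_DATA_SIZE long and every chunk is asserted to be at most that long, so zeros.startswith(data) is a cheap all-zero test and memoryview(zeros)[:n] hands out n zero bytes without allocating:

    data = b'\0' * 100
    assert zeros.startswith(data)    # True iff data consists only of zero bytes
    view = memoryview(zeros)[:100]   # zero-copy "buffer of 100 zeros"
    assert bytes(view) == data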

+ 3 - 1
src/borg/testsuite/benchmark.py

@@ -11,6 +11,7 @@ import os
 import pytest
 
 from .archiver import changedir, cmd
+from ..constants import zeros
 
 
 @pytest.fixture
@@ -34,12 +35,13 @@ def repo(request, cmd, repo_url):
 @pytest.fixture(scope='session', params=["zeros", "random"])
 def testdata(request, tmpdir_factory):
     count, size = 10, 1000*1000
+    assert size <= len(zeros)
     p = tmpdir_factory.mktemp('data')
     data_type = request.param
     if data_type == 'zeros':
         # do not use a binary zero (\0) to avoid sparse detection
         def data(size):
-            return b'0' * size
+            return memoryview(zeros)[:size]
     elif data_type == 'random':
         def data(size):
             return os.urandom(size)

+ 34 - 20
src/borg/testsuite/chunker.py

@@ -8,18 +8,32 @@ from . import BaseTestCase
 #       See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
 
 
+def cf(chunks):
+    """chunk filter"""
+    # this is to simplify testing: either return the data piece (bytes) or the hole length (int).
+    def _cf(chunk):
+        if chunk.meta['allocation'] == CH_DATA:
+            assert len(chunk.data) == chunk.meta['size']
+            return bytes(chunk.data)  # make sure we have bytes, not memoryview
+        if chunk.meta['allocation'] in (CH_HOLE, CH_ALLOC):
+            assert chunk.data is None
+            return chunk.meta['size']
+        assert False, "unexpected allocation value"
+    return [_cf(chunk) for chunk in chunks]
+
+
 class ChunkerFixedTestCase(BaseTestCase):
 
     def test_chunkify_just_blocks(self):
         data = b'foobar' * 1500
         chunker = ChunkerFixed(4096)
-        parts = [c for c in chunker.chunkify(BytesIO(data))]
+        parts = cf(chunker.chunkify(BytesIO(data)))
         self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
 
     def test_chunkify_header_and_blocks(self):
         data = b'foobar' * 1500
         chunker = ChunkerFixed(4096, 123)
-        parts = [c for c in chunker.chunkify(BytesIO(data))]
+        parts = cf(chunker.chunkify(BytesIO(data)))
         self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
 
     def test_chunkify_just_blocks_fmap_complete(self):
@@ -30,7 +44,7 @@ class ChunkerFixedTestCase(BaseTestCase):
             (4096, 8192, True),
             (8192, 99999999, True),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
         self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
 
     def test_chunkify_header_and_blocks_fmap_complete(self):
@@ -42,7 +56,7 @@ class ChunkerFixedTestCase(BaseTestCase):
             (123+4096, 4096, True),
             (123+8192, 4096, True),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
         self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
 
     def test_chunkify_header_and_blocks_fmap_zeros(self):
@@ -54,9 +68,9 @@ class ChunkerFixedTestCase(BaseTestCase):
             (123+4096, 4096, True),
             (123+8192, 4096, False),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
-        # because we marked the '_' ranges as holes, we will get '\0' ranges instead!
-        self.assert_equal(parts, [data[0:123], b'\0' * 4096, data[123+4096:123+8192], b'\0' * 4096])
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
+        # because we marked the '_' ranges as holes, we will get hole ranges instead!
+        self.assert_equal(parts, [data[0:123], 4096, data[123+4096:123+8192], 4096])
 
     def test_chunkify_header_and_blocks_fmap_partial(self):
         data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096
@@ -67,7 +81,7 @@ class ChunkerFixedTestCase(BaseTestCase):
             (123+4096, 4096, True),
             # (123+8192, 4096, False),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
         # because we left out the '_' ranges from the fmap, we will not get them at all!
         self.assert_equal(parts, [data[0:123], data[123+4096:123+8192]])
 
@@ -76,19 +90,19 @@ class ChunkerTestCase(BaseTestCase):
 
     def test_chunkify(self):
         data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
-        parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
+        parts = cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
+        self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))), [])
+        self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal(cf(Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal(cf(Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal(cf(Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz' * 3])
+        self.assert_equal(cf(Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal(cf(Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal(cf(Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz' * 3])
+        self.assert_equal(cf(Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
+        self.assert_equal(cf(Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
 
     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
@@ -106,5 +120,5 @@ class ChunkerTestCase(BaseTestCase):
                 return self.input[:1]
 
         chunker = get_chunker(*CHUNKER_PARAMS, seed=0)
-        reconstructed = b''.join(chunker.chunkify(SmallReadFile()))
+        reconstructed = b''.join(cf(chunker.chunkify(SmallReadFile())))
         assert reconstructed == b'a' * 20

+ 14 - 15
src/borg/testsuite/chunker_pytest.py

@@ -4,6 +4,7 @@ import tempfile
 
 import pytest
 
+from .chunker import cf
 from ..chunker import ChunkerFixed, sparsemap, has_seek_hole
 from ..constants import *  # NOQA
 
@@ -50,20 +51,18 @@ def make_sparsefile(fname, sparsemap, header_size=0):
 
 
 def make_content(sparsemap, header_size=0):
-    with BytesIO() as fd:
-        total = 0
-        if header_size:
-            fd.write(b'H' * header_size)
-            total += header_size
-        for offset, size, is_data in sparsemap:
-            if is_data:
-                fd.write(b'X' * size)
-            else:
-                fd.write(b'\0' * size)
-            total += size
-        content = fd.getvalue()
-    assert len(content) == total
-    return content
+    result = []
+    total = 0
+    if header_size:
+        result.append(b'H' * header_size)
+        total += header_size
+    for offset, size, is_data in sparsemap:
+        if is_data:
+            result.append(b'X' * size)  # bytes!
+        else:
+            result.append(size)  # int!
+        total += size
+    return result
 
 
 def fs_supports_sparse():
@@ -132,7 +131,7 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
     def get_chunks(fname, sparse, header_size):
         chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
         with open(fname, 'rb') as fd:
-            return b''.join([c for c in chunker.chunkify(fd)])
+            return cf(chunker.chunkify(fd))
 
     fn = str(tmpdir / fname)
     make_sparsefile(fn, sparse_map, header_size=header_size)
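
Note (editor's, not in the commit): make_content now returns the same mixed representation that cf() produces - bytes for data ranges, plain ints for hole lengths - so the tests can compare chunker output with expectations directly. For example, one 4096-byte data range followed by a 4096-byte hole:

    assert make_content([(0, 4096, True), (4096, 4096, False)]) == [b'X' * 4096, 4096]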

+ 2 - 1
src/borg/testsuite/chunker_slow.py

@@ -1,6 +1,7 @@
 from io import BytesIO
 from binascii import unhexlify
 
+from .chunker import cf
 from ..chunker import Chunker
 from ..crypto.low_level import blake2b_256
 from ..constants import *  # NOQA
@@ -30,7 +31,7 @@ class ChunkerRegressionTestCase(BaseTestCase):
                         for seed in (1849058162, 1234567653):
                             fh = BytesIO(data)
                             chunker = Chunker(seed, minexp, maxexp, maskbits, winsize)
-                            chunks = [blake2b_256(b'', c) for c in chunker.chunkify(fh, -1)]
+                            chunks = [blake2b_256(b'', c) for c in cf(chunker.chunkify(fh, -1))]
                             runs.append(blake2b_256(b'', b''.join(chunks)))
 
         # The "correct" hash below matches the existing chunker behavior.