
Merge pull request #5620 from ThomasWaldmann/sparse-file-integr2

Sparse file support (integration)
TW · 4 years ago · commit 699256edbd

src/borg/archive.py  +43 -15

@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, max_chunk_size
+from .chunker import get_chunker, Chunk
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -41,6 +41,7 @@ from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
 from .helpers import os_open, flags_normal, flags_dir
 from .helpers import msgpack
 from .helpers import sig_int
+from .lrucache import LRUCache
 from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
 from .item import Item, ArchiveItem, ItemDiff
 from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname
@@ -336,7 +337,10 @@ class ChunkBuffer:
         self.buffer.seek(0)
         # The chunker returns a memoryview to its internal buffer,
         # thus a copy is needed before resuming the chunker iterator.
-        chunks = list(bytes(s) for s in self.chunker.chunkify(self.buffer))
+        # note: this is the items metadata stream chunker; we will only get CH_DATA allocation here
+        #       (because there are no all-zero chunks in a metadata stream), thus chunk.data is always
+        #       bytes/memoryview and allocation is always CH_DATA, never CH_ALLOC/CH_HOLE.
+        chunks = list(bytes(chunk.data) for chunk in self.chunker.chunkify(self.buffer))
         self.buffer.seek(0)
         self.buffer.truncate(0)
         # Leave the last partial chunk in the buffer unless flush is True
@@ -422,7 +426,6 @@ class Archive:
             if info is None:
                 raise self.DoesNotExist(name)
             self.load(info.id)
-            self.zeros = None
 
     def _load_meta(self, id):
         data = self.key.decrypt(id, self.repository.get(id))
@@ -735,8 +738,6 @@ Utilization of max. archive size: {csize_max:.0%}
                                      hardlink_masters) as hardlink_set:
                 if hardlink_set:
                     return
-                if sparse and self.zeros is None:
-                    self.zeros = b'\0' * max_chunk_size(*self.chunker_params)
                 with backup_io('open'):
                     fd = open(path, 'wb')
                 with fd:
@@ -745,7 +746,7 @@ Utilization of max. archive size: {csize_max:.0%}
                         if pi:
                             pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                         with backup_io('write'):
-                            if sparse and self.zeros.startswith(data):
+                            if sparse and zeros.startswith(data):
                                 # all-zero chunk: create a hole in a sparse file
                                 fd.seek(len(data), 1)
                             else:
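The extraction side of the sparse logic is just the seek above: for an all-zero chunk, advance the file offset instead of writing, and the filesystem leaves a hole. A minimal standalone sketch of the same idea (hypothetical helper, not borg code); the final `truncate()` is what materializes a trailing hole:

```python
import os

def write_sparse(path, chunks):
    """Write a file from chunks: bytes = data, int = length of an all-zero run."""
    with open(path, 'wb') as fd:
        for chunk in chunks:
            if isinstance(chunk, int):
                fd.seek(chunk, os.SEEK_CUR)  # skip forward; filesystem creates a hole
            else:
                fd.write(chunk)
        fd.truncate()  # extend to the current offset if the file ends in a hole

write_sparse('demo.sparse', [b'header', 1 << 20, b'tail'])
```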
@@ -1089,6 +1090,32 @@ class MetadataCollector:
         return attrs
 
 
+# remember a few recently used all-zero chunk hashes in this mapping.
+# (hash_func, chunk_length) -> chunk_hash
+# we play it safe and include the hash_func in the mapping key, in case
+# we have different hash_funcs within the same borg run.
+zero_chunk_ids = LRUCache(10, dispose=lambda _: None)
+
+
+def cached_hash(chunk, id_hash):
+    allocation = chunk.meta['allocation']
+    if allocation == CH_DATA:
+        data = chunk.data
+        chunk_id = id_hash(data)
+    elif allocation in (CH_HOLE, CH_ALLOC):
+        size = chunk.meta['size']
+        assert size <= len(zeros)
+        data = memoryview(zeros)[:size]
+        try:
+            chunk_id = zero_chunk_ids[(id_hash, size)]
+        except KeyError:
+            chunk_id = id_hash(data)
+            zero_chunk_ids[(id_hash, size)] = chunk_id
+    else:
+        raise ValueError('unexpected allocation type')
+    return chunk_id, data
+
+
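A usage sketch for `cached_hash` (illustrative only; sha256 stands in for borg's keyed `key.id_hash`): hashing two hole chunks of equal length computes the id once, and the second call is served from `zero_chunk_ids`:

```python
from hashlib import sha256

def id_hash(data):  # assumption: stand-in for key.id_hash, not borg's real keyed hash
    return sha256(data).digest()

hole = Chunk(None, allocation=CH_HOLE, size=4096)
id1, data1 = cached_hash(hole, id_hash)  # hashes 4096 zero bytes, caches the id
id2, data2 = cached_hash(hole, id_hash)  # cache hit: no rehashing
assert id1 == id2
assert bytes(data1) == b'\0' * 4096      # data is a zero-copy slice of zeros
```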
 class ChunksProcessor:
     # Processes an iterator of chunks for an Item
 
@@ -1133,8 +1160,9 @@ class ChunksProcessor:
 
     def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None):
         if not chunk_processor:
-            def chunk_processor(data):
-                chunk_entry = cache.add_chunk(self.key.id_hash(data), data, stats, wait=False)
+            def chunk_processor(chunk):
+                chunk_id, data = cached_hash(chunk, self.key.id_hash)
+                chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False)
                 self.cache.repository.async_response(wait=False)
                 return chunk_entry
 
@@ -1145,8 +1173,8 @@ class ChunksProcessor:
             del item.chunks_healthy
         from_chunk = 0
         part_number = 1
-        for data in chunk_iter:
-            item.chunks.append(chunk_processor(data))
+        for chunk in chunk_iter:
+            item.chunks.append(chunk_processor(chunk))
             if show_progress:
                 stats.show_progress(item=item, dt=0.2)
             from_chunk, part_number = self.maybe_checkpoint(item, from_chunk, part_number, forced=False)
@@ -1662,8 +1690,8 @@ class ArchiveChecker:
             If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one.
             """
             def replacement_chunk(size):
-                data = bytes(size)
-                chunk_id = self.key.id_hash(data)
+                chunk = Chunk(None, allocation=CH_ALLOC, size=size)
+                chunk_id, data = cached_hash(chunk, self.key.id_hash)
                 cdata = self.key.encrypt(data)
                 csize = len(cdata)
                 return chunk_id, size, csize, cdata
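Going through `Chunk`/`cached_hash` here is byte-for-byte equivalent to the old `bytes(size)` approach; it just avoids allocating a fresh zero buffer and rehashing for repeated sizes. A quick equivalence check (sketch, again with sha256 as a stand-in for `self.key.id_hash`):

```python
from hashlib import sha256

def id_hash(data):  # hypothetical stand-in for self.key.id_hash
    return sha256(data).digest()

size = 1 << 20
chunk_id, data = cached_hash(Chunk(None, allocation=CH_ALLOC, size=size), id_hash)
assert chunk_id == id_hash(bytes(size))  # same id as hashing a fresh zero buffer
assert bytes(data) == bytes(size)        # same all-zero plaintext goes to encrypt()
```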
@@ -1982,8 +2010,8 @@ class ArchiveRecreater:
         chunk_processor = partial(self.chunk_processor, target)
         target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor)
 
-    def chunk_processor(self, target, data):
-        chunk_id = self.key.id_hash(data)
+    def chunk_processor(self, target, chunk):
+        chunk_id, data = cached_hash(chunk, self.key.id_hash)
         if chunk_id in self.seen_chunks:
             return self.cache.chunk_incref(chunk_id, target.stats)
         overwrite = self.recompress
@@ -2007,7 +2035,7 @@ class ArchiveRecreater:
             yield from target.chunker.chunkify(file)
         else:
             for chunk in chunk_iterator:
-                yield chunk
+                yield Chunk(chunk, size=len(chunk), allocation=CH_DATA)
 
     def save(self, archive, target, comment=None, replace_original=True):
         if self.dry_run:

src/borg/archiver.py  +2 -1

@@ -453,9 +453,10 @@ class Archiver:
         def test_files(path, count, size, random):
             path = os.path.join(path, 'borg-test-data')
             os.makedirs(path)
+            z_buff = None if random else memoryview(zeros)[:size] if size <= len(zeros) else b'\0' * size
             for i in range(count):
                 fname = os.path.join(path, 'file_%d' % i)
-                data = b'\0' * size if not random else os.urandom(size)
+                data = z_buff if not random else os.urandom(size)
                 with SyncFile(fname, binary=True) as fd:  # used for posix_fadvise's sake
                     fd.write(data)
             yield path

src/borg/chunker.pyx  +49 -14

@@ -4,6 +4,9 @@ API_VERSION = '1.2_01'
 
 import errno
 import os
+from collections import namedtuple
+
+from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE, zeros
 
 from libc.stdlib cimport free
 
@@ -26,6 +29,29 @@ cdef extern from "_chunker.c":
 has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
 
 
+_Chunk = namedtuple('_Chunk', 'meta data')
+_Chunk.__doc__ = """\
+    Chunk namedtuple
+
+    meta is always a dictionary, data depends on allocation.
+
+    data chunk read from a DATA range of a file (not from a sparse hole):
+        meta = {'allocation': CH_DATA, 'size': size_of_chunk}
+        data = read_data [bytes or memoryview]
+
+    all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero):
+        meta = {'allocation': CH_ALLOC, 'size': size_of_chunk}
+        data = None
+
+    all-zero chunk from a HOLE range of a file (from a sparse hole):
+        meta = {'allocation': CH_HOLE, 'size': size_of_chunk}
+        data = None
+"""
+
+def Chunk(data, **meta):
+    return _Chunk(meta, data)
+
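`Chunk` is a thin factory over the namedtuple: positional data, everything else keyword metadata. The three allocation flavors look like this (illustrative values):

```python
c1 = Chunk(b'file data', size=9, allocation=CH_DATA)  # real on-disk data
c2 = Chunk(None, size=4096, allocation=CH_ALLOC)      # read, but found to be all-zero
c3 = Chunk(None, size=4096, allocation=CH_HOLE)       # sparse hole, never read at all
assert c1.meta == {'size': 9, 'allocation': CH_DATA}
assert c2.data is None and c3.data is None
```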
+
 def dread(offset, size, fd=None, fh=-1):
     use_fh = fh >= 0
     if use_fh:
@@ -124,7 +150,7 @@ class ChunkerFixed:
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
-        self.zeros = memoryview(bytes(block_size))
+        assert block_size <= len(zeros)
 
     def chunkify(self, fd=None, fh=-1, fmap=None):
         """
@@ -178,15 +204,22 @@ class ChunkerFixed:
                 if is_data:
                     # read block from the range
                     data = dread(offset, wanted, fd, fh)
+                    got = len(data)
+                    if zeros.startswith(data):
+                        data = None
+                        allocation = CH_ALLOC
+                    else:
+                        allocation = CH_DATA
                 else:  # hole
                     # seek over block from the range
                     pos = dseek(wanted, os.SEEK_CUR, fd, fh)
-                    data = self.zeros[:pos - offset]  # for now, create zero-bytes here
-                got = len(data)
+                    got = pos - offset
+                    data = None
+                    allocation = CH_HOLE
                 if got > 0:
                     offset += got
                     range_size -= got
-                    yield data  # later, use a better api that tags data vs. hole
+                    yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return
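`zeros.startswith(data)` is the all-zero test used throughout: data is all-zero exactly when it is a prefix of the shared `zeros` buffer, so no temporary `b'\0' * len(data)` comparison object is built per chunk. This is also why the new `assert block_size <= len(zeros)` matters: a chunk longer than `zeros` could never match the prefix. For example:

```python
assert zeros.startswith(bytes(4096))      # 4096 zero bytes: all-zero, becomes CH_ALLOC
assert not zeros.startswith(b'\x00\x01')  # a single nonzero byte breaks the prefix match
```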
@@ -209,6 +242,7 @@ cdef class Chunker:
     def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
         min_size = 1 << chunk_min_exp
         max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
         # see chunker_process, first while loop condition, first term must be able to get True:
         assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
         hash_mask = (1 << hash_mask_bits) - 1
@@ -233,7 +267,17 @@ cdef class Chunker:
         return self
 
     def __next__(self):
-        return chunker_process(self.chunker)
+        data = chunker_process(self.chunker)
+        got = len(data)
+        # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
+        # but we can just check whether the data is all-zero (it either came from a hole
+        # or from stored zeros - we cannot distinguish the two here).
+        if zeros.startswith(data):
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        return Chunk(data, size=got, allocation=allocation)
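With this, the buzhash chunker's Python-visible output matches ChunkerFixed's: every item is a `Chunk`, and all-zero chunks are detected after the C code cuts them. A consumption sketch (assuming the usual imports from `borg.chunker` and `borg.constants`; mirrors how the tests call `get_chunker`):

```python
from io import BytesIO

chunker = get_chunker(*CHUNKER_PARAMS, seed=0)
for chunk in chunker.chunkify(BytesIO(bytes(1 << 20))):  # 1 MiB of zero bytes
    assert chunk.meta['allocation'] == CH_ALLOC          # all-zero detected per chunk
    assert chunk.data is None
```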
 
 
 def get_chunker(algo, *params, **kw):
@@ -246,15 +290,6 @@ def get_chunker(algo, *params, **kw):
     raise TypeError('unsupported chunker algo %r' % algo)
 
 
-def max_chunk_size(algo, *params):
-    # see also parseformat.ChunkerParams return values
-    if algo == 'buzhash':
-        return 1 << params[1]
-    if algo == 'fixed':
-        return max(params[0], params[1])
-    raise TypeError('unsupported chunker algo %r' % algo)
-
-
 def buzhash(data, unsigned long seed):
     cdef uint32_t *table
     cdef uint32_t sum

src/borg/constants.py  +7 -0

@@ -45,6 +45,10 @@ assert MAX_OBJECT_SIZE == 20 * 1024 * 1024
 # repo config max_segment_size value must be below this limit to stay within uint32 offsets:
 MAX_SEGMENT_SIZE_LIMIT = 2 ** 32 - MAX_OBJECT_SIZE
 
+# a single shared all-zero bytes object,
+# used everywhere we need to detect or create all-zero buffers
+zeros = bytes(MAX_DATA_SIZE)
+
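Allocating `zeros` once (roughly 20 MiB, given the MAX_OBJECT_SIZE assertion above) and slicing it through `memoryview` yields zero-copy all-zero buffers of any needed size:

```python
buf = memoryview(zeros)[:4096]     # zero-copy view, no fresh 4 KiB allocation
assert len(buf) == 4096 and not any(buf)
assert bytes(buf) == b'\0' * 4096  # a copy is made only when bytes() is called
```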
 # borg.remote read() buffer size
 BUFSIZE = 10 * 1024 * 1024
 
@@ -75,6 +79,9 @@ CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH
 # chunker params for the items metadata stream, finer granularity
 ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)
 
+# chunk allocation types: normal on-disk data / allocated but unwritten (all zeros) / unallocated hole (all zeros)
+CH_DATA, CH_ALLOC, CH_HOLE = 0, 1, 2
+
 # operating mode of the files cache (for fast skipping of unchanged files)
 DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode'
 DEFAULT_FILES_CACHE_MODE = 'cis'  # == CacheMode(DEFAULT_FILES_CACHE_MODE_UI)

src/borg/testsuite/benchmark.py  +3 -1

@@ -11,6 +11,7 @@ import os
 import pytest
 
 from .archiver import changedir, cmd
+from ..constants import zeros
 
 
 @pytest.fixture
@@ -34,12 +35,13 @@ def repo(request, cmd, repo_url):
 @pytest.fixture(scope='session', params=["zeros", "random"])
 def testdata(request, tmpdir_factory):
     count, size = 10, 1000*1000
+    assert size <= len(zeros)
     p = tmpdir_factory.mktemp('data')
     data_type = request.param
     if data_type == 'zeros':
-        # do not use a binary zero (\0) to avoid sparse detection
+        # binary zeros from the shared buffer: exercises the all-zero chunk handling
         def data(size):
-            return b'0' * size
+            return memoryview(zeros)[:size]
     elif data_type == 'random':
         def data(size):
             return os.urandom(size)

src/borg/testsuite/chunker.py  +34 -20

@@ -8,18 +8,32 @@ from . import BaseTestCase
 #       See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
 
 
+def cf(chunks):
+    """chunk filter"""
+    # this is to simplify testing: either return the data piece (bytes) or the hole length (int).
+    def _cf(chunk):
+        if chunk.meta['allocation'] == CH_DATA:
+            assert len(chunk.data) == chunk.meta['size']
+            return bytes(chunk.data)  # make sure we have bytes, not memoryview
+        if chunk.meta['allocation'] in (CH_HOLE, CH_ALLOC):
+            assert chunk.data is None
+            return chunk.meta['size']
+        assert False, "unexpected allocation value"
+    return [_cf(chunk) for chunk in chunks]
+
+
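`cf()` flattens a chunk stream into an easily comparable form: bytes for data, an int length for holes/all-zero chunks. For instance (hypothetical layout, mirroring the fixed-chunker tests below):

```python
from io import BytesIO

data = b'H' * 123 + bytes(4096) + b'X' * 4096
parts = cf(ChunkerFixed(4096, 123).chunkify(BytesIO(data)))
assert parts == [b'H' * 123, 4096, b'X' * 4096]
```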
 class ChunkerFixedTestCase(BaseTestCase):
 
     def test_chunkify_just_blocks(self):
         data = b'foobar' * 1500
         chunker = ChunkerFixed(4096)
-        parts = [c for c in chunker.chunkify(BytesIO(data))]
+        parts = cf(chunker.chunkify(BytesIO(data)))
         self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
 
     def test_chunkify_header_and_blocks(self):
         data = b'foobar' * 1500
         chunker = ChunkerFixed(4096, 123)
-        parts = [c for c in chunker.chunkify(BytesIO(data))]
+        parts = cf(chunker.chunkify(BytesIO(data)))
         self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
 
     def test_chunkify_just_blocks_fmap_complete(self):
@@ -30,7 +44,7 @@ class ChunkerFixedTestCase(BaseTestCase):
             (4096, 8192, True),
             (8192, 99999999, True),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
         self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
 
     def test_chunkify_header_and_blocks_fmap_complete(self):
@@ -42,7 +56,7 @@ class ChunkerFixedTestCase(BaseTestCase):
             (123+4096, 4096, True),
             (123+8192, 4096, True),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
         self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
 
     def test_chunkify_header_and_blocks_fmap_zeros(self):
@@ -54,9 +68,9 @@ class ChunkerFixedTestCase(BaseTestCase):
             (123+4096, 4096, True),
             (123+8192, 4096, False),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
-        # because we marked the '_' ranges as holes, we will get '\0' ranges instead!
-        self.assert_equal(parts, [data[0:123], b'\0' * 4096, data[123+4096:123+8192], b'\0' * 4096])
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
+        # because we marked the '_' ranges as holes, we will get hole ranges instead!
+        self.assert_equal(parts, [data[0:123], 4096, data[123+4096:123+8192], 4096])
 
     def test_chunkify_header_and_blocks_fmap_partial(self):
         data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096
@@ -67,7 +81,7 @@ class ChunkerFixedTestCase(BaseTestCase):
             (123+4096, 4096, True),
             # (123+8192, 4096, False),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
         # because we left out the '_' ranges from the fmap, we will not get them at all!
         self.assert_equal(parts, [data[0:123], data[123+4096:123+8192]])
 
@@ -76,19 +90,19 @@ class ChunkerTestCase(BaseTestCase):
 
     def test_chunkify(self):
         data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
-        parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
+        parts = cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
+        self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))), [])
+        self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal(cf(Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal(cf(Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal(cf(Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz' * 3])
+        self.assert_equal(cf(Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal(cf(Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal(cf(Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz' * 3])
+        self.assert_equal(cf(Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
+        self.assert_equal(cf(Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
 
     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
@@ -106,5 +120,5 @@ class ChunkerTestCase(BaseTestCase):
                 return self.input[:1]
 
         chunker = get_chunker(*CHUNKER_PARAMS, seed=0)
-        reconstructed = b''.join(chunker.chunkify(SmallReadFile()))
+        reconstructed = b''.join(cf(chunker.chunkify(SmallReadFile())))
         assert reconstructed == b'a' * 20

src/borg/testsuite/chunker_pytest.py  +14 -15

@@ -4,6 +4,7 @@ import tempfile
 
 import pytest
 
+from .chunker import cf
 from ..chunker import ChunkerFixed, sparsemap, has_seek_hole
 from ..constants import *  # NOQA
 
@@ -50,20 +51,18 @@ def make_sparsefile(fname, sparsemap, header_size=0):
 
 
 def make_content(sparsemap, header_size=0):
-    with BytesIO() as fd:
-        total = 0
-        if header_size:
-            fd.write(b'H' * header_size)
-            total += header_size
-        for offset, size, is_data in sparsemap:
-            if is_data:
-                fd.write(b'X' * size)
-            else:
-                fd.write(b'\0' * size)
-            total += size
-        content = fd.getvalue()
-    assert len(content) == total
-    return content
+    result = []
+    if header_size:
+        result.append(b'H' * header_size)
+    for offset, size, is_data in sparsemap:
+        if is_data:
+            result.append(b'X' * size)  # data: bytes!
+        else:
+            result.append(size)  # hole: int!
+    return result
 
 
 def fs_supports_sparse():
@@ -132,7 +131,7 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
     def get_chunks(fname, sparse, header_size):
         chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
         with open(fname, 'rb') as fd:
-            return b''.join([c for c in chunker.chunkify(fd)])
+            return cf(chunker.chunkify(fd))
 
     fn = str(tmpdir / fname)
     make_sparsefile(fn, sparse_map, header_size=header_size)

src/borg/testsuite/chunker_slow.py  +2 -1

@@ -1,6 +1,7 @@
 from io import BytesIO
 from binascii import unhexlify
 
+from .chunker import cf
 from ..chunker import Chunker
 from ..crypto.low_level import blake2b_256
 from ..constants import *  # NOQA
@@ -30,7 +31,7 @@ class ChunkerRegressionTestCase(BaseTestCase):
                         for seed in (1849058162, 1234567653):
                             fh = BytesIO(data)
                             chunker = Chunker(seed, minexp, maxexp, maskbits, winsize)
-                            chunks = [blake2b_256(b'', c) for c in chunker.chunkify(fh, -1)]
+                            chunks = [blake2b_256(b'', c) for c in cf(chunker.chunkify(fh, -1))]
                             runs.append(blake2b_256(b'', b''.join(chunks)))
 
         # The "correct" hash below matches the existing chunker behavior.