
Merge pull request #8902 from ThomasWaldmann/separate-chunkers

Separate chunkers
TW 1 week ago
parent commit fb527051cb

+ 2 - 1
.gitignore

@@ -6,7 +6,8 @@ dist
 src/borg/compress.c
 src/borg/crypto/low_level.c
 src/borg/item.c
-src/borg/chunker.c
+src/borg/chunkers/buzhash.c
+src/borg/chunkers/reader.c
 src/borg/checksums.c
 src/borg/platform/darwin.c
 src/borg/platform/freebsd.c

+ 2 - 1
scripts/make.py

@@ -542,7 +542,8 @@ class BuildMan:
 cython_sources = """
 src/borg/compress.pyx
 src/borg/crypto/low_level.pyx
-src/borg/chunker.pyx
+src/borg/chunkers/buzhash.pyx
+src/borg/chunkers/reader.pyx
 src/borg/hashindex.pyx
 src/borg/item.pyx
 src/borg/checksums.pyx

+ 6 - 3
setup.py

@@ -50,7 +50,8 @@ cflags = ["-Wall", "-Wextra", "-Wpointer-arith", "-Wno-unreachable-code-fallthro
 
 compress_source = "src/borg/compress.pyx"
 crypto_ll_source = "src/borg/crypto/low_level.pyx"
-chunker_source = "src/borg/chunker.pyx"
+buzhash_source = "src/borg/chunkers/buzhash.pyx"
+reader_source = "src/borg/chunkers/reader.pyx"
 hashindex_source = "src/borg/hashindex.pyx"
 item_source = "src/borg/item.pyx"
 checksums_source = "src/borg/checksums.pyx"
@@ -64,7 +65,8 @@ platform_windows_source = "src/borg/platform/windows.pyx"
 cython_sources = [
     compress_source,
     crypto_ll_source,
-    chunker_source,
+    buzhash_source,
+    reader_source,
     hashindex_source,
     item_source,
     checksums_source,
@@ -182,7 +184,8 @@ if not on_rtd:
         Extension("borg.compress", **compress_ext_kwargs),
         Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags),
         Extension("borg.item", [item_source], extra_compile_args=cflags),
-        Extension("borg.chunker", [chunker_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
+        Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
+        Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
         Extension("borg.checksums", **checksums_ext_kwargs),
     ]
 

+ 1 - 1
src/borg/archive.py

@@ -21,7 +21,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, Chunk
+from .chunkers import get_chunker, Chunk
 from .cache import ChunkListEntry, build_chunkindex_from_repo, delete_chunkindex_cache
 from .crypto.key import key_factory, UnsupportedPayloadError
 from .compress import CompressionSpec

+ 1 - 1
src/borg/archiver/benchmark_cmd.py

@@ -134,7 +134,7 @@ class BenchmarkMixIn:
         key_96 = os.urandom(12)
 
         import io
-        from ..chunker import get_chunker
+        from ..chunkers import get_chunker
 
         print("Chunkers =======================================================")
         size = "1GB"

+ 1 - 1
src/borg/archiver/transfer_cmd.py

@@ -2,7 +2,7 @@ import argparse
 
 from ._common import with_repository, with_other_repository, Highlander
 from ..archive import Archive, cached_hash, DownloadPipeline
-from ..chunker import get_chunker
+from ..chunkers import get_chunker
 from ..compress import CompressionSpec
 from ..constants import *  # NOQA
 from ..crypto.key import uses_same_id_hash, uses_same_chunker_secret

+ 0 - 781
src/borg/chunker.pyx

@@ -1,781 +0,0 @@
-# cython: language_level=3
-
-API_VERSION = '1.2_01'
-
-import cython
-import os
-import errno
-import time
-from collections import namedtuple
-from cpython.bytes cimport PyBytes_AsString
-from libc.stdint cimport uint8_t, uint32_t
-from libc.stdlib cimport malloc, free
-from libc.string cimport memcpy, memmove
-
-from .constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
-from .platform import safe_fadvise
-
-# this will be True if Python's seek implementation supports data/holes seeking.
-# this does not imply that it will actually work on the filesystem,
-# because the FS also needs to support this.
-has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
-
-
-_Chunk = namedtuple('_Chunk', 'meta data')
-_Chunk.__doc__ = """\
-    Chunk namedtuple
-
-    meta is always a dictionary, data depends on allocation.
-
-    data chunk read from a DATA range of a file (not from a sparse hole):
-        meta = {'allocation' = CH_DATA, 'size' = size_of_chunk }
-        data = read_data [bytes or memoryview]
-
-    all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero):
-        meta = {'allocation' = CH_ALLOC, 'size' = size_of_chunk }
-        data = None
-
-    all-zero chunk from a HOLE range of a file (from a sparse hole):
-        meta = {'allocation' = CH_HOLE, 'size' = size_of_chunk }
-        data = None
-"""
-
-def Chunk(data, **meta):
-    return _Chunk(meta, data)
-
-
-def dread(offset, size, fd=None, fh=-1):
-    use_fh = fh >= 0
-    if use_fh:
-        data = os.read(fh, size)
-        safe_fadvise(fh, offset, len(data), "DONTNEED")
-        return data
-    else:
-        return fd.read(size)
-
-
-def dseek(amount, whence, fd=None, fh=-1):
-    use_fh = fh >= 0
-    if use_fh:
-        return os.lseek(fh, amount, whence)
-    else:
-        return fd.seek(amount, whence)
-
-
-def dpos_curr_end(fd=None, fh=-1):
-    """
-    determine current position, file end position (== file length)
-    """
-    curr = dseek(0, os.SEEK_CUR, fd, fh)
-    end = dseek(0, os.SEEK_END, fd, fh)
-    dseek(curr, os.SEEK_SET, fd, fh)
-    return curr, end
-
-
-def sparsemap(fd=None, fh=-1):
-    """
-    generator yielding a (start, length, is_data) tuple for each range.
-    is_data is indicating data ranges (True) or hole ranges (False).
-
-    note:
-    the map is generated starting from the current seek position (it
-    is not required to be 0 / to be at the start of the file) and
-    work from there up to the end of the file.
-    when the generator is finished, the file pointer position will be
-    reset to where it was before calling this function.
-    """
-    curr, file_len = dpos_curr_end(fd, fh)
-    start = curr
-    try:
-        whence = os.SEEK_HOLE
-        while True:
-            is_data = whence == os.SEEK_HOLE  # True: range with data, False: range is a hole
-            try:
-                end = dseek(start, whence, fd, fh)
-            except OSError as e:
-                if e.errno == errno.ENXIO:
-                    if not is_data and start < file_len:
-                        # if there is a hole at the end of a file, we can not find the file end by SEEK_DATA
-                        # (because we run into ENXIO), thus we must manually deal with this case:
-                        end = file_len
-                        yield (start, end - start, is_data)
-                    break
-                else:
-                    raise
-            # we do not want to yield zero-length ranges with start == end:
-            if end > start:
-                yield (start, end - start, is_data)
-            start = end
-            whence = os.SEEK_DATA if is_data else os.SEEK_HOLE
-    finally:
-        # seek to same position as before calling this function
-        dseek(curr, os.SEEK_SET, fd, fh)
-
-
-class ChunkerFailing:
-    """
-    This is a very simple chunker for testing purposes.
-
-    Reads block_size chunks, starts failing at block <fail_start>, <fail_count> failures, then succeeds.
-    """
-    def __init__(self, block_size, map):
-        self.block_size = block_size
-        # one char per block: r/R = successful read, e/E = I/O Error, e.g.: "rrrrErrrEEr"
-        # blocks beyond the map will have same behaviour as the last map char indicates.
-        map = map.upper()
-        if not set(map).issubset({"R", "E"}):
-            raise ValueError("unsupported map character")
-        self.map = map
-        self.count = 0
-        self.chunking_time = 0.0  # not updated, just provided so that caller does not crash
-
-    def chunkify(self, fd=None, fh=-1):
-        """
-        Cut a file into chunks.
-
-        :param fd: Python file object
-        :param fh: OS-level file handle (if available),
-                   defaults to -1 which means not to use OS-level fd.
-        """
-        use_fh = fh >= 0
-        wanted = self.block_size
-        while True:
-            data = os.read(fh, wanted) if use_fh else fd.read(wanted)
-            got = len(data)
-            if got > 0:
-                idx = self.count if self.count < len(self.map) else -1
-                behaviour = self.map[idx]
-                if behaviour == "E":
-                    self.count += 1
-                    fname = None if use_fh else getattr(fd, "name", None)
-                    raise OSError(errno.EIO, "simulated I/O error", fname)
-                elif behaviour == "R":
-                    self.count += 1
-                    yield Chunk(data, size=got, allocation=CH_DATA)
-                else:
-                    raise ValueError("unsupported map character")
-            if got < wanted:
-                # we did not get enough data, looks like EOF.
-                return
-
-
-class FileFMAPReader:
-    """
-    This is for reading blocks from a file.
-
-    It optionally supports:
-
-    - using a sparsemap to read only data ranges and seek over hole ranges
-      for sparse files.
-    - using an externally given filemap to read only specific ranges from
-      a file.
-
-    Note: the last block of a data or hole range may be less than the read_size,
-          this is supported and not considered to be an error.
-    """
-    def __init__(self, *, fd=None, fh=-1, read_size=0, sparse=False, fmap=None):
-        assert fd is not None or fh >= 0
-        self.fd = fd
-        self.fh = fh
-        assert 0 < read_size <= len(zeros)
-        self.read_size = read_size  # how much data we want to read at once
-        self.reading_time = 0.0  # time spent in reading/seeking
-        # should borg try to do sparse input processing?
-        # whether it actually can be done depends on the input file being seekable.
-        self.try_sparse = sparse and has_seek_hole
-        self.fmap = fmap
-
-    def _build_fmap(self):
-        started_fmap = time.monotonic()
-        fmap = None
-        if self.try_sparse:
-            try:
-                fmap = list(sparsemap(self.fd, self.fh))
-            except OSError as err:
-                # seeking did not work
-                pass
-
-        if fmap is None:
-            # either sparse processing (building the fmap) was not tried or it failed.
-            # in these cases, we just build a "fake fmap" that considers the whole file
-            # as range(s) of data (no holes), so we can use the same code.
-            fmap = [(0, 2 ** 62, True), ]
-        self.reading_time += time.monotonic() - started_fmap
-        return fmap
-
-    def blockify(self):
-        """
-        Read <read_size> sized blocks from a file.
-        """
-        if self.fmap is None:
-            self.fmap = self._build_fmap()
-
-        offset = 0
-        for range_start, range_size, is_data in self.fmap:
-            if range_start != offset:
-                # this is for the case when the fmap does not cover the file completely,
-                # e.g. it could be without the ranges of holes or of unchanged data.
-                offset = range_start
-                dseek(offset, os.SEEK_SET, self.fd, self.fh)
-            while range_size:
-                started_reading = time.monotonic()
-                wanted = min(range_size, self.read_size)
-                if is_data:
-                    # read block from the range
-                    data = dread(offset, wanted, self.fd, self.fh)
-                    got = len(data)
-                    if zeros.startswith(data):
-                        data = None
-                        allocation = CH_ALLOC
-                    else:
-                        allocation = CH_DATA
-                else:  # hole
-                    # seek over block from the range
-                    pos = dseek(wanted, os.SEEK_CUR, self.fd, self.fh)
-                    got = pos - offset
-                    data = None
-                    allocation = CH_HOLE
-                self.reading_time += time.monotonic() - started_reading
-                if got > 0:
-                    offset += got
-                    range_size -= got
-                    yield Chunk(data, size=got, allocation=allocation)
-                if got < wanted:
-                    # we did not get enough data, looks like EOF.
-                    return
-
-
-class FileReader:
-    """
-    This is a buffered reader for file data.
-
-    It maintains a buffer that is filled with Chunks from the FileFMAPReader.blockify generator.
-    The data in that buffer is consumed by clients calling FileReader.read, which returns a Chunk.
-
-    Most of the complexity in here comes from the fact that the size requested by a
-    FileReader.read call does not need to match the Chunk sizes we got from the FileFMAPReader.
-    """
-    def __init__(self, *, fd=None, fh=-1, read_size=0, sparse=False, fmap=None):
-        assert read_size > 0
-        self.reader = FileFMAPReader(fd=fd, fh=fh, read_size=read_size, sparse=sparse, fmap=fmap)
-        self.buffer = []  # list of Chunk objects
-        self.offset = 0  # offset into the first buffer object's data
-        self.remaining_bytes = 0  # total bytes available in buffer
-        self.blockify_gen = None  # generator from FileFMAPReader.blockify
-        self.fd = fd
-        self.fh = fh
-        self.fmap = fmap
-
-    def _fill_buffer(self):
-        """
-        Fill the buffer with more data from the blockify generator.
-        Returns True if more data was added, False if EOF.
-        """
-        if self.blockify_gen is None:
-            return False
-
-        try:
-            chunk = next(self.blockify_gen)
-            # Store the Chunk object directly in the buffer
-            self.buffer.append(chunk)
-            self.remaining_bytes += chunk.meta["size"]
-            return True
-        except StopIteration:
-            self.blockify_gen = None
-            return False
-
-    def read(self, size):
-        """
-        Read a Chunk of up to 'size' bytes from the file.
-
-        This method tries to yield a Chunk of the requested size, if possible, by considering
-        multiple chunks from the buffer.
-
-        The allocation type of the resulting chunk depends on the allocation types of the contributing chunks:
-        - If one of the chunks is CH_DATA, it will create all-zero bytes for other chunks that are not CH_DATA
-        - If all contributing chunks are CH_HOLE, the resulting chunk will also be CH_HOLE
-        - If the contributing chunks are a mix of CH_HOLE and CH_ALLOC, the resulting chunk will be CH_HOLE
-
-        :param size: Number of bytes to read
-        :return: Chunk object containing the read data.
-                 If no data is available, returns Chunk(None, size=0, allocation=CH_ALLOC).
-                 If less than requested bytes were available (at EOF), the returned chunk might be smaller
-                 than requested.
-        """
-        # Initialize if not already done
-        if self.blockify_gen is None:
-            self.buffer = []
-            self.offset = 0
-            self.remaining_bytes = 0
-            self.blockify_gen = self.reader.blockify()
-
-        # If we don't have enough data in the buffer, try to fill it
-        while self.remaining_bytes < size:
-            if not self._fill_buffer():
-                # No more data available, return what we have
-                break
-
-        # If we have no data at all, return an empty Chunk
-        if not self.buffer:
-            return Chunk(b"", size=0, allocation=CH_DATA)
-
-        # Prepare to collect the requested data
-        result = bytearray()
-        bytes_to_read = min(size, self.remaining_bytes)
-        bytes_read = 0
-
-        # Track if we've seen different allocation types
-        has_data = False
-        has_hole = False
-        has_alloc = False
-
-        # Read data from the buffer, combining chunks as needed
-        while bytes_read < bytes_to_read and self.buffer:
-            chunk = self.buffer[0]
-            chunk_size = chunk.meta["size"]
-            allocation = chunk.meta["allocation"]
-            data = chunk.data
-
-            # Track allocation types
-            if allocation == CH_DATA:
-                has_data = True
-            elif allocation == CH_HOLE:
-                has_hole = True
-            elif allocation == CH_ALLOC:
-                has_alloc = True
-            else:
-                raise ValueError(f"Invalid allocation type: {allocation}")
-
-            # Calculate how much we can read from this chunk
-            available = chunk_size - self.offset
-            to_read = min(available, bytes_to_read - bytes_read)
-
-            # Process the chunk based on its allocation type
-            if allocation == CH_DATA:
-                assert data is not None
-                # For data chunks, add the actual data
-                result.extend(data[self.offset:self.offset + to_read])
-            else:
-                # For non-data chunks, add zeros if we've seen a data chunk
-                if has_data:
-                    result.extend(b'\0' * to_read)
-                # Otherwise, we'll just track the size without adding data
-
-            bytes_read += to_read
-
-            # Update offset or remove chunk if fully consumed
-            if to_read < available:
-                self.offset += to_read
-            else:
-                self.offset = 0
-                self.buffer.pop(0)
-
-            self.remaining_bytes -= to_read
-
-        # Determine the allocation type of the resulting chunk
-        if has_data:
-            # If any chunk was CH_DATA, the result is CH_DATA
-            return Chunk(bytes(result), size=bytes_read, allocation=CH_DATA)
-        elif has_hole:
-            # If any chunk was CH_HOLE (and none were CH_DATA), the result is CH_HOLE
-            return Chunk(None, size=bytes_read, allocation=CH_HOLE)
-        else:
-            # Otherwise, all chunks were CH_ALLOC
-            return Chunk(None, size=bytes_read, allocation=CH_ALLOC)
-
-
-class ChunkerFixed:
-    """
-    This is a simple chunker for input data with data usually staying at same
-    offset and / or with known block/record sizes:
-
-    - raw disk images
-    - block devices
-    - database files with simple header + fixed-size records layout
-
-    It optionally supports:
-
-    - a header block of different size
-    - using a sparsemap to read only data ranges and seek over hole ranges
-      for sparse files.
-    - using an externally given filemap to read only specific ranges from
-      a file.
-
-    Note: the last block of a data or hole range may be less than the block size,
-          this is supported and not considered to be an error.
-    """
-    def __init__(self, block_size, header_size=0, sparse=False):
-        self.block_size = block_size
-        self.header_size = header_size
-        self.chunking_time = 0.0  # likely will stay close to zero - not much to do here.
-        self.reader_block_size = 1024 * 1024
-        self.reader = None
-        self.sparse = sparse
-
-    def chunkify(self, fd=None, fh=-1, fmap=None):
-        """
-        Cut a file into chunks.
-
-        :param fd: Python file object
-        :param fh: OS-level file handle (if available),
-                   defaults to -1 which means not to use OS-level fd.
-        :param fmap: a file map, same format as generated by sparsemap
-        """
-        # Initialize the reader with the file descriptors
-        self.reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size,
-                                sparse=self.sparse, fmap=fmap)
-
-        # Handle header if present
-        if self.header_size > 0:
-            # Read the header block using read
-            started_chunking = time.monotonic()
-            header_chunk = self.reader.read(self.header_size)
-            self.chunking_time += time.monotonic() - started_chunking
-
-            if header_chunk.meta["size"] > 0:
-                # Yield the header chunk
-                yield header_chunk
-
-        # Process the rest of the file using read
-        while True:
-            started_chunking = time.monotonic()
-            chunk = self.reader.read(self.block_size)
-            self.chunking_time += time.monotonic() - started_chunking
-            size = chunk.meta["size"]
-            if size == 0:
-                break  # EOF
-            assert size <= self.block_size
-            yield chunk
-
-
-# Cyclic polynomial / buzhash
-#
-# https://en.wikipedia.org/wiki/Rolling_hash
-#
-# http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor)
-#
-# http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide)
-#
-# Some properties of buzhash / of this implementation:
-#
-# (1) the hash is designed for inputs <= 32 bytes, but the chunker uses it on a 4095 byte window;
-#     any repeating bytes at distance 32 within those 4095 bytes can cause cancellation within
-#     the hash function, e.g. in "X <any 31 bytes> X", the last X would cancel out the influence
-#     of the first X on the hash value.
-#
-# (2) the hash table is supposed to have (according to the BUZ) exactly a 50% distribution of
-#     0/1 bit values per position, but the hard coded table below doesn't fit that property.
-#
-# (3) if you would use a window size divisible by 64, the seed would cancel itself out completely.
-#     this is why we use a window size of 4095 bytes.
-#
-# Another quirk is that, even with the 4095 byte window, XORing the entire table by a constant
-# is equivalent to XORing the hash output with a different constant. but since the seed is stored
-# encrypted, i think it still serves its purpose.
-
-cdef uint32_t table_base[256]
-table_base = [
-    0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4,
-    0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436,
-    0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7,
-    0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a,
-    0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613,
-    0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0,
-    0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205,
-    0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7,
-    0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035,
-    0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d,
-    0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355,
-    0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f,
-    0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424,
-    0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86,
-    0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f,
-    0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf,
-    0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9,
-    0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186,
-    0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0,
-    0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35,
-    0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd,
-    0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4,
-    0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f,
-    0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec,
-    0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889,
-    0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4,
-    0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512,
-    0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009,
-    0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2,
-    0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c,
-    0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9,
-    0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b
-]
-
-# This seems to be the most reliable way to inline this code, using a C preprocessor macro:
-cdef extern from *:
-   """
-   #define BARREL_SHIFT(v, shift) (((v) << (shift)) | ((v) >> (((32 - (shift)) & 0x1f))))
-   """
-   uint32_t BARREL_SHIFT(uint32_t v, uint32_t shift)
-
-
-@cython.boundscheck(False)  # Deactivate bounds checking
-@cython.wraparound(False)  # Deactivate negative indexing.
-cdef uint32_t* buzhash_init_table(uint32_t seed):
-    """Initialize the buzhash table with the given seed."""
-    cdef int i
-    cdef uint32_t* table = <uint32_t*>malloc(1024)  # 256 * sizeof(uint32_t)
-    for i in range(256):
-        table[i] = table_base[i] ^ seed
-    return table
-
-
-@cython.boundscheck(False)  # Deactivate bounds checking
-@cython.wraparound(False)  # Deactivate negative indexing.
-@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
-cdef uint32_t _buzhash(const unsigned char* data, size_t len, const uint32_t* h):
-    """Calculate the buzhash of the given data."""
-    cdef uint32_t i
-    cdef uint32_t sum = 0, imod
-    for i in range(len - 1, 0, -1):
-        imod = i & 0x1f
-        sum ^= BARREL_SHIFT(h[data[0]], imod)
-        data += 1
-    return sum ^ h[data[0]]
-
-
-@cython.boundscheck(False)  # Deactivate bounds checking
-@cython.wraparound(False)  # Deactivate negative indexing.
-@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
-cdef uint32_t _buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t* h):
-    """Update the buzhash with a new byte."""
-    cdef uint32_t lenmod = len & 0x1f
-    return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add]
-
-
-cdef class Chunker:
-    """
-    Content-Defined Chunker, variable chunk sizes.
-
-    This chunker makes quite some effort to cut mostly chunks of the same-content, even if
-    the content moves to a different offset inside the file. It uses the buzhash
-    rolling-hash algorithm to identify the chunk cutting places by looking at the
-    content inside the moving window and computing the rolling hash value over the
-    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
-    Additionally it obeys some more criteria, like a minimum and maximum chunk size.
-    It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
-    """
-    cdef uint32_t chunk_mask
-    cdef uint32_t* table
-    cdef uint8_t* data
-    cdef object _fd  # Python object for file descriptor
-    cdef int fh
-    cdef int done, eof
-    cdef size_t min_size, buf_size, window_size, remaining, position, last
-    cdef long long bytes_read, bytes_yielded  # off_t in C, using long long for compatibility
-    cdef readonly float chunking_time
-    cdef object file_reader  # FileReader instance
-    cdef size_t reader_block_size
-    cdef bint sparse
-
-    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
-        min_size = 1 << chunk_min_exp
-        max_size = 1 << chunk_max_exp
-        assert max_size <= len(zeros)
-        # see chunker_process, first while loop condition, first term must be able to get True:
-        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
-
-        self.window_size = hash_window_size
-        self.chunk_mask = (1 << hash_mask_bits) - 1
-        self.min_size = min_size
-        self.table = buzhash_init_table(seed & 0xffffffff)
-        self.buf_size = max_size
-        self.data = <uint8_t*>malloc(self.buf_size)
-        self.fh = -1
-        self.done = 0
-        self.eof = 0
-        self.remaining = 0
-        self.position = 0
-        self.last = 0
-        self.bytes_read = 0
-        self.bytes_yielded = 0
-        self._fd = None
-        self.chunking_time = 0.0
-        self.reader_block_size = 1024 * 1024
-        self.sparse = sparse
-
-    def __dealloc__(self):
-        """Free the chunker's resources."""
-        if self.table != NULL:
-            free(self.table)
-            self.table = NULL
-        if self.data != NULL:
-            free(self.data)
-            self.data = NULL
-
-    cdef int fill(self) except 0:
-        """Fill the chunker's buffer with more data."""
-        cdef ssize_t n
-        cdef object chunk
-
-        # Move remaining data to the beginning of the buffer
-        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
-        self.position -= self.last
-        self.last = 0
-        n = self.buf_size - self.position - self.remaining
-
-        if self.eof or n == 0:
-            return 1
-
-        # Use FileReader to read data
-        chunk = self.file_reader.read(n)
-        n = chunk.meta["size"]
-
-        if n > 0:
-            # Only copy data if it's not a hole
-            if chunk.meta["allocation"] == CH_DATA:
-                # Copy data from chunk to our buffer
-                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(chunk.data), n)
-            else:
-                # For holes, fill with zeros
-                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(zeros[:n]), n)
-
-            self.remaining += n
-            self.bytes_read += n
-        else:
-            self.eof = 1
-
-        return 1
-
-    cdef object process(self) except *:
-        """Process the chunker's buffer and return the next chunk."""
-        cdef uint32_t sum, chunk_mask = self.chunk_mask
-        cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size
-        cdef uint8_t* p
-        cdef uint8_t* stop_at
-        cdef size_t did_bytes
-
-        if self.done:
-            if self.bytes_read == self.bytes_yielded:
-                raise StopIteration
-            else:
-                raise Exception("chunkifier byte count mismatch")
-
-        while self.remaining < min_size + window_size + 1 and not self.eof:  # see assert in Chunker init
-            if not self.fill():
-                return None
-
-        # Here we either are at eof...
-        if self.eof:
-            self.done = 1
-            if self.remaining:
-                self.bytes_yielded += self.remaining
-                # Return a memory view of the remaining data
-                return memoryview((self.data + self.position)[:self.remaining])
-            else:
-                if self.bytes_read == self.bytes_yielded:
-                    raise StopIteration
-                else:
-                    raise Exception("chunkifier byte count mismatch")
-
-        # ... or we have at least min_size + window_size + 1 bytes remaining.
-        # We do not want to "cut" a chunk smaller than min_size and the hash
-        # window starts at the potential cutting place.
-        self.position += min_size
-        self.remaining -= min_size
-        sum = _buzhash(self.data + self.position, window_size, self.table)
-
-        while self.remaining > self.window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size):
-            p = self.data + self.position
-            stop_at = p + self.remaining - window_size
-
-            while p < stop_at and (sum & chunk_mask):
-                sum = _buzhash_update(sum, p[0], p[window_size], window_size, self.table)
-                p += 1
-
-            did_bytes = p - (self.data + self.position)
-            self.position += did_bytes
-            self.remaining -= did_bytes
-
-            if self.remaining <= window_size:
-                if not self.fill():
-                    return None
-
-        if self.remaining <= window_size:
-            self.position += self.remaining
-            self.remaining = 0
-
-        old_last = self.last
-        self.last = self.position
-        n = self.last - old_last
-        self.bytes_yielded += n
-
-        # Return a memory view of the chunk
-        return memoryview((self.data + old_last)[:n])
-
-    def chunkify(self, fd, fh=-1, fmap=None):
-        """
-        Cut a file into chunks.
-
-        :param fd: Python file object
-        :param fh: OS-level file handle (if available),
-                   defaults to -1 which means not to use OS-level fd.
-        :param fmap: a file map, same format as generated by sparsemap
-        """
-        self._fd = fd
-        self.fh = fh
-        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
-        self.done = 0
-        self.remaining = 0
-        self.bytes_read = 0
-        self.bytes_yielded = 0
-        self.position = 0
-        self.last = 0
-        self.eof = 0
-        return self
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        started_chunking = time.monotonic()
-        data = self.process()
-        got = len(data)
-        # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
-        # but we can just check if data was all-zero (and either came from a hole
-        # or from stored zeros - we can not detect that here).
-        if zeros.startswith(data):
-            data = None
-            allocation = CH_ALLOC
-        else:
-            allocation = CH_DATA
-        self.chunking_time += time.monotonic() - started_chunking
-        return Chunk(data, size=got, allocation=allocation)
-
-
-def buzhash(data, unsigned long seed):
-    cdef uint32_t *table
-    cdef uint32_t sum
-    table = buzhash_init_table(seed & 0xffffffff)
-    sum = _buzhash(<const unsigned char *> data, len(data), table)
-    free(table)
-    return sum
-
-
-def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
-    cdef uint32_t *table
-    table = buzhash_init_table(seed & 0xffffffff)
-    sum = _buzhash_update(sum, remove, add, len, table)
-    free(table)
-    return sum
-
-
-def get_chunker(algo, *params, **kw):
-    if algo == 'buzhash':
-        seed = kw['seed']
-        sparse = kw['sparse']
-        return Chunker(seed, *params, sparse=sparse)
-    if algo == 'fixed':
-        sparse = kw['sparse']
-        return ChunkerFixed(*params, sparse=sparse)
-    if algo == 'fail':
-        return ChunkerFailing(*params)
-    raise TypeError('unsupported chunker algo %r' % algo)
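
The move leaves the Chunk convention described in the old docstring unchanged: meta is always a dict carrying at least size and allocation, and data is bytes for CH_DATA chunks but None for holes and all-zero allocations. A quick illustration, using the new import path introduced by this PR (output comments show the symbolic constants, whose concrete int values live in borg.constants):

    from borg.chunkers.reader import Chunk
    from borg.constants import CH_DATA, CH_HOLE

    c = Chunk(b"abc", size=3, allocation=CH_DATA)
    print(c.meta)       # {'size': 3, 'allocation': CH_DATA}
    print(c.data)       # b'abc'

    hole = Chunk(None, size=4096, allocation=CH_HOLE)
    print(hole.data)    # None - hole/alloc chunks carry no payload, only a size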

+ 19 - 0
src/borg/chunkers/__init__.py

@@ -0,0 +1,19 @@
+from .buzhash import Chunker
+from .failing import ChunkerFailing
+from .fixed import ChunkerFixed
+from .reader import *  # noqa
+
+API_VERSION = "1.2_01"
+
+
+def get_chunker(algo, *params, **kw):
+    if algo == "buzhash":
+        seed = kw["seed"]
+        sparse = kw["sparse"]
+        return Chunker(seed, *params, sparse=sparse)
+    if algo == "fixed":
+        sparse = kw["sparse"]
+        return ChunkerFixed(*params, sparse=sparse)
+    if algo == "fail":
+        return ChunkerFailing(*params)
+    raise TypeError("unsupported chunker algo %r" % algo)
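
The new package keeps get_chunker as the single factory entry point, so call sites only have to change their import. A minimal sketch of how it might be invoked; the parameter values are illustrative, not mandated by this diff:

    import io

    from borg.chunkers import get_chunker

    # "fixed": positional params are block_size (and optional header_size);
    # the sparse kwarg is required by the factory.
    fixed = get_chunker("fixed", 4096, sparse=False)

    # "buzhash": positional params are chunk_min_exp, chunk_max_exp,
    # hash_mask_bits, hash_window_size; seed and sparse come in as kwargs.
    cdc = get_chunker("buzhash", 19, 23, 21, 4095, seed=0, sparse=False)

    for chunk in fixed.chunkify(io.BytesIO(b"x" * 10000)):
        print(chunk.meta["size"], chunk.meta["allocation"])  # 4096, 4096, 1808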

+ 20 - 0
src/borg/chunkers/buzhash.pyi

@@ -0,0 +1,20 @@
+from typing import List, Iterator, BinaryIO
+
+from .reader import fmap_entry
+
+API_VERSION: str
+
+def buzhash(data: bytes, seed: int) -> int: ...
+def buzhash_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ...
+
+class Chunker:
+    def __init__(
+        self,
+        seed: int,
+        chunk_min_exp: int,
+        chunk_max_exp: int,
+        hash_mask_bits: int,
+        hash_window_size: int,
+        sparse: bool = False,
+    ) -> None: ...
+    def chunkify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
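
The stub spells out the constructor parameters; the exponents translate to byte sizes as powers of two, and __cinit__ asserts that a maximum-size buffer can always hold the hash window plus a minimum-size chunk. A quick sanity check with commonly used values (assumed here for illustration):

    chunk_min_exp, chunk_max_exp = 19, 23        # 512 KiB minimum, 8 MiB maximum
    hash_mask_bits = 21                          # ~2 MiB average chunk size target
    hash_window_size = 4095                      # not divisible by 64, see buzhash notes

    min_size, max_size = 1 << chunk_min_exp, 1 << chunk_max_exp
    assert hash_window_size + min_size + 1 <= max_size  # mirrors the __cinit__ assert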

+ 332 - 0
src/borg/chunkers/buzhash.pyx

@@ -0,0 +1,332 @@
+# cython: language_level=3
+
+API_VERSION = '1.2_01'
+
+import cython
+import time
+from cpython.bytes cimport PyBytes_AsString
+from libc.stdint cimport uint8_t, uint32_t
+from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy, memmove
+
+from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
+from .reader import FileReader, Chunk
+
+# Cyclic polynomial / buzhash
+#
+# https://en.wikipedia.org/wiki/Rolling_hash
+#
+# http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor)
+#
+# http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide)
+#
+# Some properties of buzhash / of this implementation:
+#
+# (1) the hash is designed for inputs <= 32 bytes, but the chunker uses it on a 4095 byte window;
+#     any repeating bytes at distance 32 within those 4095 bytes can cause cancellation within
+#     the hash function, e.g. in "X <any 31 bytes> X", the last X would cancel out the influence
+#     of the first X on the hash value.
+#
+# (2) the hash table is supposed to have (according to the BUZ) exactly a 50% distribution of
+#     0/1 bit values per position, but the hard coded table below doesn't fit that property.
+#
+# (3) if you would use a window size divisible by 64, the seed would cancel itself out completely.
+#     this is why we use a window size of 4095 bytes.
+#
+# Another quirk is that, even with the 4095 byte window, XORing the entire table by a constant
+# is equivalent to XORing the hash output with a different constant. But since the seed is stored
+# encrypted, I think it still serves its purpose.
+
+cdef uint32_t table_base[256]
+table_base = [
+    0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4,
+    0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436,
+    0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7,
+    0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a,
+    0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613,
+    0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0,
+    0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205,
+    0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7,
+    0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035,
+    0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d,
+    0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355,
+    0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f,
+    0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424,
+    0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86,
+    0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f,
+    0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf,
+    0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9,
+    0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186,
+    0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0,
+    0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35,
+    0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd,
+    0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4,
+    0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f,
+    0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec,
+    0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889,
+    0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4,
+    0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512,
+    0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009,
+    0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2,
+    0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c,
+    0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9,
+    0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b
+]
+
+# This seems to be the most reliable way to inline this code, using a C preprocessor macro:
+cdef extern from *:
+   """
+   #define BARREL_SHIFT(v, shift) (((v) << (shift)) | ((v) >> (((32 - (shift)) & 0x1f))))
+   """
+   uint32_t BARREL_SHIFT(uint32_t v, uint32_t shift)
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)  # Deactivate negative indexing.
+cdef uint32_t* buzhash_init_table(uint32_t seed):
+    """Initialize the buzhash table with the given seed."""
+    cdef int i
+    cdef uint32_t* table = <uint32_t*>malloc(1024)  # 256 * sizeof(uint32_t)
+    for i in range(256):
+        table[i] = table_base[i] ^ seed
+    return table
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)  # Deactivate negative indexing.
+@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
+cdef uint32_t _buzhash(const unsigned char* data, size_t len, const uint32_t* h):
+    """Calculate the buzhash of the given data."""
+    cdef uint32_t i
+    cdef uint32_t sum = 0, imod
+    for i in range(len - 1, 0, -1):
+        imod = i & 0x1f
+        sum ^= BARREL_SHIFT(h[data[0]], imod)
+        data += 1
+    return sum ^ h[data[0]]
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)  # Deactivate negative indexing.
+@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
+cdef uint32_t _buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t* h):
+    """Update the buzhash with a new byte."""
+    cdef uint32_t lenmod = len & 0x1f
+    return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add]
+
+
+cdef class Chunker:
+    """
+    Content-Defined Chunker, variable chunk sizes.
+
+    This chunker makes quite some effort to mostly cut chunks of the same content, even if
+    the content moves to a different offset inside the file. It uses the buzhash
+    rolling-hash algorithm to identify the chunk cutting places by looking at the
+    content inside the moving window and computing the rolling hash value over the
+    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
+    Additionally it obeys some more criteria, like a minimum and maximum chunk size.
+    It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
+    """
+    cdef uint32_t chunk_mask
+    cdef uint32_t* table
+    cdef uint8_t* data
+    cdef object _fd  # Python object for file descriptor
+    cdef int fh
+    cdef int done, eof
+    cdef size_t min_size, buf_size, window_size, remaining, position, last
+    cdef long long bytes_read, bytes_yielded  # off_t in C, using long long for compatibility
+    cdef readonly float chunking_time
+    cdef object file_reader  # FileReader instance
+    cdef size_t reader_block_size
+    cdef bint sparse
+
+    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
+        # see chunker_process, first while loop condition, first term must be able to get True:
+        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
+
+        self.window_size = hash_window_size
+        self.chunk_mask = (1 << hash_mask_bits) - 1
+        self.min_size = min_size
+        self.table = buzhash_init_table(seed & 0xffffffff)
+        self.buf_size = max_size
+        self.data = <uint8_t*>malloc(self.buf_size)
+        self.fh = -1
+        self.done = 0
+        self.eof = 0
+        self.remaining = 0
+        self.position = 0
+        self.last = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self._fd = None
+        self.chunking_time = 0.0
+        self.reader_block_size = 1024 * 1024
+        self.sparse = sparse
+
+    def __dealloc__(self):
+        """Free the chunker's resources."""
+        if self.table != NULL:
+            free(self.table)
+            self.table = NULL
+        if self.data != NULL:
+            free(self.data)
+            self.data = NULL
+
+    cdef int fill(self) except 0:
+        """Fill the chunker's buffer with more data."""
+        cdef ssize_t n
+        cdef object chunk
+
+        # Move remaining data to the beginning of the buffer
+        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
+        self.position -= self.last
+        self.last = 0
+        n = self.buf_size - self.position - self.remaining
+
+        if self.eof or n == 0:
+            return 1
+
+        # Use FileReader to read data
+        chunk = self.file_reader.read(n)
+        n = chunk.meta["size"]
+
+        if n > 0:
+            # Only copy data if it's not a hole
+            if chunk.meta["allocation"] == CH_DATA:
+                # Copy data from chunk to our buffer
+                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(chunk.data), n)
+            else:
+                # For holes, fill with zeros
+                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(zeros[:n]), n)
+
+            self.remaining += n
+            self.bytes_read += n
+        else:
+            self.eof = 1
+
+        return 1
+
+    cdef object process(self) except *:
+        """Process the chunker's buffer and return the next chunk."""
+        cdef uint32_t sum, chunk_mask = self.chunk_mask
+        cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size
+        cdef uint8_t* p
+        cdef uint8_t* stop_at
+        cdef size_t did_bytes
+
+        if self.done:
+            if self.bytes_read == self.bytes_yielded:
+                raise StopIteration
+            else:
+                raise Exception("chunkifier byte count mismatch")
+
+        while self.remaining < min_size + window_size + 1 and not self.eof:  # see assert in Chunker init
+            if not self.fill():
+                return None
+
+        # Here we either are at eof...
+        if self.eof:
+            self.done = 1
+            if self.remaining:
+                self.bytes_yielded += self.remaining
+                # Return a memory view of the remaining data
+                return memoryview((self.data + self.position)[:self.remaining])
+            else:
+                if self.bytes_read == self.bytes_yielded:
+                    raise StopIteration
+                else:
+                    raise Exception("chunkifier byte count mismatch")
+
+        # ... or we have at least min_size + window_size + 1 bytes remaining.
+        # We do not want to "cut" a chunk smaller than min_size and the hash
+        # window starts at the potential cutting place.
+        self.position += min_size
+        self.remaining -= min_size
+        sum = _buzhash(self.data + self.position, window_size, self.table)
+
+        while self.remaining > self.window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size):
+            p = self.data + self.position
+            stop_at = p + self.remaining - window_size
+
+            while p < stop_at and (sum & chunk_mask):
+                sum = _buzhash_update(sum, p[0], p[window_size], window_size, self.table)
+                p += 1
+
+            did_bytes = p - (self.data + self.position)
+            self.position += did_bytes
+            self.remaining -= did_bytes
+
+            if self.remaining <= window_size:
+                if not self.fill():
+                    return None
+
+        if self.remaining <= window_size:
+            self.position += self.remaining
+            self.remaining = 0
+
+        old_last = self.last
+        self.last = self.position
+        n = self.last - old_last
+        self.bytes_yielded += n
+
+        # Return a memory view of the chunk
+        return memoryview((self.data + old_last)[:n])
+
+    def chunkify(self, fd, fh=-1, fmap=None):
+        """
+        Cut a file into chunks.
+
+        :param fd: Python file object
+        :param fh: OS-level file handle (if available),
+                   defaults to -1 which means not to use OS-level fd.
+        :param fmap: a file map, same format as generated by sparsemap
+        """
+        self._fd = fd
+        self.fh = fh
+        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
+        self.done = 0
+        self.remaining = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self.position = 0
+        self.last = 0
+        self.eof = 0
+        return self
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        started_chunking = time.monotonic()
+        data = self.process()
+        got = len(data)
+        # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
+        # but we can just check if data was all-zero (and either came from a hole
+        # or from stored zeros - we can not detect that here).
+        if zeros.startswith(data):
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
+        return Chunk(data, size=got, allocation=allocation)
+
+
+def buzhash(data, unsigned long seed):
+    cdef uint32_t *table
+    cdef uint32_t sum
+    table = buzhash_init_table(seed & 0xffffffff)
+    sum = _buzhash(<const unsigned char *> data, len(data), table)
+    free(table)
+    return sum
+
+
+def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
+    cdef uint32_t *table
+    table = buzhash_init_table(seed & 0xffffffff)
+    sum = _buzhash_update(sum, remove, add, len, table)
+    free(table)
+    return sum
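
The module-level buzhash/buzhash_update wrappers make the rolling property easy to check from Python: hashing a window shifted by one byte gives the same value as updating the previous hash with the byte that left and the byte that entered. A small sketch; window size and input are arbitrary test values:

    from borg.chunkers.buzhash import buzhash, buzhash_update

    data = b"0123456789abcdef" * 4   # 64 bytes of arbitrary test input
    n = 32                           # rolling window size for this check
    seed = 0xdeadbeef

    h0 = buzhash(data[0:n], seed)        # hash of window data[0:32]
    h1 = buzhash(data[1:n + 1], seed)    # hash of window data[1:33]
    assert h1 == buzhash_update(h0, data[0], data[n], n, seed)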

+ 56 - 0
src/borg/chunkers/failing.py

@@ -0,0 +1,56 @@
+API_VERSION = "1.2_01"
+
+import os
+import errno
+from typing import BinaryIO, Iterator
+
+from ..constants import CH_DATA
+from .reader import Chunk
+
+
+class ChunkerFailing:
+    """
+    This is a very simple chunker for testing purposes.
+
+    Reads chunks of block_size bytes; the map string determines, per block, whether the
+    read succeeds (r/R) or fails with a simulated I/O error (e/E).
+    """
+
+    def __init__(self, block_size: int, map: str) -> None:
+        self.block_size = block_size
+        # one char per block: r/R = successful read, e/E = I/O Error, e.g.: "rrrrErrrEEr"
+        # blocks beyond the map will have the same behaviour as the last map char indicates.
+        map = map.upper()
+        if not set(map).issubset({"R", "E"}):
+            raise ValueError("unsupported map character")
+        self.map = map
+        self.count = 0
+        self.chunking_time = 0.0  # not updated, just provided so that caller does not crash
+
+    def chunkify(self, fd: BinaryIO = None, fh: int = -1) -> Iterator:
+        """
+        Cut a file into chunks.
+
+        :param fd: Python file object
+        :param fh: OS-level file handle (if available),
+                   defaults to -1 which means not to use OS-level fd.
+        """
+        use_fh = fh >= 0
+        wanted = self.block_size
+        while True:
+            data = os.read(fh, wanted) if use_fh else fd.read(wanted)
+            got = len(data)
+            if got > 0:
+                idx = self.count if self.count < len(self.map) else -1
+                behaviour = self.map[idx]
+                if behaviour == "E":
+                    self.count += 1
+                    fname = None if use_fh else getattr(fd, "name", None)
+                    raise OSError(errno.EIO, "simulated I/O error", fname)
+                elif behaviour == "R":
+                    self.count += 1
+                    yield Chunk(data, size=got, allocation=CH_DATA)
+                else:
+                    raise ValueError("unsupported map character")
+            if got < wanted:
+                # we did not get enough data, looks like EOF.
+                return
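
Since ChunkerFailing is now plain Python, exercising it needs no compiled module. A minimal sketch of the map semantics:

    import io

    from borg.chunkers import ChunkerFailing

    # "rre": two successful 4-byte reads, then a simulated I/O error;
    # blocks beyond the map repeat its last character ("e" here).
    chunker = ChunkerFailing(4, "rre")
    chunks = chunker.chunkify(io.BytesIO(b"0123456789ab"))

    print(next(chunks).data)  # b'0123'
    print(next(chunks).data)  # b'4567'
    next(chunks)              # raises OSError(EIO, "simulated I/O error")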

+ 71 - 0
src/borg/chunkers/fixed.py

@@ -0,0 +1,71 @@
+from typing import List, Iterator, BinaryIO
+
+API_VERSION = "1.2_01"
+
+import time
+
+from .reader import FileReader
+
+
+class ChunkerFixed:
+    """
+    This is a simple chunker for input data that usually stays at the same
+    offset and/or has known block/record sizes:
+
+    - raw disk images
+    - block devices
+    - database files with simple header + fixed-size records layout
+
+    It optionally supports:
+
+    - a header block of different size
+    - using a sparsemap to read only data ranges and seek over hole ranges
+      for sparse files.
+    - using an externally given filemap to read only specific ranges from
+      a file.
+
+    Note: the last block of a data or hole range may be smaller than the block size;
+          this is supported and not considered to be an error.
+    """
+
+    def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None:
+        self.block_size = block_size
+        self.header_size = header_size
+        self.chunking_time = 0.0  # likely will stay close to zero - not much to do here.
+        self.reader_block_size = 1024 * 1024
+        self.reader: FileReader = None
+        self.sparse = sparse
+
+    def chunkify(self, fd: BinaryIO = None, fh: int = -1, fmap: List = None) -> Iterator:
+        """
+        Cut a file into chunks.
+
+        :param fd: Python file object
+        :param fh: OS-level file handle (if available),
+                   defaults to -1 which means not to use OS-level fd.
+        :param fmap: a file map, same format as generated by sparsemap
+        """
+        # Initialize the reader with the file descriptors
+        self.reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
+
+        # Handle header if present
+        if self.header_size > 0:
+            # Read the header block using read
+            started_chunking = time.monotonic()
+            header_chunk = self.reader.read(self.header_size)
+            self.chunking_time += time.monotonic() - started_chunking
+
+            if header_chunk.meta["size"] > 0:
+                # Yield the header chunk
+                yield header_chunk
+
+        # Process the rest of the file using read
+        while True:
+            started_chunking = time.monotonic()
+            chunk = self.reader.read(self.block_size)
+            self.chunking_time += time.monotonic() - started_chunking
+            size = chunk.meta["size"]
+            if size == 0:
+                break  # EOF
+            assert size <= self.block_size
+            yield chunk
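
ChunkerFixed is likewise pure Python now, delegating all file access to FileReader. A small sketch of the header-plus-fixed-size-records behaviour; the sizes are illustrative:

    import io

    from borg.chunkers import ChunkerFixed

    # 512-byte header followed by 4 KiB records; the final record may be short.
    chunker = ChunkerFixed(4096, header_size=512)
    fd = io.BytesIO(b"\x01" * 512 + b"\x02" * 10000)

    sizes = [chunk.meta["size"] for chunk in chunker.chunkify(fd)]
    print(sizes)  # [512, 4096, 4096, 1808]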

+ 1 - 18
src/borg/chunker.pyi → src/borg/chunkers/reader.pyi

@@ -1,4 +1,4 @@
-from typing import NamedTuple, Tuple, List, Dict, Any, Type, Iterator, BinaryIO
+from typing import NamedTuple, Tuple, Dict, List, Any, Type, BinaryIO, Iterator
 
 API_VERSION: str
 
@@ -9,18 +9,11 @@ class _Chunk(NamedTuple):
     meta: Dict[str, Any]
 
 def Chunk(data: bytes, **meta) -> Type[_Chunk]: ...
-def buzhash(data: bytes, seed: int) -> int: ...
-def buzhash_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ...
-def get_chunker(algo: str, *params, **kw) -> Any: ...
 
 fmap_entry = Tuple[int, int, bool]
 
 def sparsemap(fd: BinaryIO = None, fh: int = -1) -> List[fmap_entry]: ...
 
-class ChunkerFailing:
-    def __init__(self, block_size: int, map: str) -> None: ...
-    def chunkify(self, fd: BinaryIO = None, fh: int = -1) -> Iterator: ...
-
 class FileFMAPReader:
     def __init__(
         self,
@@ -46,13 +39,3 @@ class FileReader:
     ) -> None: ...
     def _fill_buffer(self) -> bool: ...
     def read(self, size: int) -> Type[_Chunk]: ...
-
-class ChunkerFixed:
-    def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...
-    def chunkify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
-
-class Chunker:
-    def __init__(
-        self, seed: int, chunk_min_exp: int, chunk_max_exp: int, hash_mask_bits: int, hash_window_size: int
-    ) -> None: ...
-    def chunkify(self, fd: BinaryIO = None, fh: int = -1) -> Iterator: ...

+ 333 - 0
src/borg/chunkers/reader.pyx

@@ -0,0 +1,333 @@
+# cython: language_level=3
+
+API_VERSION = '1.2_01'
+
+import os
+import errno
+import time
+from collections import namedtuple
+
+from ..platform import safe_fadvise
+from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
+
+# this will be True if Python's seek implementation supports data/holes seeking.
+# this does not imply that it will actually work on the filesystem,
+# because the FS also needs to support this.
+has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
+
+_Chunk = namedtuple('_Chunk', 'meta data')
+_Chunk.__doc__ = """\
+    Chunk namedtuple
+
+    meta is always a dictionary, data depends on allocation.
+
+    data chunk read from a DATA range of a file (not from a sparse hole):
+        meta = {'allocation': CH_DATA, 'size': size_of_chunk}
+        data = read_data [bytes or memoryview]
+
+    all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero):
+        meta = {'allocation': CH_ALLOC, 'size': size_of_chunk}
+        data = None
+
+    all-zero chunk from a HOLE range of a file (from a sparse hole):
+        meta = {'allocation': CH_HOLE, 'size': size_of_chunk}
+        data = None
+"""
+
+def Chunk(data, **meta):
+    return _Chunk(meta, data)
+
+
+def dread(offset, size, fd=None, fh=-1):
+    use_fh = fh >= 0
+    if use_fh:
+        data = os.read(fh, size)
+        safe_fadvise(fh, offset, len(data), "DONTNEED")
+        return data
+    else:
+        return fd.read(size)
+
+
+def dseek(amount, whence, fd=None, fh=-1):
+    use_fh = fh >= 0
+    if use_fh:
+        return os.lseek(fh, amount, whence)
+    else:
+        return fd.seek(amount, whence)
+
+
+def dpos_curr_end(fd=None, fh=-1):
+    """
+    determine current position, file end position (== file length)
+    """
+    curr = dseek(0, os.SEEK_CUR, fd, fh)
+    end = dseek(0, os.SEEK_END, fd, fh)
+    dseek(curr, os.SEEK_SET, fd, fh)
+    return curr, end
+
+
+def sparsemap(fd=None, fh=-1):
+    """
+    generator yielding a (start, length, is_data) tuple for each range.
+    is_data indicates data ranges (True) or hole ranges (False).
+
+    note:
+    the map is generated starting from the current seek position (it
+    is not required to be 0 / to be at the start of the file) and
+    works from there up to the end of the file.
+    when the generator is finished, the file pointer position will be
+    reset to where it was before calling this function.
+    """
+    curr, file_len = dpos_curr_end(fd, fh)
+    start = curr
+    try:
+        whence = os.SEEK_HOLE
+        while True:
+            is_data = whence == os.SEEK_HOLE  # True: range with data, False: range is a hole
+            try:
+                end = dseek(start, whence, fd, fh)
+            except OSError as e:
+                if e.errno == errno.ENXIO:
+                    if not is_data and start < file_len:
+                        # if there is a hole at the end of a file, we cannot find the file end via SEEK_DATA
+                        # (because we run into ENXIO), thus we must manually deal with this case:
+                        end = file_len
+                        yield (start, end - start, is_data)
+                    break
+                else:
+                    raise
+            # we do not want to yield zero-length ranges with start == end:
+            if end > start:
+                yield (start, end - start, is_data)
+            start = end
+            whence = os.SEEK_DATA if is_data else os.SEEK_HOLE
+    finally:
+        # seek to same position as before calling this function
+        dseek(curr, os.SEEK_SET, fd, fh)
+
+
+class FileFMAPReader:
+    """
+    This is for reading blocks from a file.
+
+    It optionally supports:
+
+    - using a sparsemap to read only data ranges and seek over hole ranges
+      for sparse files.
+    - using an externally given filemap to read only specific ranges from
+      a file.
+
+    Note: the last block of a data or hole range may be less than the read_size;
+          this is supported and not considered to be an error.
+    """
+    def __init__(self, *, fd=None, fh=-1, read_size=0, sparse=False, fmap=None):
+        assert fd is not None or fh >= 0
+        self.fd = fd
+        self.fh = fh
+        assert 0 < read_size <= len(zeros)
+        self.read_size = read_size  # how much data we want to read at once
+        self.reading_time = 0.0  # time spent in reading/seeking
+        # should borg try to do sparse input processing?
+        # whether it actually can be done depends on the input file being seekable.
+        self.try_sparse = sparse and has_seek_hole
+        self.fmap = fmap
+
+    def _build_fmap(self):
+        started_fmap = time.monotonic()
+        fmap = None
+        if self.try_sparse:
+            try:
+                fmap = list(sparsemap(self.fd, self.fh))
+            except OSError:
+                # seeking did not work, the fs likely does not support SEEK_DATA/SEEK_HOLE
+                pass
+
+        if fmap is None:
+            # either sparse processing (building the fmap) was not tried or it failed.
+            # in these cases, we just build a "fake fmap" that considers the whole file
+            # as range(s) of data (no holes), so we can use the same code.
+            fmap = [(0, 2 ** 62, True), ]
+        self.reading_time += time.monotonic() - started_fmap
+        return fmap
+
+    def blockify(self):
+        """
+        Read <read_size> sized blocks from a file.
+        """
+        if self.fmap is None:
+            self.fmap = self._build_fmap()
+
+        offset = 0
+        for range_start, range_size, is_data in self.fmap:
+            if range_start != offset:
+                # this is for the case when the fmap does not cover the file completely,
+                # e.g. it could be without the ranges of holes or of unchanged data.
+                offset = range_start
+                dseek(offset, os.SEEK_SET, self.fd, self.fh)
+            while range_size:
+                started_reading = time.monotonic()
+                wanted = min(range_size, self.read_size)
+                if is_data:
+                    # read block from the range
+                    data = dread(offset, wanted, self.fd, self.fh)
+                    got = len(data)
+                    if zeros.startswith(data):
+                        data = None
+                        allocation = CH_ALLOC
+                    else:
+                        allocation = CH_DATA
+                else:  # hole
+                    # seek over block from the range
+                    pos = dseek(wanted, os.SEEK_CUR, self.fd, self.fh)
+                    got = pos - offset
+                    data = None
+                    allocation = CH_HOLE
+                self.reading_time += time.monotonic() - started_reading
+                if got > 0:
+                    offset += got
+                    range_size -= got
+                    yield Chunk(data, size=got, allocation=allocation)
+                if got < wanted:
+                    # we did not get enough data, looks like EOF.
+                    return
+
+
+class FileReader:
+    """
+    This is a buffered reader for file data.
+
+    It maintains a buffer that is filled with Chunks from the FileFMAPReader.blockify generator.
+    The data in that buffer is consumed by clients calling FileReader.read, which returns a Chunk.
+
+    Most of the complexity in here comes from the fact that the size a caller requests
+    via FileReader.read does not need to match the Chunk sizes we got from the FileFMAPReader.
+    """
+    def __init__(self, *, fd=None, fh=-1, read_size=0, sparse=False, fmap=None):
+        assert read_size > 0
+        self.reader = FileFMAPReader(fd=fd, fh=fh, read_size=read_size, sparse=sparse, fmap=fmap)
+        self.buffer = []  # list of Chunk objects
+        self.offset = 0  # offset into the first buffer object's data
+        self.remaining_bytes = 0  # total bytes available in buffer
+        self.blockify_gen = None  # generator from FileFMAPReader.blockify
+        self.fd = fd
+        self.fh = fh
+        self.fmap = fmap
+
+    def _fill_buffer(self):
+        """
+        Fill the buffer with more data from the blockify generator.
+        Returns True if more data was added, False if EOF.
+        """
+        if self.blockify_gen is None:
+            return False
+
+        try:
+            chunk = next(self.blockify_gen)
+            # Store the Chunk object directly in the buffer
+            self.buffer.append(chunk)
+            self.remaining_bytes += chunk.meta["size"]
+            return True
+        except StopIteration:
+            self.blockify_gen = None
+            return False
+
+    def read(self, size):
+        """
+        Read a Chunk of up to 'size' bytes from the file.
+
+        This method tries to return a Chunk of the requested size, if possible, by considering
+        multiple chunks from the buffer.
+
+        The allocation type of the resulting chunk depends on the allocation types of the contributing chunks:
+        - If any contributing chunk is CH_DATA, the result is CH_DATA; all-zero bytes are materialized for the contributors that are not CH_DATA
+        - If all contributing chunks are CH_HOLE, the resulting chunk will also be CH_HOLE
+        - If the contributing chunks are a mix of CH_HOLE and CH_ALLOC, the resulting chunk will be CH_HOLE
+
+        :param size: Number of bytes to read
+        :return: Chunk object containing the read data.
+                 If no data is available, returns Chunk(b"", size=0, allocation=CH_DATA).
+                 If less than requested bytes were available (at EOF), the returned chunk might be smaller
+                 than requested.
+        """
+        # Initialize if not already done
+        if self.blockify_gen is None:
+            self.buffer = []
+            self.offset = 0
+            self.remaining_bytes = 0
+            self.blockify_gen = self.reader.blockify()
+
+        # If we don't have enough data in the buffer, try to fill it
+        while self.remaining_bytes < size:
+            if not self._fill_buffer():
+                # No more data available, return what we have
+                break
+
+        # If we have no data at all, return an empty Chunk
+        if not self.buffer:
+            return Chunk(b"", size=0, allocation=CH_DATA)
+
+        # Prepare to collect the requested data
+        result = bytearray()
+        bytes_to_read = min(size, self.remaining_bytes)
+        bytes_read = 0
+
+        # Track if we've seen different allocation types
+        has_data = False
+        has_hole = False
+        has_alloc = False
+
+        # Read data from the buffer, combining chunks as needed
+        while bytes_read < bytes_to_read and self.buffer:
+            chunk = self.buffer[0]
+            chunk_size = chunk.meta["size"]
+            allocation = chunk.meta["allocation"]
+            data = chunk.data
+
+            # Track allocation types
+            if allocation == CH_DATA:
+                has_data = True
+            elif allocation == CH_HOLE:
+                has_hole = True
+            elif allocation == CH_ALLOC:
+                has_alloc = True
+            else:
+                raise ValueError(f"Invalid allocation type: {allocation}")
+
+            # Calculate how much we can read from this chunk
+            available = chunk_size - self.offset
+            to_read = min(available, bytes_to_read - bytes_read)
+
+            # Process the chunk based on its allocation type
+            if allocation == CH_DATA:
+                assert data is not None
+                if len(result) < bytes_read:
+                    # earlier contributing chunks were holes / all-zero: materialize their
+                    # zeros now, so this data lands at the right offset in the result.
+                    result.extend(b'\0' * (bytes_read - len(result)))
+                # For data chunks, add the actual data
+                result.extend(data[self.offset:self.offset + to_read])
+            else:
+                # For non-data chunks, add zeros only if we've already seen a data chunk;
+                # otherwise just track the size (zeros get backfilled above if data follows).
+                if has_data:
+                    result.extend(b'\0' * to_read)
+
+            bytes_read += to_read
+
+            # Update offset or remove chunk if fully consumed
+            if to_read < available:
+                self.offset += to_read
+            else:
+                self.offset = 0
+                self.buffer.pop(0)
+
+            self.remaining_bytes -= to_read
+
+        # Determine the allocation type of the resulting chunk
+        if has_data:
+            # If any chunk was CH_DATA, the result is CH_DATA
+            return Chunk(bytes(result), size=bytes_read, allocation=CH_DATA)
+        elif has_hole:
+            # If any chunk was CH_HOLE (and none were CH_DATA), the result is CH_HOLE
+            return Chunk(None, size=bytes_read, allocation=CH_HOLE)
+        else:
+            # Otherwise, all chunks were CH_ALLOC
+            return Chunk(None, size=bytes_read, allocation=CH_ALLOC)
+
+
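
The merging rules of FileReader.read can be observed with plain BytesIO input: an
all-zero block inside a data range comes back from blockify as CH_ALLOC, and a read()
spanning it together with a real data block yields a single CH_DATA chunk with the zeros
materialized. A sketch with example values of mine, using the import paths from this PR:

    from io import BytesIO
    from borg.chunkers import FileReader
    from borg.constants import CH_DATA

    f = BytesIO(b"x" * 4096 + bytes(4096))  # a data block, then an all-zero block
    reader = FileReader(fd=f, read_size=4096)

    c = reader.read(8192)  # spans one CH_DATA and one CH_ALLOC chunk
    assert c.meta["allocation"] == CH_DATA and c.meta["size"] == 8192
    assert c.data[4096:] == bytes(4096)  # the zero run was materialized

    assert reader.read(8192).meta["size"] == 0  # EOF: empty CH_DATA chunk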

+ 2 - 2
src/borg/helpers/checks.py

@@ -14,12 +14,12 @@ def check_python():
 
 
 def check_extension_modules():
-    from .. import platform, compress, crypto, item, chunker, hashindex
+    from .. import platform, compress, crypto, item, hashindex, chunkers
 
     msg = """The Borg binary extension modules do not seem to be properly installed."""
     if hashindex.API_VERSION != "1.2_01":
         raise RTError(msg)
-    if chunker.API_VERSION != "1.2_01":
+    if chunkers.API_VERSION != "1.2_01":
         raise RTError(msg)
     if compress.API_VERSION != "1.2_02":
         raise RTError(msg)

+ 4 - 3
src/borg/selftest.py

@@ -22,11 +22,12 @@ import time
 from unittest import TestResult, TestSuite, defaultTestLoader
 
 from .testsuite.crypto_test import CryptoTestCase
-from .testsuite.chunker_test import ChunkerTestCase
+from .testsuite.chunkers.buzhash_self_test import ChunkerTestCase
+from .testsuite.chunkers.fixed_self_test import ChunkerFixedTestCase
 
-SELFTEST_CASES = [CryptoTestCase, ChunkerTestCase]
+SELFTEST_CASES = [CryptoTestCase, ChunkerTestCase, ChunkerFixedTestCase]
 
-SELFTEST_COUNT = 11
+SELFTEST_COUNT = 17  # 11 as before + 6 test methods from ChunkerFixedTestCase
 
 
 class SelfTestResult(TestResult):

+ 1 - 1
src/borg/testsuite/archiver/extract_cmd_test.py

@@ -7,7 +7,7 @@ from unittest.mock import patch
 import pytest
 
 from ... import xattr
-from ...chunker import has_seek_hole
+from ...chunkers import has_seek_hole
 from ...constants import *  # NOQA
 from ...helpers import EXIT_WARNING, BackupPermissionError, bin_to_hex
 from ...helpers import flags_noatime, flags_normal

+ 0 - 41
src/borg/testsuite/chunker_slow_test.py

@@ -1,41 +0,0 @@
-from hashlib import sha256
-from io import BytesIO
-
-from .chunker_test import cf
-from ..chunker import Chunker
-from ..constants import *  # NOQA
-from ..helpers import hex_to_bin
-
-
-def H(data):
-    return sha256(data).digest()
-
-
-def test_chunkpoints_unchanged():
-    def twist(size):
-        x = 1
-        a = bytearray(size)
-        for i in range(size):
-            x = (x * 1103515245 + 12345) & 0x7FFFFFFF
-            a[i] = x & 0xFF
-        return a
-
-    data = twist(100000)
-
-    runs = []
-    for winsize in (65, 129, HASH_WINDOW_SIZE, 7351):
-        for minexp in (4, 6, 7, 11, 12):
-            for maxexp in (15, 17):
-                if minexp >= maxexp:
-                    continue
-                for maskbits in (4, 7, 10, 12):
-                    for seed in (1849058162, 1234567653):
-                        fh = BytesIO(data)
-                        chunker = Chunker(seed, minexp, maxexp, maskbits, winsize)
-                        chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
-                        runs.append(H(b"".join(chunks)))
-
-    # The "correct" hash below matches the existing chunker behavior.
-    # Future chunker optimisations must not change this, or existing repos will bloat.
-    overall_hash = H(b"".join(runs))
-    assert overall_hash == hex_to_bin("a43d0ecb3ae24f38852fcc433a83dacd28fe0748d09cc73fc11b69cf3f1a7299")

+ 85 - 0
src/borg/testsuite/chunkers/__init__.py

@@ -0,0 +1,85 @@
+import os
+import tempfile
+
+from borg.constants import *  # noqa
+
+from ...chunkers import has_seek_hole
+
+
+def cf(chunks):
+    """chunk filter"""
+
+    # this is to simplify testing: either return the data piece (bytes) or the hole length (int).
+    def _cf(chunk):
+        if chunk.meta["allocation"] == CH_DATA:
+            assert len(chunk.data) == chunk.meta["size"]
+            return bytes(chunk.data)  # make sure we have bytes, not memoryview
+        if chunk.meta["allocation"] in (CH_HOLE, CH_ALLOC):
+            assert chunk.data is None
+            return chunk.meta["size"]
+        assert False, "unexpected allocation value"
+
+    return [_cf(chunk) for chunk in chunks]
+
+
+def make_sparsefile(fname, sparsemap, header_size=0):
+    with open(fname, "wb") as fd:
+        total = 0
+        if header_size:
+            fd.write(b"H" * header_size)
+            total += header_size
+        for offset, size, is_data in sparsemap:
+            if is_data:
+                fd.write(b"X" * size)
+            else:
+                fd.seek(size, os.SEEK_CUR)
+            total += size
+        fd.truncate(total)
+    assert os.path.getsize(fname) == total
+
+
+def make_content(sparsemap, header_size=0):
+    result = []
+    total = 0
+    if header_size:
+        result.append(b"H" * header_size)
+        total += header_size
+    for offset, size, is_data in sparsemap:
+        if is_data:
+            result.append(b"X" * size)  # bytes!
+        else:
+            result.append(size)  # int!
+        total += size
+    return result
+
+
+def fs_supports_sparse():
+    if not has_seek_hole:
+        return False
+    with tempfile.TemporaryDirectory() as tmpdir:
+        fn = os.path.join(tmpdir, "test_sparse")
+        make_sparsefile(fn, [(0, BS, False), (BS, BS, True)])
+        with open(fn, "rb") as f:
+            try:
+                offset_hole = f.seek(0, os.SEEK_HOLE)
+                offset_data = f.seek(0, os.SEEK_DATA)
+            except OSError:
+                # no sparse support if these seeks do not work
+                return False
+        return offset_hole == 0 and offset_data == BS
+
+
+BS = 4096  # fs block size
+
+# some sparse files. X = content blocks, _ = sparse blocks.
+# X__XXX____
+map_sparse1 = [(0 * BS, 1 * BS, True), (1 * BS, 2 * BS, False), (3 * BS, 3 * BS, True), (6 * BS, 4 * BS, False)]
+
+# _XX___XXXX
+map_sparse2 = [(0 * BS, 1 * BS, False), (1 * BS, 2 * BS, True), (3 * BS, 3 * BS, False), (6 * BS, 4 * BS, True)]
+
+# XXX
+map_notsparse = [(0 * BS, 3 * BS, True)]
+
+# ___
+map_onlysparse = [(0 * BS, 3 * BS, False)]
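
The maps double as expected values: make_content turns a map into a bytes-or-int list in
the same style that cf produces, one entry per map range, with data ranges as bytes and
hole ranges as their length in bytes (chunkers may split these entries further, see the
normalization in fixed_test.py below). For example:

    from borg.testsuite.chunkers import BS, make_content, map_sparse1

    # X__XXX____  ->  data, 2-block hole, data, 4-block hole
    assert make_content(map_sparse1) == [b"X" * BS, 2 * BS, b"X" * 3 * BS, 4 * BS]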

+ 5 - 72
src/borg/testsuite/chunker_test.py → src/borg/testsuite/chunkers/buzhash_self_test.py

@@ -3,78 +3,11 @@
 
 from io import BytesIO
 
-from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_update
-from ..constants import *  # NOQA
-from . import BaseTestCase
-
-
-def cf(chunks):
-    """chunk filter"""
-
-    # this is to simplify testing: either return the data piece (bytes) or the hole length (int).
-    def _cf(chunk):
-        if chunk.meta["allocation"] == CH_DATA:
-            assert len(chunk.data) == chunk.meta["size"]
-            return bytes(chunk.data)  # make sure we have bytes, not memoryview
-        if chunk.meta["allocation"] in (CH_HOLE, CH_ALLOC):
-            assert chunk.data is None
-            return chunk.meta["size"]
-        assert False, "unexpected allocation value"
-
-    return [_cf(chunk) for chunk in chunks]
-
-
-class ChunkerFixedTestCase(BaseTestCase):
-    def test_chunkify_just_blocks(self):
-        data = b"foobar" * 1500
-        chunker = ChunkerFixed(4096)
-        parts = cf(chunker.chunkify(BytesIO(data)))
-        self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
-
-    def test_chunkify_header_and_blocks(self):
-        data = b"foobar" * 1500
-        chunker = ChunkerFixed(4096, 123)
-        parts = cf(chunker.chunkify(BytesIO(data)))
-        self.assert_equal(
-            parts, [data[0:123], data[123 : 123 + 4096], data[123 + 4096 : 123 + 8192], data[123 + 8192 :]]
-        )
-
-    def test_chunkify_just_blocks_fmap_complete(self):
-        data = b"foobar" * 1500
-        chunker = ChunkerFixed(4096)
-        fmap = [(0, 4096, True), (4096, 8192, True), (8192, 99999999, True)]
-        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
-        self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
-
-    def test_chunkify_header_and_blocks_fmap_complete(self):
-        data = b"foobar" * 1500
-        chunker = ChunkerFixed(4096, 123)
-        fmap = [(0, 123, True), (123, 4096, True), (123 + 4096, 4096, True), (123 + 8192, 4096, True)]
-        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
-        self.assert_equal(
-            parts, [data[0:123], data[123 : 123 + 4096], data[123 + 4096 : 123 + 8192], data[123 + 8192 :]]
-        )
-
-    def test_chunkify_header_and_blocks_fmap_zeros(self):
-        data = b"H" * 123 + b"_" * 4096 + b"X" * 4096 + b"_" * 4096
-        chunker = ChunkerFixed(4096, 123)
-        fmap = [(0, 123, True), (123, 4096, False), (123 + 4096, 4096, True), (123 + 8192, 4096, False)]
-        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
-        # because we marked the '_' ranges as holes, we will get hole ranges instead!
-        self.assert_equal(parts, [data[0:123], 4096, data[123 + 4096 : 123 + 8192], 4096])
-
-    def test_chunkify_header_and_blocks_fmap_partial(self):
-        data = b"H" * 123 + b"_" * 4096 + b"X" * 4096 + b"_" * 4096
-        chunker = ChunkerFixed(4096, 123)
-        fmap = [
-            (0, 123, True),
-            # (123, 4096, False),
-            (123 + 4096, 4096, True),
-            # (123+8192, 4096, False),
-        ]
-        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
-        # because we left out the '_' ranges from the fmap, we will not get them at all!
-        self.assert_equal(parts, [data[0:123], data[123 + 4096 : 123 + 8192]])
+from ...chunkers import get_chunker
+from ...chunkers.buzhash import buzhash, buzhash_update, Chunker
+from ...constants import *  # NOQA
+from .. import BaseTestCase
+from . import cf
 
 
 class ChunkerTestCase(BaseTestCase):
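
The buzhash/buzhash_update pair imported above implements a rolling hash: updating the
running hash by removing the oldest byte of the window and adding a new one must agree
with rehashing the shifted window from scratch. A quick sketch of that invariant, with
example values of mine:

    from borg.chunkers.buzhash import buzhash, buzhash_update

    seed, win = 0, b"abcdefgh"
    h = buzhash(win, seed)
    # slide one byte: drop win[0], append b"i"
    assert buzhash_update(h, win[0], ord("i"), len(win), seed) == buzhash(b"bcdefghi", seed)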

+ 69 - 0
src/borg/testsuite/chunkers/buzhash_test.py

@@ -0,0 +1,69 @@
+from hashlib import sha256
+from io import BytesIO
+import os
+
+from . import cf
+from ...chunkers import Chunker
+from ...constants import *  # NOQA
+from ...helpers import hex_to_bin
+
+
+def H(data):
+    return sha256(data).digest()
+
+
+def test_chunkpoints_unchanged():
+    def twist(size):
+        x = 1
+        a = bytearray(size)
+        for i in range(size):
+            x = (x * 1103515245 + 12345) & 0x7FFFFFFF
+            a[i] = x & 0xFF
+        return a
+
+    data = twist(100000)
+
+    runs = []
+    for winsize in (65, 129, HASH_WINDOW_SIZE, 7351):
+        for minexp in (4, 6, 7, 11, 12):
+            for maxexp in (15, 17):
+                if minexp >= maxexp:
+                    continue
+                for maskbits in (4, 7, 10, 12):
+                    for seed in (1849058162, 1234567653):
+                        fh = BytesIO(data)
+                        chunker = Chunker(seed, minexp, maxexp, maskbits, winsize)
+                        chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
+                        runs.append(H(b"".join(chunks)))
+
+    # The "correct" hash below matches the existing chunker behavior.
+    # Future chunker optimisations must not change this, or existing repos will bloat.
+    overall_hash = H(b"".join(runs))
+    assert overall_hash == hex_to_bin("a43d0ecb3ae24f38852fcc433a83dacd28fe0748d09cc73fc11b69cf3f1a7299")
+
+
+def test_buzhash_chunksize_distribution():
+    data = os.urandom(1048576)
+    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
+    chunker = Chunker(0, min_exp, max_exp, mask, 4095)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = sum(int(size == 2**min_exp) for size in chunk_sizes)
+    max_count = sum(int(size == 2**max_exp) for size in chunk_sizes)
+    print(
+        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+        f"min count: {min_count} max count: {max_count}"
+    )
+    # usually there will be about 64 chunks
+    assert 32 < chunks_count < 128
+    # chunks always must be between min and max (clipping must work):
+    assert min_chunksize_observed >= 2**min_exp
+    assert max_chunksize_observed <= 2**max_exp
+    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10

+ 32 - 0
src/borg/testsuite/chunkers/failing_test.py

@@ -0,0 +1,32 @@
+from io import BytesIO
+
+import pytest
+
+from ...chunkers import ChunkerFailing
+from ...constants import *  # NOQA
+
+
+def test_chunker_failing():
+    SIZE = 4096
+    data = bytes(2 * SIZE + 1000)
+    chunker = ChunkerFailing(SIZE, "rEErrr")  # cut <SIZE> chunks, start failing at block 1, fail 2 times
+    with BytesIO(data) as fd:
+        ch = chunker.chunkify(fd)
+        c1 = next(ch)  # block 0: ok
+        assert c1.meta["allocation"] == CH_DATA
+        assert c1.data == data[:SIZE]
+        with pytest.raises(OSError):  # block 1: failure 1
+            next(ch)
+    with BytesIO(data) as fd:
+        ch = chunker.chunkify(fd)
+        with pytest.raises(OSError):  # block 2: failure 2
+            next(ch)
+    with BytesIO(data) as fd:
+        ch = chunker.chunkify(fd)
+        c1 = next(ch)  # block 3: success!
+        c2 = next(ch)  # block 4: success!
+        c3 = next(ch)  # block 5: success!
+        assert c1.meta["allocation"] == c2.meta["allocation"] == c3.meta["allocation"] == CH_DATA
+        assert c1.data == data[:SIZE]
+        assert c2.data == data[SIZE : 2 * SIZE]
+        assert c3.data == data[2 * SIZE :]

+ 62 - 0
src/borg/testsuite/chunkers/fixed_self_test.py

@@ -0,0 +1,62 @@
+# Note: these tests are part of the self test, do not use or import pytest functionality here.
+#       See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
+
+from io import BytesIO
+
+from ...chunkers.fixed import ChunkerFixed
+from ...constants import *  # NOQA
+from .. import BaseTestCase
+from . import cf
+
+
+class ChunkerFixedTestCase(BaseTestCase):
+    def test_chunkify_just_blocks(self):
+        data = b"foobar" * 1500
+        chunker = ChunkerFixed(4096)
+        parts = cf(chunker.chunkify(BytesIO(data)))
+        self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
+
+    def test_chunkify_header_and_blocks(self):
+        data = b"foobar" * 1500
+        chunker = ChunkerFixed(4096, 123)
+        parts = cf(chunker.chunkify(BytesIO(data)))
+        self.assert_equal(
+            parts, [data[0:123], data[123 : 123 + 4096], data[123 + 4096 : 123 + 8192], data[123 + 8192 :]]
+        )
+
+    def test_chunkify_just_blocks_fmap_complete(self):
+        data = b"foobar" * 1500
+        chunker = ChunkerFixed(4096)
+        fmap = [(0, 4096, True), (4096, 8192, True), (8192, 99999999, True)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
+        self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
+
+    def test_chunkify_header_and_blocks_fmap_complete(self):
+        data = b"foobar" * 1500
+        chunker = ChunkerFixed(4096, 123)
+        fmap = [(0, 123, True), (123, 4096, True), (123 + 4096, 4096, True), (123 + 8192, 4096, True)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
+        self.assert_equal(
+            parts, [data[0:123], data[123 : 123 + 4096], data[123 + 4096 : 123 + 8192], data[123 + 8192 :]]
+        )
+
+    def test_chunkify_header_and_blocks_fmap_zeros(self):
+        data = b"H" * 123 + b"_" * 4096 + b"X" * 4096 + b"_" * 4096
+        chunker = ChunkerFixed(4096, 123)
+        fmap = [(0, 123, True), (123, 4096, False), (123 + 4096, 4096, True), (123 + 8192, 4096, False)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
+        # because we marked the '_' ranges as holes, we will get hole ranges instead!
+        self.assert_equal(parts, [data[0:123], 4096, data[123 + 4096 : 123 + 8192], 4096])
+
+    def test_chunkify_header_and_blocks_fmap_partial(self):
+        data = b"H" * 123 + b"_" * 4096 + b"X" * 4096 + b"_" * 4096
+        chunker = ChunkerFixed(4096, 123)
+        fmap = [
+            (0, 123, True),
+            # (123, 4096, False),
+            (123 + 4096, 4096, True),
+            # (123+8192, 4096, False),
+        ]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
+        # because we left out the '_' ranges from the fmap, we will not get them at all!
+        self.assert_equal(parts, [data[0:123], data[123 + 4096 : 123 + 8192]])

+ 39 - 0
src/borg/testsuite/chunkers/fixed_test.py

@@ -0,0 +1,39 @@
+import pytest
+
+from . import cf, make_sparsefile, make_content, fs_supports_sparse
+from . import BS, map_sparse1, map_sparse2, map_onlysparse, map_notsparse
+from ...chunkers import ChunkerFixed
+from ...constants import *  # NOQA
+
+
+@pytest.mark.skipif(not fs_supports_sparse(), reason="fs does not support sparse files")
+@pytest.mark.parametrize(
+    "fname, sparse_map, header_size, sparse",
+    [
+        ("sparse1", map_sparse1, 0, False),
+        ("sparse1", map_sparse1, 0, True),
+        ("sparse1", map_sparse1, BS, False),
+        ("sparse1", map_sparse1, BS, True),
+        ("sparse2", map_sparse2, 0, False),
+        ("sparse2", map_sparse2, 0, True),
+        ("sparse2", map_sparse2, BS, False),
+        ("sparse2", map_sparse2, BS, True),
+        ("onlysparse", map_onlysparse, 0, False),
+        ("onlysparse", map_onlysparse, 0, True),
+        ("onlysparse", map_onlysparse, BS, False),
+        ("onlysparse", map_onlysparse, BS, True),
+        ("notsparse", map_notsparse, 0, False),
+        ("notsparse", map_notsparse, 0, True),
+        ("notsparse", map_notsparse, BS, False),
+        ("notsparse", map_notsparse, BS, True),
+    ],
+)
+def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
+    def get_chunks(fname, sparse, header_size):
+        chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
+        with open(fname, "rb") as fd:
+            return cf(chunker.chunkify(fd))
+
+    def to_bytes(parts):
+        # normalize the bytes-or-int lists from cf / make_content to raw content,
+        # because the chunker may represent and group hole/zero runs differently.
+        return b"".join(p if isinstance(p, bytes) else b"\0" * p for p in parts)
+
+    fn = str(tmpdir / fname)
+    make_sparsefile(fn, sparse_map, header_size=header_size)
+    chunks = get_chunks(fn, sparse=sparse, header_size=header_size)
+    assert to_bytes(chunks) == to_bytes(make_content(sparse_map, header_size=header_size))

+ 5 - 153
src/borg/testsuite/chunker_pytest_test.py → src/borg/testsuite/chunkers/reader_test.py

@@ -1,74 +1,12 @@
-from io import BytesIO
 import os
-import tempfile
+from io import BytesIO
 
 import pytest
 
-from .chunker_test import cf
-from ..chunker import Chunker, ChunkerFixed, sparsemap, has_seek_hole, ChunkerFailing, FileReader, FileFMAPReader, Chunk
-from ..constants import *  # NOQA
-
-BS = 4096  # fs block size
-
-# some sparse files. X = content blocks, _ = sparse blocks.
-# X__XXX____
-map_sparse1 = [(0 * BS, 1 * BS, True), (1 * BS, 2 * BS, False), (3 * BS, 3 * BS, True), (6 * BS, 4 * BS, False)]
-
-# _XX___XXXX
-map_sparse2 = [(0 * BS, 1 * BS, False), (1 * BS, 2 * BS, True), (3 * BS, 3 * BS, False), (6 * BS, 4 * BS, True)]
-
-# XXX
-map_notsparse = [(0 * BS, 3 * BS, True)]
-
-# ___
-map_onlysparse = [(0 * BS, 3 * BS, False)]
-
-
-def make_sparsefile(fname, sparsemap, header_size=0):
-    with open(fname, "wb") as fd:
-        total = 0
-        if header_size:
-            fd.write(b"H" * header_size)
-            total += header_size
-        for offset, size, is_data in sparsemap:
-            if is_data:
-                fd.write(b"X" * size)
-            else:
-                fd.seek(size, os.SEEK_CUR)
-            total += size
-        fd.truncate(total)
-    assert os.path.getsize(fname) == total
-
-
-def make_content(sparsemap, header_size=0):
-    result = []
-    total = 0
-    if header_size:
-        result.append(b"H" * header_size)
-        total += header_size
-    for offset, size, is_data in sparsemap:
-        if is_data:
-            result.append(b"X" * size)  # bytes!
-        else:
-            result.append(size)  # int!
-        total += size
-    return result
-
-
-def fs_supports_sparse():
-    if not has_seek_hole:
-        return False
-    with tempfile.TemporaryDirectory() as tmpdir:
-        fn = os.path.join(tmpdir, "test_sparse")
-        make_sparsefile(fn, [(0, BS, False), (BS, BS, True)])
-        with open(fn, "rb") as f:
-            try:
-                offset_hole = f.seek(0, os.SEEK_HOLE)
-                offset_data = f.seek(0, os.SEEK_DATA)
-            except OSError:
-                # no sparse support if these seeks do not work
-                return False
-        return offset_hole == 0 and offset_data == BS
+from . import make_sparsefile, fs_supports_sparse
+from . import BS, map_sparse1, map_sparse2, map_onlysparse, map_notsparse
+from ...chunkers import sparsemap, FileReader, FileFMAPReader, Chunk
+from ...constants import *  # NOQA
 
 
 @pytest.mark.skipif(not fs_supports_sparse(), reason="fs does not support sparse files")
@@ -94,92 +32,6 @@ def test_sparsemap(tmpdir, fname, sparse_map):
     assert get_sparsemap_fd(fn) == sparse_map
 
 
-@pytest.mark.skipif(not fs_supports_sparse(), reason="fs does not support sparse files")
-@pytest.mark.parametrize(
-    "fname, sparse_map, header_size, sparse",
-    [
-        ("sparse1", map_sparse1, 0, False),
-        ("sparse1", map_sparse1, 0, True),
-        ("sparse1", map_sparse1, BS, False),
-        ("sparse1", map_sparse1, BS, True),
-        ("sparse2", map_sparse2, 0, False),
-        ("sparse2", map_sparse2, 0, True),
-        ("sparse2", map_sparse2, BS, False),
-        ("sparse2", map_sparse2, BS, True),
-        ("onlysparse", map_onlysparse, 0, False),
-        ("onlysparse", map_onlysparse, 0, True),
-        ("onlysparse", map_onlysparse, BS, False),
-        ("onlysparse", map_onlysparse, BS, True),
-        ("notsparse", map_notsparse, 0, False),
-        ("notsparse", map_notsparse, 0, True),
-        ("notsparse", map_notsparse, BS, False),
-        ("notsparse", map_notsparse, BS, True),
-    ],
-)
-def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
-    def get_chunks(fname, sparse, header_size):
-        chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
-        with open(fname, "rb") as fd:
-            return cf(chunker.chunkify(fd))
-
-    fn = str(tmpdir / fname)
-    make_sparsefile(fn, sparse_map, header_size=header_size)
-    get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)
-
-
-def test_chunker_failing():
-    SIZE = 4096
-    data = bytes(2 * SIZE + 1000)
-    chunker = ChunkerFailing(SIZE, "rEErrr")  # cut <SIZE> chunks, start failing at block 1, fail 2 times
-    with BytesIO(data) as fd:
-        ch = chunker.chunkify(fd)
-        c1 = next(ch)  # block 0: ok
-        assert c1.meta["allocation"] == CH_DATA
-        assert c1.data == data[:SIZE]
-        with pytest.raises(OSError):  # block 1: failure 1
-            next(ch)
-    with BytesIO(data) as fd:
-        ch = chunker.chunkify(fd)
-        with pytest.raises(OSError):  # block 2: failure 2
-            next(ch)
-    with BytesIO(data) as fd:
-        ch = chunker.chunkify(fd)
-        c1 = next(ch)  # block 3: success!
-        c2 = next(ch)  # block 4: success!
-        c3 = next(ch)  # block 5: success!
-        assert c1.meta["allocation"] == c2.meta["allocation"] == c3.meta["allocation"] == CH_DATA
-        assert c1.data == data[:SIZE]
-        assert c2.data == data[SIZE : 2 * SIZE]
-        assert c3.data == data[2 * SIZE :]
-
-
-def test_buzhash_chunksize_distribution():
-    data = os.urandom(1048576)
-    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
-    chunker = Chunker(0, min_exp, max_exp, mask, 4095)
-    f = BytesIO(data)
-    chunks = cf(chunker.chunkify(f))
-    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
-    chunk_sizes = [len(chunk) for chunk in chunks]
-    chunks_count = len(chunks)
-    min_chunksize_observed = min(chunk_sizes)
-    max_chunksize_observed = max(chunk_sizes)
-    min_count = sum(int(size == 2**min_exp) for size in chunk_sizes)
-    max_count = sum(int(size == 2**max_exp) for size in chunk_sizes)
-    print(
-        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
-        f"min count: {min_count} max count: {max_count}"
-    )
-    # usually there will about 64 chunks
-    assert 32 < chunks_count < 128
-    # chunks always must be between min and max (clipping must work):
-    assert min_chunksize_observed >= 2**min_exp
-    assert max_chunksize_observed <= 2**max_exp
-    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
-    assert min_count < 10
-    assert max_count < 10
-
-
 @pytest.mark.parametrize(
     "file_content, read_size, expected_data, expected_allocation, expected_size",
     [