Browse Source

Merge pull request #8964 from ThomasWaldmann/issue-8963

fixing reader bug, fixes #8963
TW 3 days ago
parent
commit
53c0d0a449
2 changed files with 75 additions and 4 deletions
  1. 5 4
      src/borg/chunkers/reader.pyx
  2. 70 0
      src/borg/testsuite/chunkers/interaction_test.py

+ 5 - 4
src/borg/chunkers/reader.pyx

@@ -303,10 +303,11 @@ class FileReader:
                 # For data chunks, add the actual data
                 result.extend(data[self.offset:self.offset + to_read])
             else:
-                # For non-data chunks, add zeros if we've seen a data chunk
-                if has_data:
-                    result.extend(b'\0' * to_read)
-                # Otherwise, we'll just track the size without adding data
+                # For non-data chunks, always add zeros to the result.
+                # We will only yield a CH_DATA chunk with the result bytes,
+                # if there was at least one CH_DATA chunk contributing to the result,
+                # otherwise we will yield a CH_HOLE or CH_ALLOC chunk.
+                result.extend(b'\0' * to_read)
 
             bytes_read += to_read
 

+ 70 - 0
src/borg/testsuite/chunkers/interaction_test.py

@@ -0,0 +1,70 @@
+import os
+import pytest
+from io import BytesIO
+
+from ...chunkers import get_chunker
+from ...constants import *  # NOQA
+
+
@pytest.mark.parametrize(
    "chunker_params",
    [
        (CH_FIXED, 1048576, 0),  # == reader_block_size
        (CH_FIXED, 1048576 // 2, 0),  # reader_block_size / N
        (CH_FIXED, 1048576 * 2, 0),  # N * reader_block_size
        (CH_FIXED, 1234567, 0),  # does not fit well, larger than reader_block_size
        (CH_FIXED, 123456, 0),  # does not fit well, smaller than reader_block_size
        (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE),
        (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE),
    ],
)
def test_reader_chunker_interaction(chunker_params):
    """
    Test that chunking random/zero data produces chunks that can be reassembled to match the original data.

    If one of these fails, there is likely a problem with buffer management
    (reader/chunker interaction).
    """
    # Layout: random | zeros | random, so the chunker sees both real data
    # and an all-zero region (expected to come back as CH_ALLOC chunks).
    data_size = 6 * 12341234
    test_data = os.urandom(data_size // 3) + b"\0" * (data_size // 3) + os.urandom(data_size // 3)

    # Chunk the data.
    chunker = get_chunker(*chunker_params)
    chunks = list(chunker.chunkify(BytesIO(test_data)))

    # Count chunks per allocation type.
    counts = {CH_DATA: 0, CH_HOLE: 0, CH_ALLOC: 0}
    for chunk in chunks:
        allocation = chunk.meta["allocation"]
        if allocation in counts:
            counts[allocation] += 1

    assert counts[CH_DATA] > 0, "No data chunks found"
    assert counts[CH_ALLOC] > 0, "No alloc chunks found"
    assert counts[CH_HOLE] == 0, "Hole chunks found, this is not expected!"

    # Reassemble the chunks: data chunks contribute their bytes,
    # hole/alloc chunks contribute zeros of the recorded size.
    reassembled = BytesIO()
    for chunk in chunks:  # no index is needed here, iterate the chunks directly
        if chunk.meta["allocation"] == CH_DATA:
            # For data chunks, write the actual data
            reassembled.write(bytes(chunk.data))
        elif chunk.meta["allocation"] in (CH_HOLE, CH_ALLOC):
            # For hole or alloc chunks, write zeros
            reassembled.write(b"\0" * chunk.meta["size"])

    # Check that the reassembled data has the correct size
    reassembled_size = reassembled.tell()
    assert (
        reassembled_size == data_size
    ), f"Reassembled data size ({reassembled_size}) does not equal original data size ({data_size})"

    # Verify that the reassembled data matches the original data
    # (getvalue() avoids the seek(0)/read() round trip).
    assert reassembled.getvalue() == test_data, "Reassembled data does not match original data"