Преглед изворни кода

buzhash64: init table using a 256-bit key derived from ID key

That way we can feed lots of entropy into the table creation.

The bh64_key is derived from the id_key (NOT the crypt_key), thus
it will create the same key for related repositories (even if they
use different encryption/authentication keys). Due to that, it will
also create the same buzhash64 table, cut chunks at the same points,
and thus deduplication will work among the related repositories.
Thomas Waldmann пре 1 недеља
родитељ
комит
b9646f236e

+ 3 - 7
src/borg/chunkers/__init__.py

@@ -3,7 +3,6 @@ from .buzhash64 import ChunkerBuzHash64
 from .failing import ChunkerFailing
 from .fixed import ChunkerFixed
 from .reader import *  # noqa
-from ..crypto.key import PlaintextKey
 
 API_VERSION = "1.2_01"
 
@@ -13,15 +12,12 @@ def get_chunker(algo, *params, **kw):
     sparse = kw.get("sparse", False)
     # key.chunk_seed only has 32bits
     seed = key.chunk_seed if key is not None else 0
-    # we want 64bits for buzhash64, get them from crypt_key
-    if key is None or isinstance(key, PlaintextKey):
-        seed64 = 0
-    else:
-        seed64 = int.from_bytes(key.crypt_key[:8], byteorder="little")
+    # for buzhash64, we want a much longer key, so we derive it from the id key
+    bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
     if algo == "buzhash":
         return Chunker(seed, *params, sparse=sparse)
     if algo == "buzhash64":
-        return ChunkerBuzHash64(seed64, *params, sparse=sparse)
+        return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
     if algo == "fixed":
         return ChunkerFixed(*params, sparse=sparse)
     if algo == "fail":

+ 3 - 3
src/borg/chunkers/buzhash64.pyi

@@ -4,13 +4,13 @@ from .reader import fmap_entry
 
 API_VERSION: str
 
-def buzhash64(data: bytes, seed: int) -> int: ...
-def buzhash64_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ...
+def buzhash64(data: bytes, key: bytes) -> int: ...
+def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
 
 class ChunkerBuzHash64:
     def __init__(
         self,
-        seed: int,
+        key: bytes,
         chunk_min_exp: int,
         chunk_max_exp: int,
         hash_mask_bits: int,

+ 10 - 10
src/borg/chunkers/buzhash64.pyx

@@ -39,13 +39,13 @@ cdef extern from *:
 
 @cython.boundscheck(False)  # Deactivate bounds checking
 @cython.wraparound(False)  # Deactivate negative indexing.
-cdef uint64_t* buzhash64_init_table(uint64_t seed):
-    """Initialize the buzhash table with the given seed."""
+cdef uint64_t* buzhash64_init_table(bytes key):
+    """Initialize the buzhash table using the given key."""
     cdef int i
     cdef uint64_t* table = <uint64_t*>malloc(2048)  # 256 * sizeof(uint64_t)
     for i in range(256):
-        # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the seed:
-        v = f"{i:02x}{seed:016x}".encode()
+        # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
+        v = f"{i:02x}".encode() + key
         d64 = sha256(v).digest()[:8]
         table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
     return table
@@ -99,7 +99,7 @@ cdef class ChunkerBuzHash64:
     cdef size_t reader_block_size
     cdef bint sparse
 
-    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
         min_size = 1 << chunk_min_exp
         max_size = 1 << chunk_max_exp
         assert max_size <= len(zeros)
@@ -109,7 +109,7 @@ cdef class ChunkerBuzHash64:
         self.window_size = hash_window_size
         self.chunk_mask = (1 << hash_mask_bits) - 1
         self.min_size = min_size
-        self.table = buzhash64_init_table(seed & 0xffffffffffffffff)
+        self.table = buzhash64_init_table(key)
         self.buf_size = max_size
         self.data = <uint8_t*>malloc(self.buf_size)
         self.fh = -1
@@ -274,18 +274,18 @@ cdef class ChunkerBuzHash64:
         return Chunk(data, size=got, allocation=allocation)
 
 
-def buzhash64(data, unsigned long seed):
+def buzhash64(data, bytes key):
     cdef uint64_t *table
     cdef uint64_t sum
-    table = buzhash64_init_table(seed & 0xffffffffffffffff)
+    table = buzhash64_init_table(key)
     sum = _buzhash64(<const unsigned char *> data, len(data), table)
     free(table)
     return sum
 
 
-def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
+def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key):
     cdef uint64_t *table
-    table = buzhash64_init_table(seed & 0xffffffffffffffff)
+    table = buzhash64_init_table(key)
     sum = _buzhash64_update(sum, remove, add, len, table)
     free(table)
     return sum

+ 26 - 26
src/borg/testsuite/chunkers/buzhash64_self_test.py

@@ -13,56 +13,56 @@ from . import cf
 class ChunkerBuzHash64TestCase(BaseTestCase):
     def test_chunkify64(self):
         data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
-        parts = cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
+        parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
         self.assert_equal(len(parts), 2)
         self.assert_equal(b"".join(parts), data)
-        self.assert_equal(cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
+        self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
         self.assert_equal(
-            cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"fo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"obaz"],
+            cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobazfoobarboobazfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobar", b"boobazfoo", b"barboobazfoo", b"barboobaz"],
+            cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
-        )
-        self.assert_equal(
-            cf(ChunkerBuzHash64(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
             [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobazfoo", b"barboobazfoo", b"barboobaz"],
+            cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
             [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
         )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
+        )
 
     def test_buzhash64(self):
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", 0), 13314711829666336849)
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", 1), 17807676237451361719)
-        expected = buzhash64(b"abcdefghijklmnop", 1)
-        previous = buzhash64(b"Xabcdefghijklmno", 1)
-        this = buzhash64_update(previous, ord("X"), ord("p"), 16, 1)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910)
+        expected = buzhash64(b"abcdefghijklmnop", b"1")
+        previous = buzhash64(b"Xabcdefghijklmno", b"1")
+        this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1")
         self.assert_equal(this, expected)
         # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
-        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, 0), 592868834756664313)
+        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)
 
     def test_small_reads64(self):
         class SmallReadFile:

+ 4 - 4
src/borg/testsuite/chunkers/buzhash64_test.py

@@ -30,22 +30,22 @@ def test_chunkpoints64_unchanged():
                 if minexp >= maxexp:
                     continue
                 for maskbits in (4, 7, 10, 12):
-                    for seed in (1849058162, 1234567653):
+                    for key in (b"first_key", b"second_key"):
                         fh = BytesIO(data)
-                        chunker = ChunkerBuzHash64(seed, minexp, maxexp, maskbits, winsize)
+                        chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
                         chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
                         runs.append(H(b"".join(chunks)))
 
     # The "correct" hash below matches the existing chunker behavior.
     # Future chunker optimisations must not change this, or existing repos will bloat.
     overall_hash = H(b"".join(runs))
-    assert overall_hash == hex_to_bin("fa9002758c0358721404f55f3020bb56b987cb3cd9a688ff9641f4023215f4e7")
+    assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
 
 
 def test_buzhash64_chunksize_distribution():
     data = os.urandom(1048576)
     min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
-    chunker = ChunkerBuzHash64(0, min_exp, max_exp, mask, 4095)
+    chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095)
     f = BytesIO(data)
     chunks = cf(chunker.chunkify(f))
     del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp