Преглед изворни кода

buzhash64: init table using a 256-bit key derived from ID key

That way we can feed lots of entropy into the table creation.

The bh64_key is derived from the id_key (NOT the crypt_key), thus
it will create the same key for related repositories (even if they
use different encryption/authentication keys). Due to that, it will
also create the same buzhash64 table, cut chunks at the same points,
and thus deduplication will work among the related repositories.
Thomas Waldmann пре 1 недеља
родитељ
комит
b9646f236e

+ 3 - 7
src/borg/chunkers/__init__.py

@@ -3,7 +3,6 @@ from .buzhash64 import ChunkerBuzHash64
 from .failing import ChunkerFailing
 from .fixed import ChunkerFixed
 from .reader import *  # noqa
-from ..crypto.key import PlaintextKey
 
 API_VERSION = "1.2_01"
 
@@ -13,15 +12,12 @@ def get_chunker(algo, *params, **kw):
     sparse = kw.get("sparse", False)
     # key.chunk_seed only has 32bits
     seed = key.chunk_seed if key is not None else 0
-    # we want 64bits for buzhash64, get them from crypt_key
-    if key is None or isinstance(key, PlaintextKey):
-        seed64 = 0
-    else:
-        seed64 = int.from_bytes(key.crypt_key[:8], byteorder="little")
+    # for buzhash64, we want a much longer key, so we derive it from the id key
+    bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
     if algo == "buzhash":
         return Chunker(seed, *params, sparse=sparse)
     if algo == "buzhash64":
-        return ChunkerBuzHash64(seed64, *params, sparse=sparse)
+        return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
     if algo == "fixed":
         return ChunkerFixed(*params, sparse=sparse)
     if algo == "fail":

+ 3 - 3
src/borg/chunkers/buzhash64.pyi

@@ -4,13 +4,13 @@ from .reader import fmap_entry
 
 API_VERSION: str
 
-def buzhash64(data: bytes, seed: int) -> int: ...
-def buzhash64_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ...
+def buzhash64(data: bytes, key: bytes) -> int: ...
+def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
 
 class ChunkerBuzHash64:
     def __init__(
         self,
-        seed: int,
+        key: bytes,
         chunk_min_exp: int,
         chunk_max_exp: int,
         hash_mask_bits: int,

+ 10 - 10
src/borg/chunkers/buzhash64.pyx

@@ -39,13 +39,13 @@ cdef extern from *:
 
 @cython.boundscheck(False)  # Deactivate bounds checking
 @cython.wraparound(False)  # Deactivate negative indexing.
-cdef uint64_t* buzhash64_init_table(uint64_t seed):
-    """Initialize the buzhash table with the given seed."""
+cdef uint64_t* buzhash64_init_table(bytes key):
+    """Initialize the buzhash table using the given key."""
     cdef int i
     cdef uint64_t* table = <uint64_t*>malloc(2048)  # 256 * sizeof(uint64_t)
     for i in range(256):
-        # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the seed:
-        v = f"{i:02x}{seed:016x}".encode()
+        # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
+        v = f"{i:02x}".encode() + key
         d64 = sha256(v).digest()[:8]
         table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
     return table
@@ -99,7 +99,7 @@ cdef class ChunkerBuzHash64:
     cdef size_t reader_block_size
     cdef bint sparse
 
-    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
         min_size = 1 << chunk_min_exp
         max_size = 1 << chunk_max_exp
         assert max_size <= len(zeros)
@@ -109,7 +109,7 @@ cdef class ChunkerBuzHash64:
         self.window_size = hash_window_size
         self.chunk_mask = (1 << hash_mask_bits) - 1
         self.min_size = min_size
-        self.table = buzhash64_init_table(seed & 0xffffffffffffffff)
+        self.table = buzhash64_init_table(key)
         self.buf_size = max_size
         self.data = <uint8_t*>malloc(self.buf_size)
         self.fh = -1
@@ -274,18 +274,18 @@ cdef class ChunkerBuzHash64:
         return Chunk(data, size=got, allocation=allocation)
 
 
-def buzhash64(data, unsigned long seed):
+def buzhash64(data, bytes key):
     cdef uint64_t *table
     cdef uint64_t sum
-    table = buzhash64_init_table(seed & 0xffffffffffffffff)
+    table = buzhash64_init_table(key)
     sum = _buzhash64(<const unsigned char *> data, len(data), table)
     free(table)
     return sum
 
 
-def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
+def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key):
     cdef uint64_t *table
-    table = buzhash64_init_table(seed & 0xffffffffffffffff)
+    table = buzhash64_init_table(key)
     sum = _buzhash64_update(sum, remove, add, len, table)
     free(table)
     return sum

+ 26 - 26
src/borg/testsuite/chunkers/buzhash64_self_test.py

@@ -13,56 +13,56 @@ from . import cf
 class ChunkerBuzHash64TestCase(BaseTestCase):
     def test_chunkify64(self):
         data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
-        parts = cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
+        parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
         self.assert_equal(len(parts), 2)
         self.assert_equal(b"".join(parts), data)
-        self.assert_equal(cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
+        self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
         self.assert_equal(
-            cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"fo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"obaz"],
+            cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobazfoobarboobazfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobar", b"boobazfoo", b"barboobazfoo", b"barboobaz"],
+            cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
-        )
-        self.assert_equal(
-            cf(ChunkerBuzHash64(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
             [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobazfoo", b"barboobazfoo", b"barboobaz"],
+            cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
             [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
         )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
+        )
 
     def test_buzhash64(self):
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", 0), 13314711829666336849)
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", 1), 17807676237451361719)
-        expected = buzhash64(b"abcdefghijklmnop", 1)
-        previous = buzhash64(b"Xabcdefghijklmno", 1)
-        this = buzhash64_update(previous, ord("X"), ord("p"), 16, 1)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910)
+        expected = buzhash64(b"abcdefghijklmnop", b"1")
+        previous = buzhash64(b"Xabcdefghijklmno", b"1")
+        this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1")
         self.assert_equal(this, expected)
         # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
-        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, 0), 592868834756664313)
+        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)
 
     def test_small_reads64(self):
         class SmallReadFile:

+ 4 - 4
src/borg/testsuite/chunkers/buzhash64_test.py

@@ -30,22 +30,22 @@ def test_chunkpoints64_unchanged():
                 if minexp >= maxexp:
                     continue
                 for maskbits in (4, 7, 10, 12):
-                    for seed in (1849058162, 1234567653):
+                    for key in (b"first_key", b"second_key"):
                         fh = BytesIO(data)
-                        chunker = ChunkerBuzHash64(seed, minexp, maxexp, maskbits, winsize)
+                        chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
                         chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
                         runs.append(H(b"".join(chunks)))
 
     # The "correct" hash below matches the existing chunker behavior.
     # Future chunker optimisations must not change this, or existing repos will bloat.
     overall_hash = H(b"".join(runs))
-    assert overall_hash == hex_to_bin("fa9002758c0358721404f55f3020bb56b987cb3cd9a688ff9641f4023215f4e7")
+    assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
 
 
 def test_buzhash64_chunksize_distribution():
     data = os.urandom(1048576)
     min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
-    chunker = ChunkerBuzHash64(0, min_exp, max_exp, mask, 4095)
+    chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095)
     f = BytesIO(data)
     chunks = cf(chunker.chunkify(f))
     del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp