Prechádzať zdrojové kódy

buzhash64: deterministically create a balanced bh table

The previous approach had cryptographic-strength randomness, but a precise
50:50 distribution of 0 and 1 bits per bit position in the table was not assured.

Now this is always the case because of the way the table is constructed.
Thomas Waldmann 6 dní pred
rodič
commit
d48c9643e8

+ 1 - 0
src/borg/chunkers/buzhash64.pyi

@@ -6,6 +6,7 @@ API_VERSION: str
 
 def buzhash64(data: bytes, key: bytes) -> int: ...
 def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
+# Returns the 256-entry buzhash table derived from key as a list of ints.
+# NOTE(review): List must be imported from typing at the top of this stub - confirm.
+def buzhash64_get_table(key: bytes) -> List[int]: ...
 
 class ChunkerBuzHash64:
     def __init__(

+ 35 - 7
src/borg/chunkers/buzhash64.pyx

@@ -3,8 +3,8 @@
 API_VERSION = '1.2_01'
 
 import cython
+import random
 import time
-from hashlib import sha256
 
 from cpython.bytes cimport PyBytes_AsString
 from libc.stdint cimport uint8_t, uint64_t
@@ -40,14 +40,31 @@ cdef extern from *:
 @cython.boundscheck(False)  # Deactivate bounds checking
 @cython.wraparound(False)  # Deactivate negative indexing.
 cdef uint64_t* buzhash64_init_table(bytes key):
-    """Initialize the buzhash table using the given key."""
-    cdef int i
+    """
+    Generate a balanced pseudo-random table deterministically from a 256-bit key.
+    Balanced means that for each bit position 0..63, exactly 50% of the table values have the bit set to 1.
+
+    Returns a malloc'ed array of 256 uint64_t values. The caller owns it and must free() it.
+    """
+    # Create deterministic random number generator, seeded with the key bytes read as a big-endian integer.
+    # NOTE(review): the table contents depend on the behavior of random.Random.shuffle - confirm that it is
+    # stable across all supported Python versions, a different shuffle would produce a different table.
+    rng = random.Random(int.from_bytes(key, 'big'))
+
+    cdef int i, j, bit_pos
     cdef uint64_t* table = <uint64_t*>malloc(2048)  # 256 * sizeof(uint64_t)
+    # NOTE(review): the malloc result is not checked for NULL before it is written to below.
+
+    # Initialize all values to 0, the loops below only OR bits into the entries
     for i in range(256):
-        # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
-        v = f"{i:02x}".encode() + key
-        d64 = sha256(v).digest()[:8]
-        table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
+        table[i] = 0
+
+    # For each bit position, deterministically assign exactly 128 positions to have that bit set
+    for bit_pos in range(64):
+        # Create a fresh list of all 256 indices and shuffle it, continuing the same rng stream
+        indices = list(range(256))
+        rng.shuffle(indices)
+
+        # Set the bit at bit_pos for the first 128 shuffled indices, the other 128 entries keep it cleared
+        for i in range(128):
+            j = indices[i]
+            table[j] |= (1ULL << bit_pos)
+
     return table
 
 
@@ -289,3 +306,14 @@ def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size
     sum = _buzhash64_update(sum, remove, add, len, table)
     free(table)
     return sum
+
+
+def buzhash64_get_table(bytes key):
+    """
+    Get the buzhash table generated from <key>.
+
+    Returns the 256 table entries as a Python list of ints. The C table is only temporary:
+    it is built for this call and freed again before returning.
+    """
+    cdef uint64_t *table
+    cdef int i
+    table = buzhash64_init_table(key)
+    try:
+        # copy the C array into a Python list while the table is still allocated
+        return [table[i] for i in range(256)]
+    finally:
+        # release the malloc'ed table whether or not building the list succeeded
+        free(table)

+ 32 - 26
src/borg/testsuite/chunkers/buzhash64_self_test.py

@@ -6,63 +6,69 @@ from io import BytesIO
 from ...chunkers import get_chunker
 from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64
 from ...constants import *  # NOQA
+from ...helpers import hex_to_bin
 from .. import BaseTestCase
 from . import cf
 
+# from os.urandom(32)
+key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
+key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
+key2 = hex_to_bin("57174a65fde67fe127b18430525b50a58406f1bd6cc629535208c7832e181067")
+
 
 class ChunkerBuzHash64TestCase(BaseTestCase):
     def test_chunkify64(self):
+        # The expected chunk lists below are tied to the buzhash table derived from each key (key0/key1/key2).
+        # NOTE(review): the positional arguments after the key are presumably the chunker's size/mask/window
+        # parameters - verify against ChunkerBuzHash64.__init__.
         data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
-        parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
+        parts = cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
         self.assert_equal(len(parts), 2)
         self.assert_equal(b"".join(parts), data)
-        self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
+        self.assert_equal(cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
+        # same input for all cases below: three keys combined with three parameter sets
         self.assert_equal(
-            cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"],
+            cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"],
+            cf(ChunkerBuzHash64(key1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobar", b"boob", b"az", b"foobar", b"boob", b"az", b"foobar", b"boobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
+            cf(ChunkerBuzHash64(key2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarb", b"oob", b"az", b"foobarb", b"oob", b"az", b"foobarb", b"oobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
+            cf(ChunkerBuzHash64(key0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarb", b"oobazf", b"oobarb", b"oobazf", b"oobarb", b"oobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"],
+            cf(ChunkerBuzHash64(key1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
+            cf(ChunkerBuzHash64(key2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"],
+            cf(ChunkerBuzHash64(key0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobazf", b"oobarboobazf", b"oobarboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
+            cf(ChunkerBuzHash64(key1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbo", b"obazfoobarb", b"oobazfoobarb", b"oobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
+            cf(ChunkerBuzHash64(key2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
         )
 
     def test_buzhash64(self):
+        # Known-answer checks: the expected values are tied to the table generated from the respective key.
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478)
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910)
-        expected = buzhash64(b"abcdefghijklmnop", b"1")
-        previous = buzhash64(b"Xabcdefghijklmno", b"1")
-        this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1")
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", key0), 15080163834872228739)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", key1), 9505908538285923444)
+        # Rolling update check: taking the hash of the previous 16-byte window, removing its leading "X" and
+        # adding the trailing "p" must give the same value as hashing the new window from scratch.
+        expected = buzhash64(b"abcdefghijklmnop", key0)
+        previous = buzhash64(b"Xabcdefghijklmno", key0)
+        this = buzhash64_update(previous, ord("X"), ord("p"), 16, key0)
         self.assert_equal(this, expected)
         # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
-        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)
+        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, key0), 1936382207158378368)
 
     def test_small_reads64(self):
         class SmallReadFile:

+ 32 - 1
src/borg/testsuite/chunkers/buzhash64_test.py

@@ -4,10 +4,16 @@ import os
 
 from . import cf
 from ...chunkers import ChunkerBuzHash64
+from ...chunkers.buzhash64 import buzhash64_get_table
 from ...constants import *  # NOQA
 from ...helpers import hex_to_bin
 
 
+# from os.urandom(32)
+key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
+key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
+
+
 def H(data):
+    """Return sha256(data).digest(), the raw (binary) digest of data."""
     return sha256(data).digest()
 
@@ -39,7 +45,8 @@ def test_chunkpoints64_unchanged():
     # The "correct" hash below matches the existing chunker behavior.
     # Future chunker optimisations must not change this, or existing repos will bloat.
     overall_hash = H(b"".join(runs))
-    assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
+    print(overall_hash.hex())
+    assert overall_hash == hex_to_bin("db4b37fbe0cb841d79cfbb52bff8ac2f11040bf83a7d389640c7afb314fc4bfb")
 
 
 def test_buzhash64_chunksize_distribution():
@@ -67,3 +74,27 @@ def test_buzhash64_chunksize_distribution():
     # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
     assert min_count < 10
     assert max_count < 10
+
+
+def test_buzhash64_table():
+    """Check size, element type, determinism, key dependence and per-bit balance of the buzhash64 table."""
+    # Test that the function returns a list of 256 integers
+    table0 = buzhash64_get_table(key0)
+    assert len(table0) == 256
+
+    # Test that all elements are integers
+    for value in table0:
+        assert isinstance(value, int)
+
+    # Test that the function is deterministic (same key produces same table)
+    table0_again = buzhash64_get_table(key0)
+    assert table0 == table0_again
+
+    # Test that different keys produce different tables
+    table1 = buzhash64_get_table(key1)
+    assert table0 != table1
+
+    # Test that the table has balanced bit distribution (only checked for table0 here)
+    # For each bit position 0..63, exactly 50% of the table values should have the bit set to 1
+    for bit_pos in range(64):
+        # count how many of the 256 entries have the bit at bit_pos set
+        bit_count = sum(1 for value in table0 if value & (1 << bit_pos))
+        assert bit_count == 128  # 50% of 256 = 128