
Hashindex header work, fixes #6960 (#7064)

support reading the new, improved hashindex header format, fixes #6960

Bit of a pain to work with that code:
- C code
- it still needs to be able to read the old hashindex file format,
- while also supporting the new file format.
- the hash computed while reading the file causes additional problems because
  it expects all places in the file to get read exactly once and in sequential order.
  I solved this by separately opening the file in the Python part of the code and
  checking for the magic (see the sketch after this list).
  BORG_IDX means the legacy file format and the legacy layout of the hashtable;
  BORG2IDX means the new file format and the new layout of the hashtable.
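
A minimal sketch of that magic check, assuming a plain Python helper (the actual
implementation is the reworked `hashindex_variant()` in `src/borg/hashindex.pyx`
below; the function name here is hypothetical):

```python
def peek_hashindex_variant(fn):
    """Peek at the start of an index file and decide which format it uses."""
    with open(fn, 'rb') as f:
        magic = f.read(8)  # MAGIC_LEN
    if magic == b'BORG_IDX':
        return 1  # legacy file format, legacy hashtable layout
    if magic == b'BORG2IDX':
        return 2  # new file format, new hashtable layout
    raise ValueError(f'unknown hashindex magic: {magic!r}')
```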

Done:
- added a version int32 directly after the magic and set it to 2 (like borg 2).
  the old header had no version info, but it could be denoted as version 1 in case
  we ever need that (currently the code decides based on the magic).
- added num_empty as indicated by a TODO in count_empty, so it does not need a
  full hashtable scan to determine the number of empty buckets.
- to keep it simple, I just padded the HashHeader struct with a
  `char reserved[1024 - 32];`
  1024 being the desired overall header size and 32 being the currently used size
  (see the layout sketch after this list).
  this alignment might be useful in case we mmap() the hashindex file one day.
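
For illustration, the resulting on-disk header as a Python `struct` sketch (the
authoritative definition is the C `HashHeader` struct in `src/borg/_hashindex.c`
below; the helper names here are hypothetical):

```python
import struct

# New BORG2IDX header: 8-byte magic, then six little-endian int32 fields,
# then reserved filler padding the header to 1024 bytes total.
HEADER2_FMT = '<8s6i992s'  # magic, version, num_entries, num_buckets,
                           # num_empty, key_size, value_size, reserved
assert struct.calcsize(HEADER2_FMT) == 1024

def pack_header2(num_entries, num_buckets, num_empty, key_size, value_size):
    return struct.pack(HEADER2_FMT, b'BORG2IDX', 2,
                       num_entries, num_buckets, num_empty,
                       key_size, value_size, bytes(992))

def unpack_header2(buf):
    (magic, version, num_entries, num_buckets,
     num_empty, key_size, value_size, _reserved) = struct.unpack(HEADER2_FMT, buf)
    if magic != b'BORG2IDX' or version != 2:
        raise ValueError('not a version 2 hashindex header')
    return num_entries, num_buckets, num_empty, key_size, value_size
```

For example, `pack_header2(1, 1, 0, 32, 12)` yields the same 1024 header bytes
that the updated test in `src/borg/testsuite/cache.py` below writes field by field.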
TW committed 2 years ago
commit c29d4a096b
5 changed files with 267 additions and 83 deletions
  1. src/borg/_hashindex.c (+212, -41)
  2. src/borg/hashindex.pyx (+14, -13)
  3. src/borg/repository.py (+2, -2)
  4. src/borg/testsuite/cache.py (+9, -3)
  5. src/borg/testsuite/hashindex.py (+30, -24)

src/borg/_hashindex.c (+212, -41)

@@ -19,7 +19,8 @@
 #   define BORG_PACKED(x) x __attribute__((packed))
 #endif
 
-#define MAGIC "BORG_IDX"
+#define MAGIC  "BORG2IDX"
+#define MAGIC1 "BORG_IDX"  // legacy
 #define MAGIC_LEN 8
 
 #define DEBUG 0
@@ -39,6 +40,18 @@ typedef struct {
     int32_t num_buckets;
     int8_t  key_size;
     int8_t  value_size;
+}) HashHeader1;
+
+BORG_PACKED(
+typedef struct {
+    char magic[MAGIC_LEN];
+    int32_t version;
+    int32_t num_entries;
+    int32_t num_buckets;
+    int32_t num_empty;
+    int32_t key_size;
+    int32_t value_size;
+    char reserved[1024 - 32];  // filler to 1024 bytes total
 }) HashHeader;
 
 typedef struct {
@@ -110,8 +123,8 @@ static int hash_sizes[] = {
 #define EPRINTF_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg " (%s)\n", path, ##__VA_ARGS__, strerror(errno))
 
 #ifndef BORG_NO_PYTHON
-static HashIndex *hashindex_read(PyObject *file_py, int permit_compact);
-static void hashindex_write(HashIndex *index, PyObject *file_py);
+static HashIndex *hashindex_read(PyObject *file_py, int permit_compact, int legacy);
+static void hashindex_write(HashIndex *index, PyObject *file_py, int legacy);
 #endif
 
 static uint64_t hashindex_compact(HashIndex *index);
@@ -265,9 +278,7 @@ int shrink_size(int current){
 
 int
 count_empty(HashIndex *index)
-{   /* count empty (never used) buckets. this does NOT include deleted buckets (tombstones).
-     * TODO: if we ever change HashHeader, save the count there so we do not need this function.
-     */
+{   /* count empty (never used) buckets. this does NOT include deleted buckets (tombstones). */
     int i, count = 0, capacity = index->num_buckets;
     for(i = 0; i < capacity; i++) {
         if(BUCKET_IS_EMPTY(index, i))
@@ -276,19 +287,16 @@ count_empty(HashIndex *index)
     return count;
 }
 
-/* Public API */
-
-#ifndef BORG_NO_PYTHON
-static HashIndex *
-hashindex_read(PyObject *file_py, int permit_compact)
+HashIndex *
+read_hashheader1(PyObject *file_py)
 {
-    Py_ssize_t length, buckets_length, bytes_read;
+    Py_ssize_t bytes_read, length, buckets_length;
     Py_buffer header_buffer;
-    PyObject *header_bytes, *length_object, *bucket_bytes, *tmp;
-    HashHeader *header;
+    PyObject *header_bytes, *length_object, *tmp;
     HashIndex *index = NULL;
+    HashHeader1 *header;
 
-    header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(HashHeader));
+    header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(*header));
     if(!header_bytes) {
         assert(PyErr_Occurred());
         goto fail;
@@ -299,11 +307,11 @@ hashindex_read(PyObject *file_py, int permit_compact)
         /* TypeError, not a bytes() object */
         goto fail_decref_header;
     }
-    if(bytes_read != sizeof(HashHeader)) {
+    if(bytes_read != sizeof(*header)) {
         /* Truncated file */
         /* Note: %zd is the format for Py_ssize_t, %zu is for size_t */
         PyErr_Format(PyExc_ValueError, "Could not read header (expected %zu, but read %zd bytes)",
-                     sizeof(HashHeader), bytes_read);
+                     sizeof(*header), bytes_read);
         goto fail_decref_header;
     }
 
@@ -334,7 +342,7 @@ hashindex_read(PyObject *file_py, int permit_compact)
         goto fail_decref_header;
     }
 
-    tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(HashHeader), SEEK_SET);
+    tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(*header), SEEK_SET);
     Py_XDECREF(tmp);
     if(PyErr_Occurred()) {
         goto fail_decref_header;
@@ -351,23 +359,168 @@ hashindex_read(PyObject *file_py, int permit_compact)
         goto fail_free_index;
     }
 
-    header = (HashHeader*) header_buffer.buf;
-    if(memcmp(header->magic, MAGIC, MAGIC_LEN)) {
+    header = (HashHeader1*) header_buffer.buf;
+    if(memcmp(header->magic, MAGIC1, MAGIC_LEN)) {
         PyErr_Format(PyExc_ValueError, "Unknown MAGIC in header");
         goto fail_release_header_buffer;
     }
 
     buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) * (header->key_size + header->value_size);
-    if((Py_ssize_t)length != (Py_ssize_t)sizeof(HashHeader) + buckets_length) {
+    if((Py_ssize_t)length != (Py_ssize_t)sizeof(*header) + buckets_length) {
         PyErr_Format(PyExc_ValueError, "Incorrect file length (expected %zd, got %zd)",
-                     sizeof(HashHeader) + buckets_length, length);
+                     sizeof(*header) + buckets_length, length);
         goto fail_release_header_buffer;
     }
 
     index->num_entries = _le32toh(header->num_entries);
     index->num_buckets = _le32toh(header->num_buckets);
+    index->num_empty = -1;  // unknown, needs counting
     index->key_size = header->key_size;
     index->value_size = header->value_size;
+
+fail_release_header_buffer:
+    PyBuffer_Release(&header_buffer);
+fail_free_index:
+    if(PyErr_Occurred()) {
+        free(index);
+        index = NULL;
+    }
+fail_decref_header:
+    Py_DECREF(header_bytes);
+fail:
+    return index;
+}
+
+HashIndex *
+read_hashheader(PyObject *file_py)
+{
+    Py_ssize_t bytes_read, length, buckets_length;
+    Py_buffer header_buffer;
+    PyObject *header_bytes, *length_object, *tmp;
+    HashIndex *index = NULL;
+    HashHeader *header;
+
+    header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(*header));
+    if(!header_bytes) {
+        assert(PyErr_Occurred());
+        goto fail;
+    }
+
+    bytes_read = PyBytes_Size(header_bytes);
+    if(PyErr_Occurred()) {
+        /* TypeError, not a bytes() object */
+        goto fail_decref_header;
+    }
+    if(bytes_read != sizeof(*header)) {
+        /* Truncated file */
+        /* Note: %zd is the format for Py_ssize_t, %zu is for size_t */
+        PyErr_Format(PyExc_ValueError, "Could not read header (expected %zu, but read %zd bytes)",
+                     sizeof(*header), bytes_read);
+        goto fail_decref_header;
+    }
+
+    /*
+     * Hash the header
+     * If the header is corrupted this bails before doing something stupid (like allocating 3.8 TB of memory)
+     */
+    tmp = PyObject_CallMethod(file_py, "hash_part", "s", "HashHeader");
+    Py_XDECREF(tmp);
+    if(PyErr_Occurred()) {
+        if(PyErr_ExceptionMatches(PyExc_AttributeError)) {
+            /* Be able to work with regular file objects which do not have a hash_part method. */
+            PyErr_Clear();
+        } else {
+            goto fail_decref_header;
+        }
+    }
+
+    /* Find length of file */
+    length_object = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)0, SEEK_END);
+    if(PyErr_Occurred()) {
+        goto fail_decref_header;
+    }
+    length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError);
+    Py_DECREF(length_object);
+    if(PyErr_Occurred()) {
+        /* This shouldn't generally happen; but can if seek() returns something that's not a number */
+        goto fail_decref_header;
+    }
+
+    tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(*header), SEEK_SET);
+    Py_XDECREF(tmp);
+    if(PyErr_Occurred()) {
+        goto fail_decref_header;
+    }
+
+    /* Set up the in-memory header */
+    if(!(index = malloc(sizeof(HashIndex)))) {
+        PyErr_NoMemory();
+        goto fail_decref_header;
+    }
+
+    PyObject_GetBuffer(header_bytes, &header_buffer, PyBUF_SIMPLE);
+    if(PyErr_Occurred()) {
+        goto fail_free_index;
+    }
+
+    header = (HashHeader*) header_buffer.buf;
+    if(memcmp(header->magic, MAGIC, MAGIC_LEN)) {
+        PyErr_Format(PyExc_ValueError, "Unknown MAGIC in header");
+        goto fail_release_header_buffer;
+    }
+
+    buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) *
+                         (_le32toh(header->key_size) + _le32toh(header->value_size));
+    if ((Py_ssize_t)length != (Py_ssize_t)sizeof(*header) + buckets_length) {
+        PyErr_Format(PyExc_ValueError, "Incorrect file length (expected %zd, got %zd)",
+                     sizeof(*header) + buckets_length, length);
+        goto fail_release_header_buffer;
+    }
+
+    index->num_entries = _le32toh(header->num_entries);
+    index->num_buckets = _le32toh(header->num_buckets);
+    index->num_empty = _le32toh(header->num_empty);
+    index->key_size = _le32toh(header->key_size);
+    index->value_size = _le32toh(header->value_size);
+
+    int header_version = _le32toh(header->version);
+    if (header_version != 2) {
+        PyErr_Format(PyExc_ValueError, "Unsupported header version (expected %d, got %d)",
+                     2, header_version);
+        goto fail_release_header_buffer;
+    }
+
+fail_release_header_buffer:
+    PyBuffer_Release(&header_buffer);
+fail_free_index:
+    if(PyErr_Occurred()) {
+        free(index);
+        index = NULL;
+    }
+fail_decref_header:
+    Py_DECREF(header_bytes);
+fail:
+    return index;
+}
+
+/* Public API */
+
+#ifndef BORG_NO_PYTHON
+static HashIndex *
+hashindex_read(PyObject *file_py, int permit_compact, int legacy)
+{
+    Py_ssize_t buckets_length, bytes_read;
+    PyObject *bucket_bytes;
+    HashIndex *index = NULL;
+
+    if (legacy)
+        index = read_hashheader1(file_py);
+    else
+        index = read_hashheader(file_py);
+
+    if (!index)
+        goto fail;
+
     index->bucket_size = index->key_size + index->value_size;
     index->lower_limit = get_lower_limit(index->num_buckets);
     index->upper_limit = get_upper_limit(index->num_buckets);
@@ -381,10 +534,11 @@ hashindex_read(PyObject *file_py, int permit_compact)
      * will issue multiple underlying reads if necessary. This supports indices
      * >2 GB on Linux. We also compare lengths later.
      */
+    buckets_length = (Py_ssize_t)(index->num_buckets) * (index->key_size + index->value_size);
     bucket_bytes = PyObject_CallMethod(file_py, "read", "n", buckets_length);
     if(!bucket_bytes) {
         assert(PyErr_Occurred());
-        goto fail_release_header_buffer;
+        goto fail_free_index;
     }
     bytes_read = PyBytes_Size(bucket_bytes);
     if(PyErr_Occurred()) {
@@ -404,7 +558,8 @@ hashindex_read(PyObject *file_py, int permit_compact)
 
     if(!permit_compact) {
         index->min_empty = get_min_empty(index->num_buckets);
-        index->num_empty = count_empty(index);
+        if (index->num_empty == -1)  // we read a legacy index without num_empty value
+            index->num_empty = count_empty(index);
 
         if(index->num_empty < index->min_empty) {
             /* too many tombstones here / not enough empty buckets, do a same-size rebuild */
@@ -426,15 +581,11 @@ fail_free_buckets:
     }
 fail_decref_buckets:
     Py_DECREF(bucket_bytes);
-fail_release_header_buffer:
-    PyBuffer_Release(&header_buffer);
 fail_free_index:
     if(PyErr_Occurred()) {
         free(index);
         index = NULL;
     }
-fail_decref_header:
-    Py_DECREF(header_bytes);
 fail:
     return index;
 }
@@ -481,33 +632,37 @@ hashindex_free(HashIndex *index)
     free(index);
 }
 
-#ifndef BORG_NO_PYTHON
-static void
-hashindex_write(HashIndex *index, PyObject *file_py)
+int
+write_hashheader(HashIndex *index, PyObject *file_py)
 {
-    PyObject *length_object, *buckets_view, *tmp;
+    PyObject *length_object, *tmp;
     Py_ssize_t length;
-    Py_ssize_t buckets_length = (Py_ssize_t)index->num_buckets * index->bucket_size;
+
+    _Static_assert(sizeof(HashHeader) == 1024, "HashHeader struct should be exactly 1024 bytes in size");
+
     HashHeader header = {
         .magic = MAGIC,
+        .version = _htole32(2),
         .num_entries = _htole32(index->num_entries),
         .num_buckets = _htole32(index->num_buckets),
-        .key_size = index->key_size,
-        .value_size = index->value_size
+        .num_empty = _htole32(index->num_empty),
+        .key_size = _htole32(index->key_size),
+        .value_size = _htole32(index->value_size),
+        .reserved = {0}
     };
 
-    length_object = PyObject_CallMethod(file_py, "write", "y#", &header, (Py_ssize_t)sizeof(HashHeader));
+    length_object = PyObject_CallMethod(file_py, "write", "y#", &header, (Py_ssize_t)sizeof(header));
     if(PyErr_Occurred()) {
-        return;
+        return 0;
     }
     length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError);
     Py_DECREF(length_object);
     if(PyErr_Occurred()) {
-        return;
+        return 0;
     }
-    if(length != sizeof(HashHeader)) {
+    if(length != sizeof(header)) {
         PyErr_SetString(PyExc_ValueError, "Failed to write header");
-        return;
+        return 0;
     }
 
     /*
@@ -520,9 +675,24 @@ hashindex_write(HashIndex *index, PyObject *file_py)
             /* Be able to work with regular file objects which do not have a hash_part method. */
             PyErr_Clear();
         } else {
-            return;
+            return 0;
         }
     }
+    return 1;
+}
+
+#ifndef BORG_NO_PYTHON
+static void
+hashindex_write(HashIndex *index, PyObject *file_py, int legacy)
+{
+    PyObject *length_object, *buckets_view;
+    Py_ssize_t length;
+    Py_ssize_t buckets_length = (Py_ssize_t)index->num_buckets * index->bucket_size;
+
+    assert(!legacy);  // we do not ever write legacy hashindexes
+
+    if(!write_hashheader(index, file_py))
+        return;
 
     /* Note: explicitly construct view; BuildValue can convert (pointer, length) to Python objects, but copies them for doing so */
     buckets_view = PyMemoryView_FromMemory((char*)index->buckets, buckets_length, PyBUF_READ);
@@ -698,6 +868,7 @@ hashindex_compact(HashIndex *index)
     }
 
     index->num_buckets = index->num_entries;
+    index->num_empty = 0;
     return saved_size;
 }
 

src/borg/hashindex.pyx (+14, -13)

@@ -17,12 +17,12 @@ cdef extern from "_hashindex.c":
         uint32_t version
         char hash[16]
 
-    HashIndex *hashindex_read(object file_py, int permit_compact) except *
+    HashIndex *hashindex_read(object file_py, int permit_compact, int legacy) except *
     HashIndex *hashindex_init(int capacity, int key_size, int value_size)
     void hashindex_free(HashIndex *index)
     int hashindex_len(HashIndex *index)
     int hashindex_size(HashIndex *index)
-    void hashindex_write(HashIndex *index, object file_py) except *
+    void hashindex_write(HashIndex *index, object file_py, int legacy) except *
     unsigned char *hashindex_get(HashIndex *index, unsigned char *key)
     unsigned char *hashindex_next_key(HashIndex *index, unsigned char *key)
     int hashindex_delete(HashIndex *index, unsigned char *key)
@@ -75,21 +75,21 @@ assert _MAX_VALUE % 2 == 1
 def hashindex_variant(fn):
     """peek into an index file and find out what it is"""
     with open(fn, 'rb') as f:
-        hh = f.read(18)  # len(HashHeader)
-    magic = hh[0:8]
+        magic = f.read(8)  # MAGIC_LEN
     if magic == b'BORG_IDX':
-        key_size = hh[16]
-        value_size = hh[17]
-        return f'k{key_size}_v{value_size}'
+        return 1  # legacy
+    if magic == b'BORG2IDX':
+        return 2
     if magic == b'12345678':  # used by unit tests
-        return 'k32_v16'  # just return the current variant
-    raise ValueError(f'unknown hashindex format, magic: {magic!r}')
+        return 2  # just return the current variant
+    raise ValueError(f'unknown hashindex magic: {magic!r}')
 
 
 @cython.internal
 cdef class IndexBase:
     cdef HashIndex *index
     cdef int key_size
+    legacy = 0
 
     _key_size = 32
 
@@ -101,9 +101,9 @@ cdef class IndexBase:
         if path:
             if isinstance(path, (str, bytes)):
                 with open(path, 'rb') as fd:
-                    self.index = hashindex_read(fd, permit_compact)
+                    self.index = hashindex_read(fd, permit_compact, self.legacy)
             else:
-                self.index = hashindex_read(path, permit_compact)
+                self.index = hashindex_read(path, permit_compact, self.legacy)
             assert self.index, 'hashindex_read() returned NULL with no exception set'
         else:
             if usable is not None:
@@ -123,9 +123,9 @@ cdef class IndexBase:
     def write(self, path):
         if isinstance(path, (str, bytes)):
             with open(path, 'wb') as fd:
-                hashindex_write(self.index, fd)
+                hashindex_write(self.index, fd, self.legacy)
         else:
-            hashindex_write(self.index, path)
+            hashindex_write(self.index, path, self.legacy)
 
     def clear(self):
         hashindex_free(self.index)
@@ -314,6 +314,7 @@ cdef class NSKeyIterator:
 
 cdef class NSIndex1(IndexBase):  # legacy borg 1.x
 
+    legacy = 1
     value_size = 8
 
     def __getitem__(self, key):

src/borg/repository.py (+2, -2)

@@ -543,9 +543,9 @@ class Repository:
         integrity_data = self._read_integrity(transaction_id, "index")
         try:
             with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd:
-                if variant == "k32_v16":
+                if variant == 2:
                     return NSIndex.read(fd)
-                if variant == "k32_v8":  # legacy
+                if variant == 1:  # legacy
                     return NSIndex1.read(fd)
         except (ValueError, OSError, FileIntegrityError) as exc:
             logger.warning("Repository index missing or corrupted, trying to recover from: %s", exc)

src/borg/testsuite/cache.py (+9, -3)

@@ -108,15 +108,21 @@ class TestCacheSynchronizer:
 
     def make_index_with_refcount(self, refcount):
         index_data = io.BytesIO()
-        index_data.write(b"BORG_IDX")
+        index_data.write(b"BORG2IDX")
+        # version
+        index_data.write((2).to_bytes(4, "little"))
         # num_entries
         index_data.write((1).to_bytes(4, "little"))
         # num_buckets
         index_data.write((1).to_bytes(4, "little"))
+        # num_empty
+        index_data.write((0).to_bytes(4, "little"))
         # key_size
-        index_data.write((32).to_bytes(1, "little"))
+        index_data.write((32).to_bytes(4, "little"))
         # value_size
-        index_data.write((3 * 4).to_bytes(1, "little"))
+        index_data.write((3 * 4).to_bytes(4, "little"))
+        # reserved
+        index_data.write(bytes(1024 - 32))
 
         index_data.write(H(0))
         index_data.write(refcount.to_bytes(4, "little"))

src/borg/testsuite/hashindex.py (+30, -24)

@@ -86,12 +86,12 @@ class HashIndexTestCase(BaseTestCase):
 
     def test_nsindex(self):
         self._generic_test(
-            NSIndex, lambda x: (x, x, x), "7d70671d0b7e9d2f51b2691ecf35184b9f8ecc1202cceb2748c905c8fc04c256"
+            NSIndex, lambda x: (x, x, x), "0d7880dbe02b64f03c471e60e193a1333879b4f23105768b10c9222accfeac5e"
         )
 
     def test_chunkindex(self):
         self._generic_test(
-            ChunkIndex, lambda x: (x, x), "85f72b036c692c8266e4f51ccf0cff2147204282b5e316ae508d30a448d88fef"
+            ChunkIndex, lambda x: (x, x), "5915fcf986da12e5f3ac68e05242b9c729e6101b0460b1d4e4a9e9f7cdf1b7da"
         )
 
     def test_resize(self):
@@ -252,7 +252,7 @@ class HashIndexExtraTestCase(BaseTestCase):
 class HashIndexSizeTestCase(BaseTestCase):
     def test_size_on_disk(self):
         idx = ChunkIndex()
-        assert idx.size() == 18 + 1031 * (32 + 2 * 4)
+        assert idx.size() == 1024 + 1031 * (32 + 2 * 4)
 
     def test_size_on_disk_accurate(self):
         idx = ChunkIndex()
@@ -368,12 +368,12 @@ class HashIndexRefcountingTestCase(BaseTestCase):
 
 
 class HashIndexDataTestCase(BaseTestCase):
-    # This bytestring was created with borg2-pre 2022-06-10
+    # This bytestring was created with borg2-pre 2022-09-30
     HASHINDEX = (
-        b"eJzt0LEJg1AYhdE/JqBjOEJMNhBBrAQrO9ewc+HsoG+CPMsEz1cfbnHbceqXoZvvEVE+IuoqMu2pnOE4"
-        b"juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4"
-        b"juM4juM4juM4jruie36vuSVT5N0rzW0n9t7r5z9+4TiO4ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO"
-        b"4ziO4ziO4ziO4ziO4ziO437LHbSVHGw="
+        b"eJzt0DEKgwAMQNFoBXsMj9DqDUQoToKTR3Hzwr2DZi+0HS19HwIZHhnST/OjHYeljIhLTl1FVDlN7te"
+        b"Q9M/tGcdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHM"
+        b"dxHMdxHMdxHMdxHMdxHMdxHPfqbu+7F2nKz67Nc9sX97r1+Rt/4TiO4ziO4ziO4ziO4ziO4ziO4ziO4"
+        b"ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO487lDoRvHEk="
     )
 
     def _serialize_hashindex(self, idx):
@@ -439,17 +439,23 @@ class HashIndexIntegrityTestCase(HashIndexDataTestCase):
 
 
 class HashIndexCompactTestCase(HashIndexDataTestCase):
-    def index(self, num_entries, num_buckets):
+    def index(self, num_entries, num_buckets, num_empty):
         index_data = io.BytesIO()
-        index_data.write(b"BORG_IDX")
+        index_data.write(b"BORG2IDX")
+        # version
+        index_data.write((2).to_bytes(4, "little"))
         # num_entries
         index_data.write(num_entries.to_bytes(4, "little"))
         # num_buckets
         index_data.write(num_buckets.to_bytes(4, "little"))
+        # num_empty
+        index_data.write(num_empty.to_bytes(4, "little"))
         # key_size
-        index_data.write((32).to_bytes(1, "little"))
+        index_data.write((32).to_bytes(4, "little"))
         # value_size
-        index_data.write((3 * 4).to_bytes(1, "little"))
+        index_data.write((3 * 4).to_bytes(4, "little"))
+        # reserved
+        index_data.write(bytes(1024 - 32))
 
         self.index_data = index_data
 
@@ -481,7 +487,7 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
         self.write_entry(key, 0xFFFFFFFE, 0, 0)
 
     def test_simple(self):
-        self.index(num_entries=3, num_buckets=6)
+        self.index(num_entries=3, num_buckets=6, num_empty=2)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_deleted(H2(1))
         self.write_empty(H2(2))
@@ -491,14 +497,14 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
 
         compact_index = self.index_from_data_compact_to_data()
 
-        self.index(num_entries=3, num_buckets=3)
+        self.index(num_entries=3, num_buckets=3, num_empty=0)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
         self.write_entry(H2(4), 8, 9, 10)
         assert compact_index == self.index_data.getvalue()
 
     def test_first_empty(self):
-        self.index(num_entries=3, num_buckets=6)
+        self.index(num_entries=3, num_buckets=6, num_empty=2)
         self.write_deleted(H2(1))
         self.write_entry(H2(0), 1, 2, 3)
         self.write_empty(H2(2))
@@ -508,14 +514,14 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
 
         compact_index = self.index_from_data_compact_to_data()
 
-        self.index(num_entries=3, num_buckets=3)
+        self.index(num_entries=3, num_buckets=3, num_empty=0)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
         self.write_entry(H2(4), 8, 9, 10)
         assert compact_index == self.index_data.getvalue()
 
     def test_last_used(self):
-        self.index(num_entries=3, num_buckets=6)
+        self.index(num_entries=3, num_buckets=6, num_empty=2)
         self.write_deleted(H2(1))
         self.write_entry(H2(0), 1, 2, 3)
         self.write_empty(H2(2))
@@ -525,14 +531,14 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
 
         compact_index = self.index_from_data_compact_to_data()
 
-        self.index(num_entries=3, num_buckets=3)
+        self.index(num_entries=3, num_buckets=3, num_empty=0)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
         self.write_entry(H2(4), 8, 9, 10)
         assert compact_index == self.index_data.getvalue()
 
     def test_too_few_empty_slots(self):
-        self.index(num_entries=3, num_buckets=6)
+        self.index(num_entries=3, num_buckets=6, num_empty=2)
         self.write_deleted(H2(1))
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
@@ -542,14 +548,14 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
 
         compact_index = self.index_from_data_compact_to_data()
 
-        self.index(num_entries=3, num_buckets=3)
+        self.index(num_entries=3, num_buckets=3, num_empty=0)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
         self.write_entry(H2(4), 8, 9, 10)
         assert compact_index == self.index_data.getvalue()
 
     def test_empty(self):
-        self.index(num_entries=0, num_buckets=6)
+        self.index(num_entries=0, num_buckets=6, num_empty=3)
         self.write_deleted(H2(1))
         self.write_empty(H2(0))
         self.write_deleted(H2(3))
@@ -559,7 +565,7 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
 
         compact_index = self.index_from_data_compact_to_data()
 
-        self.index(num_entries=0, num_buckets=0)
+        self.index(num_entries=0, num_buckets=0, num_empty=0)
         assert compact_index == self.index_data.getvalue()
 
     def test_merge(self):
@@ -569,7 +575,7 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
         idx1[H(2)] = 2, 200
         idx1[H(3)] = 3, 300
         idx1.compact()
-        assert idx1.size() == 18 + 3 * (32 + 2 * 4)
+        assert idx1.size() == 1024 + 3 * (32 + 2 * 4)
 
         master.merge(idx1)
         assert master[H(1)] == (1, 100)
@@ -612,7 +618,7 @@ class IndexCorruptionTestCase(BaseTestCase):
         for y in range(700):  # stay below max load to not trigger resize
             idx[HH(0, y, 0)] = (0, y, 0)
 
-        assert idx.size() == 1031 * 48 + 18  # 1031 buckets + header
+        assert idx.size() == 1024 + 1031 * 48  # header + 1031 buckets
 
         # delete lots of the collisions, creating lots of tombstones
         for y in range(400):  # stay above min load to not trigger resize