浏览代码

Merge pull request #8502 from ThomasWaldmann/borghash

replace old hashindex code by borghash + small wrapper
TW 7 月之前
父节点
当前提交
1f00d29d19

+ 1 - 0
pyproject.toml

@@ -30,6 +30,7 @@ classifiers = [
 ]
 license = {text="BSD"}
 dependencies = [
+  "borghash ~= 0.0.1",
   "borgstore ~= 0.1.0",
   "msgpack >=1.0.3, <=1.1.0",
   "packaging",

+ 0 - 106
scripts/hash_sizes.py

@@ -1,106 +0,0 @@
-"""
-Compute hashtable sizes with nice properties
-- prime sizes (for small to medium sizes)
-- 2 prime-factor sizes (for big sizes)
-- fast growth for small sizes
-- slow growth for big sizes
-
-Note:
-     this is just a tool for developers.
-     within borgbackup, it is just used to generate hash_sizes definition for _hashindex.c.
-"""
-
-from collections import namedtuple
-
-K, M, G = 2**10, 2**20, 2**30
-
-# hash table size (in number of buckets)
-start, end_p1, end_p2 = 1 * K, 127 * M, 2 * G - 10 * M  # stay well below 2^31 - 1
-
-Policy = namedtuple("Policy", "upto grow")
-
-policies = [
-    # which growth factor to use when growing a hashtable of size < upto
-    # grow fast (*2.0) at the start so we do not have to resize too often (expensive).
-    # grow slow (*1.1) for huge hash tables (do not jump too much in memory usage)
-    Policy(256 * K, 2.0),
-    Policy(2 * M, 1.7),
-    Policy(16 * M, 1.4),
-    Policy(128 * M, 1.2),
-    Policy(2 * G - 1, 1.1),
-]
-
-
-# slightly modified version of:
-# http://www.macdevcenter.com/pub/a/python/excerpt/pythonckbk_chap1/index1.html?page=2
-def eratosthenes():
-    """Yields the sequence of prime numbers via the Sieve of Eratosthenes."""
-    D = {}  # map each composite integer to its first-found prime factor
-    q = 2  # q gets 2, 3, 4, 5, ... ad infinitum
-    while True:
-        p = D.pop(q, None)
-        if p is None:
-            # q not a key in D, so q is prime, therefore, yield it
-            yield q
-            # mark q squared as not-prime (with q as first-found prime factor)
-            D[q * q] = q
-        else:
-            # let x <- smallest (N*p)+q which wasn't yet known to be composite
-            # we just learned x is composite, with p first-found prime factor,
-            # since p is the first-found prime factor of q -- find and mark it
-            x = p + q
-            while x in D:
-                x += p
-            D[x] = p
-        q += 1
-
-
-def two_prime_factors(pfix=65537):
-    """Yields numbers with 2 prime factors pfix and p."""
-    for p in eratosthenes():
-        yield pfix * p
-
-
-def get_grow_factor(size):
-    for p in policies:
-        if size < p.upto:
-            return p.grow
-
-
-def find_bigger_prime(gen, i):
-    while True:
-        p = next(gen)
-        if p >= i:
-            return p
-
-
-def main():
-    sizes = []
-    i = start
-
-    gen = eratosthenes()
-    while i < end_p1:
-        grow_factor = get_grow_factor(i)
-        p = find_bigger_prime(gen, i)
-        sizes.append(p)
-        i = int(i * grow_factor)
-
-    gen = two_prime_factors()  # for lower ram consumption
-    while i < end_p2:
-        grow_factor = get_grow_factor(i)
-        p = find_bigger_prime(gen, i)
-        sizes.append(p)
-        i = int(i * grow_factor)
-
-    print(
-        """\
-static int hash_sizes[] = {
-    %s
-};
-"""
-        % ", ".join(str(size) for size in sizes)
-    )
-
-
-if __name__ == "__main__":
-    main()

+ 3 - 3
setup.py

@@ -175,12 +175,12 @@ if not on_rtd:
             dict(sources=[platform_linux_source], libraries=["acl"], extra_compile_args=cflags)
         )
 
-    # note: _chunker.c and _hashindex.c are relatively complex/large pieces of handwritten C code,
-    # thus we undef NDEBUG for them, so the compiled code will contain and execute assert().
+    # note: _chunker.c is a relatively complex/large piece of handwritten C code,
+    # thus we undef NDEBUG for it, so the compiled code will contain and execute assert().
     ext_modules += [
         Extension("borg.crypto.low_level", **crypto_ext_kwargs),
         Extension("borg.compress", **compress_ext_kwargs),
-        Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
+        Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags),
         Extension("borg.item", [item_source], extra_compile_args=cflags),
         Extension("borg.chunker", [chunker_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
         Extension("borg.checksums", **checksums_ext_kwargs),

+ 0 - 30
src/borg/_endian.h

@@ -1,30 +0,0 @@
-#if !defined(_MSC_VER)
-#   include <unistd.h>
-#endif
-#include <stdlib.h>
-#include <stdint.h>
-
-#if defined (__SVR4) && defined (__sun)
-#include <sys/isa_defs.h>
-#endif
-
-#if (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) ||  \
-    (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)) || \
-    (defined(_BIG_ENDIAN) && defined(__SVR4) && defined(__sun))
-#define BORG_BIG_ENDIAN 1
-#elif (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
-      (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
-      (defined(_LITTLE_ENDIAN) && defined(__SVR4) && defined(__sun)) || \
-      (defined(_MSC_VER) && (defined(_M_AMD64) || defined(_M_IX86)))
-#define BORG_BIG_ENDIAN 0
-#else
-#error Unknown byte order
-#endif
-
-#if BORG_BIG_ENDIAN
-#define _le32toh(x) __builtin_bswap32(x)
-#define _htole32(x) __builtin_bswap32(x)
-#else
-#define _le32toh(x) (x)
-#define _htole32(x) (x)
-#endif

+ 0 - 886
src/borg/_hashindex.c

@@ -1,886 +0,0 @@
-#include <assert.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#if !defined(_MSC_VER)
-#   include <unistd.h>
-#endif
-
-#include "_endian.h"
-
-#if defined(_MSC_VER)
-#   define BORG_PACKED(x) __pragma(pack(push, 1)) x __pragma(pack(pop))
-#else
-#   define BORG_PACKED(x) x __attribute__((packed))
-#endif
-
-#define MAGIC  "BORG2IDX"
-#define MAGIC1 "BORG_IDX"  // legacy
-#define MAGIC_LEN 8
-
-#define DEBUG 0
-
-#define debug_print(fmt, ...)                   \
-  do {                                          \
-    if (DEBUG) {                                \
-      fprintf(stderr, fmt, __VA_ARGS__);        \
-      fflush(NULL);                             \
-    }                                           \
-} while (0)
-
-BORG_PACKED(
-typedef struct {
-    char magic[MAGIC_LEN];
-    int32_t num_entries;
-    int32_t num_buckets;
-    int8_t  key_size;
-    int8_t  value_size;
-}) HashHeader1;
-
-BORG_PACKED(
-typedef struct {
-    char magic[MAGIC_LEN];
-    int32_t version;
-    int32_t num_entries;
-    int32_t num_buckets;
-    int32_t num_empty;
-    int32_t key_size;
-    int32_t value_size;
-    char reserved[1024 - 32];  // filler to 1024 bytes total
-}) HashHeader;
-
-typedef struct {
-    unsigned char *buckets;
-    int num_entries;
-    int num_buckets;
-    int num_empty;
-    int key_size;
-    int value_size;
-    off_t bucket_size;
-    int lower_limit;
-    int upper_limit;
-    int min_empty;
-#ifndef BORG_NO_PYTHON
-    /* buckets may be backed by a Python buffer. If buckets_buffer.buf is NULL then this is not used. */
-    Py_buffer buckets_buffer;
-#endif
-} HashIndex;
-
-/* prime (or w/ big prime factors) hash table sizes
- * not sure we need primes for borg's usage (as we have a hash function based
- * on sha256, we can assume an even, seemingly random distribution of values),
- * but OTOH primes don't harm.
- * also, growth of the sizes starts with fast-growing 2x steps, but slows down
- * more and more down to 1.1x. this is to avoid huge jumps in memory allocation,
- * like e.g. 4G -> 8G.
- * these values are generated by hash_sizes.py.
- *
- * update: no, we don't need primes or w/ big prime factors, we followed some
- *         incomplete / irrelevant advice here that did not match our use case.
- *         otoh, for now, we do not need to change the sizes as they do no harm.
- *         see ticket #2830.
- */
-static int hash_sizes[] = {
-    1031, 2053, 4099, 8209, 16411, 32771, 65537, 131101, 262147, 445649,
-    757607, 1287917, 2189459, 3065243, 4291319, 6007867, 8410991,
-    11775359, 16485527, 23079703, 27695653, 33234787, 39881729, 47858071,
-    57429683, 68915617, 82698751, 99238507, 119086189, 144378011, 157223263,
-    173476439, 190253911, 209915011, 230493629, 253169431, 278728861,
-    306647623, 337318939, 370742809, 408229973, 449387209, 493428073,
-    543105119, 596976533, 657794869, 722676499, 795815791, 874066969,
-    962279771, 1057701643, 1164002657, 1280003147, 1407800297, 1548442699,
-    1703765389, 1873768367, 2062383853, /* 32bit int ends about here */
-};
-
-#define HASH_MIN_LOAD .25
-#define HASH_MAX_LOAD .75  /* don't go higher than 0.75, otherwise performance severely suffers! */
-#define HASH_MAX_EFF_LOAD .93
-
-#define MAX(x, y) ((x) > (y) ? (x): (y))
-#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
-
-#define EMPTY _htole32(0xffffffff)
-#define DELETED _htole32(0xfffffffe)
-
-#define BUCKET_ADDR(index, idx) (index->buckets + ((idx) * index->bucket_size))
-
-#define BUCKET_MATCHES_KEY(index, idx, key) (memcmp(key, BUCKET_ADDR(index, idx), index->key_size) == 0)
-
-#define BUCKET_IS_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == DELETED)
-#define BUCKET_IS_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == EMPTY)
-#define BUCKET_IS_EMPTY_OR_DELETED(index, idx) (BUCKET_IS_EMPTY(index, idx) || BUCKET_IS_DELETED(index, idx))
-
-#define BUCKET_MARK_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = DELETED)
-#define BUCKET_MARK_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = EMPTY)
-
-#define EPRINTF_MSG(msg, ...) fprintf(stderr, "hashindex: " msg "\n", ##__VA_ARGS__)
-#define EPRINTF_MSG_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg "\n", path, ##__VA_ARGS__)
-#define EPRINTF(msg, ...) fprintf(stderr, "hashindex: " msg "(%s)\n", ##__VA_ARGS__, strerror(errno))
-#define EPRINTF_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg " (%s)\n", path, ##__VA_ARGS__, strerror(errno))
-
-#ifndef BORG_NO_PYTHON
-static HashIndex *hashindex_read(PyObject *file_py, int permit_compact, int legacy);
-static void hashindex_write(HashIndex *index, PyObject *file_py, int legacy);
-#endif
-
-static uint64_t hashindex_compact(HashIndex *index);
-static HashIndex *hashindex_init(int capacity, int key_size, int value_size);
-static const unsigned char *hashindex_get(HashIndex *index, const unsigned char *key);
-static int hashindex_set(HashIndex *index, const unsigned char *key, const void *value);
-static int hashindex_delete(HashIndex *index, const unsigned char *key);
-static unsigned char *hashindex_next_key(HashIndex *index, const unsigned char *key);
-
-/* Private API */
-static void hashindex_free(HashIndex *index);
-
-static void
-hashindex_free_buckets(HashIndex *index)
-{
-#ifndef BORG_NO_PYTHON
-    if(index->buckets_buffer.buf) {
-        PyBuffer_Release(&index->buckets_buffer);
-    } else
-#endif
-    {
-        free(index->buckets);
-    }
-}
-
-static int
-hashindex_index(HashIndex *index, const unsigned char *key)
-{
-    return _le32toh(*((uint32_t *)key)) % index->num_buckets;
-}
-
-static int
-hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx)
-{
-    int didx = -1;
-    int start = hashindex_index(index, key);  /* perfect index for this key, if there is no collision. */
-    int idx = start;
-    for(;;) {
-        if(BUCKET_IS_EMPTY(index, idx))
-        {
-            break;  /* if we encounter an empty bucket, we do not need to look any further. */
-        }
-        if(BUCKET_IS_DELETED(index, idx)) {
-            if(didx == -1) {
-                didx = idx;  /* remember the index of the first deleted bucket. */
-            }
-        }
-        else if(BUCKET_MATCHES_KEY(index, idx, key)) {
-            /* we found the bucket with the key we are looking for! */
-            if (didx != -1) {
-                // note: although lookup is logically a read-only operation,
-                // we optimize (change) the hashindex here "on the fly":
-                // swap this full bucket with a previous deleted/tombstone bucket.
-                memcpy(BUCKET_ADDR(index, didx), BUCKET_ADDR(index, idx), index->bucket_size);
-                BUCKET_MARK_DELETED(index, idx);
-                idx = didx;
-            }
-            return idx;
-        }
-        idx++;
-        if (idx >= index->num_buckets) {  /* triggers at == already */
-            idx = 0;
-        }
-        /* When idx == start, we have done a full pass over all buckets.
-         * - We did not find a bucket with the key we searched for.
-         * - We did not find an empty bucket either.
-         * - We may have found a deleted/tombstone bucket, though.
-         * This can easily happen if we have a compact hashtable.
-         */
-        if(idx == start) {
-            if(didx != -1)
-                break;  /* we have found a deleted/tombstone bucket at least */
-            return -2;  /* HT is completely full, no empty or deleted buckets. */
-        }
-    }
-    /* we get here if we did not find a bucket with the key we searched for. */
-    if (start_idx != NULL) {
-        /* by giving a non-NULL pointer in start_idx, caller can request to
-         * get the index of the first empty or deleted bucket we encountered,
-         * e.g. to add a new entry for that key into that bucket.
-         */
-        (*start_idx) = (didx == -1) ? idx : didx;
-    }
-    return -1;
-}
-
-static int
-hashindex_resize(HashIndex *index, int capacity)
-{
-    HashIndex *new;
-    unsigned char *key = NULL;
-    int32_t key_size = index->key_size;
-
-    if(!(new = hashindex_init(capacity, key_size, index->value_size))) {
-        return 0;
-    }
-    while((key = hashindex_next_key(index, key))) {
-        if(!hashindex_set(new, key, key + key_size)) {
-            /* This can only happen if there's a bug in the code calculating capacity */
-            hashindex_free(new);
-            return 0;
-        }
-    }
-    assert(index->num_entries == new->num_entries);
-
-    hashindex_free_buckets(index);
-    index->buckets = new->buckets;
-    index->num_buckets = new->num_buckets;
-    index->num_empty = index->num_buckets - index->num_entries;
-    index->lower_limit = new->lower_limit;
-    index->upper_limit = new->upper_limit;
-    index->min_empty = new->min_empty;
-    free(new);
-    return 1;
-}
-
-int get_lower_limit(int num_buckets){
-    int min_buckets = hash_sizes[0];
-    if (num_buckets <= min_buckets)
-        return 0;
-    return (int)(num_buckets * HASH_MIN_LOAD);
-}
-
-int get_upper_limit(int num_buckets){
-    int max_buckets = hash_sizes[NELEMS(hash_sizes) - 1];
-    if (num_buckets >= max_buckets)
-        return num_buckets;
-    return (int)(num_buckets * HASH_MAX_LOAD);
-}
-
-int get_min_empty(int num_buckets){
-    /* Differently from load, the effective load also considers tombstones (deleted buckets).
-     * We always add 1, so this never can return 0 (0 empty buckets would be a bad HT state).
-     */
-    return 1 + (int)(num_buckets * (1.0 - HASH_MAX_EFF_LOAD));
-}
-
-int size_idx(int size){
-    /* find the smallest hash_sizes index with entry >= size */
-    int i = NELEMS(hash_sizes) - 1;
-    while(i >= 0 && hash_sizes[i] >= size) i--;
-    return i + 1;
-}
-
-int fit_size(int current){
-    int i = size_idx(current);
-    return hash_sizes[i];
-}
-
-int grow_size(int current){
-    int i = size_idx(current) + 1;
-    int elems = NELEMS(hash_sizes);
-    if (i >= elems)
-        return hash_sizes[elems - 1];
-    return hash_sizes[i];
-}
-
-int shrink_size(int current){
-    int i = size_idx(current) - 1;
-    if (i < 0)
-        return hash_sizes[0];
-    return hash_sizes[i];
-}
-
-int
-count_empty(HashIndex *index)
-{   /* count empty (never used) buckets. this does NOT include deleted buckets (tombstones). */
-    int i, count = 0, capacity = index->num_buckets;
-    for(i = 0; i < capacity; i++) {
-        if(BUCKET_IS_EMPTY(index, i))
-            count++;
-    }
-    return count;
-}
-
-HashIndex *
-read_hashheader1(PyObject *file_py)
-{
-    Py_ssize_t bytes_read, length, buckets_length;
-    Py_buffer header_buffer;
-    PyObject *header_bytes, *length_object, *tmp;
-    HashIndex *index = NULL;
-    HashHeader1 *header;
-
-    header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(*header));
-    if(!header_bytes) {
-        assert(PyErr_Occurred());
-        goto fail;
-    }
-
-    bytes_read = PyBytes_Size(header_bytes);
-    if(PyErr_Occurred()) {
-        /* TypeError, not a bytes() object */
-        goto fail_decref_header;
-    }
-    if(bytes_read != sizeof(*header)) {
-        /* Truncated file */
-        /* Note: %zd is the format for Py_ssize_t, %zu is for size_t */
-        PyErr_Format(PyExc_ValueError, "Could not read header (expected %zu, but read %zd bytes)",
-                     sizeof(*header), bytes_read);
-        goto fail_decref_header;
-    }
-
-    /*
-     * Hash the header
-     * If the header is corrupted this bails before doing something stupid (like allocating 3.8 TB of memory)
-     */
-    tmp = PyObject_CallMethod(file_py, "hash_part", "s", "HashHeader");
-    Py_XDECREF(tmp);
-    if(PyErr_Occurred()) {
-        if(PyErr_ExceptionMatches(PyExc_AttributeError)) {
-            /* Be able to work with regular file objects which do not have a hash_part method. */
-            PyErr_Clear();
-        } else {
-            goto fail_decref_header;
-        }
-    }
-
-    /* Find length of file */
-    length_object = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)0, SEEK_END);
-    if(PyErr_Occurred()) {
-        goto fail_decref_header;
-    }
-    length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError);
-    Py_DECREF(length_object);
-    if(PyErr_Occurred()) {
-        /* This shouldn't generally happen; but can if seek() returns something that's not a number */
-        goto fail_decref_header;
-    }
-
-    tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(*header), SEEK_SET);
-    Py_XDECREF(tmp);
-    if(PyErr_Occurred()) {
-        goto fail_decref_header;
-    }
-
-    /* Set up the in-memory header */
-    if(!(index = malloc(sizeof(HashIndex)))) {
-        PyErr_NoMemory();
-        goto fail_decref_header;
-    }
-
-    PyObject_GetBuffer(header_bytes, &header_buffer, PyBUF_SIMPLE);
-    if(PyErr_Occurred()) {
-        goto fail_free_index;
-    }
-
-    header = (HashHeader1*) header_buffer.buf;
-    if(memcmp(header->magic, MAGIC1, MAGIC_LEN)) {
-        PyErr_Format(PyExc_ValueError, "Unknown MAGIC in header");
-        goto fail_release_header_buffer;
-    }
-
-    buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) * (header->key_size + header->value_size);
-    if((Py_ssize_t)length != (Py_ssize_t)sizeof(*header) + buckets_length) {
-        PyErr_Format(PyExc_ValueError, "Incorrect file length (expected %zd, got %zd)",
-                     sizeof(*header) + buckets_length, length);
-        goto fail_release_header_buffer;
-    }
-
-    index->num_entries = _le32toh(header->num_entries);
-    index->num_buckets = _le32toh(header->num_buckets);
-    index->num_empty = -1;  // unknown, needs counting
-    index->key_size = header->key_size;
-    index->value_size = header->value_size;
-
-fail_release_header_buffer:
-    PyBuffer_Release(&header_buffer);
-fail_free_index:
-    if(PyErr_Occurred()) {
-        free(index);
-        index = NULL;
-    }
-fail_decref_header:
-    Py_DECREF(header_bytes);
-fail:
-    return index;
-}
-
-HashIndex *
-read_hashheader(PyObject *file_py)
-{
-    Py_ssize_t bytes_read, length, buckets_length;
-    Py_buffer header_buffer;
-    PyObject *header_bytes, *length_object, *tmp;
-    HashIndex *index = NULL;
-    HashHeader *header;
-
-    header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(*header));
-    if(!header_bytes) {
-        assert(PyErr_Occurred());
-        goto fail;
-    }
-
-    bytes_read = PyBytes_Size(header_bytes);
-    if(PyErr_Occurred()) {
-        /* TypeError, not a bytes() object */
-        goto fail_decref_header;
-    }
-    if(bytes_read != sizeof(*header)) {
-        /* Truncated file */
-        /* Note: %zd is the format for Py_ssize_t, %zu is for size_t */
-        PyErr_Format(PyExc_ValueError, "Could not read header (expected %zu, but read %zd bytes)",
-                     sizeof(*header), bytes_read);
-        goto fail_decref_header;
-    }
-
-    /*
-     * Hash the header
-     * If the header is corrupted this bails before doing something stupid (like allocating 3.8 TB of memory)
-     */
-    tmp = PyObject_CallMethod(file_py, "hash_part", "s", "HashHeader");
-    Py_XDECREF(tmp);
-    if(PyErr_Occurred()) {
-        if(PyErr_ExceptionMatches(PyExc_AttributeError)) {
-            /* Be able to work with regular file objects which do not have a hash_part method. */
-            PyErr_Clear();
-        } else {
-            goto fail_decref_header;
-        }
-    }
-
-    /* Find length of file */
-    length_object = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)0, SEEK_END);
-    if(PyErr_Occurred()) {
-        goto fail_decref_header;
-    }
-    length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError);
-    Py_DECREF(length_object);
-    if(PyErr_Occurred()) {
-        /* This shouldn't generally happen; but can if seek() returns something that's not a number */
-        goto fail_decref_header;
-    }
-
-    tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(*header), SEEK_SET);
-    Py_XDECREF(tmp);
-    if(PyErr_Occurred()) {
-        goto fail_decref_header;
-    }
-
-    /* Set up the in-memory header */
-    if(!(index = malloc(sizeof(HashIndex)))) {
-        PyErr_NoMemory();
-        goto fail_decref_header;
-    }
-
-    PyObject_GetBuffer(header_bytes, &header_buffer, PyBUF_SIMPLE);
-    if(PyErr_Occurred()) {
-        goto fail_free_index;
-    }
-
-    header = (HashHeader*) header_buffer.buf;
-    if(memcmp(header->magic, MAGIC, MAGIC_LEN)) {
-        PyErr_Format(PyExc_ValueError, "Unknown MAGIC in header");
-        goto fail_release_header_buffer;
-    }
-
-    buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) *
-                         (_le32toh(header->key_size) + _le32toh(header->value_size));
-    if ((Py_ssize_t)length != (Py_ssize_t)sizeof(*header) + buckets_length) {
-        PyErr_Format(PyExc_ValueError, "Incorrect file length (expected %zd, got %zd)",
-                     sizeof(*header) + buckets_length, length);
-        goto fail_release_header_buffer;
-    }
-
-    index->num_entries = _le32toh(header->num_entries);
-    index->num_buckets = _le32toh(header->num_buckets);
-    index->num_empty = _le32toh(header->num_empty);
-    index->key_size = _le32toh(header->key_size);
-    index->value_size = _le32toh(header->value_size);
-
-    int header_version = _le32toh(header->version);
-    if (header_version != 2) {
-        PyErr_Format(PyExc_ValueError, "Unsupported header version (expected %d, got %d)",
-                     2, header_version);
-        goto fail_release_header_buffer;
-    }
-
-fail_release_header_buffer:
-    PyBuffer_Release(&header_buffer);
-fail_free_index:
-    if(PyErr_Occurred()) {
-        free(index);
-        index = NULL;
-    }
-fail_decref_header:
-    Py_DECREF(header_bytes);
-fail:
-    return index;
-}
-
-/* Public API */
-
-#ifndef BORG_NO_PYTHON
-static HashIndex *
-hashindex_read(PyObject *file_py, int permit_compact, int legacy)
-{
-    Py_ssize_t buckets_length, bytes_read;
-    PyObject *bucket_bytes;
-    HashIndex *index = NULL;
-
-    if (legacy)
-        index = read_hashheader1(file_py);
-    else
-        index = read_hashheader(file_py);
-
-    if (!index)
-        goto fail;
-
-    index->bucket_size = index->key_size + index->value_size;
-    index->lower_limit = get_lower_limit(index->num_buckets);
-    index->upper_limit = get_upper_limit(index->num_buckets);
-
-    /*
-     * For indices read from disk we don't malloc() the buckets ourselves,
-     * we have them backed by a Python bytes() object instead, and go through
-     * Python I/O.
-     *
-     * Note: Issuing read(buckets_length) is okay here, because buffered readers
-     * will issue multiple underlying reads if necessary. This supports indices
-     * >2 GB on Linux. We also compare lengths later.
-     */
-    buckets_length = (Py_ssize_t)(index->num_buckets) * (index->key_size + index->value_size);
-    bucket_bytes = PyObject_CallMethod(file_py, "read", "n", buckets_length);
-    if(!bucket_bytes) {
-        assert(PyErr_Occurred());
-        goto fail_free_index;
-    }
-    bytes_read = PyBytes_Size(bucket_bytes);
-    if(PyErr_Occurred()) {
-        /* TypeError, not a bytes() object */
-        goto fail_decref_buckets;
-    }
-    if(bytes_read != buckets_length) {
-        PyErr_Format(PyExc_ValueError, "Could not read buckets (expected %zd, got %zd)", buckets_length, bytes_read);
-        goto fail_decref_buckets;
-    }
-
-    PyObject_GetBuffer(bucket_bytes, &index->buckets_buffer, PyBUF_SIMPLE);
-    if(PyErr_Occurred()) {
-        goto fail_decref_buckets;
-    }
-    index->buckets = index->buckets_buffer.buf;
-
-    index->min_empty = get_min_empty(index->num_buckets);
-    if (index->num_empty == -1)  // we read a legacy index without num_empty value
-        index->num_empty = count_empty(index);
-
-    if(!permit_compact) {
-        if(index->num_empty < index->min_empty) {
-            /* too many tombstones here / not enough empty buckets, do a same-size rebuild */
-            if(!hashindex_resize(index, index->num_buckets)) {
-                PyErr_Format(PyExc_ValueError, "Failed to rebuild table");
-                goto fail_free_buckets;
-            }
-        }
-    }
-
-    /*
-     * Clean intermediary objects up. Note that index is only freed if an error has occurred.
-     * Also note that the buffer in index->buckets_buffer holds a reference to bucket_bytes.
-     */
-
-fail_free_buckets:
-    if(PyErr_Occurred()) {
-        hashindex_free_buckets(index);
-    }
-fail_decref_buckets:
-    Py_DECREF(bucket_bytes);
-fail_free_index:
-    if(PyErr_Occurred()) {
-        free(index);
-        index = NULL;
-    }
-fail:
-    return index;
-}
-#endif
-
-static HashIndex *
-hashindex_init(int capacity, int key_size, int value_size)
-{
-    HashIndex *index;
-    int i;
-    capacity = fit_size(capacity);
-
-    if(!(index = malloc(sizeof(HashIndex)))) {
-        EPRINTF("malloc header failed");
-        return NULL;
-    }
-    if(!(index->buckets = calloc(capacity, key_size + value_size))) {
-        EPRINTF("malloc buckets failed");
-        free(index);
-        return NULL;
-    }
-    index->num_entries = 0;
-    index->key_size = key_size;
-    index->value_size = value_size;
-    index->num_buckets = capacity;
-    index->num_empty = capacity;
-    index->bucket_size = index->key_size + index->value_size;
-    index->lower_limit = get_lower_limit(index->num_buckets);
-    index->upper_limit = get_upper_limit(index->num_buckets);
-    index->min_empty = get_min_empty(index->num_buckets);
-#ifndef BORG_NO_PYTHON
-    index->buckets_buffer.buf = NULL;
-#endif
-    for(i = 0; i < capacity; i++) {
-        BUCKET_MARK_EMPTY(index, i);
-    }
-    return index;
-}
-
-static void
-hashindex_free(HashIndex *index)
-{
-    hashindex_free_buckets(index);
-    free(index);
-}
-
-int
-write_hashheader(HashIndex *index, PyObject *file_py)
-{
-    PyObject *length_object, *tmp;
-    Py_ssize_t length;
-
-    _Static_assert(sizeof(HashHeader) == 1024, "HashHeader struct should be exactly 1024 bytes in size");
-
-    HashHeader header = {
-        .magic = MAGIC,
-        .version = _htole32(2),
-        .num_entries = _htole32(index->num_entries),
-        .num_buckets = _htole32(index->num_buckets),
-        .num_empty = _htole32(index->num_empty),
-        .key_size = _htole32(index->key_size),
-        .value_size = _htole32(index->value_size),
-        .reserved = {0}
-    };
-
-    length_object = PyObject_CallMethod(file_py, "write", "y#", &header, (Py_ssize_t)sizeof(header));
-    if(PyErr_Occurred()) {
-        return 0;
-    }
-    length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError);
-    Py_DECREF(length_object);
-    if(PyErr_Occurred()) {
-        return 0;
-    }
-    if(length != sizeof(header)) {
-        PyErr_SetString(PyExc_ValueError, "Failed to write header");
-        return 0;
-    }
-
-    /*
-     * Hash the header
-     */
-    tmp = PyObject_CallMethod(file_py, "hash_part", "s", "HashHeader");
-    Py_XDECREF(tmp);
-    if(PyErr_Occurred()) {
-        if(PyErr_ExceptionMatches(PyExc_AttributeError)) {
-            /* Be able to work with regular file objects which do not have a hash_part method. */
-            PyErr_Clear();
-        } else {
-            return 0;
-        }
-    }
-    return 1;
-}
-
-#ifndef BORG_NO_PYTHON
-static void
-hashindex_write(HashIndex *index, PyObject *file_py, int legacy)
-{
-    PyObject *length_object, *buckets_view;
-    Py_ssize_t length;
-    Py_ssize_t buckets_length = (Py_ssize_t)index->num_buckets * index->bucket_size;
-
-    assert(!legacy);  // we do not ever write legacy hashindexes
-
-    if(!write_hashheader(index, file_py))
-        return;
-
-    /* Note: explicitly construct view; BuildValue can convert (pointer, length) to Python objects, but copies them for doing so */
-    buckets_view = PyMemoryView_FromMemory((char*)index->buckets, buckets_length, PyBUF_READ);
-    if(!buckets_view) {
-        assert(PyErr_Occurred());
-        return;
-    }
-    length_object = PyObject_CallMethod(file_py, "write", "O", buckets_view);
-    Py_DECREF(buckets_view);
-    if(PyErr_Occurred()) {
-        return;
-    }
-    length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError);
-    Py_DECREF(length_object);
-    if(PyErr_Occurred()) {
-        return;
-    }
-    if(length != buckets_length) {
-        PyErr_SetString(PyExc_ValueError, "Failed to write buckets");
-        return;
-    }
-}
-#endif
-
-static const unsigned char *
-hashindex_get(HashIndex *index, const unsigned char *key)
-{
-    int idx = hashindex_lookup(index, key, NULL);
-    if(idx < 0) {
-        return NULL;
-    }
-    return BUCKET_ADDR(index, idx) + index->key_size;
-}
-
-static int
-hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
-{
-    int start_idx;
-    int idx = hashindex_lookup(index, key, &start_idx);  /* if idx < 0: start_idx -> EMPTY or DELETED */
-    uint8_t *ptr;
-    if(idx < 0)
-    {
-        if(index->num_entries >= index->upper_limit || idx == -2) {
-            /* hashtable too full or even a compact hashtable, grow/rebuild it! */
-            if(!hashindex_resize(index, grow_size(index->num_buckets))) {
-                return 0;
-            }
-            /* we have just built a fresh hashtable and removed all tombstones,
-             * so we only have EMPTY or USED buckets, but no DELETED ones any more.
-             */
-            idx = hashindex_lookup(index, key, &start_idx);
-            assert(idx == -1);
-            assert(BUCKET_IS_EMPTY(index, start_idx));
-        }
-        idx = start_idx;
-        if(BUCKET_IS_EMPTY(index, idx)){
-            if(index->num_empty <= index->min_empty) {
-                /* too many tombstones here / not enough empty buckets, do a same-size rebuild */
-                if(!hashindex_resize(index, index->num_buckets)) {
-                    return 0;
-                }
-                /* we have just built a fresh hashtable and removed all tombstones,
-                 * so we only have EMPTY or USED buckets, but no DELETED ones any more.
-                 */
-                idx = hashindex_lookup(index, key, &start_idx);
-                assert(idx == -1);
-                assert(BUCKET_IS_EMPTY(index, start_idx));
-                idx = start_idx;
-            }
-            index->num_empty--;
-        } else {
-            /* Bucket must be either EMPTY (see above) or DELETED. */
-            assert(BUCKET_IS_DELETED(index, idx));
-        }
-        ptr = BUCKET_ADDR(index, idx);
-        memcpy(ptr, key, index->key_size);
-        memcpy(ptr + index->key_size, value, index->value_size);
-        index->num_entries += 1;
-    }
-    else
-    {
-        memcpy(BUCKET_ADDR(index, idx) + index->key_size, value, index->value_size);
-    }
-    return 1;
-}
-
-static int
-hashindex_delete(HashIndex *index, const unsigned char *key)
-{
-    int idx = hashindex_lookup(index, key, NULL);
-    if (idx < 0) {
-        return -1;
-    }
-    BUCKET_MARK_DELETED(index, idx);
-    index->num_entries -= 1;
-    if(index->num_entries < index->lower_limit) {
-        if(!hashindex_resize(index, shrink_size(index->num_buckets))) {
-            return 0;
-        }
-    }
-    return 1;
-}
-
-static unsigned char *
-hashindex_next_key(HashIndex *index, const unsigned char *key)
-{
-    int idx = 0;
-    if(key) {
-        idx = 1 + (key - index->buckets) / index->bucket_size;
-    }
-    if (idx == index->num_buckets) {
-        return NULL;
-    }
-    while(BUCKET_IS_EMPTY_OR_DELETED(index, idx)) {
-        idx ++;
-        if (idx == index->num_buckets) {
-            return NULL;
-        }
-    }
-    return BUCKET_ADDR(index, idx);
-}
-
-/* Move all non-empty/non-deleted entries in the hash table to the beginning. This does not preserve the order, and it does not mark the previously used entries as empty or deleted. But it reduces num_buckets so that those entries will never be accessed. */
-static uint64_t
-hashindex_compact(HashIndex *index)
-{
-    int idx = index->num_buckets - 1;
-    int tail = 0;
-    uint64_t saved_size = (index->num_buckets - index->num_entries) * (uint64_t)index->bucket_size;
-
-    /* idx will point to the last filled spot and tail will point to the first empty or deleted spot. */
-    for(;;) {
-        /* Find the last filled spot >= index->num_entries. */
-        while((idx >= index->num_entries) && BUCKET_IS_EMPTY_OR_DELETED(index, idx)) {
-            idx--;
-        }
-        /* If all spots >= index->num_entries are empty, then we must be in a compact state. */
-        if(idx < index->num_entries) {
-            break;
-        }
-        /* Find the first empty or deleted spot < index->num_entries. */
-        while((tail < index->num_entries) && !BUCKET_IS_EMPTY_OR_DELETED(index, tail)) {
-            tail++;
-        }
-        assert(tail < index->num_entries);
-        memcpy(BUCKET_ADDR(index, tail), BUCKET_ADDR(index, idx), index->bucket_size);
-        idx--;
-        tail++;
-    }
-
-    index->num_buckets = index->num_entries;
-    index->num_empty = 0;
-    index->min_empty = 0;
-    index->upper_limit = index->num_entries;  /* triggers a resize/rebuild when a new entry is added */
-    return saved_size;
-}
-
-static int
-hashindex_len(HashIndex *index)
-{
-    return index->num_entries;
-}
-
-static int
-hashindex_size(HashIndex *index)
-{
-    return sizeof(HashHeader) + index->num_buckets * index->bucket_size;
-}
-
-/*
- * Used by the FuseVersionsIndex.
- */
-BORG_PACKED(
-typedef struct {
-    uint32_t version;
-    char hash[16];
-} ) FuseVersionsElement;

+ 1 - 1
src/borg/archiver/compact_cmd.py

@@ -65,7 +65,7 @@ class ArchiveGarbageCollector:
             # as we put the wrong size in there, we need to clean up the size:
             self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
         # now self.chunks is an uptodate ChunkIndex, usable for general borg usage!
-        write_chunkindex_to_repo_cache(self.repository, self.chunks, compact=True, clear=True, force_write=True)
+        write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True)
         self.chunks = None  # nothing there (cleared!)
 
     def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:

+ 9 - 9
src/borg/cache.py

@@ -630,18 +630,18 @@ def load_chunks_hash(repository) -> bytes:
     return hash
 
 
-def write_chunkindex_to_repo_cache(repository, chunks, *, compact=False, clear=False, force_write=False):
+CHUNKINDEX_HASH_SEED = 1
+
+
+def write_chunkindex_to_repo_cache(repository, chunks, *, clear=False, force_write=False):
     cached_hash = load_chunks_hash(repository)
-    if compact:
-        # if we don't need the in-memory chunks index anymore:
-        chunks.compact()  # vacuum the hash table
     with io.BytesIO() as f:
         chunks.write(f)
         data = f.getvalue()
     if clear:
         # if we don't need the in-memory chunks index anymore:
         chunks.clear()  # free memory, immediately
-    new_hash = xxh64(data)
+    new_hash = xxh64(data, seed=CHUNKINDEX_HASH_SEED)
     if force_write or new_hash != cached_hash:
         # when an updated chunks index is stored into the cache, we also store its hash into the cache.
         # when a client is loading the chunks index from a cache, it has to compare its xxh64
@@ -669,7 +669,7 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
             # TODO: ^ seem like RemoteRepository raises Repository.ONF instead of StoreONF
             logger.debug("cache/chunks not found in the repository.")
         else:
-            if xxh64(chunks_data) == wanted_hash:
+            if xxh64(chunks_data, seed=CHUNKINDEX_HASH_SEED) == wanted_hash:
                 logger.debug("cache/chunks is valid.")
                 with io.BytesIO(chunks_data) as f:
                     chunks = ChunkIndex.read(f)
@@ -698,7 +698,7 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
     logger.debug(f"queried {num_chunks} chunk IDs in {duration} s, ~{speed}/s")
     if cache_immediately:
         # immediately update cache/chunks, so we only rarely have to do it the slow way:
-        write_chunkindex_to_repo_cache(repository, chunks, compact=False, clear=False, force_write=True)
+        write_chunkindex_to_repo_cache(repository, chunks, clear=False, force_write=True)
     return chunks
 
 
@@ -770,8 +770,8 @@ class ChunksMixin:
         return ChunkListEntry(id, size)
 
     def _write_chunks_cache(self, chunks):
-        # this is called from .close, so we can clear/compact here:
-        write_chunkindex_to_repo_cache(self.repository, self._chunks, compact=True, clear=True)
+        # this is called from .close, so we can clear here:
+        write_chunkindex_to_repo_cache(self.repository, self._chunks, clear=True)
         self._chunks = None  # nothing there (cleared!)
 
     def refresh_lock(self, now):

+ 7 - 41
src/borg/hashindex.pyi

@@ -4,68 +4,34 @@ API_VERSION: str
 
 PATH_OR_FILE = Union[str, IO]
 
-def hashindex_variant(fn: str) -> str: ...
-
-class IndexBase:
-    value_size: int
-    MAX_VALUE: int
-    MAX_LOAD_FACTOR: int
-    def __init__(
-        self, capacity: int = ..., path: PATH_OR_FILE = ..., permit_compact: bool = ..., usable: Union[int, float] = ...
-    ): ...
-    @classmethod
-    def read(cls, path: PATH_OR_FILE, permit_compact: bool = False): ...
-    def write(self, path: PATH_OR_FILE) -> None: ...
-    def clear(self) -> None: ...
-    def setdefault(self, key: bytes, value: bytes) -> None: ...
-    def __delitem__(self, key: bytes) -> None: ...
-    def get(self, key: bytes, default: Any = ...) -> Any: ...
-    def pop(self, key: bytes, default: Any = ...) -> Any: ...
-    def __len__(self) -> int: ...
-    def size(self) -> int: ...
-    def compact(self) -> Any: ...
-
 class ChunkIndexEntry(NamedTuple):
     refcount: int
     size: int
 
 CIE = Union[Tuple[int, int], Type[ChunkIndexEntry]]
 
-class ChunkKeyIterator:
-    def __init__(self, keysize: int) -> None: ...
-    def __iter__(self) -> Iterator: ...
-    def __next__(self) -> Tuple[bytes, Type[ChunkIndexEntry]]: ...
-
-class ChunkIndex(IndexBase):
+class ChunkIndex:
     def add(self, key: bytes, refs: int, size: int) -> None: ...
     def iteritems(self, marker: bytes = ...) -> Iterator: ...
     def __contains__(self, key: bytes) -> bool: ...
     def __getitem__(self, key: bytes) -> Type[ChunkIndexEntry]: ...
     def __setitem__(self, key: bytes, value: CIE) -> None: ...
 
-class NSIndexEntry(NamedTuple):
+class NSIndex1Entry(NamedTuple):
     segment: int
     offset: int
-    size: int
-
-class NSKeyIterator:
-    def __init__(self, keysize: int) -> None: ...
-    def __iter__(self) -> Iterator: ...
-    def __next__(self) -> Tuple[bytes, Type[Any]]: ...
 
-class NSIndex(IndexBase):
+class NSIndex1:  # legacy
     def iteritems(self, *args, **kwargs) -> Iterator: ...
     def __contains__(self, key: bytes) -> bool: ...
     def __getitem__(self, key: bytes) -> Any: ...
     def __setitem__(self, key: bytes, value: Any) -> None: ...
 
-class NSIndex1(IndexBase):  # legacy
-    def iteritems(self, *args, **kwargs) -> Iterator: ...
-    def __contains__(self, key: bytes) -> bool: ...
-    def __getitem__(self, key: bytes) -> Any: ...
-    def __setitem__(self, key: bytes, value: Any) -> None: ...
+class FuseVersionsIndexEntry(NamedTuple):
+    version: int
+    hash: bytes
 
-class FuseVersionsIndex(IndexBase):
+class FuseVersionsIndex:
     def __contains__(self, key: bytes) -> bool: ...
     def __getitem__(self, key: bytes) -> Any: ...
     def __setitem__(self, key: bytes, value: Any) -> None: ...

+ 122 - 412
src/borg/hashindex.pyx

@@ -1,457 +1,167 @@
+from collections.abc import MutableMapping
 from collections import namedtuple
+import os
+import struct
 
-cimport cython
-from libc.stdint cimport uint32_t, UINT32_MAX, uint64_t
-from libc.string cimport memcpy
-from cpython.buffer cimport PyBUF_SIMPLE, PyObject_GetBuffer, PyBuffer_Release
-from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_CheckExact, PyBytes_GET_SIZE, PyBytes_AS_STRING
+from borghash import HashTableNT
 
 API_VERSION = '1.2_01'
 
-
-cdef extern from "_hashindex.c":
-    ctypedef struct HashIndex:
-        pass
-
-    ctypedef struct FuseVersionsElement:
-        uint32_t version
-        char hash[16]
-
-    HashIndex *hashindex_read(object file_py, int permit_compact, int legacy) except *
-    HashIndex *hashindex_init(int capacity, int key_size, int value_size)
-    void hashindex_free(HashIndex *index)
-    int hashindex_len(HashIndex *index)
-    int hashindex_size(HashIndex *index)
-    void hashindex_write(HashIndex *index, object file_py, int legacy) except *
-    unsigned char *hashindex_get(HashIndex *index, unsigned char *key)
-    unsigned char *hashindex_next_key(HashIndex *index, unsigned char *key)
-    int hashindex_delete(HashIndex *index, unsigned char *key)
-    int hashindex_set(HashIndex *index, unsigned char *key, void *value)
-    uint64_t hashindex_compact(HashIndex *index)
-    uint32_t _htole32(uint32_t v)
-    uint32_t _le32toh(uint32_t v)
-
-    double HASH_MAX_LOAD
-
-
-_MAX_VALUE = 4294966271UL  # 2**32 - 1025
-
 cdef _NoDefault = object()
 
-"""
-The HashIndex is *not* a general purpose data structure. The value size must be at least 4 bytes, and these
-first bytes are used for in-band signalling in the data structure itself.
 
-The constant MAX_VALUE defines the valid range for these 4 bytes when interpreted as an uint32_t from 0
-to MAX_VALUE (inclusive). The following reserved values beyond MAX_VALUE are currently in use
-(byte order is LE)::
+class HTProxyMixin:
+    def __setitem__(self, key, value):
+        self.ht[key] = value
 
-    0xffffffff marks empty entries in the hashtable
-    0xfffffffe marks deleted entries in the hashtable
+    def __getitem__(self, key):
+        return self.ht[key]
 
-None of the publicly available classes in this module will accept nor return a reserved value;
-AssertionError is raised instead.
-"""
+    def __delitem__(self, key):
+        del self.ht[key]
 
-assert UINT32_MAX == 2**32-1
+    def __contains__(self, key):
+        return key in self.ht
 
-assert _MAX_VALUE % 2 == 1
+    def __len__(self):
+        return len(self.ht)
 
+    def __iter__(self):
+        for key, value in self.ht.items():
+            yield key
 
-def hashindex_variant(fn):
-    """peek into an index file and find out what it is"""
-    with open(fn, 'rb') as f:
-        magic = f.read(8)  # MAGIC_LEN
-    if magic == b'BORG_IDX':
-        return 1  # legacy
-    if magic == b'BORG2IDX':
-        return 2
-    if magic == b'12345678':  # used by unit tests
-        return 2  # just return the current variant
-    raise ValueError(f'unknown hashindex magic: {magic!r}')
+    def clear(self):
+        self.ht.clear()
 
 
-@cython.internal
-cdef class IndexBase:
-    cdef HashIndex *index
-    cdef int key_size
-    legacy = 0
+ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size')
 
-    _key_size = 32
 
-    MAX_LOAD_FACTOR = HASH_MAX_LOAD
-    MAX_VALUE = _MAX_VALUE
+class ChunkIndex(HTProxyMixin, MutableMapping):
+    """
+    Mapping from key256 to (refcount32, size32) to track chunks in the repository.
+    """
+    MAX_VALUE = 2**32 - 1  # borghash has the full uint32_t range
 
-    def __cinit__(self, capacity=0, path=None, permit_compact=False, usable=None):
-        self.key_size = self._key_size
+    def __init__(self, capacity=1000, path=None, usable=None):
         if path:
-            if isinstance(path, (str, bytes)):
-                with open(path, 'rb') as fd:
-                    self.index = hashindex_read(fd, permit_compact, self.legacy)
-            else:
-                self.index = hashindex_read(path, permit_compact, self.legacy)
-            assert self.index, 'hashindex_read() returned NULL with no exception set'
+            self.ht = HashTableNT.read(path)
         else:
             if usable is not None:
-                capacity = int(usable / self.MAX_LOAD_FACTOR)
-            self.index = hashindex_init(capacity, self.key_size, self.value_size)
-            if not self.index:
-                raise Exception('hashindex_init failed')
+                capacity = usable * 2  # load factor 0.5
+            self.ht = HashTableNT(key_size=32, value_format="<II", value_type=ChunkIndexEntry, capacity=capacity)
 
-    def __dealloc__(self):
-        if self.index:
-            hashindex_free(self.index)
+    def iteritems(self):
+        yield from self.ht.items()
+
+    def add(self, key, refs, size):
+        v = self.get(key, ChunkIndexEntry(0, 0))
+        refcount = min(self.MAX_VALUE, v.refcount + refs)
+        self[key] = v._replace(refcount=refcount, size=size)
 
     @classmethod
-    def read(cls, path, permit_compact=False):
-        return cls(path=path, permit_compact=permit_compact)
+    def read(cls, path):
+        return cls(path=path)
 
     def write(self, path):
-        if isinstance(path, (str, bytes)):
-            with open(path, 'wb') as fd:
-                hashindex_write(self.index, fd, self.legacy)
-        else:
-            hashindex_write(self.index, path, self.legacy)
-
-    def clear(self):
-        hashindex_free(self.index)
-        self.index = hashindex_init(0, self.key_size, self.value_size)
-        if not self.index:
-            raise Exception('hashindex_init failed')
-
-    def setdefault(self, key, value):
-        if not key in self:
-            self[key] = value
-        return self[key]
-
-    def __delitem__(self, key):
-        assert len(key) == self.key_size
-        rc = hashindex_delete(self.index, <unsigned char *>key)
-        if rc == 1:
-            return  # success
-        if rc == -1:
-            raise KeyError(key)
-        if rc == 0:
-            raise Exception('hashindex_delete failed')
-
-    def get(self, key, default=None):
-        try:
-            return self[key]
-        except KeyError:
-            return default
-
-    def pop(self, key, default=_NoDefault):
-        try:
-            value = self[key]
-            del self[key]
-            return value
-        except KeyError:
-            if default != _NoDefault:
-                return default
-            raise
-
-    def __len__(self):
-        return hashindex_len(self.index)
+        self.ht.write(path)
 
     def size(self):
-        """Return size (bytes) of hash table."""
-        return hashindex_size(self.index)
-
-    def compact(self):
-        return hashindex_compact(self.index)
-
-
-cdef class FuseVersionsIndex(IndexBase):
-    # 4 byte version + 16 byte file contents hash
-    value_size = 20
-    _key_size = 16
-
-    def __getitem__(self, key):
-        cdef FuseVersionsElement *data
-        assert len(key) == self.key_size
-        data = <FuseVersionsElement *>hashindex_get(self.index, <unsigned char *>key)
-        if data == NULL:
-            raise KeyError(key)
-        return _le32toh(data.version), PyBytes_FromStringAndSize(data.hash, 16)
-
-    def __setitem__(self, key, value):
-        cdef FuseVersionsElement data
-        assert len(key) == self.key_size
-        data.version = value[0]
-        assert data.version <= _MAX_VALUE, "maximum number of versions reached"
-        if not PyBytes_CheckExact(value[1]) or PyBytes_GET_SIZE(value[1]) != 16:
-            raise TypeError("Expected bytes of length 16 for second value")
-        memcpy(data.hash, PyBytes_AS_STRING(value[1]), 16)
-        data.version = _htole32(data.version)
-        if not hashindex_set(self.index, <unsigned char *>key, <void *> &data):
-            raise Exception('hashindex_set failed')
-
-    def __contains__(self, key):
-        assert len(key) == self.key_size
-        return hashindex_get(self.index, <unsigned char *>key) != NULL
-
-
-NSIndexEntry = namedtuple('NSIndexEntry', 'segment offset size')
+        return self.ht.size()
 
 
-cdef class NSIndex(IndexBase):
+FuseVersionsIndexEntry = namedtuple('FuseVersionsIndexEntry', 'version hash')
 
-    value_size = 12
 
-    def __getitem__(self, key):
-        assert len(key) == self.key_size
-        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
-        if not data:
-            raise KeyError(key)
-        cdef uint32_t segment = _le32toh(data[0])
-        assert segment <= _MAX_VALUE, "maximum number of segments reached"
-        return NSIndexEntry(segment, _le32toh(data[1]), _le32toh(data[2]))
-
-    def __setitem__(self, key, value):
-        assert len(key) == self.key_size
-        cdef uint32_t[3] data
-        assert len(value) == len(data)
-        cdef uint32_t segment = value[0]
-        assert segment <= _MAX_VALUE, "maximum number of segments reached"
-        data[0] = _htole32(segment)
-        data[1] = _htole32(value[1])
-        data[2] = _htole32(value[2])
-        if not hashindex_set(self.index, <unsigned char *>key, data):
-            raise Exception('hashindex_set failed')
-
-    def __contains__(self, key):
-        cdef uint32_t segment
-        assert len(key) == self.key_size
-        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
-        if data != NULL:
-            segment = _le32toh(data[0])
-            assert segment <= _MAX_VALUE, "maximum number of segments reached"
-        return data != NULL
-
-    def iteritems(self, marker=None):
-        """iterate over all items or optionally only over items having specific flag values"""
-        cdef const unsigned char *key
-        iter = NSKeyIterator(self.key_size)
-        iter.idx = self
-        iter.index = self.index
-        if marker:
-            key = hashindex_get(self.index, <unsigned char *>marker)
-            if marker is None:
-                raise IndexError
-            iter.key = key - self.key_size
-        return iter
-
-
-cdef class NSKeyIterator:
-    cdef NSIndex idx
-    cdef HashIndex *index
-    cdef const unsigned char *key
-    cdef int key_size
-    cdef int exhausted
-
-    def __cinit__(self, key_size):
-        self.key = NULL
-        self.key_size = key_size
-        self.exhausted = 0
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        cdef uint32_t *value
-        if self.exhausted:
-            raise StopIteration
-        self.key = hashindex_next_key(self.index, <unsigned char *>self.key)
-        if not self.key:
-            self.exhausted = 1
-            raise StopIteration
-        value = <uint32_t *> (self.key + self.key_size)
-        cdef uint32_t segment = _le32toh(value[0])
-        assert segment <= _MAX_VALUE, "maximum number of segments reached"
-        return ((<char *>self.key)[:self.key_size],
-                NSIndexEntry(segment, _le32toh(value[1]), _le32toh(value[2])))
-
-
-cdef class NSIndex1(IndexBase):  # legacy borg 1.x
-
-    legacy = 1
-    value_size = 8
-
-    def __getitem__(self, key):
-        assert len(key) == self.key_size
-        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
-        if not data:
-            raise KeyError(key)
-        cdef uint32_t segment = _le32toh(data[0])
-        assert segment <= _MAX_VALUE, "maximum number of segments reached"
-        return segment, _le32toh(data[1])
-
-    def __setitem__(self, key, value):
-        assert len(key) == self.key_size
-        cdef uint32_t[2] data
-        cdef uint32_t segment = value[0]
-        assert segment <= _MAX_VALUE, "maximum number of segments reached"
-        data[0] = _htole32(segment)
-        data[1] = _htole32(value[1])
-        if not hashindex_set(self.index, <unsigned char *>key, data):
-            raise Exception('hashindex_set failed')
-
-    def __contains__(self, key):
-        cdef uint32_t segment
-        assert len(key) == self.key_size
-        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
-        if data != NULL:
-            segment = _le32toh(data[0])
-            assert segment <= _MAX_VALUE, "maximum number of segments reached"
-        return data != NULL
-
-    def iteritems(self, marker=None):
-        cdef const unsigned char *key
-        iter = NSKeyIterator1(self.key_size)
-        iter.idx = self
-        iter.index = self.index
-        if marker:
-            key = hashindex_get(self.index, <unsigned char *>marker)
-            if marker is None:
-                raise IndexError
-            iter.key = key - self.key_size
-        return iter
-
-
-cdef class NSKeyIterator1:  # legacy borg 1.x
-    cdef NSIndex1 idx
-    cdef HashIndex *index
-    cdef const unsigned char *key
-    cdef int key_size
-    cdef int exhausted
-
-    def __cinit__(self, key_size):
-        self.key = NULL
-        self.key_size = key_size
-        self.exhausted = 0
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        if self.exhausted:
-            raise StopIteration
-        self.key = hashindex_next_key(self.index, <unsigned char *>self.key)
-        if not self.key:
-            self.exhausted = 1
-            raise StopIteration
-        cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
-        cdef uint32_t segment = _le32toh(value[0])
-        assert segment <= _MAX_VALUE, "maximum number of segments reached"
-        return (<char *>self.key)[:self.key_size], (segment, _le32toh(value[1]))
-
-
-ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size')
-
-
-cdef class ChunkIndex(IndexBase):
+class FuseVersionsIndex(HTProxyMixin, MutableMapping):
     """
-    Mapping of 32 byte keys to (refcount, size), which are all 32-bit unsigned.
-
-    The reference count cannot overflow. If an overflow would occur, the refcount
-    is fixed to MAX_VALUE and will neither increase nor decrease by incref(), decref()
-    or add().
-
-    Prior signed 32-bit overflow is handled correctly for most cases: All values
-    from UINT32_MAX (2**32-1, inclusive) to MAX_VALUE (exclusive) are reserved and either
-    cause silent data loss (-1, -2) or will raise an AssertionError when accessed.
-    Other values are handled correctly. Note that previously the refcount could also reach
-    0 by *increasing* it.
-
-    Assigning refcounts in this reserved range is an invalid operation and raises AssertionError.
+    Mapping from key128 to (file_version32, file_content_hash128) to support the FUSE versions view.
     """
+    def __init__(self):
+        self.ht = HashTableNT(key_size=16, value_format="<I16s", value_type=FuseVersionsIndexEntry)
 
-    value_size = 8
 
-    def __getitem__(self, key):
-        assert len(key) == self.key_size
-        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
-        if not data:
-            raise KeyError(key)
-        cdef uint32_t refcount = _le32toh(data[0])
-        assert refcount <= _MAX_VALUE, "invalid reference count"
-        return ChunkIndexEntry(refcount, _le32toh(data[1]))
+NSIndex1Entry = namedtuple('NSIndex1Entry', 'segment offset')
 
-    def __setitem__(self, key, value):
-        assert len(key) == self.key_size
-        cdef uint32_t[2] data
-        cdef uint32_t refcount = value[0]
-        assert refcount <= _MAX_VALUE, "invalid reference count"
-        data[0] = _htole32(refcount)
-        data[1] = _htole32(value[1])
-        if not hashindex_set(self.index, <unsigned char *>key, data):
-            raise Exception('hashindex_set failed')
 
-    def __contains__(self, key):
-        assert len(key) == self.key_size
-        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
-        if data != NULL:
-            assert _le32toh(data[0]) <= _MAX_VALUE, "invalid reference count"
-        return data != NULL
+class NSIndex1(HTProxyMixin, MutableMapping):
+    """
+    Mapping from key256 to (segment32, offset32), as used by legacy repo index of borg 1.x.
+    """
+    MAX_VALUE = 2**32 - 1  # borghash has the full uint32_t range
+    MAGIC = b"BORG_IDX"  # borg 1.x
+    HEADER_FMT = "<8sIIBB"  # magic, entries, buckets, ksize, vsize
+    VALUE_FMT = "<II"  # borg 1.x on-disk: little-endian segment, offset
+    KEY_SIZE = 32
+    VALUE_SIZE = 8
+
+    def __init__(self, capacity=1000, path=None, usable=None):
+        if usable is not None:
+            capacity = usable * 2  # load factor 0.5
+        self.ht = HashTableNT(key_size=self.KEY_SIZE, value_format=self.VALUE_FMT, value_type=NSIndex1Entry, capacity=capacity)
+        if path:
+            self._read(path)
 
     def iteritems(self, marker=None):
-        cdef const unsigned char *key
-        iter = ChunkKeyIterator(self.key_size)
-        iter.idx = self
-        iter.index = self.index
-        if marker:
-            key = hashindex_get(self.index, <unsigned char *>marker)
-            if marker is None:
-                raise IndexError
-            iter.key = key - self.key_size
-        return iter
-
-    def add(self, key, refs, size):
-        assert len(key) == self.key_size
-        cdef uint32_t[2] data
-        data[0] = _htole32(refs)
-        data[1] = _htole32(size)
-        self._add(<unsigned char*> key, data)
-
-    cdef _add(self, unsigned char *key, uint32_t *data):
-        cdef uint64_t refcount1, refcount2, result64
-        values = <uint32_t*> hashindex_get(self.index, key)
-        if values:
-            refcount1 = _le32toh(values[0])
-            refcount2 = _le32toh(data[0])
-            assert refcount1 <= _MAX_VALUE, "invalid reference count"
-            assert refcount2 <= _MAX_VALUE, "invalid reference count"
-            result64 = refcount1 + refcount2
-            values[0] = _htole32(min(result64, _MAX_VALUE))
-            values[1] = data[1]
-        else:
-            if not hashindex_set(self.index, key, data):
-                raise Exception('hashindex_set failed')
+        do_yield = marker is None
+        for key, value in self.ht.items():
+            if do_yield:
+                yield key, value
+            else:
+                do_yield = key == marker
 
+    @classmethod
+    def read(cls, path):
+        return cls(path=path)
 
-cdef class ChunkKeyIterator:
-    cdef ChunkIndex idx
-    cdef HashIndex *index
-    cdef const unsigned char *key
-    cdef int key_size
-    cdef int exhausted
+    def size(self):
+        return self.ht.size()  # not quite correct as this is not the on-disk read-only format.
 
-    def __cinit__(self, key_size):
-        self.key = NULL
-        self.key_size = key_size
-        self.exhausted = 0
+    def write(self, path):
+        """
+        Write this index in the legacy (borg 1.x) on-disk layout.
+
+        *path* is either a filesystem path (str, bytes or os.PathLike), which is opened for
+        binary writing and closed again afterwards, or an already opened binary file object,
+        which is written to and left open.
+        """
+        # everything that is not a path is handed to _write_fd as a file object, so all
+        # path flavours must be recognized here (checking for str only would misroute
+        # bytes / pathlib paths).
+        if isinstance(path, (str, bytes, os.PathLike)):
+            with open(path, 'wb') as fd:
+                self._write_fd(fd)
+        else:
+            self._write_fd(path)
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        if self.exhausted:
-            raise StopIteration
-        self.key = hashindex_next_key(self.index, <unsigned char *>self.key)
-        if not self.key:
-            self.exhausted = 1
-            raise StopIteration
-        cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
-        cdef uint32_t refcount = _le32toh(value[0])
-        assert refcount <= _MAX_VALUE, "invalid reference count"
-        return (<char *>self.key)[:self.key_size], ChunkIndexEntry(refcount, _le32toh(value[1]))
+    def _read(self, path):
+        """
+        Load index entries from *path* into self.ht.
+
+        *path* is either a filesystem path (str, bytes or os.PathLike), which is opened for
+        binary reading and closed again afterwards, or an already opened binary file object,
+        which is read from and left open.
+        """
+        # everything that is not a path is handed to _read_fd as a file object, so all
+        # path flavours must be recognized here (checking for str only would misroute
+        # bytes / pathlib paths).
+        if isinstance(path, (str, bytes, os.PathLike)):
+            with open(path, 'rb') as fd:
+                self._read_fd(fd)
+        else:
+            self._read_fd(path)
+
+    def _write_fd(self, fd):
+        """
+        Serialize the index to the binary file object *fd*.
+
+        Output is one header (HEADER_FMT: magic, entries, buckets, key size, value size)
+        followed by one key + raw value record per entry of self.ht. Only used entries are
+        written, so the entry count is stored in both the "entries" and the "buckets" field.
+        """
+        entry_count = len(self.ht)
+        fd.write(struct.pack(self.HEADER_FMT, self.MAGIC, entry_count, entry_count, self.KEY_SIZE, self.VALUE_SIZE))
+        written = 0
+        for key, _unused in self.ht.items():
+            # the converted value from items() is not needed, the packed bytes are fetched instead
+            raw_value = self.ht._get_raw(key)
+            fd.write(key)
+            fd.write(raw_value)
+            written += 1
+        # sanity check: the header must announce exactly as many records as were written
+        assert written == entry_count
+
+    def _read_fd(self, fd):
+        """
+        Load index entries from the seekable binary file object *fd* into self.ht.
+
+        The file must consist of one header (HEADER_FMT: magic, entries, buckets, key size,
+        value size) followed by exactly "buckets" records of key + raw value.
+
+        Raises ValueError if the header is truncated, the magic or the key/value sizes do
+        not match, or the file size does not match the announced number of buckets.
+        """
+        header_size = struct.calcsize(self.HEADER_FMT)
+        header_bytes = fd.read(header_size)
+        if len(header_bytes) < header_size:
+            raise ValueError("Invalid file, file is too short (header).")
+        magic, entries, buckets, ksize, vsize = struct.unpack(self.HEADER_FMT, header_bytes)
+        if magic != self.MAGIC:
+            raise ValueError(f"Invalid file, magic {self.MAGIC.decode()} not found.")
+        # these sizes come from the file, so they are input validation: raise ValueError
+        # like for the other "invalid file" cases instead of using assert (which is
+        # stripped when running with -O and raises a different exception type).
+        if ksize != self.KEY_SIZE:
+            raise ValueError(f"Invalid file, invalid key size {ksize}.")
+        if vsize != self.VALUE_SIZE:
+            raise ValueError(f"Invalid file, invalid value size {vsize}.")
+        buckets_size = buckets * (ksize + vsize)
+        current_pos = fd.tell()
+        end_of_file = fd.seek(0, os.SEEK_END)
+        if current_pos + buckets_size != end_of_file:
+            raise ValueError("Invalid file, file size does not match (buckets).")
+        fd.seek(current_pos)
+        # NOTE(review): every bucket is stored as-is and the "entries" header field is not
+        # used. If files can contain unused buckets (borg 1.x marked empty / deleted buckets
+        # with 0xffffffff / 0xfffffffe in the first value word), they would end up as
+        # entries here - confirm that only compacted files are read.
+        for _ in range(buckets):
+            key = fd.read(ksize)
+            value = fd.read(vsize)
+            self.ht._set_raw(key, value)
+        pos = fd.tell()
+        # internal sanity check: the size check above guarantees that we end exactly at EOF
+        assert pos == end_of_file

+ 11 - 15
src/borg/legacyrepository.py

@@ -13,7 +13,7 @@ from itertools import islice
 from typing import Callable, DefaultDict
 
 from .constants import *  # NOQA
-from .hashindex import NSIndexEntry, NSIndex, NSIndex1, hashindex_variant
+from .hashindex import NSIndex1Entry, NSIndex1
 from .helpers import Error, ErrorWithTraceback, IntegrityError, format_file_size, parse_file_size
 from .helpers import Location
 from .helpers import ProgressIndicatorPercent
@@ -562,16 +562,12 @@ class LegacyRepository:
 
     def open_index(self, transaction_id, auto_recover=True):
         if transaction_id is None:
-            return NSIndex()
+            return NSIndex1()
         index_path = os.path.join(self.path, "index.%d" % transaction_id)
-        variant = hashindex_variant(index_path)
         integrity_data = self._read_integrity(transaction_id, "index")
         try:
             with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd:
-                if variant == 2:
-                    return NSIndex.read(fd)
-                if variant == 1:  # legacy
-                    return NSIndex1.read(fd)
+                return NSIndex1.read(fd)
         except (ValueError, OSError, FileIntegrityError) as exc:
             logger.warning("Repository index missing or corrupted, trying to recover from: %s", exc)
             os.unlink(index_path)
@@ -864,7 +860,7 @@ class LegacyRepository:
                     except LoggedIO.SegmentFull:
                         complete_xfer()
                         new_segment, offset = self.io.write_put(key, data)
-                    self.index[key] = NSIndexEntry(new_segment, offset, len(data))
+                    self.index[key] = NSIndex1Entry(new_segment, offset)
                     segments.setdefault(new_segment, 0)
                     segments[new_segment] += 1
                     segments[segment] -= 1
@@ -1001,7 +997,7 @@ class LegacyRepository:
                     self.shadow_index.setdefault(key, []).append(in_index.segment)
                 except KeyError:
                     pass
-                self.index[key] = NSIndexEntry(segment, offset, size)
+                self.index[key] = NSIndex1Entry(segment, offset)
                 self.segments[segment] += 1
                 self.storage_quota_use += header_size(tag) + size
             elif tag == TAG_DELETE:
@@ -1015,7 +1011,7 @@ class LegacyRepository:
                         # the old index is not necessarily valid for this transaction (e.g. compaction); if the segment
                         # is already gone, then it was already compacted.
                         self.segments[in_index.segment] -= 1
-                        self.compact[in_index.segment] += header_size(tag) + in_index.size
+                        self.compact[in_index.segment] += header_size(tag) + 0
                         self.shadow_index.setdefault(key, []).append(in_index.segment)
             elif tag == TAG_COMMIT:
                 continue
@@ -1219,8 +1215,8 @@ class LegacyRepository:
         if not self.index:
             self.index = self.open_index(self.get_transaction_id())
         try:
-            in_index = NSIndexEntry(*((self.index[id] + (None,))[:3]))  # legacy: index entries have no size element
-            return self.io.read(in_index.segment, in_index.offset, id, expected_size=in_index.size, read_data=read_data)
+            in_index = NSIndex1Entry(*(self.index[id][:2]))  # legacy: index entries have no size element
+            return self.io.read(in_index.segment, in_index.offset, id, read_data=read_data)
         except KeyError:
             raise self.ObjectNotFound(id, self.path) from None
 
@@ -1245,12 +1241,12 @@ class LegacyRepository:
             # it is essential to do a delete first to get correct quota bookkeeping
             # and also a correctly updated shadow_index, so that the compaction code
             # does not wrongly resurrect an old PUT by dropping a DEL that is still needed.
-            self._delete(id, in_index.segment, in_index.offset, in_index.size)
+            self._delete(id, in_index.segment, in_index.offset, 0)
         segment, offset = self.io.write_put(id, data)
         self.storage_quota_use += header_size(TAG_PUT2) + len(data)
         self.segments.setdefault(segment, 0)
         self.segments[segment] += 1
-        self.index[id] = NSIndexEntry(segment, offset, len(data))
+        self.index[id] = NSIndex1Entry(segment, offset)
         if self.storage_quota and self.storage_quota_use > self.storage_quota:
             self.transaction_doomed = self.StorageQuotaExceeded(
                 format_file_size(self.storage_quota), format_file_size(self.storage_quota_use)
@@ -1269,7 +1265,7 @@ class LegacyRepository:
             in_index = self.index.pop(id)
         except KeyError:
             raise self.ObjectNotFound(id, self.path) from None
-        self._delete(id, in_index.segment, in_index.offset, in_index.size)
+        self._delete(id, in_index.segment, in_index.offset, 0)
 
     def _delete(self, id, segment, offset, size):
         # common code used by put and delete

+ 2 - 2
src/borg/repository.py

@@ -193,7 +193,7 @@ class Repository:
             # to build the ChunkIndex the slow way by listing all the directories.
             from borg.cache import write_chunkindex_to_repo_cache
 
-            write_chunkindex_to_repo_cache(self, ChunkIndex(), compact=True, clear=True, force_write=True)
+            write_chunkindex_to_repo_cache(self, ChunkIndex(), clear=True, force_write=True)
         finally:
             self.store.close()
 
@@ -385,7 +385,7 @@ class Repository:
                     # if we did a full pass in one go, we built a complete, uptodate ChunkIndex, cache it!
                     from .cache import write_chunkindex_to_repo_cache
 
-                    write_chunkindex_to_repo_cache(self, chunks, compact=True, clear=True, force_write=True)
+                    write_chunkindex_to_repo_cache(self, chunks, clear=True, force_write=True)
         except StoreObjectNotFound:
             # it can be that there is no "data/" at all, then it crashes when iterating infos.
             pass

+ 2 - 9
src/borg/selftest.py

@@ -21,19 +21,12 @@ import sys
 import time
 from unittest import TestResult, TestSuite, defaultTestLoader
 
-from .testsuite.hashindex_test import HashIndexDataTestCase, HashIndexRefcountingTestCase, HashIndexTestCase
 from .testsuite.crypto_test import CryptoTestCase
 from .testsuite.chunker_test import ChunkerTestCase
 
-SELFTEST_CASES = [
-    HashIndexDataTestCase,
-    HashIndexRefcountingTestCase,
-    HashIndexTestCase,
-    CryptoTestCase,
-    ChunkerTestCase,
-]
+SELFTEST_CASES = [CryptoTestCase, ChunkerTestCase]
 
-SELFTEST_COUNT = 19
+SELFTEST_COUNT = 11
 
 
 class SelfTestResult(TestResult):

+ 0 - 75
src/borg/testsuite/hashindex_pytest_test.py

@@ -1,75 +0,0 @@
-# more hashindex tests. kept separate so we can use pytest here.
-
-import os
-import random
-
-import pytest
-
-from ..hashindex import NSIndex
-
-
-def verify_hash_table(kv, idx):
-    """kv should be a python dictionary and idx an NSIndex.  Check that idx
-    has the expected entries and the right number of entries.
-    """
-    for k, v in kv.items():
-        assert k in idx and idx[k] == (v, v, v)
-    assert len(idx) == len(kv)
-
-
-def make_hashtables(*, entries, loops):
-    idx = NSIndex()
-    kv = {}
-    for i in range(loops):
-        # put some entries
-        for j in range(entries):
-            k = random.randbytes(32)
-            v = random.randint(0, NSIndex.MAX_VALUE - 1)
-            idx[k] = (v, v, v)
-            kv[k] = v
-        # check and delete a random amount of entries
-        delete_keys = random.sample(list(kv), k=random.randint(0, len(kv)))
-        for k in delete_keys:
-            v = kv.pop(k)
-            assert idx.pop(k) == (v, v, v)
-        verify_hash_table(kv, idx)
-    return idx, kv
-
-
-@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
-def test_hashindex_stress():
-    """checks if the hashtable behaves as expected
-
-    This can be used in _hashindex.c before running this test to provoke more collisions (don't forget to compile):
-    #define HASH_MAX_LOAD .99
-    #define HASH_MAX_EFF_LOAD .999
-    """
-    make_hashtables(entries=10000, loops=1000)  # we do quite some assertions while making them
-
-
-def test_hashindex_compact():
-    """test that we do not lose or corrupt data by the compaction nor by expanding/rebuilding"""
-    idx, kv = make_hashtables(entries=5000, loops=5)
-    size_noncompact = idx.size()
-    # compact the hashtable (remove empty/tombstone buckets)
-    saved_space = idx.compact()
-    # did we actually compact (reduce space usage)?
-    size_compact = idx.size()
-    assert saved_space > 0
-    assert size_noncompact - size_compact == saved_space
-    # did we lose anything?
-    verify_hash_table(kv, idx)
-    # now expand the hashtable again. trigger a resize/rebuild by adding an entry.
-    k = b"x" * 32
-    idx[k] = (0, 0, 0)
-    kv[k] = 0
-    size_rebuilt = idx.size()
-    assert size_rebuilt > size_compact + 1
-    # did we lose anything?
-    verify_hash_table(kv, idx)
-
-
-@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
-def test_hashindex_compact_stress():
-    for _ in range(100):
-        test_hashindex_compact()

+ 18 - 405
src/borg/testsuite/hashindex_test.py

@@ -1,16 +1,9 @@
-# Note: these tests are part of the self test, do not use or import pytest functionality here.
-#       See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
-
-import base64
 import hashlib
-import io
-import os
-import tempfile
-import zlib
+import struct
+
+import pytest
 
-from ..hashindex import NSIndex, ChunkIndex
-from ..crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError
-from . import BaseTestCase, unopened_tempfile
+from ..hashindex import ChunkIndex
 
 
 def H(x):
@@ -23,399 +16,19 @@ def H2(x):
     return hashlib.sha256(H(x)).digest()
 
 
-class HashIndexTestCase(BaseTestCase):
-    def _generic_test(self, cls, make_value, sha):
-        idx = cls()
-        self.assert_equal(len(idx), 0)
-        # Test set
-        for x in range(100):
-            idx[H(x)] = make_value(x)
-        self.assert_equal(len(idx), 100)
-        for x in range(100):
-            self.assert_equal(idx[H(x)], make_value(x))
-        # Test update
-        for x in range(100):
-            idx[H(x)] = make_value(x * 2)
-        self.assert_equal(len(idx), 100)
-        for x in range(100):
-            self.assert_equal(idx[H(x)], make_value(x * 2))
-        # Test delete
-        for x in range(50):
-            del idx[H(x)]
-        # Test some keys still in there
-        for x in range(50, 100):
-            assert H(x) in idx
-        # Test some keys not there any more
-        for x in range(50):
-            assert H(x) not in idx
-        # Test delete non-existing key
-        for x in range(50):
-            self.assert_raises(KeyError, idx.__delitem__, H(x))
-        self.assert_equal(len(idx), 50)
-        with unopened_tempfile() as filepath:
-            idx.write(filepath)
-            del idx
-            # Verify file contents
-            with open(filepath, "rb") as fd:
-                self.assert_equal(hashlib.sha256(fd.read()).hexdigest(), sha)
-            # Make sure we can open the file
-            idx = cls.read(filepath)
-            self.assert_equal(len(idx), 50)
-            for x in range(50, 100):
-                self.assert_equal(idx[H(x)], make_value(x * 2))
-            idx.clear()
-            self.assert_equal(len(idx), 0)
-            idx.write(filepath)
-            del idx
-            self.assert_equal(len(cls.read(filepath)), 0)
-        idx = cls()
-        # Test setdefault - set non-existing key
-        idx.setdefault(H(0), make_value(42))
-        assert H(0) in idx
-        assert idx[H(0)] == make_value(42)
-        # Test setdefault - do not set existing key
-        idx.setdefault(H(0), make_value(23))
-        assert H(0) in idx
-        assert idx[H(0)] == make_value(42)
-        # Test setdefault - get-like return value, key not present
-        assert idx.setdefault(H(1), make_value(23)) == make_value(23)
-        # Test setdefault - get-like return value, key present
-        assert idx.setdefault(H(0), make_value(23)) == make_value(42)
-        # clean up setdefault test
-        del idx
-
-    def test_nsindex(self):
-        self._generic_test(
-            NSIndex, lambda x: (x, x, x), "640b909cf07884cc11fdf5431ffc27dee399770ceadecce31dffecd130a311a3"
-        )
-
-    def test_chunkindex(self):
-        self._generic_test(
-            ChunkIndex, lambda x: (x, x), "5915fcf986da12e5f3ac68e05242b9c729e6101b0460b1d4e4a9e9f7cdf1b7da"
-        )
-
-    def test_resize(self):
-        n = 2000  # Must be >= MIN_BUCKETS
-        with unopened_tempfile() as filepath:
-            idx = NSIndex()
-            idx.write(filepath)
-            initial_size = os.path.getsize(filepath)
-            self.assert_equal(len(idx), 0)
-            for x in range(n):
-                idx[H(x)] = x, x, x
-            idx.write(filepath)
-            assert initial_size < os.path.getsize(filepath)
-            for x in range(n):
-                del idx[H(x)]
-            self.assert_equal(len(idx), 0)
-            idx.write(filepath)
-            self.assert_equal(initial_size, os.path.getsize(filepath))
-
-    def test_iteritems(self):
-        idx = NSIndex()
-        for x in range(100):
-            idx[H(x)] = x, x, x
-        iterator = idx.iteritems()
-        all = list(iterator)
-        self.assert_equal(len(all), 100)
-        # iterator is already exhausted by list():
-        self.assert_raises(StopIteration, next, iterator)
-        second_half = list(idx.iteritems(marker=all[49][0]))
-        self.assert_equal(len(second_half), 50)
-        self.assert_equal(second_half, all[50:])
-
-
-class HashIndexExtraTestCase(BaseTestCase):
-    """These tests are separate because they should not become part of the selftest."""
-
-    def test_chunk_indexer(self):
-        # see _hashindex.c hash_sizes, we want to be close to the max. load
-        # because interesting errors happen there.
-        key_count = int(65537 * ChunkIndex.MAX_LOAD_FACTOR) - 10
-        index = ChunkIndex(key_count)
-        all_keys = [hashlib.sha256(H(k)).digest() for k in range(key_count)]
-        # we're gonna delete 1/3 of all_keys, so let's split them 2/3 and 1/3:
-        keys, to_delete_keys = all_keys[0 : (2 * key_count // 3)], all_keys[(2 * key_count // 3) :]
-
-        for i, key in enumerate(keys):
-            index[key] = (i, i)
-        for i, key in enumerate(to_delete_keys):
-            index[key] = (i, i)
-
-        for key in to_delete_keys:
-            del index[key]
-        for i, key in enumerate(keys):
-            assert index[key] == (i, i)
-        for key in to_delete_keys:
-            assert index.get(key) is None
-
-        # now delete every key still in the index
-        for key in keys:
-            del index[key]
-        # the index should now be empty
-        assert list(index.iteritems()) == []
-
-
-class HashIndexSizeTestCase(BaseTestCase):
-    def test_size_on_disk(self):
-        idx = ChunkIndex()
-        assert idx.size() == 1024 + 1031 * (32 + 2 * 4)
-
-    def test_size_on_disk_accurate(self):
-        idx = ChunkIndex()
-        for i in range(1234):
-            idx[H(i)] = i, i**2
-        with unopened_tempfile() as filepath:
-            idx.write(filepath)
-            size = os.path.getsize(filepath)
-        assert idx.size() == size
-
-
-class HashIndexRefcountingTestCase(BaseTestCase):
-    def test_chunkindex_add(self):
-        idx1 = ChunkIndex()
-        idx1.add(H(1), 5, 6)
-        assert idx1[H(1)] == (5, 6)
-        idx1.add(H(1), 1, 2)
-        assert idx1[H(1)] == (6, 2)
-
-    def test_setitem_raises(self):
-        idx1 = ChunkIndex()
-        with self.assert_raises(AssertionError):
-            idx1[H(1)] = ChunkIndex.MAX_VALUE + 1, 0
-
-    def test_keyerror(self):
-        idx = ChunkIndex()
-        with self.assert_raises(KeyError):
-            idx[H(1)]
-        with self.assert_raises(OverflowError):
-            idx.add(H(1), -1, 0)
-
-
-class HashIndexDataTestCase(BaseTestCase):
-    # This bytestring was created with borg2-pre 2022-09-30
-    HASHINDEX = (
-        b"eJzt0DEKgwAMQNFoBXsMj9DqDUQoToKTR3Hzwr2DZi+0HS19HwIZHhnST/OjHYeljIhLTl1FVDlN7te"
-        b"Q9M/tGcdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHM"
-        b"dxHMdxHMdxHMdxHMdxHMdxHPfqbu+7F2nKz67Nc9sX97r1+Rt/4TiO4ziO4ziO4ziO4ziO4ziO4ziO4"
-        b"ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO487lDoRvHEk="
-    )
-
-    def _serialize_hashindex(self, idx):
-        with tempfile.TemporaryDirectory() as tempdir:
-            file = os.path.join(tempdir, "idx")
-            idx.write(file)
-            with open(file, "rb") as f:
-                return self._pack(f.read())
-
-    def _deserialize_hashindex(self, bytestring):
-        with tempfile.TemporaryDirectory() as tempdir:
-            file = os.path.join(tempdir, "idx")
-            with open(file, "wb") as f:
-                f.write(self._unpack(bytestring))
-            return ChunkIndex.read(file)
-
-    def _pack(self, bytestring):
-        return base64.b64encode(zlib.compress(bytestring))
-
-    def _unpack(self, bytestring):
-        return zlib.decompress(base64.b64decode(bytestring))
-
-    def test_identical_creation(self):
-        idx1 = ChunkIndex()
-        idx1[H(1)] = 1, 2
-        idx1[H(2)] = 2**31 - 1, 0
-        idx1[H(3)] = 4294962296, 0  # 4294962296 is -5000 interpreted as an uint32_t
-
-        serialized = self._serialize_hashindex(idx1)
-        assert self._unpack(serialized) == self._unpack(self.HASHINDEX)
-
-
-class HashIndexIntegrityTestCase(HashIndexDataTestCase):
-    def write_integrity_checked_index(self, tempdir):
-        idx = self._deserialize_hashindex(self.HASHINDEX)
-        file = os.path.join(tempdir, "idx")
-        with IntegrityCheckedFile(path=file, write=True) as fd:
-            idx.write(fd)
-        integrity_data = fd.integrity_data
-        assert "final" in integrity_data
-        assert "HashHeader" in integrity_data
-        return file, integrity_data
-
-    def test_integrity_checked_file(self):
-        with tempfile.TemporaryDirectory() as tempdir:
-            file, integrity_data = self.write_integrity_checked_index(tempdir)
-            with open(file, "r+b") as fd:
-                fd.write(b"Foo")
-            with self.assert_raises(FileIntegrityError):
-                with IntegrityCheckedFile(path=file, write=False, integrity_data=integrity_data) as fd:
-                    ChunkIndex.read(fd)
-
-
-class HashIndexCompactTestCase(HashIndexDataTestCase):
-    def index(self, num_entries, num_buckets, num_empty):
-        index_data = io.BytesIO()
-        index_data.write(b"BORG2IDX")
-        # version
-        index_data.write((2).to_bytes(4, "little"))
-        # num_entries
-        index_data.write(num_entries.to_bytes(4, "little"))
-        # num_buckets
-        index_data.write(num_buckets.to_bytes(4, "little"))
-        # num_empty
-        index_data.write(num_empty.to_bytes(4, "little"))
-        # key_size
-        index_data.write((32).to_bytes(4, "little"))
-        # value_size
-        index_data.write((3 * 4).to_bytes(4, "little"))
-        # reserved
-        index_data.write(bytes(1024 - 32))
-
-        self.index_data = index_data
-
-    def index_from_data(self):
-        self.index_data.seek(0)
-        # Since we are trying to carefully control the layout of the hashindex,
-        # we set permit_compact to prevent hashindex_read from resizing the hash table.
-        index = ChunkIndex.read(self.index_data, permit_compact=True)
-        return index
-
-    def write_entry(self, key, *values):
-        self.index_data.write(key)
-        for value in values:
-            self.index_data.write(value.to_bytes(4, "little"))
-
-    def write_empty(self, key):
-        self.write_entry(key, 0xFFFFFFFF, 0, 0)
-
-    def write_deleted(self, key):
-        self.write_entry(key, 0xFFFFFFFE, 0, 0)
-
-    def compare_indexes(self, idx1, idx2):
-        """Check that the two hash tables contain the same data.  idx1
-        is allowed to have "mis-filed" entries, because we only need to
-        iterate over it.  But idx2 needs to support lookup."""
-        for k, v in idx1.iteritems():
-            assert v == idx2[k]
-        assert len(idx1) == len(idx2)
-
-    def compare_compact(self, layout):
-        """A generic test of a hashindex with the specified layout.  layout should
-        be a string consisting only of the characters '*' (filled), 'D' (deleted)
-        and 'E' (empty).
-        """
-        num_buckets = len(layout)
-        num_empty = layout.count("E")
-        num_entries = layout.count("*")
-        self.index(num_entries=num_entries, num_buckets=num_buckets, num_empty=num_empty)
-        k = 0
-        for c in layout:
-            if c == "D":
-                self.write_deleted(H2(k))
-            elif c == "E":
-                self.write_empty(H2(k))
-            else:
-                assert c == "*"
-                self.write_entry(H2(k), 3 * k + 1, 3 * k + 2, 3 * k + 3)
-            k += 1
-        idx = self.index_from_data()
-        cpt = self.index_from_data()
-        cpt.compact()
-        # Note that idx is not a valid hash table, since the entries are not
-        # stored where they should be.  So lookups of the form idx[k] can fail.
-        # But cpt is a valid hash table, since there are no empty buckets.
-        assert idx.size() == 1024 + num_buckets * (32 + 3 * 4)
-        assert cpt.size() == 1024 + num_entries * (32 + 3 * 4)
-        self.compare_indexes(idx, cpt)
-
-    def test_simple(self):
-        self.compare_compact("*DE**E")
-
-    def test_first_empty(self):
-        self.compare_compact("D*E**E")
-
-    def test_last_used(self):
-        self.compare_compact("D*E*E*")
-
-    def test_too_few_empty_slots(self):
-        self.compare_compact("D**EE*")
-
-    def test_empty(self):
-        self.compare_compact("DEDEED")
-
-    def test_num_buckets_zero(self):
-        self.compare_compact("")
-
-    def test_already_compact(self):
-        self.compare_compact("***")
-
-    def test_all_at_front(self):
-        self.compare_compact("*DEEED")
-        self.compare_compact("**DEED")
-        self.compare_compact("***EED")
-        self.compare_compact("****ED")
-        self.compare_compact("*****D")
-
-    def test_all_at_back(self):
-        self.compare_compact("EDEEE*")
-        self.compare_compact("DEDE**")
-        self.compare_compact("DED***")
-        self.compare_compact("ED****")
-        self.compare_compact("D*****")
-
-
-class NSIndexTestCase(BaseTestCase):
-    def test_nsindex_segment_limit(self):
-        idx = NSIndex()
-        with self.assert_raises(AssertionError):
-            idx[H(1)] = NSIndex.MAX_VALUE + 1, 0, 0
-        assert H(1) not in idx
-        idx[H(2)] = NSIndex.MAX_VALUE, 0, 0
-        assert H(2) in idx
-
-
-class AllIndexTestCase(BaseTestCase):
-    def test_max_load_factor(self):
-        assert NSIndex.MAX_LOAD_FACTOR < 1.0
-        assert ChunkIndex.MAX_LOAD_FACTOR < 1.0
-
-
-class IndexCorruptionTestCase(BaseTestCase):
-    def test_bug_4829(self):
-        from struct import pack
-
-        def HH(x, y, z):
-            # make some 32byte long thing that depends on x, y, z.
-            # same x will mean a collision in the hashtable as bucket index is computed from
-            # first 4 bytes. giving a specific x targets bucket index x.
-            # y is to create different keys and does not go into the bucket index calculation.
-            # so, same x + different y --> collision
-            return pack("<IIIIIIII", x, y, z, 0, 0, 0, 0, 0)  # 8 * 4 == 32
-
-        idx = NSIndex()
-
-        # create lots of colliding entries
-        for y in range(700):  # stay below max load not to trigger resize
-            idx[HH(0, y, 0)] = (0, y, 0)
-
-        assert idx.size() == 1024 + 1031 * 44  # header + 1031 buckets
-
-        # delete lots of the collisions, creating lots of tombstones
-        for y in range(400):  # stay above min load not to trigger resize
-            del idx[HH(0, y, 0)]
-
-        # create lots of colliding entries, within the not yet used part of the hashtable
-        for y in range(330):  # stay below max load not to trigger resize
-            # at y == 259 a resize will happen due to going beyond max EFFECTIVE load
-            # if the bug is present, that element will be inserted at the wrong place.
-            # and because it will be at the wrong place, it can not be found again.
-            idx[HH(600, y, 0)] = 600, y, 0
-
-        # now check if hashtable contents is as expected:
-
-        assert [idx.get(HH(0, y, 0)) for y in range(400, 700)] == [(0, y, 0) for y in range(400, 700)]
+def test_chunkindex_add():
+    chunks = ChunkIndex()
+    x = H2(1)
+    chunks.add(x, 5, 6)
+    assert chunks[x] == (5, 6)
+    chunks.add(x, 1, 2)
+    assert chunks[x] == (6, 2)
 
-        assert [HH(0, y, 0) in idx for y in range(400)] == [False for y in range(400)]  # deleted entries
 
-        # this will fail at HH(600, 259) if the bug is present.
-        assert [idx.get(HH(600, y, 0)) for y in range(330)] == [(600, y, 0) for y in range(330)]
+def test_keyerror():
+    chunks = ChunkIndex()
+    x = H2(1)
+    with pytest.raises(KeyError):
+        chunks[x]
+    with pytest.raises(struct.error):
+        chunks.add(x, -1, 0)

+ 5 - 55
src/borg/testsuite/legacyrepository_test.py

@@ -7,7 +7,7 @@ from unittest.mock import patch
 import pytest
 
 from ..checksums import xxh64
-from ..hashindex import NSIndex
+from ..hashindex import NSIndex1
 from ..helpers import Location
 from ..helpers import IntegrityError
 from ..helpers import msgpack
@@ -234,7 +234,7 @@ def test_max_data_size(repo_fixtures, request):
 
 def _assert_sparse(repository):
     # the superseded 123456... PUT
-    assert repository.compact[0] == 41 + 8 + len(fchunk(b"123456789"))
+    assert repository.compact[0] == 41 + 8 + 0  # len(fchunk(b"123456789"))
     # a COMMIT
     assert repository.compact[1] == 9
     # the DELETE issued by the superseding PUT (or issued directly)
@@ -268,7 +268,7 @@ def test_sparse_delete(repository):
         repository.delete(H(0))
         repository.io._write_fd.sync()
         # the on-line tracking works on a per-object basis...
-        assert repository.compact[0] == 41 + 8 + 41 + len(chunk0)
+        assert repository.compact[0] == 41 + 8 + 41 + 0  # len(chunk0) information is lost
         repository._rebuild_sparse(0)
         # ...while _rebuild_sparse can mark whole segments as completely sparse (which then includes the segment magic)
         assert repository.compact[0] == 41 + 8 + 41 + len(chunk0) + len(MAGIC)
@@ -564,56 +564,6 @@ def test_create_free_space(repository):
         assert not os.path.exists(repository.path)
 
 
-def test_tracking(repository):
-    with repository:
-        assert repository.storage_quota_use == 0
-        ch1 = fchunk(bytes(1234))
-        repository.put(H(1), ch1)
-        assert repository.storage_quota_use == len(ch1) + 41 + 8
-        ch2 = fchunk(bytes(5678))
-        repository.put(H(2), ch2)
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + 2 * (41 + 8)
-        repository.delete(H(1))
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + 2 * (41 + 8)  # we have not compacted yet
-        repository.commit(compact=False)
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + 2 * (41 + 8)  # we have not compacted yet
-    with reopen(repository) as repository:
-        # open new transaction; hints and thus quota data is not loaded unless needed.
-        ch3 = fchunk(b"")
-        repository.put(H(3), ch3)
-        repository.delete(H(3))
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + len(ch3) + 3 * (
-            41 + 8
-        )  # we have not compacted yet
-        repository.commit(compact=True)
-        assert repository.storage_quota_use == len(ch2) + 41 + 8
-
-
-def test_exceed_quota(repository):
-    with repository:
-        assert repository.storage_quota_use == 0
-        repository.storage_quota = 80
-        ch1 = fchunk(b"x" * 7)
-        repository.put(H(1), ch1)
-        assert repository.storage_quota_use == len(ch1) + 41 + 8
-        repository.commit(compact=False)
-        with pytest.raises(LegacyRepository.StorageQuotaExceeded):
-            ch2 = fchunk(b"y" * 13)
-            repository.put(H(2), ch2)
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + (41 + 8) * 2  # check ch2!?
-        with pytest.raises(LegacyRepository.StorageQuotaExceeded):
-            repository.commit(compact=False)
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + (41 + 8) * 2  # check ch2!?
-    with reopen(repository) as repository:
-        repository.storage_quota = 161
-        # open new transaction; hints and thus quota data is not loaded unless needed.
-        repository.put(H(1), ch1)
-        # we have 2 puts for H(1) here and not yet compacted.
-        assert repository.storage_quota_use == len(ch1) * 2 + (41 + 8) * 2
-        repository.commit(compact=True)
-        assert repository.storage_quota_use == len(ch1) + 41 + 8  # now we have compacted.
-
-
 def make_auxiliary(repository):
     with repository:
         repository.put(H(0), fchunk(b"foo"))
@@ -805,12 +755,12 @@ def get_head(repo_path):
 
 
 def open_index(repo_path):
-    return NSIndex.read(os.path.join(repo_path, f"index.{get_head(repo_path)}"))
+    return NSIndex1.read(os.path.join(repo_path, f"index.{get_head(repo_path)}"))
 
 
 def corrupt_object(repo_path, id_):
     idx = open_index(repo_path)
-    segment, offset, _ = idx[H(id_)]
+    segment, offset = idx[H(id_)]
     with open(os.path.join(repo_path, "data", "0", str(segment)), "r+b") as fd:
         fd.seek(offset)
         fd.write(b"BOOM")