
Merge pull request #7347 from jdchristensen/hashindex_compact

Make hashindex_compact simpler and probably faster
TW 2 years ago
Parent
Current commit
52040bc043
3 changed files with 106 additions and 144 deletions
  1. src/borg/_hashindex.c (+24 -47)
  2. src/borg/testsuite/hashindex.py (+64 -86)
  3. src/borg/testsuite/hashindex_pytest.py (+18 -11)

+ 24 - 47
src/borg/_hashindex.c

@@ -113,6 +113,7 @@ static int hash_sizes[] = {
 
 #define BUCKET_IS_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == DELETED)
 #define BUCKET_IS_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == EMPTY)
+#define BUCKET_IS_EMPTY_OR_DELETED(index, idx) (BUCKET_IS_EMPTY(index, idx) || BUCKET_IS_DELETED(index, idx))
 
 #define BUCKET_MARK_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = DELETED)
 #define BUCKET_MARK_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = EMPTY)
@@ -569,11 +570,11 @@ hashindex_read(PyObject *file_py, int permit_compact, int legacy)
     }
     index->buckets = index->buckets_buffer.buf;
 
-    if(!permit_compact) {
-        index->min_empty = get_min_empty(index->num_buckets);
-        if (index->num_empty == -1)  // we read a legacy index without num_empty value
-            index->num_empty = count_empty(index);
+    index->min_empty = get_min_empty(index->num_buckets);
+    if (index->num_empty == -1)  // we read a legacy index without num_empty value
+        index->num_empty = count_empty(index);
 
+    if(!permit_compact) {
         if(index->num_empty < index->min_empty) {
             /* too many tombstones here / not enough empty buckets, do a same-size rebuild */
             if(!hashindex_resize(index, index->num_buckets)) {
@@ -819,7 +820,7 @@ hashindex_next_key(HashIndex *index, const unsigned char *key)
     if (idx == index->num_buckets) {
         return NULL;
     }
-    while(BUCKET_IS_EMPTY(index, idx) || BUCKET_IS_DELETED(index, idx)) {
+    while(BUCKET_IS_EMPTY_OR_DELETED(index, idx)) {
         idx ++;
         if (idx == index->num_buckets) {
             return NULL;
@@ -828,56 +829,32 @@ hashindex_next_key(HashIndex *index, const unsigned char *key)
     return BUCKET_ADDR(index, idx);
 }
 
+/* Move all non-empty/non-deleted entries in the hash table to the
+ * beginning. This does not preserve the order, and it does not mark the
+ * previously used entries as empty or deleted. But it reduces num_buckets
+ * so that those entries will never be accessed. */
 static uint64_t
 hashindex_compact(HashIndex *index)
 {
-    int idx = 0;
-    int start_idx;
-    int begin_used_idx;
-    int empty_slot_count, count, buckets_to_copy;
-    int compact_tail_idx = 0;
+    int idx = index->num_buckets - 1;
+    int tail = 0;
     uint64_t saved_size = (index->num_buckets - index->num_entries) * (uint64_t)index->bucket_size;
 
-    if(index->num_buckets - index->num_entries == 0) {
-        /* already compact */
-        return 0;
-    }
-
-    while(idx < index->num_buckets) {
-        /* Phase 1: Find some empty slots */
-        start_idx = idx;
-        while((idx < index->num_buckets) && (BUCKET_IS_EMPTY(index, idx) || BUCKET_IS_DELETED(index, idx))) {
-            idx++;
-        }
-
-        /* everything from start_idx to idx-1 (inclusive) is empty or deleted */
-        count = empty_slot_count = idx - start_idx;
-        begin_used_idx = idx;
-
-        if(!empty_slot_count) {
-            /* In case idx==compact_tail_idx, the areas overlap */
-            memmove(BUCKET_ADDR(index, compact_tail_idx), BUCKET_ADDR(index, idx), index->bucket_size);
-            idx++;
-            compact_tail_idx++;
-            continue;
-        }
-
-        /* Phase 2: Find some non-empty/non-deleted slots we can move to the compact tail */
-
-        while(empty_slot_count && (idx < index->num_buckets) && !(BUCKET_IS_EMPTY(index, idx) || BUCKET_IS_DELETED(index, idx))) {
-            idx++;
-            empty_slot_count--;
+    /* idx will point to the last filled spot and tail will point to the first empty or deleted spot. */
+    for(;;) {
+        /* Find the last filled spot >= index->num_entries. */
+        while((idx >= index->num_entries) && BUCKET_IS_EMPTY_OR_DELETED(index, idx)) {
+            idx--;
         }
-
-        buckets_to_copy = count - empty_slot_count;
-
-        if(!buckets_to_copy) {
-            /* Nothing to move, reached end of the buckets array with no used buckets. */
+        /* If all spots >= index->num_entries are empty, then we must be in a compact state. */
+        if(idx < index->num_entries) {
             break;
         }
-
-        memcpy(BUCKET_ADDR(index, compact_tail_idx), BUCKET_ADDR(index, begin_used_idx), buckets_to_copy * index->bucket_size);
-        compact_tail_idx += buckets_to_copy;
+        /* Find the first empty or deleted spot < index->num_entries. */
+        while((tail < index->num_entries) && !BUCKET_IS_EMPTY_OR_DELETED(index, tail)) {
+            tail++;
+        }
+        assert(tail < index->num_entries);
+        memcpy(BUCKET_ADDR(index, tail), BUCKET_ADDR(index, idx), index->bucket_size);
+        idx--;
+        tail++;
     }
 
     index->num_buckets = index->num_entries;
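
For reference, the new algorithm in Python terms: idx walks backwards from the end picking up filled buckets that sit at positions >= num_entries, and each one is dropped into the next hole that tail finds scanning forwards. A minimal sketch (a hypothetical stand-alone helper, not borg code), with None standing in for empty/deleted buckets:

    def compact(buckets, num_entries):
        """Move all filled buckets into the first num_entries slots.

        Order is not preserved and the vacated tail is not cleared;
        the caller truncates to num_entries, so the tail is never read.
        """
        idx = len(buckets) - 1   # scans filled buckets from the back
        tail = 0                 # scans holes from the front
        while True:
            # Find the last filled bucket at a position >= num_entries.
            while idx >= num_entries and buckets[idx] is None:
                idx -= 1
            # If every position >= num_entries is a hole, we are compact.
            if idx < num_entries:
                break
            # Find the first hole at a position < num_entries.
            while tail < num_entries and buckets[tail] is not None:
                tail += 1
            assert tail < num_entries
            buckets[tail] = buckets[idx]
            idx -= 1
            tail += 1
        return buckets[:num_entries]   # num_buckets = num_entries

    # Example: three entries spread over six buckets.
    assert sorted(compact([10, None, None, 20, 30, None], 3)) == [10, 20, 30]

Unlike the removed version, this never copies between overlapping ranges: the source is always at or past num_entries and the destination always before it, so plain memcpy suffices, and at most min(num_entries, num_buckets - num_entries) buckets are copied in total.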

+ 64 - 86
src/borg/testsuite/hashindex.py

@@ -461,20 +461,11 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
 
     def index_from_data(self):
         self.index_data.seek(0)
-        index = ChunkIndex.read(self.index_data)
+        # Since we are trying to carefully control the layout of the hashindex,
+        # we set permit_compact to prevent hashindex_read from resizing the hash table.
+        index = ChunkIndex.read(self.index_data, permit_compact=True)
         return index
 
-    def index_to_data(self, index):
-        data = io.BytesIO()
-        index.write(data)
-        return data.getvalue()
-
-    def index_from_data_compact_to_data(self):
-        index = self.index_from_data()
-        index.compact()
-        compact_index = self.index_to_data(index)
-        return compact_index
-
     def write_entry(self, key, *values):
         self.index_data.write(key)
         for value in values:
@@ -486,87 +477,77 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
     def write_deleted(self, key):
         self.write_entry(key, 0xFFFFFFFE, 0, 0)
 
+    def compare_indexes(self, idx1, idx2):
+        """Check that the two hash tables contain the same data.  idx1
+        is allowed to have "mis-filed" entries, because we only need to
+        iterate over it.  But idx2 needs to support lookup."""
+        for k, v in idx1.iteritems():
+            assert v == idx2[k]
+        assert len(idx1) == len(idx2)
+
+    def compare_compact(self, layout):
+        """A generic test of a hashindex with the specified layout.  layout should
+        be a string consisting only of the characters '*' (filled), 'D' (deleted)
+        and 'E' (empty).
+        """
+        num_buckets = len(layout)
+        num_empty = layout.count("E")
+        num_entries = layout.count("*")
+        self.index(num_entries=num_entries, num_buckets=num_buckets, num_empty=num_empty)
+        k = 0
+        for c in layout:
+            if c == "D":
+                self.write_deleted(H2(k))
+            elif c == "E":
+                self.write_empty(H2(k))
+            else:
+                assert c == "*"
+                self.write_entry(H2(k), 3 * k + 1, 3 * k + 2, 3 * k + 3)
+            k += 1
+        idx = self.index_from_data()
+        cpt = self.index_from_data()
+        cpt.compact()
+        # Note that idx is not a valid hash table, since the entries are not
+        # stored where they should be.  So lookups of the form idx[k] can fail.
+        # But cpt is a valid hash table, since there are no empty buckets.
+        assert idx.size() == 1024 + num_buckets * (32 + 3 * 4)
+        assert cpt.size() == 1024 + num_entries * (32 + 3 * 4)
+        self.compare_indexes(idx, cpt)
+
     def test_simple(self):
-        self.index(num_entries=3, num_buckets=6, num_empty=2)
-        self.write_entry(H2(0), 1, 2, 3)
-        self.write_deleted(H2(1))
-        self.write_empty(H2(2))
-        self.write_entry(H2(3), 5, 6, 7)
-        self.write_entry(H2(4), 8, 9, 10)
-        self.write_empty(H2(5))
-
-        compact_index = self.index_from_data_compact_to_data()
-
-        self.index(num_entries=3, num_buckets=3, num_empty=0)
-        self.write_entry(H2(0), 1, 2, 3)
-        self.write_entry(H2(3), 5, 6, 7)
-        self.write_entry(H2(4), 8, 9, 10)
-        assert compact_index == self.index_data.getvalue()
+        self.compare_compact("*DE**E")
 
     def test_first_empty(self):
-        self.index(num_entries=3, num_buckets=6, num_empty=2)
-        self.write_deleted(H2(1))
-        self.write_entry(H2(0), 1, 2, 3)
-        self.write_empty(H2(2))
-        self.write_entry(H2(3), 5, 6, 7)
-        self.write_entry(H2(4), 8, 9, 10)
-        self.write_empty(H2(5))
-
-        compact_index = self.index_from_data_compact_to_data()
-
-        self.index(num_entries=3, num_buckets=3, num_empty=0)
-        self.write_entry(H2(0), 1, 2, 3)
-        self.write_entry(H2(3), 5, 6, 7)
-        self.write_entry(H2(4), 8, 9, 10)
-        assert compact_index == self.index_data.getvalue()
+        self.compare_compact("D*E**E")
 
     def test_last_used(self):
-        self.index(num_entries=3, num_buckets=6, num_empty=2)
-        self.write_deleted(H2(1))
-        self.write_entry(H2(0), 1, 2, 3)
-        self.write_empty(H2(2))
-        self.write_entry(H2(3), 5, 6, 7)
-        self.write_empty(H2(5))
-        self.write_entry(H2(4), 8, 9, 10)
-
-        compact_index = self.index_from_data_compact_to_data()
-
-        self.index(num_entries=3, num_buckets=3, num_empty=0)
-        self.write_entry(H2(0), 1, 2, 3)
-        self.write_entry(H2(3), 5, 6, 7)
-        self.write_entry(H2(4), 8, 9, 10)
-        assert compact_index == self.index_data.getvalue()
+        self.compare_compact("D*E*E*")
 
     def test_too_few_empty_slots(self):
-        self.index(num_entries=3, num_buckets=6, num_empty=2)
-        self.write_deleted(H2(1))
-        self.write_entry(H2(0), 1, 2, 3)
-        self.write_entry(H2(3), 5, 6, 7)
-        self.write_empty(H2(2))
-        self.write_empty(H2(5))
-        self.write_entry(H2(4), 8, 9, 10)
-
-        compact_index = self.index_from_data_compact_to_data()
-
-        self.index(num_entries=3, num_buckets=3, num_empty=0)
-        self.write_entry(H2(0), 1, 2, 3)
-        self.write_entry(H2(3), 5, 6, 7)
-        self.write_entry(H2(4), 8, 9, 10)
-        assert compact_index == self.index_data.getvalue()
+        self.compare_compact("D**EE*")
 
     def test_empty(self):
-        self.index(num_entries=0, num_buckets=6, num_empty=3)
-        self.write_deleted(H2(1))
-        self.write_empty(H2(0))
-        self.write_deleted(H2(3))
-        self.write_empty(H2(2))
-        self.write_empty(H2(5))
-        self.write_deleted(H2(4))
+        self.compare_compact("DEDEED")
 
-        compact_index = self.index_from_data_compact_to_data()
+    def test_num_buckets_zero(self):
+        self.compare_compact("")
 
-        self.index(num_entries=0, num_buckets=0, num_empty=0)
-        assert compact_index == self.index_data.getvalue()
+    def test_already_compact(self):
+        self.compare_compact("***")
+
+    def test_all_at_front(self):
+        self.compare_compact("*DEEED")
+        self.compare_compact("**DEED")
+        self.compare_compact("***EED")
+        self.compare_compact("****ED")
+        self.compare_compact("*****D")
+
+    def test_all_at_back(self):
+        self.compare_compact("EDEEE*")
+        self.compare_compact("DEDE**")
+        self.compare_compact("DED***")
+        self.compare_compact("ED****")
+        self.compare_compact("D*****")
 
     def test_merge(self):
         master = ChunkIndex()
@@ -576,11 +557,8 @@ class HashIndexCompactTestCase(HashIndexDataTestCase):
         idx1[H(3)] = 3, 300
         idx1.compact()
         assert idx1.size() == 1024 + 3 * (32 + 2 * 4)
-
         master.merge(idx1)
-        assert master[H(1)] == (1, 100)
-        assert master[H(2)] == (2, 200)
-        assert master[H(3)] == (3, 300)
+        self.compare_indexes(idx1, master)
 
 
 class NSIndexTestCase(BaseTestCase):
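
A quick worked check of the size assertions in compare_compact above, using the bucket layout they assume (1024-byte header, 32-byte keys, three 4-byte values per bucket):

    layout = "*DE**E"                 # the test_simple layout
    num_buckets = len(layout)         # 6
    num_entries = layout.count("*")   # 3
    bucket_size = 32 + 3 * 4          # key + three values = 44 bytes
    assert 1024 + num_buckets * bucket_size == 1288   # idx.size() before compact
    assert 1024 + num_entries * bucket_size == 1156   # cpt.size() after compact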

+ 18 - 11
src/borg/testsuite/hashindex_pytest.py

@@ -8,6 +8,15 @@ import pytest
 from ..hashindex import NSIndex
 
 
+def verify_hash_table(kv, idx):
+    """kv should be a python dictionary and idx an NSIndex.  Check that idx
+    has the expected entries and the right number of entries.
+    """
+    for k, v in kv.items():
+        assert k in idx and idx[k] == (v, v, v)
+    assert len(idx) == len(kv)
+
+
 def make_hashtables(*, entries, loops):
     idx = NSIndex()
     kv = {}
@@ -23,11 +32,7 @@ def make_hashtables(*, entries, loops):
         for k in delete_keys:
             v = kv.pop(k)
             assert idx.pop(k) == (v, v, v)
-        # check if remaining entries are as expected
-        for k, v in kv.items():
-            assert idx[k] == (v, v, v)
-        # check entry count
-        assert len(kv) == len(idx)
+        verify_hash_table(kv, idx)
     return idx, kv
 
 
@@ -53,9 +58,7 @@ def test_hashindex_compact():
     assert saved_space > 0
     assert size_noncompact - size_compact == saved_space
     # did we lose anything?
-    for k, v in kv.items():
-        assert k in idx and idx[k] == (v, v, v)
-    assert len(idx) == len(kv)
+    verify_hash_table(kv, idx)
     # now expand the hashtable again. trigger a resize/rebuild by adding an entry.
     k = b"x" * 32
     idx[k] = (0, 0, 0)
@@ -63,6 +66,10 @@ def test_hashindex_compact():
     size_rebuilt = idx.size()
     assert size_rebuilt > size_compact + 1
     # did we lose anything?
-    for k, v in kv.items():
-        assert k in idx and idx[k] == (v, v, v)
-    assert len(idx) == len(kv)
+    verify_hash_table(kv, idx)
+
+
+@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
+def test_hashindex_compact_stress():
+    for _ in range(100):
+        test_hashindex_compact()
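
Note that the skipif gate only checks for the variable's presence, not its value, so the stress test can be enabled with e.g. BORG_TESTS_SLOW=1 pytest src/borg/testsuite/hashindex_pytest.py.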