浏览代码

ChunkIndex: .refcount -> .flags

We gave up refcounting quite a while ago and are only interested
in whether a chunk is used (referenced) or not (orphan).

So, let's keep that uint32_t value, but use it for bit flags, so that
we can also use it to efficiently remember other chunk-related state.
Thomas Waldmann 7 月之前
父节点
当前提交
94effcd782
共有 6 个文件被更改,包括 37 次插入和 27 次删除
  1. 1 1
      src/borg/archive.py
  2. 7 7
      src/borg/archiver/compact_cmd.py
  3. 5 6
      src/borg/cache.py
  4. 12 5
      src/borg/hashindex.pyx
  5. 3 3
      src/borg/repository.py
  6. 9 5
      src/borg/testsuite/hashindex_test.py

+ 1 - 1
src/borg/archive.py

@@ -1874,7 +1874,7 @@ class ArchiveChecker:
             # either we already have this chunk in repo and chunks index or we add it now
             if id_ not in self.chunks:
                 assert cdata is not None
-                self.chunks[id_] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=size)
+                self.chunks[id_] = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=size)
                 if self.repair:
                     self.repository.put(id_, cdata)
 

+ 7 - 7
src/borg/archiver/compact_cmd.py

@@ -50,20 +50,20 @@ class ArchiveGarbageCollector:
         """Build a dict id -> size of all chunks present in the repository"""
         chunks = ChunkIndex()
         for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
-            # we add this id to the chunks index, using refcount == 0, because
+            # we add this id to the chunks index (as unused chunk), because
             # we do not know yet whether it is actually referenced from some archives.
             # we "abuse" the size field here. usually there is the plaintext size,
             # but we use it for the size of the stored object here.
-            chunks[id] = ChunkIndexEntry(refcount=0, size=stored_size)
+            chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
         return chunks
 
     def save_chunk_index(self):
         # first clean up:
         for id, entry in self.chunks.iteritems():
             # we already deleted the unused chunks, so everything left must be used:
-            assert entry.refcount == ChunkIndex.MAX_VALUE
+            assert entry.flags & ChunkIndex.F_USED
             # as we put the wrong size in there, we need to clean up the size:
-            self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
+            self.chunks[id] = entry._replace(size=0)
         # now self.chunks is an uptodate ChunkIndex, usable for general borg usage!
         write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True)
         self.chunks = None  # nothing there (cleared!)
@@ -74,8 +74,8 @@ class ArchiveGarbageCollector:
         def use_it(id, *, wanted=False):
             entry = self.chunks.get(id)
             if entry is not None:
-                # the chunk is in the repo, mark it used by setting refcount to max.
-                self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=entry.size)
+                # the chunk is in the repo, mark it used.
+                self.chunks[id] = entry._replace(flags=entry.flags | ChunkIndex.F_USED)
                 if wanted:
                     # chunk id is from chunks_healthy list: a lost chunk has re-appeared!
                     reappeared_chunks.add(id)
@@ -131,7 +131,7 @@ class ArchiveGarbageCollector:
         logger.info("Determining unused objects...")
         unused = set()
         for id, entry in self.chunks.iteritems():
-            if entry.refcount == 0:
+            if not (entry.flags & ChunkIndex.F_USED):
                 unused.add(id)
         logger.info(f"Deleting {len(unused)} unused objects...")
         pi = ProgressIndicatorPercent(

+ 5 - 6
src/borg/cache.py

@@ -396,7 +396,7 @@ class FilesCacheMixin:
         for id, size in entry.chunks:
             cie = self.chunks.get(id)
             assert cie is not None
-            assert cie.refcount > 0
+            assert cie.flags & ChunkIndex.F_USED
             assert size == cie.size
             idx = self.chunks.k_to_idx(id)
             compressed_chunks.append(idx)
@@ -415,7 +415,7 @@ class FilesCacheMixin:
             id = self.chunks.idx_to_k(idx)
             cie = self.chunks.get(id)
             assert cie is not None
-            assert cie.refcount > 0
+            assert cie.flags & ChunkIndex.F_USED
             assert cie.size > 0
             chunks.append((id, cie.size))
         entry = entry._replace(chunks=chunks)
@@ -722,10 +722,9 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
     chunks = ChunkIndex()
     t0 = perf_counter()
     num_chunks = 0
-    # The repo says it has these chunks, so we assume they are referenced chunks.
-    # We do not care for refcounting anymore, so we just set refcount = MAX_VALUE.
+    # The repo says it has these chunks, so we assume they are referenced/used chunks.
     # We do not know the plaintext size (!= stored_size), thus we set size = 0.
-    init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
+    init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
     for id, stored_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
         num_chunks += 1
         chunks[id] = init_entry
@@ -809,7 +808,7 @@ class ChunksMixin:
         )
         self.repository.put(id, cdata, wait=wait)
         self.last_refresh_dt = now  # .put also refreshed the lock
-        self.chunks.add(id, ChunkIndex.MAX_VALUE, size)
+        self.chunks.add(id, 1, size)
         stats.update(size, not exists)
         return ChunkListEntry(id, size)
 

+ 12 - 5
src/borg/hashindex.pyx

@@ -34,14 +34,16 @@ class HTProxyMixin:
         self.ht.clear()
 
 
-ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size')
+ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size')
 
 
 class ChunkIndex(HTProxyMixin, MutableMapping):
     """
     Mapping from key256 to (refcount32, size32) to track chunks in the repository.
     """
-    MAX_VALUE = 2**32 - 1  # borghash has the full uint32_t range
+    # .flags values: 2^0 .. 2^31
+    F_NONE = 0  # all flags cleared
+    F_USED = 1  # chunk is used/referenced
 
     def __init__(self, capacity=1000, path=None, usable=None):
         if path:
@@ -55,9 +57,14 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
         yield from self.ht.items()
 
     def add(self, key, refs, size):
-        v = self.get(key, ChunkIndexEntry(0, 0))
-        refcount = min(self.MAX_VALUE, v.refcount + refs)
-        self[key] = v._replace(refcount=refcount, size=size)
+        assert refs > 0
+        v = self.get(key)
+        if v is None:
+            flags = self.F_USED
+        else:
+            flags = v.flags | self.F_USED
+            assert v.size == 0 or v.size == size
+        self[key] = ChunkIndexEntry(flags=flags, size=size)
 
     @classmethod
     def read(cls, path):

+ 3 - 3
src/borg/repository.py

@@ -324,10 +324,10 @@ class Repository:
         objs_checked = objs_errors = 0
         chunks = ChunkIndex()
         # we don't do refcounting anymore, neither we can know here whether any archive
-        # is using this object, but we assume that this is the case and set refcount to
-        # MAX_VALUE. As we don't do garbage collection here, this is not a problem.
+        # is using this object, but we assume that this is the case.
+        # As we don't do garbage collection here, this is not a problem.
         # We also don't know the plaintext size, so we set it to 0.
-        init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
+        init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
         infos = self.store.list("data")
         try:
             for info in infos:

+ 9 - 5
src/borg/testsuite/hashindex_test.py

@@ -3,7 +3,7 @@ import struct
 
 import pytest
 
-from ..hashindex import ChunkIndex
+from ..hashindex import ChunkIndex, ChunkIndexEntry
 
 
 def H(x):
@@ -19,10 +19,14 @@ def H2(x):
 def test_chunkindex_add():
     chunks = ChunkIndex()
     x = H2(1)
-    chunks.add(x, 5, 6)
-    assert chunks[x] == (5, 6)
+    chunks.add(x, 1, 0)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
+    chunks.add(x, 1, 2)  # updating size (we do not have a size yet)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
     chunks.add(x, 1, 2)
-    assert chunks[x] == (6, 2)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
+    with pytest.raises(AssertionError):
+        chunks.add(x, 1, 3)  # inconsistent size (we already have a different size)
 
 
 def test_keyerror():
@@ -31,4 +35,4 @@ def test_keyerror():
     with pytest.raises(KeyError):
         chunks[x]
     with pytest.raises(struct.error):
-        chunks.add(x, -1, 0)
+        chunks[x] = ChunkIndexEntry(flags=2**33, size=0)