7 月之前 · 94effcd782
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -1874,7 +1874,7 @@ class ArchiveChecker:
 
				             # either we already have this chunk in repo and chunks index or we add it now
			
 
				             if id_ not in self.chunks:
			
 
				                 assert cdata is not None
			
 
				-                self.chunks[id_] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=size)
			
 
				+                self.chunks[id_] = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=size)
			
 
				                 if self.repair:
			
 
				                     self.repository.put(id_, cdata)
			
 
				 
			
--- a/src/borg/archiver/compact_cmd.py
+++ b/src/borg/archiver/compact_cmd.py
@@ -50,20 +50,20 @@ class ArchiveGarbageCollector:
 
				         """Build a dict id -> size of all chunks present in the repository"""
			
 
				         chunks = ChunkIndex()
			
 
				         for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
			
 
				-            # we add this id to the chunks index, using refcount == 0, because
			
 
				+            # we add this id to the chunks index (as an unused chunk), because
			
 
				             # we do not know yet whether it is actually referenced from some archives.
			
 
				             # we "abuse" the size field here. usually there is the plaintext size,
			
 
				             # but we use it for the size of the stored object here.
			
 
				-            chunks[id] = ChunkIndexEntry(refcount=0, size=stored_size)
			
 
				+            chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
			
 
				         return chunks
			
 
				 
			
 
				     def save_chunk_index(self):
			
 
				         # first clean up:
			
 
				         for id, entry in self.chunks.iteritems():
			
 
				             # we already deleted the unused chunks, so everything left must be used:
			
 
				-            assert entry.refcount == ChunkIndex.MAX_VALUE
			
 
				+            assert entry.flags & ChunkIndex.F_USED
			
 
				             # as we put the wrong size in there, we need to clean up the size:
			
 
				-            self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
			
 
				+            self.chunks[id] = entry._replace(size=0)
			
 
				         # now self.chunks is an uptodate ChunkIndex, usable for general borg usage!
			
 
				         write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True)
			
 
				         self.chunks = None  # nothing there (cleared!)
			
@@ -74,8 +74,8 @@ class ArchiveGarbageCollector:
 
				         def use_it(id, *, wanted=False):
			
 
				             entry = self.chunks.get(id)
			
 
				             if entry is not None:
			
 
				-                # the chunk is in the repo, mark it used by setting refcount to max.
			
 
				-                self.chunks[id] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=entry.size)
			
 
				+                # the chunk is in the repo, mark it used.
			
 
				+                self.chunks[id] = entry._replace(flags=entry.flags | ChunkIndex.F_USED)
			
 
				                 if wanted:
			
 
				                     # chunk id is from chunks_healthy list: a lost chunk has re-appeared!
			
 
				                     reappeared_chunks.add(id)
			
@@ -131,7 +131,7 @@ class ArchiveGarbageCollector:
 
				         logger.info("Determining unused objects...")
			
 
				         unused = set()
			
 
				         for id, entry in self.chunks.iteritems():
			
 
				-            if entry.refcount == 0:
			
 
				+            if not (entry.flags & ChunkIndex.F_USED):
			
 
				                 unused.add(id)
			
 
				         logger.info(f"Deleting {len(unused)} unused objects...")
			
 
				         pi = ProgressIndicatorPercent(
			
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -396,7 +396,7 @@ class FilesCacheMixin:
 
				         for id, size in entry.chunks:
			
 
				             cie = self.chunks.get(id)
			
 
				             assert cie is not None
			
 
				-            assert cie.refcount > 0
			
 
				+            assert cie.flags & ChunkIndex.F_USED
			
 
				             assert size == cie.size
			
 
				             idx = self.chunks.k_to_idx(id)
			
 
				             compressed_chunks.append(idx)
			
@@ -415,7 +415,7 @@ class FilesCacheMixin:
 
				             id = self.chunks.idx_to_k(idx)
			
 
				             cie = self.chunks.get(id)
			
 
				             assert cie is not None
			
 
				-            assert cie.refcount > 0
			
 
				+            assert cie.flags & ChunkIndex.F_USED
			
 
				             assert cie.size > 0
			
 
				             chunks.append((id, cie.size))
			
 
				         entry = entry._replace(chunks=chunks)
			
@@ -722,10 +722,9 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
 
				     chunks = ChunkIndex()
			
 
				     t0 = perf_counter()
			
 
				     num_chunks = 0
			
 
				-    # The repo says it has these chunks, so we assume they are referenced chunks.
			
 
				-    # We do not care for refcounting anymore, so we just set refcount = MAX_VALUE.
			
 
				+    # The repo says it has these chunks, so we assume they are referenced/used chunks.
			
 
				     # We do not know the plaintext size (!= stored_size), thus we set size = 0.
			
 
				-    init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
			
 
				+    init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
			
 
				     for id, stored_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
			
 
				         num_chunks += 1
			
 
				         chunks[id] = init_entry
			
@@ -809,7 +808,7 @@ class ChunksMixin:
 
				         )
			
 
				         self.repository.put(id, cdata, wait=wait)
			
 
				         self.last_refresh_dt = now  # .put also refreshed the lock
			
 
				-        self.chunks.add(id, ChunkIndex.MAX_VALUE, size)
			
 
				+        self.chunks.add(id, 1, size)
			
 
				         stats.update(size, not exists)
			
 
				         return ChunkListEntry(id, size)
			
 
				 
			
--- a/src/borg/hashindex.pyx
+++ b/src/borg/hashindex.pyx
@@ -34,14 +34,16 @@ class HTProxyMixin:
 
				         self.ht.clear()
			
 
				 
			
 
				 
			
 
				-ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size')
			
 
				+ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size')
			
 
				 
			
 
				 
			
 
				 class ChunkIndex(HTProxyMixin, MutableMapping):
			
 
				     """
			
 
				     Mapping from key256 to (refcount32, size32) to track chunks in the repository.
			
 
				     """
			
 
				-    MAX_VALUE = 2**32 - 1  # borghash has the full uint32_t range
			
 
				+    # .flags is a bit field; individual flag values are powers of two: 2^0 .. 2^31
			
 
				+    F_NONE = 0  # all flags cleared
			
 
				+    F_USED = 1  # chunk is used/referenced
			
 
				 
			
 
				     def __init__(self, capacity=1000, path=None, usable=None):
			
 
				         if path:
			
@@ -55,9 +57,14 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
 
				         yield from self.ht.items()
			
 
				 
			
 
				     def add(self, key, refs, size):
			
 
				-        v = self.get(key, ChunkIndexEntry(0, 0))
			
 
				-        refcount = min(self.MAX_VALUE, v.refcount + refs)
			
 
				-        self[key] = v._replace(refcount=refcount, size=size)
			
 
				+        assert refs > 0
			
 
				+        v = self.get(key)
			
 
				+        if v is None:
			
 
				+            flags = self.F_USED
			
 
				+        else:
			
 
				+            flags = v.flags | self.F_USED
			
 
				+            assert v.size == 0 or v.size == size
			
 
				+        self[key] = ChunkIndexEntry(flags=flags, size=size)
			
 
				 
			
 
				     @classmethod
			
 
				     def read(cls, path):
			
--- a/src/borg/repository.py
+++ b/src/borg/repository.py
@@ -324,10 +324,10 @@ class Repository:
 
				         objs_checked = objs_errors = 0
			
 
				         chunks = ChunkIndex()
			
 
				         # we don't do refcounting anymore, neither we can know here whether any archive
			
 
				-        # is using this object, but we assume that this is the case and set refcount to
			
 
				-        # MAX_VALUE. As we don't do garbage collection here, this is not a problem.
			
 
				+        # is using this object, but we assume that this is the case.
			
 
				+        # As we don't do garbage collection here, this is not a problem.
			
 
				         # We also don't know the plaintext size, so we set it to 0.
			
 
				-        init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
			
 
				+        init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
			
 
				         infos = self.store.list("data")
			
 
				         try:
			
 
				             for info in infos:
			
--- a/src/borg/testsuite/hashindex_test.py
+++ b/src/borg/testsuite/hashindex_test.py
@@ -3,7 +3,7 @@ import struct
 
				 
			
 
				 import pytest
			
 
				 
			
 
				-from ..hashindex import ChunkIndex
			
 
				+from ..hashindex import ChunkIndex, ChunkIndexEntry
			
 
				 
			
 
				 
			
 
				 def H(x):
			
@@ -19,10 +19,14 @@ def H2(x):
 
				 def test_chunkindex_add():
			
 
				     chunks = ChunkIndex()
			
 
				     x = H2(1)
			
 
				-    chunks.add(x, 5, 6)
			
 
				-    assert chunks[x] == (5, 6)
			
 
				+    chunks.add(x, 1, 0)
			
 
				+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
			
 
				+    chunks.add(x, 1, 2)  # updating size (we do not have a size yet)
			
 
				+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
			
 
				     chunks.add(x, 1, 2)
			
 
				-    assert chunks[x] == (6, 2)
			
 
				+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
			
 
				+    with pytest.raises(AssertionError):
			
 
				+        chunks.add(x, 1, 3)  # inconsistent size (we already have a different size)
			
 
				 
			
 
				 
			
 
				 def test_keyerror():
			
@@ -31,4 +35,4 @@ def test_keyerror():
 
				     with pytest.raises(KeyError):
			
 
				         chunks[x]
			
 
				     with pytest.raises(struct.error):
			
 
				-        chunks.add(x, -1, 0)
			
 
				+        chunks[x] = ChunkIndexEntry(flags=2**33, size=0)