
Repository.list: return [(id, stored_size), ...]

Note: LegacyRepository still returns [id, ...], and so does RemoteRepository.list
if the remote repo is a LegacyRepository.

also: use LIST_SCAN_LIMIT instead of a hardcoded limit
Thomas Waldmann, 9 months ago
commit c67cf07522
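
For callers that may talk to either repository flavor, a small shim can normalize the two return shapes (a sketch; normalize_list_result is a hypothetical helper, not part of this commit):

    def normalize_list_result(result):
        # Repository.list returns [(id, stored_size), ...], while
        # LegacyRepository (and RemoteRepository wrapping one) still
        # returns bare ids; normalize to tuples, None = size unknown.
        return [entry if isinstance(entry, tuple) else (entry, None) for entry in result]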

+ 20 - 20
src/borg/archive.py

@@ -1696,10 +1696,10 @@ class ArchiveChecker:
             result = self.repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
             if not result:
                 break
-            marker = result[-1]
-            init_entry = ChunkIndexEntry(refcount=0, size=0)
-            for id_ in result:
-                self.chunks[id_] = init_entry
+            marker = result[-1][0]
+            init_entry = ChunkIndexEntry(refcount=0, size=0)  # unknown plaintext size (!= stored size!)
+            for id, stored_size in result:
+                self.chunks[id] = init_entry
 
     def make_key(self, repository):
         attempt = 0
@@ -1737,7 +1737,7 @@ class ArchiveChecker:
     def verify_data(self):
         logger.info("Starting cryptographic data integrity verification...")
         chunks_count_index = len(self.chunks)
-        chunks_count_segments = 0
+        chunks_count_repo = 0
         errors = 0
         defect_chunks = []
         pi = ProgressIndicatorPercent(
@@ -1745,16 +1745,16 @@ class ArchiveChecker:
         )
         marker = None
         while True:
-            chunk_ids = self.repository.list(limit=100, marker=marker)
-            if not chunk_ids:
+            result = self.repository.list(limit=100, marker=marker)
+            if not result:
                 break
-            marker = chunk_ids[-1]
-            chunks_count_segments += len(chunk_ids)
-            chunk_data_iter = self.repository.get_many(chunk_ids)
-            chunk_ids_revd = list(reversed(chunk_ids))
-            while chunk_ids_revd:
+            marker = result[-1][0]
+            chunks_count_repo += len(result)
+            chunk_data_iter = self.repository.get_many(id for id, _ in result)
+            result_revd = list(reversed(result))
+            while result_revd:
                 pi.show()
-                chunk_id = chunk_ids_revd.pop(-1)  # better efficiency
+                chunk_id, _ = result_revd.pop(-1)  # better efficiency
                 try:
                     encrypted_data = next(chunk_data_iter)
                 except (Repository.ObjectNotFound, IntegrityErrorBase) as err:
@@ -1764,9 +1764,9 @@ class ArchiveChecker:
                     if isinstance(err, IntegrityErrorBase):
                         defect_chunks.append(chunk_id)
                     # as the exception killed our generator, make a new one for remaining chunks:
-                    if chunk_ids_revd:
-                        chunk_ids = list(reversed(chunk_ids_revd))
-                        chunk_data_iter = self.repository.get_many(chunk_ids)
+                    if result_revd:
+                        result = list(reversed(result_revd))
+                        chunk_data_iter = self.repository.get_many(id for id, _ in result)
                 else:
                     try:
                         # we must decompress, so it'll call assert_id() in there:
@@ -1777,10 +1777,10 @@ class ArchiveChecker:
                         logger.error("chunk %s, integrity error: %s", bin_to_hex(chunk_id), integrity_error)
                         defect_chunks.append(chunk_id)
         pi.finish()
-        if chunks_count_index != chunks_count_segments:
-            logger.error("Repo/Chunks index object count vs. segment files object count mismatch.")
+        if chunks_count_index != chunks_count_repo:
+            logger.error("Chunks index object count vs. repository object count mismatch.")
             logger.error(
-                "Repo/Chunks index: %d objects != segment files: %d objects", chunks_count_index, chunks_count_segments
+                "Chunks index: %d objects != Chunks repository: %d objects", chunks_count_index, chunks_count_repo
             )
         if defect_chunks:
             if self.repair:
@@ -1820,7 +1820,7 @@ class ArchiveChecker:
         log = logger.error if errors else logger.info
         log(
             "Finished cryptographic data integrity verification, verified %d chunks with %d integrity errors.",
-            chunks_count_segments,
+            chunks_count_repo,
             errors,
         )
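
The verification loop relies on repository.get_many() being a generator: an exception propagating out of next() terminates it, so a fresh generator must be built for the ids not yet consumed. Stripped of the verification details, the pattern is roughly (a sketch; verify() is a placeholder for the real checks):

    ids_revd = [id for id, _ in reversed(result)]
    data_iter = repository.get_many(id for id, _ in result)
    while ids_revd:
        chunk_id = ids_revd.pop(-1)  # popping from the end is O(1)
        try:
            data = next(data_iter)
        except (Repository.ObjectNotFound, IntegrityErrorBase):
            # the exception killed the generator; rebuild it for the remaining ids
            if ids_revd:
                data_iter = repository.get_many(reversed(ids_revd))
        else:
            verify(chunk_id, data)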
 

+ 3 - 3
src/borg/archiver/compact_cmd.py

@@ -51,9 +51,9 @@ class ArchiveGarbageCollector:
             result = self.repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
             if not result:
                 break
-            marker = result[-1]
-            for chunk_id in result:
-                repository_chunks[chunk_id] = 0  # plaintext size unknown
+            marker = result[-1][0]
+            for id, stored_size in result:
+                repository_chunks[id] = 0  # plaintext size unknown
         return repository_chunks
 
     def analyze_archives(self) -> Tuple[Dict[bytes, int], Dict[bytes, int], int, int, int]:
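
The stored sizes returned by list() are unpacked but unused here: repository_chunks still maps each id to 0 because the plaintext size is only learned later from the archives. If the total of physically stored bytes were wanted as well, the same scan could accumulate it (a hypothetical extension, not part of this commit):

    total_stored = 0
    marker = None
    while True:
        result = self.repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
        if not result:
            break
        marker = result[-1][0]
        for id, stored_size in result:
            repository_chunks[id] = 0  # plaintext size unknown
            total_stored += stored_size  # size as stored (compressed/encrypted)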

+ 14 - 12
src/borg/archiver/debug_cmd.py

@@ -123,17 +123,18 @@ class DebugMixIn:
                 fd.write(data)
 
         # set up the key without depending on a manifest obj
-        ids = repository.list(limit=1, marker=None)
-        cdata = repository.get(ids[0])
+        result = repository.list(limit=1, marker=None)
+        id, _ = result[0]
+        cdata = repository.get(id)
         key = key_factory(repository, cdata)
         repo_objs = RepoObj(key)
         marker = None
         while True:
-            ids = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
-            if not ids:
+            result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
+            if not result:
                 break
-            marker = ids[-1]
-            for id in ids:
+            marker = result[-1][0]
+            for id, stored_size in result:
                 cdata = repository.get(id)
                 decrypt_dump(id, cdata)
         print("Done.")
@@ -168,8 +169,9 @@ class DebugMixIn:
         from ..crypto.key import key_factory
 
         # set up the key without depending on a manifest obj
-        ids = repository.list(limit=1, marker=None)
-        cdata = repository.get(ids[0])
+        result = repository.list(limit=1, marker=None)
+        id, _ = result[0]
+        cdata = repository.get(id)
         key = key_factory(repository, cdata)
         repo_objs = RepoObj(key)
 
@@ -178,11 +180,11 @@ class DebugMixIn:
         last_id = None
         i = 0
         while True:
-            ids = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
-            if not ids:
+            result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
+            if not result:
                 break
-            marker = ids[-1]
-            for id in ids:
+            marker = result[-1][0]
+            for id, stored_size in result:
                 cdata = repository.get(id)
                 _, data = repo_objs.parse(id, cdata, ro_type=ROBJ_DONTCARE)
 

+ 4 - 4
src/borg/archiver/rcompress_cmd.py

@@ -20,12 +20,12 @@ def find_chunks(repository, repo_objs, stats, ctype, clevel, olevel):
     compr_keys = stats["compr_keys"] = set()
     compr_wanted = ctype, clevel, olevel
     marker = None
-    chunks_limit = 1000
     while True:
-        chunk_ids = repository.list(limit=chunks_limit, marker=marker)
-        if not chunk_ids:
+        result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
+        if not result:
             break
-        marker = chunk_ids[-1]
+        marker = result[-1][0]
+        chunk_ids = [id for id, _ in result]
         for id, chunk_no_data in zip(chunk_ids, repository.get_many(chunk_ids, read_data=False)):
             meta = repo_objs.parse_meta(id, chunk_no_data, ro_type=ROBJ_DONTCARE)
             compr_found = meta["ctype"], meta["clevel"], meta.get("olevel", -1)
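
Since only compression metadata is needed here, the objects are fetched with read_data=False, and the ids are materialized into a list so they can be zipped with the results in order. Condensed (a sketch):

    chunk_ids = [id for id, _ in result]
    for id, chunk_no_data in zip(chunk_ids, repository.get_many(chunk_ids, read_data=False)):
        meta = repo_objs.parse_meta(id, chunk_no_data, ro_type=ROBJ_DONTCARE)
        # (meta["ctype"], meta["clevel"], meta.get("olevel", -1)) identifies the compression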

+ 4 - 4
src/borg/cache.py

@@ -639,14 +639,14 @@ class ChunksMixin:
             num_requests += 1
             if not result:
                 break
-            marker = result[-1]
+            marker = result[-1][0]
             # All chunks from the repository have a refcount of MAX_VALUE, which is sticky,
             # therefore we can't/won't delete them. Chunks we added ourselves in this transaction
             # are tracked correctly.
-            init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
-            for id_ in result:
+            init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)  # plaintext size
+            for id, stored_size in result:
                 num_chunks += 1
-                chunks[id_] = init_entry
+                chunks[id] = init_entry
         # Cache does not contain the manifest.
         if not isinstance(self.repository, (Repository, RemoteRepository)):
             del chunks[self.manifest.MANIFEST_ID]
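
"Sticky" here means the refcount saturates: once an entry reaches ChunkIndex.MAX_VALUE, increments and decrements leave it pinned there, so chunks seeded from the repository can never be deleted through refcount bookkeeping. In toy form (a sketch of the documented behavior, not borg's actual index code):

    def incref(count, max_value):
        # saturating refcount: MAX_VALUE is sticky
        return count if count == max_value else count + 1

    def decref(count, max_value):
        return count if count == max_value else count - 1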

+ 6 - 5
src/borg/repository.py

@@ -288,11 +288,12 @@ class Repository:
 
     def list(self, limit=None, marker=None):
         """
-        list <limit> IDs starting from after id <marker>.
+        list <limit> infos starting from after id <marker>.
+        each info is a tuple (id, storage_size).
         """
         self._lock_refresh()
         collect = True if marker is None else False
-        ids = []
+        result = []
         infos = self.store.list("data")  # generator yielding ItemInfos
         while True:
             try:
@@ -304,13 +305,13 @@ class Repository:
             else:
                 id = hex_to_bin(info.name)
                 if collect:
-                    ids.append(id)
-                    if len(ids) == limit:
+                    result.append((id, info.size))
+                    if len(result) == limit:
                         break
                 elif id == marker:
                     collect = True
                     # note: do not collect the marker id
-        return ids
+        return result
 
     def get(self, id, read_data=True):
         self._lock_refresh()
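
With this return type, scanning a whole repository is a marker-driven loop; a small generator makes the pattern reusable (a sketch; iter_repo_objects is a hypothetical helper):

    def iter_repo_objects(repository, limit=1000):
        """Yield (id, stored_size) for every object in the repository."""
        marker = None
        while True:
            result = repository.list(limit=limit, marker=marker)
            if not result:
                return
            marker = result[-1][0]  # resume after the last id seen
            yield from result

Callers then simply write: for id, stored_size in iter_repo_objects(repository): ...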

+ 2 - 2
src/borg/testsuite/archiver/check_cmd.py

@@ -432,6 +432,6 @@ def test_empty_repository(archivers, request):
         pytest.skip("only works locally")
     check_cmd_setup(archiver)
     with Repository(archiver.repository_location, exclusive=True) as repository:
-        for id_ in repository.list():
-            repository.delete(id_)
+        for id, _ in repository.list():
+            repository.delete(id)
     cmd(archiver, "check", exit_code=1)

+ 4 - 4
src/borg/testsuite/archiver/rcompress_cmd.py

@@ -17,11 +17,11 @@ def test_rcompress(archiver):
             manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK)
             marker = None
             while True:
-                ids = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
-                if not ids:
+                result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
+                if not result:
                     break
-                marker = ids[-1]
-                for id in ids:
+                marker = result[-1][0]
+                for id, _ in result:
                     chunk = repository.get(id, read_data=True)
                     meta, data = manifest.repo_objs.parse(
                         id, chunk, ro_type=ROBJ_DONTCARE

+ 1 - 1
src/borg/testsuite/repository.py

@@ -131,7 +131,7 @@ def test_list(repo_fixtures, request):
         first_half = repository.list(limit=50)
         assert len(first_half) == 50
         assert first_half == repo_list[:50]
-        second_half = repository.list(marker=first_half[-1])
+        second_half = repository.list(marker=first_half[-1][0])
         assert len(second_half) == 50
         assert second_half == repo_list[50:]
         assert len(repository.list(limit=50)) == 50