Merge pull request #8561 from ThomasWaldmann/speedup-compact

compact: add --stats option
TW · 6 months ago · commit 84744ac1a4
2 changed files with 73 additions and 28 deletions
  1. src/borg/archiver/compact_cmd.py (+47 −19)
  2. src/borg/testsuite/archiver/compact_cmd_test.py (+26 −9)

src/borg/archiver/compact_cmd.py (+47 −19)

@@ -3,7 +3,7 @@ from typing import Tuple, Set

 from ._common import with_repository
 from ..archive import Archive
-from ..cache import write_chunkindex_to_repo_cache
+from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo
 from ..constants import *  # NOQA
 from ..hashindex import ChunkIndex, ChunkIndexEntry
 from ..helpers import set_ec, EXIT_WARNING, EXIT_ERROR, format_file_size, bin_to_hex
@@ -18,7 +18,7 @@ logger = create_logger()


 class ArchiveGarbageCollector:
-    def __init__(self, repository, manifest):
+    def __init__(self, repository, manifest, *, stats):
         self.repository = repository
         assert isinstance(repository, (Repository, RemoteRepository))
         self.manifest = manifest
@@ -26,17 +26,17 @@ class ArchiveGarbageCollector:
         self.total_files = None  # overall number of source files written to all archives in this repo
         self.total_size = None  # overall size of source file content data written to all archives
         self.archives_count = None  # number of archives
+        self.stats = stats  # compute repo space usage before/after - lists all repo objects, can be slow.

     @property
     def repository_size(self):
-        if self.chunks is None:
+        if self.chunks is None or not self.stats:
             return None
         return sum(entry.size for id, entry in self.chunks.iteritems())  # sum of stored sizes

     def garbage_collect(self):
         """Removes unused chunks from a repository."""
         logger.info("Starting compaction / garbage collection...")
-        logger.info("Getting object IDs present in the repository...")
         self.chunks = self.get_repository_chunks()
         logger.info("Computing object IDs used by archives...")
         (self.missing_chunks, self.reappeared_chunks, self.total_files, self.total_size, self.archives_count) = (
@@ -47,20 +47,30 @@ class ArchiveGarbageCollector:
         logger.info("Finished compaction / garbage collection...")

     def get_repository_chunks(self) -> ChunkIndex:
-        """Build a dict id -> size of all chunks present in the repository"""
-        chunks = ChunkIndex()
-        for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
-            # we add this id to the chunks index (as unused chunk), because
-            # we do not know yet whether it is actually referenced from some archives.
-            # we "abuse" the size field here. usually there is the plaintext size,
-            # but we use it for the size of the stored object here.
-            chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
+        """return a chunks index"""
+        if self.stats:  # slow method: build a fresh chunks index, with stored chunk sizes.
+            logger.info("Getting object IDs present in the repository...")
+            chunks = ChunkIndex()
+            for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
+                # we add this id to the chunks index (as unused chunk), because
+                # we do not know yet whether it is actually referenced from some archives.
+                # we "abuse" the size field here. usually there is the plaintext size,
+                # but we use it for the size of the stored object here.
+                chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
+        else:  # faster: rely on existing chunks index (with flags F_NONE and size 0).
+            logger.info("Getting object IDs from cached chunks index...")
+            chunks = build_chunkindex_from_repo(self.repository, cache_immediately=True)
         return chunks

     def save_chunk_index(self):
-        # write_chunkindex_to_repo now removes all flags and size infos.
-        # we need this, as we put the wrong size in there.
-        write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True, delete_other=True)
+        if self.stats:
+            # write_chunkindex_to_repo now removes all flags and size infos.
+            # we need this, as we put the wrong size in there to support --stats computations.
+            write_chunkindex_to_repo_cache(
+                self.repository, self.chunks, clear=True, force_write=True, delete_other=True
+            )
+        else:
+            self.chunks.clear()  # we already have updated the repo cache in get_repository_chunks
         self.chunks = None  # nothing there (cleared!)

     def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:
@@ -75,7 +85,8 @@ class ArchiveGarbageCollector:
                     # chunk id is from chunks_healthy list: a lost chunk has re-appeared!
                     reappeared_chunks.add(id)
             else:
-                # we do NOT have this chunk in the repository!
+                # with --stats: we do NOT have this chunk in the repository!
+                # without --stats: we do not have this chunk or the chunks index is incomplete.
                 missing_chunks.add(id)

         missing_chunks: set[bytes] = set()
@@ -153,15 +164,18 @@ class ArchiveGarbageCollector:
         logger.info(
             f"Source data size was {format_file_size(self.total_size, precision=0)} in {self.total_files} files."
         )
-        logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.")
-        logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.")
+        if self.stats:
+            logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.")
+            logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.")
+        else:
+            logger.info(f"Repository has data stored in {count} objects.")


 class CompactMixIn:
     @with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,))
     def do_compact(self, args, repository, manifest):
         """Collect garbage in repository"""
-        ArchiveGarbageCollector(repository, manifest).garbage_collect()
+        ArchiveGarbageCollector(repository, manifest, stats=args.stats).garbage_collect()

     def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
         from ._common import process_epilog
@@ -198,6 +212,16 @@ class CompactMixIn:
             might not want to do that unless there are signs of lost archives (e.g. when
             seeing fatal errors when creating backups or when archives are missing in
             ``borg repo-list``).
+
+            When giving the ``--stats`` option, borg will internally list all repository
+            objects to determine their existence AND stored size. It will build a fresh
+            chunks index from that information and cache it in the repository. For some
+            types of repositories, this might be very slow. It will tell you the sum of
+            stored object sizes, before and after compaction.
+
+            Without ``--stats``, borg will rely on the cached chunks index to determine
+            existing object IDs (but there is no stored size information in the index,
+            thus it can't compute before/after compaction size statistics).
             """
         )
         subparser = subparsers.add_parser(
@@ -210,3 +234,7 @@ class CompactMixIn:
             help="compact repository",
         )
         subparser.set_defaults(func=self.do_compact)
+
+        subparser.add_argument(
+            "-s", "--stats", dest="stats", action="store_true", help="print statistics (might be much slower)"
+        )

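Aside: a minimal, self-contained sketch of the size accounting above (assumptions: a plain dict and a namedtuple stand in for borg's ChunkIndex / ChunkIndexEntry; the IDs and sizes are made up). It shows why repository_size only reports a number in --stats mode: the freshly built index carries stored sizes, while the cached index has size 0 everywhere.

from collections import namedtuple

ChunkIndexEntry = namedtuple("ChunkIndexEntry", "flags size")
F_NONE = 0  # stand-in for ChunkIndex.F_NONE

# --stats path: index freshly built via repo listing, stored sizes known.
fresh = {
    b"id-1": ChunkIndexEntry(flags=F_NONE, size=1234),
    b"id-2": ChunkIndexEntry(flags=F_NONE, size=567),
}
# default path: cached chunks index, flags F_NONE and size 0 for every entry.
cached = {
    b"id-1": ChunkIndexEntry(flags=F_NONE, size=0),
    b"id-2": ChunkIndexEntry(flags=F_NONE, size=0),
}

def repository_size(chunks, stats):
    if chunks is None or not stats:
        return None  # cached sizes are all 0, so a sum would be misleading
    return sum(entry.size for entry in chunks.values())  # sum of stored sizes

assert repository_size(fresh, stats=True) == 1801
assert repository_size(cached, stats=False) is None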
src/borg/testsuite/archiver/compact_cmd_test.py (+26 −9)

@@ -1,35 +1,48 @@
+import pytest
+
 from ...constants import *  # NOQA
 from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION

 pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary")  # NOQA


-def test_compact_empty_repository(archivers, request):
+@pytest.mark.parametrize("stats", (True, False))
+def test_compact_empty_repository(archivers, request, stats):
     archiver = request.getfixturevalue(archivers)

     cmd(archiver, "repo-create", RK_ENCRYPTION)

-    output = cmd(archiver, "compact", "-v", exit_code=0)
+    args = ("-v", "--stats") if stats else ("-v",)
+    output = cmd(archiver, "compact", *args, exit_code=0)
     assert "Starting compaction" in output
-    assert "Repository size is 0 B in 0 objects." in output
+    if stats:
+        assert "Repository size is 0 B in 0 objects." in output
+    else:
+        assert "Repository has data stored in 0 objects." in output
     assert "Finished compaction" in output


-def test_compact_after_deleting_all_archives(archivers, request):
+@pytest.mark.parametrize("stats", (True, False))
+def test_compact_after_deleting_all_archives(archivers, request, stats):
     archiver = request.getfixturevalue(archivers)

     cmd(archiver, "repo-create", RK_ENCRYPTION)
     create_src_archive(archiver, "archive")
     cmd(archiver, "delete", "-a", "archive", exit_code=0)

-    output = cmd(archiver, "compact", "-v", exit_code=0)
+    args = ("-v", "--stats") if stats else ("-v",)
+    output = cmd(archiver, "compact", *args, exit_code=0)
     assert "Starting compaction" in output
     assert "Deleting " in output
-    assert "Repository size is 0 B in 0 objects." in output
+    if stats:
+        assert "Repository size is 0 B in 0 objects." in output
+    else:
+        assert "Repository has data stored in 0 objects." in output
     assert "Finished compaction" in output


-def test_compact_after_deleting_some_archives(archivers, request):
+@pytest.mark.parametrize("stats", (True, False))
+def test_compact_after_deleting_some_archives(archivers, request, stats):
     archiver = request.getfixturevalue(archivers)

     cmd(archiver, "repo-create", RK_ENCRYPTION)
@@ -37,8 +50,12 @@ def test_compact_after_deleting_some_archives(archivers, request):
     create_src_archive(archiver, "archive2")
     cmd(archiver, "delete", "-a", "archive1", exit_code=0)

-    output = cmd(archiver, "compact", "-v", exit_code=0)
+    args = ("-v", "--stats") if stats else ("-v",)
+    output = cmd(archiver, "compact", *args, exit_code=0)
     assert "Starting compaction" in output
     assert "Deleting " in output
-    assert "Repository size is 0 B in 0 objects." not in output
+    if stats:
+        assert "Repository size is 0 B in 0 objects." not in output
+    else:
+        assert "Repository has data stored in 0 objects." not in output
     assert "Finished compaction" in output