
Merge pull request #8854 from ThomasWaldmann/files-cache-cleanup

compact: also clean up files cache, fixes #8852
TW committed 2 weeks ago, commit 80624d1460
3 changed files with 127 additions and 8 deletions:
  1. src/borg/archiver/compact_cmd.py (+41 -0)
  2. src/borg/cache.py (+31 -8)
  3. src/borg/testsuite/archiver/compact_cmd_test.py (+55 -0)

src/borg/archiver/compact_cmd.py (+41 -0)

@@ -1,9 +1,12 @@
 import argparse
+import os
 from typing import Tuple, Set
 
 from ._common import with_repository
 from ..archive import Archive
 from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo
+from ..cache import files_cache_name, discover_files_cache_names
+from ..helpers import get_cache_dir
 from ..constants import *  # NOQA
 from ..hashindex import ChunkIndex, ChunkIndexEntry
 from ..helpers import set_ec, EXIT_ERROR, format_file_size, bin_to_hex
@@ -43,6 +46,7 @@ class ArchiveGarbageCollector:
         (self.missing_chunks, self.total_files, self.total_size, self.archives_count) = self.analyze_archives()
         self.report_and_delete()
         self.save_chunk_index()
+        self.cleanup_files_cache()
         logger.info("Finished compaction / garbage collection...")
 
     def get_repository_chunks(self) -> ChunkIndex:
@@ -71,6 +75,43 @@ class ArchiveGarbageCollector:
         )
         self.chunks = None  # nothing there (cleared!)
 
+    def cleanup_files_cache(self):
+        """
+        Clean up files cache files for archive series names that no longer exist in the repository.
+
+        Note: this only works reliably if the files cache filename suffixes are generated automatically
+        and the user does not manually manage them via more than one BORG_FILES_CACHE_SUFFIX value.
+        """
+        logger.info("Cleaning up files cache...")
+
+        cache_dir = os.path.join(get_cache_dir(), self.repository.id_str)
+        if not os.path.exists(cache_dir):
+            logger.debug("Cache directory does not exist, skipping files cache cleanup")
+            return
+
+        # Get all existing archive series names
+        existing_series = set(self.manifest.archives.names())
+        logger.debug(f"Found {len(existing_series)} existing archive series.")
+
+        # Get the set of all existing files cache file names.
+        try:
+            files_cache_names = set(discover_files_cache_names(cache_dir))
+            logger.debug(f"Found {len(files_cache_names)} files cache files.")
+        except (FileNotFoundError, PermissionError) as e:
+            logger.warning(f"Could not access cache directory: {e}")
+            return
+
+        used_files_cache_names = {files_cache_name(series_name) for series_name in existing_series}
+        unused_files_cache_names = files_cache_names - used_files_cache_names
+
+        for cache_filename in unused_files_cache_names:
+            cache_path = os.path.join(cache_dir, cache_filename)
+            try:
+                os.unlink(cache_path)
+            except (FileNotFoundError, PermissionError) as e:
+                logger.warning(f"Could not access cache file: {e}")
+        logger.info(f"Removed {len(unused_files_cache_names)} unused files cache files.")
+
     def analyze_archives(self) -> Tuple[Set, int, int, int]:
         """Iterate over all items in all archives, create the dicts id -> size of all used chunks."""
 

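The new method boils down to a set difference: discover every files cache file on disk, compute the names still used by existing archive series, and delete the rest. A minimal standalone sketch of that idea (the function and argument names below are illustrative, not part of borg's API):

    import os

    def cleanup_unused(cache_dir, used_names, discovered_names):
        """Best-effort removal of cache files whose names are no longer used."""
        for name in set(discovered_names) - set(used_names):
            try:
                os.unlink(os.path.join(cache_dir, name))
            except (FileNotFoundError, PermissionError):
                pass  # a vanished or unreadable file is not fatal here
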
src/borg/cache.py (+31 -8)

@@ -37,6 +37,35 @@ from .platform import SaveFile
 from .remote import RemoteRepository
 from .repository import LIST_SCAN_LIMIT, Repository, StoreObjectNotFound, repo_lister
 
+
+def files_cache_name(archive_name, files_cache_name="files"):
+    """
+    Return the name of the files cache file for the given archive name.
+
+    :param archive_name: name of the archive (ideally a series name)
+    :param files_cache_name: base name of the files cache file
+    :return: name of the files cache file
+    """
+    suffix = os.environ.get("BORG_FILES_CACHE_SUFFIX", "")
+    # when using archive series, we automatically make up a separate cache file per series.
+    # when not, the user may manually do that by using the env var.
+    if not suffix:
+        # avoid issues with too complex or long archive_name by hashing it:
+        suffix = bin_to_hex(xxh64(archive_name.encode()))
+    return files_cache_name + "." + suffix
+
+
+def discover_files_cache_names(path, files_cache_name="files"):
+    """
+    Return a list of all files cache file names in the given directory.
+
+    :param path: path to the directory to search in
+    :param files_cache_name: base name of the files cache files
+    :return: list of files cache file names
+    """
+    return [fn for fn in os.listdir(path) if fn.startswith(files_cache_name + ".")]
+
+
 # chunks is a list of ChunkListEntry
 FileCacheEntry = namedtuple("FileCacheEntry", "age inode size ctime mtime chunks")
 
@@ -495,16 +524,10 @@ class FilesCacheMixin:
         return files
 
     def files_cache_name(self):
-        suffix = os.environ.get("BORG_FILES_CACHE_SUFFIX", "")
-        # when using archive series, we automatically make up a separate cache file per series.
-        # when not, the user may manually do that by using the env var.
-        if not suffix:
-            # avoid issues with too complex or long archive_name by hashing it:
-            suffix = bin_to_hex(xxh64(self.archive_name.encode()))
-        return self.FILES_CACHE_NAME + "." + suffix
+        return files_cache_name(self.archive_name, self.FILES_CACHE_NAME)
 
     def discover_files_cache_names(self, path):
-        return [fn for fn in os.listdir(path) if fn.startswith(self.FILES_CACHE_NAME + ".")]
+        return discover_files_cache_names(path, self.FILES_CACHE_NAME)
 
     def _read_files_cache(self):
         """read files cache from cache directory"""

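These module-level helpers encode the files cache naming scheme: the file name is files.<suffix>, where the suffix comes from BORG_FILES_CACHE_SUFFIX if set and otherwise is the xxh64 hex digest of the archive (series) name. A rough usage sketch; the digest shown is a placeholder, not a real value:

    from borg.cache import files_cache_name, discover_files_cache_names

    # No BORG_FILES_CACHE_SUFFIX set: the suffix is xxh64(archive_name) as hex
    # (placeholder digest below, the real value depends on the name):
    print(files_cache_name("my-series"))  # e.g. files.0123456789abcdef

    # With BORG_FILES_CACHE_SUFFIX=home, every archive maps to the same name,
    # files.home, which is exactly the case the cleanup docstring warns about.

    # discover_files_cache_names(cache_dir) returns all "files.*" entries there.
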
src/borg/testsuite/archiver/compact_cmd_test.py (+55 -0)

@@ -1,6 +1,9 @@
+import os
 import pytest
 
 from ...constants import *  # NOQA
+from ...helpers import get_cache_dir
+from ...cache import files_cache_name, discover_files_cache_names
 from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION
 
 pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary")  # NOQA
@@ -79,3 +82,55 @@ def test_compact_index_corruption(archivers, request):
 
     output = cmd(archiver, "compact", "-v", "--stats", exit_code=0)
     assert "missing objects" not in output
+
+
+def test_compact_files_cache_cleanup(archivers, request):
+    """Test that files cache files for deleted archives are removed during compact."""
+    archiver = request.getfixturevalue(archivers)
+
+    # Create repository and archives
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+    create_src_archive(archiver, "archive1")
+    create_src_archive(archiver, "archive2")
+    create_src_archive(archiver, "archive3")
+
+    # Get repository ID
+    output = cmd(archiver, "repo-info")
+    for line in output.splitlines():
+        if "Repository ID:" in line:
+            repo_id = line.split(":", 1)[1].strip()
+            break
+    else:
+        pytest.fail("Could not find repository ID in info output")
+
+    # Check cache directory for files cache files
+    cache_dir = os.path.join(get_cache_dir(), repo_id)
+    if not os.path.exists(cache_dir):
+        pytest.skip("Cache directory does not exist, skipping test")
+
+    # Get initial files cache files
+    try:
+        initial_cache_files = set(discover_files_cache_names(cache_dir))
+    except (FileNotFoundError, PermissionError):
+        pytest.skip("Could not access cache directory, skipping test")
+
+    # All three archives should currently have a files cache file
+    expected_cache_files = {files_cache_name(name) for name in ["archive1", "archive2", "archive3"]}
+    assert expected_cache_files == initial_cache_files, "Unexpected cache files found"
+
+    # Delete one archive
+    cmd(archiver, "delete", "-a", "archive2")
+
+    # Run compact
+    output = cmd(archiver, "compact", "-v")
+    assert "Cleaning up files cache" in output
+
+    # Check that files cache for deleted archive is removed
+    try:
+        remaining_cache_files = set(discover_files_cache_names(cache_dir))
+    except (FileNotFoundError, PermissionError):
+        pytest.fail("Could not access cache directory after compact")
+
+    # Get expected cache files for remaining archives
+    expected_cache_files = {files_cache_name(name) for name in ["archive1", "archive3"]}
+    assert expected_cache_files == remaining_cache_files, "Unexpected cache files found"
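
One detail in the repo ID parsing above worth calling out: the test relies on Python's for/else, where the else branch runs only if the loop finished without hitting break, turning a missing "Repository ID:" line into a test failure. In isolation, assuming similar repo-info output lines:

    lines = ["Repository label: test", "Repository ID: abc123"]
    for line in lines:
        if "Repository ID:" in line:
            repo_id = line.split(":", 1)[1].strip()
            break
    else:
        raise AssertionError("no 'Repository ID:' line found")
    print(repo_id)  # abc123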