
Merge pull request #8436 from ThomasWaldmann/analyze-cmd

analyze: changed chunks per directory
TW committed 8 months ago
commit 8cd951f324

+ 91 - 0
docs/man/borg-analyze.1

@@ -0,0 +1,91 @@
+.\" Man page generated from reStructuredText.
+.
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool"
+.SH NAME
+borg-analyze \- Analyze archives
+.SH SYNOPSIS
+.sp
+borg [common options] analyze [options]
+.SH DESCRIPTION
+.sp
+Analyze archives to find \(dqhot spots\(dq.
+.sp
+Borg analyze relies on the usual archive matching options to select the
+archives that should be considered for analysis (e.g. \fB\-a series_name\fP).
+Then it iterates over all matching archives and all contained files,
+collecting information about the chunks stored in each directory it encounters.
+.sp
+It considers chunk IDs and their plaintext sizes (the compressed size is not
+easily available from the repository), adds up the sizes of added/removed
+chunks per direct parent directory, and outputs a list of \(dqdirectory: size\(dq.
+.sp
+You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe
+some of these are temporary or cache directories you forgot to exclude.
+.sp
+To keep these unwanted directories out of your backups, carefully exclude them
+in \fBborg create\fP (for future backups) or use \fBborg recreate\fP to
+re\-create existing archives without them.
+.SH OPTIONS
+.sp
+See \fIborg\-common(1)\fP for common options of Borg commands.
+.SS Archive filters
+.INDENT 0.0
+.TP
+.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN
+only consider archives matching all patterns. see \(dqborg help match\-archives\(dq.
+.TP
+.BI \-\-sort\-by \ KEYS
+Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
+.TP
+.BI \-\-first \ N
+consider first N archives after other filters were applied
+.TP
+.BI \-\-last \ N
+consider last N archives after other filters were applied
+.TP
+.BI \-\-oldest \ TIMESPAN
+consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
+.TP
+.BI \-\-newest \ TIMESPAN
+consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m.
+.TP
+.BI \-\-older \ TIMESPAN
+consider archives older than (now \- TIMESPAN), e.g. 7d or 12m.
+.TP
+.BI \-\-newer \ TIMESPAN
+consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m.
+.UNINDENT
+.SH SEE ALSO
+.sp
+\fIborg\-common(1)\fP
+.SH AUTHOR
+The Borg Collective
+.\" Generated by docutils manpage writer.
+.

+ 1 - 0
docs/usage.rst

@@ -57,6 +57,7 @@ Usage
    usage/delete
    usage/prune
    usage/info
+   usage/analyze
    usage/mount
    usage/recreate
    usage/tar

+ 1 - 0
docs/usage/analyze.rst

@@ -0,0 +1 @@
+.. include:: analyze.rst.inc

+ 84 - 0
docs/usage/analyze.rst.inc

@@ -0,0 +1,84 @@
+.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit!
+
+.. _borg_analyze:
+
+borg analyze
+------------
+.. code-block:: none
+
+    borg [common options] analyze [options]
+
+.. only:: html
+
+    .. class:: borg-options-table
+
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    | .. class:: borg-common-opt-ref                                                                                                                                                                                                                           |
+    |                                                                                                                                                                                                                                                          |
+    | :ref:`common_options`                                                                                                                                                                                                                                    |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    | **Archive filters** — Archive filters can be applied to repository targets.                                                                                                                                                                              |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    |                                                                             | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives".                                               |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    |                                                                             | ``--sort-by KEYS``                           | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    |                                                                             | ``--first N``                                | consider first N archives after other filters were applied                                                                  |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    |                                                                             | ``--last N``                                 | consider last N archives after other filters were applied                                                                   |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    |                                                                             | ``--oldest TIMESPAN``                        | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.                           |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    |                                                                             | ``--newest TIMESPAN``                        | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m.                           |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    |                                                                             | ``--older TIMESPAN``                         | consider archives older than (now - TIMESPAN), e.g. 7d or 12m.                                                              |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+    |                                                                             | ``--newer TIMESPAN``                         | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m.                                                              |
+    +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+
+    .. raw:: html
+
+        <script type='text/javascript'>
+        $(document).ready(function () {
+            $('.borg-options-table colgroup').remove();
+        })
+        </script>
+
+.. only:: latex
+
+
+
+    :ref:`common_options`
+        |
+
+    Archive filters
+        -a PATTERN, --match-archives PATTERN     only consider archives matching all patterns. see "borg help match-archives".
+        --sort-by KEYS                           Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
+        --first N                                consider first N archives after other filters were applied
+        --last N                                 consider last N archives after other filters were applied
+        --oldest TIMESPAN                        consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
+        --newest TIMESPAN                        consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m.
+        --older TIMESPAN                         consider archives older than (now - TIMESPAN), e.g. 7d or 12m.
+        --newer TIMESPAN                         consider archives newer than (now - TIMESPAN), e.g. 7d or 12m.
+
+
+Description
+~~~~~~~~~~~
+
+Analyze archives to find "hot spots".
+
+Borg analyze relies on the usual archive matching options to select the
+archives that should be considered for analysis (e.g. ``-a series_name``).
+Then it iterates over all matching archives and all contained files,
+collecting information about the chunks stored in each directory it encounters.
+
+It considers chunk IDs and their plaintext sizes (the compressed size is not
+easily available from the repository), adds up the sizes of added/removed
+chunks per direct parent directory, and outputs a list of "directory: size".
+
+You can use that list to find directories with a lot of "activity" - maybe
+some of these are temporary or cache directories you forgot to exclude.
+
+To keep these unwanted directories out of your backups, carefully exclude them
+in ``borg create`` (for future backups) or use ``borg recreate`` to
+re-create existing archives without them.
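
The computation described above can be illustrated with a minimal standalone sketch (toy code under assumed inputs, not borg's implementation; the real code is in ``src/borg/archiver/analyze_cmd.py`` further down in this diff): model each archive as a mapping of directory path to ``{chunk id: plaintext size}`` and, for a pair of archives, sum the sizes of chunks present in only one of the two.

.. code-block:: python

    # Toy sketch of the per-directory "activity" computation described above
    # (illustrative only, not borg code): each archive is modeled as a mapping
    # directory path -> {chunk id: plaintext size}.
    from collections import defaultdict

    def directory_activity(base: dict, new: dict) -> dict:
        """Sum the sizes of chunks added or removed per directory between two archives."""
        activity = defaultdict(int)
        for directory in base.keys() | new.keys():
            base_chunks = base.get(directory, {})
            new_chunks = new.get(directory, {})
            for chunk_id in new_chunks.keys() - base_chunks.keys():  # added chunks
                activity[directory] += new_chunks[chunk_id]
            for chunk_id in base_chunks.keys() - new_chunks.keys():  # removed chunks
                activity[directory] += base_chunks[chunk_id]
        return dict(activity)

    # a cache directory rewrote a 4 MiB chunk, the docs directory is unchanged
    base = {"/home/user/docs": {"c1": 100}, "/home/user/.cache": {"c2": 4 * 2**20}}
    new = {"/home/user/docs": {"c1": 100}, "/home/user/.cache": {"c3": 4 * 2**20}}
    print(directory_activity(base, new))  # {'/home/user/.cache': 8388608}

A directory that keeps showing up near the top of such a list is a good candidate for an exclude pattern or for ``borg recreate``.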

+ 3 - 0
src/borg/archiver/__init__.py

@@ -64,6 +64,7 @@ def get_func(args):
     raise Exception("expected func attributes not found")
 
 
+from .analyze_cmd import AnalyzeMixIn
 from .benchmark_cmd import BenchmarkMixIn
 from .check_cmd import CheckMixIn
 from .compact_cmd import CompactMixIn
@@ -94,6 +95,7 @@ from .version_cmd import VersionMixIn
 
 
 class Archiver(
+    AnalyzeMixIn,
     BenchmarkMixIn,
     CheckMixIn,
     CompactMixIn,
@@ -332,6 +334,7 @@ class Archiver(
 
         subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")
 
+        self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
         self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
         self.build_parser_check(subparsers, common_parser, mid_common_parser)
         self.build_parser_compact(subparsers, common_parser, mid_common_parser)
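
For context, the hunk above plugs the new command into borg's mixin-based ``Archiver``: ``AnalyzeMixIn`` contributes both the ``do_analyze`` implementation and a ``build_parser_analyze`` method that registers an argparse subparser. A simplified sketch of that pattern follows (``ExampleCmdMixIn`` and ``MiniArchiver`` are hypothetical names made up for illustration; borg's real classes carry many more options).

.. code-block:: python

    # Simplified, hypothetical sketch of the mixin + subparser registration
    # pattern used by Archiver (not borg's actual classes).
    import argparse

    class ExampleCmdMixIn:
        def do_example(self, args):
            """Run the example command."""
            print(f"running example, verbose={args.verbose}")

        def build_parser_example(self, subparsers):
            subparser = subparsers.add_parser("example", help="run the example command")
            subparser.add_argument("--verbose", action="store_true")
            subparser.set_defaults(func=self.do_example)

    class MiniArchiver(ExampleCmdMixIn):
        def build_parser(self):
            parser = argparse.ArgumentParser(prog="mini")
            subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")
            # each mixin registers its own subcommand, as Archiver does for "analyze"
            self.build_parser_example(subparsers)
            return parser

        def run(self, argv):
            args = self.build_parser().parse_args(argv)
            return args.func(args)

    MiniArchiver().run(["example", "--verbose"])  # -> running example, verbose=True

Keeping each command in its own mixin module keeps the subcommand's parser definition next to its implementation, which is what ``analyze_cmd.py`` below does for ``analyze``.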

+ 135 - 0
src/borg/archiver/analyze_cmd.py

@@ -0,0 +1,135 @@
+import argparse
+from collections import defaultdict
+import os
+
+from ._common import with_repository, define_archive_filters_group
+from ..archive import Archive
+from ..constants import *  # NOQA
+from ..helpers import bin_to_hex, Error
+from ..helpers import ProgressIndicatorPercent
+from ..manifest import Manifest
+from ..remote import RemoteRepository
+from ..repository import Repository
+
+from ..logger import create_logger
+
+logger = create_logger()
+
+
+class ArchiveAnalyzer:
+    def __init__(self, args, repository, manifest):
+        self.args = args
+        self.repository = repository
+        assert isinstance(repository, (Repository, RemoteRepository))
+        self.manifest = manifest
+        self.difference_by_path = defaultdict(int)  # directory path -> summed size of added/removed chunks
+
+    def analyze(self):
+        logger.info("Starting archives analysis...")
+        self.analyze_archives()
+        self.report()
+        logger.info("Finished archives analysis.")
+
+    def analyze_archives(self) -> None:
+        """Analyze all archives matching the given selection criteria."""
+        archive_infos = self.manifest.archives.list_considering(self.args)
+        num_archives = len(archive_infos)
+        if num_archives < 2:
+            raise Error("Need at least 2 archives to analyze.")
+
+        pi = ProgressIndicatorPercent(
+            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
+        )
+        i = 0
+        info = archive_infos[i]
+        pi.show(i)
+        logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
+        base = self.analyze_archive(info.id)
+        for i, info in enumerate(archive_infos[1:]):
+            pi.show(i + 1)
+            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 2}/{num_archives})")
+            new = self.analyze_archive(info.id)
+            self.analyze_change(base, new)
+            base = new
+        pi.finish()
+
+    def analyze_archive(self, id):
+        """compute the set of chunks for each directory in this archive"""
+        archive = Archive(self.manifest, id)
+        chunks_by_path = defaultdict(dict)  # collect all chunk IDs generated from files in this directory path
+        for item in archive.iter_items():
+            if "chunks" in item:
+                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
+                directory_path = os.path.dirname(item.path)
+                chunks_by_path[directory_path].update(item_chunks)
+        return chunks_by_path
+
+    def analyze_change(self, base, new):
+        """for each directory path, sum up the changed (removed or added) chunks' sizes between base and new."""
+
+        def analyze_path_change(path):
+            base_chunks = base[path]
+            new_chunks = new[path]
+            # add up added chunks' sizes
+            for id in new_chunks.keys() - base_chunks.keys():
+                self.difference_by_path[path] += new_chunks[id]
+            # add up removed chunks' sizes
+            for id in base_chunks.keys() - new_chunks.keys():
+                self.difference_by_path[path] += base_chunks[id]
+
+        for directory_path in base:
+            analyze_path_change(directory_path)
+        for directory_path in new:
+            if directory_path not in base:
+                analyze_path_change(directory_path)
+
+    def report(self):
+        print()
+        print("chunks added or removed by directory path")
+        print("=========================================")
+        for directory_path in sorted(self.difference_by_path, key=lambda p: self.difference_by_path[p], reverse=True):
+            difference = self.difference_by_path[directory_path]
+            print(f"{directory_path}: {difference}")
+
+
+class AnalyzeMixIn:
+    @with_repository(compatibility=(Manifest.Operation.READ,))
+    def do_analyze(self, args, repository, manifest):
+        """Analyze archives"""
+        ArchiveAnalyzer(args, repository, manifest).analyze()
+
+    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
+        from ._common import process_epilog
+
+        analyze_epilog = process_epilog(
+            """
+            Analyze archives to find "hot spots".
+
+            Borg analyze relies on the usual archive matching options to select the
+            archives that should be considered for analysis (e.g. ``-a series_name``).
+            Then it iterates over all matching archives and all contained files,
+            collecting information about the chunks stored in each directory it encounters.
+
+            It considers chunk IDs and their plaintext sizes (the compressed size is not
+            easily available from the repository), adds up the sizes of added/removed
+            chunks per direct parent directory, and outputs a list of "directory: size".
+
+            You can use that list to find directories with a lot of "activity" - maybe
+            some of these are temporary or cache directories you forgot to exclude.
+
+            To keep these unwanted directories out of your backups, carefully exclude them
+            in ``borg create`` (for future backups) or use ``borg recreate`` to
+            re-create existing archives without them.
+            """
+        )
+        subparser = subparsers.add_parser(
+            "analyze",
+            parents=[common_parser],
+            add_help=False,
+            description=self.do_analyze.__doc__,
+            epilog=analyze_epilog,
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            help="analyze archives",
+        )
+        subparser.set_defaults(func=self.do_analyze)
+        define_archive_filters_group(subparser)
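
``analyze_archives`` compares each matching archive against its predecessor and accumulates the per-directory sums across all consecutive pairs, so the reported number for a directory grows with every archive-to-archive change that touches it. A standalone toy sketch of this accumulation and of the sorted report (again illustrative only, not borg code):

.. code-block:: python

    # Toy sketch: per-directory differences accumulate over a *series* of
    # archives, each compared against its predecessor (not borg code).
    from collections import defaultdict

    # toy series of archives: directory path -> {chunk id: plaintext size}
    archives = [
        {"/data": {"a": 10}},           # 1st archive
        {"/data": {"a": 10, "b": 20}},  # 2nd: chunk b (20 bytes) added
        {"/data": {"a": 10, "c": 30}},  # 3rd: b (20) removed, c (30) added
    ]

    difference_by_path = defaultdict(int)
    base = archives[0]
    for new in archives[1:]:
        for path in base.keys() | new.keys():
            base_chunks, new_chunks = base.get(path, {}), new.get(path, {})
            for chunk_id in new_chunks.keys() - base_chunks.keys():  # added
                difference_by_path[path] += new_chunks[chunk_id]
            for chunk_id in base_chunks.keys() - new_chunks.keys():  # removed
                difference_by_path[path] += base_chunks[chunk_id]
        base = new

    # report, largest differences first (mirrors ArchiveAnalyzer.report())
    for path in sorted(difference_by_path, key=difference_by_path.get, reverse=True):
        print(f"{path}: {difference_by_path[path]}")  # -> /data: 70  (= 20 + 20 + 30)

This accumulation is also why the test below sees the value for ``/input`` grow from 2 to 5 to 7 as more archives are added.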

+ 41 - 0
src/borg/testsuite/archiver/analyze_cmd.py

@@ -0,0 +1,41 @@
+import pathlib
+
+from ...constants import *  # NOQA
+from . import cmd, generate_archiver_tests, RK_ENCRYPTION
+
+pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local")  # NOQA
+
+
+def test_analyze(archivers, request):
+    def create_archive():
+        cmd(archiver, "create", "archive", archiver.input_path)
+
+    def analyze_archives():
+        return cmd(archiver, "analyze", "-a", "archive")
+
+    archiver = request.getfixturevalue(archivers)
+
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+    input_path = pathlib.Path(archiver.input_path)
+
+    # 1st archive
+    (input_path / "file1").write_text("1")
+    create_archive()
+
+    # 2nd archive
+    (input_path / "file2").write_text("22")
+    create_archive()
+
+    assert "/input: 2" in analyze_archives()  # 2nd archive added 1 chunk for input path
+
+    # 3rd archive
+    (input_path / "file3").write_text("333")
+    create_archive()
+
+    assert "/input: 5" in analyze_archives()  # 2nd/3rd archives added 2 chunks for input path
+
+    # 4th archive
+    (input_path / "file2").unlink()
+    create_archive()
+
+    assert "/input: 7" in analyze_archives()  # 2nd/3rd archives added 2, 4th archive removed 1