Browse Source

diff --sort-by: enhanced sorting, fixes #8998

use borg diff --sort-by=spec1,spec2,spec2 for enhanced sorting.

keep legacy --sort behaviour (sort by path) for compatibility,
but deprecate it.

Co-authored-by: Daniel Rudolf <github.com@daniel-rudolf.de>
Thomas Waldmann 1 month ago
parent
commit
7f9dd37536
3 changed files with 227 additions and 9 deletions
  1. 9 1
      docs/usage/diff.rst
  2. 159 7
      src/borg/archiver.py
  3. 59 1
      src/borg/testsuite/archiver.py

+ 9 - 1
docs/usage/diff.rst

@@ -41,4 +41,12 @@ Examples
     {"path": "file1", "changes": [{"type": "modified", "added": 17, "removed": 5}, {"type": "mode", "old_mode": "-rw-r--r--", "new_mode": "-rwxr-xr-x"}]}
     {"path": "file1", "changes": [{"type": "modified", "added": 17, "removed": 5}, {"type": "mode", "old_mode": "-rw-r--r--", "new_mode": "-rwxr-xr-x"}]}
     {"path": "file2", "changes": [{"type": "modified", "added": 135, "removed": 252}]}
     {"path": "file2", "changes": [{"type": "modified", "added": 135, "removed": 252}]}
     {"path": "file4", "changes": [{"type": "added", "size": 0}]}
     {"path": "file4", "changes": [{"type": "added", "size": 0}]}
-    {"path": "file3", "changes": [{"type": "removed", "size": 0}]}
+    {"path": "file3", "changes": [{"type": "removed", "size": 0}]}
+
+    # Use --sort-by with a comma-separated list; sorts apply stably from last to first.
+    # Here: primary by net size change descending, tie-breaker by path ascending
+    $ borg diff --sort-by=">size_diff,path" testrepo::archive1 archive3
+        +17 B      -5 B [-rw-r--r-- -> -rwxr-xr-x] file1
+    removed         0 B file3
+    added           0 B file4
+       +135 B    -252 B file2

+ 159 - 7
src/borg/archiver.py

@@ -1173,14 +1173,92 @@ class Archiver:
         matcher = self.build_matcher(args.patterns, args.paths)
         matcher = self.build_matcher(args.patterns, args.paths)
 
 
         diffs = Archive.compare_archives_iter(self.print_warning, archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids, content_only=args.content_only)
         diffs = Archive.compare_archives_iter(self.print_warning, archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids, content_only=args.content_only)
-        # Conversion to string and filtering for diff.equal to save memory if sorting
-        diffs = ((path, diff.changes()) for path, diff in diffs if not diff.equal)
-
-        if args.sort:
-            diffs = sorted(diffs)
+        diffs = ((path, diff) for path, diff in diffs if not diff.equal)
+
+        # Sorting support
+
+        def current_item(diff_obj):
+            # Prefer the state in archive2 if available, otherwise fall back to archive1 (e.g. removed items)
+            it2 = getattr(diff_obj, '_item2', None)
+            it1 = getattr(diff_obj, '_item1', None)
+            return it2 or it1
+
+        def key_for(field, path, diff_obj):
+            # field without direction markers
+            if field in ('path', None):
+                return remove_surrogates(path)
+            # compute size_added/removed from changes summary
+            if field in ('size_diff', 'size_added', 'size_removed'):
+                added = removed = 0
+                for j, s in diff_obj.changes():
+                    t = j.get('type')
+                    if t == 'modified':
+                        added += j.get('added', 0)
+                        removed += j.get('removed', 0)
+                    elif t and t.startswith('added'):
+                        added += j.get('size', 0)
+                    elif t and t.startswith('removed'):
+                        removed += j.get('size', 0)
+                if field == 'size_diff':
+                    return added - removed
+                if field == 'size_added':
+                    return added
+                if field == 'size_removed':
+                    return removed
+            if field in ('ctime_diff', 'mtime_diff'):
+                it2 = getattr(diff_obj, '_item2', None)
+                it1 = getattr(diff_obj, '_item1', None)
+                # default to 0 if missing
+                t2 = getattr(it2, field.split('_')[0], 0) if it2 is not None else 0
+                t1 = getattr(it1, field.split('_')[0], 0) if it1 is not None else 0
+                return t2 - t1
+            if field == 'size':
+                it2 = getattr(diff_obj, '_item2', None)
+                if it2 is None or it2.get('deleted'):
+                    return 0
+                # size of the item as stored in archive2
+                return it2.get_size()
+            it = current_item(diff_obj)
+            attr_defaults = {
+                # note: mode sorting removed per issue request
+                # keep defaults for potential other fields
+                'user': '',
+                'group': '',
+                'uid': -1,
+                'gid': -1,
+                'ctime': 0,
+                'mtime': 0,
+            }
+            if field in attr_defaults:
+                default = attr_defaults[field]
+                return getattr(it, field, default)
+            raise ValueError('Invalid field name: %s' % field)
+
+        # Determine sort specifications: use --sort-by if given; legacy --sort means sorting by path.
+        sort_specs = []
+        if getattr(args, 'sort_by', None):
+            # args.sort_by is a single comma-separated string
+            for spec in str(args.sort_by).split(','):
+                spec = spec.strip()
+                if spec:
+                    sort_specs.append(spec)
+        elif getattr(args, 'sort', False):
+            sort_specs = ['path']
+
+        if sort_specs:
+            diffs = list(diffs)  # create the full list from the iterator
+            # Apply stable sorts from last to first spec
+            for spec in reversed(sort_specs):
+                # parse direction
+                desc = False
+                field = spec
+                if field and (field[0] == '>' or field[0] == '<'):
+                    desc = field[0] == '>'
+                    field = field[1:]
+                diffs.sort(key=lambda pd: key_for(field, pd[0], pd[1]), reverse=desc)
 
 
         for path, diff in diffs:
         for path, diff in diffs:
-            print_output(diff, remove_surrogates(path))
+            print_output(diff.changes(), remove_surrogates(path))
 
 
         for pattern in matcher.get_unmatched_include_patterns():
         for pattern in matcher.get_unmatched_include_patterns():
             self.print_warning_instance(IncludePatternNeverMatchedWarning(pattern))
             self.print_warning_instance(IncludePatternNeverMatchedWarning(pattern))
@@ -2802,6 +2880,7 @@ class Archiver:
             ('--remote-ratelimit', None, 'Warning: "--remote-ratelimit" has been deprecated. Use --upload-ratelimit instead.'),
             ('--remote-ratelimit', None, 'Warning: "--remote-ratelimit" has been deprecated. Use --upload-ratelimit instead.'),
             ('--remote-buffer', None, 'Warning: "--remote-buffer" has been deprecated. Use --upload-buffer instead.'),
             ('--remote-buffer', None, 'Warning: "--remote-buffer" has been deprecated. Use --upload-buffer instead.'),
             ('--prefix', None, 'Warning: "--prefix" has been deprecated. Use "--glob-archives \'yourprefix*\'" (-a) instead.'),
             ('--prefix', None, 'Warning: "--prefix" has been deprecated. Use "--glob-archives \'yourprefix*\'" (-a) instead.'),
+            ('--sort', None, 'Warning: "--sort" is deprecated. Use "--sort-by=path" instead.'),
         ]
         ]
         for i, arg in enumerate(args[:]):
         for i, arg in enumerate(args[:]):
             for old_name, new_name, warning in deprecations:
             for old_name, new_name, warning in deprecations:
@@ -4090,6 +4169,29 @@ class Archiver:
         define_archive_filters_group(subparser)
         define_archive_filters_group(subparser)
 
 
         # borg diff
         # borg diff
+        def diff_sort_spec_validator(s):
+            if not isinstance(s, str):
+                raise argparse.ArgumentTypeError('unsupported sort field (not a string)')
+            allowed = {
+                'path',
+                'size_added', 'size_removed', 'size_diff', 'size',
+                'user', 'group', 'uid', 'gid',
+                'ctime', 'mtime',
+                'ctime_diff', 'mtime_diff',
+            }
+            # Allow comma-separated list; validate each spec
+            parts = [p.strip() for p in s.split(',') if p.strip()]
+            if not parts:
+                raise argparse.ArgumentTypeError('unsupported sort field: empty spec')
+            for spec in parts:
+                if spec and spec[0] in ('>', '<'):
+                    field = spec[1:]
+                else:
+                    field = spec
+                if field not in allowed:
+                    raise argparse.ArgumentTypeError(f"unsupported sort field: {field}")
+            # Return normalized string (comma-joined, no spaces)
+            return ','.join(parts)
         diff_epilog = process_epilog("""
         diff_epilog = process_epilog("""
             This command finds differences (file contents, user/group/mode) between archives.
             This command finds differences (file contents, user/group/mode) between archives.
 
 
@@ -4097,6 +4199,53 @@ class Archiver:
             ARCHIVE2 is just another archive name in the same repository (no repository location
             ARCHIVE2 is just another archive name in the same repository (no repository location
             allowed).
             allowed).
 
 
+            What is compared
+            +++++++++++++++++
+            For each matching item in both archives, Borg reports:
+
+            - Content changes: total added/removed bytes within files. If chunker parameters are comparable,
+              Borg compares chunk IDs quickly; otherwise, it compares the content.
+            - Metadata changes: user, group, mode, and other metadata shown inline like
+              "[old_mode -> new_mode]" for mode changes. Use ``--content-only`` to suppress metadata changes.
+            - Added/removed items: printed as "added SIZE path" or "removed SIZE path".
+
+            Output formats
+            ++++++++++++++
+            The default (text) output shows one line per changed path, e.g.::
+
+                +135 B    -252 B [ -rw-r--r-- -> -rwxr-xr-x ] path/to/file
+
+            JSON Lines output (``--json-lines``) prints one JSON object per changed path, e.g.::
+
+                {"path": "PATH", "changes": [
+                    {"type": "modified", "added": BYTES, "removed": BYTES},
+                    {"type": "mode", "old_mode": "-rw-r--r--", "new_mode": "-rwxr-xr-x"},
+                    {"type": "added", "size": SIZE},
+                    {"type": "removed", "size": SIZE}
+                ]}
+
+            Only actual changes are included in the "changes" list. For example, a modified entry with
+            added=0 and removed=0 is omitted.
+
+            Sorting
+            ++++++++
+            Use ``--sort-by FIELDS`` where FIELDS is a comma-separated list of fields.
+            Sorts are applied stably from last to first in the given list. Prepend ">" for
+            descending, "<" (or no prefix) for ascending, for example ``--sort-by=">size_added,path"``.
+            Supported fields include:
+
+            - path: the item path
+            - size_added: total bytes added for the item content
+            - size_removed: total bytes removed for the item content
+            - size_diff: size_added - size_removed (net content change)
+            - size: size of the item as stored in ARCHIVE2 (0 for removed items)
+            - user, group, uid, gid, ctime, mtime: taken from the item state in ARCHIVE2 when present
+            - ctime_diff, mtime_diff: timestamp difference (archive2 - archive1)
+
+            The ``--sort`` option is deprecated and only sorts by path.
+
+            Performance considerations
+            ++++++++++++++++++++++++++
             For archives created with Borg 1.1 or newer, diff automatically detects whether
             For archives created with Borg 1.1 or newer, diff automatically detects whether
             the archives were created with the same chunker parameters. If so, only chunk IDs
             the archives were created with the same chunker parameters. If so, only chunk IDs
             are compared, which is very fast.
             are compared, which is very fast.
@@ -4121,7 +4270,10 @@ class Archiver:
         subparser.add_argument('--same-chunker-params', dest='same_chunker_params', action='store_true',
         subparser.add_argument('--same-chunker-params', dest='same_chunker_params', action='store_true',
                                help='Override check of chunker parameters.')
                                help='Override check of chunker parameters.')
         subparser.add_argument('--sort', dest='sort', action='store_true',
         subparser.add_argument('--sort', dest='sort', action='store_true',
-                               help='Sort the output lines by file path.')
+                               help='Sort the output by path (deprecated, use --sort-by=path).')
+        subparser.add_argument('--sort-by', dest='sort_by', metavar='FIELD[,FIELD...]', type=diff_sort_spec_validator,
+                               action=Highlander,
+                               help='Advanced sorting: specify field(s) to sort by. Accepts a comma-separated list. Prefix with > for descending or < for ascending (default).')
         subparser.add_argument(
         subparser.add_argument(
             "--content-only",
             "--content-only",
             action="store_true",
             action="store_true",

+ 59 - 1
src/borg/testsuite/archiver.py

@@ -4842,8 +4842,10 @@ class DiffArchiverTestCase(ArchiverTestCaseBase):
         self.create_regular_file('d_file_added', size=256)
         self.create_regular_file('d_file_added', size=256)
         self.cmd('create', self.repository_location + '::test1', 'input')
         self.cmd('create', self.repository_location + '::test1', 'input')
 
 
-        output = self.cmd('diff', '--sort', self.repository_location + '::test0', 'test1', '--content-only')
+        # default sorting: legacy --sort sorts by path ascending
+        output = self.cmd('diff', self.repository_location + '::test0', 'test1', '--content-only', '--sort')
         expected = [
         expected = [
+            'Warning: "--sort" is deprecated',  # workaround to make test succeed
             'a_file_removed',
             'a_file_removed',
             'b_file_added',
             'b_file_added',
             'c_file_changed',
             'c_file_changed',
@@ -4851,9 +4853,65 @@ class DiffArchiverTestCase(ArchiverTestCaseBase):
             'e_file_changed',
             'e_file_changed',
             'f_file_removed',
             'f_file_removed',
         ]
         ]
+        assert all(x in line for x, line in zip(expected, output.splitlines()))
+
+        # single field sort by size_added descending (new --sort-by)
+        output = self.cmd('diff', self.repository_location + '::test0', 'test1', '--content-only', '--sort-by=>size_added')
+        # size_added for entries: e_file_changed (1024), c_file_changed (512), d_file_added (256), b_file_added (128), a_file_removed (0), f_file_removed (0)
+        expected = [
+            'e_file_changed',
+            'c_file_changed',
+            'd_file_added',
+            'b_file_added',
+        ]
+        names_in_output = [line.split()[-1].split('/')[-1] for line in output.splitlines()]
+        subset = [n for n in names_in_output if n in expected]
+        assert subset == expected
 
 
+        # multi-key sort: primary by size_added descending, secondary by path ascending for ties (removed files have 0)
+        output = self.cmd('diff', self.repository_location + '::test0', 'test1', '--content-only', '--sort-by=>size_added,path')
+        expected = [
+            'e_file_changed',
+            'c_file_changed',
+            'd_file_added',
+            'b_file_added',
+            'a_file_removed',
+            'f_file_removed',
+        ]
         assert all(x in line for x, line in zip(expected, output.splitlines()))
         assert all(x in line for x, line in zip(expected, output.splitlines()))
 
 
+        # sort by size_diff descending (net content change)
+        output = self.cmd('diff', self.repository_location + '::test0', 'test1', '--content-only', '--sort-by=>size_diff')
+        expected = [
+            'e_file_changed',
+            'c_file_changed',
+            'd_file_added',
+            'b_file_added',
+        ]
+        names_in_output = [line.split()[-1].split('/')[-1] for line in output.splitlines()]
+        subset = [n for n in names_in_output if n in expected]
+        assert subset == expected
+
+        # sort by size (file size in archive2) descending; removed files have 0 and come last
+        output = self.cmd('diff', self.repository_location + '::test0', 'test1', '--content-only', '--sort-by=>size')
+        expected = [
+            'e_file_changed',  # 1024 in archive2
+            'c_file_changed',  # 512
+            'd_file_added',    # 256
+            'b_file_added',    # 128
+        ]
+        names_in_output = [line.split()[-1].split('/')[-1] for line in output.splitlines()]
+        subset = [n for n in names_in_output if n in expected]
+        assert subset == expected
+
+    def test_sort_validation(self):
+        # invalid sort field should be rejected by argparse
+        self.cmd('init', '--encryption=repokey', self.repository_location)
+        self.create_regular_file('x', size=1)
+        self.cmd('create', self.repository_location + '::a', 'input')
+        out = self.cmd('diff', self.repository_location + '::a', 'a', '--sort-by=invalid_field', fork=True, exit_code=2)
+        assert 'unsupported sort field' in out or 'invalid choice' in out or 'unsupported' in out
+
     def test_time_diffs(self):
     def test_time_diffs(self):
         self.cmd('init', '--encryption=repokey', self.repository_location)
         self.cmd('init', '--encryption=repokey', self.repository_location)
         self.create_regular_file("test_file", size=10)
         self.create_regular_file("test_file", size=10)