Browse Source

diff --sort-by: enhanced sorting, fixes #8998

use borg diff --sort-by=spec1,spec2,spec2 for enhanced sorting.

remove legacy --sort behaviour (sort by path), this was deprecated
since 1.4.2.

Co-authored-by: Daniel Rudolf <github.com@daniel-rudolf.de>

This is a port of #9005 to master branch.
Thomas Waldmann 1 month ago
parent
commit
5c44dad125
4 changed files with 279 additions and 7 deletions
  1. 8 0
      docs/usage/diff.rst
  2. 153 5
      src/borg/archiver/diff_cmd.py
  3. 6 0
      src/borg/item.pyi
  4. 112 2
      src/borg/testsuite/archiver/diff_cmd_test.py

+ 8 - 0
docs/usage/diff.rst

@@ -16,3 +16,11 @@ Examples
     {"path": "file4", "changes": [{"type": "added", "size": 0}]}
     {"path": "file3", "changes": [{"type": "removed", "size": 0}]}
 
+
+    # Use --sort-by with a comma-separated list; sorts apply stably from last to first.
+    # Here: primary by net size change descending, tie-breaker by path ascending
+    $ borg diff --sort-by=">size_diff,path" archive1 archive2
+        +17 B      -5 B [-rw-r--r-- -> -rwxr-xr-x] file1
+    removed         0 B file3
+    added           0 B file4
+       +135 B    -252 B file2

+ 153 - 5
src/borg/archiver/diff_cmd.py

@@ -8,6 +8,8 @@ from ._common import with_repository, build_matcher, Highlander
 from ..archive import Archive
 from ..constants import *  # NOQA
 from ..helpers import BaseFormatter, DiffFormatter, archivename_validator, PathSpec, BorgJsonEncoder
+from ..helpers import IncludePatternNeverMatchedWarning, remove_surrogates
+from ..item import ItemDiff
 from ..manifest import Manifest
 from ..logger import create_logger
 
@@ -87,11 +89,75 @@ class DiffMixIn:
         diffs_iter = Archive.compare_archives_iter(
             archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids
         )
-        # Conversion to string and filtering for diff.equal to save memory if sorting
+        # Filter out equal items early (keep as generator; listify only if sorting)
         diffs = (diff for diff in diffs_iter if not diff.equal(args.content_only))
 
-        if args.sort:
-            diffs = sorted(diffs, key=lambda diff: diff.path)
+        sort_specs = []
+        if args.sort_by:
+            for spec in args.sort_by.split(","):
+                spec = spec.strip()
+                if spec:
+                    sort_specs.append(spec)
+
+        def key_for(field: str, d: "ItemDiff"):
+            # strip direction markers if present
+            if field and field[0] in ("<", ">"):
+                field = field[1:]
+            # path
+            if field in (None, "", "path"):
+                return remove_surrogates(d.path)
+            # compute size_* from changes
+            if field in ("size_diff", "size_added", "size_removed"):
+                added = removed = 0
+                ch = d.changes().get("content")
+                if ch is not None:
+                    info = ch.to_dict()
+                    t = info.get("type")
+                    if t == "modified":
+                        added = info.get("added", 0)
+                        removed = info.get("removed", 0)
+                    elif t and t.startswith("added"):
+                        added = info.get("added", info.get("size", 0))
+                        removed = 0
+                    elif t and t.startswith("removed"):
+                        added = 0
+                        removed = info.get("removed", info.get("size", 0))
+                if field == "size_diff":
+                    return added - removed
+                if field == "size_added":
+                    return added
+                if field == "size_removed":
+                    return removed
+            # timestamp diffs
+            if field in ("ctime_diff", "mtime_diff"):
+                ts = field.split("_")[0]
+                t1 = d._item1.get(ts, 0)
+                t2 = d._item2.get(ts, 0)
+                return t2 - t1
+            # size of item in archive2
+            if field == "size":
+                it = d._item2
+                if it is None or it.get("deleted"):
+                    return 0
+                return it.get_size()
+            # direct attributes from current item (prefer item2)
+            it = d._item2 or d._item1
+            attr_defaults = {"user": "", "group": "", "uid": -1, "gid": -1, "ctime": 0, "mtime": 0}
+            if field in attr_defaults:
+                if it is None:
+                    return attr_defaults[field]
+                return it.get(field, attr_defaults[field])
+            raise ValueError(f"Invalid field name: {field}")
+
+        if sort_specs:
+            diffs = list(diffs)
+            # Apply stable sorts from last to first
+            for spec in reversed(sort_specs):
+                desc = False
+                field = spec
+                if field and field[0] in ("<", ">"):
+                    desc = field[0] == ">"
+                diffs.sort(key=lambda di: key_for(field, di), reverse=desc)
 
         formatter = DiffFormatter(format, args.content_only)
         for diff in diffs:
@@ -112,7 +178,7 @@ class DiffMixIn:
                 """
         This command finds differences (file contents, metadata) between ARCHIVE1 and ARCHIVE2.
 
-        For more help on include/exclude patterns, see the :ref:`borg_patterns` command output.
+        For more help on include/exclude patterns, see the output of the :ref:`borg_patterns` command.
 
         .. man NOTES
 
@@ -149,7 +215,84 @@ class DiffMixIn:
         """
             )
             + DiffFormatter.keys_help()
+            + textwrap.dedent(
+                """
+
+        What is compared
+        +++++++++++++++++
+        For each matching item in both archives, Borg reports:
+
+        - Content changes: total added/removed bytes within files. If chunker parameters are comparable,
+          Borg compares chunk IDs quickly; otherwise, it compares the content.
+        - Metadata changes: user, group, mode, and other metadata shown inline, like
+          "[old_mode -> new_mode]" for mode changes. Use ``--content-only`` to suppress metadata changes.
+        - Added/removed items: printed as "added SIZE path" or "removed SIZE path".
+
+        Output formats
+        ++++++++++++++
+        The default (text) output shows one line per changed path, e.g.::
+
+            +135 B    -252 B [ -rw-r--r-- -> -rwxr-xr-x ] path/to/file
+
+        JSON Lines output (``--json-lines``) prints one JSON object per changed path, e.g.::
+
+            {"path": "PATH", "changes": [
+                {"type": "modified", "added": BYTES, "removed": BYTES},
+                {"type": "mode", "old_mode": "-rw-r--r--", "new_mode": "-rwxr-xr-x"},
+                {"type": "added", "size": SIZE},
+                {"type": "removed", "size": SIZE}
+            ]}
+
+        Sorting
+        ++++++++
+        Use ``--sort-by FIELDS`` where FIELDS is a comma-separated list of fields.
+        Sorts are applied stably from last to first in the given list. Prepend ">" for
+        descending, "<" (or no prefix) for ascending, for example ``--sort-by=">size_added,path"``.
+        Supported fields include:
+
+        - path: the item path
+        - size_added: total bytes added for the item content
+        - size_removed: total bytes removed for the item content
+        - size_diff: size_added - size_removed (net content change)
+        - size: size of the item as stored in ARCHIVE2 (0 for removed items)
+        - user, group, uid, gid, ctime, mtime: taken from the item state in ARCHIVE2 when present
+        - ctime_diff, mtime_diff: timestamp difference (ARCHIVE2 - ARCHIVE1)
+
+        Performance considerations
+        ++++++++++++++++++++++++++
+        diff automatically detects whether the archives were created with the same chunker
+        parameters. If so, only chunk IDs are compared, which is very fast.
+        """
+            )
         )
+
+        def diff_sort_spec_validator(s):
+            if not isinstance(s, str):
+                raise argparse.ArgumentTypeError("unsupported sort field (not a string)")
+            allowed = {
+                "path",
+                "size_added",
+                "size_removed",
+                "size_diff",
+                "size",
+                "user",
+                "group",
+                "uid",
+                "gid",
+                "ctime",
+                "mtime",
+                "ctime_diff",
+                "mtime_diff",
+            }
+            parts = [p.strip() for p in s.split(",") if p.strip()]
+            if not parts:
+                raise argparse.ArgumentTypeError("unsupported sort field: empty spec")
+            for spec in parts:
+                field = spec[1:] if spec and spec[0] in (">", "<") else spec
+                if field not in allowed:
+                    raise argparse.ArgumentTypeError(f"unsupported sort field: {field}")
+            return ",".join(parts)
+
         subparser = subparsers.add_parser(
             "diff",
             parents=[common_parser],
@@ -172,7 +315,6 @@ class DiffMixIn:
             action="store_true",
             help="override the check of chunker parameters",
         )
-        subparser.add_argument("--sort", dest="sort", action="store_true", help="Sort the output lines by file path.")
         subparser.add_argument(
             "--format",
             metavar="FORMAT",
@@ -181,6 +323,12 @@ class DiffMixIn:
             help='specify format for differences between archives (default: "{change} {path}{NL}")',
         )
         subparser.add_argument("--json-lines", action="store_true", help="Format output as JSON Lines.")
+        subparser.add_argument(
+            "--sort-by",
+            dest="sort_by",
+            type=diff_sort_spec_validator,
+            help="Sort output by comma-separated fields (e.g., '>size_added,path').",
+        )
         subparser.add_argument(
             "--content-only",
             action="store_true",

+ 6 - 0
src/borg/item.pyi

@@ -263,6 +263,12 @@ class DiffChange:
 
 class ItemDiff:
     path: str
+    _item1: Item
+    _item2: Item
+    _chunk_1: Iterator
+    _chunk_2: Iterator
+    _numeric_ids: bool
+    _can_compare_chunk_ids: bool
     def __init__(
         self,
         path: str,

+ 112 - 2
src/borg/testsuite/archiver/diff_cmd_test.py

@@ -293,7 +293,7 @@ def test_time_diffs(archivers, request):
         assert "ctime" not in output
 
 
-def test_sort_option(archivers, request):
+def test_sort_by_option(archivers, request):
     archiver = request.getfixturevalue(archivers)
     cmd(archiver, "repo-create", RK_ENCRYPTION)
 
@@ -313,7 +313,7 @@ def test_sort_option(archivers, request):
     create_regular_file(archiver.input_path, "d_file_added", size=256)
     cmd(archiver, "create", "test1", "input")
 
-    output = cmd(archiver, "diff", "test0", "test1", "--sort", "--content-only")
+    output = cmd(archiver, "diff", "test0", "test1", "--sort-by=path", "--content-only")
     expected = ["a_file_removed", "b_file_added", "c_file_changed", "d_file_added", "e_file_changed", "f_file_removed"]
     assert isinstance(output, str)
     outputs = output.splitlines()
@@ -321,6 +321,116 @@ def test_sort_option(archivers, request):
     assert all(x in line for x, line in zip(expected, outputs))
 
 
+def test_sort_by_invalid_field_is_rejected(archivers, request):
+    archiver = request.getfixturevalue(archivers)
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+
+    create_regular_file(archiver.input_path, "file", size=1)
+    cmd(archiver, "create", "a1", "input")
+    create_regular_file(archiver.input_path, "file", size=2)
+    cmd(archiver, "create", "a2", "input")
+
+    # Unsupported field should cause argument parsing error
+    cmd(archiver, "diff", "a1", "a2", "--sort-by=not_a_field", exit_code=EXIT_ERROR)
+
+
+def test_sort_by_size_added_then_path(archivers, request):
+    archiver = request.getfixturevalue(archivers)
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+
+    # Base archive with two files that will be removed later
+    create_regular_file(archiver.input_path, "r_big_removed", size=50)
+    create_regular_file(archiver.input_path, "r_small_removed", size=5)
+    cmd(archiver, "create", "base", "input")
+
+    # Second archive: remove both above and add two new files of different sizes
+    os.unlink("input/r_big_removed")
+    os.unlink("input/r_small_removed")
+    create_regular_file(archiver.input_path, "a_small_added", size=10)
+    create_regular_file(archiver.input_path, "b_large_added", size=30)
+    cmd(archiver, "create", "next", "input")
+
+    # Sort by size added (ascending), then path to break ties deterministically
+    output = cmd(archiver, "diff", "base", "next", "--sort-by=size_added,path", "--content-only")
+    lines = output.splitlines()
+    # Expect removed entries first (size_added=0), ordered by path, then added entries by increasing size
+    expected_order = [
+        "removed:.*input/r_big_removed",  # size_added=0
+        "removed:.*input/r_small_removed",  # size_added=0
+        "added:.*10 B.*input/a_small_added",
+        "added:.*30 B.*input/b_large_added",
+    ]
+    assert len(lines) == len(expected_order)
+    for pattern, line in zip(expected_order, lines):
+        assert_line_exists([line], pattern)
+
+
+@pytest.mark.parametrize(
+    "sort_key",
+    [
+        "path",
+        "size",
+        "size_added",
+        "size_removed",
+        "size_diff",
+        "user",
+        "group",
+        "uid",
+        "gid",
+        "ctime",
+        "mtime",
+        "ctime_diff",
+        "mtime_diff",
+    ],
+)
+def test_sort_by_all_keys_with_directions(archivers, request, sort_key):
+    archiver = request.getfixturevalue(archivers)
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+
+    # Prepare initial files
+    create_regular_file(archiver.input_path, "a_removed", size=11)
+    create_regular_file(archiver.input_path, "f_removed", size=22)
+    create_regular_file(archiver.input_path, "c_changed", size=33)
+    create_regular_file(archiver.input_path, "e_changed", size=44)
+    cmd(archiver, "create", "s0", "input")
+
+    # Ensure that subsequent modifications happen on a later timestamp tick than s0
+    time.sleep(1.0 if is_darwin else 0.1)  # HFS+ has ~1s timestamp granularity on macOS
+
+    # Create differences for second archive
+    os.unlink("input/a_removed")
+    os.unlink("input/f_removed")
+    os.unlink("input/c_changed")
+    os.unlink("input/e_changed")
+    # Recreate changed files with different sizes
+    create_regular_file(archiver.input_path, "c_changed", size=333)
+    create_regular_file(archiver.input_path, "e_changed", size=444)
+    # Added files
+    create_regular_file(archiver.input_path, "b_added", size=55)
+    create_regular_file(archiver.input_path, "d_added", size=66)
+    cmd(archiver, "create", "s1", "input")
+
+    expected_paths = {
+        "input/a_removed",
+        "input/b_added",
+        "input/c_changed",
+        "input/d_added",
+        "input/e_changed",
+        "input/f_removed",
+    }
+
+    # Exercise both ascending and descending for each key.
+    for direction in ("<", ">"):
+        sort_spec = f"{direction}{sort_key},path"
+        output = cmd(archiver, "diff", "s0", "s1", f"--sort-by={sort_spec}", "--content-only")
+        lines = output.splitlines()
+        assert len(lines) == len(expected_paths)
+        # Validate that we got exactly the expected items regardless of order.
+        # As we do not test the order, this is mostly for test coverage.
+        seen_paths = {line.split()[-1] for line in lines}
+        assert seen_paths == expected_paths
+
+
 @pytest.mark.skipif(not are_hardlinks_supported(), reason="hardlinks not supported")
 def test_hard_link_deletion_and_replacement(archivers, request):
     archiver = request.getfixturevalue(archivers)