
Merge pull request #8803 from ThomasWaldmann/transfer-rechunk

transfer with re-chunking
TW, 2 weeks ago
commit a764abd7b0
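The gist of the change: ``borg transfer`` gains a ``--chunker-params`` option; when given, file
content is re-chunked with the new parameters while it is copied, instead of transferring the
source chunks as-is. A minimal usage sketch (the REPO paths are placeholders; the fixed,4096
spec is the one exercised by the new test below):

    # copy archives from SRC_REPO into DST_REPO, re-chunking to fixed 4 KiB blocks
    borg --repo=DST_REPO transfer --other-repo=SRC_REPO --chunker-params=fixed,4096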

src/borg/archiver/transfer_cmd.py  (+130 -61)

@@ -1,13 +1,15 @@
 import argparse
 
 from ._common import with_repository, with_other_repository, Highlander
-from ..archive import Archive
+from ..archive import Archive, cached_hash, DownloadPipeline
+from ..chunker import get_chunker
 from ..compress import CompressionSpec
 from ..constants import *  # NOQA
 from ..crypto.key import uses_same_id_hash, uses_same_chunker_secret
 from ..helpers import Error
 from ..helpers import location_validator, Location, archivename_validator, comment_validator
 from ..helpers import format_file_size, bin_to_hex
+from ..helpers import ChunkerParams, ChunkIteratorFileWrapper
 from ..manifest import Manifest
 from ..legacyrepository import LegacyRepository
 from ..repository import Repository
@@ -17,6 +19,103 @@ from ..logger import create_logger
 logger = create_logger()
 
 
+def transfer_chunks(
+    upgrader, other_repository, other_manifest, other_chunks, archive, cache, recompress, dry_run, chunker_params=None
+):
+    """
+    Transfer chunks from another repository to the current repository.
+
+    If chunker_params is provided, the chunks will be re-chunked using the specified parameters.
+    """
+    transfer = 0
+    present = 0
+    chunks = []
+
+    # Determine if re-chunking is needed
+    rechunkify = chunker_params is not None
+
+    if rechunkify:
+        # Similar to ArchiveRecreater.iter_chunks
+        pipeline = DownloadPipeline(other_manifest.repository, other_manifest.repo_objs)
+        chunk_iterator = pipeline.fetch_many(other_chunks, ro_type=ROBJ_FILE_STREAM)
+        file = ChunkIteratorFileWrapper(chunk_iterator)
+
+        # Create a chunker with the specified parameters
+        chunker = get_chunker(*chunker_params, seed=archive.key.chunk_seed, sparse=False)
+        for chunk in chunker.chunkify(file):
+            if not dry_run:
+                chunk_id, data = cached_hash(chunk, archive.key.id_hash)
+                size = len(data)
+                # Check if the chunk is already in the repository
+                chunk_present = cache.seen_chunk(chunk_id, size)
+                if chunk_present:
+                    chunk_entry = cache.reuse_chunk(chunk_id, size, archive.stats)
+                    present += size
+                else:
+                    # Add the new chunk to the repository
+                    chunk_entry = cache.add_chunk(
+                        chunk_id, {}, data, stats=archive.stats, wait=False, ro_type=ROBJ_FILE_STREAM
+                    )
+                    cache.repository.async_response(wait=False)
+                    transfer += size
+                chunks.append(chunk_entry)
+            else:
+                # In dry-run mode, just estimate the size
+                size = len(chunk.data) if chunk.data is not None else chunk.size
+                transfer += size
+    else:
+        # Original implementation without re-chunking
+        for chunk_id, size in other_chunks:
+            chunk_present = cache.seen_chunk(chunk_id, size)
+            if not chunk_present:  # target repo does not yet have this chunk
+                if not dry_run:
+                    try:
+                        cdata = other_repository.get(chunk_id)
+                    except (Repository.ObjectNotFound, LegacyRepository.ObjectNotFound):
+                        # missing correct chunk in other_repository (source) will result in
+                        # a missing chunk in repository (destination).
+                        # we do NOT want to transfer all-zero replacement chunks from borg1 repos.
+                        pass
+                    else:
+                        if recompress == "never":
+                            # keep compressed payload same, verify via assert_id (that will
+                            # decompress, but avoid needing to compress it again):
+                            meta, data = other_manifest.repo_objs.parse(
+                                chunk_id, cdata, decompress=True, want_compressed=True, ro_type=ROBJ_FILE_STREAM
+                            )
+                            meta, data = upgrader.upgrade_compressed_chunk(meta, data)
+                            chunk_entry = cache.add_chunk(
+                                chunk_id,
+                                meta,
+                                data,
+                                stats=archive.stats,
+                                wait=False,
+                                compress=False,
+                                size=size,
+                                ctype=meta["ctype"],
+                                clevel=meta["clevel"],
+                                ro_type=ROBJ_FILE_STREAM,
+                            )
+                        elif recompress == "always":
+                            # always decompress and re-compress file data chunks
+                            meta, data = other_manifest.repo_objs.parse(chunk_id, cdata, ro_type=ROBJ_FILE_STREAM)
+                            chunk_entry = cache.add_chunk(
+                                chunk_id, meta, data, stats=archive.stats, wait=False, ro_type=ROBJ_FILE_STREAM
+                            )
+                        else:
+                            raise ValueError(f"unsupported recompress mode: {recompress}")
+                    cache.repository.async_response(wait=False)
+                    chunks.append(chunk_entry)
+                transfer += size
+            else:
+                if not dry_run:
+                    chunk_entry = cache.reuse_chunk(chunk_id, size, archive.stats)
+                    chunks.append(chunk_entry)
+                present += size
+
+    return chunks, transfer, present
+
+
 class TransferMixIn:
     @with_other_repository(manifest=True, compatibility=(Manifest.Operation.READ,))
     @with_repository(manifest=True, cache=True, compatibility=(Manifest.Operation.WRITE,))
@@ -76,7 +175,7 @@ class TransferMixIn:
         if UpgraderCls is not upgrade_mod.UpgraderFrom12To20 and other_manifest.repository.version == 1:
             raise Error("To transfer from a borg 1.x repo, you need to use: --upgrader=From12To20")
 
-        upgrader = UpgraderCls(cache=cache)
+        upgrader = UpgraderCls(cache=cache, args=args)
 
         for archive_info in archive_infos:
             name, id, ts = archive_info.name, archive_info.id, archive_info.ts
@@ -120,68 +219,22 @@ class TransferMixIn:
                     else:
                         other_chunks = None
                     if other_chunks is not None:
-                        chunks = []
-                        for chunk_id, size in other_chunks:
-                            chunk_present = cache.seen_chunk(chunk_id, size)
-                            if not chunk_present:  # target repo does not yet have this chunk
-                                if not dry_run:
-                                    try:
-                                        cdata = other_repository.get(chunk_id)
-                                    except (Repository.ObjectNotFound, LegacyRepository.ObjectNotFound):
-                                        # missing correct chunk in other_repository (source) will result in
-                                        # a missing chunk in repository (destination).
-                                        # we do NOT want to transfer all-zero replacement chunks from borg1 repos.
-                                        pass
-                                    else:
-                                        if args.recompress == "never":
-                                            # keep compressed payload same, verify via assert_id (that will
-                                            # decompress, but avoid needing to compress it again):
-                                            meta, data = other_manifest.repo_objs.parse(
-                                                chunk_id,
-                                                cdata,
-                                                decompress=True,
-                                                want_compressed=True,
-                                                ro_type=ROBJ_FILE_STREAM,
-                                            )
-                                            meta, data = upgrader.upgrade_compressed_chunk(meta, data)
-                                            chunk_entry = cache.add_chunk(
-                                                chunk_id,
-                                                meta,
-                                                data,
-                                                stats=archive.stats,
-                                                wait=False,
-                                                compress=False,
-                                                size=size,
-                                                ctype=meta["ctype"],
-                                                clevel=meta["clevel"],
-                                                ro_type=ROBJ_FILE_STREAM,
-                                            )
-                                        elif args.recompress == "always":
-                                            # always decompress and re-compress file data chunks
-                                            meta, data = other_manifest.repo_objs.parse(
-                                                chunk_id, cdata, ro_type=ROBJ_FILE_STREAM
-                                            )
-                                            chunk_entry = cache.add_chunk(
-                                                chunk_id,
-                                                meta,
-                                                data,
-                                                stats=archive.stats,
-                                                wait=False,
-                                                ro_type=ROBJ_FILE_STREAM,
-                                            )
-                                        else:
-                                            raise ValueError(f"unsupported recompress mode: {args.recompress}")
-                                    cache.repository.async_response(wait=False)
-                                    chunks.append(chunk_entry)
-                                transfer_size += size
-                            else:
-                                if not dry_run:
-                                    chunk_entry = cache.reuse_chunk(chunk_id, size, archive.stats)
-                                    chunks.append(chunk_entry)
-                                present_size += size
+                        chunks, transfer, present = transfer_chunks(
+                            upgrader,
+                            other_repository,
+                            other_manifest,
+                            other_chunks,
+                            archive,
+                            cache,
+                            args.recompress,
+                            dry_run,
+                            args.chunker_params,
+                        )
                         if not dry_run:
                             item.chunks = chunks
                             archive.stats.nfiles += 1
+                        transfer_size += transfer
+                        present_size += present
                     if not dry_run:
                         item = upgrader.upgrade_item(item=item)
                         archive.add_item(item, show_progress=args.progress)
@@ -213,6 +266,7 @@ class TransferMixIn:
         This command transfers archives from one repository to another repository.
         Optionally, it can also upgrade the transferred data.
         Optionally, it can also recompress the transferred data.
+        Optionally, it can also re-chunk the transferred data using different chunker parameters.
 
         It is easiest (and fastest) to give ``--compression=COMPRESSION --recompress=never`` using
         the same COMPRESSION mode as in the SRC_REPO - borg will use that COMPRESSION for metadata (in
@@ -258,6 +312,10 @@ class TransferMixIn:
             borg --repo=DST_REPO transfer --other-repo=SRC_REPO --from-borg1 \\
                  --compress=zstd,3 --recompress=always
 
+            # to re-chunk using different chunker parameters:
+            borg --repo=DST_REPO transfer --other-repo=SRC_REPO \\
+                 --chunker-params=buzhash,19,23,21,4095
+
 
         """
         )
@@ -321,5 +379,16 @@ class TransferMixIn:
             "If no MODE is given, `always` will be used. "
             "If no MODE is given, `always` will be used. "
             'Not passing --recompress is equivalent to "--recompress never".',
             'Not passing --recompress is equivalent to "--recompress never".',
         )
         )
+        subparser.add_argument(
+            "--chunker-params",
+            metavar="PARAMS",
+            dest="chunker_params",
+            type=ChunkerParams,
+            default=None,
+            action=Highlander,
+            help="rechunk using given chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, "
+            "HASH_MASK_BITS, HASH_WINDOW_SIZE) or `default` to use the chunker defaults. "
+            "default: do not rechunk",
+        )
 
         define_archive_filters_group(subparser)
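For readers skimming the hunks above: the re-chunking path concatenates the source file's chunks
into one logical stream and runs a new chunker over it. A condensed sketch of that data flow
(not part of the commit; it reuses the same borg internals the diff imports and assumes the
manifest/key objects are already set up):

    from borg.archive import DownloadPipeline, cached_hash
    from borg.chunker import get_chunker
    from borg.constants import ROBJ_FILE_STREAM
    from borg.helpers import ChunkIteratorFileWrapper

    def rechunked_ids(other_manifest, other_chunks, key, chunker_params):
        """Yield (chunk_id, size) for file content re-chunked with chunker_params."""
        pipeline = DownloadPipeline(other_manifest.repository, other_manifest.repo_objs)
        # present the old chunk sequence as one contiguous file-like object
        file = ChunkIteratorFileWrapper(pipeline.fetch_many(other_chunks, ro_type=ROBJ_FILE_STREAM))
        chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=False)
        for chunk in chunker.chunkify(file):
            chunk_id, data = cached_hash(chunk, key.id_hash)  # hash once, reuse cached result
            yield chunk_id, len(data)

Because the chunk boundaries move, deduplication against the destination only applies where the
newly cut chunks happen to match existing ones; that is why transfer_chunks() checks
cache.seen_chunk() and falls back to cache.add_chunk() per new chunk.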

src/borg/testsuite/archiver/transfer_cmd_test.py  (+72 -3)

@@ -1,5 +1,7 @@
+import hashlib
 import json
 import os
+import random
 import re
 import stat
 import tarfile
@@ -8,12 +10,13 @@ from contextlib import contextmanager
 import pytest
 
 from ...constants import *  # NOQA
+from ...helpers import open_item
 from ...helpers.time import parse_timestamp
-from ...helpers.parseformat import parse_file_size
+from ...helpers.parseformat import parse_file_size, ChunkerParams
 from ..platform_test import is_win32
-from . import cmd, create_test_files, RK_ENCRYPTION, open_archive, generate_archiver_tests
+from . import cmd, create_regular_file, create_test_files, RK_ENCRYPTION, open_archive, generate_archiver_tests
 
-pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary")  # NOQA
+pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote")  # NOQA
 
 
 def test_transfer_upgrade(archivers, request, monkeypatch):
@@ -285,9 +288,11 @@ def setup_repos(archiver, mp):
     when the context manager is exited, archiver will work with REPO2 (so the transfer can be run).
     """
     original_location = archiver.repository_location
+    original_path = archiver.repository_path
 
     mp.setenv("BORG_PASSPHRASE", "pw1")
     archiver.repository_location = original_location + "1"
+    archiver.repository_path = original_path + "1"
     cmd(archiver, "repo-create", RK_ENCRYPTION)
 
     other_repo1 = f"--other-repo={original_location}1"
@@ -296,6 +301,7 @@ def setup_repos(archiver, mp):
     mp.setenv("BORG_PASSPHRASE", "pw2")
     mp.setenv("BORG_PASSPHRASE", "pw2")
     mp.setenv("BORG_OTHER_PASSPHRASE", "pw1")
     mp.setenv("BORG_OTHER_PASSPHRASE", "pw1")
     archiver.repository_location = original_location + "2"
     archiver.repository_location = original_location + "2"
+    archiver.repository_path = original_path + "2"
     cmd(archiver, "repo-create", RK_ENCRYPTION, other_repo1)
     cmd(archiver, "repo-create", RK_ENCRYPTION, other_repo1)
 
 
 
 
@@ -400,3 +406,66 @@ def test_transfer_recompress(archivers, request, monkeypatch, recompress_mode):
         # We allow a small percentage difference to account for metadata changes.
         size_diff_percent = abs(source_size - dest_size) / source_size * 100
         assert size_diff_percent < 5, f"dest_size ({dest_size}) should be similar as source_size ({source_size})."
+
+
+def test_transfer_rechunk(archivers, request, monkeypatch):
+    """Test transfer with re-chunking"""
+    archiver = request.getfixturevalue(archivers)
+
+    BLKSIZE = 4096
+    source_chunker_params = "buzhash,19,23,21,4095"  # default buzhash chunks
+    dest_chunker_params = f"fixed,{BLKSIZE}"  # fixed chunk size
+
+    with setup_repos(archiver, monkeypatch) as other_repo1:
+        contents_1 = random.randbytes(1 * BLKSIZE)
+        contents_255 = random.randbytes(255 * BLKSIZE)
+        contents_1024 = random.randbytes(1024 * BLKSIZE)
+        create_regular_file(archiver.input_path, "file_1", contents=contents_1)
+        create_regular_file(archiver.input_path, "file_256", contents=contents_255 + contents_1)
+        create_regular_file(archiver.input_path, "file_1280", contents=contents_1024 + contents_255 + contents_1)
+
+        cmd(archiver, "create", f"--chunker-params={source_chunker_params}", "archive", "input")
+
+        # Get metadata from source archive
+        source_info_json = cmd(archiver, "info", "--json", "archive")
+        source_info = json.loads(source_info_json)
+        source_archive = source_info["archives"][0]
+        source_chunker_params_info = source_archive["chunker_params"]
+
+        # Calculate SHA256 hashes of file contents from source archive
+        source_archive_obj, source_repo = open_archive(archiver.repository_path, "archive")
+        with source_repo:
+            source_file_hashes = {}
+            for item in source_archive_obj.iter_items():
+                if hasattr(item, "chunks"):  # Only process regular files with chunks
+                    f = open_item(source_archive_obj, item)
+                    content = f.read(10 * 1024 * 1024)  # Read up to 10 MB
+                    source_file_hashes[item.path] = hashlib.sha256(content).hexdigest()
+
+    # Transfer with rechunking
+    cmd(archiver, "transfer", other_repo1, f"--chunker-params={dest_chunker_params}")
+
+    # Get metadata from destination archive
+    dest_info_json = cmd(archiver, "info", "--json", "archive")
+    dest_info = json.loads(dest_info_json)
+    dest_archive = dest_info["archives"][0]
+    dest_chunker_params_info = dest_archive["chunker_params"]
+
+    # chunker params in metadata must reflect the chunker params given on the CLI
+    assert tuple(source_chunker_params_info) == ChunkerParams(source_chunker_params)
+    assert tuple(dest_chunker_params_info) == ChunkerParams(dest_chunker_params)
+
+    # Compare file hashes between source and destination archives, also check expected chunk counts.
+    dest_archive_obj, dest_repo = open_archive(archiver.repository_path, "archive")
+    with dest_repo:
+        for item in dest_archive_obj.iter_items():
+            if hasattr(item, "chunks"):  # Only process regular files with chunks
+                # Verify expected chunk count for each file
+                expected_chunk_count = {"input/file_1": 1, "input/file_256": 256, "input/file_1280": 1280}[item.path]
+                assert len(item.chunks) == expected_chunk_count
+                f = open_item(dest_archive_obj, item)
+                content = f.read(10 * 1024 * 1024)  # Read up to 10 MB
+                dest_hash = hashlib.sha256(content).hexdigest()
+                # Verify that the file hash is identical to the source
+                assert item.path in source_file_hashes, f"File {item.path} not found in source archive"
+                assert dest_hash == source_file_hashes[item.path], f"Content hash mismatch for {item.path}"
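The test's metadata assertions hinge on ChunkerParams() turning a CLI spec string into the tuple
stored in archive metadata. For reference, a small sketch of the expected equivalences (my
reading of the parser; the exact tuple shapes are an assumption, not asserted by this commit):

    from borg.helpers.parseformat import ChunkerParams

    # buzhash spec: ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE
    assert ChunkerParams("buzhash,19,23,21,4095") == ("buzhash", 19, 23, 21, 4095)
    # fixed spec: ALGO, BLOCK_SIZE (an optional header size may follow)
    assert ChunkerParams("fixed,4096") == ("fixed", 4096)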

src/borg/upgrade.py  (+17 -7)

@@ -10,8 +10,8 @@ logger = create_logger(__name__)
 
 
 class UpgraderNoOp:
-    def __init__(self, *, cache):
-        pass
+    def __init__(self, *, cache, args):
+        self.args = args
 
     def new_archive(self, *, archive):
         pass
@@ -37,14 +37,19 @@ class UpgraderNoOp:
         ):
             if hasattr(metadata, attr):
                 new_metadata[attr] = getattr(metadata, attr)
+        rechunking = self.args.chunker_params is not None
+        if rechunking:
+            # if we are rechunking while transferring, we take the new chunker_params.
+            new_metadata["chunker_params"] = self.args.chunker_params
         return new_metadata
 
 
 class UpgraderFrom12To20:
     borg1_header_fmt = Struct(">I")
 
-    def __init__(self, *, cache):
+    def __init__(self, *, cache, args):
         self.cache = cache
+        self.args = args
 
     def new_archive(self, *, archive):
         self.archive = archive
@@ -144,10 +149,15 @@ class UpgraderFrom12To20:
         for attr in ("hostname", "username", "comment", "chunker_params"):
             if hasattr(metadata, attr):
                 new_metadata[attr] = getattr(metadata, attr)
-        if chunker_params := new_metadata.get("chunker_params"):
-            if len(chunker_params) == 4 and isinstance(chunker_params[0], int):
-                # this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
-                new_metadata["chunker_params"] = (CH_BUZHASH,) + chunker_params
+        rechunking = self.args.chunker_params is not None
+        if rechunking:
+            # if we are rechunking while transferring, we take the new chunker_params.
+            new_metadata["chunker_params"] = self.args.chunker_params
+        else:
+            if chunker_params := new_metadata.get("chunker_params"):
+                if len(chunker_params) == 4 and isinstance(chunker_params[0], int):
+                    # this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
+                    new_metadata["chunker_params"] = (CH_BUZHASH,) + chunker_params
         # old borg used UTC timestamps, but did not have the explicit tz offset in them.
         for attr in ("time", "time_end"):
             if hasattr(metadata, attr):
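The --chunker-params override is applied identically in both upgraders; UpgraderFrom12To20
additionally normalizes legacy borg < 1.2 tuples when not re-chunking. A condensed sketch of
the From12To20 rule (not part of the commit; CH_BUZHASH is borg's "buzhash" constant):

    def effective_chunker_params(metadata_params, cli_params):
        """cli_params comes from --chunker-params; it wins when re-chunking."""
        if cli_params is not None:
            return cli_params  # re-chunking: record the parameters actually used
        if metadata_params and len(metadata_params) == 4 and isinstance(metadata_params[0], int):
            # borg < 1.2 stored a 4-tuple without the algorithm name; only buzhash existed then
            return ("buzhash",) + tuple(metadata_params)
        return metadata_params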