Browse Source

Merge pull request #9024 from ThomasWaldmann/transfer-corrupts-src-repo

fix borg transfer corrupting the src repo index
TW 3 weeks ago
parent
commit
3bff0c31cf

+ 12 - 0
src/borg/hashindex.pyx

@@ -201,6 +201,10 @@ class NSIndex1(HTProxyMixin, MutableMapping):
         used = len(self.ht)
         header_bytes = struct.pack(self.HEADER_FMT, self.MAGIC, used, used, self.KEY_SIZE, self.VALUE_SIZE)
         fd.write(header_bytes)
+        # record the header as a separate integrity-hash part if supported
+        hash_part = getattr(fd, "hash_part", None)
+        if hash_part:
+            hash_part("HashHeader")
         count = 0
         for key, _ in self.ht.items():
             value = self.ht._get_raw(key)
@@ -214,6 +218,10 @@ class NSIndex1(HTProxyMixin, MutableMapping):
         header_bytes = fd.read(header_size)
         if len(header_bytes) < header_size:
             raise ValueError(f"Invalid file, file is too short (header).")
+        # verify the header as a separate integrity-hash part if supported
+        hash_part = getattr(fd, "hash_part", None)
+        if hash_part:
+            hash_part("HashHeader")
         magic, entries, buckets, ksize, vsize = struct.unpack(self.HEADER_FMT, header_bytes)
         if magic != self.MAGIC:
             raise ValueError(f"Invalid file, magic {self.MAGIC.decode()} not found.")
@@ -228,6 +236,10 @@ class NSIndex1(HTProxyMixin, MutableMapping):
         for i in range(buckets):
             key = fd.read(ksize)
             value = fd.read(vsize)
+            if value.startswith(b'\xFF\xFF\xFF\xFF'):  # LE for 0xffffffff (empty/unused bucket)
+                continue
+            if value.startswith(b'\xFE\xFF\xFF\xFF'):  # LE for 0xfffffffe (deleted/tombstone bucket)
+                continue
             self.ht._set_raw(key, value)
         pos = fd.tell()
         assert pos == end_of_file

+ 5 - 15
src/borg/legacyrepository.py

@@ -515,23 +515,13 @@ class LegacyRepository:
             return
         return integrity[key]
 
-    def open_index(self, transaction_id, auto_recover=True):
+    def open_index(self, transaction_id):
         if transaction_id is None:
             return NSIndex1()
         index_path = os.path.join(self.path, "index.%d" % transaction_id)
         integrity_data = self._read_integrity(transaction_id, "index")
-        try:
-            with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd:
-                return NSIndex1.read(fd)
-        except (ValueError, OSError, FileIntegrityError) as exc:
-            logger.warning("Repository index missing or corrupted, trying to recover from: %s", exc)
-            os.unlink(index_path)
-            if not auto_recover:
-                raise
-            self.prepare_txn(self.get_transaction_id())
-            # don't leave an open transaction around
-            self.commit(compact=False)
-            return self.open_index(self.get_transaction_id())
+        with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd:
+            return NSIndex1.read(fd)
 
     def _unpack_hints(self, transaction_id):
         hints_path = os.path.join(self.path, "hints.%d" % transaction_id)
@@ -560,11 +550,11 @@ class LegacyRepository:
                 raise
         if not self.index or transaction_id is None:
             try:
-                self.index = self.open_index(transaction_id, auto_recover=False)
+                self.index = self.open_index(transaction_id)
             except (ValueError, OSError, FileIntegrityError) as exc:
                 logger.warning("Checking repository transaction due to previous error: %s", exc)
                 self.check_transaction()
-                self.index = self.open_index(transaction_id, auto_recover=False)
+                self.index = self.open_index(transaction_id)
         if transaction_id is None:
             self.segments = {}  # XXX bad name: usage_count_of_segment_x = self.segments[x]
             self.compact = FreeSpace()  # XXX bad name: freeable_space_of_segment_x = self.compact[x]

+ 56 - 0
src/borg/testsuite/archiver/transfer_cmd_test.py

@@ -1,3 +1,4 @@
+import glob
 import hashlib
 import json
 import os
@@ -469,3 +470,58 @@ def test_transfer_rechunk(archivers, request, monkeypatch):
                 # Verify that the file hash is identical to the source
                 assert item.path in source_file_hashes, f"File {item.path} not found in source archive"
                 assert dest_hash == source_file_hashes[item.path], f"Content hash mismatch for {item.path}"
+
+
+def test_issue_9022(archivers, request, monkeypatch):
+    """
+    Regression test for borgbackup/borg#9022: After "borg transfer --from-borg1",
+    the source Borg 1.x repository index file must still exist and be unchanged.
+    """
+    archiver = request.getfixturevalue(archivers)
+    if archiver.get_kind() in ["remote", "binary"]:
+        pytest.skip("only works locally")
+
+    # Prepare source (borg 1.2) repo from tarball next to this test file
+    repo12_tar = os.path.join(os.path.dirname(__file__), "repo12.tar.gz")
+
+    original_location = archiver.repository_location
+    extract_dir = f"{original_location}1"
+    os.makedirs(extract_dir)
+    with tarfile.open(repo12_tar) as tf:
+        tf.extractall(extract_dir)
+
+    def index_meta(repo_path):
+        index_files = sorted(glob.glob(os.path.join(repo_path, "index.*")))
+        assert len(index_files) == 1, f"Expected exactly 1 index file in {repo_path}, found {len(index_files)}"
+        st = os.stat(index_files[0])
+        # Return (mtime_ns, size, inode). Use fallbacks where attributes may not exist on some platforms.
+        mtime_ns = getattr(st, "st_mtime_ns", int(st.st_mtime * 1e9))
+        inode = getattr(st, "st_ino", None)
+        return (mtime_ns, st.st_size, inode)
+
+    # Record pre-transfer index file metadata
+    pre_meta = index_meta(extract_dir)
+
+    other_repo1 = f"--other-repo={extract_dir}"
+
+    # Destination repo where we transfer to (borg 2 repo)
+    archiver.repository_location = f"{original_location}2"
+
+    # Set passphrases: repo12 testdata uses "waytooeasyonlyfortests"
+    monkeypatch.setenv("BORG_PASSPHRASE", "pw2")
+    monkeypatch.setenv("BORG_OTHER_PASSPHRASE", "waytooeasyonlyfortests")
+    # For this test, we must not weaken KDF, otherwise borg2 couldn't decrypt the borg1 key
+    monkeypatch.setenv("BORG_TESTONLY_WEAKEN_KDF", "0")  # via monkeypatch, so it is undone after the test
+
+    # Create destination repo and run transfer from borg1 source
+    cmd(archiver, "repo-create", RK_ENCRYPTION, other_repo1, "--from-borg1")
+    cmd(archiver, "transfer", other_repo1, "--from-borg1")
+
+    # After transfer, the source borg1 index file must still be the same file (same mtime, size, inode).
+    post_meta = index_meta(extract_dir)
+
+    assert post_meta == pre_meta, (
+        f"Index file metadata changed after transfer!\n"
+        f"Before: mtime_ns={pre_meta[0]}, size={pre_meta[1]}, inode={pre_meta[2]}\n"
+        f"After:  mtime_ns={post_meta[0]}, size={post_meta[1]}, inode={post_meta[2]}"
+    )

+ 0 - 24
src/borg/testsuite/legacyrepository_test.py

@@ -571,21 +571,6 @@ def test_unreadable_hints(repository):
         do_commit(repository)
 
 
-def test_index(repository):
-    make_auxiliary(repository)
-    with open(os.path.join(repository.path, "index.1"), "wb") as fd:
-        fd.write(b"123456789")
-    do_commit(repository)
-
-
-def test_index_outside_transaction(repository):
-    make_auxiliary(repository)
-    with open(os.path.join(repository.path, "index.1"), "wb") as fd:
-        fd.write(b"123456789")
-    with repository:
-        assert len(repository) == 1
-
-
 def _corrupt_index(repository):
     # HashIndex is able to detect incorrect headers and file lengths,
     # but on its own it can't tell if the data is correct.
@@ -601,15 +586,6 @@ def _corrupt_index(repository):
         fd.write(corrupted_index_data)
 
 
-def test_index_corrupted(repository):
-    make_auxiliary(repository)
-    _corrupt_index(repository)
-    with repository:
-        # data corruption is detected due to mismatching checksums, and fixed by rebuilding the index.
-        assert len(repository) == 1
-        assert pdchunk(repository.get(H(0))) == b"foo"
-
-
 def test_index_corrupted_without_integrity(repository):
     make_auxiliary(repository)
     _corrupt_index(repository)