浏览代码

repository3.check: implement --repair

Tests were a bit tricky as there is validation on 2 layers now:
- repository3 does an xxh64 check, finds most corruptions already
- on the archives level, borg also does an even stronger cryptographic check
Thomas Waldmann 9 月之前
父节点
当前提交
bfbf3ba7aa
共有 2 个文件被更改,包括 101 次插入和 26 次删除
  1. +42 -20
      src/borg/repository3.py
  2. +59 -6
      src/borg/testsuite/archiver/check_cmd.py

+ 42 - 20
src/borg/repository3.py

@@ -216,7 +216,26 @@ class Repository3:
             obj_corrupted = True
             logger.error(f"Repo object {info.name} is corrupted: {msg}")
 
-        # TODO: implement repair, progress indicator, partial checks, ...
+        def check_object(obj):
+            """Check if obj looks valid."""
+            hdr_size = RepoObj.obj_header.size
+            obj_size = len(obj)
+            if obj_size >= hdr_size:
+                hdr = RepoObj.ObjHeader(*RepoObj.obj_header.unpack(obj[:hdr_size]))
+                meta = obj[hdr_size : hdr_size + hdr.meta_size]
+                if hdr.meta_size != len(meta):
+                    log_error("metadata size incorrect.")
+                elif hdr.meta_hash != xxh64(meta):
+                    log_error("metadata does not match checksum.")
+                data = obj[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size]
+                if hdr.data_size != len(data):
+                    log_error("data size incorrect.")
+                elif hdr.data_hash != xxh64(data):
+                    log_error("data does not match checksum.")
+            else:
+                log_error("too small.")
+
+        # TODO: progress indicator, partial checks, ...
         mode = "full"
         logger.info("Starting repository check")
         objs_checked = objs_errors = 0
@@ -224,40 +243,43 @@ class Repository3:
         try:
             for info in infos:
                 self._lock_refresh()
-                obj_corrupted = False
                 key = "data/%s" % info.name
                 try:
                     obj = self.store.load(key)
                 except StoreObjectNotFound:
                     # looks like object vanished since store.list(), ignore that.
                     continue
-                hdr_size = RepoObj.obj_header.size
-                obj_size = len(obj)
-                if obj_size >= hdr_size:
-                    hdr = RepoObj.ObjHeader(*RepoObj.obj_header.unpack(obj[:hdr_size]))
-                    meta = obj[hdr_size : hdr_size + hdr.meta_size]
-                    if hdr.meta_size != len(meta):
-                        log_error("metadata size incorrect.")
-                    elif hdr.meta_hash != xxh64(meta):
-                        log_error("metadata does not match checksum.")
-                    data = obj[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size]
-                    if hdr.data_size != len(data):
-                        log_error("data size incorrect.")
-                    elif hdr.data_hash != xxh64(data):
-                        log_error("data does not match checksum.")
-                else:
-                    log_error("too small.")
+                obj_corrupted = False
+                check_object(obj)
                 objs_checked += 1
                 if obj_corrupted:
                     objs_errors += 1
+                    if repair:
+                        # if it is corrupted, we can't do much except getting rid of it.
+                        # but let's just retry loading it, in case the error goes away.
+                        try:
+                            obj = self.store.load(key)
+                        except StoreObjectNotFound:
+                            log_error("existing object vanished.")
+                        else:
+                            obj_corrupted = False
+                            check_object(obj)
+                            if obj_corrupted:
+                                log_error("reloading did not help, deleting it!")
+                                self.store.delete(key)
+                            else:
+                                log_error("reloading did help, inconsistent behaviour detected!")
         except StoreObjectNotFound:
             # it can be that there is no "data/" at all, then it crashes when iterating infos.
             pass
         logger.info(f"Checked {objs_checked} repository objects, {objs_errors} errors.")
         if objs_errors == 0:
-            logger.info("Finished %s repository check, no problems found.", mode)
+            logger.info(f"Finished {mode} repository check, no problems found.")
         else:
-            logger.error("Finished %s repository check, errors found.", mode)
+            if repair:
+                logger.info(f"Finished {mode} repository check, errors found and repaired.")
+            else:
+                logger.error(f"Finished {mode} repository check, errors found.")
         return objs_errors == 0 or repair
 
     def scan_low_level(self, segment=None, offset=None):

+ 59 - 6
src/borg/testsuite/archiver/check_cmd.py

@@ -360,6 +360,55 @@ def test_extra_chunks(archivers, request):
 
 @pytest.mark.parametrize("init_args", [["--encryption=repokey-aes-ocb"], ["--encryption", "none"]])
 def test_verify_data(archivers, request, init_args):
+    archiver = request.getfixturevalue(archivers)
+    if archiver.get_kind() != "local":
+        pytest.skip("only works locally, patches objects")
+
+    # it's tricky to test the cryptographic data verification, because usually already the
+    # repository-level xxh64 hash fails to verify. So we use a fake one that doesn't.
+    # note: it only works like tested here for a highly engineered data corruption attack,
+    # because with accidental corruption, usually already the xxh64 low-level check fails.
+    def fake_xxh64(data, seed=0):
+        return b"fakefake"
+
+    import borg.repoobj
+    import borg.repository3
+
+    with patch.object(borg.repoobj, "xxh64", fake_xxh64), patch.object(borg.repository3, "xxh64", fake_xxh64):
+        check_cmd_setup(archiver)
+        shutil.rmtree(archiver.repository_path)
+        cmd(archiver, "rcreate", *init_args)
+        create_src_archive(archiver, "archive1")
+        archive, repository = open_archive(archiver.repository_path, "archive1")
+        with repository:
+            for item in archive.iter_items():
+                if item.path.endswith(src_file):
+                    chunk = item.chunks[-1]
+                    data = repository.get(chunk.id)
+                    data = data[0:123] + b"x" + data[123:]
+                    repository.put(chunk.id, data)
+                    break
+            repository.commit(compact=False)
+
+        # the normal archives check does not read file content data.
+        cmd(archiver, "check", "--archives-only", exit_code=0)
+        # but with --verify-data, it does and notices the issue.
+        output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
+        assert f"{bin_to_hex(chunk.id)}, integrity error" in output
+
+        # repair (heal is tested in another test)
+        output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0)
+        assert f"{bin_to_hex(chunk.id)}, integrity error" in output
+        assert f"{src_file}: New missing file chunk detected" in output
+
+        # run with --verify-data again, all fine now (file was patched with a replacement chunk).
+        cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=0)
+
+
+@pytest.mark.parametrize("init_args", [["--encryption=repokey-aes-ocb"], ["--encryption", "none"]])
+def test_corrupted_file_chunk(archivers, request, init_args):
+    ## similar to test_verify_data, but here we let the low level repository-only checks discover the issue.
+
     archiver = request.getfixturevalue(archivers)
     check_cmd_setup(archiver)
     shutil.rmtree(archiver.repository_path)
@@ -371,19 +420,23 @@ def test_verify_data(archivers, request, init_args):
             if item.path.endswith(src_file):
                 chunk = item.chunks[-1]
                 data = repository.get(chunk.id)
-                data = data[0:100] + b"x" + data[101:]
+                data = data[0:123] + b"x" + data[123:]
                 repository.put(chunk.id, data)
                 break
         repository.commit(compact=False)
-    cmd(archiver, "check", exit_code=1)
-    output = cmd(archiver, "check", "--verify-data", exit_code=1)
-    assert bin_to_hex(chunk.id) + ", integrity error" in output
+
+    # the normal check checks all repository objects and the xxh64 checksum fails.
+    output = cmd(archiver, "check", "--repository-only", exit_code=1)
+    assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
 
     # repair (heal is tested in another test)
-    output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0)
-    assert bin_to_hex(chunk.id) + ", integrity error" in output
+    output = cmd(archiver, "check", "--repair", exit_code=0)
+    assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
     assert f"{src_file}: New missing file chunk detected" in output
 
+    # run normal check again, all fine now (file was patched with a replacement chunk).
+    cmd(archiver, "check", "--repository-only", exit_code=0)
+
 
 def test_empty_repository(archivers, request):
     archiver = request.getfixturevalue(archivers)