
Merge pull request #8568 from ThomasWaldmann/remove-chunks-healthy

Item: remove .chunks_healthy, fixes #8559
TW 1 month ago
parent
commit
e12b3bb2f6

+ 2 - 2
docs/internals/frontends.rst

@@ -480,8 +480,8 @@ Refer to the *borg list* documentation for the available keys and their meaning.
 
 
 Example (excerpt) of ``borg list --json-lines``::

-    {"type": "d", "mode": "drwxr-xr-x", "user": "user", "group": "user", "uid": 1000, "gid": 1000, "path": "linux", "healthy": true, "target": "", "flags": null, "mtime": "2017-02-27T12:27:20.023407", "size": 0}
-    {"type": "d", "mode": "drwxr-xr-x", "user": "user", "group": "user", "uid": 1000, "gid": 1000, "path": "linux/baz", "healthy": true, "target": "", "flags": null, "mtime": "2017-02-27T12:27:20.585407", "size": 0}
+    {"type": "d", "mode": "drwxr-xr-x", "user": "user", "group": "user", "uid": 1000, "gid": 1000, "path": "linux", "target": "", "flags": null, "mtime": "2017-02-27T12:27:20.023407", "size": 0}
+    {"type": "d", "mode": "drwxr-xr-x", "user": "user", "group": "user", "uid": 1000, "gid": 1000, "path": "linux/baz", "target": "", "flags": null, "mtime": "2017-02-27T12:27:20.585407", "size": 0}
 
 
 
 
 Archive Differencing
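
Since the ``healthy`` key is gone from the ``--json-lines`` item output, scripts should no longer rely on it. A minimal consumer-side sketch, not part of this change; the archive name and exact invocation are illustrative only::

    import json
    import subprocess

    # placeholder archive name; adapt the command line to your setup
    proc = subprocess.run(
        ["borg", "list", "myarchive", "--json-lines"], capture_output=True, text=True, check=True
    )
    for line in proc.stdout.splitlines():
        item = json.loads(line)
        # the "healthy" key no longer exists; use .get() for keys that may be absent
        print(item["path"], item["type"], item.get("size", 0))
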

+ 46 - 102
src/borg/archive.py

@@ -273,14 +273,16 @@ class DownloadPipeline:
         """
         """
         self.hlids_preloaded = set()
         self.hlids_preloaded = set()
         unpacker = msgpack.Unpacker(use_list=False)
         unpacker = msgpack.Unpacker(use_list=False)
-        for data in self.fetch_many(ids, ro_type=ROBJ_ARCHIVE_STREAM):
+        for data in self.fetch_many(ids, ro_type=ROBJ_ARCHIVE_STREAM, replacement_chunk=False):
+            if data is None:
+                continue  # archive stream chunk missing
             unpacker.feed(data)
             for _item in unpacker:
                 item = Item(internal_dict=_item)
                 if filter is None or filter(item):
                     if "chunks" in item:
                         item.chunks = [ChunkListEntry(*e) for e in item.chunks]
-                    if "chunks_healthy" in item:
+                    if "chunks_healthy" in item:  # legacy
                         item.chunks_healthy = [ChunkListEntry(*e) for e in item.chunks_healthy]
                     yield item
 
 
@@ -312,10 +314,32 @@ class DownloadPipeline:
                 self.repository.preload([c.id for c in item.chunks])
         return preload_chunks
 
 
-    def fetch_many(self, ids, is_preloaded=False, ro_type=None):
+    def fetch_many(self, chunks, is_preloaded=False, ro_type=None, replacement_chunk=True):
         assert ro_type is not None
-        for id_, cdata in zip(ids, self.repository.get_many(ids, is_preloaded=is_preloaded)):
-            _, data = self.repo_objs.parse(id_, cdata, ro_type=ro_type)
+        ids = []
+        sizes = []
+        if all(isinstance(chunk, ChunkListEntry) for chunk in chunks):
+            for chunk in chunks:
+                ids.append(chunk.id)
+                sizes.append(chunk.size)
+        elif all(isinstance(chunk, bytes) for chunk in chunks):
+            ids = list(chunks)
+            sizes = [None] * len(ids)
+        else:
+            raise TypeError(f"unsupported or mixed element types: {chunks}")
+        for id, size, cdata in zip(
+            ids, sizes, self.repository.get_many(ids, is_preloaded=is_preloaded, raise_missing=False)
+        ):
+            if cdata is None:
+                if replacement_chunk and size is not None:
+                    logger.error(f"repository object {bin_to_hex(id)} missing, returning {size} zero bytes.")
+                    data = zeros[:size]  # return an all-zero replacement chunk of correct size
+                else:
+                    logger.error(f"repository object {bin_to_hex(id)} missing, returning None.")
+                    data = None
+            else:
+                _, data = self.repo_objs.parse(id, cdata, ro_type=ro_type)
+            assert size is None or len(data) == size
             yield data
 
 
 
 
@@ -762,7 +786,6 @@ Duration: {0.duration}
             # if a previous extraction was interrupted between setting the mtime and setting non-default flags.
             # if a previous extraction was interrupted between setting the mtime and setting non-default flags.
             return True
             return True
 
 
-        has_damaged_chunks = "chunks_healthy" in item
         if dry_run or stdout:
         if dry_run or stdout:
             with self.extract_helper(item, "", hlm, dry_run=dry_run or stdout) as hardlink_set:
             with self.extract_helper(item, "", hlm, dry_run=dry_run or stdout) as hardlink_set:
                 if not hardlink_set:
                 if not hardlink_set:
@@ -771,9 +794,7 @@ Duration: {0.duration}
                     # it would get stuck.
                     # it would get stuck.
                     if "chunks" in item:
                     if "chunks" in item:
                         item_chunks_size = 0
                         item_chunks_size = 0
-                        for data in self.pipeline.fetch_many(
-                            [c.id for c in item.chunks], is_preloaded=True, ro_type=ROBJ_FILE_STREAM
-                        ):
+                        for data in self.pipeline.fetch_many(item.chunks, is_preloaded=True, ro_type=ROBJ_FILE_STREAM):
                             if pi:
                             if pi:
                                 pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                                 pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                             if stdout:
                             if stdout:
@@ -789,8 +810,6 @@ Duration: {0.duration}
                                         item_size, item_chunks_size
                                         item_size, item_chunks_size
                                     )
                                     )
                                 )
                                 )
-            if has_damaged_chunks:
-                raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.")
             return
             return
 
 
         dest = self.cwd
         dest = self.cwd
@@ -824,8 +843,7 @@ Duration: {0.duration}
                 with backup_io("open"):
                 with backup_io("open"):
                     fd = open(path, "wb")
                     fd = open(path, "wb")
                 with fd:
                 with fd:
-                    ids = [c.id for c in item.chunks]
-                    for data in self.pipeline.fetch_many(ids, is_preloaded=True, ro_type=ROBJ_FILE_STREAM):
+                    for data in self.pipeline.fetch_many(item.chunks, is_preloaded=True, ro_type=ROBJ_FILE_STREAM):
                         if pi:
                         if pi:
                             pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                             pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                         with backup_io("write"):
                         with backup_io("write"):
@@ -845,8 +863,6 @@ Duration: {0.duration}
                         raise BackupError(
                         raise BackupError(
                             f"Size inconsistency detected: size {item_size}, chunks size {item_chunks_size}"
                             f"Size inconsistency detected: size {item_size}, chunks size {item_chunks_size}"
                         )
                         )
-                if has_damaged_chunks:
-                    raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.")
             return
             return
         with backup_io:
         with backup_io:
             # No repository access beyond this point.
             # No repository access beyond this point.
@@ -1010,8 +1026,8 @@ Duration: {0.duration}
                 path,
                 path,
                 item1,
                 item1,
                 item2,
                 item2,
-                archive1.pipeline.fetch_many([c.id for c in item1.get("chunks", [])], ro_type=ROBJ_FILE_STREAM),
-                archive2.pipeline.fetch_many([c.id for c in item2.get("chunks", [])], ro_type=ROBJ_FILE_STREAM),
+                archive1.pipeline.fetch_many(item1.get("chunks", []), ro_type=ROBJ_FILE_STREAM),
+                archive2.pipeline.fetch_many(item2.get("chunks", []), ro_type=ROBJ_FILE_STREAM),
                 can_compare_chunk_ids=can_compare_chunk_ids,
                 can_compare_chunk_ids=can_compare_chunk_ids,
             )
             )
 
 
@@ -1159,10 +1175,6 @@ class ChunksProcessor:
                 return chunk_entry
                 return chunk_entry
 
 
         item.chunks = []
         item.chunks = []
-        # if we rechunkify, we'll get a fundamentally different chunks list, thus we need
-        # to get rid of .chunks_healthy, as it might not correspond to .chunks any more.
-        if self.rechunkify and "chunks_healthy" in item:
-            del item.chunks_healthy
         for chunk in chunk_iter:
         for chunk in chunk_iter:
             chunk_entry = chunk_processor(chunk)
             chunk_entry = chunk_processor(chunk)
             item.chunks.append(chunk_entry)
             item.chunks.append(chunk_entry)
@@ -1779,13 +1791,10 @@ class ArchiveChecker:
         if defect_chunks:
         if defect_chunks:
             if self.repair:
             if self.repair:
                 # if we kill the defect chunk here, subsequent actions within this "borg check"
                 # if we kill the defect chunk here, subsequent actions within this "borg check"
-                # run will find missing chunks and replace them with all-zero replacement
-                # chunks and flag the files as "repaired".
-                # if another backup is done later and the missing chunks get backed up again,
-                # a "borg check" afterwards can heal all files where this chunk was missing.
+                # run will find missing chunks.
                 logger.warning(
                 logger.warning(
-                    "Found defect chunks. They will be deleted now, so affected files can "
-                    "get repaired now and maybe healed later."
+                    "Found defect chunks and will delete them now. "
+                    "Reading files referencing these chunks will result in an I/O error."
                 )
                 )
                 for defect_chunk in defect_chunks:
                 for defect_chunk in defect_chunks:
                     # remote repo (ssh): retry might help for strange network / NIC / RAM errors
                     # remote repo (ssh): retry might help for strange network / NIC / RAM errors
@@ -1805,10 +1814,7 @@ class ArchiveChecker:
                     else:
                     else:
                         logger.warning("chunk %s not deleted, did not consistently fail.", bin_to_hex(defect_chunk))
                         logger.warning("chunk %s not deleted, did not consistently fail.", bin_to_hex(defect_chunk))
             else:
             else:
-                logger.warning(
-                    "Found defect chunks. With --repair, they would get deleted, so affected "
-                    "files could get repaired then and maybe healed later."
-                )
+                logger.warning("Found defect chunks. With --repair, they would get deleted.")
                 for defect_chunk in defect_chunks:
                 for defect_chunk in defect_chunks:
                     logger.debug("chunk %s is defect.", bin_to_hex(defect_chunk))
                     logger.debug("chunk %s is defect.", bin_to_hex(defect_chunk))
         log = logger.error if errors else logger.info
         log = logger.error if errors else logger.info
@@ -1919,80 +1925,18 @@ class ArchiveChecker:
                     self.repository.put(id_, cdata)
                     self.repository.put(id_, cdata)
 
 
         def verify_file_chunks(archive_name, item):
         def verify_file_chunks(archive_name, item):
-            """Verifies that all file chunks are present.
-
-            Missing file chunks will be replaced with new chunks of the same length containing all zeros.
-            If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one.
-            """
-
-            def replacement_chunk(size):
-                chunk = Chunk(None, allocation=CH_ALLOC, size=size)
-                chunk_id, data = cached_hash(chunk, self.key.id_hash)
-                cdata = self.repo_objs.format(chunk_id, {}, data, ro_type=ROBJ_FILE_STREAM)
-                return chunk_id, size, cdata
-
+            """Verifies that all file chunks are present. Missing file chunks will be logged."""
             offset = 0
             offset = 0
-            chunk_list = []
-            chunks_replaced = False
-            has_chunks_healthy = "chunks_healthy" in item
-            chunks_current = item.chunks
-            chunks_healthy = item.chunks_healthy if has_chunks_healthy else chunks_current
-            if has_chunks_healthy and len(chunks_current) != len(chunks_healthy):
-                # should never happen, but there was issue #3218.
-                logger.warning(f"{archive_name}: {item.path}: Invalid chunks_healthy metadata removed!")
-                del item.chunks_healthy
-                has_chunks_healthy = False
-                chunks_healthy = chunks_current
-            for chunk_current, chunk_healthy in zip(chunks_current, chunks_healthy):
-                chunk_id, size = chunk_healthy
+            for chunk in item.chunks:
+                chunk_id, size = chunk
                 if chunk_id not in self.chunks:
                 if chunk_id not in self.chunks:
-                    # a chunk of the healthy list is missing
-                    if chunk_current == chunk_healthy:
-                        logger.error(
-                            "{}: {}: New missing file chunk detected (Byte {}-{}, Chunk {}). "
-                            "Replacing with all-zero chunk.".format(
-                                archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
-                            )
+                    logger.error(
+                        "{}: {}: Missing file chunk detected (Byte {}-{}, Chunk {}).".format(
+                            archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
                         )
                         )
-                        self.error_found = chunks_replaced = True
-                        chunk_id, size, cdata = replacement_chunk(size)
-                        add_reference(chunk_id, size, cdata)
-                    else:
-                        logger.info(
-                            "{}: {}: Previously missing file chunk is still missing (Byte {}-{}, Chunk {}). "
-                            "It has an all-zero replacement chunk already.".format(
-                                archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
-                            )
-                        )
-                        chunk_id, size = chunk_current
-                        if chunk_id not in self.chunks:
-                            logger.warning(
-                                "{}: {}: Missing all-zero replacement chunk detected (Byte {}-{}, Chunk {}). "
-                                "Generating new replacement chunk.".format(
-                                    archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
-                                )
-                            )
-                            self.error_found = chunks_replaced = True
-                            chunk_id, size, cdata = replacement_chunk(size)
-                            add_reference(chunk_id, size, cdata)
-                else:
-                    if chunk_current == chunk_healthy:
-                        pass  # normal case, all fine.
-                    else:
-                        logger.info(
-                            "{}: {}: Healed previously missing file chunk! (Byte {}-{}, Chunk {}).".format(
-                                archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
-                            )
-                        )
-                chunk_list.append([chunk_id, size])  # list-typed element as chunks_healthy is list-of-lists
+                    )
+                    self.error_found = True
                 offset += size
                 offset += size
-            if chunks_replaced and not has_chunks_healthy:
-                # if this is first repair, remember the correct chunk IDs, so we can maybe heal the file later
-                item.chunks_healthy = item.chunks
-            if has_chunks_healthy and chunk_list == chunks_healthy:
-                logger.info(f"{archive_name}: {item.path}: Completely healed previously damaged file!")
-                del item.chunks_healthy
-            item.chunks = chunk_list
             if "size" in item:
             if "size" in item:
                 item_size = item.size
                 item_size = item.size
                 item_chunks_size = item.get_size(from_chunks=True)
                 item_chunks_size = item.get_size(from_chunks=True)
@@ -2270,7 +2214,7 @@ class ArchiveRecreater:
         return chunk_entry
         return chunk_entry
 
 
     def iter_chunks(self, archive, target, chunks):
     def iter_chunks(self, archive, target, chunks):
-        chunk_iterator = archive.pipeline.fetch_many([chunk_id for chunk_id, _ in chunks], ro_type=ROBJ_FILE_STREAM)
+        chunk_iterator = archive.pipeline.fetch_many(chunks, ro_type=ROBJ_FILE_STREAM)
         if target.recreate_rechunkify:
         if target.recreate_rechunkify:
             # The target.chunker will read the file contents through ChunkIteratorFileWrapper chunk-by-chunk
             # The target.chunker will read the file contents through ChunkIteratorFileWrapper chunk-by-chunk
             # (does not load the entire file into memory)
             # (does not load the entire file into memory)
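
``DownloadPipeline.fetch_many()`` now accepts either a list of ``ChunkListEntry`` items (id, size) or a plain list of chunk IDs, and it only substitutes an all-zero replacement chunk when the size is known and ``replacement_chunk=True``; with ``replacement_chunk=False`` a missing object is yielded as ``None``. A minimal caller-side sketch, not part of this diff; ``archive`` and ``item`` are assumed to come from an opened borg Archive::

    from borg.constants import ROBJ_FILE_STREAM

    def read_file_bytes(archive, item):
        """Concatenate all chunks of one item, skipping chunks missing from the repository."""
        parts = []
        # item.chunks is a list of ChunkListEntry(id, size); callers no longer unpack the ids themselves
        for data in archive.pipeline.fetch_many(item.chunks, ro_type=ROBJ_FILE_STREAM, replacement_chunk=False):
            if data is None:  # repository object was missing; the caller decides how to handle the gap
                continue
            parts.append(data)
        return b"".join(parts)
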

+ 1 - 22
src/borg/archiver/check_cmd.py

@@ -168,28 +168,7 @@ class CheckMixIn:
 
 
         2. When checking the consistency and correctness of archives, repair mode might
            remove whole archives from the manifest if their archive metadata chunk is
-           corrupt or lost. On a chunk level (i.e. the contents of files), repair mode
-           will replace corrupt or lost chunks with a same-size replacement chunk of
-           zeroes. If a previously zeroed chunk reappears, repair mode will restore
-           this lost chunk using the new chunk.
-
-        Most steps taken by repair mode have a one-time effect on the repository, like
-        removing a lost archive from the repository. However, replacing a corrupt or
-        lost chunk with an all-zero replacement will have an ongoing effect on the
-        repository: When attempting to extract a file referencing an all-zero chunk,
-        the ``extract`` command will distinctly warn about it. The FUSE filesystem
-        created by the ``mount`` command will reject reading such a "zero-patched"
-        file unless a special mount option is given.
-
-        As mentioned earlier, Borg might be able to "heal" a "zero-patched" file in
-        repair mode, if all its previously lost chunks reappear (e.g. via a later
-        backup). This is achieved by Borg not only keeping track of the all-zero
-        replacement chunks, but also by keeping metadata about the lost chunks. In
-        repair mode Borg will check whether a previously lost chunk reappeared and will
-        replace the all-zero replacement chunk by the reappeared chunk. If all lost
-        chunks of a "zero-patched" file reappear, this effectively "heals" the file.
-        Consequently, if lost chunks were repaired earlier, it is advised to run
-        ``--repair`` a second time after creating some new backups.
+           corrupt or lost. Borg will also report files that reference missing chunks.
 
 
         If ``--repair --find-lost-archives`` is given, previously lost entries will
         be recreated in the archive directory. This is only possible before

+ 8 - 25
src/borg/archiver/compact_cmd.py

@@ -6,7 +6,7 @@ from ..archive import Archive
 from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo
 from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo
 from ..constants import *  # NOQA
 from ..constants import *  # NOQA
 from ..hashindex import ChunkIndex, ChunkIndexEntry
 from ..hashindex import ChunkIndex, ChunkIndexEntry
-from ..helpers import set_ec, EXIT_WARNING, EXIT_ERROR, format_file_size, bin_to_hex
+from ..helpers import set_ec, EXIT_ERROR, format_file_size, bin_to_hex
 from ..helpers import ProgressIndicatorPercent
 from ..helpers import ProgressIndicatorPercent
 from ..manifest import Manifest
 from ..manifest import Manifest
 from ..remote import RemoteRepository
 from ..remote import RemoteRepository
@@ -39,9 +39,7 @@ class ArchiveGarbageCollector:
         logger.info("Starting compaction / garbage collection...")
         logger.info("Starting compaction / garbage collection...")
         self.chunks = self.get_repository_chunks()
         self.chunks = self.get_repository_chunks()
         logger.info("Computing object IDs used by archives...")
         logger.info("Computing object IDs used by archives...")
-        (self.missing_chunks, self.reappeared_chunks, self.total_files, self.total_size, self.archives_count) = (
-            self.analyze_archives()
-        )
+        (self.missing_chunks, self.total_files, self.total_size, self.archives_count) = self.analyze_archives()
         self.report_and_delete()
         self.report_and_delete()
         self.save_chunk_index()
         self.save_chunk_index()
         logger.info("Finished compaction / garbage collection...")
         logger.info("Finished compaction / garbage collection...")
@@ -73,28 +71,24 @@ class ArchiveGarbageCollector:
             self.chunks.clear()  # we already have updated the repo cache in get_repository_chunks
             self.chunks.clear()  # we already have updated the repo cache in get_repository_chunks
         self.chunks = None  # nothing there (cleared!)
         self.chunks = None  # nothing there (cleared!)
 
 
-    def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:
-        """Iterate over all items in all archives, create the dicts id -> size of all used/wanted chunks."""
+    def analyze_archives(self) -> Tuple[Set, int, int, int]:
+        """Iterate over all items in all archives, create the dicts id -> size of all used chunks."""
 
 
-        def use_it(id, *, wanted=False):
+        def use_it(id):
             entry = self.chunks.get(id)
             entry = self.chunks.get(id)
             if entry is not None:
             if entry is not None:
                 # the chunk is in the repo, mark it used.
                 # the chunk is in the repo, mark it used.
                 self.chunks[id] = entry._replace(flags=entry.flags | ChunkIndex.F_USED)
                 self.chunks[id] = entry._replace(flags=entry.flags | ChunkIndex.F_USED)
-                if wanted:
-                    # chunk id is from chunks_healthy list: a lost chunk has re-appeared!
-                    reappeared_chunks.add(id)
             else:
             else:
                 # with --stats: we do NOT have this chunk in the repository!
                 # with --stats: we do NOT have this chunk in the repository!
                 # without --stats: we do not have this chunk or the chunks index is incomplete.
                 # without --stats: we do not have this chunk or the chunks index is incomplete.
                 missing_chunks.add(id)
                 missing_chunks.add(id)
 
 
         missing_chunks: set[bytes] = set()
         missing_chunks: set[bytes] = set()
-        reappeared_chunks: set[bytes] = set()
         archive_infos = self.manifest.archives.list(sort_by=["ts"])
         archive_infos = self.manifest.archives.list(sort_by=["ts"])
         num_archives = len(archive_infos)
         num_archives = len(archive_infos)
         pi = ProgressIndicatorPercent(
         pi = ProgressIndicatorPercent(
-            total=num_archives, msg="Computing used/wanted chunks %3.1f%%", step=0.1, msgid="compact.analyze_archives"
+            total=num_archives, msg="Computing used chunks %3.1f%%", step=0.1, msgid="compact.analyze_archives"
         )
         )
         total_size, total_files = 0, 0
         total_size, total_files = 0, 0
         for i, info in enumerate(archive_infos):
         for i, info in enumerate(archive_infos):
@@ -114,25 +108,14 @@ class ArchiveGarbageCollector:
                     for id, size in item.chunks:
                     for id, size in item.chunks:
                         total_size += size  # original, uncompressed file content size
                         total_size += size  # original, uncompressed file content size
                         use_it(id)
                         use_it(id)
-                    if "chunks_healthy" in item:
-                        # we also consider the chunks_healthy chunks as referenced - do not throw away
-                        # anything that borg check --repair might still need.
-                        for id, size in item.chunks_healthy:
-                            use_it(id, wanted=True)
         pi.finish()
         pi.finish()
-        return missing_chunks, reappeared_chunks, total_files, total_size, num_archives
+        return missing_chunks, total_files, total_size, num_archives
 
 
     def report_and_delete(self):
     def report_and_delete(self):
-        run_repair = " Run borg check --repair!"
-
         if self.missing_chunks:
         if self.missing_chunks:
-            logger.error(f"Repository has {len(self.missing_chunks)} missing objects." + run_repair)
+            logger.error(f"Repository has {len(self.missing_chunks)} missing objects!")
             set_ec(EXIT_ERROR)
             set_ec(EXIT_ERROR)
 
 
-        if self.reappeared_chunks:
-            logger.warning(f"{len(self.reappeared_chunks)} previously missing objects re-appeared!" + run_repair)
-            set_ec(EXIT_WARNING)
-
         logger.info("Cleaning archives directory from soft-deleted archives...")
         logger.info("Cleaning archives directory from soft-deleted archives...")
         archive_infos = self.manifest.archives.list(sort_by=["ts"], deleted=True)
         archive_infos = self.manifest.archives.list(sort_by=["ts"], deleted=True)
         for archive_info in archive_infos:
         for archive_info in archive_infos:

+ 3 - 3
src/borg/archiver/mount_cmds.py

@@ -104,9 +104,9 @@ class MountMixIn:
 
 
         - ``versions``: when used with a repository mount, this gives a merged, versioned
           view of the files in the archives. EXPERIMENTAL, layout may change in future.
-        - ``allow_damaged_files``: by default damaged files (where missing chunks were
-          replaced with runs of zeros by ``borg check --repair``) are not readable and
-          return EIO (I/O error). Set this option to read such files.
+        - ``allow_damaged_files``: by default damaged files (where chunks are missing)
+          will return EIO (I/O error) when trying to read the related parts of the file.
+          Set this option to replace the missing parts with all-zero bytes.
         - ``ignore_permissions``: for security reasons the ``default_permissions`` mount
           option is internally enforced by borg. ``ignore_permissions`` can be given to
           not enforce ``default_permissions``.

+ 4 - 10
src/borg/archiver/recreate_cmd.py

@@ -95,16 +95,10 @@ class RecreateMixIn:
         at least the entire deduplicated size of the archives using the previous
         chunker params.
 
 
-        If you recently ran borg check --repair and it had to fix lost chunks with all-zero
-        replacement chunks, please first run another backup for the same data and re-run
-        borg check --repair afterwards to heal any archives that had lost chunks which are
-        still generated from the input data.
-
-        Important: running borg recreate to re-chunk will remove the chunks_healthy
-        metadata of all items with replacement chunks, so healing will not be possible
-        any more after re-chunking (it is also unlikely it would ever work: due to the
-        change of chunking parameters, the missing chunk likely will never be seen again
-        even if you still have the data that produced it).
+        If your most recent borg check found missing chunks, please first run another
+        backup for the same data before doing any rechunking. If you are lucky, that
+        will re-create the missing chunks. Optionally, run another borg check to see
+        whether the chunks are still missing.
         """
         """
         )
         )
         subparser = subparsers.add_parser(
         subparser = subparsers.add_parser(

+ 1 - 3
src/borg/archiver/tar_cmds.py

@@ -113,9 +113,7 @@ class TarMixIn:
             """
             """
             Return a file-like object that reads from the chunks of *item*.
             Return a file-like object that reads from the chunks of *item*.
             """
             """
-            chunk_iterator = archive.pipeline.fetch_many(
-                [chunk_id for chunk_id, _ in item.chunks], is_preloaded=True, ro_type=ROBJ_FILE_STREAM
-            )
+            chunk_iterator = archive.pipeline.fetch_many(item.chunks, is_preloaded=True, ro_type=ROBJ_FILE_STREAM)
             if pi:
                 info = [remove_surrogates(item.path)]
                 return ChunkIteratorFileWrapper(

+ 56 - 41
src/borg/archiver/transfer_cmd.py

@@ -9,6 +9,8 @@ from ..helpers import Error
 from ..helpers import location_validator, Location, archivename_validator, comment_validator
 from ..helpers import location_validator, Location, archivename_validator, comment_validator
 from ..helpers import format_file_size, bin_to_hex
 from ..helpers import format_file_size, bin_to_hex
 from ..manifest import Manifest
 from ..manifest import Manifest
+from ..legacyrepository import LegacyRepository
+from ..repository import Repository
 
 
 from ..logger import create_logger
 from ..logger import create_logger
 
 
@@ -111,51 +113,64 @@ class TransferMixIn:
                         # so let's remove them from old archives also, considering there is no
                         # so let's remove them from old archives also, considering there is no
                         # code any more that deals with them in special ways (e.g. to get stats right).
                         # code any more that deals with them in special ways (e.g. to get stats right).
                         continue
                         continue
-                    if "chunks" in item:
+                    if "chunks_healthy" in item:  # legacy
+                        other_chunks = item.chunks_healthy  # chunks_healthy has the CORRECT chunks list, if present.
+                    elif "chunks" in item:
+                        other_chunks = item.chunks
+                    else:
+                        other_chunks = None
+                    if other_chunks is not None:
                         chunks = []
                         chunks = []
-                        for chunk_id, size in item.chunks:
+                        for chunk_id, size in other_chunks:
                             chunk_present = cache.seen_chunk(chunk_id, size)
                             chunk_present = cache.seen_chunk(chunk_id, size)
                             if not chunk_present:  # target repo does not yet have this chunk
                             if not chunk_present:  # target repo does not yet have this chunk
                                 if not dry_run:
                                 if not dry_run:
-                                    cdata = other_repository.get(chunk_id)
-                                    if args.recompress == "never":
-                                        # keep compressed payload same, verify via assert_id (that will
-                                        # decompress, but avoid needing to compress it again):
-                                        meta, data = other_manifest.repo_objs.parse(
-                                            chunk_id,
-                                            cdata,
-                                            decompress=True,
-                                            want_compressed=True,
-                                            ro_type=ROBJ_FILE_STREAM,
-                                        )
-                                        meta, data = upgrader.upgrade_compressed_chunk(meta, data)
-                                        chunk_entry = cache.add_chunk(
-                                            chunk_id,
-                                            meta,
-                                            data,
-                                            stats=archive.stats,
-                                            wait=False,
-                                            compress=False,
-                                            size=size,
-                                            ctype=meta["ctype"],
-                                            clevel=meta["clevel"],
-                                            ro_type=ROBJ_FILE_STREAM,
-                                        )
-                                    elif args.recompress == "always":
-                                        # always decompress and re-compress file data chunks
-                                        meta, data = other_manifest.repo_objs.parse(
-                                            chunk_id, cdata, ro_type=ROBJ_FILE_STREAM
-                                        )
-                                        chunk_entry = cache.add_chunk(
-                                            chunk_id,
-                                            meta,
-                                            data,
-                                            stats=archive.stats,
-                                            wait=False,
-                                            ro_type=ROBJ_FILE_STREAM,
-                                        )
+                                    try:
+                                        cdata = other_repository.get(chunk_id)
+                                    except (Repository.ObjectNotFound, LegacyRepository.ObjectNotFound):
+                                        # missing correct chunk in other_repository (source) will result in
+                                        # a missing chunk in repository (destination).
+                                        # we do NOT want to transfer all-zero replacement chunks from borg1 repos.
+                                        pass
                                     else:
                                     else:
-                                        raise ValueError(f"unsupported recompress mode: {args.recompress}")
+                                        if args.recompress == "never":
+                                            # keep compressed payload same, verify via assert_id (that will
+                                            # decompress, but avoid needing to compress it again):
+                                            meta, data = other_manifest.repo_objs.parse(
+                                                chunk_id,
+                                                cdata,
+                                                decompress=True,
+                                                want_compressed=True,
+                                                ro_type=ROBJ_FILE_STREAM,
+                                            )
+                                            meta, data = upgrader.upgrade_compressed_chunk(meta, data)
+                                            chunk_entry = cache.add_chunk(
+                                                chunk_id,
+                                                meta,
+                                                data,
+                                                stats=archive.stats,
+                                                wait=False,
+                                                compress=False,
+                                                size=size,
+                                                ctype=meta["ctype"],
+                                                clevel=meta["clevel"],
+                                                ro_type=ROBJ_FILE_STREAM,
+                                            )
+                                        elif args.recompress == "always":
+                                            # always decompress and re-compress file data chunks
+                                            meta, data = other_manifest.repo_objs.parse(
+                                                chunk_id, cdata, ro_type=ROBJ_FILE_STREAM
+                                            )
+                                            chunk_entry = cache.add_chunk(
+                                                chunk_id,
+                                                meta,
+                                                data,
+                                                stats=archive.stats,
+                                                wait=False,
+                                                ro_type=ROBJ_FILE_STREAM,
+                                            )
+                                        else:
+                                            raise ValueError(f"unsupported recompress mode: {args.recompress}")
                                     cache.repository.async_response(wait=False)
                                     cache.repository.async_response(wait=False)
                                     chunks.append(chunk_entry)
                                     chunks.append(chunk_entry)
                                 transfer_size += size
                                 transfer_size += size
@@ -165,7 +180,7 @@ class TransferMixIn:
                                     chunks.append(chunk_entry)
                                     chunks.append(chunk_entry)
                                 present_size += size
                                 present_size += size
                         if not dry_run:
                         if not dry_run:
-                            item.chunks = chunks  # TODO: overwrite? IDs and sizes are same.
+                            item.chunks = chunks
                             archive.stats.nfiles += 1
                             archive.stats.nfiles += 1
                     if not dry_run:
                     if not dry_run:
                         item = upgrader.upgrade_item(item=item)
                         item = upgrader.upgrade_item(item=item)

+ 12 - 13
src/borg/fuse.py

@@ -10,7 +10,7 @@ import time
 from collections import defaultdict, Counter
 from signal import SIGINT
 
 
-from .constants import ROBJ_FILE_STREAM
+from .constants import ROBJ_FILE_STREAM, zeros
 from .fuse_impl import llfuse, has_pyfuse3
 
 
 
 
@@ -46,6 +46,7 @@ from .helpers.lrucache import LRUCache
 from .item import Item
 from .platform import uid2user, gid2group
 from .platformflags import is_darwin
+from .repository import Repository
 from .remote import RemoteRepository
 
 
 
 
@@ -652,17 +653,6 @@ class FuseOperations(llfuse.Operations, FuseBackend):
 
 
     @async_wrapper
     def open(self, inode, flags, ctx=None):
-        if not self.allow_damaged_files:
-            item = self.get_item(inode)
-            if "chunks_healthy" in item:
-                # Processed archive items don't carry the path anymore; for converting the inode
-                # to the path we'd either have to store the inverse of the current structure,
-                # or search the entire archive. So we just don't print it. It's easy to correlate anyway.
-                logger.warning(
-                    "File has damaged (all-zero) chunks. Try running borg check --repair. "
-                    "Mount with allow_damaged_files to read damaged files."
-                )
-                raise llfuse.FUSEError(errno.EIO)
         return llfuse.FileInfo(fh=inode) if has_pyfuse3 else inode
 
 
     @async_wrapper
@@ -699,7 +689,16 @@ class FuseOperations(llfuse.Operations, FuseBackend):
                     # evict fully read chunk from cache
                     del self.data_cache[id]
             else:
-                _, data = self.repo_objs.parse(id, self.repository_uncached.get(id), ro_type=ROBJ_FILE_STREAM)
+                try:
+                    cdata = self.repository_uncached.get(id)
+                except Repository.ObjectNotFound:
+                    if self.allow_damaged_files:
+                        data = zeros[:s]
+                        assert len(data) == s
+                    else:
+                        raise llfuse.FUSEError(errno.EIO) from None
+                else:
+                    _, data = self.repo_objs.parse(id, cdata, ro_type=ROBJ_FILE_STREAM)
                 if offset + n < len(data):
                     # chunk was only partially read, cache it
                     self.data_cache[id] = data
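
With the ``open()``-time check removed, a missing chunk is now handled per ``read()``: when mounted with ``allow_damaged_files`` the missing range reads back as zero bytes, otherwise the read fails with EIO. A standalone sketch of that decision, simplified and detached from the FUSE class; the in-memory store, buffer, and exception handling below are stand-ins, not borg's own objects::

    import errno

    ZERO_BUFFER = bytes(16 * 1024 * 1024)  # stand-in for borg's module-level `zeros` buffer

    def read_chunk(store, chunk_id, size, allow_damaged_files):
        """Return chunk data; zero-fill a missing chunk if allowed, else signal an I/O error."""
        try:
            # stand-in for repository_uncached.get(chunk_id) + repo_objs.parse(); in borg the
            # missing case raises Repository.ObjectNotFound instead of KeyError.
            return store[chunk_id]
        except KeyError:
            if allow_damaged_files:
                return ZERO_BUFFER[:size]  # same length as the missing chunk, all zero bytes
            raise OSError(errno.EIO, "missing chunk")  # the FUSE layer raises FUSEError(errno.EIO)

    # usage with an in-memory stand-in store:
    print(read_chunk({b"id1": b"data"}, b"missing", 4, allow_damaged_files=True))  # b"\x00\x00\x00\x00"
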

+ 3 - 3
src/borg/helpers/fs.py

@@ -308,8 +308,8 @@ class HardLinkManager:
        If we encounter the same hlid again later, we hardlink to the path of the already extracted content of same hlid.

     C) When transferring from a borg1 archive, we need:
-       path -> chunks, chunks_healthy  # for borg1_hl_targets
-       If we encounter a regular file item with source == path later, we reuse chunks and chunks_healthy
+       path -> chunks_correct  # for borg1_hl_targets, chunks_correct must be either from .chunks_healthy or .chunks.
+       If we encounter a regular file item with source == path later, we reuse chunks_correct
        and create the same hlid = hardlink_id_from_path(source).

     D) When importing a tar file (simplified 1-pass way for now, not creating borg hardlink items):
@@ -353,7 +353,7 @@ class HardLinkManager:
                    a hlid (new borg style) [bytes]
                    a (dev, inode) tuple (filesystem)
         :param info: information to remember, could be:
-                     chunks / chunks_healthy list
+                     chunks list
                      hlid
         """
         assert isinstance(id, self.id_type), f"id is {id!r}, not of type {self.id_type}"

+ 1 - 1
src/borg/helpers/misc.py

@@ -124,7 +124,7 @@ class ChunkIteratorFileWrapper:
 
 
 def open_item(archive, item):
     """Return file-like object for archived item (with chunks)."""
-    chunk_iterator = archive.pipeline.fetch_many([c.id for c in item.chunks], ro_type=ROBJ_FILE_STREAM)
+    chunk_iterator = archive.pipeline.fetch_many(item.chunks, ro_type=ROBJ_FILE_STREAM)
     return ChunkIteratorFileWrapper(chunk_iterator)
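
``open_item()`` now hands ``item.chunks`` straight to ``fetch_many()``; the returned ``ChunkIteratorFileWrapper`` behaves like a read-only file. A small usage sketch, assuming the wrapper's ``read(nbytes)`` interface and that ``archive`` and ``item`` come from an opened borg Archive; hashing is just an example consumer::

    import hashlib

    from borg.helpers.misc import open_item

    def sha256_of_item(archive, item):
        """Stream one archived file through the file-like wrapper and hash it."""
        h = hashlib.sha256()
        f = open_item(archive, item)
        while True:
            block = f.read(64 * 1024)  # read the item's chunks incrementally, not all at once
            if not block:
                break
            h.update(block)
        return h.hexdigest()
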
 
 
 
 

+ 1 - 7
src/borg/helpers/parseformat.py

@@ -827,7 +827,6 @@ class ItemFormatter(BaseFormatter):
         "isoctime": "file change time (ISO 8601 format)",
         "isoctime": "file change time (ISO 8601 format)",
         "isoatime": "file access time (ISO 8601 format)",
         "isoatime": "file access time (ISO 8601 format)",
         "xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
         "xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
-        "health": 'either "healthy" (file ok) or "broken" (if file has all-zero replacement chunks)',
         "archiveid": "internal ID of the archive",
         "archiveid": "internal ID of the archive",
         "archivename": "name of the archive",
         "archivename": "name of the archive",
     }
     }
@@ -837,7 +836,6 @@ class ItemFormatter(BaseFormatter):
         ("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"),
         ("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"),
         tuple(sorted(hash_algorithms)),
         tuple(sorted(hash_algorithms)),
         ("archiveid", "archivename", "extra"),
         ("archiveid", "archivename", "extra"),
-        ("health",),
     )
     )
 
 
     KEYS_REQUIRING_CACHE = ()
     KEYS_REQUIRING_CACHE = ()
@@ -894,10 +892,6 @@ class ItemFormatter(BaseFormatter):
         item_data.update(text_to_json("user", item.get("user", str(item_data["uid"]))))
         item_data.update(text_to_json("user", item.get("user", str(item_data["uid"]))))
         item_data.update(text_to_json("group", item.get("group", str(item_data["gid"]))))
         item_data.update(text_to_json("group", item.get("group", str(item_data["gid"]))))
 
 
-        if jsonline:
-            item_data["healthy"] = "chunks_healthy" not in item
-        else:
-            item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
         item_data["flags"] = item.get("bsdflags")  # int if flags known, else (if flags unknown) None
         item_data["flags"] = item.get("bsdflags")  # int if flags known, else (if flags unknown) None
         for key in self.used_call_keys:
         for key in self.used_call_keys:
             item_data[key] = self.call_keys[key](item)
             item_data[key] = self.call_keys[key](item)
@@ -917,7 +911,7 @@ class ItemFormatter(BaseFormatter):
             hash = self.xxh64()
             hash = self.xxh64()
         elif hash_function in self.hash_algorithms:
         elif hash_function in self.hash_algorithms:
             hash = hashlib.new(hash_function)
             hash = hashlib.new(hash_function)
-        for data in self.archive.pipeline.fetch_many([c.id for c in item.chunks], ro_type=ROBJ_FILE_STREAM):
+        for data in self.archive.pipeline.fetch_many(item.chunks, ro_type=ROBJ_FILE_STREAM):
             hash.update(data)
             hash.update(data)
         return hash.hexdigest()
         return hash.hexdigest()
 
 

+ 7 - 4
src/borg/legacyrepository.py

@@ -1202,18 +1202,21 @@ class LegacyRepository:
             self.index = self.open_index(self.get_transaction_id())
         return [id_ for id_, _ in islice(self.index.iteritems(marker=marker), limit)]
 
 
-    def get(self, id, read_data=True):
+    def get(self, id, read_data=True, raise_missing=True):
         if not self.index:
             self.index = self.open_index(self.get_transaction_id())
         try:
             in_index = NSIndex1Entry(*(self.index[id][:2]))  # legacy: index entries have no size element
             return self.io.read(in_index.segment, in_index.offset, id, read_data=read_data)
         except KeyError:
-            raise self.ObjectNotFound(id, self.path) from None
+            if raise_missing:
+                raise self.ObjectNotFound(id, self.path) from None
+            else:
+                return None
 
 
-    def get_many(self, ids, read_data=True, is_preloaded=False):
+    def get_many(self, ids, read_data=True, is_preloaded=False, raise_missing=True):
         for id_ in ids:
-            yield self.get(id_, read_data=read_data)
+            yield self.get(id_, read_data=read_data, raise_missing=raise_missing)
 
 
     def put(self, id, data, wait=True):
         """put a repo object

+ 20 - 12
src/borg/remote.py

@@ -943,7 +943,9 @@ class RemoteRepository:
                             self.to_send.push_back(msgpack.packb({MSGID: self.msgid, MSG: cmd, ARGS: args}))
                             self.to_send.push_back(msgpack.packb({MSGID: self.msgid, MSG: cmd, ARGS: args}))
                     if not self.to_send and self.preload_ids:
                     if not self.to_send and self.preload_ids:
                         chunk_id = self.preload_ids.pop(0)
                         chunk_id = self.preload_ids.pop(0)
-                        args = {"id": chunk_id}
+                        # for preloading chunks, the raise_missing behaviour is defined HERE,
+                        # not in the get_many / fetch_many call that later fetches the preloaded chunks.
+                        args = {"id": chunk_id, "raise_missing": False}
                         self.msgid += 1
                         self.msgid += 1
                         self.chunkid_to_msgids.setdefault(chunk_id, []).append(self.msgid)
                         self.chunkid_to_msgids.setdefault(chunk_id, []).append(self.msgid)
                         self.to_send.push_back(msgpack.packb({MSGID: self.msgid, MSG: "get", ARGS: args}))
                         self.to_send.push_back(msgpack.packb({MSGID: self.msgid, MSG: "get", ARGS: args}))
@@ -991,12 +993,16 @@ class RemoteRepository:
     def list(self, limit=None, marker=None):
         """actual remoting is done via self.call in the @api decorator"""
 
 
-    def get(self, id, read_data=True):
-        for resp in self.get_many([id], read_data=read_data):
+    def get(self, id, read_data=True, raise_missing=True):
+        for resp in self.get_many([id], read_data=read_data, raise_missing=raise_missing):
             return resp
 
 
-    def get_many(self, ids, read_data=True, is_preloaded=False):
-        yield from self.call_many("get", [{"id": id, "read_data": read_data} for id in ids], is_preloaded=is_preloaded)
+    def get_many(self, ids, read_data=True, is_preloaded=False, raise_missing=True):
+        yield from self.call_many(
+            "get",
+            [{"id": id, "read_data": read_data, "raise_missing": raise_missing} for id in ids],
+            is_preloaded=is_preloaded,
+        )
 
 
     @api(since=parse_version("1.0.0"))
     def put(self, id, data, wait=True):
@@ -1098,11 +1104,11 @@ class RepositoryNoCache:
     def __exit__(self, exc_type, exc_val, exc_tb):
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
         self.close()
 
 
-    def get(self, key, read_data=True):
-        return next(self.get_many([key], read_data=read_data, cache=False))
+    def get(self, key, read_data=True, raise_missing=True):
+        return next(self.get_many([key], read_data=read_data, raise_missing=raise_missing, cache=False))
 
 
-    def get_many(self, keys, read_data=True, cache=True):
-        for key, data in zip(keys, self.repository.get_many(keys, read_data=read_data)):
+    def get_many(self, keys, read_data=True, raise_missing=True, cache=True):
+        for key, data in zip(keys, self.repository.get_many(keys, read_data=read_data, raise_missing=raise_missing)):
             yield self.transform(key, data)
             yield self.transform(key, data)
 
 
     def log_instrumentation(self):
     def log_instrumentation(self):
@@ -1207,10 +1213,12 @@ class RepositoryCache(RepositoryNoCache):
         self.cache.clear()
         self.cache.clear()
         shutil.rmtree(self.basedir)
         shutil.rmtree(self.basedir)
 
 
-    def get_many(self, keys, read_data=True, cache=True):
+    def get_many(self, keys, read_data=True, raise_missing=True, cache=True):
         # It could use different cache keys depending on read_data and cache full vs. meta-only chunks.
         # It could use different cache keys depending on read_data and cache full vs. meta-only chunks.
         unknown_keys = [key for key in keys if self.prefixed_key(key, complete=read_data) not in self.cache]
         unknown_keys = [key for key in keys if self.prefixed_key(key, complete=read_data) not in self.cache]
-        repository_iterator = zip(unknown_keys, self.repository.get_many(unknown_keys, read_data=read_data))
+        repository_iterator = zip(
+            unknown_keys, self.repository.get_many(unknown_keys, read_data=read_data, raise_missing=raise_missing)
+        )
         for key in keys:
         for key in keys:
             pkey = self.prefixed_key(key, complete=read_data)
             pkey = self.prefixed_key(key, complete=read_data)
             if pkey in self.cache:
             if pkey in self.cache:
@@ -1228,7 +1236,7 @@ class RepositoryCache(RepositoryNoCache):
                 else:
                 else:
                     # slow path: eviction during this get_many removed this key from the cache
                     # slow path: eviction during this get_many removed this key from the cache
                     t0 = time.perf_counter()
                     t0 = time.perf_counter()
-                    data = self.repository.get(key, read_data=read_data)
+                    data = self.repository.get(key, read_data=read_data, raise_missing=raise_missing)
                     self.slow_lat += time.perf_counter() - t0
                     self.slow_lat += time.perf_counter() - t0
                     transformed = self.add_entry(key, data, cache, complete=read_data)
                     transformed = self.add_entry(key, data, cache, complete=read_data)
                     self.slow_misses += 1
                     self.slow_misses += 1

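The remote and cache layers above only forward the new ``raise_missing`` flag down to the backing repository. A rough, self-contained sketch of that forwarding pattern (stand-in classes, not borg's ``RemoteRepository``/``RepositoryCache``; with ``raise_missing=False`` a missing object simply comes back as ``None``)::

    class Backend:
        def __init__(self, objects):
            self.objects = objects  # dict: key -> bytes

        def get_many(self, keys, raise_missing=True):
            for key in keys:
                if key in self.objects:
                    yield self.objects[key]
                elif raise_missing:
                    raise KeyError(key)
                else:
                    yield None  # missing object, caller decides what to do

    class CachingWrapper:
        def __init__(self, backend):
            self.backend = backend
            self.cache = {}

        def get_many(self, keys, raise_missing=True):
            unknown = [key for key in keys if key not in self.cache]
            fetched = dict(zip(unknown, self.backend.get_many(unknown, raise_missing=raise_missing)))
            for key in keys:
                data = self.cache.get(key, fetched.get(key))
                if data is not None:
                    self.cache[key] = data  # cache only objects that actually exist
                yield data

    repo = CachingWrapper(Backend({b"k1": b"v1"}))
    print(list(repo.get_many([b"k1", b"k2"], raise_missing=False)))  # [b'v1', None]
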
+ 7 - 4
src/borg/repository.py

@@ -425,7 +425,7 @@ class Repository:
                     # note: do not collect the marker id
                     # note: do not collect the marker id
         return result
         return result
 
 
-    def get(self, id, read_data=True):
+    def get(self, id, read_data=True, raise_missing=True):
         self._lock_refresh()
         self._lock_refresh()
         id_hex = bin_to_hex(id)
         id_hex = bin_to_hex(id)
         key = "data/" + id_hex
         key = "data/" + id_hex
@@ -452,11 +452,14 @@ class Repository:
                     raise IntegrityError(f"Object too small [id {id_hex}]: expected {meta_size}, got {len(meta)} bytes")
                     raise IntegrityError(f"Object too small [id {id_hex}]: expected {meta_size}, got {len(meta)} bytes")
                 return hdr + meta
                 return hdr + meta
         except StoreObjectNotFound:
         except StoreObjectNotFound:
-            raise self.ObjectNotFound(id, str(self._location)) from None
+            if raise_missing:
+                raise self.ObjectNotFound(id, str(self._location)) from None
+            else:
+                return None
 
 
-    def get_many(self, ids, read_data=True, is_preloaded=False):
+    def get_many(self, ids, read_data=True, is_preloaded=False, raise_missing=True):
         for id_ in ids:
         for id_ in ids:
-            yield self.get(id_, read_data=read_data)
+            yield self.get(id_, read_data=read_data, raise_missing=raise_missing)
 
 
     def put(self, id, data, wait=True):
     def put(self, id, data, wait=True):
         """put a repo object
         """put a repo object

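At the store level this means ``Repository.get()`` either raises ``ObjectNotFound`` or returns ``None``, depending on ``raise_missing``. A minimal dict-backed model of just that decision (everything except the parameter name is invented for the example)::

    class ObjectNotFound(Exception):
        pass

    class MiniRepo:
        def __init__(self, store):
            self.store = store  # dict: id (bytes) -> object data (bytes)

        def get(self, id, raise_missing=True):
            try:
                return self.store[id]
            except KeyError:
                if raise_missing:
                    raise ObjectNotFound(id.hex()) from None
                return None

        def get_many(self, ids, raise_missing=True):
            for id in ids:
                yield self.get(id, raise_missing=raise_missing)

    repo = MiniRepo({b"\x01": b"chunk data"})
    print(list(repo.get_many([b"\x01", b"\x02"], raise_missing=False)))  # [b'chunk data', None]
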
+ 17 - 46
src/borg/testsuite/archiver/check_cmd_test.py

@@ -155,28 +155,19 @@ def test_missing_file_chunk(archivers, request):
         else:
         else:
             pytest.fail("should not happen")  # convert 'fail'
             pytest.fail("should not happen")  # convert 'fail'
 
 
-    cmd(archiver, "check", exit_code=1)
+    output = cmd(archiver, "check", exit_code=1)
+    assert "Missing file chunk detected" in output
     output = cmd(archiver, "check", "--repair", exit_code=0)
     output = cmd(archiver, "check", "--repair", exit_code=0)
-    assert "New missing file chunk detected" in output
-
-    cmd(archiver, "check", exit_code=0)
-    output = cmd(archiver, "list", "archive1", "--format={health}#{path}{NL}", exit_code=0)
-    assert "broken#" in output
+    assert "Missing file chunk detected" in output  # repair is not changing anything, just reporting.
 
 
-    # check that the file in the old archives has now a different chunk list without the killed chunk.
-    # also check that the correct original chunks list is preserved in item.chunks_healthy.
+    # check does not modify the chunks list.
     for archive_name in ("archive1", "archive2"):
     for archive_name in ("archive1", "archive2"):
         archive, repository = open_archive(archiver.repository_path, archive_name)
         archive, repository = open_archive(archiver.repository_path, archive_name)
         with repository:
         with repository:
             for item in archive.iter_items():
             for item in archive.iter_items():
                 if item.path.endswith(src_file):
                 if item.path.endswith(src_file):
                     assert len(valid_chunks) == len(item.chunks)
                     assert len(valid_chunks) == len(item.chunks)
-                    assert killed_chunk not in item.chunks
-                    assert valid_chunks != item.chunks
-                    assert "chunks_healthy" in item
-                    assert len(valid_chunks) == len(item.chunks_healthy)
-                    assert killed_chunk in item.chunks_healthy
-                    assert valid_chunks == item.chunks_healthy
+                    assert valid_chunks == item.chunks
                     break
                     break
             else:
             else:
                 pytest.fail("should not happen")  # convert 'fail'
                 pytest.fail("should not happen")  # convert 'fail'
@@ -185,32 +176,9 @@ def test_missing_file_chunk(archivers, request):
     with patch.object(ChunkBuffer, "BUFFER_SIZE", 10):
     with patch.object(ChunkBuffer, "BUFFER_SIZE", 10):
         create_src_archive(archiver, "archive3")
         create_src_archive(archiver, "archive3")
 
 
-    # check should be able to heal the file now:
+    # check should not complain anymore about missing chunks:
     output = cmd(archiver, "check", "-v", "--repair", exit_code=0)
     output = cmd(archiver, "check", "-v", "--repair", exit_code=0)
-    assert "Healed previously missing file chunk" in output
-    assert f"{src_file}: Completely healed previously damaged file!" in output
-
-    # check that the file in the old archives has the correct chunks again.
-    # also check that chunks_healthy list is removed as it is not needed any more.
-    for archive_name in ("archive1", "archive2"):
-        archive, repository = open_archive(archiver.repository_path, archive_name)
-        with repository:
-            for item in archive.iter_items():
-                if item.path.endswith(src_file):
-                    assert valid_chunks == item.chunks
-                    assert "chunks_healthy" not in item
-                    break
-            else:
-                pytest.fail("should not happen")
-
-    # list is also all-healthy again
-    output = cmd(archiver, "list", "archive1", "--format={health}#{path}{NL}", exit_code=0)
-    assert "broken#" not in output
-
-    # check should be fine now (and not show it has healed anything).
-    output = cmd(archiver, "check", "-v", "--repair", exit_code=0)
-    assert "Healed previously missing file chunk" not in output
-    assert "testsuite/archiver.py: Completely healed previously damaged file!" not in output
+    assert "Missing file chunk detected" not in output
 
 
 
 
 def test_missing_archive_item_chunk(archivers, request):
 def test_missing_archive_item_chunk(archivers, request):
@@ -425,13 +393,14 @@ def test_verify_data(archivers, request, init_args):
         output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
         output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
         assert f"{bin_to_hex(chunk.id)}, integrity error" in output
         assert f"{bin_to_hex(chunk.id)}, integrity error" in output
 
 
-        # repair (heal is tested in another test)
+        # repair will find the defect chunk and remove it
         output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0)
         output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0)
         assert f"{bin_to_hex(chunk.id)}, integrity error" in output
         assert f"{bin_to_hex(chunk.id)}, integrity error" in output
-        assert f"{src_file}: New missing file chunk detected" in output
+        assert f"{src_file}: Missing file chunk detected" in output
 
 
-        # run with --verify-data again, all fine now (file was patched with a replacement chunk).
-        cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=0)
+        # run with --verify-data again, it will notice the missing chunk.
+        output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
+        assert f"{src_file}: Missing file chunk detected" in output
 
 
 
 
 @pytest.mark.parametrize("init_args", [["--encryption=repokey-aes-ocb"], ["--encryption", "none"]])
 @pytest.mark.parametrize("init_args", [["--encryption=repokey-aes-ocb"], ["--encryption", "none"]])
@@ -457,13 +426,15 @@ def test_corrupted_file_chunk(archivers, request, init_args):
     output = cmd(archiver, "check", "--repository-only", exit_code=1)
     output = cmd(archiver, "check", "--repository-only", exit_code=1)
     assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
     assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
 
 
-    # repair (heal is tested in another test)
+    # repair: the defect chunk will be removed by repair.
     output = cmd(archiver, "check", "--repair", exit_code=0)
     output = cmd(archiver, "check", "--repair", exit_code=0)
     assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
     assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
-    assert f"{src_file}: New missing file chunk detected" in output
+    assert f"{src_file}: Missing file chunk detected" in output
 
 
-    # run normal check again, all fine now (file was patched with a replacement chunk).
+    # run normal check again
     cmd(archiver, "check", "--repository-only", exit_code=0)
     cmd(archiver, "check", "--repository-only", exit_code=0)
+    output = cmd(archiver, "check", "--archives-only", exit_code=1)
+    assert f"{src_file}: Missing file chunk detected" in output
 
 
 
 
 def test_empty_repository(archivers, request):
 def test_empty_repository(archivers, request):

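The updated assertions above reflect that check/repair no longer rewrites an item's chunk list or keeps a ``chunks_healthy`` copy; missing chunks are only detected and reported. A toy illustration of that detection step (the helper and the data are invented for the example)::

    def find_missing_chunks(item_chunks, repo_ids):
        """Return ids referenced by an item's chunk list but absent from the repo."""
        return [chunk_id for chunk_id, size in item_chunks if chunk_id not in repo_ids]

    item_chunks = [(b"\x01", 100), (b"\x02", 200)]   # (id, size) pairs, made up
    repo_ids = {b"\x01"}                             # ids actually present
    missing = find_missing_chunks(item_chunks, repo_ids)
    if missing:
        print(f"Missing file chunk detected ({len(missing)} chunk(s))")
    # note: the item's chunk list itself is left untouched, matching the tests above
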
+ 23 - 1
src/borg/testsuite/archiver/extract_cmd_test.py

@@ -9,7 +9,7 @@ import pytest
 from ... import xattr
 from ... import xattr
 from ...chunker import has_seek_hole
 from ...chunker import has_seek_hole
 from ...constants import *  # NOQA
 from ...constants import *  # NOQA
-from ...helpers import EXIT_WARNING, BackupPermissionError
+from ...helpers import EXIT_WARNING, BackupPermissionError, bin_to_hex
 from ...helpers import flags_noatime, flags_normal
 from ...helpers import flags_noatime, flags_normal
 from .. import changedir, same_ts_ns
 from .. import changedir, same_ts_ns
 from .. import are_symlinks_supported, are_hardlinks_supported, is_utime_fully_supported, is_birthtime_fully_supported
 from .. import are_symlinks_supported, are_hardlinks_supported, is_utime_fully_supported, is_birthtime_fully_supported
@@ -24,6 +24,9 @@ from . import (
     _extract_hardlinks_setup,
     _extract_hardlinks_setup,
     assert_creates_file,
     assert_creates_file,
     generate_archiver_tests,
     generate_archiver_tests,
+    create_src_archive,
+    open_archive,
+    src_file,
 )
 )
 
 
 pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary")  # NOQA
 pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary")  # NOQA
@@ -737,3 +740,22 @@ def test_dry_run_extraction_flags(archivers, request):
         print(output)
         print(output)
 
 
     assert not os.listdir("output"), "Output directory should be empty after dry-run"
     assert not os.listdir("output"), "Output directory should be empty after dry-run"
+
+
+def test_extract_file_with_missing_chunk(archivers, request):
+    archiver = request.getfixturevalue(archivers)
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+    create_src_archive(archiver, "archive")
+    # Get rid of a chunk
+    archive, repository = open_archive(archiver.repository_path, "archive")
+    with repository:
+        for item in archive.iter_items():
+            if item.path.endswith(src_file):
+                chunk = item.chunks[-1]
+                repository.delete(chunk.id)
+                break
+        else:
+            assert False  # missed the file
+    output = cmd(archiver, "extract", "archive")
+    # TODO: this is a bit dirty still: no warning/error rc, no filename output for the damaged file.
+    assert f"repository object {bin_to_hex(chunk.id)} missing, returning {chunk.size} zero bytes." in output

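The new extract test relies on the replacement-chunk behaviour: when a referenced chunk cannot be fetched, the extractor substitutes the recorded number of zero bytes and prints a message. A simplified, stand-alone sketch of that idea (not borg's actual extraction code)::

    def read_file_data(chunks, fetch):
        """chunks: list of (id, size); fetch(id) returns bytes or None when missing."""
        parts = []
        for chunk_id, size in chunks:
            data = fetch(chunk_id)
            if data is None:
                print(f"repository object {chunk_id.hex()} missing, returning {size} zero bytes.")
                data = b"\0" * size
            parts.append(data)
        return b"".join(parts)

    store = {b"\x01": b"hello"}
    print(read_file_data([(b"\x01", 5), (b"\x02", 3)], store.get))  # b'hello\x00\x00\x00'
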
+ 9 - 5
src/borg/testsuite/archiver/mount_cmds_test.py

@@ -233,15 +233,19 @@ def test_fuse_allow_damaged_files(archivers, request):
                 break
                 break
         else:
         else:
             assert False  # missed the file
             assert False  # missed the file
-    cmd(archiver, "check", "--repair", exit_code=0)
 
 
     mountpoint = os.path.join(archiver.tmpdir, "mountpoint")
     mountpoint = os.path.join(archiver.tmpdir, "mountpoint")
     with fuse_mount(archiver, mountpoint, "-a", "archive"):
     with fuse_mount(archiver, mountpoint, "-a", "archive"):
-        with pytest.raises(OSError) as excinfo:
-            open(os.path.join(mountpoint, "archive", path))
-        assert excinfo.value.errno == errno.EIO
+        with open(os.path.join(mountpoint, "archive", path), "rb") as f:
+            with pytest.raises(OSError) as excinfo:
+                f.read()
+            assert excinfo.value.errno == errno.EIO
+
     with fuse_mount(archiver, mountpoint, "-a", "archive", "-o", "allow_damaged_files"):
     with fuse_mount(archiver, mountpoint, "-a", "archive", "-o", "allow_damaged_files"):
-        open(os.path.join(mountpoint, "archive", path)).close()
+        with open(os.path.join(mountpoint, "archive", path), "rb") as f:
+            # no exception raised, missing data will be all-zero
+            data = f.read()
+        assert data.endswith(b"\0\0")
 
 
 
 
 @pytest.mark.skipif(not llfuse, reason="llfuse not installed")
 @pytest.mark.skipif(not llfuse, reason="llfuse not installed")

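The FUSE test now expects ``open()`` to succeed and the failure to surface at ``read()`` time with ``EIO``, unless ``allow_damaged_files`` is set, in which case the missing data reads back as zero bytes. A small model of that decision (hypothetical helper, not the real FUSE layer)::

    import errno

    def read_damaged(chunk_data, size, allow_damaged_files):
        """chunk_data is None when the chunk is missing from the repository."""
        if chunk_data is not None:
            return chunk_data
        if not allow_damaged_files:
            raise OSError(errno.EIO, "damaged file: missing chunk")
        return b"\0" * size  # with allow_damaged_files, missing data reads as zeros

    print(read_damaged(b"abc", 3, False))  # b'abc'
    print(read_damaged(None, 4, True))     # b'\x00\x00\x00\x00'
    try:
        read_damaged(None, 4, False)
    except OSError as e:
        print(e.errno == errno.EIO)        # True
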
+ 3 - 1
src/borg/testsuite/archiver/transfer_cmd_test.py

@@ -164,7 +164,6 @@ def test_transfer_upgrade(archivers, request):
                     # fix expectation for size
                     # fix expectation for size
                     e["size"] = g["size"]
                     e["size"] = g["size"]
                 # Note: size == 0 for all items without a size or chunks list (like e.g. directories)
                 # Note: size == 0 for all items without a size or chunks list (like e.g. directories)
-                # Note: healthy == True indicates the *absence* of the additional chunks_healthy list
             del g["hlid"]
             del g["hlid"]
 
 
             # borg 1 used "linktarget" and "source" for links, borg 2 uses "target" for symlinks.
             # borg 1 used "linktarget" and "source" for links, borg 2 uses "target" for symlinks.
@@ -177,6 +176,9 @@ def test_transfer_upgrade(archivers, request):
                 # The S_IFBLK macro is broken on MINGW
                 # The S_IFBLK macro is broken on MINGW
                 del e["type"], g["type"]
                 del e["type"], g["type"]
                 del e["mode"], g["mode"]
                 del e["mode"], g["mode"]
+
+            del e["healthy"]  # not supported anymore
+
             assert g == e
             assert g == e
 
 
         if name == "archive1":
         if name == "archive1":

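The transfer test now simply drops the legacy ``healthy`` key from the borg 1 expectation before comparing item metadata. Roughly (dict contents invented for the example)::

    expected = {"path": "some/file", "size": 42, "healthy": True}  # borg 1 style, made up
    got = {"path": "some/file", "size": 42}                        # borg 2 style, made up
    expected.pop("healthy", None)  # "healthy" is not supported anymore
    assert got == expected
    print("item metadata matches after dropping the legacy key")
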
+ 3 - 6
src/borg/upgrade.py

@@ -48,7 +48,7 @@ class UpgraderFrom12To20:
 
 
     def new_archive(self, *, archive):
     def new_archive(self, *, archive):
         self.archive = archive
         self.archive = archive
-        self.hlm = HardLinkManager(id_type=bytes, info_type=tuple)  # hlid -> (chunks, chunks_healthy)
+        self.hlm = HardLinkManager(id_type=bytes, info_type=list)  # hlid -> chunks_correct
 
 
     def upgrade_item(self, *, item):
     def upgrade_item(self, *, item):
         """upgrade item as needed, get rid of legacy crap"""
         """upgrade item as needed, get rid of legacy crap"""
@@ -56,7 +56,6 @@ class UpgraderFrom12To20:
             "path",
             "path",
             "rdev",
             "rdev",
             "chunks",
             "chunks",
-            "chunks_healthy",
             "hlid",
             "hlid",
             "mode",
             "mode",
             "user",
             "user",
@@ -78,16 +77,14 @@ class UpgraderFrom12To20:
 
 
         if self.hlm.borg1_hardlink_master(item):
         if self.hlm.borg1_hardlink_master(item):
             item.hlid = hlid = self.hlm.hardlink_id_from_path(item.path)
             item.hlid = hlid = self.hlm.hardlink_id_from_path(item.path)
-            self.hlm.remember(id=hlid, info=(item.get("chunks"), item.get("chunks_healthy")))
+            self.hlm.remember(id=hlid, info=item.get("chunks"))
         elif self.hlm.borg1_hardlink_slave(item):
         elif self.hlm.borg1_hardlink_slave(item):
             item.hlid = hlid = self.hlm.hardlink_id_from_path(item.source)
             item.hlid = hlid = self.hlm.hardlink_id_from_path(item.source)
-            chunks, chunks_healthy = self.hlm.retrieve(id=hlid, default=(None, None))
+            chunks = self.hlm.retrieve(id=hlid)
             if chunks is not None:
             if chunks is not None:
                 item.chunks = chunks
                 item.chunks = chunks
                 for chunk_id, chunk_size in chunks:
                 for chunk_id, chunk_size in chunks:
                     self.cache.reuse_chunk(chunk_id, chunk_size, self.archive.stats)
                     self.cache.reuse_chunk(chunk_id, chunk_size, self.archive.stats)
-            if chunks_healthy is not None:
-                item.chunks_healthy = chunks
             del item.source  # not used for hardlinks any more, replaced by hlid
             del item.source  # not used for hardlinks any more, replaced by hlid
         # make sure we only have desired stuff in the new item. specifically, make sure to get rid of:
         # make sure we only have desired stuff in the new item. specifically, make sure to get rid of:
         # - 'acl' remnants of bug in attic <= 0.13
         # - 'acl' remnants of bug in attic <= 0.13
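
The upgrader now remembers only the hardlink master's chunk list and replays it for the slaves; the legacy ``chunks_healthy`` list is simply dropped. A dict-based sketch of that flow (a plain dict standing in for ``HardLinkManager``, items modelled as dicts)::

    hardlinks = {}  # hlid -> chunk list

    def upgrade_master(hlid, item):
        hardlinks[hlid] = item.get("chunks")
        item.pop("chunks_healthy", None)  # legacy key, no longer kept

    def upgrade_slave(hlid, item):
        chunks = hardlinks.get(hlid)
        if chunks is not None:
            item["chunks"] = chunks
        item.pop("source", None)  # replaced by hlid

    master = {"path": "a", "chunks": [(b"\x01", 10)], "chunks_healthy": [(b"\x01", 10)]}
    slave = {"path": "b", "source": "a"}
    upgrade_master(b"hl1", master)
    upgrade_slave(b"hl1", slave)
    print(slave["chunks"])  # [(b'\x01', 10)]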