
Merge pull request #1300 from ThomasWaldmann/heal-items

heal items
enkore 9 years ago
commit 73e46358c3
3 changed files with 81 additions and 20 deletions
  1. borg/archive.py (+40 -15)
  2. borg/archiver.py (+6 -3)
  3. borg/testsuite/archiver.py (+35 -2)

borg/archive.py (+40 -15)

@@ -920,31 +920,56 @@ class ArchiveChecker:
                     self.repository.put(id_, cdata)
 
         def verify_file_chunks(item):
-            """Verifies that all file chunks are present
+            """Verifies that all file chunks are present.
 
-            Missing file chunks will be replaced with new chunks of the same
-            length containing all zeros.
+            Missing file chunks will be replaced with new chunks of the same length containing all zeros.
+            If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one.
             """
             offset = 0
             chunk_list = []
             chunks_replaced = False
-            for chunk_id, size, csize in item[b'chunks']:
+            has_chunks_healthy = b'chunks_healthy' in item
+            chunks_current = item[b'chunks']
+            chunks_healthy = item[b'chunks_healthy'] if has_chunks_healthy else chunks_current
+            assert len(chunks_current) == len(chunks_healthy)
+            for chunk_current, chunk_healthy in zip(chunks_current, chunks_healthy):
+                chunk_id, size, csize = chunk_healthy
                 if chunk_id not in self.chunks:
-                    # If a file chunk is missing, create an all empty replacement chunk
-                    logger.error('{}: Missing file chunk detected (Byte {}-{})'.format(item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size))
-                    self.error_found = chunks_replaced = True
-                    data = bytes(size)
-                    chunk_id = self.key.id_hash(data)
-                    cdata = self.key.encrypt(data)
-                    csize = len(cdata)
-                    add_reference(chunk_id, size, csize, cdata)
+                    # a chunk of the healthy list is missing
+                    if chunk_current == chunk_healthy:
+                        logger.error('{}: New missing file chunk detected (Byte {}-{}). '
+                                     'Replacing with all-zero chunk.'.format(
+                                     item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size))
+                        self.error_found = chunks_replaced = True
+                        data = bytes(size)
+                        chunk_id = self.key.id_hash(data)
+                        cdata = self.key.encrypt(data)
+                        csize = len(cdata)
+                        add_reference(chunk_id, size, csize, cdata)
+                    else:
+                        logger.info('{}: Previously missing file chunk is still missing (Byte {}-{}). '
+                                    'It has an all-zero replacement chunk already.'.format(
+                                    item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size))
+                        chunk_id, size, csize = chunk_current
+                        add_reference(chunk_id, size, csize)
                 else:
-                    add_reference(chunk_id, size, csize)
-                chunk_list.append((chunk_id, size, csize))
+                    if chunk_current == chunk_healthy:
+                        # normal case, all fine.
+                        add_reference(chunk_id, size, csize)
+                    else:
+                        logger.info('{}: Healed previously missing file chunk! (Byte {}-{}).'.format(
+                            item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size))
+                        add_reference(chunk_id, size, csize)
+                        mark_as_possibly_superseded(chunk_current[0])  # maybe orphaned the all-zero replacement chunk
+                chunk_list.append([chunk_id, size, csize])  # list-typed element as chunks_healthy is list-of-lists
                 offset += size
-            if chunks_replaced and b'chunks_healthy' not in item:
+            if chunks_replaced and not has_chunks_healthy:
                 # if this is first repair, remember the correct chunk IDs, so we can maybe heal the file later
                 item[b'chunks_healthy'] = item[b'chunks']
+            if has_chunks_healthy and chunk_list == chunks_healthy:
+                logger.info('{}: Completely healed previously damaged file!'.format(
+                            item[b'path'].decode('utf-8', 'surrogateescape')))
+                del item[b'chunks_healthy']
             item[b'chunks'] = chunk_list
 
         def robust_iterator(archive):
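
In short, verify_file_chunks() now walks item[b'chunks'] and item[b'chunks_healthy'] side by side: a chunk missing from the repository is replaced by a same-size all-zero chunk (and the original list is remembered as chunks_healthy on the first repair), a chunk that re-appeared later replaces its all-zero stand-in, and once the rebuilt list matches chunks_healthy again the entry is dropped. The following is a minimal, self-contained sketch of that idea only, not part of this commit; the plain-dict item, repo_has_chunk and make_zero_chunk are illustrative stand-ins for borg's real chunk tuples, repository lookups and add_reference() bookkeeping:

def repair_item(item, repo_has_chunk, make_zero_chunk):
    """Simplified heal logic; real borg also tracks sizes, csizes and reference counts."""
    has_healthy = b'chunks_healthy' in item
    current = item[b'chunks']
    healthy = item[b'chunks_healthy'] if has_healthy else current
    new_chunks, replaced = [], False
    for cur, good in zip(current, healthy):
        if repo_has_chunk(good):
            new_chunks.append(good)                  # present: keep (or restore) the correct chunk
        elif cur == good:
            replaced = True                          # newly missing: substitute an all-zero chunk
            new_chunks.append(make_zero_chunk(good))
        else:
            new_chunks.append(cur)                   # still missing: keep the existing stand-in
    if replaced and not has_healthy:
        item[b'chunks_healthy'] = item[b'chunks']    # first repair: remember the correct chunk list
    if has_healthy and new_chunks == item[b'chunks_healthy']:
        del item[b'chunks_healthy']                  # every chunk matches again: fully healed
    item[b'chunks'] = new_chunks

# toy walk-through: chunk 'B' is lost, then re-appears
item = {b'chunks': ['A', 'B', 'C']}
present = {'A', 'C'}
repair_item(item, present.__contains__, lambda c: 'ZERO-' + c)
assert item[b'chunks'] == ['A', 'ZERO-B', 'C'] and item[b'chunks_healthy'] == ['A', 'B', 'C']
present.add('B')
repair_item(item, present.__contains__, lambda c: 'ZERO-' + c)
assert item[b'chunks'] == ['A', 'B', 'C'] and b'chunks_healthy' not in item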

borg/archiver.py (+6 -3)

@@ -973,9 +973,12 @@ class Archiver:
         - Check if archive metadata chunk is present. If not, remove archive from
           manifest.
         - For all files (items) in the archive, for all chunks referenced by these
-          files, check if chunk is present (if not and we are in repair mode, replace
-          it with a same-size chunk of zeros). This requires reading of archive and
-          file metadata, but not data.
+          files, check if chunk is present.
+          If a chunk is not present and we are in repair mode, replace it with a same-size
+          replacement chunk of zeros.
+          If a previously lost chunk reappears (e.g. via a later backup) and we are in
+          repair mode, the all-zero replacement chunk will be replaced by the correct chunk.
+          This requires reading of archive and file metadata, but not data.
         - If we are in repair mode and we checked all the archives: delete orphaned
           chunks from the repo.
         - if you use a remote repo server via ssh:, the archive check is executed on
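
In user terms, the help text above describes the following repair cycle. A rough sketch, assuming a local repository and placeholder paths/archive names; note that 'borg check --repair' is an experimental operation and asks for confirmation before changing anything:

import subprocess

repo = '/path/to/repo'

# 1. first repair: missing file chunks become same-size all-zero chunks and the
#    original chunk list is remembered per item (chunks_healthy)
subprocess.run(['borg', 'check', '--repair', repo])

# 2. a later backup happens to contain the lost data again, so the chunk
#    re-enters the repository
subprocess.run(['borg', 'create', repo + '::new-backup', '/data'])

# 3. second repair: the re-appeared chunk replaces its all-zero stand-in and, once
#    the whole file matches chunks_healthy again, it is reported as healed
subprocess.run(['borg', 'check', '--repair', repo])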

borg/testsuite/archiver.py (+35 -2)

@@ -1142,12 +1142,45 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
         with repository:
             for item in archive.iter_items():
                 if item[b'path'].endswith('testsuite/archiver.py'):
-                    repository.delete(item[b'chunks'][-1][0])
+                    valid_chunks = item[b'chunks']
+                    killed_chunk = valid_chunks[-1]
+                    repository.delete(killed_chunk[0])
                     break
+            else:
+                self.assert_true(False)  # should not happen
             repository.commit()
         self.cmd('check', self.repository_location, exit_code=1)
-        self.cmd('check', '--repair', self.repository_location, exit_code=0)
+        output = self.cmd('check', '--repair', self.repository_location, exit_code=0)
+        self.assert_in('New missing file chunk detected', output)
         self.cmd('check', self.repository_location, exit_code=0)
+        # check that the file in the old archives now has a different chunk list without the killed chunk
+        for archive_name in ('archive1', 'archive2'):
+            archive, repository = self.open_archive(archive_name)
+            with repository:
+                for item in archive.iter_items():
+                    if item[b'path'].endswith('testsuite/archiver.py'):
+                        self.assert_not_equal(valid_chunks, item[b'chunks'])
+                        self.assert_not_in(killed_chunk, item[b'chunks'])
+                        break
+                else:
+                    self.assert_true(False)  # should not happen
+        # do a fresh backup (that will include the killed chunk)
+        with patch.object(ChunkBuffer, 'BUFFER_SIZE', 10):
+            self.create_src_archive('archive3')
+        # check should be able to heal the file now:
+        output = self.cmd('check', '-v', '--repair', self.repository_location, exit_code=0)
+        self.assert_in('Healed previously missing file chunk', output)
+        self.assert_in('testsuite/archiver.py: Completely healed previously damaged file!', output)
+        # check that the file in the old archives has the correct chunks again
+        for archive_name in ('archive1', 'archive2'):
+            archive, repository = self.open_archive(archive_name)
+            with repository:
+                for item in archive.iter_items():
+                    if item[b'path'].endswith('testsuite/archiver.py'):
+                        self.assert_equal(valid_chunks, item[b'chunks'])
+                        break
+                else:
+                    self.assert_true(False)  # should not happen
 
     def test_missing_archive_item_chunk(self):
         archive, repository = self.open_archive('archive1')