
Merge pull request #1921 from enkore/f/recreate-fixes

recreate fixes
enkore · 8 years ago · commit e9d7f928e2
3 changed files with 26 additions and 35 deletions:
  1. src/borg/archive.py (+16, -19)
  2. src/borg/archiver.py (+9, -16)
  3. src/borg/testsuite/archiver.py (+1, -0)

src/borg/archive.py (+16, -19)

@@ -1394,10 +1394,6 @@ class ArchiveChecker:
 
 
 class ArchiveRecreater:
-    class FakeTargetArchive:
-        def __init__(self):
-            self.stats = Statistics()
-
     class Interrupted(Exception):
         def __init__(self, metadata=None):
             self.metadata = metadata or {}
@@ -1421,6 +1417,9 @@ class ArchiveRecreater:
         self.exclude_if_present = exclude_if_present or []
         self.keep_tag_files = keep_tag_files
 
+        self.rechunkify = chunker_params is not None
+        if self.rechunkify:
+            logger.debug('Rechunking archives to %s', chunker_params)
         self.chunker_params = chunker_params or CHUNKER_PARAMS
         self.recompress = bool(compression)
         self.always_recompress = always_recompress
@@ -1434,7 +1433,7 @@ class ArchiveRecreater:
         self.stats = stats
         self.progress = progress
         self.print_file_status = file_status_printer or (lambda *args: None)
-        self.checkpoint_interval = checkpoint_interval
+        self.checkpoint_interval = None if dry_run else checkpoint_interval
 
     def recreate(self, archive_name, comment=None, target_name=None):
         assert not self.is_temporary_archive(archive_name)
@@ -1444,10 +1443,10 @@ class ArchiveRecreater:
             self.matcher_add_tagged_dirs(archive)
         if self.matcher.empty() and not self.recompress and not target.recreate_rechunkify and comment is None:
             logger.info("Skipping archive %s, nothing to do", archive_name)
-            return True
+            return
         self.process_items(archive, target)
         replace_original = target_name is None
-        return self.save(archive, target, comment, replace_original=replace_original)
+        self.save(archive, target, comment, replace_original=replace_original)
 
     def process_items(self, archive, target):
         matcher = self.matcher
@@ -1494,12 +1493,11 @@ class ArchiveRecreater:
         self.print_file_status(file_status(item.mode), item.path)
 
     def process_chunks(self, archive, target, item):
-        """Return new chunk ID list for 'item'."""
         if not self.recompress and not target.recreate_rechunkify:
             for chunk_id, size, csize in item.chunks:
                 self.cache.chunk_incref(chunk_id, target.stats)
             return item.chunks
-        chunk_iterator = self.create_chunk_iterator(archive, target, list(item.chunks))
+        chunk_iterator = self.iter_chunks(archive, target, list(item.chunks))
         compress = self.compression_decider1.decide(item.path)
         chunk_processor = partial(self.chunk_processor, target, compress)
         target.chunk_file(item, self.cache, target.stats, chunk_iterator, chunk_processor)
@@ -1517,24 +1515,22 @@ class ArchiveRecreater:
             if Compressor.detect(old_chunk.data).name == compression_spec['name']:
                 # Stored chunk has the same compression we wanted
                 overwrite = False
-        chunk_id, size, csize = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
-        self.seen_chunks.add(chunk_id)
-        return chunk_id, size, csize
+        chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
+        self.seen_chunks.add(chunk_entry.id)
+        return chunk_entry
 
-    def create_chunk_iterator(self, archive, target, chunks):
-        """Return iterator of chunks to store for 'item' from 'archive' in 'target'."""
+    def iter_chunks(self, archive, target, chunks):
         chunk_iterator = archive.pipeline.fetch_many([chunk_id for chunk_id, _, _ in chunks])
         if target.recreate_rechunkify:
             # The target.chunker will read the file contents through ChunkIteratorFileWrapper chunk-by-chunk
             # (does not load the entire file into memory)
             file = ChunkIteratorFileWrapper(chunk_iterator)
-            return target.chunker.chunkify(file)
+            yield from target.chunker.chunkify(file)
         else:
             for chunk in chunk_iterator:
                 yield chunk.data
 
     def save(self, archive, target, comment=None, replace_original=True):
-        """Save target archive. If completed, replace source. If not, save temporary with additional 'metadata' dict."""
         if self.dry_run:
             return
         timestamp = archive.ts.replace(tzinfo=None)
@@ -1591,12 +1587,13 @@ class ArchiveRecreater:
 
     def create_target(self, archive, target_name=None):
         """Create target archive."""
-        if self.dry_run:
-            return self.FakeTargetArchive(), None
         target_name = target_name or archive.name + '.recreate'
         target = self.create_target_archive(target_name)
         # If the archives use the same chunker params, then don't rechunkify
-        target.recreate_rechunkify = tuple(archive.metadata.get('chunker_params', [])) != self.chunker_params
+        source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
+        target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
+        if target.recreate_rechunkify:
+            logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
         return target
 
     def create_target_archive(self, name):
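
A note on the iter_chunks change above: the function keeps a plain yield in its else branch, so Python treats it as a generator function, and the old return target.chunker.chunkify(file) would have ended the generator without producing a single chunk. Delegating with yield from forwards the rechunked data instead. The snippet below is a minimal standalone sketch of that pitfall, not borg code:

    def broken(rechunk, parts):
        if rechunk:
            return iter(parts)      # inside a generator this only stops iteration
        for p in parts:
            yield p

    def fixed(rechunk, parts):
        if rechunk:
            yield from iter(parts)  # delegate to the sub-iterator instead
        else:
            for p in parts:
                yield p

    assert list(broken(True, [1, 2, 3])) == []        # data silently dropped
    assert list(fixed(True, [1, 2, 3])) == [1, 2, 3]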

src/borg/archiver.py (+9, -16)

@@ -1101,11 +1101,11 @@ class Archiver:
                 if recreater.is_temporary_archive(name):
                     continue
                 print('Processing', name)
-                if not recreater.recreate(name, args.comment):
-                    break
-        manifest.write()
-        repository.commit()
-        cache.commit()
+                recreater.recreate(name, args.comment)
+        if not args.dry_run:
+            manifest.write()
+            repository.commit()
+            cache.commit()
         return self.exit_code
 
     @with_repository(manifest=False, exclusive=True)
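
Dry-run handling is now split between the two files: ArchiveRecreater sets checkpoint_interval to None (and save() already returns early on dry_run), while the command handler above skips the manifest, repository and cache commits, so the FakeTargetArchive stand-in is no longer needed. A rough, hypothetical sketch of why a None interval is enough to suppress checkpoints (not the actual borg code):

    import time

    def maybe_checkpoint(last_checkpoint, checkpoint_interval, write_checkpoint):
        # A None interval, as used for dry runs, never satisfies the check,
        # so no temporary checkpoint archives are ever written.
        if checkpoint_interval and time.time() - last_checkpoint > checkpoint_interval:
            write_checkpoint()
            return time.time()
        return last_checkpoint
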
@@ -2356,6 +2356,8 @@ class Archiver:
         recreate_epilog = textwrap.dedent("""
         Recreate the contents of existing archives.
 
+        This is an *experimental* feature. Do *not* use this on your only backup.
+
         --exclude, --exclude-from and PATH have the exact same semantics
         as in "borg create". If PATHs are specified the resulting archive
         will only contain files from these PATHs.
@@ -2372,15 +2374,6 @@ class Archiver:
         used to have upgraded Borg 0.xx or Attic archives deduplicate with
         Borg 1.x archives.
 
-        borg recreate is signal safe. Send either SIGINT (Ctrl-C on most terminals) or
-        SIGTERM to request termination.
-
-        Use the *exact same* command line to resume the operation later - changing excludes
-        or paths will lead to inconsistencies (changed excludes will only apply to newly
-        processed files/dirs). Changing compression leads to incorrect size information
-        (which does not cause any data loss, but can be misleading).
-        Changing chunker params between invocations might lead to data loss.
-
         USE WITH CAUTION.
         Depending on the PATHs and patterns given, recreate can be used to permanently
         delete files from archives.
@@ -2395,8 +2388,8 @@ class Archiver:
 
         When rechunking space usage can be substantial, expect at least the entire
         deduplicated size of the archives using the previous chunker params.
-        When recompressing approximately 1 % of the repository size or 512 MB
-        (whichever is greater) of additional space is used.
+        When recompressing expect approx. (throughput / checkpoint-interval) in space usage,
+        assuming all chunks are recompressed.
         """)
         subparser = subparsers.add_parser('recreate', parents=[common_parser], add_help=False,
                                           description=self.do_recreate.__doc__,

src/borg/testsuite/archiver.py (+1, -0)

@@ -1823,6 +1823,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         self.cmd('recreate', self.repository_location, '--chunker-params', 'default')
         self.check_cache()
         # test1 and test2 do deduplicate after recreate
+        assert int(self.cmd('list', self.repository_location + '::test1', 'input/large_file', '--format={size}'))
         assert not int(self.cmd('list', self.repository_location + '::test1', 'input/large_file',
                                 '--format', '{unique_chunks}'))
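
The added assertion guards against a vacuous pass: a file with no chunks at all would also report zero unique chunks, so the test first checks that {size} is non-zero and only then that {unique_chunks} is zero, i.e. every chunk of input/large_file is shared with another archive. Outside the test suite the same check could look roughly like this sketch (assumes the borg CLI and the {size}/{unique_chunks} format keys of "borg list"; repository path and names are illustrative):

    import subprocess

    def fully_deduplicated(repo, archive, path):
        def field(fmt):
            out = subprocess.check_output(
                ['borg', 'list', '{}::{}'.format(repo, archive), path, '--format', fmt])
            return int(out)
        # non-empty file, but no chunks referenced only by this archive
        return field('{size}') > 0 and field('{unique_chunks}') == 0

    print(fully_deduplicated('/path/to/repo', 'test1', 'input/large_file'))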