
Merge pull request #9003 from ThomasWaldmann/fix-8898

fix inconsistencies in original size computation, fixes #8898 (1.4-maint)
TW committed 2 days ago
parent commit 7a69499c0b
2 changed files with 69 additions and 10 deletions
  1. src/borg/archive.py  +19 -10
  2. src/borg/testsuite/archiver.py  +50 -0

+ 19 - 10
src/borg/archive.py

@@ -469,7 +469,10 @@ class Archive:
         self.pipeline = DownloadPipeline(self.repository, self.key)
         self.create = create
         if self.create:
-            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
+            # Use a separate statistics counter for metadata (items, archive metadata),
+            # so that archive.stats reflects only file content statistics.
+            self.meta_stats = Statistics(output_json=False, iec=iec)
+            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.meta_stats)
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
             i = 0
@@ -605,7 +608,8 @@ Utilization of max. archive size: {csize_max:.0%}
     def write_checkpoint(self):
         self.save(self.checkpoint_name)
         del self.manifest.archives[self.checkpoint_name]
-        self.cache.chunk_decref(self.id, self.stats)
+        # Use meta_stats so metadata chunks do not affect archive.stats
+        self.cache.chunk_decref(self.id, self.meta_stats)
 
     def save(self, name=None, comment=None, timestamp=None, stats=None, additional_metadata=None):
         name = name or self.name
@@ -649,7 +653,8 @@ Utilization of max. archive size: {csize_max:.0%}
         data = self.key.pack_and_authenticate_metadata(metadata.as_dict(), context=b'archive')
         self.id = self.key.id_hash(data)
         try:
-            self.cache.add_chunk(self.id, data, self.stats)
+            # Use meta_stats so metadata chunk addition does not skew archive.stats
+            self.cache.add_chunk(self.id, data, self.meta_stats)
         except IntegrityError as err:
             err_msg = str(err)
             # hack to avoid changing the RPC protocol by introducing new (more specific) exception class
@@ -687,13 +692,14 @@ Utilization of max. archive size: {csize_max:.0%}
         if have_borg12_meta and not want_unique:
             unique_csize = 0
         else:
-            def add(id):
-                entry = cache.chunks[id]
-                archive_index.add(id, 1, entry.size, entry.csize)
 
             archive_index = ChunkIndex()
             sync = CacheSynchronizer(archive_index)
-            add(self.id)
+            # do NOT add the archive metadata chunk (self.id) here.
+            # The metadata chunk is accounted for via meta_stats during creation and must
+            # not contribute to the "This archive" deduplicated size computed by borg info.
+            # See issue #8898: make info's deduplicated size match create-time stats.
+
             # we must escape any % char in the archive name, because we use it in a format string, see #6500
             arch_name_escd = self.name.replace('%', '%%')
             pi = ProgressIndicatorPercent(total=len(self.metadata.items),
@@ -701,7 +707,8 @@ Utilization of max. archive size: {csize_max:.0%}
                                           msgid='archive.calc_stats')
             for id, chunk in zip(self.metadata.items, self.repository.get_many(self.metadata.items)):
                 pi.show(increase=1)
-                add(id)
+                # do NOT add(id) here: this is a metadata stream chunk and must not
+                # be accounted for in per-archive stats, see comment above.
                 data = self.key.decrypt(id, chunk)
                 sync.feed(data)
             unique_csize = archive_index.stats_against(cache.chunks)[3]
@@ -965,9 +972,11 @@ Utilization of max. archive size: {csize_max:.0%}
         setattr(metadata, key, value)
         data = self.key.pack_and_authenticate_metadata(metadata.as_dict(), context=b'archive')
         new_id = self.key.id_hash(data)
-        self.cache.add_chunk(new_id, data, self.stats)
+        # Use a local Statistics() so that metadata updates do not affect archive.stats.
+        stats = Statistics(output_json=False, iec=self.iec)
+        self.cache.add_chunk(new_id, data, stats)
         self.manifest.archives[self.name] = (new_id, metadata.time)
-        self.cache.chunk_decref(self.id, self.stats)
+        self.cache.chunk_decref(self.id, stats)
         self.id = new_id
 
     def rename(self, name):
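
Reviewer note: the essence of the archive.py change is that content statistics and metadata statistics are tracked by two independent counters. Below is a minimal sketch of that accounting split; the `Statistics` class here is a simplified stand-in for borg's own, and `store_chunk()` with its arguments is a hypothetical helper, not borg API:

```python
# Minimal sketch of the accounting split; Statistics is a simplified
# stand-in for borg's class, store_chunk() is a hypothetical helper.
class Statistics:
    def __init__(self):
        self.osize = 0  # original (uncompressed) size
        self.csize = 0  # compressed size

    def update(self, size, csize):
        self.osize += size
        self.csize += csize

content_stats = Statistics()  # what "This archive" stats should reflect
meta_stats = Statistics()     # items stream + archive metadata chunks

def store_chunk(data, is_metadata):
    csize = len(data) // 2  # pretend compression halves every chunk
    stats = meta_stats if is_metadata else content_stats
    stats.update(len(data), csize)

store_chunk(b"file content" * 1000, is_metadata=False)  # a file's data
store_chunk(b"items stream chunk", is_metadata=True)    # metadata
# content_stats now reflects only file data: repacking the metadata
# stream (e.g. via recreate) cannot change the reported original size.
assert content_stats.osize == 12000 and meta_stats.osize == 18
```

Because only the content counter feeds the per-archive numbers, a recreate that repacks the metadata stream into a different number of chunks leaves the reported original size unchanged, which is what test_original_size_stable_across_recreate below verifies.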

+ 50 - 0
src/borg/testsuite/archiver.py

@@ -1617,6 +1617,37 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         info_archive = self.cmd('info', '--first', '1', self.repository_location)
         assert 'Archive name: test\n' in info_archive
 
+    def test_info_and_create_stats(self):
+        # The "This archive" deduplicated size should match between `borg create --stats`
+        # and `borg info`, and likewise the "All archives" deduplicated size.
+        # However, even with only one archive in the repo, "All archives" does not match
+        # "This archive": metadata is accounted for at the repository level ("All archives"),
+        # but not for an individual archive ("This archive").
+        data = b'Z' * (1024 * 80)
+        self.create_regular_file('file1', contents=data)
+        self.create_regular_file('file2', contents=data)
+        self.cmd('init', '--encryption=repokey', self.repository_location)
+        create_json = json.loads(self.cmd('create', '--json', self.repository_location + '::only', 'input'))
+
+        # From create --json
+        dedup_this_create = create_json['archive']['stats']['deduplicated_size']
+        dedup_all_create = create_json['cache']['stats']['unique_csize']  # compressed, matches deduplicated_size
+
+        # From info --json (archive and repository views)
+        info_archive_json = json.loads(self.cmd('info', '--json', self.repository_location + '::only'))
+        info_repo_json = json.loads(self.cmd('info', '--json', self.repository_location))
+        assert len(info_archive_json['archives']) == 1
+        dedup_this_info = info_archive_json['archives'][0]['stats']['deduplicated_size']
+        dedup_all_info = info_repo_json['cache']['stats']['unique_csize']
+
+        # create and info shall give the same numbers
+        assert dedup_this_create == dedup_this_info
+        assert dedup_all_create == dedup_all_info
+        # accounting for "all archives" includes metadata chunks, while "this archive"
+        # excludes them, so a mismatch is expected.
+        assert dedup_this_create < dedup_all_create
+        assert dedup_this_info < dedup_all_info
+
     def test_info_json(self):
         self.create_regular_file('file1', size=1024 * 80)
         self.cmd('init', '--encryption=repokey', self.repository_location)
@@ -4021,6 +4052,25 @@ id: 2 / e29442 3506da 4e1ea7 / 25f62a 5a3d41 - 02
             self.cmd('create', self.repository_location + '::archive', 'input', fork=True,
                      exit_code=Repository.InvalidRepository.exit_mcode)
 
+    def test_original_size_stable_across_recreate(self):
+        # Test that changes in archive metadata (e.g. the number of metadata stream chunks) do not influence the original size.
+        self.cmd('init', '--encryption=repokey', self.repository_location)
+
+        def original_size(archive_name):
+            info = json.loads(self.cmd('info', '--json', f"{self.repository_location}::{archive_name}"))
+            return info['archives'][0]['stats']['original_size']
+
+        sizes = [12345, 67890]
+        self.create_regular_file('file1', size=sizes[0])
+        self.create_regular_file('file2', size=sizes[1])
+
+        self.cmd('create', '--compression=none', self.repository_location + '::archive', 'input')
+        assert original_size('archive') == sum(sizes)
+
+        # Recreate with different chunker params to try to reproduce #8898.
+        self.cmd('recreate', '--chunker-params=10,12,11,63', self.repository_location + '::archive')
+        assert original_size('archive') == sum(sizes)
+
 
 @unittest.skipUnless('binary' in BORG_EXES, 'no borg.exe available')
 class ArchiverTestCaseBinary(ArchiverTestCase):
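
The invariant checked by test_info_and_create_stats can also be verified by hand against a real repository. A rough sketch, assuming the borg 1.x JSON field names used in the tests above and that no other archives are created between the commands; the repo path and archive name are placeholders:

```python
# Rough manual check; paths and archive name are placeholders, and the
# JSON field names are the borg 1.x ones used in the tests above.
import json
import subprocess

def borg_json(*args):
    return json.loads(subprocess.check_output(('borg',) + args))

create = borg_json('create', '--json', '/tmp/repo::test', 'input')
info_archive = borg_json('info', '--json', '/tmp/repo::test')
info_repo = borg_json('info', '--json', '/tmp/repo')

# "This archive": create-time and info-time deduplicated sizes must agree.
assert (create['archive']['stats']['deduplicated_size'] ==
        info_archive['archives'][0]['stats']['deduplicated_size'])
# "All archives": both views read the cache's unique compressed size.
assert (create['cache']['stats']['unique_csize'] ==
        info_repo['cache']['stats']['unique_csize'])
```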