Browse source

compute the deduplicated size before compression

so we do not need csize for it.
Thomas Waldmann 3 years ago
parent
commit
19dfbe5c5c
4 changed files with 40 additions and 27 deletions
  1. src/borg/archive.py (+17 -6)
  2. src/borg/archiver.py (+3 -3)
  3. src/borg/cache.py (+10 -10)
  4. src/borg/testsuite/archive.py (+10 -8)

+ 17 - 6
src/borg/archive.py

@@ -58,38 +58,45 @@ class Statistics:
     def __init__(self, output_json=False, iec=False):
         self.output_json = output_json
         self.iec = iec
-        self.osize = self.nfiles = 0
-        self.osize_parts = self.nfiles_parts = 0
+        self.osize = self.usize = self.nfiles = 0
+        self.osize_parts = self.usize_parts = self.nfiles_parts = 0
         self.last_progress = 0  # timestamp when last progress was shown
 
-    def update(self, size, part=False):
+    def update(self, size, unique, part=False):
         if not part:
             self.osize += size
+            if unique:
+                self.usize += size
         else:
             self.osize_parts += size
+            if unique:
+                self.usize_parts += size
 
     def __add__(self, other):
         if not isinstance(other, Statistics):
             raise TypeError('can only add Statistics objects')
         stats = Statistics(self.output_json, self.iec)
         stats.osize = self.osize + other.osize
+        stats.usize = self.usize + other.usize
         stats.nfiles = self.nfiles + other.nfiles
         stats.osize_parts = self.osize_parts + other.osize_parts
+        stats.usize_parts = self.usize_parts + other.usize_parts
         stats.nfiles_parts = self.nfiles_parts + other.nfiles_parts
         return stats
 
-    summary = "{label:15} {stats.osize_fmt:>20s}"
+    summary = "{label:15} {stats.osize_fmt:>20s} {stats.usize_fmt:>20s}"
 
     def __str__(self):
         return self.summary.format(stats=self, label='This archive:')
 
     def __repr__(self):
-        return "<{cls} object at {hash:#x} ({self.osize})>".format(
+        return "<{cls} object at {hash:#x} ({self.osize}, {self.usize})>".format(
             cls=type(self).__name__, hash=id(self), self=self)
 
     def as_dict(self):
         return {
             'original_size': FileSize(self.osize, iec=self.iec),
+            'deduplicated_size': FileSize(self.usize, iec=self.iec),
             'nfiles': self.nfiles,
         }
 
@@ -114,6 +121,10 @@ class Statistics:
     def osize_fmt(self):
         return format_file_size(self.osize, iec=self.iec)
 
+    @property
+    def usize_fmt(self):
+        return format_file_size(self.usize, iec=self.iec)
+
     def show_progress(self, item=None, final=False, stream=None, dt=None):
         now = time.monotonic()
         if dt is None or now - self.last_progress > dt:
@@ -134,7 +145,7 @@ class Statistics:
             else:
                 columns, lines = get_terminal_size()
                 if not final:
-                    msg = '{0.osize_fmt} O {0.nfiles} N '.format(self)
+                    msg = '{0.osize_fmt} O {0.usize_fmt} U {0.nfiles} N '.format(self)
                     path = remove_surrogates(item.path) if item else ''
                     space = columns - swidth(msg)
                     if space < 12:
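
A hypothetical sequence showing how the new counters evolve (using the Statistics class as changed above):

    stats = Statistics()
    stats.update(100, unique=True)   # new chunk:  osize=100, usize=100
    stats.update(100, unique=False)  # duplicate:  osize=200, usize=100 (unchanged)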

+ 3 - 3
src/borg/archiver.py

@@ -99,7 +99,7 @@ except BaseException:
 assert EXIT_ERROR == 2, "EXIT_ERROR is not 2, as expected - fix assert AND exception handler right above this line."
 
 
-STATS_HEADER = "                       Original size"
+STATS_HEADER = "                       Original size    Deduplicated size"
 
 PURE_PYTHON_MSGPACK_WARNING = "Using a pure-python msgpack! This will result in lower performance."
 
@@ -1797,8 +1797,8 @@ class Archiver:
                 Command line: {command_line}
                 Utilization of maximum supported archive size: {limits[max_archive_size]:.0%}
                 ------------------------------------------------------------------------------
-                                       Original size
-                This archive:   {stats[original_size]:>20s}
+                                       Original size    Deduplicated size
+                This archive:   {stats[original_size]:>20s} {stats[deduplicated_size]:>20s}
                 {cache}
                 """).strip().format(cache=cache, **info))
             if self.exit_code:
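
The widened STATS_HEADER lines up with the two right-aligned 20-character columns produced by Statistics.summary; a quick illustrative check (made-up sizes):

    # Illustrative alignment check for the new two-column layout.
    summary = "{label:15} {osize:>20s} {usize:>20s}"
    print("                       Original size    Deduplicated size")
    print(summary.format(label='This archive:', osize='1.23 GB', usize='456 MB'))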

+ 10 - 10
src/borg/cache.py

@@ -406,7 +406,7 @@ class Cache:
 
 class CacheStatsMixin:
     str_format = """\
-All archives:   {0.total_size:>20s}
+All archives:   {0.total_size:>20s} {0.unique_size:>20s}
 
                        Unique chunks         Total chunks
 Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
@@ -440,7 +440,7 @@ Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
 
     def format_tuple(self):
         stats = self.stats()
-        for field in ['total_size', ]:
+        for field in ['total_size', 'unique_size']:
             stats[field] = format_file_size(stats[field], iec=self.iec)
         return self.Summary(**stats)
 
@@ -905,7 +905,7 @@ class LocalCache(CacheStatsMixin):
         data = self.key.encrypt(id, chunk, compress=compress)
         self.repository.put(id, data, wait=wait)
         self.chunks.add(id, 1, size)
-        stats.update(size)
+        stats.update(size, not refcount)
         return ChunkListEntry(id, size)
 
     def seen_chunk(self, id, size=None):
@@ -921,7 +921,7 @@ class LocalCache(CacheStatsMixin):
         if not self.txn_active:
             self.begin_txn()
         count, _size = self.chunks.incref(id)
-        stats.update(_size, part=part)
+        stats.update(_size, False, part=part)
         return ChunkListEntry(id, _size)
 
     def chunk_decref(self, id, stats, wait=True, part=False):
@@ -931,9 +931,9 @@ class LocalCache(CacheStatsMixin):
         if count == 0:
             del self.chunks[id]
             self.repository.delete(id, wait=wait)
-            stats.update(-size, part=part)
+            stats.update(-size, True, part=part)
         else:
-            stats.update(-size, part=part)
+            stats.update(-size, False, part=part)
 
     def file_known_and_unchanged(self, hashed_path, path_hash, st):
         """
@@ -1072,7 +1072,7 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         data = self.key.encrypt(id, chunk, compress=compress)
         self.repository.put(id, data, wait=wait)
         self.chunks.add(id, 1, size)
-        stats.update(size)
+        stats.update(size, not refcount)
         return ChunkListEntry(id, size)
 
     def seen_chunk(self, id, size=None):
@@ -1094,7 +1094,7 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         # size or add_chunk); we can't add references to those (size=0 is invalid) and generally don't try to.
         size = _size or size
         assert size
-        stats.update(size, part=part)
+        stats.update(size, False, part=part)
         return ChunkListEntry(id, size)
 
     def chunk_decref(self, id, stats, wait=True, part=False):
@@ -1104,9 +1104,9 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         if count == 0:
             del self.chunks[id]
             self.repository.delete(id, wait=wait)
-            stats.update(-size, part=part)
+            stats.update(-size, True, part=part)
         else:
-            stats.update(-size, part=part)
+            stats.update(-size, False, part=part)
 
     def commit(self):
         if not self._txn_active:
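
On the decref side, a chunk leaves the deduplicated total only when its last reference disappears. A simplified sketch (assuming a Stats object like the one sketched at the top of this page; here chunks maps id -> (refcount, size) as a stand-in for Borg's chunk index):

    def chunk_decref(chunks, chunk_id, stats):
        count, size = chunks[chunk_id]
        count -= 1
        if count == 0:
            del chunks[chunk_id]               # last reference: chunk is deleted
            stats.update(-size, unique=True)   # shrink both osize and usize
        else:
            chunks[chunk_id] = (count, size)
            stats.update(-size, unique=False)  # only osize shrinks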

+ 10 - 8
src/borg/testsuite/archive.py

@@ -19,44 +19,46 @@ from ..platform import uid2user, gid2group
 @pytest.fixture()
 def stats():
     stats = Statistics()
-    stats.update(20)
+    stats.update(20, unique=True)
     return stats
 
 
 def test_stats_basic(stats):
     assert stats.osize == 20
-    stats.update(20)
+    assert stats.usize == 20
+    stats.update(20, unique=False)
     assert stats.osize == 40
+    assert stats.usize == 20
 
 
 def tests_stats_progress(stats, monkeypatch, columns=80):
     monkeypatch.setenv('COLUMNS', str(columns))
     out = StringIO()
     stats.show_progress(stream=out)
-    s = '20 B O 0 N '
+    s = '20 B O 20 B U 0 N '
     buf = ' ' * (columns - len(s))
     assert out.getvalue() == s + buf + "\r"
 
     out = StringIO()
-    stats.update(10 ** 3)
+    stats.update(10 ** 3, unique=False)
     stats.show_progress(item=Item(path='foo'), final=False, stream=out)
-    s = '1.02 kB O 0 N foo'
+    s = '1.02 kB O 20 B U 0 N foo'
     buf = ' ' * (columns - len(s))
     assert out.getvalue() == s + buf + "\r"
     out = StringIO()
     stats.show_progress(item=Item(path='foo'*40), final=False, stream=out)
-    s = '1.02 kB O 0 N foofoofoofoofoofoofoofoofoofoo...foofoofoofoofoofoofoofoofoofoofoo'
+    s = '1.02 kB O 20 B U 0 N foofoofoofoofoofoofoofoofo...foofoofoofoofoofoofoofoofoofoo'
     buf = ' ' * (columns - len(s))
     assert out.getvalue() == s + buf + "\r"
 
 
 def test_stats_format(stats):
     assert str(stats) == """\
-This archive:                   20 B"""
+This archive:                   20 B                 20 B"""
     s = f"{stats.osize_fmt}"
     assert s == "20 B"
     # kind of redundant, but id is variable so we can't match reliably
-    assert repr(stats) == f'<Statistics object at {id(stats):#x} (20)>'
+    assert repr(stats) == f'<Statistics object at {id(stats):#x} (20, 20)>'
 
 
 def test_stats_progress_json(stats):