Browse source code

Adding performance statistics to borg create (#6991)

- file status A/M/E counters
- chunking time
- hashing time
- rx_bytes / tx_bytes

Note: the sleep() in the test is needed because timestamp granularity on Linux is much coarser than expected (file timestamps come from the system timer, which ticks at 100 Hz or 250 Hz).
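
To see the granularity issue in isolation, here is a quick standalone check (not part of the commit; standard library only). It creates two files back to back and compares their mtimes: on a kernel that derives file timestamps from a coarse 100 Hz or 250 Hz clock, the first pair may compare equal, while a 10 ms sleep is usually enough to separate them.

    import os
    import tempfile
    import time

    with tempfile.TemporaryDirectory() as tmp:
        a, b = os.path.join(tmp, "a"), os.path.join(tmp, "b")
        open(a, "w").close()
        open(b, "w").close()                 # no sleep: mtimes may collide
        no_sleep_equal = os.stat(a).st_mtime_ns == os.stat(b).st_mtime_ns

        open(a, "w").close()
        time.sleep(0.01)                     # >= one 100 Hz timer tick
        open(b, "w").close()
        with_sleep_equal = os.stat(a).st_mtime_ns == os.stat(b).st_mtime_ns

    print("equal without sleep:", no_sleep_equal)  # often True on coarse clocks
    print("equal with sleep:", with_sleep_equal)   # expected False
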
Franco Ayala 2 years ago
parent
commit
2ed7f317d3

+ 37 - 2
src/borg/archive.py

@@ -4,7 +4,7 @@ import os
 import stat
 import sys
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 from contextlib import contextmanager
 from datetime import datetime, timedelta
 from functools import partial
@@ -60,6 +60,11 @@ class Statistics:
         self.osize = self.usize = self.nfiles = 0
         self.osize_parts = self.usize_parts = self.nfiles_parts = 0
         self.last_progress = 0  # timestamp when last progress was shown
+        self.files_stats = defaultdict(int)
+        self.chunking_time = 0.0
+        self.hashing_time = 0.0
+        self.rx_bytes = 0
+        self.tx_bytes = 0

     def update(self, size, unique, part=False):
         if not part:
@@ -81,15 +86,36 @@ class Statistics:
         stats.osize_parts = self.osize_parts + other.osize_parts
         stats.usize_parts = self.usize_parts + other.usize_parts
         stats.nfiles_parts = self.nfiles_parts + other.nfiles_parts
+        stats.chunking_time = self.chunking_time + other.chunking_time
+        stats.hashing_time = self.hashing_time + other.hashing_time
+        for key in other.files_stats:
+            stats.files_stats[key] = self.files_stats[key] + other.files_stats[key]
+
         return stats

     def __str__(self):
+        hashing_time = format_timedelta(timedelta(seconds=self.hashing_time))
+        chunking_time = format_timedelta(timedelta(seconds=self.chunking_time))
         return """\
 Number of files: {stats.nfiles}
 Original size: {stats.osize_fmt}
 Deduplicated size: {stats.usize_fmt}
+Time spent in hashing: {hashing_time}
+Time spent in chunking: {chunking_time}
+Added files: {added_files}
+Unchanged files: {unchanged_files}
+Modified files: {modified_files}
+Error files: {error_files}
+Bytes read from remote: {stats.rx_bytes}
+Bytes sent to remote: {stats.tx_bytes}
 """.format(
-            stats=self
+            stats=self,
+            hashing_time=hashing_time,
+            chunking_time=chunking_time,
+            added_files=self.files_stats["A"],
+            unchanged_files=self.files_stats["U"],
+            modified_files=self.files_stats["M"],
+            error_files=self.files_stats["E"],
         )

     def __repr__(self):
@@ -102,6 +128,9 @@ Deduplicated size: {stats.usize_fmt}
             "original_size": FileSize(self.osize, iec=self.iec),
             "original_size": FileSize(self.osize, iec=self.iec),
             "deduplicated_size": FileSize(self.usize, iec=self.iec),
             "deduplicated_size": FileSize(self.usize, iec=self.iec),
             "nfiles": self.nfiles,
             "nfiles": self.nfiles,
+            "hashing_time": self.hashing_time,
+            "chunking_time": self.chunking_time,
+            "files_stats": self.files_stats,
         }
         }
 
 
     def as_raw_dict(self):
     def as_raw_dict(self):
@@ -1237,7 +1266,9 @@ class ChunksProcessor:
         if not chunk_processor:

             def chunk_processor(chunk):
+                started_hashing = time.monotonic()
                 chunk_id, data = cached_hash(chunk, self.key.id_hash)
+                stats.hashing_time += time.monotonic() - started_hashing
                 chunk_entry = cache.add_chunk(chunk_id, {}, data, stats=stats, wait=False)
                 self.cache.repository.async_response(wait=False)
                 return chunk_entry
@@ -1411,7 +1442,9 @@ class FilesystemObjectProcessors:
                 else:  # normal case, no "2nd+" hardlink
                     if not is_special_file:
                         hashed_path = safe_encode(os.path.join(self.cwd, path))
+                        started_hashing = time.monotonic()
                         path_hash = self.key.id_hash(hashed_path)
+                        self.stats.hashing_time += time.monotonic() - started_hashing
                         known, ids = cache.file_known_and_unchanged(hashed_path, path_hash, st)
                     else:
                         # in --read-special mode, we may be called for special files.
@@ -1434,6 +1467,7 @@ class FilesystemObjectProcessors:
                     else:
                         status = "M" if known else "A"  # regular file, modified or added
                     self.print_file_status(status, path)
+                    self.stats.files_stats[status] += 1
                     status = None  # we already printed the status
                     # Only chunkify the file if needed
                     if chunks is not None:
@@ -1447,6 +1481,7 @@ class FilesystemObjectProcessors:
                                 self.show_progress,
                                 backup_io_iter(self.chunker.chunkify(None, fd)),
                             )
+                            self.stats.chunking_time = self.chunker.chunking_time
                         if is_win32:
                             changed_while_backup = False  # TODO
                         else:
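
An aside on the Statistics.__add__ hunk above: files_stats is a defaultdict(int), so indexing a status key that exists on only one side yields 0 instead of raising KeyError. A minimal standalone sketch of the same merge pattern (hypothetical data, not borg code):

    from collections import defaultdict

    a = defaultdict(int, {"A": 2, "M": 1})   # counters from one Statistics
    b = defaultdict(int, {"A": 1, "E": 3})   # counters from another
    merged = defaultdict(int)
    for key in b:
        merged[key] = a[key] + b[key]        # a[key] defaults to 0 if missing
    print(dict(merged))                      # {'A': 3, 'E': 3}

Note that, as written, keys present only on the left-hand side ("M" here) are not visited by the loop over other.files_stats.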

+ 6 - 0
src/borg/archiver/create_cmd.py

@@ -119,6 +119,7 @@ class CreateMixIn:
                     if status == "C":
                     if status == "C":
                         self.print_warning("%s: file changed while we backed it up", path)
                         self.print_warning("%s: file changed while we backed it up", path)
                     self.print_file_status(status, path)
                     self.print_file_status(status, path)
+                    fso.stats.files_stats[status] += 1
                 if args.paths_from_command:
                 if args.paths_from_command:
                     rc = proc.wait()
                     rc = proc.wait()
                     if rc != 0:
                     if rc != 0:
@@ -142,6 +143,7 @@ class CreateMixIn:
                         else:
                             status = "-"
                         self.print_file_status(status, path)
+                        fso.stats.files_stats[status] += 1
                         continue
                     path = os.path.normpath(path)
                     parent_dir = os.path.dirname(path) or "."
@@ -185,6 +187,8 @@ class CreateMixIn:
                 if args.progress:
                     archive.stats.show_progress(final=True)
                 archive.stats += fso.stats
+                archive.stats.rx_bytes = getattr(repository, "rx_bytes", 0)
+                archive.stats.tx_bytes = getattr(repository, "tx_bytes", 0)
                 if sig_int:
                     # do not save the archive if the user ctrl-c-ed - it is valid, but incomplete.
                     # we already have a checkpoint archive in this case.
@@ -469,6 +473,8 @@ class CreateMixIn:
             self.print_warning("%s: file changed while we backed it up", path)
         if not recurse_excluded_dir:
             self.print_file_status(status, path)
+            if status is not None:
+                fso.stats.files_stats[status] += 1

     def build_parser_create(self, subparsers, common_parser, mid_common_parser):
         from ._common import process_epilog

+ 9 - 0
src/borg/chunker.pyx

@@ -2,6 +2,7 @@ API_VERSION = '1.2_01'

 import errno
 import os
+import time
 from collections import namedtuple

 from .constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
@@ -145,6 +146,7 @@ class ChunkerFixed:
     def __init__(self, block_size, header_size=0, sparse=False):
         self.block_size = block_size
         self.header_size = header_size
+        self.chunking_time = 0.0
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
@@ -198,6 +200,7 @@ class ChunkerFixed:
                 offset = range_start
                 dseek(offset, os.SEEK_SET, fd, fh)
             while range_size:
+                started_chunking = time.monotonic()
                 wanted = min(range_size, self.block_size)
                 if is_data:
                     # read block from the range
@@ -217,6 +220,7 @@ class ChunkerFixed:
                 if got > 0:
                     offset += got
                     range_size -= got
+                    self.chunking_time += time.monotonic() - started_chunking
                     yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
@@ -236,6 +240,7 @@ cdef class Chunker:
     It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
     """
     cdef _Chunker *chunker
+    cdef readonly float chunking_time

     def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
         min_size = 1 << chunk_min_exp
@@ -245,6 +250,8 @@ cdef class Chunker:
         assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
         hash_mask = (1 << hash_mask_bits) - 1
         self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
+        self.chunking_time = 0.0
+

     def chunkify(self, fd, fh=-1):
         """
@@ -265,6 +272,7 @@ cdef class Chunker:
         return self

     def __next__(self):
+        started_chunking = time.monotonic()
         data = chunker_process(self.chunker)
         got = len(data)
         # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
@@ -275,6 +283,7 @@ cdef class Chunker:
             allocation = CH_ALLOC
         else:
             allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
         return Chunk(data, size=got, allocation=allocation)
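
Both chunkers time their work the same way: take time.monotonic() before producing a chunk and add the delta to a running chunking_time total kept on the chunker object, which is why FilesystemObjectProcessors can pick the total up with a plain assignment after chunkify() has been consumed. A standalone sketch of that accumulator pattern (hypothetical class, not borg code):

    import time

    class TimedProducer:
        """Iterator that accumulates wall-clock time spent producing items."""
        def __init__(self, items):
            self.items = iter(items)
            self.producing_time = 0.0          # analogous to chunking_time

        def __iter__(self):
            return self

        def __next__(self):
            started = time.monotonic()         # monotonic: immune to clock jumps
            item = next(self.items)            # the work being measured
            self.producing_time += time.monotonic() - started
            return item

Consuming it (for example list(TimedProducer(data))) leaves the accumulated total on producing_time, ready to be read by the caller.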

+ 8 - 0
src/borg/testsuite/archive.py

@@ -60,6 +60,14 @@ def test_stats_format(stats):
 Number of files: 1
 Original size: 20 B
 Deduplicated size: 20 B
+Time spent in hashing: 0.00 seconds
+Time spent in chunking: 0.00 seconds
+Added files: 0
+Unchanged files: 0
+Modified files: 0
+Error files: 0
+Bytes read from remote: 0
+Bytes sent to remote: 0
 """
     )
     s = f"{stats.osize_fmt}"

+ 77 - 0
src/borg/testsuite/archiver/create_cmd.py

@@ -1,6 +1,7 @@
 import errno
 import json
 import os
+from random import randbytes
 import shutil
 import socket
 import stat
@@ -626,6 +627,46 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         if has_lchflags:
             self.assert_in("x input/file3", output)
 
+    def test_file_status_counters(self):
+        """Test file status counters in the stats of `borg create --stats`"""
+
+        def to_dict(borg_create_output):
+            borg_create_output = borg_create_output.strip().splitlines()
+            borg_create_output = [line.split(":", 1) for line in borg_create_output]
+            borg_create_output = {
+                key: int(value)
+                for key, value in borg_create_output
+                if key in ("Added files", "Unchanged files", "Modified files")
+            }
+            return borg_create_output
+
+        # Test case set up: create a repository
+        self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION)
+        # Archive an empty dir
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive", self.input_path)
+        result = to_dict(result)
+        assert result["Added files"] == 0
+        assert result["Unchanged files"] == 0
+        assert result["Modified files"] == 0
+        # Archive a dir with two added files
+        self.create_regular_file("testfile1", contents=b"test1")
+        time.sleep(0.01)  # testfile2 must have newer timestamps than testfile1
+        self.create_regular_file("testfile2", contents=b"test2")
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive2", self.input_path)
+        result = to_dict(result)
+        assert result["Added files"] == 2
+        assert result["Unchanged files"] == 0
+        assert result["Modified files"] == 0
+        # Archive a dir with 1 unmodified file and 1 modified
+        self.create_regular_file("testfile1", contents=b"new data")
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive3", self.input_path)
+        result = to_dict(result)
+        # Should process testfile2 as added because of
+        # https://borgbackup.readthedocs.io/en/stable/faq.html#i-am-seeing-a-added-status-for-an-unchanged-file
+        assert result["Added files"] == 1
+        assert result["Unchanged files"] == 0
+        assert result["Modified files"] == 1
+
     def test_create_json(self):
         self.create_regular_file("file1", size=1024 * 80)
         self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION)
@@ -731,6 +772,42 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         log = self.cmd(f"--repo={self.repository_location}", "--debug", "create", "test", "input")
         assert "security: read previous location" in log
 
+    def test_hashing_time(self):
+        def extract_hashing_time(borg_create_output):
+            borg_create_output = borg_create_output.strip().splitlines()
+            borg_create_output = [line.split(":", 1) for line in borg_create_output]
+            hashing_time = [line for line in borg_create_output if line[0] == "Time spent in hashing"].pop()
+            hashing_time = hashing_time[1]
+            hashing_time = float(hashing_time.removesuffix(" seconds"))
+            return hashing_time
+
+        # Test case set up: create a repository and a file
+        self.cmd(f"--repo={self.repository_location}", "rcreate", "--encryption=none")
+        self.create_regular_file("testfile", contents=randbytes(6000000))
+        # Archive
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive", self.input_path)
+        hashing_time = extract_hashing_time(result)
+
+        assert hashing_time > 0.0
+
+    def test_chunking_time(self):
+        def extract_chunking_time(borg_create_output):
+            borg_create_output = borg_create_output.strip().splitlines()
+            borg_create_output = [line.split(":", 1) for line in borg_create_output]
+            chunking_time = [line for line in borg_create_output if line[0] == "Time spent in chunking"].pop()
+            chunking_time = chunking_time[1]
+            chunking_time = float(chunking_time.removesuffix(" seconds"))
+            return chunking_time
+
+        # Test case set up: create a repository and a file
+        self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION)
+        self.create_regular_file("testfile", contents=randbytes(5000000))
+        # Archive
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive", self.input_path)
+        chunking_time = extract_chunking_time(result)
+
+        assert chunking_time > 0.0
+
 
 
 class RemoteArchiverTestCase(RemoteArchiverTestCaseBase, ArchiverTestCase):
     """run the same tests, but with a remote repository"""