
Adding performance statistics to borg create (#6991)

- file status A/U/M/E counters
- chunking time
- hashing time
- rx_bytes / tx_bytes

Note: the sleep() in the test is needed because timestamp granularity on Linux is much coarser than expected (it uses the system timer, 100 Hz or 250 Hz).
Franco Ayala 2 years ago
parent
commit
2ed7f317d3
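
To illustrate the approach described in the commit message (a minimal sketch, not part of the diff; all names below are hypothetical): per-status counters live in a defaultdict(int) keyed by the one-letter file status, and hashing/chunking time is accumulated as time.monotonic() deltas around the hot calls.

    import time
    from collections import defaultdict

    class StatsSketch:
        def __init__(self):
            self.files_stats = defaultdict(int)  # "A"/"U"/"M"/"E" -> count, missing keys read as 0
            self.hashing_time = 0.0              # seconds, summed from monotonic-clock deltas

        def count(self, status):
            self.files_stats[status] += 1

        def timed_hash(self, id_hash, data):
            started = time.monotonic()
            digest = id_hash(data)
            self.hashing_time += time.monotonic() - started
            return digest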

+ 37 - 2
src/borg/archive.py

@@ -4,7 +4,7 @@ import os
 import stat
 import sys
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 from contextlib import contextmanager
 from datetime import datetime, timedelta
 from functools import partial
@@ -60,6 +60,11 @@ class Statistics:
         self.osize = self.usize = self.nfiles = 0
         self.osize_parts = self.usize_parts = self.nfiles_parts = 0
         self.last_progress = 0  # timestamp when last progress was shown
+        self.files_stats = defaultdict(int)
+        self.chunking_time = 0.0
+        self.hashing_time = 0.0
+        self.rx_bytes = 0
+        self.tx_bytes = 0
 
     def update(self, size, unique, part=False):
         if not part:
@@ -81,15 +86,36 @@ class Statistics:
         stats.osize_parts = self.osize_parts + other.osize_parts
         stats.usize_parts = self.usize_parts + other.usize_parts
         stats.nfiles_parts = self.nfiles_parts + other.nfiles_parts
+        stats.chunking_time = self.chunking_time + other.chunking_time
+        stats.hashing_time = self.hashing_time + other.hashing_time
+        for key in other.files_stats:
+            stats.files_stats[key] = self.files_stats[key] + other.files_stats[key]
+
         return stats
 
     def __str__(self):
+        hashing_time = format_timedelta(timedelta(seconds=self.hashing_time))
+        chunking_time = format_timedelta(timedelta(seconds=self.chunking_time))
         return """\
 Number of files: {stats.nfiles}
 Original size: {stats.osize_fmt}
 Deduplicated size: {stats.usize_fmt}
+Time spent in hashing: {hashing_time}
+Time spent in chunking: {chunking_time}
+Added files: {added_files}
+Unchanged files: {unchanged_files}
+Modified files: {modified_files}
+Error files: {error_files}
+Bytes read from remote: {stats.rx_bytes}
+Bytes sent to remote: {stats.tx_bytes}
 """.format(
-            stats=self
+            stats=self,
+            hashing_time=hashing_time,
+            chunking_time=chunking_time,
+            added_files=self.files_stats["A"],
+            unchanged_files=self.files_stats["U"],
+            modified_files=self.files_stats["M"],
+            error_files=self.files_stats["E"],
         )
 
     def __repr__(self):
@@ -102,6 +128,9 @@ Deduplicated size: {stats.usize_fmt}
             "original_size": FileSize(self.osize, iec=self.iec),
             "deduplicated_size": FileSize(self.usize, iec=self.iec),
             "nfiles": self.nfiles,
+            "hashing_time": self.hashing_time,
+            "chunking_time": self.chunking_time,
+            "files_stats": self.files_stats,
         }
 
     def as_raw_dict(self):
@@ -1237,7 +1266,9 @@ class ChunksProcessor:
         if not chunk_processor:
 
             def chunk_processor(chunk):
+                started_hashing = time.monotonic()
                 chunk_id, data = cached_hash(chunk, self.key.id_hash)
+                stats.hashing_time += time.monotonic() - started_hashing
                 chunk_entry = cache.add_chunk(chunk_id, {}, data, stats=stats, wait=False)
                 self.cache.repository.async_response(wait=False)
                 return chunk_entry
@@ -1411,7 +1442,9 @@ class FilesystemObjectProcessors:
                 else:  # normal case, no "2nd+" hardlink
                     if not is_special_file:
                         hashed_path = safe_encode(os.path.join(self.cwd, path))
+                        started_hashing = time.monotonic()
                         path_hash = self.key.id_hash(hashed_path)
+                        self.stats.hashing_time += time.monotonic() - started_hashing
                         known, ids = cache.file_known_and_unchanged(hashed_path, path_hash, st)
                     else:
                         # in --read-special mode, we may be called for special files.
@@ -1434,6 +1467,7 @@ class FilesystemObjectProcessors:
                     else:
                         status = "M" if known else "A"  # regular file, modified or added
                     self.print_file_status(status, path)
+                    self.stats.files_stats[status] += 1
                     status = None  # we already printed the status
                     # Only chunkify the file if needed
                     if chunks is not None:
@@ -1447,6 +1481,7 @@ class FilesystemObjectProcessors:
                                 self.show_progress,
                                 backup_io_iter(self.chunker.chunkify(None, fd)),
                             )
+                            self.stats.chunking_time = self.chunker.chunking_time
                         if is_win32:
                             changed_while_backup = False  # TODO
                         else:

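A small aside on the defaultdict(int) introduced above (illustrative only, not part of the diff): statuses that never occurred still read as 0, which is what lets __str__ index files_stats["A"], ["U"], ["M"] and ["E"] unconditionally.

    from collections import defaultdict

    files_stats = defaultdict(int)
    files_stats["A"] += 1     # one added file seen
    print(files_stats["E"])   # -> 0, no KeyError although "E" was never set
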
+ 6 - 0
src/borg/archiver/create_cmd.py

@@ -119,6 +119,7 @@ class CreateMixIn:
                     if status == "C":
                         self.print_warning("%s: file changed while we backed it up", path)
                     self.print_file_status(status, path)
+                    fso.stats.files_stats[status] += 1
                 if args.paths_from_command:
                     rc = proc.wait()
                     if rc != 0:
@@ -142,6 +143,7 @@ class CreateMixIn:
                         else:
                             status = "-"
                         self.print_file_status(status, path)
+                        fso.stats.files_stats[status] += 1
                         continue
                     path = os.path.normpath(path)
                     parent_dir = os.path.dirname(path) or "."
@@ -185,6 +187,8 @@ class CreateMixIn:
                 if args.progress:
                     archive.stats.show_progress(final=True)
                 archive.stats += fso.stats
+                archive.stats.rx_bytes = getattr(repository, "rx_bytes", 0)
+                archive.stats.tx_bytes = getattr(repository, "tx_bytes", 0)
                 if sig_int:
                     # do not save the archive if the user ctrl-c-ed - it is valid, but incomplete.
                     # we already have a checkpoint archive in this case.
@@ -469,6 +473,8 @@ class CreateMixIn:
             self.print_warning("%s: file changed while we backed it up", path)
         if not recurse_excluded_dir:
             self.print_file_status(status, path)
+            if status is not None:
+                fso.stats.files_stats[status] += 1
 
     def build_parser_create(self, subparsers, common_parser, mid_common_parser):
         from ._common import process_epilog

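The getattr(..., 0) fallback above presumably exists because only remote repository objects carry rx_bytes/tx_bytes traffic counters, while a purely local repository has no such attributes. A minimal sketch of that pattern (class names hypothetical):

    class LocalRepoSketch:
        pass                 # no traffic counters on a local repository

    class RemoteRepoSketch:
        rx_bytes = 12345     # bytes received from the server
        tx_bytes = 678       # bytes sent to the server

    for repo in (LocalRepoSketch(), RemoteRepoSketch()):
        rx = getattr(repo, "rx_bytes", 0)   # falls back to 0 when the attribute is missing
        tx = getattr(repo, "tx_bytes", 0)
        print(rx, tx)                       # 0 0, then 12345 678
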
+ 9 - 0
src/borg/chunker.pyx

@@ -2,6 +2,7 @@ API_VERSION = '1.2_01'
 
 import errno
 import os
+import time
 from collections import namedtuple
 
 from .constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
@@ -145,6 +146,7 @@ class ChunkerFixed:
     def __init__(self, block_size, header_size=0, sparse=False):
         self.block_size = block_size
         self.header_size = header_size
+        self.chunking_time = 0.0
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
@@ -198,6 +200,7 @@ class ChunkerFixed:
                 offset = range_start
                 dseek(offset, os.SEEK_SET, fd, fh)
             while range_size:
+                started_chunking = time.monotonic()
                 wanted = min(range_size, self.block_size)
                 if is_data:
                     # read block from the range
@@ -217,6 +220,7 @@ class ChunkerFixed:
                 if got > 0:
                     offset += got
                     range_size -= got
+                    self.chunking_time += time.monotonic() - started_chunking
                     yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
@@ -236,6 +240,7 @@ cdef class Chunker:
     It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
     """
     cdef _Chunker *chunker
+    cdef readonly float chunking_time
 
     def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
         min_size = 1 << chunk_min_exp
@@ -245,6 +250,8 @@ cdef class Chunker:
         assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
         hash_mask = (1 << hash_mask_bits) - 1
         self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
+        self.chunking_time = 0.0
+
 
     def chunkify(self, fd, fh=-1):
         """
@@ -265,6 +272,7 @@ cdef class Chunker:
         return self
 
     def __next__(self):
+        started_chunking = time.monotonic()
         data = chunker_process(self.chunker)
         got = len(data)
         # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
@@ -275,6 +283,7 @@ cdef class Chunker:
             allocation = CH_ALLOC
         else:
             allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
         return Chunk(data, size=got, allocation=allocation)
 
 

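Reading the accumulated timer after chunking could look roughly like this (a sketch, assuming borg.chunker is built and the input is a plain seekable file; the chunker sums monotonic-clock deltas across every chunk it yields):

    import os, tempfile
    from borg.chunker import ChunkerFixed

    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(os.urandom(64 * 1024))
        path = f.name
    with open(path, "rb") as fd:
        chunker = ChunkerFixed(block_size=4096)
        for chunk in chunker.chunkify(fd):
            pass                          # hashing/storing each chunk would happen here
    print(chunker.chunking_time)          # total seconds spent producing chunks
    os.unlink(path)
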
+ 8 - 0
src/borg/testsuite/archive.py

@@ -60,6 +60,14 @@ def test_stats_format(stats):
 Number of files: 1
 Original size: 20 B
 Deduplicated size: 20 B
+Time spent in hashing: 0.00 seconds
+Time spent in chunking: 0.00 seconds
+Added files: 0
+Unchanged files: 0
+Modified files: 0
+Error files: 0
+Bytes read from remote: 0
+Bytes sent to remote: 0
 """
     )
     s = f"{stats.osize_fmt}"

+ 77 - 0
src/borg/testsuite/archiver/create_cmd.py

@@ -1,6 +1,7 @@
 import errno
 import json
 import os
+from random import randbytes
 import shutil
 import socket
 import stat
@@ -626,6 +627,46 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         if has_lchflags:
             self.assert_in("x input/file3", output)
 
+    def test_file_status_counters(self):
+        """Test file status counters in the stats of `borg create --stats`"""
+
+        def to_dict(borg_create_output):
+            borg_create_output = borg_create_output.strip().splitlines()
+            borg_create_output = [line.split(":", 1) for line in borg_create_output]
+            borg_create_output = {
+                key: int(value)
+                for key, value in borg_create_output
+                if key in ("Added files", "Unchanged files", "Modified files")
+            }
+            return borg_create_output
+
+        # Test case set up: create a repository
+        self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION)
+        # Archive an empty dir
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive", self.input_path)
+        result = to_dict(result)
+        assert result["Added files"] == 0
+        assert result["Unchanged files"] == 0
+        assert result["Modified files"] == 0
+        # Archive a dir with two added files
+        self.create_regular_file("testfile1", contents=b"test1")
+        time.sleep(0.01)  # testfile2 must have newer timestamps than testfile1
+        self.create_regular_file("testfile2", contents=b"test2")
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive2", self.input_path)
+        result = to_dict(result)
+        assert result["Added files"] == 2
+        assert result["Unchanged files"] == 0
+        assert result["Modified files"] == 0
+        # Archive a dir with 1 unmodified file and 1 modified
+        self.create_regular_file("testfile1", contents=b"new data")
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive3", self.input_path)
+        result = to_dict(result)
+        # Should process testfile2 as added because of
+        # https://borgbackup.readthedocs.io/en/stable/faq.html#i-am-seeing-a-added-status-for-an-unchanged-file
+        assert result["Added files"] == 1
+        assert result["Unchanged files"] == 0
+        assert result["Modified files"] == 1
+
     def test_create_json(self):
         self.create_regular_file("file1", size=1024 * 80)
         self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION)
@@ -731,6 +772,42 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         log = self.cmd(f"--repo={self.repository_location}", "--debug", "create", "test", "input")
         assert "security: read previous location" in log
 
+    def test_hashing_time(self):
+        def extract_hashing_time(borg_create_output):
+            borg_create_output = borg_create_output.strip().splitlines()
+            borg_create_output = [line.split(":", 1) for line in borg_create_output]
+            hashing_time = [line for line in borg_create_output if line[0] == "Time spent in hashing"].pop()
+            hashing_time = hashing_time[1]
+            hashing_time = float(hashing_time.removesuffix(" seconds"))
+            return hashing_time
+
+        # Test case set up: create a repository and a file
+        self.cmd(f"--repo={self.repository_location}", "rcreate", "--encryption=none")
+        self.create_regular_file("testfile", contents=randbytes(6000000))
+        # Archive
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive", self.input_path)
+        hashing_time = extract_hashing_time(result)
+
+        assert hashing_time > 0.0
+
+    def test_chunking_time(self):
+        def extract_chunking_time(borg_create_output):
+            borg_create_output = borg_create_output.strip().splitlines()
+            borg_create_output = [line.split(":", 1) for line in borg_create_output]
+            chunking_time = [line for line in borg_create_output if line[0] == "Time spent in chunking"].pop()
+            chunking_time = chunking_time[1]
+            chunking_time = float(chunking_time.removesuffix(" seconds"))
+            return chunking_time
+
+        # Test case set up: create a repository and a file
+        self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION)
+        self.create_regular_file("testfile", contents=randbytes(5000000))
+        # Archive
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive", self.input_path)
+        chunking_time = extract_chunking_time(result)
+
+        assert chunking_time > 0.0
+
 
 class RemoteArchiverTestCase(RemoteArchiverTestCaseBase, ArchiverTestCase):
     """run the same tests, but with a remote repository"""