Browse source code

Adding performance statistics to borg create (#6991)

- file status A/M/E counters
- chunking time
- hashing time
- rx_bytes / tx_bytes

Note: the sleep() in the test is needed because timestamp granularity on Linux is much coarser than expected (file timestamps come from the system timer, which ticks at 100 Hz or 250 Hz).
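
To see the granularity issue in isolation, here is a quick standalone check (not part of the commit; standard library only). It creates two files back to back and compares their mtimes: on a kernel that derives file timestamps from a coarse 100 Hz or 250 Hz clock, the first pair may compare equal, while a 10 ms sleep is usually enough to separate them.

    import os
    import tempfile
    import time

    with tempfile.TemporaryDirectory() as tmp:
        a, b = os.path.join(tmp, "a"), os.path.join(tmp, "b")
        open(a, "w").close()
        open(b, "w").close()                 # no sleep: mtimes may collide
        no_sleep_equal = os.stat(a).st_mtime_ns == os.stat(b).st_mtime_ns

        open(a, "w").close()
        time.sleep(0.01)                     # >= one 100 Hz timer tick
        open(b, "w").close()
        with_sleep_equal = os.stat(a).st_mtime_ns == os.stat(b).st_mtime_ns

    print("equal without sleep:", no_sleep_equal)  # often True on coarse clocks
    print("equal with sleep:", with_sleep_equal)   # expected False
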
Franco Ayala 2 years ago
parent
commit
2ed7f317d3

+ 37 - 2
src/borg/archive.py

@@ -4,7 +4,7 @@ import os
 import stat
 import sys
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 from contextlib import contextmanager
 from datetime import datetime, timedelta
 from functools import partial
@@ -60,6 +60,11 @@ class Statistics:
         self.osize = self.usize = self.nfiles = 0
         self.osize_parts = self.usize_parts = self.nfiles_parts = 0
         self.last_progress = 0  # timestamp when last progress was shown
+        self.files_stats = defaultdict(int)
+        self.chunking_time = 0.0
+        self.hashing_time = 0.0
+        self.rx_bytes = 0
+        self.tx_bytes = 0

     def update(self, size, unique, part=False):
         if not part:
@@ -81,15 +86,36 @@ class Statistics:
         stats.osize_parts = self.osize_parts + other.osize_parts
         stats.usize_parts = self.usize_parts + other.usize_parts
         stats.nfiles_parts = self.nfiles_parts + other.nfiles_parts
+        stats.chunking_time = self.chunking_time + other.chunking_time
+        stats.hashing_time = self.hashing_time + other.hashing_time
+        for key in other.files_stats:
+            stats.files_stats[key] = self.files_stats[key] + other.files_stats[key]
+
         return stats

     def __str__(self):
+        hashing_time = format_timedelta(timedelta(seconds=self.hashing_time))
+        chunking_time = format_timedelta(timedelta(seconds=self.chunking_time))
         return """\
 Number of files: {stats.nfiles}
 Original size: {stats.osize_fmt}
 Deduplicated size: {stats.usize_fmt}
+Time spent in hashing: {hashing_time}
+Time spent in chunking: {chunking_time}
+Added files: {added_files}
+Unchanged files: {unchanged_files}
+Modified files: {modified_files}
+Error files: {error_files}
+Bytes read from remote: {stats.rx_bytes}
+Bytes sent to remote: {stats.tx_bytes}
 """.format(
-            stats=self
+            stats=self,
+            hashing_time=hashing_time,
+            chunking_time=chunking_time,
+            added_files=self.files_stats["A"],
+            unchanged_files=self.files_stats["U"],
+            modified_files=self.files_stats["M"],
+            error_files=self.files_stats["E"],
         )

     def __repr__(self):
@@ -102,6 +128,9 @@ Deduplicated size: {stats.usize_fmt}
             "original_size": FileSize(self.osize, iec=self.iec),
             "original_size": FileSize(self.osize, iec=self.iec),
             "deduplicated_size": FileSize(self.usize, iec=self.iec),
             "deduplicated_size": FileSize(self.usize, iec=self.iec),
             "nfiles": self.nfiles,
             "nfiles": self.nfiles,
+            "hashing_time": self.hashing_time,
+            "chunking_time": self.chunking_time,
+            "files_stats": self.files_stats,
         }
         }
 
 
     def as_raw_dict(self):
     def as_raw_dict(self):
@@ -1237,7 +1266,9 @@ class ChunksProcessor:
         if not chunk_processor:

             def chunk_processor(chunk):
+                started_hashing = time.monotonic()
                 chunk_id, data = cached_hash(chunk, self.key.id_hash)
+                stats.hashing_time += time.monotonic() - started_hashing
                 chunk_entry = cache.add_chunk(chunk_id, {}, data, stats=stats, wait=False)
                 self.cache.repository.async_response(wait=False)
                 return chunk_entry
@@ -1411,7 +1442,9 @@ class FilesystemObjectProcessors:
                 else:  # normal case, no "2nd+" hardlink
                     if not is_special_file:
                         hashed_path = safe_encode(os.path.join(self.cwd, path))
+                        started_hashing = time.monotonic()
                         path_hash = self.key.id_hash(hashed_path)
+                        self.stats.hashing_time += time.monotonic() - started_hashing
                         known, ids = cache.file_known_and_unchanged(hashed_path, path_hash, st)
                     else:
                         # in --read-special mode, we may be called for special files.
@@ -1434,6 +1467,7 @@ class FilesystemObjectProcessors:
                     else:
                         status = "M" if known else "A"  # regular file, modified or added
                     self.print_file_status(status, path)
+                    self.stats.files_stats[status] += 1
                     status = None  # we already printed the status
                     # Only chunkify the file if needed
                     if chunks is not None:
@@ -1447,6 +1481,7 @@ class FilesystemObjectProcessors:
                                 self.show_progress,
                                 backup_io_iter(self.chunker.chunkify(None, fd)),
                             )
+                            self.stats.chunking_time = self.chunker.chunking_time
                         if is_win32:
                             changed_while_backup = False  # TODO
                         else:
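
An aside on the Statistics.__add__ hunk above: files_stats is a defaultdict(int), so indexing a status key that exists on only one side yields 0 instead of raising KeyError. A minimal standalone sketch of the same merge pattern (hypothetical data, not borg code):

    from collections import defaultdict

    a = defaultdict(int, {"A": 2, "M": 1})   # counters from one Statistics
    b = defaultdict(int, {"A": 1, "E": 3})   # counters from another
    merged = defaultdict(int)
    for key in b:
        merged[key] = a[key] + b[key]        # a[key] defaults to 0 if missing
    print(dict(merged))                      # {'A': 3, 'E': 3}

Note that, as written, keys present only on the left-hand side ("M" here) are not visited by the loop over other.files_stats.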

+ 6 - 0
src/borg/archiver/create_cmd.py

@@ -119,6 +119,7 @@ class CreateMixIn:
                     if status == "C":
                     if status == "C":
                         self.print_warning("%s: file changed while we backed it up", path)
                         self.print_warning("%s: file changed while we backed it up", path)
                     self.print_file_status(status, path)
                     self.print_file_status(status, path)
+                    fso.stats.files_stats[status] += 1
                 if args.paths_from_command:
                 if args.paths_from_command:
                     rc = proc.wait()
                     rc = proc.wait()
                     if rc != 0:
                     if rc != 0:
@@ -142,6 +143,7 @@ class CreateMixIn:
                         else:
                             status = "-"
                         self.print_file_status(status, path)
+                        fso.stats.files_stats[status] += 1
                         continue
                     path = os.path.normpath(path)
                     parent_dir = os.path.dirname(path) or "."
@@ -185,6 +187,8 @@ class CreateMixIn:
                 if args.progress:
                     archive.stats.show_progress(final=True)
                 archive.stats += fso.stats
+                archive.stats.rx_bytes = getattr(repository, "rx_bytes", 0)
+                archive.stats.tx_bytes = getattr(repository, "tx_bytes", 0)
                 if sig_int:
                     # do not save the archive if the user ctrl-c-ed - it is valid, but incomplete.
                     # we already have a checkpoint archive in this case.
@@ -469,6 +473,8 @@ class CreateMixIn:
             self.print_warning("%s: file changed while we backed it up", path)
         if not recurse_excluded_dir:
             self.print_file_status(status, path)
+            if status is not None:
+                fso.stats.files_stats[status] += 1

     def build_parser_create(self, subparsers, common_parser, mid_common_parser):
         from ._common import process_epilog

+ 9 - 0
src/borg/chunker.pyx

@@ -2,6 +2,7 @@ API_VERSION = '1.2_01'

 import errno
 import os
+import time
 from collections import namedtuple

 from .constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
@@ -145,6 +146,7 @@ class ChunkerFixed:
     def __init__(self, block_size, header_size=0, sparse=False):
         self.block_size = block_size
         self.header_size = header_size
+        self.chunking_time = 0.0
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
@@ -198,6 +200,7 @@ class ChunkerFixed:
                 offset = range_start
                 dseek(offset, os.SEEK_SET, fd, fh)
             while range_size:
+                started_chunking = time.monotonic()
                 wanted = min(range_size, self.block_size)
                 if is_data:
                     # read block from the range
@@ -217,6 +220,7 @@ class ChunkerFixed:
                 if got > 0:
                     offset += got
                     range_size -= got
+                    self.chunking_time += time.monotonic() - started_chunking
                     yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
@@ -236,6 +240,7 @@ cdef class Chunker:
     It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
     """
     cdef _Chunker *chunker
+    cdef readonly float chunking_time

     def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
         min_size = 1 << chunk_min_exp
@@ -245,6 +250,8 @@ cdef class Chunker:
         assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
         hash_mask = (1 << hash_mask_bits) - 1
         self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
+        self.chunking_time = 0.0
+

     def chunkify(self, fd, fh=-1):
         """
@@ -265,6 +272,7 @@ cdef class Chunker:
         return self

     def __next__(self):
+        started_chunking = time.monotonic()
         data = chunker_process(self.chunker)
         got = len(data)
         # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
@@ -275,6 +283,7 @@ cdef class Chunker:
             allocation = CH_ALLOC
         else:
             allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
         return Chunk(data, size=got, allocation=allocation)
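
Both chunkers time their work the same way: take time.monotonic() before producing a chunk and add the delta to a running chunking_time total kept on the chunker object, which is why FilesystemObjectProcessors can pick the total up with a plain assignment after chunkify() has been consumed. A standalone sketch of that accumulator pattern (hypothetical class, not borg code):

    import time

    class TimedProducer:
        """Iterator that accumulates wall-clock time spent producing items."""
        def __init__(self, items):
            self.items = iter(items)
            self.producing_time = 0.0          # analogous to chunking_time

        def __iter__(self):
            return self

        def __next__(self):
            started = time.monotonic()         # monotonic: immune to clock jumps
            item = next(self.items)            # the work being measured
            self.producing_time += time.monotonic() - started
            return item

Consuming it (for example list(TimedProducer(data))) leaves the accumulated total on producing_time, ready to be read by the caller.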

+ 8 - 0
src/borg/testsuite/archive.py

@@ -60,6 +60,14 @@ def test_stats_format(stats):
 Number of files: 1
 Original size: 20 B
 Deduplicated size: 20 B
+Time spent in hashing: 0.00 seconds
+Time spent in chunking: 0.00 seconds
+Added files: 0
+Unchanged files: 0
+Modified files: 0
+Error files: 0
+Bytes read from remote: 0
+Bytes sent to remote: 0
 """
     )
     s = f"{stats.osize_fmt}"

+ 77 - 0
src/borg/testsuite/archiver/create_cmd.py

@@ -1,6 +1,7 @@
 import errno
 import json
 import os
+from random import randbytes
 import shutil
 import socket
 import stat
@@ -626,6 +627,46 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         if has_lchflags:
             self.assert_in("x input/file3", output)
 
+    def test_file_status_counters(self):
+        """Test file status counters in the stats of `borg create --stats`"""
+
+        def to_dict(borg_create_output):
+            borg_create_output = borg_create_output.strip().splitlines()
+            borg_create_output = [line.split(":", 1) for line in borg_create_output]
+            borg_create_output = {
+                key: int(value)
+                for key, value in borg_create_output
+                if key in ("Added files", "Unchanged files", "Modified files")
+            }
+            return borg_create_output
+
+        # Test case set up: create a repository
+        self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION)
+        # Archive an empty dir
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive", self.input_path)
+        result = to_dict(result)
+        assert result["Added files"] == 0
+        assert result["Unchanged files"] == 0
+        assert result["Modified files"] == 0
+        # Archive a dir with two added files
+        self.create_regular_file("testfile1", contents=b"test1")
+        time.sleep(0.01)  # testfile2 must have newer timestamps than testfile1
+        self.create_regular_file("testfile2", contents=b"test2")
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive2", self.input_path)
+        result = to_dict(result)
+        assert result["Added files"] == 2
+        assert result["Unchanged files"] == 0
+        assert result["Modified files"] == 0
+        # Archive a dir with 1 unmodified file and 1 modified
+        self.create_regular_file("testfile1", contents=b"new data")
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive3", self.input_path)
+        result = to_dict(result)
+        # Should process testfile2 as added because of
+        # https://borgbackup.readthedocs.io/en/stable/faq.html#i-am-seeing-a-added-status-for-an-unchanged-file
+        assert result["Added files"] == 1
+        assert result["Unchanged files"] == 0
+        assert result["Modified files"] == 1
+
     def test_create_json(self):
         self.create_regular_file("file1", size=1024 * 80)
         self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION)
@@ -731,6 +772,42 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         log = self.cmd(f"--repo={self.repository_location}", "--debug", "create", "test", "input")
         assert "security: read previous location" in log
 
+    def test_hashing_time(self):
+        def extract_hashing_time(borg_create_output):
+            borg_create_output = borg_create_output.strip().splitlines()
+            borg_create_output = [line.split(":", 1) for line in borg_create_output]
+            hashing_time = [line for line in borg_create_output if line[0] == "Time spent in hashing"].pop()
+            hashing_time = hashing_time[1]
+            hashing_time = float(hashing_time.removesuffix(" seconds"))
+            return hashing_time
+
+        # Test case set up: create a repository and a file
+        self.cmd(f"--repo={self.repository_location}", "rcreate", "--encryption=none")
+        self.create_regular_file("testfile", contents=randbytes(6000000))
+        # Archive
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive", self.input_path)
+        hashing_time = extract_hashing_time(result)
+
+        assert hashing_time > 0.0
+
+    def test_chunking_time(self):
+        def extract_chunking_time(borg_create_output):
+            borg_create_output = borg_create_output.strip().splitlines()
+            borg_create_output = [line.split(":", 1) for line in borg_create_output]
+            chunking_time = [line for line in borg_create_output if line[0] == "Time spent in chunking"].pop()
+            chunking_time = chunking_time[1]
+            chunking_time = float(chunking_time.removesuffix(" seconds"))
+            return chunking_time
+
+        # Test case set up: create a repository and a file
+        self.cmd(f"--repo={self.repository_location}", "rcreate", RK_ENCRYPTION)
+        self.create_regular_file("testfile", contents=randbytes(5000000))
+        # Archive
+        result = self.cmd(f"--repo={self.repository_location}", "create", "--stats", "test_archive", self.input_path)
+        chunking_time = extract_chunking_time(result)
+
+        assert chunking_time > 0.0
+
 
 
 class RemoteArchiverTestCase(RemoteArchiverTestCaseBase, ArchiverTestCase):
     """run the same tests, but with a remote repository"""