read files cache early, init checkpoint timer after that, see #3394

reading the files cache can take a considerable amount of time (a user
reported 1h 42min for a 700MB files cache for a repo with 8M files and
15TB total), so we must init the checkpoint timer after that, or borg
will create the checkpoint too early.

creating a checkpoint means (among other things) saving the files cache,
which in such a case will also take a lot of time, one time too many.

doing this in a clean way required some refactoring:
- cache_mode is now given to the Cache initializer and stored in the instance
- the files cache is loaded early in _do_open (if needed)
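
A minimal runnable toy of the ordering described above (not borg code;
CHECKPOINT_INTERVAL, open_cache and checkpoint_due are hypothetical
stand-ins for illustration):

    import time

    CHECKPOINT_INTERVAL = 1800.0  # seconds; hypothetical value for illustration

    def open_cache():
        """stand-in for LocalCache._do_open(), which now runs _read_files()"""
        time.sleep(0.1)  # imagine 1h 42min here for a 700MB files cache

    def checkpoint_due(last_checkpoint):
        return time.monotonic() - last_checkpoint > CHECKPOINT_INTERVAL

    open_cache()                        # the slow load happens first ...
    last_checkpoint = time.monotonic()  # ... the timer is initialized afterwards
    print(checkpoint_due(last_checkpoint))  # False: first interval is a full one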
Thomas Waldmann 7 years ago
parent
commit
91e5e231f1
3 changed files with 24 additions and 20 deletions
  1. src/borg/archive.py (+3 -3)
  2. src/borg/archiver.py (+5 -4)
  3. src/borg/cache.py (+16 -13)

src/borg/archive.py (+3 -3)

@@ -1131,13 +1131,13 @@ class FilesystemObjectProcessors:
         self.add_item(item, stats=self.stats)
         return 'i'  # stdin
 
-    def process_file(self, path, st, cache, ignore_inode=False, files_cache_mode=DEFAULT_FILES_CACHE_MODE):
+    def process_file(self, path, st, cache, ignore_inode=False):
         with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master):  # no status yet
             is_special_file = is_special(st.st_mode)
             if not hardlinked or hardlink_master:
                 if not is_special_file:
                     path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path)))
-                    known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode, files_cache_mode)
+                    known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode)
                 else:
                     # in --read-special mode, we may be called for special files.
                     # there should be no information in the cache about special files processed in
@@ -1172,7 +1172,7 @@ class FilesystemObjectProcessors:
                     if not is_special_file:
                         # we must not memorize special files, because the contents of e.g. a
                         # block or char device will change without its mtime/size/inode changing.
-                        cache.memorize_file(path_hash, st, [c.id for c in item.chunks], files_cache_mode)
+                        cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
                 self.stats.nfiles += 1
             item.update(self.metadata_collector.stat_attrs(st, path))
             item.get_size(memorize=True)
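
The comment about special files in the hunk above is easy to verify outside
borg; a Linux-only illustration (not borg code):

    import os, stat

    st1 = os.stat('/dev/urandom')
    with open('/dev/urandom', 'rb') as f:
        a = f.read(4)
    with open('/dev/urandom', 'rb') as f:
        b = f.read(4)
    st2 = os.stat('/dev/urandom')

    print(stat.S_ISCHR(st1.st_mode))   # True: a character device
    print(a != b)                      # True: the content changed ...
    print(st1.st_size == st2.st_size)  # True: ... while the size (0) did not

Stat-based change detection therefore cannot work for such files, which is
why memorize_file() is skipped for them.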

src/borg/archiver.py (+5 -4)

@@ -144,7 +144,8 @@ def with_repository(fake=False, invert_fake=False, create=False, lock=True,
                 if cache:
                     with Cache(repository, kwargs['key'], kwargs['manifest'],
                                do_files=getattr(args, 'cache_files', False),
-                               progress=getattr(args, 'progress', False), lock_wait=self.lock_wait) as cache_:
+                               progress=getattr(args, 'progress', False), lock_wait=self.lock_wait,
+                               cache_mode=getattr(args, 'files_cache_mode', DEFAULT_FILES_CACHE_MODE)) as cache_:
                         return method(self, args, repository=repository, cache=cache_, **kwargs)
                 else:
                     return method(self, args, repository=repository, **kwargs)
@@ -504,13 +505,13 @@ class Archiver:
         self.ignore_inode = args.ignore_inode
         self.nobsdflags = args.nobsdflags
         self.exclude_nodump = args.exclude_nodump
-        self.files_cache_mode = args.files_cache_mode
         dry_run = args.dry_run
         t0 = datetime.utcnow()
         t0_monotonic = time.monotonic()
         if not dry_run:
             with Cache(repository, key, manifest, do_files=args.cache_files, progress=args.progress,
-                       lock_wait=self.lock_wait, permit_adhoc_cache=args.no_cache_sync) as cache:
+                       lock_wait=self.lock_wait, permit_adhoc_cache=args.no_cache_sync,
+                       cache_mode=args.files_cache_mode) as cache:
                 archive = Archive(repository, key, manifest, args.location.archive, cache=cache,
                                   create=True, checkpoint_interval=args.checkpoint_interval,
                                   numeric_owner=args.numeric_owner, noatime=args.noatime, noctime=args.noctime,
@@ -576,7 +577,7 @@ class Archiver:
                         return
             if stat.S_ISREG(st.st_mode):
                 if not dry_run:
-                    status = fso.process_file(path, st, cache, self.ignore_inode, self.files_cache_mode)
+                    status = fso.process_file(path, st, cache, self.ignore_inode)
             elif stat.S_ISDIR(st.st_mode):
                 if recurse:
                     tag_paths = dir_is_tagged(path, exclude_caches, exclude_if_present)
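
The getattr fallback in the with_repository hunk above matters because not
every subcommand defines --files-cache; a small self-contained illustration
(the default value used here is assumed, the real constant lives in borg's
source):

    from argparse import Namespace

    DEFAULT_FILES_CACHE_MODE = 'cis'  # assumed letter form, illustration only

    args = Namespace(progress=False)  # e.g. a subcommand without --files-cache
    mode = getattr(args, 'files_cache_mode', DEFAULT_FILES_CACHE_MODE)
    print(mode)  # falls back to the default instead of raising AttributeError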

src/borg/cache.py (+16 -13)

@@ -359,11 +359,11 @@ class Cache:
             shutil.rmtree(path)
 
     def __new__(cls, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True,
-                progress=False, lock_wait=None, permit_adhoc_cache=False):
+                progress=False, lock_wait=None, permit_adhoc_cache=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
         def local():
             return LocalCache(repository=repository, key=key, manifest=manifest, path=path, sync=sync,
                               do_files=do_files, warn_if_unencrypted=warn_if_unencrypted, progress=progress,
-                              lock_wait=lock_wait)
+                              lock_wait=lock_wait, cache_mode=cache_mode)
 
         def adhoc():
             return AdHocCache(repository=repository, key=key, manifest=manifest)
@@ -422,18 +422,20 @@ class LocalCache(CacheStatsMixin):
     """
 
     def __init__(self, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True,
-                 progress=False, lock_wait=None):
+                 progress=False, lock_wait=None, cache_mode=DEFAULT_FILES_CACHE_MODE):
         """
         :param do_files: use file metadata cache
         :param warn_if_unencrypted: print warning if accessing unknown unencrypted repository
         :param lock_wait: timeout for lock acquisition (None: return immediately if lock unavailable)
         :param sync: do :meth:`.sync`
+        :param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison
         """
         self.repository = repository
         self.key = key
         self.manifest = manifest
         self.progress = progress
         self.do_files = do_files
+        self.cache_mode = cache_mode
         self.timestamp = None
         self.txn_active = False
 
@@ -485,7 +487,10 @@ class LocalCache(CacheStatsMixin):
         with IntegrityCheckedFile(path=os.path.join(self.path, 'chunks'), write=False,
                                   integrity_data=self.cache_config.integrity.get('chunks')) as fd:
             self.chunks = ChunkIndex.read(fd)
-        self.files = None
+        if 'd' in self.cache_mode or not self.do_files:  # d(isabled)
+            self.files = None
+        else:
+            self._read_files()
 
     def open(self):
         if not os.path.isdir(self.path):
@@ -917,7 +922,7 @@ class LocalCache(CacheStatsMixin):
         else:
             stats.update(-size, -csize, False)
 
-    def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
+    def file_known_and_unchanged(self, path_hash, st, ignore_inode=False):
         """
         Check if we know the file that has this path_hash (know == it is in our files cache) and
         whether it is unchanged (the size/inode number/cmtime is same for stuff we check in this cache_mode).
@@ -925,18 +930,15 @@ class LocalCache(CacheStatsMixin):
         :param path_hash: hash(file_path), to save some memory in the files cache
         :param st: the file's stat() result
         :param ignore_inode: whether the inode number shall be ignored
-        :param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison
         :return: known, ids (known is True if we have infos about this file in the cache,
                              ids is the list of chunk ids IF the file has not changed, otherwise None).
         """
+        cache_mode = self.cache_mode
         if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode):  # d(isabled)
             return False, None
-        if self.files is None:
-            self._read_files()
         # note: r(echunk) does not need the files cache in this method, but the files cache will
         # be updated and saved to disk to memorize the files. To preserve previous generations in
-        # the cache, this means that it also needs to get loaded from disk first, so keep
-        # _read_files() above here.
+        # the cache, this means that it also needs to get loaded from disk first.
         if 'r' in cache_mode:  # r(echunk)
             return False, None
         entry = self.files.get(path_hash)
@@ -963,7 +965,8 @@ class LocalCache(CacheStatsMixin):
         self.files[path_hash] = msgpack.packb(entry._replace(inode=st.st_ino, age=0))
         return True, entry.chunk_ids
 
-    def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE):
+    def memorize_file(self, path_hash, st, ids):
+        cache_mode = self.cache_mode
         # note: r(echunk) modes will update the files cache, d(isabled) mode won't
         if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode):
             return
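
The 'd'/'r' checks above treat cache_mode as a string of flag letters; a
runnable paraphrase of the gate logic (the helper name and the 'cis' mode
string are hypothetical, letter meanings as commented in the diff):

    import stat

    def files_cache_decision(cache_mode, do_files, st_mode):
        """what the cache will do for this entry (illustration only)"""
        if 'd' in cache_mode or not do_files or not stat.S_ISREG(st_mode):
            return 'skip'     # d(isabled), caching off, or not a regular file
        if 'r' in cache_mode:
            return 'rechunk'  # r(echunk): skip lookups, but still memorize
        return 'lookup'

    for mode in ('cis', 'd', 'rcis'):
        print(mode, files_cache_decision(mode, True, stat.S_IFREG))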
@@ -1014,10 +1017,10 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
     files = None
     do_files = False
 
-    def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
+    def file_known_and_unchanged(self, path_hash, st, ignore_inode=False):
         return False, None
 
-    def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE):
+    def memorize_file(self, path_hash, st, ids):
         pass
 
     def add_chunk(self, id, chunk, stats, overwrite=False, wait=True):
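
AdHocCache keeps the same narrowed signatures as LocalCache, so callers like
process_file() need no branching between the two cache flavors; a toy stub
showing the contract (not borg's actual class):

    class AdHocFilesCacheStub:
        """toy stand-in: an ad-hoc cache never knows files, memorizes nothing"""

        def file_known_and_unchanged(self, path_hash, st, ignore_inode=False):
            return False, None  # every file looks new -> always re-chunk

        def memorize_file(self, path_hash, st, ids):
            pass                # nothing is persisted between runs

    cache = AdHocFilesCacheStub()
    print(cache.file_known_and_unchanged(b'\x00' * 32, None))  # (False, None)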