files cache: add chunk size information

the files cache used to store only the chunk ids, so it had to rely on
the chunks index for the size information - which is problematic with
e.g. the AdhocCache (it has size == 0 for all chunks that are not new)
and blocked using the files cache there.
Thomas Waldmann
commit 4488c077a7

2 changed files with 20 additions and 17 deletions:
    src/borg/archive.py  (+10 -8)
    src/borg/cache.py    (+10 -9)
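As an aside on the problem described above (a minimal illustrative sketch, not borg's actual code; all names below are made up): with only ids in the files cache, the sizes for an unchanged file's chunks had to come from the chunks index, and an AdhocCache-style index records size == 0 for chunks it did not just add.

    # sketch: the old files cache entry stored chunk ids only
    old_entry_chunks = [b"chunk-id-1", b"chunk-id-2"]

    # an AdhocCache-style chunks index knows refcounts, but records
    # size == 0 for chunks that were already in the repository:
    adhoc_index = {b"chunk-id-1": (1, 0), b"chunk-id-2": (1, 0)}  # (refcount, size)

    # old scheme: item.chunks for an unchanged file needed sizes from
    # the index -> all zero here, so the files cache could not be used:
    old_chunks = [(id_, adhoc_index[id_][1]) for id_ in old_entry_chunks]
    assert old_chunks == [(b"chunk-id-1", 0), (b"chunk-id-2", 0)]  # wrong sizes

    # new scheme: the files cache entry itself stores (id, size) pairs,
    # independent of what the chunks index knows:
    new_entry_chunks = [(b"chunk-id-1", 4096), (b"chunk-id-2", 1024)]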

src/borg/archive.py  (+10 -8)

@@ -1552,25 +1552,27 @@ class FilesystemObjectProcessors:
                             started_hashing = time.monotonic()
                             path_hash = self.key.id_hash(hashed_path)
                             self.stats.hashing_time += time.monotonic() - started_hashing
-                            known, ids = cache.file_known_and_unchanged(hashed_path, path_hash, st)
+                            known, chunks = cache.file_known_and_unchanged(hashed_path, path_hash, st)
                         else:
                             # in --read-special mode, we may be called for special files.
                             # there should be no information in the cache about special files processed in
                             # read-special mode, but we better play safe as this was wrong in the past:
                             hashed_path = path_hash = None
-                            known, ids = False, None
-                        if ids is not None:
+                            known, chunks = False, None
+                        if chunks is not None:
                             # Make sure all ids are available
-                            for id_ in ids:
-                                if not cache.seen_chunk(id_):
+                            for chunk in chunks:
+                                if not cache.seen_chunk(chunk.id):
                                     # cache said it is unmodified, but we lost a chunk: process file like modified
                                     status = "M"
                                     break
                             else:
                                 item.chunks = []
-                                for chunk_id in ids:
+                                for chunk in chunks:
                                     # process one-by-one, so we will know in item.chunks how far we got
-                                    chunk_entry = cache.chunk_incref(chunk_id, self.stats)
+                                    chunk_entry = cache.chunk_incref(chunk.id, self.stats)
+                                    # chunk.size is from files cache, chunk_entry.size from index:
+                                    assert chunk == chunk_entry
                                     item.chunks.append(chunk_entry)
                                 status = "U"  # regular file, unchanged
                         else:
@@ -1606,7 +1608,7 @@ class FilesystemObjectProcessors:
                                 # block or char device will change without its mtime/size/inode changing.
                                 # also, we must not memorize a potentially inconsistent/corrupt file that
                                 # changed while we backed it up.
-                                cache.memorize_file(hashed_path, path_hash, st, [c.id for c in item.chunks])
+                                cache.memorize_file(hashed_path, path_hash, st, item.chunks)
                         self.stats.files_stats[status] += 1  # must be done late
                         if not changed_while_backup:
                             status = None  # we already called print_file_status

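A note on the new assert in the hunk above: ChunkListEntry is (assumed here to be) a namedtuple of chunk id and size, and namedtuples compare element-wise like plain tuples, so a single equality check cross-checks the size stored in the files cache against the size from the chunks index. A minimal sketch:

    from collections import namedtuple

    # assumed shape of borg's ChunkListEntry (id + size):
    ChunkListEntry = namedtuple("ChunkListEntry", "id size")

    from_files_cache = ChunkListEntry(id=b"\x01" * 32, size=4096)
    from_chunks_index = ChunkListEntry(id=b"\x01" * 32, size=4096)

    # tuple equality checks both fields, so this verifies that the
    # files cache and the chunks index agree on the chunk's size:
    assert from_files_cache == from_chunks_index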
src/borg/cache.py  (+10 -9)

@@ -35,8 +35,8 @@ from .platform import SaveFile
 from .remote import cache_if_remote
 from .repository import LIST_SCAN_LIMIT
 
-# note: cmtime might me either a ctime or a mtime timestamp
-FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunk_ids")
+# note: cmtime might be either a ctime or a mtime timestamp, chunks is a list of ChunkListEntry
+FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunks")
 
 
 class SecurityManager:
@@ -1030,8 +1030,8 @@ class LocalCache(CacheStatsMixin):
         :param hashed_path: the file's path as we gave it to hash(hashed_path)
         :param path_hash: hash(hashed_path), to save some memory in the files cache
         :param st: the file's stat() result
-        :return: known, ids (known is True if we have infos about this file in the cache,
-                             ids is the list of chunk ids IF the file has not changed, otherwise None).
+        :return: known, chunks (known is True if we have infos about this file in the cache,
+                               chunks is a list[ChunkListEntry] IF the file has not changed, otherwise None).
         """
         if not stat.S_ISREG(st.st_mode):
             return False, None
@@ -1072,9 +1072,10 @@ class LocalCache(CacheStatsMixin):
         # again at that time), we need to update the inode number in the cache with what
         # we see in the filesystem.
         self.files[path_hash] = msgpack.packb(entry._replace(inode=st.st_ino, age=0))
-        return True, entry.chunk_ids
+        chunks = [ChunkListEntry(*chunk) for chunk in entry.chunks]  # convert to list of namedtuple
+        return True, chunks
 
-    def memorize_file(self, hashed_path, path_hash, st, ids):
+    def memorize_file(self, hashed_path, path_hash, st, chunks):
         if not stat.S_ISREG(st.st_mode):
             return
         cache_mode = self.cache_mode
@@ -1092,13 +1093,13 @@ class LocalCache(CacheStatsMixin):
             cmtime_type = "ctime"
             cmtime_ns = safe_ns(st.st_ctime_ns)
         entry = FileCacheEntry(
-            age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunk_ids=ids
+            age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunks=chunks
         )
         self.files[path_hash] = msgpack.packb(entry)
         self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns)
         files_cache_logger.debug(
             "FILES-CACHE-UPDATE: put %r [has %s] <- %r",
-            entry._replace(chunk_ids="[%d entries]" % len(entry.chunk_ids)),
+            entry._replace(chunks="[%d entries]" % len(entry.chunks)),
             cmtime_type,
             hashed_path,
         )
@@ -1149,7 +1150,7 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         files_cache_logger.debug("UNKNOWN: files cache not implemented")
         return False, None
 
-    def memorize_file(self, hashed_path, path_hash, st, ids):
+    def memorize_file(self, hashed_path, path_hash, st, chunks):
         pass
 
     def add_chunk(self, id, meta, data, *, stats, wait=True, compress=True, size=None, ro_type=ROBJ_FILE_STREAM):
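
Why file_known_and_unchanged has to rebuild namedtuples ("convert to list of namedtuple" above): msgpack serializes a (named)tuple as a plain array, so the tuple type and its field names are lost on the round trip and must be restored after unpacking. A sketch using plain msgpack-python (borg goes through its own msgpack wrapper, but the behavior sketched here is the same):

    import msgpack
    from collections import namedtuple

    ChunkListEntry = namedtuple("ChunkListEntry", "id size")
    FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunks")

    entry = FileCacheEntry(
        age=0, inode=1234, size=4096, cmtime=b"\x00" * 8,  # dummy cmtime
        chunks=[ChunkListEntry(b"\x01" * 32, 4096)],
    )
    packed = msgpack.packb(entry)     # namedtuples pack as msgpack arrays
    fields = msgpack.unpackb(packed)  # ...and unpack as plain lists
    unpacked = FileCacheEntry(*fields)
    assert unpacked.chunks == [[b"\x01" * 32, 4096]]  # namedtuple type lost
    chunks = [ChunkListEntry(*c) for c in unpacked.chunks]  # restore it
    assert chunks[0].size == 4096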