
Merge pull request #918 from ThomasWaldmann/namedtuple2

better readability and less errors with namedtuples, fixes #823
TW 9 years ago
parent
commit
30f732052e
6 changed files with 55 additions and 48 deletions
  1. borg/archive.py            +15 -13
  2. borg/archiver.py            +7 -6
  3. borg/cache.py              +21 -21
  4. borg/hashindex.pyx          +6 -2
  5. borg/helpers.py             +5 -5
  6. borg/testsuite/archiver.py  +1 -1
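
The change replaces bare tuples with namedtuples throughout the chunk-handling code, so call sites read named fields instead of magic indices. A minimal standalone sketch of the pattern (the values are made up; only the ChunkListEntry definition mirrors the one added to borg/cache.py below):

from collections import namedtuple

ChunkListEntry = namedtuple('ChunkListEntry', 'id size csize')

chunk = ChunkListEntry(id=b'\x00' * 32, size=1024, csize=512)
assert chunk.size == chunk[1]            # still a tuple, so old index access keeps working
assert chunk == (chunk.id, 1024, 512)    # compares element-wise like a plain tuple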

+ 15 - 13
borg/archive.py

@@ -25,7 +25,8 @@ from .helpers import Error, uid2user, user2uid, gid2group, group2gid, \
 from .repository import Repository
 from .platform import acl_get, acl_set
 from .chunker import Chunker
-from .hashindex import ChunkIndex
+from .hashindex import ChunkIndex, ChunkIndexEntry
+from .cache import ChunkListEntry
 import msgpack
 
 ITEMS_BUFFER = 1024 * 1024
@@ -61,10 +62,13 @@ class DownloadPipeline:
             items = [decode_dict(item, (b'path', b'source', b'user', b'group')) for item in unpacker]
             if filter:
                 items = [item for item in items if filter(item)]
+            for item in items:
+                if b'chunks' in item:
+                    item[b'chunks'] = [ChunkListEntry(*e) for e in item[b'chunks']]
             if preload:
                 for item in items:
                     if b'chunks' in item:
-                        self.repository.preload([c[0] for c in item[b'chunks']])
+                        self.repository.preload([c.id for c in item[b'chunks']])
             for item in items:
                 yield item
 
@@ -318,7 +322,7 @@ Number of files: {0.stats.nfiles}'''.format(
         """
         if dry_run or stdout:
             if b'chunks' in item:
-                for data in self.pipeline.fetch_many([c[0] for c in item[b'chunks']], is_preloaded=True):
+                for data in self.pipeline.fetch_many([c.id for c in item[b'chunks']], is_preloaded=True):
                     if stdout:
                         sys.stdout.buffer.write(data)
                 if stdout:
@@ -361,7 +365,7 @@ Number of files: {0.stats.nfiles}'''.format(
                     return
                 # Extract chunks, since the item which had the chunks was not extracted
             with open(path, 'wb') as fd:
-                ids = [c[0] for c in item[b'chunks']]
+                ids = [c.id for c in item[b'chunks']]
                 for data in self.pipeline.fetch_many(ids, is_preloaded=True):
                     if sparse and self.zeros.startswith(data):
                         # all-zero chunk: create a hole in a sparse file
@@ -600,7 +604,7 @@ Number of files: {0.stats.nfiles}'''.format(
                     chunks.append(cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats))
                     if self.show_progress:
                         self.stats.show_progress(item=item, dt=0.2)
-            cache.memorize_file(path_hash, st, [c[0] for c in chunks])
+            cache.memorize_file(path_hash, st, [c.id for c in chunks])
             status = status or 'M'  # regular file, modified (if not 'A' already)
         item[b'chunks'] = chunks
         item.update(self.stat_attrs(st, path))
@@ -732,8 +736,9 @@ class ArchiveChecker:
             if not result:
                 break
             marker = result[-1]
+            init_entry = ChunkIndexEntry(refcount=0, size=0, csize=0)
             for id_ in result:
-                self.chunks[id_] = (0, 0, 0)
+                self.chunks[id_] = init_entry
 
     def identify_key(self, repository):
         cdata = repository.get(next(self.chunks.iteritems())[0])
@@ -775,7 +780,7 @@ class ArchiveChecker:
         del self.chunks[Manifest.MANIFEST_ID]
 
         def mark_as_possibly_superseded(id_):
-            if self.chunks.get(id_, (0,))[0] == 0:
+            if self.chunks.get(id_, ChunkIndexEntry(0, 0, 0)).refcount == 0:
                 self.possibly_superseded.add(id_)
 
         def add_callback(chunk):
@@ -789,7 +794,7 @@ class ArchiveChecker:
                 self.chunks.incref(id_)
             except KeyError:
                 assert cdata is not None
-                self.chunks[id_] = 1, size, csize
+                self.chunks[id_] = ChunkIndexEntry(refcount=1, size=size, csize=csize)
                 if self.repair:
                     self.repository.put(id_, cdata)
 
@@ -909,10 +914,7 @@ class ArchiveChecker:
 
     def orphan_chunks_check(self):
         if self.check_all:
-            unused = set()
-            for id_, (count, size, csize) in self.chunks.iteritems():
-                if count == 0:
-                    unused.add(id_)
+            unused = {id_ for id_, entry in self.chunks.iteritems() if entry.refcount == 0}
             orphaned = unused - self.possibly_superseded
             if orphaned:
                 logger.error('{} orphaned objects found!'.format(len(orphaned)))
@@ -1211,7 +1213,7 @@ class ArchiveRecreater:
         for item in old_target.iter_items():
             if b'chunks' in item:
                 for chunk in item[b'chunks']:
-                    self.cache.chunk_incref(chunk[0], target.stats)
+                    self.cache.chunk_incref(chunk.id, target.stats)
                 target.stats.nfiles += 1
             target.add_item(item)
         if item:
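
In borg/archive.py the conversion happens once, in DownloadPipeline: raw msgpack lists under item[b'chunks'] are wrapped into ChunkListEntry so every later consumer can write c.id instead of c[0]. A hedged sketch of just that step (the item dict is a stand-in, not real archive data; ChunkListEntry is the namedtuple added in borg/cache.py below):

from collections import namedtuple

ChunkListEntry = namedtuple('ChunkListEntry', 'id size csize')

item = {b'path': b'some/file', b'chunks': [(b'\x00' * 32, 1024, 512)]}
if b'chunks' in item:
    item[b'chunks'] = [ChunkListEntry(*e) for e in item[b'chunks']]
ids = [c.id for c in item[b'chunks']]    # e.g. the id list handed to repository.preload()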

+ 7 - 6
borg/archiver.py

@@ -32,6 +32,7 @@ from .cache import Cache
 from .key import key_creator, RepoKey, PassphraseKey
 from .archive import Archive, ArchiveChecker, ArchiveRecreater, CHUNKER_PARAMS
 from .remote import RepositoryServer, RemoteRepository, cache_if_remote
+from .hashindex import ChunkIndexEntry
 
 has_lchflags = hasattr(os, 'lchflags')
 
@@ -446,8 +447,8 @@ class Archiver:
             if item.get(b'deleted'):
                 return None
             else:
-                return sum(c[1] for c in item[b'chunks']
-                           if consider_ids is None or c[0] in consider_ids)
+                return sum(c.size for c in item[b'chunks']
+                           if consider_ids is None or c.id in consider_ids)
 
         def get_owner(item):
             if args.numeric_owner:
@@ -482,8 +483,8 @@ class Archiver:
                 if sum_chunk_size(item1) != sum_chunk_size(item2):
                     return True
                 else:
-                    chunk_ids1 = [c[0] for c in item1[b'chunks']]
-                    chunk_ids2 = [c[0] for c in item2[b'chunks']]
+                    chunk_ids1 = [c.id for c in item1[b'chunks']]
+                    chunk_ids2 = [c.id for c in item2[b'chunks']]
                     return not fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2)
 
         def compare_content(path, item1, item2):
@@ -493,8 +494,8 @@ class Archiver:
                 elif item2.get(b'deleted'):
                     return ('removed {:>11}'.format(format_file_size(sum_chunk_size(item1))))
                 else:
-                    chunk_ids1 = {c[0] for c in item1[b'chunks']}
-                    chunk_ids2 = {c[0] for c in item2[b'chunks']}
+                    chunk_ids1 = {c.id for c in item1[b'chunks']}
+                    chunk_ids2 = {c.id for c in item2[b'chunks']}
                     added_ids = chunk_ids2 - chunk_ids1
                     removed_ids = chunk_ids1 - chunk_ids2
                     added = sum_chunk_size(item2, added_ids)

+ 21 - 21
borg/cache.py

@@ -12,10 +12,13 @@ logger = create_logger()
 from .helpers import Error, get_cache_dir, decode_dict, int_to_bigint, \
     bigint_to_int, format_file_size, yes
 from .locking import UpgradableLock
-from .hashindex import ChunkIndex
+from .hashindex import ChunkIndex, ChunkIndexEntry
 
 import msgpack
 
+ChunkListEntry = namedtuple('ChunkListEntry', 'id size csize')
+FileCacheEntry = namedtuple('FileCacheEntry', 'age inode size mtime chunk_ids')
+
 
 class Cache:
     """Client Side cache
@@ -183,9 +186,9 @@ Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
                     break
                 u.feed(data)
                 for path_hash, item in u:
-                    item[0] += 1
+                    entry = FileCacheEntry(*item)
                     # in the end, this takes about 240 Bytes per file
-                    self.files[path_hash] = msgpack.packb(item)
+                    self.files[path_hash] = msgpack.packb(entry._replace(age=entry.age + 1))
 
     def begin_txn(self):
         # Initialize transaction snapshot
@@ -208,9 +211,9 @@ Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
                 for path_hash, item in self.files.items():
                     # Discard cached files with the newest mtime to avoid
                     # issues with filesystem snapshots and mtime precision
-                    item = msgpack.unpackb(item)
-                    if item[0] < 10 and bigint_to_int(item[3]) < self._newest_mtime:
-                        msgpack.pack((path_hash, item), fd)
+                    entry = FileCacheEntry(*msgpack.unpackb(item))
+                    if entry.age < 10 and bigint_to_int(entry.mtime) < self._newest_mtime:
+                        msgpack.pack((path_hash, entry), fd)
         self.config.set('cache', 'manifest', hexlify(self.manifest.id).decode('ascii'))
         self.config.set('cache', 'timestamp', self.manifest.timestamp)
         self.config.set('cache', 'key_type', str(self.key.TYPE))
@@ -375,12 +378,12 @@ Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
         data = self.key.encrypt(data)
         csize = len(data)
         self.repository.put(id, data, wait=False)
-        self.chunks[id] = (refcount + 1, size, csize)
+        self.chunks[id] = ChunkIndexEntry(refcount + 1, size, csize)
         stats.update(size, csize, True)
-        return id, size, csize
+        return ChunkListEntry(id, size, csize)
 
     def seen_chunk(self, id, size=None):
-        refcount, stored_size, _ = self.chunks.get(id, (0, None, None))
+        refcount, stored_size, _ = self.chunks.get(id, ChunkIndexEntry(0, None, None))
         if size is not None and stored_size is not None and size != stored_size:
             # we already have a chunk with that id, but different size.
             # this is either a hash collision (unlikely) or corruption or a bug.
@@ -393,7 +396,7 @@ Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
             self.begin_txn()
         count, size, csize = self.chunks.incref(id)
         stats.update(size, csize, False)
-        return id, size, csize
+        return ChunkListEntry(id, size, csize)
 
     def chunk_decref(self, id, stats):
         if not self.txn_active:
@@ -414,20 +417,17 @@ Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
         entry = self.files.get(path_hash)
         if not entry:
             return None
-        entry = msgpack.unpackb(entry)
-        if (entry[2] == st.st_size and bigint_to_int(entry[3]) == st.st_mtime_ns and
-                (ignore_inode or entry[1] == st.st_ino)):
-            # reset entry age
-            entry[0] = 0
-            self.files[path_hash] = msgpack.packb(entry)
-            return entry[4]
+        entry = FileCacheEntry(*msgpack.unpackb(entry))
+        if (entry.size == st.st_size and bigint_to_int(entry.mtime) == st.st_mtime_ns and
+                (ignore_inode or entry.inode == st.st_ino)):
+            self.files[path_hash] = msgpack.packb(entry._replace(age=0))
+            return entry.chunk_ids
         else:
             return None
 
     def memorize_file(self, path_hash, st, ids):
         if not (self.do_files and stat.S_ISREG(st.st_mode)):
             return
-        # Entry: Age, inode, size, mtime, chunk ids
-        mtime_ns = st.st_mtime_ns
-        self.files[path_hash] = msgpack.packb((0, st.st_ino, st.st_size, int_to_bigint(mtime_ns), ids))
-        self._newest_mtime = max(self._newest_mtime, mtime_ns)
+        entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, mtime=int_to_bigint(st.st_mtime_ns), chunk_ids=ids)
+        self.files[path_hash] = msgpack.packb(entry)
+        self._newest_mtime = max(self._newest_mtime, st.st_mtime_ns)
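
The files cache follows the same idea: FileCacheEntry is packed with msgpack (a namedtuple serializes as a plain array) and rebuilt with FileCacheEntry(*...) on load, with _replace() used to bump or reset the age field. A small sketch, assuming the msgpack-python package is available and using invented values:

import msgpack
from collections import namedtuple

FileCacheEntry = namedtuple('FileCacheEntry', 'age inode size mtime chunk_ids')

entry = FileCacheEntry(age=0, inode=1234, size=4096, mtime=0, chunk_ids=[])
packed = msgpack.packb(entry)                        # what gets stored under self.files[path_hash]
restored = FileCacheEntry(*msgpack.unpackb(packed))  # namedtuple restored from the plain array
aged = restored._replace(age=restored.age + 1)       # same trick the files-cache loader uses above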

+ 6 - 2
borg/hashindex.pyx

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from collections import namedtuple
 import os
 
 cimport cython
@@ -184,6 +185,9 @@ cdef class NSKeyIterator:
         return (<char *>self.key)[:self.key_size], (segment, _le32toh(value[1]))
 
 
+ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size csize')
+
+
 cdef class ChunkIndex(IndexBase):
     """
     Mapping of 32 byte keys to (refcount, size, csize), which are all 32-bit unsigned.
@@ -210,7 +214,7 @@ cdef class ChunkIndex(IndexBase):
             raise KeyError(key)
         cdef uint32_t refcount = _le32toh(data[0])
         assert refcount <= _MAX_VALUE
-        return refcount, _le32toh(data[1]), _le32toh(data[2])
+        return ChunkIndexEntry(refcount, _le32toh(data[1]), _le32toh(data[2]))
 
     def __setitem__(self, key, value):
         assert len(key) == self.key_size
@@ -342,4 +346,4 @@ cdef class ChunkKeyIterator:
         cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
         cdef uint32_t refcount = _le32toh(value[0])
         assert refcount <= MAX_VALUE, "invalid reference count"
-        return (<char *>self.key)[:self.key_size], (refcount, _le32toh(value[1]), _le32toh(value[2]))
+        return (<char *>self.key)[:self.key_size], ChunkIndexEntry(refcount, _le32toh(value[1]), _le32toh(value[2]))
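
ChunkIndexEntry is defined next to ChunkIndex in the Cython module and is what __getitem__ and the iterators now hand back, so callers can pass a named default to .get() and read .refcount directly. A sketch using an ordinary dict as a stand-in for a ChunkIndex (the real index maps 32-byte keys to these entries):

from collections import namedtuple

ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size csize')

chunks = {b'\x01' * 32: ChunkIndexEntry(refcount=2, size=1024, csize=512)}

def is_unreferenced(chunks, id_):
    # mirrors mark_as_possibly_superseded() in the borg/archive.py hunk above
    return chunks.get(id_, ChunkIndexEntry(0, 0, 0)).refcount == 0

print(is_unreferenced(chunks, b'\x01' * 32))   # False
print(is_unreferenced(chunks, b'\x02' * 32))   # True: an unknown id counts as refcount 0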

+ 5 - 5
borg/helpers.py

@@ -1257,19 +1257,19 @@ class ItemFormatter:
 
     def calculate_unique_chunks(self, item):
         chunk_index = self.archive.cache.chunks
-        return sum(1 for chunk_id, _, _ in item.get(b'chunks', []) if chunk_index[chunk_id][0] == 1)
+        return sum(1 for c in item.get(b'chunks', []) if chunk_index[c.id].refcount == 1)
 
     def calculate_size(self, item):
-        return sum(size for _, size, _ in item.get(b'chunks', []))
+        return sum(c.size for c in item.get(b'chunks', []))
 
     def calculate_csize(self, item):
-        return sum(csize for _, _, csize in item.get(b'chunks', []))
+        return sum(c.csize for c in item.get(b'chunks', []))
 
     def hash_item(self, hash_function, item):
         if b'chunks' not in item:
             return ""
         hash = hashlib.new(hash_function)
-        for chunk in self.archive.pipeline.fetch_many([c[0] for c in item[b'chunks']]):
+        for chunk in self.archive.pipeline.fetch_many([c.id for c in item[b'chunks']]):
             hash.update(chunk)
         return hash.hexdigest()
 
@@ -1320,7 +1320,7 @@ class ChunkIteratorFileWrapper:
 
 def open_item(archive, item):
     """Return file-like object for archived item (with chunks)."""
-    chunk_iterator = archive.pipeline.fetch_many([c[0] for c in item[b'chunks']])
+    chunk_iterator = archive.pipeline.fetch_many([c.id for c in item[b'chunks']])
     return ChunkIteratorFileWrapper(chunk_iterator)
 
 

+ 1 - 1
borg/testsuite/archiver.py

@@ -1425,7 +1425,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
         with repository:
             for item in archive.iter_items():
                 if item[b'path'].endswith('testsuite/archiver.py'):
-                    repository.delete(item[b'chunks'][-1][0])
+                    repository.delete(item[b'chunks'][-1].id)
                     break
             repository.commit()
         self.cmd('check', self.repository_location, exit_code=1)