Merge pull request #2157 from ThomasWaldmann/add-filesize

archived file items: add size metadata
enkore 8 years ago
commit 7c9c4b61d7

+ 23 - 1
src/borg/archive.py

@@ -519,13 +519,20 @@ Utilization of max. archive size: {csize_max:.0%}
         has_damaged_chunks = 'chunks_healthy' in item
         if dry_run or stdout:
             if 'chunks' in item:
+                item_chunks_size = 0
                 for _, data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True):
                     if pi:
                         pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                     if stdout:
                         sys.stdout.buffer.write(data)
+                    item_chunks_size += len(data)
                 if stdout:
                     sys.stdout.buffer.flush()
+                if 'size' in item:
+                    item_size = item.size
+                    if item_size != item_chunks_size:
+                        logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format(
+                            item.path, item_size, item_chunks_size))
             if has_damaged_chunks:
                 logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' %
                                remove_surrogates(item.path))
@@ -582,10 +589,15 @@ Utilization of max. archive size: {csize_max:.0%}
                             else:
                                 fd.write(data)
                 with backup_io('truncate'):
-                    pos = fd.tell()
+                    pos = item_chunks_size = fd.tell()
                     fd.truncate(pos)
                     fd.flush()
                     self.restore_attrs(path, item, fd=fd.fileno())
+            if 'size' in item:
+                item_size = item.size
+                if item_size != item_chunks_size:
+                    logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format(
+                        item.path, item_size, item_chunks_size))
             if has_damaged_chunks:
                 logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' %
                                remove_surrogates(item.path))
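
Note: both hunks above add the same safeguard: after extraction, the byte count actually reassembled from chunks is compared against the size stored in the item metadata, and a mismatch is logged instead of silently ignored. A minimal sketch of the pattern (not part of the commit; fetched_chunks stands in for the data blocks yielded by self.pipeline.fetch_many()):

    # hedged sketch of the check above, assuming an iterable of chunk data blocks
    def warn_on_size_mismatch(item, fetched_chunks, logger):
        item_chunks_size = sum(len(data) for data in fetched_chunks)
        if 'size' in item and item.size != item_chunks_size:
            logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format(
                item.path, item.size, item_chunks_size))
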
@@ -829,6 +841,7 @@ Utilization of max. archive size: {csize_max:.0%}
         length = len(item.chunks)
         # the item should only have the *additional* chunks we processed after the last partial item:
         item.chunks = item.chunks[from_chunk:]
+        item.get_size(memorize=True)
         item.path += '.borg_part_%d' % number
         item.part = number
         number += 1
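
get_size(memorize=True) computes the size from the just-truncated chunk list and stores it on the item, so each part file carries its own correct size metadata; the same call is added to process_stdin and process_file below. A small usage sketch, assuming only Item and ChunkListEntry as defined by this commit:

    from borg.item import Item, ChunkListEntry

    item = Item(path='data.bin', chunks=[ChunkListEntry(id=None, size=1000, csize=10),
                                         ChunkListEntry(id=None, size=2000, csize=20)])
    assert 'size' not in item                    # nothing stored yet
    assert item.get_size(memorize=True) == 3000  # computed from the chunk list
    assert item.size == 3000                     # now persisted in the item metadata
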
@@ -877,6 +890,7 @@ Utilization of max. archive size: {csize_max:.0%}
         )
         fd = sys.stdin.buffer  # binary
         self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd)))
+        item.get_size(memorize=True)
         self.stats.nfiles += 1
         self.add_item(item)
         return 'i'  # stdin
@@ -937,6 +951,7 @@ Utilization of max. archive size: {csize_max:.0%}
                 cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
             status = status or 'M'  # regular file, modified (if not 'A' already)
         item.update(self.stat_attrs(st, path))
+        item.get_size(memorize=True)
         if is_special_file:
             # we processed a special file like a regular file. reflect that in mode,
             # so it can be extracted / accessed in FUSE mount like a regular file:
@@ -1355,6 +1370,13 @@ class ArchiveChecker:
                 logger.info('{}: Completely healed previously damaged file!'.format(item.path))
                 del item.chunks_healthy
             item.chunks = chunk_list
+            if 'size' in item:
+                item_size = item.size
+                item_chunks_size = item.get_size(compressed=False, from_chunks=True)
+                if item_size != item_chunks_size:
+                    # just warn, but keep the inconsistency, so that borg extract can warn about it.
+                    logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format(
+                                   item.path, item_size, item_chunks_size))
 
         def robust_iterator(archive):
             """Iterates through all archive items

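In the checker, from_chunks=True deliberately bypasses a memorized size and recomputes it from the (possibly repaired) chunk list, so stale metadata is detected rather than trusted; the inconsistency is kept so that borg extract can warn about it later. A sketch of the difference (not part of the commit):

    from borg.item import Item, ChunkListEntry

    item = Item(path='f', chunks=[ChunkListEntry(id=None, size=1000, csize=1)])
    item.size = 4096                                 # stale, e.g. chunks were replaced
    assert item.get_size() == 4096                   # trusts the memorized value
    assert item.get_size(from_chunks=True) == 1000   # recomputed from the chunk list
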
+ 7 - 4
src/borg/archiver.py

@@ -557,7 +557,7 @@ class Archiver:
         if progress:
             pi = ProgressIndicatorPercent(msg='%5.1f%% Extracting: %s', step=0.1)
             pi.output('Calculating size')
-            extracted_size = sum(item.file_size(hardlink_masters) for item in archive.iter_items(filter))
+            extracted_size = sum(item.get_size(hardlink_masters) for item in archive.iter_items(filter))
             pi.total = extracted_size
         else:
             pi = None
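
get_size() replaces file_size() and keeps its hardlink handling: a hardlink slave has no chunk list of its own, so its size is resolved through the hardlink_masters mapping (master path to a (chunks, ...) pair) that the extract code builds. A sketch of that behavior, with hypothetical paths:

    from borg.item import Item, ChunkListEntry

    chunks = [ChunkListEntry(id=None, size=4096, csize=100)]
    slave = Item(path='a/link', source='a/file')   # hardlink slave, no own chunks
    masters = {'a/file': (chunks, None)}           # second element unused by get_size()

    assert slave.get_size() == 0                   # unknown without the mapping
    assert slave.get_size(hardlink_masters=masters) == 4096
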
@@ -616,10 +616,13 @@ class Archiver:
 
         def sum_chunk_size(item, consider_ids=None):
             if item.get('deleted'):
-                return None
+                size = None
             else:
-                return sum(c.size for c in item.chunks
-                           if consider_ids is None or c.id in consider_ids)
+                if consider_ids is not None:  # consider only specific chunks
+                    size = sum(chunk.size for chunk in item.chunks if chunk.id in consider_ids)
+                else:  # consider all chunks
+                    size = item.get_size()
+            return size
 
         def get_owner(item):
             if args.numeric_owner:

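The rewrite keeps the diff semantics but routes the all-chunks case through item.get_size(), so a memorized size is used when available; only the consider_ids filter still walks the chunk list. Usage sketch (hypothetical chunk ids, and assuming sum_chunk_size were reachable outside its enclosing function):

    from borg.item import Item, ChunkListEntry

    item = Item(path='f', chunks=[ChunkListEntry(id=b'id-1', size=100, csize=1),
                                  ChunkListEntry(id=b'id-2', size=200, csize=2)])
    assert sum_chunk_size(item) == 300                          # all chunks
    assert sum_chunk_size(item, consider_ids={b'id-2'}) == 200  # changed chunks only
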
+ 1 - 2
src/borg/cache.py

@@ -20,13 +20,12 @@ from .helpers import format_file_size
 from .helpers import yes
 from .helpers import remove_surrogates
 from .helpers import ProgressIndicatorPercent, ProgressIndicatorMessage
-from .item import Item, ArchiveItem
+from .item import Item, ArchiveItem, ChunkListEntry
 from .key import PlaintextKey
 from .locking import Lock
 from .platform import SaveFile
 from .remote import cache_if_remote
 
-ChunkListEntry = namedtuple('ChunkListEntry', 'id size csize')
 FileCacheEntry = namedtuple('FileCacheEntry', 'age inode size mtime chunk_ids')
 
 

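ChunkListEntry moves out of cache.py into item.pyx (see below) so that Item.get_size() can use it without a circular import; cache.py now simply imports it from there. For reference:

    from borg.item import ChunkListEntry

    entry = ChunkListEntry(id=b'\x00' * 32, size=4096, csize=512)
    assert (entry.size, entry.csize) == (4096, 512)
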
+ 1 - 1
src/borg/constants.py

@@ -1,6 +1,6 @@
 # this set must be kept complete, otherwise the RobustUnpacker might malfunction:
 ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master',
-                       'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime',
+                       'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'size',
                        'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended',
                        'part'])
 

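As the comment in constants.py says, ITEM_KEYS must stay complete or the RobustUnpacker can misdetect item boundaries while resynchronizing, which is why the new 'size' key has to be registered here. Quick sanity check:

    from borg.constants import ITEM_KEYS

    assert 'size' in ITEM_KEYS   # the new metadata key is a known item key
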
+ 2 - 11
src/borg/fuse.py

@@ -72,7 +72,6 @@ class FuseOperations(llfuse.Operations):
         self.contents = defaultdict(dict)
         self.default_dir = Item(mode=0o40755, mtime=int(time.time() * 1e9), uid=os.getuid(), gid=os.getgid())
         self.pending_archives = {}
-        self.accounted_chunks = {}
         self.cache = ItemCache()
         data_cache_capacity = int(os.environ.get('BORG_MOUNT_DATA_CACHE_ENTRIES', os.cpu_count() or 1))
         logger.debug('mount data cache capacity: %d chunks', data_cache_capacity)
@@ -257,14 +256,6 @@ class FuseOperations(llfuse.Operations):
 
     def getattr(self, inode, ctx=None):
         item = self.get_item(inode)
-        size = 0
-        dsize = 0
-        if 'chunks' in item:
-            for key, chunksize, _ in item.chunks:
-                size += chunksize
-                if self.accounted_chunks.get(key, inode) == inode:
-                    self.accounted_chunks[key] = inode
-                    dsize += chunksize
         entry = llfuse.EntryAttributes()
         entry.st_ino = inode
         entry.generation = 0
@@ -275,9 +266,9 @@ class FuseOperations(llfuse.Operations):
         entry.st_uid = item.uid
         entry.st_gid = item.gid
         entry.st_rdev = item.get('rdev', 0)
-        entry.st_size = size
+        entry.st_size = item.get_size()
         entry.st_blksize = 512
-        entry.st_blocks = dsize / 512
+        entry.st_blocks = (entry.st_size + entry.st_blksize - 1) // entry.st_blksize
         # note: older archives only have mtime (not atime nor ctime)
         mtime_ns = item.mtime
         if have_fuse_xtime_ns:

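st_size now comes straight from item.get_size(), and st_blocks switches from float division over a per-inode deduplication estimate to integer ceiling division over the logical size. A worked example of the formula (not borg code):

    # ceiling division: a 1000-byte file occupies two 512-byte blocks
    st_size, st_blksize = 1000, 512
    st_blocks = (st_size + st_blksize - 1) // st_blksize
    assert st_blocks == 2
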
+ 5 - 3
src/borg/helpers.py

@@ -105,7 +105,7 @@ def check_extension_modules():
         raise ExtensionModuleError
     if platform.API_VERSION != platform.OS_API_VERSION != '1.1_01':
         raise ExtensionModuleError
-    if item.API_VERSION != '1.1_01':
+    if item.API_VERSION != '1.1_02':
         raise ExtensionModuleError
 
 
@@ -1759,10 +1759,12 @@ class ItemFormatter(BaseFormatter):
         return len(item.get('chunks', []))
 
     def calculate_size(self, item):
-        return sum(c.size for c in item.get('chunks', []))
+        # note: does not support hardlink slaves, they will be size 0
+        return item.get_size(compressed=False)
 
     def calculate_csize(self, item):
-        return sum(c.csize for c in item.get('chunks', []))
+        # note: does not support hardlink slaves, they will be csize 0
+        return item.get_size(compressed=True)
 
     def hash_item(self, hash_function, item):
         if 'chunks' not in item:

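The {size} and {csize} format keys now go through get_size(), which prefers a memorized value and falls back to summing the chunk list; as the added comments note, hardlink slaves render as 0 here because no hardlink_masters mapping is passed. Sketch:

    from borg.item import Item, ChunkListEntry

    item = Item(path='f', chunks=[ChunkListEntry(id=None, size=1000, csize=123)])
    assert item.get_size(compressed=False) == 1000   # what {size} renders
    assert item.get_size(compressed=True) == 123     # what {csize} renders
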
+ 51 - 8
src/borg/item.pyx

@@ -1,8 +1,10 @@
+from collections import namedtuple
+
 from .constants import ITEM_KEYS
 from .helpers import safe_encode, safe_decode
 from .helpers import StableDict
 
-API_VERSION = '1.1_01'
+API_VERSION = '1.1_02'
 
 
 class PropDict:
@@ -113,6 +115,8 @@ class PropDict:
         return property(_get, _set, _del, doc=doc)
 
 
+ChunkListEntry = namedtuple('ChunkListEntry', 'id size csize')
+
 class Item(PropDict):
     """
     Item abstraction that deals with validation and the low-level details internally:
@@ -156,6 +160,10 @@ class Item(PropDict):
     ctime = PropDict._make_property('ctime', int)
     mtime = PropDict._make_property('mtime', int)
 
+    # size is only present for items with a chunk list and then it is sum(chunk_sizes)
+    # compatibility note: this is a new feature, in old archives size will be missing.
+    size = PropDict._make_property('size', int)
+
     hardlink_master = PropDict._make_property('hardlink_master', bool)
 
     chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None')
@@ -168,13 +176,48 @@ class Item(PropDict):
 
     part = PropDict._make_property('part', int)
 
-    def file_size(self, hardlink_masters=None):
-        hardlink_masters = hardlink_masters or {}
-        chunks, _ = hardlink_masters.get(self.get('source'), (None, None))
-        chunks = self.get('chunks', chunks)
-        if chunks is None:
-            return 0
-        return sum(chunk.size for chunk in chunks)
+    def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False):
+        """
+        Determine the (uncompressed or compressed) size of this item.
+
+        For hardlink slaves, the size is computed via the hardlink master's
+        chunk list, if available (otherwise size will be returned as 0).
+
+        If memorize is True, the computed size value will be stored into the item.
+        """
+        attr = 'csize' if compressed else 'size'
+        try:
+            if from_chunks:
+                raise AttributeError
+            size = getattr(self, attr)
+        except AttributeError:
+            # no precomputed (c)size value available, compute it:
+            try:
+                chunks = getattr(self, 'chunks')
+                having_chunks = True
+            except AttributeError:
+                having_chunks = False
+                # this item has no (own) chunks list, but if this is a hardlink slave
+                # and we know the master, we can still compute the size.
+                if hardlink_masters is None:
+                    chunks = None
+                else:
+                    try:
+                        master = getattr(self, 'source')
+                    except AttributeError:
+                        # not a hardlink slave, likely a directory or special file w/o chunks
+                        chunks = None
+                    else:
+                        # hardlink slave, try to fetch hardlink master's chunks list
+                        # todo: put precomputed size into hardlink_masters' values and use it, if present
+                        chunks, _ = hardlink_masters.get(master, (None, None))
+                if chunks is None:
+                    return 0
+            size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks)
+            # if requested, memorize the precomputed (c)size for items that have an own chunks list:
+            if memorize and having_chunks:
+                setattr(self, attr, size)
+        return size
 
 
 class EncryptedKey(PropDict):

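One detail worth noting in get_size(): each chunk entry is normalized through ChunkListEntry(*chunk) before .size or .csize is read, so plain sequences (e.g. chunk lists freshly unpacked from msgpack) work the same as named tuples. Sketch (not part of the commit):

    from borg.item import Item

    # entries as plain lists, as they may arrive from msgpack:
    item = Item(path='f', chunks=[[None, 1000, 10], [None, 2000, 20]])
    assert item.get_size() == 3000
    assert item.get_size(compressed=True) == 30
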
+ 2 - 2
src/borg/testsuite/item.py

@@ -142,9 +142,9 @@ def test_item_file_size():
         ChunkListEntry(csize=1, size=1000, id=None),
         ChunkListEntry(csize=1, size=2000, id=None),
     ])
-    assert item.file_size() == 3000
+    assert item.get_size() == 3000
 
 
 def test_item_file_size_no_chunks():
     item = Item()
-    assert item.file_size() == 0
+    assert item.get_size() == 0
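
The updated tests only exercise items that own a chunk list; a hardlink-slave case (hypothetical, not part of this commit) could look like this:

    def test_item_file_size_hardlink_slave():
        chunks = [ChunkListEntry(csize=1, size=1000, id=None)]
        item = Item(source='path/to/master')   # slave: source set, no chunks
        assert item.get_size() == 0
        assert item.get_size(hardlink_masters={'path/to/master': (chunks, None)}) == 1000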