@@ -391,6 +391,38 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def
     return uid, gid
 
 
+def archive_get_items(metadata, key, repository):
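+    """gets the (potentially large) list of archive metadata stream chunk ids, either via the item_ptrs repo objects (v2+) or directly from metadata.items (legacy v1)"""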
+    if "item_ptrs" in metadata:  # looks like a v2+ archive
+        assert "items" not in metadata
+        items = []
+        for id, data in zip(metadata.item_ptrs, repository.get_many(metadata.item_ptrs)):
+            data = key.decrypt(id, data)
+            ids = msgpack.unpackb(data)
+            items.extend(ids)
+        return items
+
+    if "items" in metadata:  # legacy, v1 archive
+        assert "item_ptrs" not in metadata
+        return metadata.items
+
+
+def archive_put_items(chunk_ids, *, key, cache=None, stats=None, add_reference=None):
+    """gets a (potentially large) list of archive metadata stream chunk ids and writes them to repo objects"""
+    item_ptrs = []
+    for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
+        data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
+        id = key.id_hash(data)
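+        # two write paths: via the chunks cache (archive creation) or via add_reference (e.g. check --repair)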
+        if cache is not None and stats is not None:
+            cache.add_chunk(id, data, stats)
+        elif add_reference is not None:
+            cdata = key.encrypt(id, data)
+            add_reference(id, len(data), cdata)
+        else:
+            raise NotImplementedError
+        item_ptrs.append(id)
+    return item_ptrs
+
+
 class Archive:
     class DoesNotExist(Error):
         """Archive {} does not exist"""
@@ -479,6 +511,8 @@ class Archive:
         metadata = ArchiveItem(internal_dict=msgpack.unpackb(data))
         if metadata.version not in (1, 2):  # legacy: still need to read v1 archives
             raise Exception("Unknown archive metadata version")
+        # note: metadata.items must not get written to disk!
+        metadata.items = archive_get_items(metadata, self.key, self.repository)
         return metadata
 
     def load(self, id):
@@ -512,10 +546,6 @@ class Archive:
     def duration_from_meta(self):
         return format_timedelta(self.ts_end - self.ts)
 
-    def _archive_csize(self):
-        cdata = self.repository.get(self.id)
-        return len(cdata)
-
     def info(self):
         if self.create:
             stats = self.stats
@@ -532,7 +562,6 @@ class Archive:
             "end": OutputTimestamp(end),
             "duration": (end - start).total_seconds(),
             "stats": stats.as_dict(),
-            "limits": {"max_archive_size": self._archive_csize() / MAX_DATA_SIZE},
         }
         if self.create:
             info["command_line"] = sys.argv
@@ -556,12 +585,10 @@ Archive fingerprint: {0.fpr}
 Time (start): {start}
 Time (end):   {end}
 Duration: {0.duration}
-Utilization of max. archive size: {csize_max:.0%}
 """.format(
             self,
             start=OutputTimestamp(self.start.replace(tzinfo=timezone.utc)),
             end=OutputTimestamp(self.end.replace(tzinfo=timezone.utc)),
-            csize_max=self._archive_csize() / MAX_DATA_SIZE,
             location=self.repository._location.canonical_path(),
         )
 
@@ -599,6 +626,7 @@ Utilization of max. archive size: {csize_max:.0%}
         if name in self.manifest.archives:
             raise self.AlreadyExists(name)
         self.items_buffer.flush(flush=True)
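+        # write the (potentially large) list of item metadata stream chunk ids to separate repo objects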
+        item_ptrs = archive_put_items(self.items_buffer.chunks, key=self.key, cache=self.cache, stats=self.stats)
         duration = timedelta(seconds=time.monotonic() - self.start_monotonic)
         if timestamp is None:
             end = datetime.utcnow()
@@ -612,7 +640,7 @@ Utilization of max. archive size: {csize_max:.0%}
             "version": 2,
             "name": name,
             "comment": comment or "",
-            "items": self.items_buffer.chunks,
+            "item_ptrs": item_ptrs,  # see #1473
             "cmdline": sys.argv,
             "hostname": hostname,
             "username": getuser(),
@@ -930,6 +958,8 @@ Utilization of max. archive size: {csize_max:.0%}
     def set_meta(self, key, value):
         metadata = self._load_meta(self.id)
         setattr(metadata, key, value)
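+        # the items list only gets loaded into memory by _load_meta, it must not be written back to the repo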
+        if "items" in metadata:
+            del metadata.items
         data = msgpack.packb(metadata.as_dict())
         new_id = self.key.id_hash(data)
         self.cache.add_chunk(new_id, data, self.stats)
@@ -1004,6 +1034,11 @@ Utilization of max. archive size: {csize_max:.0%}
             if forced == 0:
                 raise
             error = True
+
+        # delete the blocks that store all the references that end up being loaded into metadata.items:
+        for id in self.metadata.item_ptrs:
+            chunk_decref(id, stats)
+
         # in forced delete mode, we try hard to delete at least the manifest entry,
         # if possible also the archive superblock, even if processing the items raises
         # some harmless exception.
@@ -1997,7 +2032,8 @@ class ArchiveChecker:
                 return True, ""
 
             i = 0
-            for state, items in groupby(archive.items, missing_chunk_detector):
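+            # fetch the item ids via archive_get_items, so both v1 and v2+ archives can be iterated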
+            archive_items = archive_get_items(archive, self.key, repository)
+            for state, items in groupby(archive_items, missing_chunk_detector):
                 items = list(items)
                 if state % 2:
                     for chunk_id in items:
@@ -2078,9 +2114,11 @@ class ArchiveChecker:
                         verify_file_chunks(info.name, item)
                     items_buffer.add(item)
                 items_buffer.flush(flush=True)
-                for previous_item_id in archive.items:
+                for previous_item_id in archive_get_items(archive, self.key, self.repository):
                     mark_as_possibly_superseded(previous_item_id)
-                archive.items = items_buffer.chunks
+                for previous_item_ptr in archive.item_ptrs:
+                    mark_as_possibly_superseded(previous_item_ptr)
+                archive.item_ptrs = archive_put_items(items_buffer.chunks, key=self.key, add_reference=add_reference)
                 data = msgpack.packb(archive.as_dict())
                 new_archive_id = self.key.id_hash(data)
                 cdata = self.key.encrypt(new_archive_id, data)