
Merge pull request #6941 from ThomasWaldmann/archive-items-indirect

massively increase archive metadata stream size limit, fixes #1473
TW, 2 years ago
parent
commit
76ef20105f

+ 0 - 9
docs/faq.rst

@@ -115,15 +115,6 @@ Which file types, attributes, etc. are *not* preserved?
 Are there other known limitations?
 ----------------------------------
 
-- A single archive can only reference a limited volume of file/dir metadata,
-  usually corresponding to tens or hundreds of millions of files/dirs.
-  When trying to go beyond that limit, you will get a fatal IntegrityError
-  exception telling that the (archive) object is too big.
-  An easy workaround is to create multiple archives with fewer items each.
-  See also the :ref:`archive_limitation` and :issue:`1452`.
-
-  :ref:`borg_info` shows how large (relative to the maximum size) existing
-  archives are.
 - borg extract only supports restoring into an empty destination. After that,
   the destination will exactly have the contents of the extracted archive.
   If you extract into a non-empty destination, borg will (for example) not

+ 2 - 29
docs/internals/data-structures.rst

@@ -511,7 +511,8 @@ The archive object itself further contains some metadata:
   When :ref:`borg_check` rebuilds the manifest (e.g. if it was corrupted) and finds
   more than one archive object with the same name, it adds a counter to the name
   in the manifest, but leaves the *name* field of the archives as it was.
-* *items*, a list of chunk IDs containing item metadata (size: count * ~34B)
+* *item_ptrs*, a list of "pointer chunk" IDs.
+  Each "pointer chunk" contains a list of chunk IDs of item metadata.
 * *cmdline*, the command line which was used to create the archive
 * *hostname*
 * *username*
@@ -521,34 +522,6 @@ The archive object itself further contains some metadata:
   This is used by :ref:`borg_recreate` to determine whether a given archive needs rechunking.
 * Some other pieces of information related to recreate.
 
-.. _archive_limitation:
-
-.. rubric:: Note about archive limitations
-
-The archive is currently stored as a single object in the repository
-and thus limited in size to MAX_OBJECT_SIZE (20MiB).
-
-As one chunk list entry is ~40B, that means we can reference ~500,000 item
-metadata stream chunks per archive.
-
-Each item metadata stream chunk is ~128kiB (see hardcoded ITEMS_CHUNKER_PARAMS).
-
-So that means the whole item metadata stream is limited to ~64GiB.
-If compression is used, the amount of storable metadata is bigger - by the
-compression factor.
-
-If the average size of an item entry is 100B (small files, no ACLs/xattrs),
-that means a limit of ~640 million files/directories per archive.
-
-If the average size of an item entry is 2kB (e.g. for ~100MB files or files with
-more ACLs/xattrs), the limit will be ~32 million files/directories per archive.
-
-If one tries to create an archive object bigger than MAX_OBJECT_SIZE, a fatal
-IntegrityError will be raised.
-
-A workaround is to create multiple archives with fewer items each, see
-also :issue:`1452`.
-
 .. _item:
 
 Items
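For illustration, a minimal sketch of the two layouts described above (the field names follow this documentation; the ids and the plain Python dicts are made up for the example, and real archive objects are msgpacked and encrypted before being stored):

# v1: the archive object lists the item metadata stream chunk ids directly,
# so the archive object itself grows with the number of archived items.
archive_v1 = {
    "version": 1,
    "name": "host-2017-02-27",
    "items": [b"item_meta_chunk_id_1", b"item_meta_chunk_id_2"],  # can get very long
}

# v2: the archive object only lists a few "pointer chunk" ids; each pointer chunk is a
# separate repo object whose msgpacked payload is a list of item metadata chunk ids.
archive_v2 = {
    "version": 2,
    "name": "host-2017-02-27",
    "item_ptrs": [b"pointer_chunk_id_1"],
}
pointer_chunk_1 = [b"item_meta_chunk_id_1", b"item_meta_chunk_id_2"]  # stored under pointer_chunk_id_1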

+ 0 - 8
docs/internals/frontends.rst

@@ -331,11 +331,6 @@ stats
         Deduplicated size (against the current repository, not when the archive was created)
     nfiles
         Number of regular files in the archive
-limits
-    Object describing the utilization of Borg limits
-
-    max_archive_size
-        Float between 0 and 1 describing how large this archive is relative to the maximum size allowed by Borg
 command_line
     Array of strings of the command line that created the archive
 
@@ -405,9 +400,6 @@ The same archive with more information (``borg info --last 1 --json``)::
                 "end": "2017-02-27T12:27:20.789123",
                 "hostname": "host",
                 "id": "80cd07219ad725b3c5f665c1dcf119435c4dee1647a560ecac30f8d40221a46a",
-                "limits": {
-                    "max_archive_size": 0.0001330855110409714
-                },
                 "name": "host-system-backup-2017-02-27",
                 "start": "2017-02-27T12:27:20.789123",
                 "stats": {

+ 49 - 11
src/borg/archive.py

@@ -391,6 +391,38 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def
     return uid, gid
 
 
+def archive_get_items(metadata, key, repository):
+    if "item_ptrs" in metadata:  # looks like a v2+ archive
+        assert "items" not in metadata
+        items = []
+        for id, data in zip(metadata.item_ptrs, repository.get_many(metadata.item_ptrs)):
+            data = key.decrypt(id, data)
+            ids = msgpack.unpackb(data)
+            items.extend(ids)
+        return items
+
+    if "items" in metadata:  # legacy, v1 archive
+        assert "item_ptrs" not in metadata
+        return metadata.items
+
+
+def archive_put_items(chunk_ids, *, key, cache=None, stats=None, add_reference=None):
+    """gets a (potentially large) list of archive metadata stream chunk ids and writes them to repo objects"""
+    item_ptrs = []
+    for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
+        data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
+        id = key.id_hash(data)
+        if cache is not None and stats is not None:
+            cache.add_chunk(id, data, stats)
+        elif add_reference is not None:
+            cdata = key.encrypt(id, data)
+            add_reference(id, len(data), cdata)
+        else:
+            raise NotImplementedError
+        item_ptrs.append(id)
+    return item_ptrs
+
+
 class Archive:
     class DoesNotExist(Error):
         """Archive {} does not exist"""
@@ -479,6 +511,8 @@ class Archive:
         metadata = ArchiveItem(internal_dict=msgpack.unpackb(data))
         if metadata.version not in (1, 2):  # legacy: still need to read v1 archives
             raise Exception("Unknown archive metadata version")
+        # note: metadata.items must not get written to disk!
+        metadata.items = archive_get_items(metadata, self.key, self.repository)
         return metadata
 
     def load(self, id):
@@ -512,10 +546,6 @@ class Archive:
     def duration_from_meta(self):
         return format_timedelta(self.ts_end - self.ts)
 
-    def _archive_csize(self):
-        cdata = self.repository.get(self.id)
-        return len(cdata)
-
     def info(self):
         if self.create:
             stats = self.stats
@@ -532,7 +562,6 @@ class Archive:
             "end": OutputTimestamp(end),
             "duration": (end - start).total_seconds(),
             "stats": stats.as_dict(),
-            "limits": {"max_archive_size": self._archive_csize() / MAX_DATA_SIZE},
         }
         if self.create:
             info["command_line"] = sys.argv
@@ -556,12 +585,10 @@ Archive fingerprint: {0.fpr}
 Time (start): {start}
 Time (end):   {end}
 Duration: {0.duration}
-Utilization of max. archive size: {csize_max:.0%}
 """.format(
             self,
             start=OutputTimestamp(self.start.replace(tzinfo=timezone.utc)),
             end=OutputTimestamp(self.end.replace(tzinfo=timezone.utc)),
-            csize_max=self._archive_csize() / MAX_DATA_SIZE,
             location=self.repository._location.canonical_path(),
         )
 
@@ -599,6 +626,7 @@ Utilization of max. archive size: {csize_max:.0%}
         if name in self.manifest.archives:
             raise self.AlreadyExists(name)
         self.items_buffer.flush(flush=True)
+        item_ptrs = archive_put_items(self.items_buffer.chunks, key=self.key, cache=self.cache, stats=self.stats)
         duration = timedelta(seconds=time.monotonic() - self.start_monotonic)
         if timestamp is None:
             end = datetime.utcnow()
@@ -612,7 +640,7 @@ Utilization of max. archive size: {csize_max:.0%}
             "version": 2,
             "name": name,
             "comment": comment or "",
-            "items": self.items_buffer.chunks,
+            "item_ptrs": item_ptrs,  # see #1473
             "cmdline": sys.argv,
             "hostname": hostname,
             "username": getuser(),
@@ -930,6 +958,8 @@ Utilization of max. archive size: {csize_max:.0%}
     def set_meta(self, key, value):
         metadata = self._load_meta(self.id)
         setattr(metadata, key, value)
+        if "items" in metadata:
+            del metadata.items
         data = msgpack.packb(metadata.as_dict())
         new_id = self.key.id_hash(data)
         self.cache.add_chunk(new_id, data, self.stats)
@@ -1004,6 +1034,11 @@ Utilization of max. archive size: {csize_max:.0%}
             if forced == 0:
                 raise
             error = True
+
+        # delete the pointer chunks that stored the item metadata chunk id lists (loaded into metadata.items):
+        for id in self.metadata.item_ptrs:
+            chunk_decref(id, stats)
+
         # in forced delete mode, we try hard to delete at least the manifest entry,
         # if possible also the archive superblock, even if processing the items raises
         # some harmless exception.
@@ -1997,7 +2032,8 @@ class ArchiveChecker:
                 return True, ""
 
             i = 0
-            for state, items in groupby(archive.items, missing_chunk_detector):
+            archive_items = archive_get_items(archive, self.key, repository)
+            for state, items in groupby(archive_items, missing_chunk_detector):
                 items = list(items)
                 if state % 2:
                     for chunk_id in items:
@@ -2078,9 +2114,11 @@ class ArchiveChecker:
                         verify_file_chunks(info.name, item)
                     items_buffer.add(item)
                 items_buffer.flush(flush=True)
-                for previous_item_id in archive.items:
+                for previous_item_id in archive_get_items(archive, self.key, self.repository):
                     mark_as_possibly_superseded(previous_item_id)
-                archive.items = items_buffer.chunks
+                for previous_item_ptr in archive.item_ptrs:
+                    mark_as_possibly_superseded(previous_item_ptr)
+                archive.item_ptrs = archive_put_items(items_buffer.chunks, key=self.key, add_reference=add_reference)
                 data = msgpack.packb(archive.as_dict())
                 new_archive_id = self.key.id_hash(data)
                 cdata = self.key.encrypt(new_archive_id, data)
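The two helpers added above are inverses: archive_put_items slices a (potentially huge) list of item metadata chunk ids into pointer chunks of IDS_PER_CHUNK ids each and stores them as repo objects, while archive_get_items fetches and concatenates those lists again. A minimal round-trip sketch, assuming a borg checkout containing this change is importable (it needs borg's compiled modules) and that IDS_PER_CHUNK == 3 as set in constants.py below; FakeKey, FakeRepo and Meta are toy stand-ins invented for this example, not borg's real key/repository/ArchiveItem classes:

from hashlib import sha256

from borg.archive import archive_get_items, archive_put_items  # helpers added by this PR

class FakeKey:                      # stand-in: identity "crypto", sha256 id hashing
    def id_hash(self, data):
        return sha256(data).digest()
    def encrypt(self, id, data):
        return data
    def decrypt(self, id, data):
        return data

class FakeRepo(dict):               # stand-in: maps chunk id -> stored data
    def get_many(self, ids):
        return [self[id] for id in ids]

class Meta:                         # stand-in for a v2 ArchiveItem
    def __init__(self, item_ptrs):
        self.item_ptrs = item_ptrs
    def __contains__(self, key):    # archive_get_items probes `"item_ptrs" in metadata`
        return key == "item_ptrs"

key, repo = FakeKey(), FakeRepo()
chunk_ids = [bytes([i]) * 32 for i in range(10)]   # pretend item metadata stream chunk ids
item_ptrs = archive_put_items(
    chunk_ids, key=key, add_reference=lambda id, size, cdata: repo.__setitem__(id, cdata)
)
assert len(item_ptrs) == 4          # 10 ids at IDS_PER_CHUNK == 3 -> 4 pointer chunks
assert archive_get_items(Meta(item_ptrs), key, repo) == chunk_ids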

+ 5 - 1
src/borg/archiver/debug.py

@@ -72,7 +72,11 @@ class DebugMixIn:
 
             unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
             first = True
-            for item_id in archive_org_dict["items"]:
+            items = []
+            for chunk_id in archive_org_dict["item_ptrs"]:
+                data = key.decrypt(chunk_id, repository.get(chunk_id))
+                items.extend(msgpack.unpackb(data))
+            for item_id in items:
                 data = key.decrypt(item_id, repository.get(item_id))
                 unpacker.feed(data)
                 for item in unpacker:

+ 0 - 6
src/borg/archiver/info.py

@@ -55,7 +55,6 @@ class InfoMixIn:
                 Time (end): {end}
                 Duration: {duration}
                 Command line: {command_line}
-                Utilization of maximum supported archive size: {limits[max_archive_size]:.0%}
                 Number of files: {stats[nfiles]}
                 Original size: {stats[original_size]}
                 Deduplicated size: {stats[deduplicated_size]}
@@ -88,11 +87,6 @@ class InfoMixIn:
         = unique chunks of this archive.
         All archives / deduplicated size = amount of data stored in the repo
         = all chunks in the repository.
-
-        Borg archives can only contain a limited amount of file metadata.
-        The size of an archive relative to this limit depends on a number of factors,
-        mainly the number of files, the lengths of paths and other metadata stored for files.
-        This is shown as *utilization of maximum supported archive size*.
         """
         )
         subparser = subparsers.add_parser(
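With the "limits" object gone from the JSON documented in frontends.rst above and the utilization line gone from borg info's text output, frontend code that still wants the old field should read it defensively. A minimal sketch (the helper name and the handling of both JSON shapes are this example's own, not part of borg):

import json

def archive_size_utilization(info_json: str) -> dict:
    """Map archive name -> limits.max_archive_size, or None on borg versions without the field."""
    info = json.loads(info_json)
    return {
        archive["name"]: (archive.get("limits") or {}).get("max_archive_size")
        for archive in info.get("archives", [])
    }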

+ 9 - 1
src/borg/cache.py

@@ -775,8 +775,16 @@ class LocalCache(CacheStatsMixin):
             archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
             if archive.version not in (1, 2):  # legacy
                 raise Exception("Unknown archive metadata version")
+            if archive.version == 1:
+                items = archive.items
+            elif archive.version == 2:
+                items = []
+                for chunk_id, (csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)):
+                    chunk_idx.add(chunk_id, 1, len(data))
+                    ids = msgpack.unpackb(data)
+                    items.extend(ids)
             sync = CacheSynchronizer(chunk_idx)
-            for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
+            for item_id, (csize, data) in zip(items, decrypted_repository.get_many(items)):
                 chunk_idx.add(item_id, 1, len(data))
                 processed_item_metadata_bytes += len(data)
                 processed_item_metadata_chunks += 1

+ 7 - 2
src/borg/constants.py

@@ -11,7 +11,9 @@ REQUIRED_ITEM_KEYS = frozenset(["path", "mtime"])
 
 # this set must be kept complete, otherwise rebuild_manifest might malfunction:
 # fmt: off
-ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'username', 'time', 'time_end',
+ARCHIVE_KEYS = frozenset(['version', 'name', 'cmdline', 'hostname', 'username', 'time', 'time_end',
+                          'items',  # legacy v1 archives
+                          'item_ptrs',  # v2+ archives
                           'comment', 'chunker_params',
                           'recreate_cmdline',
                           'recreate_source_id', 'recreate_args', 'recreate_partial_chunks',  # used in 1.1.0b1 .. b2
@@ -19,7 +21,7 @@ ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'us
 # fmt: on
 
 # this is the set of keys that are always present in archives:
-REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "items", "cmdline", "time"])
+REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "item_ptrs", "cmdline", "time"])
 
 # default umask, overridden by --umask, defaults to read/write only for owner
 UMASK_DEFAULT = 0o077
@@ -47,6 +49,9 @@ MAX_DATA_SIZE = 20971479
 # borg < 1.3, but this is not expected to cause any issues.
 MAX_OBJECT_SIZE = MAX_DATA_SIZE + 41 + 8  # see assertion at end of repository module
 
+# how many metadata stream chunk ids do we store into a "pointer chunk" of the ArchiveItem.item_ptrs list?
+IDS_PER_CHUNK = 3  # MAX_DATA_SIZE // 40
+
 # repo config max_segment_size value must be below this limit to stay within uint32 offsets:
 MAX_SEGMENT_SIZE_LIMIT = 2**32 - MAX_OBJECT_SIZE
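Back-of-the-envelope arithmetic for why this indirection "massively" raises the limit, using the figures from the docs removed above (~40 B per msgpacked chunk id entry, ~128 kiB item metadata stream chunks) and assuming pointer chunks get filled up to MAX_DATA_SIZE, i.e. IDS_PER_CHUNK around MAX_DATA_SIZE // 40 as the comment hints; these are rough estimates, not exact borg limits:

MAX_DATA_SIZE = 20971479        # from constants.py above
BYTES_PER_ID = 40               # approx. msgpacked size of one chunk id list entry
ITEM_CHUNK_SIZE = 128 * 1024    # approx. item metadata stream chunk size (ITEMS_CHUNKER_PARAMS)

ids_per_object = MAX_DATA_SIZE // BYTES_PER_ID                  # ~524k ids per repo object

# before: the archive object referenced the item metadata chunks directly
old_limit = ids_per_object * ITEM_CHUNK_SIZE                    # ~64 GiB of item metadata
# after: archive object -> pointer chunks -> item metadata chunks
new_limit = ids_per_object * ids_per_object * ITEM_CHUNK_SIZE   # ~32 PiB of item metadata

print(f"old: ~{old_limit / 2**30:.0f} GiB, new: ~{new_limit / 2**50:.0f} PiB")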
 

+ 4 - 0
src/borg/item.pyi

@@ -98,6 +98,10 @@ class ArchiveItem(PropDict):
     def items(self) -> List: ...
     @items.setter
     def items(self, val: List) -> None: ...
+    @property
+    def item_ptrs(self) -> List: ...
+    @item_ptrs.setter
+    def item_ptrs(self, val: List) -> None: ...
 
 class ChunkListEntry(NamedTuple):
     id: bytes

+ 5 - 2
src/borg/item.pyx

@@ -483,7 +483,8 @@ class ArchiveItem(PropDict):
 
     version = PropDict._make_property('version', int)
     name = PropDict._make_property('name', str, 'surrogate-escaped str')
-    items = PropDict._make_property('items', list)
+    items = PropDict._make_property('items', list)  # list of chunk ids of item metadata stream (only in memory)
+    item_ptrs = PropDict._make_property('item_ptrs', list)  # list of ids of pointer chunks; each holds a list of item metadata stream chunk ids (archive v2+)
     cmdline = PropDict._make_property('cmdline', list)  # list of s-e-str
     hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str')
     username = PropDict._make_property('username', str, 'surrogate-escaped str')
@@ -515,7 +516,9 @@ class ArchiveItem(PropDict):
                 v = fix_tuple_of_str_and_int(v)
             if k in ('cmdline', 'recreate_cmdline'):
                 v = fix_list_of_str(v)
-            if k == 'items':
+            if k == 'items':  # legacy
+                v = fix_list_of_bytes(v)
+            if k == 'item_ptrs':
                 v = fix_list_of_bytes(v)
             self._dict[k] = v
 

+ 1 - 1
src/borg/testsuite/archiver.py

@@ -3981,7 +3981,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
             archive = msgpack.packb(
                 {
                     "cmdline": [],
-                    "items": [],
+                    "item_ptrs": [],
                     "hostname": "foo",
                     "username": "bar",
                     "name": "archive1",