
Merge pull request #6703 from ThomasWaldmann/r2r-transfer

borg2: "borg transfer" cmd (and also getting rid of legacy)
TW 3 years ago
Commit
86fe8bdd57
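
The core idea of this change: borg 1.x stored hardlinks as a "master" item (the first one archived, carrying the chunks) plus "slave" items pointing back to it via item.source, which forced extra bookkeeping (hardlink_masters) through create, extract, diff and recreate. With this commit every hardlinked item instead carries a hardlink id (hlid) derived from its inode, and a HardLinkManager maps that id to whatever needs to be shared. The sketch below only illustrates that idea; the class and method names (remember, retrieve, hardlink_id_from_inode) are taken from the diff, but the implementation shown is a minimal stand-in, not borg's actual code.

import hashlib

class HardLinkManagerSketch:
    """Minimal stand-in for the HardLinkManager used in this diff (illustration only)."""

    def __init__(self):
        self._map = {}  # id (e.g. hlid or (ino, dev)) -> shared info (chunks, extracted path, ...)

    def hardlink_id_from_inode(self, *, ino, dev):
        # derive a stable archive-level id from the inode; hashing avoids storing raw inode numbers
        return hashlib.sha256(f'{ino}/{dev}'.encode()).digest()

    def remember(self, *, id, info):
        self._map[id] = info

    def retrieve(self, *, id, default=None):
        return self._map.get(id, default)

# create:  the first item seen for an inode stores its chunks under (st_ino, st_dev); later
#          items for the same inode reuse those chunks and only carry the same 'hlid'.
# extract: the first extracted item remembers its path under item.hlid; later items with the
#          same hlid are simply os.link()ed to that path.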

+ 0 - 3
docs/faq.rst

@@ -132,9 +132,6 @@ Which file types, attributes, etc. are *not* preserved?
      Archive extraction has optional support to extract all-zero chunks as
      holes in a sparse file.
    * Some filesystem specific attributes, like btrfs NOCOW, see :ref:`platforms`.
-    * For hardlinked symlinks, the hardlinking can not be archived (and thus,
-      the hardlinking will not be done at extraction time). The symlinks will
-      be archived and extracted as non-hardlinked symlinks, see :issue:`2379`.

 Are there other known limitations?
 ----------------------------------

+ 1 - 1
docs/internals/data-structures.rst

@@ -567,7 +567,7 @@ dictionary created by the ``Item`` class that contains:
 * uid
 * gid
 * mode (item type + permissions)
-* source (for symlinks, and for hardlinks within one archive)
+* source (for symlinks)
 * rdev (for device files)
 * mtime, atime, ctime in nanoseconds
 * xattrs

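To make the data-structures change above concrete, this is roughly what the affected item keys look like after this commit. The values are made up for illustration; 'source' now only ever holds the symlink target, and hardlinks of any hardlinkable type carry an 'hlid' instead (the hash length depends on the id function used).

# illustrative only, values are made up
symlink_item = {
    'path': 'home/user/latest',
    'mode': 0o120777,                  # symlink
    'source': 'data/current',          # the symlink target (the only remaining use of 'source')
    'hlid': b'<hash of (ino, dev)>',   # present only if the symlink itself is hardlinked
}
hardlinked_file_item = {
    'path': 'home/user/copy2',
    'mode': 0o100644,                  # regular file
    'hlid': b'<hash of (ino, dev)>',   # same hlid for all links to the same inode
    'chunks': ['<chunk list, reused between the hardlinks>'],
}
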
+ 1 - 1
docs/usage/general/file-metadata.rst.inc

@@ -10,7 +10,7 @@ Besides regular file and directory structures, Borg can preserve
   * FIFOs ("named pipes")
   * special file *contents* can be backed up in ``--read-special`` mode.
     By default the metadata to create them with mknod(2), mkfifo(2) etc. is stored.
-* hardlinked regular files, devices, FIFOs (considering all items in the same archive)
+* hardlinked regular files, devices, symlinks, FIFOs (considering all items in the same archive)
 * timestamps in nanosecond precision: mtime, atime, ctime
 * other timestamps: birthtime (on platforms supporting it)
 * permissions:

+ 116 - 198
src/borg/archive.py

@@ -28,7 +28,7 @@ from .constants import *  # NOQA
 from .crypto.low_level import IntegrityError as IntegrityErrorBase
 from .hashindex import ChunkIndex, ChunkIndexEntry, CacheSynchronizer
 from .helpers import Manifest
-from .helpers import hardlinkable
+from .helpers import HardLinkManager
 from .helpers import ChunkIteratorFileWrapper, open_item
 from .helpers import Error, IntegrityError, set_ec
 from .platform import uid2user, user2uid, gid2group, group2gid
@@ -280,7 +280,7 @@ class DownloadPipeline:
         self.repository = repository
         self.key = key

-    def unpack_many(self, ids, filter=None, partial_extract=False, preload=False, hardlink_masters=None):
+    def unpack_many(self, ids, *, filter=None, preload=False):
         """
         Return iterator of items.

@@ -290,10 +290,7 @@ class DownloadPipeline:
         Warning: if *preload* is True then all data chunks of every yielded item have to be retrieved,
         otherwise preloaded chunks will accumulate in RemoteRepository and create a memory leak.
         """
-        def _preload(chunks):
-            self.repository.preload([c.id for c in chunks])
-
-        masters_preloaded = set()
+        hlids_preloaded = set()
         unpacker = msgpack.Unpacker(use_list=False)
         for data in self.fetch_many(ids):
             unpacker.feed(data)
@@ -306,33 +303,20 @@ class DownloadPipeline:
                 items = [item for item in items if filter(item)]

             if preload:
-                if filter and partial_extract:
-                    # if we do only a partial extraction, it gets a bit
-                    # complicated with computing the preload items: if a hardlink master item is not
-                    # selected (== not extracted), we will still need to preload its chunks if a
-                    # corresponding hardlink slave is selected (== is extracted).
-                    # due to a side effect of the filter() call, we now have hardlink_masters dict populated.
-                    for item in items:
-                        if hardlinkable(item.mode):
-                            source = item.get('source')
-                            if source is None:  # maybe a hardlink master
-                                if 'chunks' in item:
-                                    _preload(item.chunks)
-                                # if this is a hl master, remember that we already preloaded all chunks of it (if any):
-                                if item.get('hardlink_master', True):
-                                    masters_preloaded.add(item.path)
-                            else:  # hardlink slave
-                                if source not in masters_preloaded:
-                                    # we only need to preload *once* (for the 1st selected slave)
-                                    chunks, _ = hardlink_masters[source]
-                                    if chunks is not None:
-                                        _preload(chunks)
-                                    masters_preloaded.add(source)
-                else:
-                    # easy: we do not have a filter, thus all items are selected, thus we need to preload all chunks.
-                    for item in items:
-                        if 'chunks' in item:
-                            _preload(item.chunks)
+                for item in items:
+                    if 'chunks' in item:
+                        hlid = item.get('hlid', None)
+                        if hlid is None:
+                            preload_chunks = True
+                        else:
+                            if hlid in hlids_preloaded:
+                                preload_chunks = False
+                            else:
+                                # not having the hardlink's chunks already preloaded for other hardlink to same inode
+                                preload_chunks = True
+                                hlids_preloaded.add(hlid)
+                        if preload_chunks:
+                            self.repository.preload([c.id for c in item.chunks])

             for item in items:
                 yield item
@@ -443,7 +427,6 @@ class Archive:
         self.repository = repository
         self.cache = cache
         self.manifest = manifest
-        self.hard_links = {}
         self.stats = Statistics(output_json=log_json, iec=iec)
         self.iec = iec
         self.show_progress = progress
@@ -489,7 +472,7 @@ class Archive:
     def _load_meta(self, id):
         data = self.key.decrypt(id, self.repository.get(id))
         metadata = ArchiveItem(internal_dict=msgpack.unpackb(data))
-        if metadata.version != 1:
+        if metadata.version not in (1, 2):  # legacy: still need to read v1 archives
             raise Exception('Unknown archive metadata version')
         return metadata

@@ -584,12 +567,10 @@ Utilization of max. archive size: {csize_max:.0%}
             return False
         return filter(item) if filter else True

-    def iter_items(self, filter=None, partial_extract=False, preload=False, hardlink_masters=None):
+    def iter_items(self, filter=None, preload=False):
         # note: when calling this with preload=True, later fetch_many() must be called with
         # is_preloaded=True or the RemoteRepository code will leak memory!
-        assert not (filter and partial_extract and preload) or hardlink_masters is not None
-        for item in self.pipeline.unpack_many(self.metadata.items, partial_extract=partial_extract,
-                                              preload=preload, hardlink_masters=hardlink_masters,
+        for item in self.pipeline.unpack_many(self.metadata.items, preload=preload,
                                               filter=lambda item: self.item_filter(item, filter)):
             yield item

@@ -620,7 +601,7 @@ Utilization of max. archive size: {csize_max:.0%}
         self.start = start
         self.end = end
         metadata = {
-            'version': 1,
+            'version': 2,
             'name': name,
             'comment': comment or '',
             'items': self.items_buffer.chunks,
@@ -719,33 +700,30 @@ Utilization of max. archive size: {csize_max:.0%}
         return stats

     @contextmanager
-    def extract_helper(self, dest, item, path, stripped_components, original_path, hardlink_masters):
+    def extract_helper(self, item, path, hlm, *, dry_run=False):
         hardlink_set = False
         # Hard link?
-        if 'source' in item:
-            source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:])
-            chunks, link_target = hardlink_masters.get(item.source, (None, source))
-            if link_target and has_link:
-                # Hard link was extracted previously, just link
-                with backup_io('link'):
-                    os.link(link_target, path)
-                    hardlink_set = True
-            elif chunks is not None:
-                # assign chunks to this item, since the item which had the chunks was not extracted
-                item.chunks = chunks
+        if 'hlid' in item:
+            link_target = hlm.retrieve(id=item.hlid)
+            if link_target is not None and has_link:
+                if not dry_run:
+                    # another hardlink to same inode (same hlid) was extracted previously, just link to it
+                    with backup_io('link'):
+                        os.link(link_target, path, follow_symlinks=False)
+                hardlink_set = True
         yield hardlink_set
-        if not hardlink_set and hardlink_masters:
-            if has_link:
-                # Update master entry with extracted item path, so that following hardlinks don't extract twice.
+        if not hardlink_set:
+            if 'hlid' in item and has_link:
+                # Update entry with extracted item path, so that following hardlinks don't extract twice.
                 # We have hardlinking support, so we will hardlink not extract.
-                hardlink_masters[item.get('source') or original_path] = (None, path)
+                hlm.remember(id=item.hlid, info=path)
             else:
                 # Broken platform with no hardlinking support.
                 # In this case, we *want* to extract twice, because there is no other way.
                 pass

     def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False,
-                     hardlink_masters=None, stripped_components=0, original_path=None, pi=None):
+                     hlm=None, stripped_components=0, original_path=None, pi=None):
         """
         Extract archive item.

@@ -754,29 +732,33 @@ Utilization of max. archive size: {csize_max:.0%}
         :param dry_run: do not write any data
         :param stdout: write extracted data to stdout
         :param sparse: write sparse files (chunk-granularity, independent of the original being sparse)
-        :param hardlink_masters: maps paths to (chunks, link_target) for extracting subtrees with hardlinks correctly
+        :param hlm: maps hlid to link_target for extracting subtrees with hardlinks correctly
         :param stripped_components: stripped leading path components to correct hard link extraction
         :param original_path: 'path' key as stored in archive
         :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes)
         """
-        hardlink_masters = hardlink_masters or {}
         has_damaged_chunks = 'chunks_healthy' in item
         if dry_run or stdout:
-            if 'chunks' in item:
-                item_chunks_size = 0
-                for data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True):
-                    if pi:
-                        pi.show(increase=len(data), info=[remove_surrogates(item.path)])
-                    if stdout:
-                        sys.stdout.buffer.write(data)
-                    item_chunks_size += len(data)
-                if stdout:
-                    sys.stdout.buffer.flush()
-                if 'size' in item:
-                    item_size = item.size
-                    if item_size != item_chunks_size:
-                        raise BackupError('Size inconsistency detected: size {}, chunks size {}'.format(
-                                          item_size, item_chunks_size))
+            with self.extract_helper(item, '', hlm, dry_run=dry_run or stdout) as hardlink_set:
+                if not hardlink_set:
+                    # it does not really set hardlinks due to dry_run, but we need to behave same
+                    # as non-dry_run concerning fetching preloaded chunks from the pipeline or
+                    # it would get stuck.
+                    if 'chunks' in item:
+                        item_chunks_size = 0
+                        for data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True):
+                            if pi:
+                                pi.show(increase=len(data), info=[remove_surrogates(item.path)])
+                            if stdout:
+                                sys.stdout.buffer.write(data)
+                            item_chunks_size += len(data)
+                        if stdout:
+                            sys.stdout.buffer.flush()
+                        if 'size' in item:
+                            item_size = item.size
+                            if item_size != item_chunks_size:
+                                raise BackupError('Size inconsistency detected: size {}, chunks size {}'.format(
+                                                  item_size, item_chunks_size))
             if has_damaged_chunks:
                 raise BackupError('File has damaged (all-zero) chunks. Try running borg check --repair.')
             return
@@ -807,8 +789,7 @@ Utilization of max. archive size: {csize_max:.0%}
         if stat.S_ISREG(mode):
             with backup_io('makedirs'):
                 make_parent(path)
-            with self.extract_helper(dest, item, path, stripped_components, original_path,
-                                     hardlink_masters) as hardlink_set:
+            with self.extract_helper(item, path, hlm) as hardlink_set:
                 if hardlink_set:
                     return
                 with backup_io('open'):
@@ -847,24 +828,26 @@ Utilization of max. archive size: {csize_max:.0%}
                     self.restore_attrs(path, item)
             elif stat.S_ISLNK(mode):
                 make_parent(path)
-                source = item.source
-                try:
-                    os.symlink(source, path)
-                except UnicodeEncodeError:
-                    raise self.IncompatibleFilesystemEncodingError(source, sys.getfilesystemencoding()) from None
-                self.restore_attrs(path, item, symlink=True)
+                with self.extract_helper(item, path, hlm) as hardlink_set:
+                    if hardlink_set:
+                        # unusual, but possible: this is a hardlinked symlink.
+                        return
+                    source = item.source
+                    try:
+                        os.symlink(source, path)
+                    except UnicodeEncodeError:
+                        raise self.IncompatibleFilesystemEncodingError(source, sys.getfilesystemencoding()) from None
+                    self.restore_attrs(path, item, symlink=True)
             elif stat.S_ISFIFO(mode):
                 make_parent(path)
-                with self.extract_helper(dest, item, path, stripped_components, original_path,
-                                         hardlink_masters) as hardlink_set:
+                with self.extract_helper(item, path, hlm) as hardlink_set:
                     if hardlink_set:
                         return
                     os.mkfifo(path)
                     self.restore_attrs(path, item)
             elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode):
                 make_parent(path)
-                with self.extract_helper(dest, item, path, stripped_components, original_path,
-                                         hardlink_masters) as hardlink_set:
+                with self.extract_helper(item, path, hlm) as hardlink_set:
                     if hardlink_set:
                         return
                     os.mknod(path, item.mode, item.rdev)
@@ -1041,79 +1024,43 @@ Utilization of max. archive size: {csize_max:.0%}
         :param can_compare_chunk_ids: Whether --chunker-params are the same for both archives.
         """

-        def hardlink_master_seen(item):
-            return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters
-
-        def is_hardlink_master(item):
-            return item.get('hardlink_master', True) and 'source' not in item and hardlinkable(item.mode)
-
-        def update_hardlink_masters(item1, item2):
-            if is_hardlink_master(item1) or is_hardlink_master(item2):
-                hardlink_masters[item1.path] = (item1, item2)
-
-        def has_hardlink_master(item, hardlink_masters):
-            return hardlinkable(item.mode) and item.get('source') in hardlink_masters
-
         def compare_items(item1, item2):
-            if has_hardlink_master(item1, hardlink_masters):
-                item1 = hardlink_masters[item1.source][0]
-            if has_hardlink_master(item2, hardlink_masters):
-                item2 = hardlink_masters[item2.source][1]
             return ItemDiff(item1, item2,
                             archive1.pipeline.fetch_many([c.id for c in item1.get('chunks', [])]),
                             archive2.pipeline.fetch_many([c.id for c in item2.get('chunks', [])]),
                             can_compare_chunk_ids=can_compare_chunk_ids)

-        def defer_if_necessary(item1, item2):
-            """Adds item tuple to deferred if necessary and returns True, if items were deferred"""
-            update_hardlink_masters(item1, item2)
-            defer = not hardlink_master_seen(item1) or not hardlink_master_seen(item2)
-            if defer:
-                deferred.append((item1, item2))
-            return defer
-
         orphans_archive1 = OrderedDict()
         orphans_archive2 = OrderedDict()
-        deferred = []
-        hardlink_masters = {}

         for item1, item2 in zip_longest(
                 archive1.iter_items(lambda item: matcher.match(item.path)),
                 archive2.iter_items(lambda item: matcher.match(item.path)),
         ):
             if item1 and item2 and item1.path == item2.path:
-                if not defer_if_necessary(item1, item2):
-                    yield (item1.path, compare_items(item1, item2))
+                yield (item1.path, compare_items(item1, item2))
                 continue
             if item1:
                 matching_orphan = orphans_archive2.pop(item1.path, None)
                 if matching_orphan:
-                    if not defer_if_necessary(item1, matching_orphan):
-                        yield (item1.path, compare_items(item1, matching_orphan))
+                    yield (item1.path, compare_items(item1, matching_orphan))
                 else:
                     orphans_archive1[item1.path] = item1
             if item2:
                 matching_orphan = orphans_archive1.pop(item2.path, None)
                 if matching_orphan:
-                    if not defer_if_necessary(matching_orphan, item2):
-                        yield (matching_orphan.path, compare_items(matching_orphan, item2))
+                    yield (matching_orphan.path, compare_items(matching_orphan, item2))
                 else:
                     orphans_archive2[item2.path] = item2
         # At this point orphans_* contain items that had no matching partner in the other archive
         for added in orphans_archive2.values():
             path = added.path
             deleted_item = Item.create_deleted(path)
-            update_hardlink_masters(deleted_item, added)
             yield (path, compare_items(deleted_item, added))
         for deleted in orphans_archive1.values():
             path = deleted.path
             deleted_item = Item.create_deleted(path)
-            update_hardlink_masters(deleted, deleted_item)
             yield (path, compare_items(deleted, deleted_item))
-        for item1, item2 in deferred:
-            assert hardlink_master_seen(item1)
-            assert hardlink_master_seen(item2)
-            yield (path, compare_items(item1, item2))


 class MetadataCollector:
@@ -1289,7 +1236,7 @@ class FilesystemObjectProcessors:
         self.show_progress = show_progress
         self.print_file_status = file_status_printer or (lambda *args: None)

-        self.hard_links = {}
+        self.hlm = HardLinkManager(id_type=tuple, info_type=(list, type(None)))  # (dev, ino) -> chunks or None
         self.stats = Statistics(output_json=log_json, iec=iec)  # threading: done by cache (including progress)
         self.cwd = os.getcwd()
         self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)
@@ -1298,29 +1245,32 @@ class FilesystemObjectProcessors:
     def create_helper(self, path, st, status=None, hardlinkable=True):
         safe_path = make_path_safe(path)
         item = Item(path=safe_path)
-        hardlink_master = False
         hardlinked = hardlinkable and st.st_nlink > 1
+        update_map = False
         if hardlinked:
-            source = self.hard_links.get((st.st_ino, st.st_dev))
-            if source is not None:
-                item.source = source
-                status = 'h'  # hardlink (to already seen inodes)
-            else:
-                hardlink_master = True
-        yield item, status, hardlinked, hardlink_master
-        # if we get here, "with"-block worked ok without error/exception, the item was processed ok...
+            status = 'h'  # hardlink
+            nothing = object()
+            chunks = self.hlm.retrieve(id=(st.st_ino, st.st_dev), default=nothing)
+            if chunks is nothing:
+                update_map = True
+            elif chunks is not None:
+                item.chunks = chunks
+            item.hlid = self.hlm.hardlink_id_from_inode(ino=st.st_ino, dev=st.st_dev)
+        yield item, status, hardlinked
         self.add_item(item, stats=self.stats)
-        # ... and added to the archive, so we can remember it to refer to it later in the archive:
-        if hardlink_master:
-            self.hard_links[(st.st_ino, st.st_dev)] = safe_path
+        if update_map:
+            # remember the hlid of this fs object and if the item has chunks,
+            # also remember them, so we do not have to re-chunk a hardlink.
+            chunks = item.chunks if 'chunks' in item else None
+            self.hlm.remember(id=(st.st_ino, st.st_dev), info=chunks)

     def process_dir_with_fd(self, *, path, fd, st):
-        with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master):
+        with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked):
             item.update(self.metadata_collector.stat_attrs(st, path, fd=fd))
             return status

     def process_dir(self, *, path, parent_fd, name, st):
-        with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master):
+        with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked):
             with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags_dir,
                         noatime=True, op='dir_open') as fd:
                 # fd is None for directories on windows, in that case a race condition check is not possible.
@@ -1331,7 +1281,7 @@ class FilesystemObjectProcessors:
                 return status

     def process_fifo(self, *, path, parent_fd, name, st):
-        with self.create_helper(path, st, 'f') as (item, status, hardlinked, hardlink_master):  # fifo
+        with self.create_helper(path, st, 'f') as (item, status, hardlinked):  # fifo
             with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags_normal, noatime=True) as fd:
                 with backup_io('fstat'):
                     st = stat_update_check(st, os.fstat(fd))
@@ -1339,7 +1289,7 @@ class FilesystemObjectProcessors:
                 return status

     def process_dev(self, *, path, parent_fd, name, st, dev_type):
-        with self.create_helper(path, st, dev_type) as (item, status, hardlinked, hardlink_master):  # char/block device
+        with self.create_helper(path, st, dev_type) as (item, status, hardlinked):  # char/block device
             # looks like we can not work fd-based here without causing issues when trying to open/close the device
             with backup_io('stat'):
                 st = stat_update_check(st, os_stat(path=path, parent_fd=parent_fd, name=name, follow_symlinks=False))
@@ -1348,10 +1298,7 @@ class FilesystemObjectProcessors:
             return status

     def process_symlink(self, *, path, parent_fd, name, st):
-        # note: using hardlinkable=False because we can not support hardlinked symlinks,
-        #       due to the dual-use of item.source, see issue #2343:
-        # hardlinked symlinks will be archived [and extracted] as non-hardlinked symlinks.
-        with self.create_helper(path, st, 's', hardlinkable=False) as (item, status, hardlinked, hardlink_master):
+        with self.create_helper(path, st, 's', hardlinkable=True) as (item, status, hardlinked):
             fname = name if name is not None and parent_fd is not None else path
             with backup_io('readlink'):
                 source = os.readlink(fname, dir_fd=parent_fd)
@@ -1384,7 +1331,7 @@ class FilesystemObjectProcessors:
         return status

     def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal):
-        with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master):  # no status yet
+        with self.create_helper(path, st, None) as (item, status, hardlinked):  # no status yet
             with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags, noatime=True) as fd:
                 with backup_io('fstat'):
                     st = stat_update_check(st, os.fstat(fd))
@@ -1395,7 +1342,9 @@ class FilesystemObjectProcessors:
                     # so it can be extracted / accessed in FUSE mount like a regular file.
                     # this needs to be done early, so that part files also get the patched mode.
                     item.mode = stat.S_IFREG | stat.S_IMODE(item.mode)
-                if not hardlinked or hardlink_master:
+                if 'chunks' in item:  # create_helper might have put chunks from a previous hardlink there
+                    [cache.chunk_incref(id_, self.stats) for id_, _, _ in item.chunks]
+                else:  # normal case, no "2nd+" hardlink
                     if not is_special_file:
                         hashed_path = safe_encode(os.path.join(self.cwd, path))
                         path_hash = self.key.id_hash(hashed_path)
@@ -1420,7 +1369,6 @@ class FilesystemObjectProcessors:
                         status = 'M' if known else 'A'  # regular file, modified or added
                     self.print_file_status(status, path)
                     status = None  # we already printed the status
-                    item.hardlink_master = hardlinked
                     # Only chunkify the file if needed
                     if chunks is not None:
                         item.chunks = chunks
@@ -1444,7 +1392,7 @@ class FilesystemObjectProcessors:
                             # also, we must not memorize a potentially inconsistent/corrupt file that
                             # changed while we backed it up.
                             cache.memorize_file(hashed_path, path_hash, st, [c.id for c in item.chunks])
-                    self.stats.nfiles += 1
+                self.stats.nfiles += 1
                 item.update(self.metadata_collector.stat_ext_attrs(st, path, fd=fd))
                 item.get_size(memorize=True)
                 return status
@@ -1464,6 +1412,7 @@ class TarfileObjectProcessors:

         self.stats = Statistics(output_json=log_json, iec=iec)  # threading: done by cache (including progress)
         self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=False)
+        self.hlm = HardLinkManager(id_type=str, info_type=list)  # path -> chunks

     @contextmanager
     def create_helper(self, tarinfo, status=None, type=None):
@@ -1504,11 +1453,21 @@ class TarfileObjectProcessors:
             item.rdev = os.makedev(tarinfo.devmajor, tarinfo.devminor)
             return status

-    def process_link(self, *, tarinfo, status, type):
+    def process_symlink(self, *, tarinfo, status, type):
         with self.create_helper(tarinfo, status, type) as (item, status):
             item.source = tarinfo.linkname
             return status

+    def process_hardlink(self, *, tarinfo, status, type):
+        with self.create_helper(tarinfo, status, type) as (item, status):
+            # create a not hardlinked borg item, reusing the chunks, see HardLinkManager.__doc__
+            chunks = self.hlm.retrieve(tarinfo.linkname)
+            if chunks is not None:
+                item.chunks = chunks
+            item.get_size(memorize=True, from_chunks=True)
+            self.stats.nfiles += 1
+            return status
+
     def process_file(self, *, tarinfo, status, type, tar):
         with self.create_helper(tarinfo, status, type) as (item, status):
             self.print_file_status(status, tarinfo.name)
@@ -1516,8 +1475,10 @@ class TarfileObjectProcessors:
             fd = tar.extractfile(tarinfo)
             self.process_file_chunks(item, self.cache, self.stats, self.show_progress,
                                      backup_io_iter(self.chunker.chunkify(fd)))
-            item.get_size(memorize=True)
+            item.get_size(memorize=True, from_chunks=True)
             self.stats.nfiles += 1
+            # we need to remember ALL files, see HardLinkManager.__doc__
+            self.hlm.remember(id=tarinfo.name, info=item.chunks)
             return status


@@ -1787,7 +1748,7 @@ class ArchiveChecker:
                 continue
             if not valid_msgpacked_dict(data, archive_keys_serialized):
                 continue
-            if b'cmdline' not in data or b'\xa7version\x01' not in data:
+            if b'cmdline' not in data or b'\xa7version\x02' not in data:
                 continue
             try:
                 archive = msgpack.unpackb(data)
@@ -1944,9 +1905,6 @@ class ArchiveChecker:
             def valid_item(obj):
                 if not isinstance(obj, StableDict):
                     return False, 'not a dictionary'
-                # A bug in Attic up to and including release 0.13 added a (meaningless) b'acl' key to every item.
-                # We ignore it here, should it exist. See test_attic013_acl_bug for details.
-                obj.pop(b'acl', None)
                 keys = set(obj)
                 if not required_item_keys.issubset(keys):
                     return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys)
@@ -2031,7 +1989,7 @@ class ArchiveChecker:
                     del self.manifest.archives[info.name]
                     continue
                 archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
-                if archive.version != 1:
+                if archive.version != 2:
                     raise Exception('Unknown archive metadata version')
                 archive.cmdline = [safe_decode(arg) for arg in archive.cmdline]
                 items_buffer = ChunkBuffer(self.key)
@@ -2130,34 +2088,11 @@ class ArchiveRecreater:

     def process_items(self, archive, target):
         matcher = self.matcher
-        target_is_subset = not matcher.empty()
-        hardlink_masters = {} if target_is_subset else None
-
-        def item_is_hardlink_master(item):
-            return (target_is_subset and
-                    hardlinkable(item.mode) and
-                    item.get('hardlink_master', True) and
-                    'source' not in item)

         for item in archive.iter_items():
             if not matcher.match(item.path):
                 self.print_file_status('x', item.path)
-                if item_is_hardlink_master(item):
-                    hardlink_masters[item.path] = (item.get('chunks'), item.get('chunks_healthy'), None)
                 continue
-            if target_is_subset and hardlinkable(item.mode) and item.get('source') in hardlink_masters:
-                # master of this hard link is outside the target subset
-                chunks, chunks_healthy, new_source = hardlink_masters[item.source]
-                if new_source is None:
-                    # First item to use this master, move the chunks
-                    item.chunks = chunks
-                    if chunks_healthy is not None:
-                        item.chunks_healthy = chunks_healthy
-                    hardlink_masters[item.source] = (None, None, item.path)
-                    del item.source
-                else:
-                    # Master was already moved, only update this item's source
-                    item.source = new_source
             if self.dry_run:
                 self.print_file_status('-', item.path)
             else:
@@ -2264,30 +2199,13 @@ class ArchiveRecreater:
         tag_files = []
         tagged_dirs = []

-        # to support reading hard-linked CACHEDIR.TAGs (aka CACHE_TAG_NAME), similar to hardlink_masters:
-        cachedir_masters = {}
-
-        if self.exclude_caches:
-            # sadly, due to how CACHEDIR.TAG works (filename AND file [header] contents) and
-            # how borg deals with hardlinks (slave hardlinks referring back to master hardlinks),
-            # we need to pass over the archive collecting hardlink master paths.
-            # as seen in issue #4911, the master paths can have an arbitrary filenames,
-            # not just CACHEDIR.TAG.
-            for item in archive.iter_items(filter=lambda item: os.path.basename(item.path) == CACHE_TAG_NAME):
-                if stat.S_ISREG(item.mode) and 'chunks' not in item and 'source' in item:
-                    # this is a hardlink slave, referring back to its hardlink master (via item.source)
-                    cachedir_masters[item.source] = None  # we know the key (path), but not the value (item) yet
-
         for item in archive.iter_items(
                 filter=lambda item: os.path.basename(item.path) == CACHE_TAG_NAME or matcher.match(item.path)):
-            if self.exclude_caches and item.path in cachedir_masters:
-                cachedir_masters[item.path] = item
             dir, tag_file = os.path.split(item.path)
             if tag_file in self.exclude_if_present:
                 exclude(dir, item)
             elif self.exclude_caches and tag_file == CACHE_TAG_NAME and stat.S_ISREG(item.mode):
-                content_item = item if 'chunks' in item else cachedir_masters[item.source]
-                file = open_item(archive, content_item)
+                file = open_item(archive, item)
                 if file.read(len(CACHE_TAG_CONTENTS)) == CACHE_TAG_CONTENTS:
                     exclude(dir, item)
         matcher.add(tag_files, IECommand.Include)

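On the extraction side, the reworked extract_helper() above boils down to the flow sketched here: the first item extracted for an inode writes the data and remembers its path under the hlid, and every later item with the same hlid is hardlinked to that path. This is a simplified sketch (error handling, dry-run and attribute restoration omitted); extract_file_contents() is a placeholder name, not a borg function.

import os

def extract_with_hardlinks(item, path, hlm, has_link=True):
    if 'hlid' in item:
        link_target = hlm.retrieve(id=item.hlid)
        if link_target is not None and has_link:
            # another link to the same inode was already extracted: just hardlink to it
            os.link(link_target, path, follow_symlinks=False)
            return
    extract_file_contents(item, path)  # placeholder for the real chunk/symlink/device extraction
    if 'hlid' in item and has_link:
        # remember the first extracted path so later items with the same hlid can os.link() to it
        hlm.remember(id=item.hlid, info=path)
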
+ 194 - 58
src/borg/archiver.py

@@ -29,6 +29,7 @@ try:
     from contextlib import contextmanager
     from datetime import datetime, timedelta
     from io import TextIOWrapper
+    from struct import Struct

     from .logger import create_logger, setup_logging

@@ -44,7 +45,7 @@ try:
     from .archive import has_link
     from .cache import Cache, assert_secure, SecurityManager
     from .constants import *  # NOQA
-    from .compress import CompressionSpec
+    from .compress import CompressionSpec, ZLIB, ZLIB_legacy, ObfuscateSize
     from .crypto.key import key_creator, key_argument_names, tam_required_file, tam_required
     from .crypto.key import RepoKey, KeyfileKey, Blake2RepoKey, Blake2KeyfileKey, FlexiKey
     from .crypto.keymanager import KeyManager
@@ -59,7 +60,7 @@ try:
     from .helpers import timestamp
     from .helpers import get_cache_dir, os_stat
     from .helpers import Manifest, AI_HUMAN_SORT_KEYS
-    from .helpers import hardlinkable
+    from .helpers import HardLinkManager
     from .helpers import StableDict
     from .helpers import check_python, check_extension_modules
     from .helpers import dir_is_tagged, is_slow_msgpack, is_supported_msgpack, yes, sysinfo
@@ -338,6 +339,137 @@ class Archiver:
         ).serve()
         return EXIT_SUCCESS

+    @with_other_repository(manifest=True, key=True, compatibility=(Manifest.Operation.READ,))
+    @with_repository(exclusive=True, manifest=True, cache=True, compatibility=(Manifest.Operation.WRITE,))
+    def do_transfer(self, args, *,
+               repository, manifest, key, cache,
+               other_repository=None, other_manifest=None, other_key=None):
+        """archives transfer from other repository"""
+
+        ITEM_KEY_WHITELIST = {'path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hlid',
+                              'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'birthtime', 'size',
+                              'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended',
+                              'part'}
+
+        def upgrade_item(item):
+            """upgrade item as needed, get rid of legacy crap"""
+            if hlm.borg1_hardlink_master(item):
+                item._dict['hlid'] = hlid = hlm.hardlink_id_from_path(item._dict['path'])
+                hlm.remember(id=hlid, info=(item._dict.get('chunks'), item._dict.get('chunks_healthy')))
+            elif hlm.borg1_hardlink_slave(item):
+                item._dict['hlid'] = hlid = hlm.hardlink_id_from_path(item._dict['source'])
+                chunks, chunks_healthy = hlm.retrieve(id=hlid, default=(None, None))
+                if chunks is not None:
+                    item._dict['chunks'] = chunks
+                    for chunk_id, _, _ in chunks:
+                        cache.chunk_incref(chunk_id, archive.stats)
+                if chunks_healthy is not None:
+                    item._dict['chunks_healthy'] = chunks_healthy
+                item._dict.pop('source')  # not used for hardlinks any more, replaced by hlid
+            for attr in 'atime', 'ctime', 'mtime', 'birthtime':
+                if attr in item:
+                    ns = getattr(item, attr)  # decode (bigint or Timestamp) --> int ns
+                    setattr(item, attr, ns)  # encode int ns --> msgpack.Timestamp only, no bigint any more
+            # make sure we only have desired stuff in the new item. specifically, make sure to get rid of:
+            # - 'acl' remnants of bug in attic <= 0.13
+            # - 'hardlink_master' (superseded by hlid)
+            new_item_dict = {key: value for key, value in item.as_dict().items() if key in ITEM_KEY_WHITELIST}
+            new_item = Item(internal_dict=new_item_dict)
+            new_item.get_size(memorize=True)  # if not already present: compute+remember size for items with chunks
+            assert all(key in new_item for key in REQUIRED_ITEM_KEYS)
+            return new_item
+
+        def upgrade_compressed_chunk(chunk):
+            def upgrade_zlib_and_level(chunk):
+                if ZLIB_legacy.detect(chunk):
+                    ctype = ZLIB.ID
+                    chunk = ctype + level + chunk  # get rid of the attic legacy: prepend separate type/level bytes
+                else:
+                    ctype = chunk[0:1]
+                    chunk = ctype + level + chunk[2:]  # keep type same, but set level
+                return chunk
+
+            ctype = chunk[0:1]
+            level = b'\xFF'  # FF means unknown compression level
+
+            if ctype == ObfuscateSize.ID:
+                # in older borg, we used unusual byte order
+                old_header_fmt = Struct('>I')
+                new_header_fmt = ObfuscateSize.header_fmt
+                length = ObfuscateSize.header_len
+                size_bytes = chunk[2:2+length]
+                size = old_header_fmt.unpack(size_bytes)
+                size_bytes = new_header_fmt.pack(size)
+                compressed = chunk[2+length:]
+                compressed = upgrade_zlib_and_level(compressed)
+                chunk = ctype + level + size_bytes + compressed
+            else:
+                chunk = upgrade_zlib_and_level(chunk)
+            return chunk
+
+        dry_run = args.dry_run
+
+        args.consider_checkpoints = True
+        archive_names = tuple(x.name for x in other_manifest.archives.list_considering(args))
+        if not archive_names:
+            return EXIT_SUCCESS
+
+        for name in archive_names:
+            transfer_size = 0
+            present_size = 0
+            if name in manifest.archives and not dry_run:
+                print(f"{name}: archive is already present in destination repo, skipping.")
+            else:
+                if not dry_run:
+                    print(f"{name}: copying archive to destination repo...")
+                hlm = HardLinkManager(id_type=bytes, info_type=tuple)  # hlid -> (chunks, chunks_healthy)
+                other_archive = Archive(other_repository, other_key, other_manifest, name)
+                archive = Archive(repository, key, manifest, name, cache=cache, create=True) if not dry_run else None
+                for item in other_archive.iter_items():
+                    if 'chunks' in item:
+                        chunks = []
+                        for chunk_id, size, _ in item.chunks:
+                            refcount = cache.seen_chunk(chunk_id, size)
+                            if refcount == 0:  # target repo does not yet have this chunk
+                                if not dry_run:
+                                    cdata = other_repository.get(chunk_id)
+                                    # keep compressed payload same, avoid decompression / recompression
+                                    data = other_key.decrypt(chunk_id, cdata, decompress=False)
+                                    data = upgrade_compressed_chunk(data)
+                                    chunk_entry = cache.add_chunk(chunk_id, data, archive.stats, wait=False,
+                                                                  compress=False, size=size)
+                                    cache.repository.async_response(wait=False)
+                                    chunks.append(chunk_entry)
+                                transfer_size += size
+                            else:
+                                if not dry_run:
+                                    chunk_entry = cache.chunk_incref(chunk_id, archive.stats)
+                                    chunks.append(chunk_entry)
+                                present_size += size
+                        if not dry_run:
+                            item.chunks = chunks  # overwrite! IDs and sizes are same, csizes are likely different
+                            archive.stats.nfiles += 1
+                    if not dry_run:
+                        archive.add_item(upgrade_item(item))
+                if not dry_run:
+                    additional_metadata = {}
+                    # keep all metadata except archive version and stats. also do not keep
+                    # recreate_source_id, recreate_args, recreate_partial_chunks which were used only in 1.1.0b1 .. b2.
+                    for attr in ('cmdline', 'hostname', 'username', 'time', 'time_end', 'comment',
+                                 'chunker_params', 'recreate_cmdline'):
+                        if hasattr(other_archive.metadata, attr):
+                            additional_metadata[attr] = getattr(other_archive.metadata, attr)
+                    archive.save(stats=archive.stats, additional_metadata=additional_metadata)
+                    print(f"{name}: finished. "
+                          f"transfer_size: {format_file_size(transfer_size)} "
+                          f"present_size: {format_file_size(present_size)}")
+                else:
+                    print(f"{name}: completed" if transfer_size == 0 else
+                          f"{name}: incomplete, "
+                          f"transfer_size: {format_file_size(transfer_size)} "
+                          f"present_size: {format_file_size(present_size)}")
+        return EXIT_SUCCESS
+
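
To make the header conversion done by upgrade_compressed_chunk() concrete, here is a minimal standalone sketch (not the code merged here): a borg1 chunk starts with a 2-byte type header (e.g. b'\x01\x00' for lz4), a borg2 chunk starts with a 1-byte compressor ID plus a 1-byte level, where 0xFF means "level unknown". For obfuscated chunks, the 4-byte size field also switches from big-endian to little-endian. Header-less legacy zlib chunks are left out of this sketch (see the ZLIB_legacy notes further down).

    from struct import Struct

    UNKNOWN_LEVEL = b'\xFF'                    # 0xFF = "compression level unknown"
    OBFUSCATE_ID = b'\x04'                     # ObfuscateSize ID (1 byte, as introduced here)
    old_size_fmt, new_size_fmt = Struct('>I'), Struct('<I')

    def upgrade_header(chunk):
        """borg1 2-byte header -> borg2 ID+level header (sketch only)."""
        ctype = chunk[0:1]
        if ctype == OBFUSCATE_ID:
            # obfuscated chunks carry a 4-byte "true size" field right after the header
            size = old_size_fmt.unpack(chunk[2:6])[0]
            inner = upgrade_header(chunk[6:])          # the wrapped compressed payload
            return ctype + UNKNOWN_LEVEL + new_size_fmt.pack(size) + inner
        return ctype + UNKNOWN_LEVEL + chunk[2:]       # keep the type byte, set the level byte

    # lz4 chunk: borg1 header b'\x01\x00' becomes b'\x01\xff', the payload is untouched
    assert upgrade_header(b'\x01\x00payload') == b'\x01\xffpayload'
    # obfuscated lz4 chunk: size field switches byte order, inner header is upgraded too
    assert upgrade_header(b'\x04\x00' + Struct('>I').pack(7) + b'\x01\x00payload') == \
           b'\x04\xff' + Struct('<I').pack(7) + b'\x01\xffpayload'
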
     @with_repository(create=True, exclusive=True, manifest=False)
     @with_repository(create=True, exclusive=True, manifest=False)
     @with_other_repository(key=True, compatibility=(Manifest.Operation.READ, ))
     @with_other_repository(key=True, compatibility=(Manifest.Operation.READ, ))
     def do_init(self, args, repository, *, other_repository=None, other_key=None):
     def do_init(self, args, repository, *, other_repository=None, other_key=None):
@@ -1055,16 +1187,14 @@ class Archiver:
             self.print_file_status(status, path)
             self.print_file_status(status, path)
 
 
     @staticmethod
     @staticmethod
-    def build_filter(matcher, peek_and_store_hardlink_masters, strip_components):
+    def build_filter(matcher, strip_components):
         if strip_components:
         if strip_components:
             def item_filter(item):
             def item_filter(item):
                 matched = matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:])
                 matched = matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:])
-                peek_and_store_hardlink_masters(item, matched)
                 return matched
                 return matched
         else:
         else:
             def item_filter(item):
             def item_filter(item):
                 matched = matcher.match(item.path)
                 matched = matcher.match(item.path)
-                peek_and_store_hardlink_masters(item, matched)
                 return matched
                 return matched
         return item_filter
         return item_filter
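
As a side note on the stripping logic used by these filters (a standalone illustration, using '/' where the real code uses os.sep): the filter returns the stripped path, so an item whose path is completely consumed by --strip-components becomes falsy and is skipped.

    def strip(path, strip_components):
        # same expression as above: os.sep.join(item.path.split(os.sep)[strip_components:])
        return '/'.join(path.split('/')[strip_components:])

    assert strip('home/user/file.txt', 1) == 'user/file.txt'
    assert strip('home/user/file.txt', 2) == 'file.txt'
    assert strip('home/user/file.txt', 3) == ''   # falsy -> item is filtered out
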
 
 
@@ -1087,33 +1217,18 @@ class Archiver:
         sparse = args.sparse
         sparse = args.sparse
         strip_components = args.strip_components
         strip_components = args.strip_components
         dirs = []
         dirs = []
-        partial_extract = not matcher.empty() or strip_components
-        hardlink_masters = {} if partial_extract or not has_link else None
+        hlm = HardLinkManager(id_type=bytes, info_type=str)  # hlid -> path
 
 
-        def peek_and_store_hardlink_masters(item, matched):
-            # not has_link:
-            # OS does not have hardlink capability thus we need to remember the chunks so that
-            # we can extract all hardlinks as separate normal (not-hardlinked) files instead.
-            #
-            # partial_extract and not matched and hardlinkable:
-            # we do not extract the very first hardlink, so we need to remember the chunks
-            # in hardlinks_master, so we can use them when we extract some 2nd+ hardlink item
-            # that has no chunks list.
-            if ((not has_link or (partial_extract and not matched and hardlinkable(item.mode))) and
-                    (item.get('hardlink_master', True) and 'source' not in item)):
-                hardlink_masters[item.get('path')] = (item.get('chunks'), None)
-
-        filter = self.build_filter(matcher, peek_and_store_hardlink_masters, strip_components)
+        filter = self.build_filter(matcher, strip_components)
         if progress:
         if progress:
             pi = ProgressIndicatorPercent(msg='%5.1f%% Extracting: %s', step=0.1, msgid='extract')
             pi = ProgressIndicatorPercent(msg='%5.1f%% Extracting: %s', step=0.1, msgid='extract')
             pi.output('Calculating total archive size for the progress indicator (might take long for large archives)')
             pi.output('Calculating total archive size for the progress indicator (might take long for large archives)')
-            extracted_size = sum(item.get_size(hardlink_masters) for item in archive.iter_items(filter))
+            extracted_size = sum(item.get_size() for item in archive.iter_items(filter))
             pi.total = extracted_size
             pi.total = extracted_size
         else:
         else:
             pi = None
             pi = None
 
 
-        for item in archive.iter_items(filter, partial_extract=partial_extract,
-                                       preload=True, hardlink_masters=hardlink_masters):
+        for item in archive.iter_items(filter, preload=True):
             orig_path = item.path
             orig_path = item.path
             if strip_components:
             if strip_components:
                 item.path = os.sep.join(orig_path.split(os.sep)[strip_components:])
                 item.path = os.sep.join(orig_path.split(os.sep)[strip_components:])
@@ -1128,13 +1243,13 @@ class Archiver:
                 logging.getLogger('borg.output.list').info(remove_surrogates(item.path))
                 logging.getLogger('borg.output.list').info(remove_surrogates(item.path))
             try:
             try:
                 if dry_run:
                 if dry_run:
-                    archive.extract_item(item, dry_run=True, pi=pi)
+                    archive.extract_item(item, dry_run=True, hlm=hlm, pi=pi)
                 else:
                 else:
                     if stat.S_ISDIR(item.mode):
                     if stat.S_ISDIR(item.mode):
                         dirs.append(item)
                         dirs.append(item)
                         archive.extract_item(item, stdout=stdout, restore_attrs=False)
                         archive.extract_item(item, stdout=stdout, restore_attrs=False)
                     else:
                     else:
-                        archive.extract_item(item, stdout=stdout, sparse=sparse, hardlink_masters=hardlink_masters,
+                        archive.extract_item(item, stdout=stdout, sparse=sparse, hlm=hlm,
                                              stripped_components=strip_components, original_path=orig_path, pi=pi)
                                              stripped_components=strip_components, original_path=orig_path, pi=pi)
             except (BackupOSError, BackupError) as e:
             except (BackupOSError, BackupError) as e:
                 self.print_warning('%s: %s', remove_surrogates(orig_path), e)
                 self.print_warning('%s: %s', remove_surrogates(orig_path), e)
@@ -1199,15 +1314,9 @@ class Archiver:
         progress = args.progress
         progress = args.progress
         output_list = args.output_list
         output_list = args.output_list
         strip_components = args.strip_components
         strip_components = args.strip_components
-        partial_extract = not matcher.empty() or strip_components
-        hardlink_masters = {} if partial_extract else None
-
-        def peek_and_store_hardlink_masters(item, matched):
-            if ((partial_extract and not matched and hardlinkable(item.mode)) and
-                    (item.get('hardlink_master', True) and 'source' not in item)):
-                hardlink_masters[item.get('path')] = (item.get('chunks'), None)
+        hlm = HardLinkManager(id_type=bytes, info_type=str)  # hlid -> path
 
 
-        filter = self.build_filter(matcher, peek_and_store_hardlink_masters, strip_components)
+        filter = self.build_filter(matcher, strip_components)
 
 
         # The | (pipe) symbol instructs tarfile to use a streaming mode of operation
         # The | (pipe) symbol instructs tarfile to use a streaming mode of operation
         # where it never seeks on the passed fileobj.
         # where it never seeks on the passed fileobj.
@@ -1217,7 +1326,7 @@ class Archiver:
         if progress:
         if progress:
             pi = ProgressIndicatorPercent(msg='%5.1f%% Processing: %s', step=0.1, msgid='extract')
             pi = ProgressIndicatorPercent(msg='%5.1f%% Processing: %s', step=0.1, msgid='extract')
             pi.output('Calculating size')
             pi.output('Calculating size')
-            extracted_size = sum(item.get_size(hardlink_masters) for item in archive.iter_items(filter))
+            extracted_size = sum(item.get_size() for item in archive.iter_items(filter))
             pi.total = extracted_size
             pi.total = extracted_size
         else:
         else:
             pi = None
             pi = None
@@ -1252,9 +1361,8 @@ class Archiver:
             tarinfo.gid = item.gid
             tarinfo.gid = item.gid
             tarinfo.uname = item.user or ''
             tarinfo.uname = item.user or ''
             tarinfo.gname = item.group or ''
             tarinfo.gname = item.group or ''
-            # The linkname in tar has the same dual use the 'source' attribute of Borg items,
-            # i.e. for symlinks it means the destination, while for hardlinks it refers to the
-            # file.
+            # The linkname in tar has 2 uses:
+            # for symlinks it means the destination, while for hardlinks it refers to the file.
             # Since hardlinks in tar have a different type code (LNKTYPE) the format might
             # Since hardlinks in tar have a different type code (LNKTYPE) the format might
             # support hardlinking arbitrary objects (including symlinks and directories), but
             # support hardlinking arbitrary objects (including symlinks and directories), but
             # whether implementations actually support that is a whole different question...
             # whether implementations actually support that is a whole different question...
@@ -1263,23 +1371,16 @@ class Archiver:
             modebits = stat.S_IFMT(item.mode)
             modebits = stat.S_IFMT(item.mode)
             if modebits == stat.S_IFREG:
             if modebits == stat.S_IFREG:
                 tarinfo.type = tarfile.REGTYPE
                 tarinfo.type = tarfile.REGTYPE
-                if 'source' in item:
-                    source = os.sep.join(item.source.split(os.sep)[strip_components:])
-                    if hardlink_masters is None:
-                        linkname = source
-                    else:
-                        chunks, linkname = hardlink_masters.get(item.source, (None, source))
-                    if linkname:
-                        # Master was already added to the archive, add a hardlink reference to it.
+                if 'hlid' in item:
+                    linkname = hlm.retrieve(id=item.hlid)
+                    if linkname is not None:
+                        # the first hardlink was already added to the archive, add a tar-hardlink reference to it.
                         tarinfo.type = tarfile.LNKTYPE
                         tarinfo.type = tarfile.LNKTYPE
                         tarinfo.linkname = linkname
                         tarinfo.linkname = linkname
-                    elif chunks is not None:
-                        # The item which has the chunks was not put into the tar, therefore
-                        # we do that now and update hardlink_masters to reflect that.
-                        item.chunks = chunks
+                    else:
                         tarinfo.size = item.get_size()
                         tarinfo.size = item.get_size()
                         stream = item_content_stream(item)
                         stream = item_content_stream(item)
-                        hardlink_masters[item.get('source') or original_path] = (None, item.path)
+                        hlm.remember(id=item.hlid, info=item.path)
                 else:
                 else:
                     tarinfo.size = item.get_size()
                     tarinfo.size = item.get_size()
                     stream = item_content_stream(item)
                     stream = item_content_stream(item)
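
The REGTYPE/LNKTYPE decision above, reduced to a self-contained sketch using only the stdlib tarfile module (borg's streaming tar writer and item handling are omitted): the first member seen for a given hlid carries the content, later members only reference it via linkname.

    import io
    import tarfile

    def add_members(tar, members):
        """members: iterable of (path, hlid_or_None, data_bytes)."""
        seen = {}  # hlid -> path of the member that carries the content
        for path, hlid, data in members:
            ti = tarfile.TarInfo(path)
            if hlid is not None and hlid in seen:
                ti.type = tarfile.LNKTYPE          # later hardlink: no payload, just a reference
                ti.linkname = seen[hlid]
                tar.addfile(ti)
            else:
                ti.type = tarfile.REGTYPE          # first occurrence carries the content
                ti.size = len(data)
                tar.addfile(ti, io.BytesIO(data))
                if hlid is not None:
                    seen[hlid] = path

    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode='w') as tar:
        add_members(tar, [('dir/a', b'hl1', b'content'), ('dir/b', b'hl1', b'content')])
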
@@ -1337,8 +1438,7 @@ class Archiver:
                 ph['BORG.item.meta'] = meta_text
                 ph['BORG.item.meta'] = meta_text
             return ph
             return ph
 
 
-        for item in archive.iter_items(filter, partial_extract=partial_extract,
-                                       preload=True, hardlink_masters=hardlink_masters):
+        for item in archive.iter_items(filter, preload=True):
             orig_path = item.path
             orig_path = item.path
             if strip_components:
             if strip_components:
                 item.path = os.sep.join(orig_path.split(os.sep)[strip_components:])
                 item.path = os.sep.join(orig_path.split(os.sep)[strip_components:])
@@ -1973,12 +2073,11 @@ class Archiver:
             elif tarinfo.isdir():
             elif tarinfo.isdir():
                 status = tfo.process_dir(tarinfo=tarinfo, status='d', type=stat.S_IFDIR)
                 status = tfo.process_dir(tarinfo=tarinfo, status='d', type=stat.S_IFDIR)
             elif tarinfo.issym():
             elif tarinfo.issym():
-                status = tfo.process_link(tarinfo=tarinfo, status='s', type=stat.S_IFLNK)
+                status = tfo.process_symlink(tarinfo=tarinfo, status='s', type=stat.S_IFLNK)
             elif tarinfo.islnk():
             elif tarinfo.islnk():
-                # tar uses the same hardlink model as borg (rather vice versa); the first instance of a hardlink
-                # is stored as a regular file, later instances are special entries referencing back to the
-                # first instance.
-                status = tfo.process_link(tarinfo=tarinfo, status='h', type=stat.S_IFREG)
+                # tar uses a hardlink model like: the first instance of a hardlink is stored as a regular file,
+                # later instances are special entries referencing back to the first instance.
+                status = tfo.process_hardlink(tarinfo=tarinfo, status='h', type=stat.S_IFREG)
             elif tarinfo.isblk():
             elif tarinfo.isblk():
                 status = tfo.process_dev(tarinfo=tarinfo, status='b', type=stat.S_IFBLK)
                 status = tfo.process_dev(tarinfo=tarinfo, status='b', type=stat.S_IFBLK)
             elif tarinfo.ischr():
             elif tarinfo.ischr():
@@ -4083,6 +4182,43 @@ class Archiver:
                                help='archives to delete')
                                help='archives to delete')
         define_archive_filters_group(subparser)
         define_archive_filters_group(subparser)
 
 
+        # borg transfer
+        transfer_epilog = process_epilog("""
+        This command transfers archives from one repository to another repository.
+
+        Suggested use:
+
+        # initialize DST_REPO reusing key material from SRC_REPO, so that
+        # chunking and chunk id generation will work in the same way as before.
+        borg init --other-location=SRC_REPO --encryption=DST_ENC DST_REPO
+
+        # transfer archives from SRC_REPO to DST_REPO
+        borg transfer --dry-run SRC_REPO DST_REPO  # check what it would do
+        borg transfer           SRC_REPO DST_REPO  # do it!
+        borg transfer --dry-run SRC_REPO DST_REPO  # check! anything left?
+
+        The default is to transfer all archives, including checkpoint archives.
+
+        You can use the archive filter options to limit which archives get
+        transferred, e.g. the --prefix option. This is recommended for big
+        repositories with multiple data sets, to keep the runtime per invocation lower.
+        """)
+        subparser = subparsers.add_parser('transfer', parents=[common_parser], add_help=False,
+                                          description=self.do_transfer.__doc__,
+                                          epilog=transfer_epilog,
+                                          formatter_class=argparse.RawDescriptionHelpFormatter,
+                                          help='transfer of archives from another repository')
+        subparser.set_defaults(func=self.do_transfer)
+        subparser.add_argument('-n', '--dry-run', dest='dry_run', action='store_true',
+                               help='do not change repository, just check')
+        subparser.add_argument('other_location', metavar='SRC_REPOSITORY',
+                               type=location_validator(archive=False, other=True),
+                               help='source repository')
+        subparser.add_argument('location', metavar='DST_REPOSITORY',
+                               type=location_validator(archive=False, other=False),
+                               help='destination repository')
+        define_archive_filters_group(subparser)
+
         # borg diff
         # borg diff
         diff_epilog = process_epilog("""
         diff_epilog = process_epilog("""
             This command finds differences (file contents, user/group/mode) between archives.
             This command finds differences (file contents, user/group/mode) between archives.

+ 7 - 6
src/borg/cache.py

@@ -19,7 +19,7 @@ from .helpers import Location
 from .helpers import Error
 from .helpers import Error
 from .helpers import Manifest
 from .helpers import Manifest
 from .helpers import get_cache_dir, get_security_dir
 from .helpers import get_cache_dir, get_security_dir
-from .helpers import int_to_bigint, bigint_to_int, bin_to_hex, parse_stringified_list
+from .helpers import bin_to_hex, parse_stringified_list
 from .helpers import format_file_size
 from .helpers import format_file_size
 from .helpers import safe_ns
 from .helpers import safe_ns
 from .helpers import yes
 from .helpers import yes
@@ -28,6 +28,7 @@ from .helpers import ProgressIndicatorPercent, ProgressIndicatorMessage
 from .helpers import set_ec, EXIT_WARNING
 from .helpers import set_ec, EXIT_WARNING
 from .helpers import safe_unlink
 from .helpers import safe_unlink
 from .helpers import msgpack
 from .helpers import msgpack
+from .helpers.msgpack import int_to_timestamp, timestamp_to_int
 from .item import ArchiveItem, ChunkListEntry
 from .item import ArchiveItem, ChunkListEntry
 from .crypto.key import PlaintextKey
 from .crypto.key import PlaintextKey
 from .crypto.file_integrity import IntegrityCheckedFile, DetachedIntegrityCheckedFile, FileIntegrityError
 from .crypto.file_integrity import IntegrityCheckedFile, DetachedIntegrityCheckedFile, FileIntegrityError
@@ -623,7 +624,7 @@ class LocalCache(CacheStatsMixin):
                     # this is to avoid issues with filesystem snapshots and cmtime granularity.
                     # this is to avoid issues with filesystem snapshots and cmtime granularity.
                     # Also keep files from older backups that have not reached BORG_FILES_CACHE_TTL yet.
                     # Also keep files from older backups that have not reached BORG_FILES_CACHE_TTL yet.
                     entry = FileCacheEntry(*msgpack.unpackb(item))
                     entry = FileCacheEntry(*msgpack.unpackb(item))
-                    if entry.age == 0 and bigint_to_int(entry.cmtime) < self._newest_cmtime or \
+                    if entry.age == 0 and timestamp_to_int(entry.cmtime) < self._newest_cmtime or \
                        entry.age > 0 and entry.age < ttl:
                        entry.age > 0 and entry.age < ttl:
                         msgpack.pack((path_hash, entry), fd)
                         msgpack.pack((path_hash, entry), fd)
                         entry_count += 1
                         entry_count += 1
@@ -756,7 +757,7 @@ class LocalCache(CacheStatsMixin):
             csize, data = decrypted_repository.get(archive_id)
             csize, data = decrypted_repository.get(archive_id)
             chunk_idx.add(archive_id, 1, len(data), csize)
             chunk_idx.add(archive_id, 1, len(data), csize)
             archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
             archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
-            if archive.version != 1:
+            if archive.version not in (1, 2):  # legacy
                 raise Exception('Unknown archive metadata version')
                 raise Exception('Unknown archive metadata version')
             sync = CacheSynchronizer(chunk_idx)
             sync = CacheSynchronizer(chunk_idx)
             for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
             for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
@@ -1018,10 +1019,10 @@ class LocalCache(CacheStatsMixin):
         if 'i' in cache_mode and entry.inode != st.st_ino:
         if 'i' in cache_mode and entry.inode != st.st_ino:
             files_cache_logger.debug('KNOWN-CHANGED: file inode number has changed: %r', hashed_path)
             files_cache_logger.debug('KNOWN-CHANGED: file inode number has changed: %r', hashed_path)
             return True, None
             return True, None
-        if 'c' in cache_mode and bigint_to_int(entry.cmtime) != st.st_ctime_ns:
+        if 'c' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_ctime_ns:
             files_cache_logger.debug('KNOWN-CHANGED: file ctime has changed: %r', hashed_path)
             files_cache_logger.debug('KNOWN-CHANGED: file ctime has changed: %r', hashed_path)
             return True, None
             return True, None
-        elif 'm' in cache_mode and bigint_to_int(entry.cmtime) != st.st_mtime_ns:
+        elif 'm' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_mtime_ns:
             files_cache_logger.debug('KNOWN-CHANGED: file mtime has changed: %r', hashed_path)
             files_cache_logger.debug('KNOWN-CHANGED: file mtime has changed: %r', hashed_path)
             return True, None
             return True, None
         # we ignored the inode number in the comparison above or it is still same.
         # we ignored the inode number in the comparison above or it is still same.
@@ -1049,7 +1050,7 @@ class LocalCache(CacheStatsMixin):
         elif 'm' in cache_mode:
         elif 'm' in cache_mode:
             cmtime_type = 'mtime'
             cmtime_type = 'mtime'
             cmtime_ns = safe_ns(st.st_mtime_ns)
             cmtime_ns = safe_ns(st.st_mtime_ns)
-        entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_bigint(cmtime_ns), chunk_ids=ids)
+        entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunk_ids=ids)
         self.files[path_hash] = msgpack.packb(entry)
         self.files[path_hash] = msgpack.packb(entry)
         self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns)
         self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns)
         files_cache_logger.debug('FILES-CACHE-UPDATE: put %r [has %s] <- %r',
         files_cache_logger.debug('FILES-CACHE-UPDATE: put %r [has %s] <- %r',

+ 74 - 27
src/borg/compress.pyx

@@ -56,16 +56,21 @@ cdef class CompressorBase:
     also handles compression format auto detection and
     also handles compression format auto detection and
     adding/stripping the ID header (which enable auto detection).
     adding/stripping the ID header (which enable auto detection).
     """
     """
-    ID = b'\xFF\xFF'  # reserved and not used
-                      # overwrite with a unique 2-bytes bytestring in child classes
+    ID = b'\xFF'  # reserved and not used
+                  # overwrite with a unique 1-byte bytestring in child classes
     name = 'baseclass'
     name = 'baseclass'
 
 
     @classmethod
     @classmethod
     def detect(cls, data):
     def detect(cls, data):
         return data.startswith(cls.ID)
         return data.startswith(cls.ID)
 
 
-    def __init__(self, **kwargs):
-        pass
+    def __init__(self, level=255, **kwargs):
+        assert 0 <= level <= 255
+        if self.ID is not None:
+            self.id_level = self.ID + bytes((level, ))  # level 255 means "unknown level"
+            assert len(self.id_level) == 2
+        else:
+            self.id_level = None
 
 
     def decide(self, data):
     def decide(self, data):
         """
         """
@@ -85,8 +90,8 @@ cdef class CompressorBase:
         Compress *data* (bytes) and return bytes result. Prepend the ID bytes of this compressor,
         Compress *data* (bytes) and return bytes result. Prepend the ID bytes of this compressor,
         which is needed so that the correct decompressor can be used for decompression.
         which is needed so that the correct decompressor can be used for decompression.
         """
         """
-        # add ID bytes
-        return self.ID + data
+        # add id_level bytes
+        return self.id_level + data
 
 
     def decompress(self, data):
     def decompress(self, data):
         """
         """
@@ -96,7 +101,7 @@ cdef class CompressorBase:
         Only handles input generated by _this_ Compressor - for a general purpose
         Only handles input generated by _this_ Compressor - for a general purpose
         decompression method see *Compressor.decompress*.
         decompression method see *Compressor.decompress*.
         """
         """
-        # strip ID bytes
+        # strip id_level bytes
         return data[2:]
         return data[2:]
 
 
 cdef class DecidingCompressor(CompressorBase):
 cdef class DecidingCompressor(CompressorBase):
@@ -106,8 +111,8 @@ cdef class DecidingCompressor(CompressorBase):
     """
     """
     name = 'decidebaseclass'
     name = 'decidebaseclass'
 
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, level=255, **kwargs):
+        super().__init__(level=level, **kwargs)
 
 
     def _decide(self, data):
     def _decide(self, data):
         """
         """
@@ -148,9 +153,12 @@ class CNONE(CompressorBase):
     """
     """
     none - no compression, just pass through data
     none - no compression, just pass through data
     """
     """
-    ID = b'\x00\x00'
+    ID = b'\x00'
     name = 'none'
     name = 'none'
 
 
+    def __init__(self, level=255, **kwargs):
+        super().__init__(level=level, **kwargs)  # no defined levels for CNONE, so just say "unknown"
+
     def compress(self, data):
     def compress(self, data):
         return super().compress(data)
         return super().compress(data)
 
 
@@ -170,11 +178,11 @@ class LZ4(DecidingCompressor):
         - wrapper releases CPython's GIL to support multithreaded code
         - wrapper releases CPython's GIL to support multithreaded code
         - uses safe lz4 methods that never go beyond the end of the output buffer
         - uses safe lz4 methods that never go beyond the end of the output buffer
     """
     """
-    ID = b'\x01\x00'
+    ID = b'\x01'
     name = 'lz4'
     name = 'lz4'
 
 
-    def __init__(self, **kwargs):
-        pass
+    def __init__(self, level=255, **kwargs):
+        super().__init__(level=level, **kwargs)  # no defined levels for LZ4, so just say "unknown"
 
 
     def _decide(self, idata):
     def _decide(self, idata):
         """
         """
@@ -235,11 +243,11 @@ class LZMA(DecidingCompressor):
     """
     """
     lzma compression / decompression
     lzma compression / decompression
     """
     """
-    ID = b'\x02\x00'
+    ID = b'\x02'
     name = 'lzma'
     name = 'lzma'
 
 
     def __init__(self, level=6, **kwargs):
     def __init__(self, level=6, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
         self.level = level
         self.level = level
         if lzma is None:
         if lzma is None:
             raise ValueError('No lzma support found.')
             raise ValueError('No lzma support found.')
@@ -270,11 +278,11 @@ class ZSTD(DecidingCompressor):
     # This is a NOT THREAD SAFE implementation.
     # This is a NOT THREAD SAFE implementation.
     # Only ONE python context must be created at a time.
     # Only ONE python context must be created at a time.
     # It should work flawlessly as long as borg will call ONLY ONE compression job at time.
     # It should work flawlessly as long as borg will call ONLY ONE compression job at time.
-    ID = b'\x03\x00'
+    ID = b'\x03'
     name = 'zstd'
     name = 'zstd'
 
 
     def __init__(self, level=3, **kwargs):
     def __init__(self, level=3, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
         self.level = level
         self.level = level
 
 
     def _decide(self, idata):
     def _decide(self, idata):
@@ -331,14 +339,52 @@ class ZSTD(DecidingCompressor):
         return dest[:osize]
         return dest[:osize]
 
 
 
 
-class ZLIB(CompressorBase):
+class ZLIB(DecidingCompressor):
     """
     """
     zlib compression / decompression (python stdlib)
     zlib compression / decompression (python stdlib)
     """
     """
-    ID = b'\x08\x00'  # not used here, see detect()
-                      # avoid all 0x.8.. IDs elsewhere!
+    ID = b'\x05'
     name = 'zlib'
     name = 'zlib'
 
 
+    def __init__(self, level=6, **kwargs):
+        super().__init__(level=level, **kwargs)
+        self.level = level
+
+    def _decide(self, data):
+        """
+        Decides what to do with *data*. Returns (compressor, zlib_data).
+
+        *zlib_data* is the ZLIB result if *compressor* is ZLIB as well, otherwise it is None.
+        """
+        zlib_data = zlib.compress(data, self.level)
+        if len(zlib_data) < len(data):
+            return self, zlib_data
+        else:
+            return NONE_COMPRESSOR, None
+
+    def decompress(self, data):
+        data = super().decompress(data)
+        try:
+            return zlib.decompress(data)
+        except zlib.error as e:
+            raise DecompressionError(str(e)) from None
+
+
+class ZLIB_legacy(CompressorBase):
+    """
+    zlib compression / decompression (python stdlib)
+
+    Note: This is the legacy ZLIB support as used by borg < 1.3.
+          It still suffers from attic *only* supporting zlib and not having separate
+          ID bytes to differentiate between differently compressed chunks.
+          This just works because zlib compressed stuff always starts with 0x.8.. bytes.
+          Newer borg uses the ZLIB class that has separate ID bytes (as all the other
+          compressors) and does not need this hack.
+    """
+    ID = b'\x08'  # not used here, see detect()
+    # avoid all 0x.8 IDs elsewhere!
+    name = 'zlib_legacy'
+
     @classmethod
     @classmethod
     def detect(cls, data):
     def detect(cls, data):
         # matches misc. patterns 0x.8.. used by zlib
         # matches misc. patterns 0x.8.. used by zlib
@@ -348,7 +394,7 @@ class ZLIB(CompressorBase):
         return check_ok and is_deflate
         return check_ok and is_deflate
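
The 0x.8.. pattern match relied on here is the standard zlib (RFC 1950) header check; as a standalone illustration (the actual detect() body is unchanged by this diff):

    import zlib

    def is_zlib_stream(data):
        # zlib streams start with CMF/FLG: compression method 8 (deflate) in the low
        # nibble of CMF, and CMF*256+FLG divisible by 31 -- hence the 0x.8.. patterns.
        if len(data) < 2:
            return False
        cmf, flg = data[0], data[1]
        return cmf & 0x0f == 8 and (cmf * 256 + flg) % 31 == 0

    assert is_zlib_stream(zlib.compress(b'hello', 6))        # typically starts with 0x78 0x9c
    assert not is_zlib_stream(b'\x05\x06rest')               # new-style ZLIB ID + level byte
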
 
 
     def __init__(self, level=6, **kwargs):
     def __init__(self, level=6, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
         self.level = level
         self.level = level
 
 
     def compress(self, data):
     def compress(self, data):
@@ -440,14 +486,14 @@ class ObfuscateSize(CompressorBase):
     """
     """
     Meta-Compressor that obfuscates the compressed data size.
     Meta-Compressor that obfuscates the compressed data size.
     """
     """
-    ID = b'\x04\x00'
+    ID = b'\x04'
     name = 'obfuscate'
     name = 'obfuscate'
 
 
-    header_fmt = Struct('>I')
+    header_fmt = Struct('<I')
     header_len = len(header_fmt.pack(0))
     header_len = len(header_fmt.pack(0))
 
 
     def __init__(self, level=None, compressor=None):
     def __init__(self, level=None, compressor=None):
-        super().__init__()
+        super().__init__(level=level)  # data will be encrypted, so we can tell the level
         self.compressor = compressor
         self.compressor = compressor
         if level is None:
         if level is None:
             pass  # decompression
             pass  # decompression
@@ -502,13 +548,14 @@ COMPRESSOR_TABLE = {
     CNONE.name: CNONE,
     CNONE.name: CNONE,
     LZ4.name: LZ4,
     LZ4.name: LZ4,
     ZLIB.name: ZLIB,
     ZLIB.name: ZLIB,
+    ZLIB_legacy.name: ZLIB_legacy,
     LZMA.name: LZMA,
     LZMA.name: LZMA,
     Auto.name: Auto,
     Auto.name: Auto,
     ZSTD.name: ZSTD,
     ZSTD.name: ZSTD,
     ObfuscateSize.name: ObfuscateSize,
     ObfuscateSize.name: ObfuscateSize,
 }
 }
 # List of possible compression types. Does not include Auto, since it is a meta-Compressor.
 # List of possible compression types. Does not include Auto, since it is a meta-Compressor.
-COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, LZMA, ObfuscateSize, ]  # check fast stuff first
+COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, ZLIB_legacy, LZMA, ObfuscateSize, ]  # check fast stuff first
 
 
 def get_compressor(name, **kwargs):
 def get_compressor(name, **kwargs):
     cls = COMPRESSOR_TABLE[name]
     cls = COMPRESSOR_TABLE[name]
@@ -554,7 +601,7 @@ class CompressionSpec:
         self.name = values[0]
         self.name = values[0]
         if self.name in ('none', 'lz4', ):
         if self.name in ('none', 'lz4', ):
             return
             return
-        elif self.name in ('zlib', 'lzma', ):
+        elif self.name in ('zlib', 'lzma', 'zlib_legacy'):  # zlib_legacy just for testing
             if count < 2:
             if count < 2:
                 level = 6  # default compression level in py stdlib
                 level = 6  # default compression level in py stdlib
             elif count == 2:
             elif count == 2:
@@ -597,7 +644,7 @@ class CompressionSpec:
     def compressor(self):
     def compressor(self):
         if self.name in ('none', 'lz4', ):
         if self.name in ('none', 'lz4', ):
             return get_compressor(self.name)
             return get_compressor(self.name)
-        elif self.name in ('zlib', 'lzma', 'zstd', ):
+        elif self.name in ('zlib', 'lzma', 'zstd', 'zlib_legacy'):
             return get_compressor(self.name, level=self.level)
             return get_compressor(self.name, level=self.level)
         elif self.name == 'auto':
         elif self.name == 'auto':
             return get_compressor(self.name, compressor=self.inner.compressor)
             return get_compressor(self.name, compressor=self.inner.compressor)

+ 1 - 1
src/borg/constants.py

@@ -1,5 +1,5 @@
 # this set must be kept complete, otherwise the RobustUnpacker might malfunction:
 # this set must be kept complete, otherwise the RobustUnpacker might malfunction:
-ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master',
+ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master', 'hlid',
                        'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'birthtime', 'size',
                        'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'birthtime', 'size',
                        'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended',
                        'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended',
                        'part'])
                        'part'])

+ 14 - 25
src/borg/fuse.py

@@ -35,7 +35,8 @@ from .crypto.low_level import blake2b_128
 from .archiver import Archiver
 from .archiver import Archiver
 from .archive import Archive, get_item_uid_gid
 from .archive import Archive, get_item_uid_gid
 from .hashindex import FuseVersionsIndex
 from .hashindex import FuseVersionsIndex
-from .helpers import daemonize, daemonizing, hardlinkable, signal_handler, format_file_size, Error
+from .helpers import daemonize, daemonizing, signal_handler, format_file_size, Error
+from .helpers import HardLinkManager
 from .helpers import msgpack
 from .helpers import msgpack
 from .item import Item
 from .item import Item
 from .lrucache import LRUCache
 from .lrucache import LRUCache
@@ -339,15 +340,9 @@ class FuseBackend:
                           consider_part_files=self._args.consider_part_files)
                           consider_part_files=self._args.consider_part_files)
         strip_components = self._args.strip_components
         strip_components = self._args.strip_components
         matcher = Archiver.build_matcher(self._args.patterns, self._args.paths)
         matcher = Archiver.build_matcher(self._args.patterns, self._args.paths)
-        partial_extract = not matcher.empty() or strip_components
-        hardlink_masters = {} if partial_extract else None
+        hlm = HardLinkManager(id_type=bytes, info_type=str)  # hlid -> path
 
 
-        def peek_and_store_hardlink_masters(item, matched):
-            if (partial_extract and not matched and hardlinkable(item.mode) and
-                    item.get('hardlink_master', True) and 'source' not in item):
-                hardlink_masters[item.get('path')] = (item.get('chunks'), None)
-
-        filter = Archiver.build_filter(matcher, peek_and_store_hardlink_masters, strip_components)
+        filter = Archiver.build_filter(matcher, strip_components)
         for item_inode, item in self.cache.iter_archive_items(archive.metadata.items, filter=filter,
         for item_inode, item in self.cache.iter_archive_items(archive.metadata.items, filter=filter,
                                                               consider_part_files=self._args.consider_part_files):
                                                               consider_part_files=self._args.consider_part_files):
             if strip_components:
             if strip_components:
@@ -369,15 +364,13 @@ class FuseBackend:
             parent = 1
             parent = 1
             for segment in segments[:-1]:
             for segment in segments[:-1]:
                 parent = self._process_inner(segment, parent)
                 parent = self._process_inner(segment, parent)
-            self._process_leaf(segments[-1], item, parent, prefix, is_dir, item_inode,
-                               hardlink_masters, strip_components)
+            self._process_leaf(segments[-1], item, parent, prefix, is_dir, item_inode, hlm)
         duration = time.perf_counter() - t0
         duration = time.perf_counter() - t0
         logger.debug('fuse: _process_archive completed in %.1f s for archive %s', duration, archive.name)
         logger.debug('fuse: _process_archive completed in %.1f s for archive %s', duration, archive.name)
 
 
-    def _process_leaf(self, name, item, parent, prefix, is_dir, item_inode, hardlink_masters, stripped_components):
+    def _process_leaf(self, name, item, parent, prefix, is_dir, item_inode, hlm):
         path = item.path
         path = item.path
         del item.path  # save some space
         del item.path  # save some space
-        hardlink_masters = hardlink_masters or {}
 
 
         def file_version(item, path):
         def file_version(item, path):
             if 'chunks' in item:
             if 'chunks' in item:
@@ -402,10 +395,9 @@ class FuseBackend:
             version_enc = os.fsencode('.%05d' % version)
             version_enc = os.fsencode('.%05d' % version)
             return name + version_enc + ext
             return name + version_enc + ext
 
 
-        if 'source' in item and hardlinkable(item.mode):
-            source = os.sep.join(item.source.split(os.sep)[stripped_components:])
-            chunks, link_target = hardlink_masters.get(item.source, (None, source))
-            if link_target:
+        if 'hlid' in item:
+            link_target = hlm.retrieve(id=item.hlid, default=None)
+            if link_target is not None:
                 # Hard link was extracted previously, just link
                 # Hard link was extracted previously, just link
                 link_target = os.fsencode(link_target)
                 link_target = os.fsencode(link_target)
                 if self.versions:
                 if self.versions:
@@ -415,19 +407,16 @@ class FuseBackend:
                 try:
                 try:
                     inode = self.find_inode(link_target, prefix)
                     inode = self.find_inode(link_target, prefix)
                 except KeyError:
                 except KeyError:
-                    logger.warning('Skipping broken hard link: %s -> %s', path, source)
+                    logger.warning('Skipping broken hard link: %s -> %s', path, link_target)
                     return
                     return
                 item = self.get_item(inode)
                 item = self.get_item(inode)
                 item.nlink = item.get('nlink', 1) + 1
                 item.nlink = item.get('nlink', 1) + 1
                 self._items[inode] = item
                 self._items[inode] = item
-            elif chunks is not None:
-                # assign chunks to this item, since the item which had the chunks was not extracted
-                item.chunks = chunks
+            else:
                 inode = item_inode
                 inode = item_inode
                 self._items[inode] = item
                 self._items[inode] = item
-                if hardlink_masters:
-                    # Update master entry with extracted item path, so that following hardlinks don't extract twice.
-                    hardlink_masters[item.source] = (None, path)
+                # remember extracted item path, so that following hardlinks don't extract twice.
+                hlm.remember(id=item.hlid, info=path)
         else:
         else:
             inode = item_inode
             inode = item_inode
 
 
@@ -436,7 +425,7 @@ class FuseBackend:
             enc_path = os.fsencode(path)
             enc_path = os.fsencode(path)
             version = file_version(item, enc_path)
             version = file_version(item, enc_path)
             if version is not None:
             if version is not None:
-                # regular file, with contents - maybe a hardlink master
+                # regular file, with contents
                 name = make_versioned_name(name, version)
                 name = make_versioned_name(name, version)
                 self.file_versions[enc_path] = version
                 self.file_versions[enc_path] = version
 
 

+ 71 - 3
src/borg/helpers/fs.py

@@ -1,4 +1,5 @@
 import errno
 import errno
+import hashlib
 import os
 import os
 import os.path
 import os.path
 import re
 import re
@@ -165,9 +166,76 @@ def make_path_safe(path):
     return _safe_re.sub('', path) or '.'
     return _safe_re.sub('', path) or '.'
 
 
 
 
-def hardlinkable(mode):
-    """return True if we support hardlinked items of this type"""
-    return stat.S_ISREG(mode) or stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode)
+class HardLinkManager:
+    """
+    Manage hardlinks (and avoid code duplication doing so).
+
+    A) When creating a borg2 archive from the filesystem, we have to maintain a mapping like:
+       (dev, ino) -> (hlid, chunks)  # for fs_hl_targets
+       If we encounter the same (dev, ino) again later, we'll just re-use the hlid and chunks list.
+
+    B) When extracting a borg2 archive to the filesystem, we have to maintain a mapping like:
+       hlid -> path
+       If we encounter the same hlid again later, we hardlink to the already extracted path recorded for that hlid.
+
+    C) When transferring from a borg1 archive, we need:
+       path -> chunks, chunks_healthy  # for borg1_hl_targets
+       If we encounter a regular file item with source == path later, we reuse chunks and chunks_healthy
+       and create the same hlid = hardlink_id_from_path(source).
+
+    D) When importing a tar file (simplified 1-pass way for now, not creating borg hardlink items):
+       path -> chunks
+       If we encounter a LNK tar entry later with linkname==path, we re-use the chunks and create a regular file item.
+       For better hardlink support (including the very first hardlink item for each group of same-target hardlinks),
+       we would need a 2-pass processing, which is not yet implemented.
+    """
+    def __init__(self, *, id_type, info_type):
+        self._map = {}
+        self.id_type = id_type
+        self.info_type = info_type
+
+    def borg1_hardlinkable(self, mode):  # legacy
+        return stat.S_ISREG(mode) or stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode)
+
+    def borg1_hardlink_master(self, item):  # legacy
+        return item.get('hardlink_master', True) and 'source' not in item and self.borg1_hardlinkable(item.mode)
+
+    def borg1_hardlink_slave(self, item):  # legacy
+        return 'source' in item and self.borg1_hardlinkable(item.mode)
+
+    def hardlink_id_from_path(self, path):
+        """compute a hardlink id from a path"""
+        assert isinstance(path, bytes)
+        return hashlib.sha256(path).digest()
+
+    def hardlink_id_from_inode(self, *, ino, dev):
+        """compute a hardlink id from an inode"""
+        assert isinstance(ino, int)
+        assert isinstance(dev, int)
+        return hashlib.sha256(f'{ino}/{dev}'.encode()).digest()
+
+    def remember(self, *, id, info):
+        """
+        remember stuff from a (usually contentful) item.
+
+        :param id: some id used to reference the contentful item, could be:
+                   a path (tar style, old borg style) [bytes]
+                   a hlid (new borg style) [bytes]
+                   a (dev, inode) tuple (filesystem)
+        :param info: information to remember, could be:
+                     chunks / chunks_healthy list
+                     hlid
+        """
+        assert isinstance(id, self.id_type), f"key is {key!r}, not of type {self.key_type}"
+        assert isinstance(info, self.info_type), f"info is {info!r}, not of type {self.info_type}"
+        self._map[id] = info
+
+    def retrieve(self, id, *, default=None):
+        """
+        retrieve stuff to use it in a (usually contentless) item.
+        """
+        assert isinstance(id, self.id_type)
+        return self._map.get(id, default)
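
A minimal usage sketch for case B above (extract-time: hlid -> path). The file names and the 32-byte dummy hlid are made up for the example; it assumes hardlink support on the filesystem and that the class is importable from borg.helpers, as the imports elsewhere in this PR suggest.

    import os
    import tempfile
    from borg.helpers import HardLinkManager

    hlm = HardLinkManager(id_type=bytes, info_type=str)   # hlid -> path
    hlid = b'\x01' * 32                                    # stands in for a real item hlid

    with tempfile.TemporaryDirectory() as tmp:
        first, second = os.path.join(tmp, 'file1'), os.path.join(tmp, 'hardlink')
        if hlm.retrieve(id=hlid) is None:                  # first item with this hlid:
            with open(first, 'wb') as f:                   # extract the contents normally ...
                f.write(b'content')
            hlm.remember(id=hlid, info=first)              # ... and remember the path
        os.link(hlm.retrieve(id=hlid), second)             # later items: just hardlink
        assert os.stat(first).st_ino == os.stat(second).st_ino

        # case A (create-time) uses a stable id derived from (dev, ino):
        st = os.stat(first)
        assert hlm.hardlink_id_from_inode(ino=st.st_ino, dev=st.st_dev) == \
               hlm.hardlink_id_from_inode(ino=st.st_ino, dev=st.st_dev)
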
 
 
 
 
 def scandir_keyfunc(dirent):
 def scandir_keyfunc(dirent):

+ 14 - 3
src/borg/helpers/msgpack.py

@@ -24,7 +24,7 @@ from msgpack import unpackb as mp_unpackb
 from msgpack import unpack as mp_unpack
 from msgpack import unpack as mp_unpack
 from msgpack import version as mp_version
 from msgpack import version as mp_version
 
 
-from msgpack import ExtType
+from msgpack import ExtType, Timestamp
 from msgpack import OutOfData
 from msgpack import OutOfData
 
 
 
 
@@ -164,7 +164,7 @@ def get_limited_unpacker(kind):
     return Unpacker(**args)
     return Unpacker(**args)
 
 
 
 
-def bigint_to_int(mtime):
+def bigint_to_int(mtime):  # legacy
     """Convert bytearray to int
     """Convert bytearray to int
     """
     """
     if isinstance(mtime, bytes):
     if isinstance(mtime, bytes):
@@ -172,7 +172,7 @@ def bigint_to_int(mtime):
     return mtime
     return mtime
 
 
 
 
-def int_to_bigint(value):
+def int_to_bigint(value):  # legacy
     """Convert integers larger than 64 bits to bytearray
     """Convert integers larger than 64 bits to bytearray
 
 
     Smaller integers are left alone
     Smaller integers are left alone
@@ -180,3 +180,14 @@ def int_to_bigint(value):
     if value.bit_length() > 63:
     if value.bit_length() > 63:
         return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True)
         return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True)
     return value
     return value
+
+
+def int_to_timestamp(ns):
+    return Timestamp.from_unix_nano(ns)
+
+
+def timestamp_to_int(ts):
+    if isinstance(ts, Timestamp):
+        return ts.to_unix_nano()
+    # legacy support note: we need to keep the bigint conversion for compatibility with borg < 1.3 archives.
+    return bigint_to_int(ts)
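
A small round trip with the new helpers (the nanosecond value is arbitrary): msgpack.Timestamp stores seconds plus nanoseconds, so full ns precision survives, while plain ints (or the old bigint byte strings) still decode via the legacy path.

    from msgpack import Timestamp
    from borg.helpers.msgpack import int_to_timestamp, timestamp_to_int

    ns = 1_650_000_000_000_000_300            # some mtime in nanoseconds
    ts = int_to_timestamp(ns)                 # new style: msgpack Timestamp (sec + nsec)
    assert isinstance(ts, Timestamp)
    assert timestamp_to_int(ts) == ns         # exact round trip

    # values read from old archives may still be bigint-encoded; timestamp_to_int()
    # falls back to bigint_to_int() for those:
    assert timestamp_to_int(ns) == ns
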

+ 10 - 7
src/borg/helpers/parseformat.py

@@ -19,6 +19,7 @@ logger = create_logger()
 
 
 from .errors import Error
 from .errors import Error
 from .fs import get_keys_dir
 from .fs import get_keys_dir
+from .msgpack import Timestamp
 from .time import OutputTimestamp, format_time, to_localtime, safe_timestamp, safe_s
 from .time import OutputTimestamp, format_time, to_localtime, safe_timestamp, safe_s
 from .. import __version__ as borg_version
 from .. import __version__ as borg_version
 from .. import __version_tuple__ as borg_version_tuple
 from .. import __version_tuple__ as borg_version_tuple
@@ -694,7 +695,8 @@ class ItemFormatter(BaseFormatter):
     KEY_DESCRIPTIONS = {
     KEY_DESCRIPTIONS = {
         'bpath': 'verbatim POSIX path, can contain any character except NUL',
         'bpath': 'verbatim POSIX path, can contain any character except NUL',
         'path': 'path interpreted as text (might be missing non-text characters, see bpath)',
         'path': 'path interpreted as text (might be missing non-text characters, see bpath)',
-        'source': 'link target for links (identical to linktarget)',
+        'source': 'link target for symlinks (identical to linktarget)',
+        'hlid': 'hard link identity (same if hardlinking same fs object)',
         'extra': 'prepends {source} with " -> " for soft links and " link to " for hard links',
         'extra': 'prepends {source} with " -> " for soft links and " link to " for hard links',
         'csize': 'compressed size',
         'csize': 'compressed size',
         'dsize': 'deduplicated size',
         'dsize': 'deduplicated size',
@@ -705,7 +707,7 @@ class ItemFormatter(BaseFormatter):
         'health': 'either "healthy" (file ok) or "broken" (if file has all-zero replacement chunks)',
         'health': 'either "healthy" (file ok) or "broken" (if file has all-zero replacement chunks)',
     }
     }
     KEY_GROUPS = (
     KEY_GROUPS = (
-        ('type', 'mode', 'uid', 'gid', 'user', 'group', 'path', 'bpath', 'source', 'linktarget', 'flags'),
+        ('type', 'mode', 'uid', 'gid', 'user', 'group', 'path', 'bpath', 'source', 'linktarget', 'hlid', 'flags'),
         ('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'),
         ('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'),
         ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'),
         ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'),
         tuple(sorted(hash_algorithms)),
         tuple(sorted(hash_algorithms)),
@@ -801,11 +803,9 @@ class ItemFormatter(BaseFormatter):
         extra = ''
         extra = ''
         if source:
         if source:
             source = remove_surrogates(source)
             source = remove_surrogates(source)
-            if item_type == 'l':
-                extra = ' -> %s' % source
-            else:
-                mode = 'h' + mode[1:]
-                extra = ' link to %s' % source
+            extra = ' -> %s' % source
+        hlid = item.get('hlid')
+        hlid = bin_to_hex(hlid) if hlid else ''
         item_data['type'] = item_type
         item_data['type'] = item_type
         item_data['mode'] = mode
         item_data['mode'] = mode
         item_data['user'] = item.user or item.uid
         item_data['user'] = item.user or item.uid
@@ -821,6 +821,7 @@ class ItemFormatter(BaseFormatter):
             item_data['health'] = 'broken' if 'chunks_healthy' in item else 'healthy'
             item_data['health'] = 'broken' if 'chunks_healthy' in item else 'healthy'
         item_data['source'] = source
         item_data['source'] = source
         item_data['linktarget'] = source
         item_data['linktarget'] = source
+        item_data['hlid'] = hlid
         item_data['flags'] = item.get('bsdflags')
         item_data['flags'] = item.get('bsdflags')
         for key in self.used_call_keys:
         for key in self.used_call_keys:
             item_data[key] = self.call_keys[key](item)
             item_data[key] = self.call_keys[key](item)
@@ -1043,6 +1044,8 @@ def prepare_dump_dict(d):
                 value = decode_tuple(value)
                 value = decode_tuple(value)
             elif isinstance(value, bytes):
             elif isinstance(value, bytes):
                 value = decode_bytes(value)
                 value = decode_bytes(value)
+            elif isinstance(value, Timestamp):
+                value = value.to_unix_nano()
             if isinstance(key, bytes):
             if isinstance(key, bytes):
                 key = key.decode()
                 key = key.decode()
             res[key] = value
             res[key] = value

+ 7 - 8
src/borg/item.pyx

@@ -3,9 +3,9 @@ from collections import namedtuple
 
 
 from .constants import ITEM_KEYS, ARCHIVE_KEYS
 from .constants import ITEM_KEYS, ARCHIVE_KEYS
 from .helpers import safe_encode, safe_decode
 from .helpers import safe_encode, safe_decode
-from .helpers import bigint_to_int, int_to_bigint
 from .helpers import StableDict
 from .helpers import StableDict
 from .helpers import format_file_size
 from .helpers import format_file_size
+from .helpers.msgpack import timestamp_to_int, int_to_timestamp
 
 
 
 
 cdef extern from "_item.c":
 cdef extern from "_item.c":
@@ -171,17 +171,17 @@ class Item(PropDict):
     rdev = PropDict._make_property('rdev', int)
     rdev = PropDict._make_property('rdev', int)
     bsdflags = PropDict._make_property('bsdflags', int)
     bsdflags = PropDict._make_property('bsdflags', int)
 
 
-    # note: we need to keep the bigint conversion for compatibility with borg 1.0 archives.
-    atime = PropDict._make_property('atime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
-    ctime = PropDict._make_property('ctime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
-    mtime = PropDict._make_property('mtime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
-    birthtime = PropDict._make_property('birthtime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int)
+    atime = PropDict._make_property('atime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    ctime = PropDict._make_property('ctime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    mtime = PropDict._make_property('mtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    birthtime = PropDict._make_property('birthtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
 
 
     # size is only present for items with a chunk list and then it is sum(chunk_sizes)
     # size is only present for items with a chunk list and then it is sum(chunk_sizes)
     # compatibility note: this is a new feature, in old archives size will be missing.
     # compatibility note: this is a new feature, in old archives size will be missing.
     size = PropDict._make_property('size', int)
     size = PropDict._make_property('size', int)
 
 
-    hardlink_master = PropDict._make_property('hardlink_master', bool)
+    hlid = PropDict._make_property('hlid', bytes)  # hard link id: same value means same hard link.
+    hardlink_master = PropDict._make_property('hardlink_master', bool)  # legacy
 
 
     chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None')
     chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None')
     chunks_healthy = PropDict._make_property('chunks_healthy', (list, type(None)), 'list or None')
     chunks_healthy = PropDict._make_property('chunks_healthy', (list, type(None)), 'list or None')
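
The new hlid property replaces the old hardlink master/slave mechanism: every item that refers to the same inode carries the same opaque id, so any of them can be extracted independently. One plausible way to derive such an id, assuming it is built from the file's (device, inode) identity; this is a sketch, not necessarily the derivation used by borg's HardLinkManager:

    # illustrative sketch, not borg source code
    import hashlib
    import os

    def hardlink_id(st):
        # same (st_dev, st_ino) => same hlid => same hard link group
        return hashlib.sha256(f'{st.st_dev}\0{st.st_ino}'.encode()).digest()

    st = os.lstat(__file__)
    if st.st_nlink > 1:            # only hard-linked items need an hlid
        hlid = hardlink_id(st)
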
@@ -214,7 +214,6 @@ class Item(PropDict):
         except AttributeError:
             if stat.S_ISLNK(self.mode):
                 # get out of here quickly. symlinks have no own chunks, their fs size is the length of the target name.
-                # also, there is the dual-use issue of .source (#2343), so don't confuse it with a hardlink slave.
                 return len(self.source)
             # no precomputed (c)size value available, compute it:
             try:

+ 35 - 44
src/borg/testsuite/archiver.py

@@ -321,7 +321,7 @@ class ArchiverTestCaseBase(BaseTestCase):
                 contents = b'X' * size
             fd.write(contents)

-    def create_test_files(self):
+    def create_test_files(self, create_hardlinks=True):
         """Create a minimal test case including all supported file types
         """
         # File
@@ -332,7 +332,7 @@ class ArchiverTestCaseBase(BaseTestCase):
         # File mode
         os.chmod('input/file1', 0o4755)
         # Hard link
-        if are_hardlinks_supported():
+        if are_hardlinks_supported() and create_hardlinks:
             os.link(os.path.join(self.input_path, 'file1'),
                     os.path.join(self.input_path, 'hardlink'))
         # Symlink
@@ -432,7 +432,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
             self.assert_in(name, list_output)
         self.assert_dirs_equal('input', 'output/input')
         info_output = self.cmd('info', self.repository_location + '::test')
-        item_count = 4 if has_lchflags else 5  # one file is UF_NODUMP
+        item_count = 5 if has_lchflags else 6  # one file is UF_NODUMP
         self.assert_in('Number of files: %d' % item_count, info_output)
         shutil.rmtree(self.cache_path)
         info_output2 = self.cmd('info', self.repository_location + '::test')
@@ -506,6 +506,29 @@ class ArchiverTestCase(ArchiverTestCaseBase):
             self.cmd('extract', self.repository_location + '::test')
             assert os.readlink('input/link1') == 'somewhere'

+    @pytest.mark.skipif(not are_symlinks_supported() or not are_hardlinks_supported(),
+                        reason='symlinks or hardlinks not supported')
+    def test_hardlinked_symlinks_extract(self):
+        self.create_regular_file('target', size=1024)
+        with changedir('input'):
+            os.symlink('target', 'symlink1')
+            os.link('symlink1', 'symlink2', follow_symlinks=False)
+        self.cmd('init', '--encryption=repokey', self.repository_location)
+        self.cmd('create', self.repository_location + '::test', 'input')
+        with changedir('output'):
+            output = self.cmd('extract', self.repository_location + '::test')
+            print(output)
+            with changedir('input'):
+                assert os.path.exists('target')
+                assert os.readlink('symlink1') == 'target'
+                assert os.readlink('symlink2') == 'target'
+                st1 = os.stat('symlink1', follow_symlinks=False)
+                st2 = os.stat('symlink2', follow_symlinks=False)
+                assert st1.st_nlink == 2
+                assert st2.st_nlink == 2
+                assert st1.st_ino == st2.st_ino
+                assert st1.st_size == st2.st_size
+
     @pytest.mark.skipif(not is_utime_fully_supported(), reason='cannot properly setup and execute test without utime')
     def test_atime(self):
         def has_noatime(some_file):
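
The new test exercises hard links whose target is a symlink itself. It relies on os.link(..., follow_symlinks=False) and on lstat-style inode checks; a standalone illustration of that platform behaviour (assuming the OS supports hard-linking symlinks, e.g. Linux via linkat(2)), not borg code:

    # illustrative sketch of the platform behaviour, not borg source code
    import os
    import tempfile

    d = tempfile.mkdtemp()
    open(os.path.join(d, 'target'), 'w').close()
    os.symlink('target', os.path.join(d, 'symlink1'))
    # follow_symlinks=False links the symlink itself, not its target
    os.link(os.path.join(d, 'symlink1'), os.path.join(d, 'symlink2'),
            follow_symlinks=False)
    st1 = os.lstat(os.path.join(d, 'symlink1'))
    st2 = os.lstat(os.path.join(d, 'symlink2'))
    assert st1.st_ino == st2.st_ino and st1.st_nlink == 2
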
@@ -2442,7 +2465,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
     def test_compression_zlib_compressible(self):
         size, csize = self._get_sizes('zlib', compressible=True)
         assert csize < size * 0.1
-        assert csize == 35
+        assert csize == 37

     def test_compression_zlib_uncompressible(self):
         size, csize = self._get_sizes('zlib', compressible=False)
@@ -2451,7 +2474,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
     def test_compression_auto_compressible(self):
         size, csize = self._get_sizes('auto,zlib', compressible=True)
         assert csize < size * 0.1
-        assert csize == 35  # same as compression 'zlib'
+        assert csize == 37  # same as compression 'zlib'

     def test_compression_auto_uncompressible(self):
         size, csize = self._get_sizes('auto,zlib', compressible=False)
@@ -2661,7 +2684,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
                 hl3 = os.path.join(mountpoint, 'input', 'hardlink3', 'hardlink3.00001')
                 assert os.stat(hl1).st_ino == os.stat(hl2).st_ino == os.stat(hl3).st_ino
                 assert open(hl3, 'rb').read() == b'123456'
-        # similar again, but exclude the hardlink master:
+        # similar again, but exclude the 1st hardlink:
         with self.fuse_mount(self.repository_location, mountpoint, '-o', 'versions', '-e', 'input/hardlink1'):
             if are_hardlinks_supported():
                 hl2 = os.path.join(mountpoint, 'input', 'hardlink2', 'hardlink2.00001')
@@ -3475,7 +3498,7 @@ id: 2 / e29442 3506da 4e1ea7 / 25f62a 5a3d41 - 02
             assert os.stat('input/dir1/source2').st_nlink == 2

     def test_import_tar(self, tar_format='PAX'):
-        self.create_test_files()
+        self.create_test_files(create_hardlinks=False)  # hardlinks become separate files
         os.unlink('input/flagfile')
         self.cmd('init', '--encryption=none', self.repository_location)
         self.cmd('create', self.repository_location + '::src', 'input')
@@ -3489,7 +3512,7 @@ id: 2 / e29442 3506da 4e1ea7 / 25f62a 5a3d41 - 02
     def test_import_tar_gz(self, tar_format='GNU'):
         if not shutil.which('gzip'):
             pytest.skip('gzip is not installed')
-        self.create_test_files()
+        self.create_test_files(create_hardlinks=False)  # hardlinks become separate files
         os.unlink('input/flagfile')
         self.cmd('init', '--encryption=none', self.repository_location)
         self.cmd('create', self.repository_location + '::src', 'input')
@@ -3850,7 +3873,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
                 'username': 'bar',
                 'name': 'archive1',
                 'time': '2016-12-15T18:49:51.849711',
-                'version': 1,
+                'version': 2,
             })
             archive_id = key.id_hash(archive)
             repository.put(archive_id, key.encrypt(archive_id, archive))
@@ -3907,35 +3930,6 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
             repository.commit(compact=False)
         self.cmd('check', self.repository_location, exit_code=1)

-    def test_attic013_acl_bug(self):
-        # Attic up to release 0.13 contained a bug where every item unintentionally received
-        # a b'acl'=None key-value pair.
-        # This bug can still live on in Borg repositories (through borg upgrade).
-        class Attic013Item:
-            def as_dict(self):
-                return {
-                    # These are required
-                    b'path': '1234',
-                    b'mtime': 0,
-                    b'mode': 0,
-                    b'user': b'0',
-                    b'group': b'0',
-                    b'uid': 0,
-                    b'gid': 0,
-                    # acl is the offending key.
-                    b'acl': None,
-                }
-
-        archive, repository = self.open_archive('archive1')
-        with repository:
-            manifest, key = Manifest.load(repository, Manifest.NO_OPERATION_CHECK)
-            with Cache(repository, key, manifest) as cache:
-                archive = Archive(repository, key, manifest, '0.13', cache=cache, create=True)
-                archive.items_buffer.add(Attic013Item())
-                archive.save()
-        self.cmd('check', self.repository_location, exit_code=0)
-        self.cmd('list', self.repository_location + '::0.13', exit_code=0)
-

 class ManifestAuthenticationTest(ArchiverTestCaseBase):
     def spoof_manifest(self, repository):
@@ -4473,26 +4467,23 @@ def test_chunk_content_equal():


 class TestBuildFilter:
-    @staticmethod
-    def peek_and_store_hardlink_masters(item, matched):
-        pass

     def test_basic(self):
         matcher = PatternMatcher()
         matcher.add([parse_pattern('included')], IECommand.Include)
-        filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, 0)
+        filter = Archiver.build_filter(matcher, 0)
         assert filter(Item(path='included'))
         assert filter(Item(path='included/file'))
         assert not filter(Item(path='something else'))

     def test_empty(self):
         matcher = PatternMatcher(fallback=True)
-        filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, 0)
+        filter = Archiver.build_filter(matcher, 0)
         assert filter(Item(path='anything'))

     def test_strip_components(self):
         matcher = PatternMatcher(fallback=True)
-        filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, strip_components=1)
+        filter = Archiver.build_filter(matcher, strip_components=1)
         assert not filter(Item(path='shallow'))
         assert not filter(Item(path='shallow/'))  # can this even happen? paths are normalized...
         assert filter(Item(path='deep enough/file'))
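
With the hardlink master/slave bookkeeping gone, build_filter only needs the matcher and --strip-components. A minimal sketch of such a filter under that assumption (not the verbatim Archiver.build_filter):

    # illustrative sketch, not borg source code
    import os

    def build_filter(matcher, strip_components=0):
        # matcher: a PatternMatcher-like object with a match(path) method
        if strip_components:
            def item_filter(item):
                matched = matcher.match(item.path)
                # drop items whose path would vanish entirely after stripping
                return matched and len(item.path.split(os.sep)) > strip_components
        else:
            def item_filter(item):
                return matcher.match(item.path)
        return item_filter
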

+ 2 - 2
src/borg/testsuite/compress.py

@@ -88,11 +88,11 @@ def test_autodetect_invalid():
         Compressor(**params).decompress(b'\x08\x00notreallyzlib')


-def test_zlib_compat():
+def test_zlib_legacy_compat():
     # for compatibility reasons, we do not add an extra header for zlib,
     # nor do we expect one when decompressing / autodetecting
     for level in range(10):
-        c = get_compressor(name='zlib', level=level)
+        c = get_compressor(name='zlib_legacy', level=level)
         cdata1 = c.compress(data)
         cdata2 = zlib.compress(data, level)
         assert cdata1 == cdata2
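
The legacy zlib compressor can get away without an extra header because a zlib stream is self-identifying: per RFC 1950, the low nibble of the first byte is 8 (deflate) and the first two bytes, read as a big-endian number, are divisible by 31. A small check of that property (the classic zlib header rule, shown as an illustration rather than borg's detection code):

    # illustrative sketch, not borg source code
    import zlib

    def looks_like_zlib(data):
        if len(data) < 2:
            return False
        cmf, flg = data[0], data[1]
        return cmf & 0x0f == 8 and (cmf * 256 + flg) % 31 == 0

    assert looks_like_zlib(zlib.compress(b'some data', 6))
    assert not looks_like_zlib(b'\x08\x00notreallyzlib')
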

+ 4 - 3
src/borg/testsuite/item.py

@@ -3,6 +3,7 @@ import pytest
 from ..cache import ChunkListEntry
 from ..item import Item
 from ..helpers import StableDict
+from ..helpers.msgpack import Timestamp


 def test_item_empty():
@@ -77,15 +78,15 @@ def test_item_int_property():
         item.mode = "invalid"


-def test_item_bigint_property():
+def test_item_mptimestamp_property():
     item = Item()
     small, big = 42, 2 ** 65
     item.atime = small
     assert item.atime == small
-    assert item.as_dict() == {'atime': small}
+    assert item.as_dict() == {'atime': Timestamp.from_unix_nano(small)}
     item.atime = big
     assert item.atime == big
-    assert item.as_dict() == {'atime': b'\0' * 8 + b'\x02'}
+    assert item.as_dict() == {'atime': Timestamp.from_unix_nano(big)}


 def test_item_user_group_none():

+ 4 - 4
src/borg/testsuite/key.py

@@ -256,8 +256,8 @@ class TestKey:
         plaintext = b'123456789'
         id = key.id_hash(plaintext)
         authenticated = key.encrypt(id, plaintext)
-        # 0x07 is the key TYPE, \x0000 identifies no compression.
-        assert authenticated == b'\x07\x00\x00' + plaintext
+        # 0x07 is the key TYPE, \x00ff identifies no compression / unknown level.
+        assert authenticated == b'\x07\x00\xff' + plaintext

     def test_blake2_authenticated_encrypt(self, monkeypatch):
         monkeypatch.setenv('BORG_PASSPHRASE', 'test')
@@ -267,8 +267,8 @@ class TestKey:
         plaintext = b'123456789'
         id = key.id_hash(plaintext)
         authenticated = key.encrypt(id, plaintext)
-        # 0x06 is the key TYPE, 0x0000 identifies no compression.
-        assert authenticated == b'\x06\x00\x00' + plaintext
+        # 0x06 is the key TYPE, 0x00ff identifies no compression / unknown level.
+        assert authenticated == b'\x06\x00\xff' + plaintext


 class TestTAM:
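
The two key tests above reflect the new stored-object layout: after the key TYPE byte, every object now carries a two-byte compression header. Going by the test comments, 0x00 in the first byte means "not compressed" and 0xff in the second byte means "level unknown / not applicable"; this also plausibly accounts for the csize changing from 35 to 37 in the zlib compression tests, since zlib data previously carried no such header. A hedged sketch of splitting that header off a decrypted payload (field meanings inferred from the test comments, not borg's exact code):

    # illustrative sketch, not borg source code
    def split_compression_header(payload):
        ctype, clevel = payload[0], payload[1]
        # ctype 0x00 = no compression, clevel 0xff = unknown / not applicable
        return ctype, clevel, payload[2:]

    ctype, clevel, data = split_compression_header(b'\x00\xff' + b'123456789')
    assert (ctype, clevel, data) == (0x00, 0xff, b'123456789')
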