Merge pull request #6704 from ThomasWaldmann/msgpack-str-bytes-cleanup

borg2: cleanup msgpack related str/bytes mess
TW 3 years ago
parent commit 2c9be35886

+ 19 - 19
docs/internals/data-structures.rst

@@ -329,17 +329,17 @@ or modified. It looks like this:
 .. code-block:: python
 
     {
-        b'version': 1,
-        b'timestamp': b'2017-05-05T12:42:23.042864',
-        b'item_keys': [b'acl_access', b'acl_default', ...],
-        b'config': {},
-        b'archives': {
-            b'2017-05-05-system-backup': {
-                b'id': b'<32 byte binary object ID>',
-                b'time': b'2017-05-05T12:42:22.942864',
+        'version': 1,
+        'timestamp': '2017-05-05T12:42:23.042864',
+        'item_keys': ['acl_access', 'acl_default', ...],
+        'config': {},
+        'archives': {
+            '2017-05-05-system-backup': {
+                'id': b'<32 byte binary object ID>',
+                'time': '2017-05-05T12:42:22.942864',
             },
         },
-        b'tam': ...,
+        'tam': ...,
     }
 
 The *version* field can be either 1 or 2. The versions differ in the
@@ -393,15 +393,15 @@ The *config* key stores the feature flags enabled on a repository:
 .. code-block:: python
 
     config = {
-        b'feature_flags': {
-            b'read': {
-                b'mandatory': [b'some_feature'],
+        'feature_flags': {
+            'read': {
+                'mandatory': ['some_feature'],
             },
-            b'check': {
-                b'mandatory': [b'other_feature'],
+            'check': {
+                'mandatory': ['other_feature'],
             },
-            b'write': ...,
-            b'delete': ...
+            'write': ...,
+            'delete': ...
         },
     }
 
@@ -1220,9 +1220,9 @@ transaction ID in the file names. Integrity data is stored in a third file
 .. code-block:: python
 
     {
-        b'version': 2,
-        b'hints': b'{"algorithm": "XXH64", "digests": {"final": "411208db2aa13f1a"}}',
-        b'index': b'{"algorithm": "XXH64", "digests": {"HashHeader": "846b7315f91b8e48", "final": "cb3e26cadc173e40"}}'
+        'version': 2,
+        'hints': '{"algorithm": "XXH64", "digests": {"final": "411208db2aa13f1a"}}',
+        'index': '{"algorithm": "XXH64", "digests": {"HashHeader": "846b7315f91b8e48", "final": "cb3e26cadc173e40"}}'
     }
 
 The *version* key started at 2, the same version used for the hints. Since Borg has
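
Since borg now packs the manifest with use_bin_type=True and unpacks with raw=False, str keys and
values survive a round trip unchanged and only the object ID stays bytes. A minimal sketch of that
round trip, assuming the standalone msgpack-python package:

.. code-block:: python

    import msgpack

    manifest = {
        'version': 1,
        'timestamp': '2017-05-05T12:42:23.042864',
        'archives': {
            '2017-05-05-system-backup': {
                'id': b'\x00' * 32,  # 32 byte binary object ID stays bytes
                'time': '2017-05-05T12:42:22.942864',
            },
        },
    }
    packed = msgpack.packb(manifest, use_bin_type=True)   # str -> raw, bytes -> bin
    assert msgpack.unpackb(packed, raw=False) == manifest  # str back as str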

+ 21 - 21
src/borg/archive.py

@@ -34,7 +34,7 @@ from .helpers import Error, IntegrityError, set_ec
 from .platform import uid2user, user2uid, gid2group, group2gid
 from .helpers import parse_timestamp, to_localtime
 from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize
-from .helpers import safe_encode, safe_decode, make_path_safe, remove_surrogates
+from .helpers import safe_encode, make_path_safe, remove_surrogates
 from .helpers import StableDict
 from .helpers import bin_to_hex
 from .helpers import safe_ns
@@ -392,14 +392,14 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def
     if uid_forced is not None:
         uid = uid_forced
     else:
-        uid = None if numeric else user2uid(item.user)
+        uid = None if numeric else user2uid(item.get('user'))
         uid = item.uid if uid is None else uid
         if uid < 0:
             uid = uid_default
     if gid_forced is not None:
         gid = gid_forced
     else:
-        gid = None if numeric else group2gid(item.group)
+        gid = None if numeric else group2gid(item.get('group'))
         gid = item.gid if gid is None else gid
         if gid < 0:
             gid = gid_default
@@ -479,7 +479,6 @@ class Archive:
     def load(self, id):
         self.id = id
         self.metadata = self._load_meta(self.id)
-        self.metadata.cmdline = [safe_decode(arg) for arg in self.metadata.cmdline]
         self.name = self.metadata.name
         self.comment = self.metadata.get('comment', '')
 
@@ -1090,11 +1089,13 @@ class MetadataCollector:
         if not self.nobirthtime and hasattr(st, 'st_birthtime'):
             # sadly, there's no stat_result.st_birthtime_ns
             attrs['birthtime'] = safe_ns(int(st.st_birthtime * 10**9))
-        if self.numeric_ids:
-            attrs['user'] = attrs['group'] = None
-        else:
-            attrs['user'] = uid2user(st.st_uid)
-            attrs['group'] = gid2group(st.st_gid)
+        if not self.numeric_ids:
+            user = uid2user(st.st_uid)
+            if user is not None:
+                attrs['user'] = user
+            group = gid2group(st.st_gid)
+            if group is not None:
+                attrs['group'] = group
         return attrs
 
     def stat_ext_attrs(self, st, path, fd=None):
@@ -1427,8 +1428,11 @@ class TarfileObjectProcessors:
                 return safe_ns(int(float(s) * 1e9))
 
             item = Item(path=make_path_safe(tarinfo.name), mode=tarinfo.mode | type,
-                        uid=tarinfo.uid, gid=tarinfo.gid, user=tarinfo.uname or None, group=tarinfo.gname or None,
-                        mtime=s_to_ns(tarinfo.mtime))
+                        uid=tarinfo.uid, gid=tarinfo.gid, mtime=s_to_ns(tarinfo.mtime))
+            if tarinfo.uname:
+                item.user = tarinfo.uname
+            if tarinfo.gname:
+                item.group = tarinfo.gname
             if ph:
                 # note: for mtime this is a bit redundant as it is already done by tarfile module,
                 #       but we just do it in our way to be consistent for sure.
@@ -1515,7 +1519,7 @@ class RobustUnpacker:
     """
     def __init__(self, validator, item_keys):
         super().__init__()
-        self.item_keys = [msgpack.packb(name.encode()) for name in item_keys]
+        self.item_keys = [msgpack.packb(name) for name in item_keys]
         self.validator = validator
         self._buffered_data = []
         self._resync = False
@@ -1719,13 +1723,10 @@ class ArchiveChecker:
 
         Iterates through all objects in the repository looking for archive metadata blocks.
         """
-        required_archive_keys = frozenset(key.encode() for key in REQUIRED_ARCHIVE_KEYS)
-
         def valid_archive(obj):
             if not isinstance(obj, dict):
                 return False
-            keys = set(obj)
-            return required_archive_keys.issubset(keys)
+            return REQUIRED_ARCHIVE_KEYS.issubset(obj)
 
         logger.info('Rebuilding missing manifest, this might take some time...')
         # as we have lost the manifest, we do not know any more what valid item keys we had.
@@ -1734,7 +1735,7 @@ class ArchiveChecker:
         # lost manifest on an older borg version than the most recent one that was ever used
         # within this repository (assuming that newer borg versions support more item keys).
         manifest = Manifest(self.key, self.repository)
-        archive_keys_serialized = [msgpack.packb(name.encode()) for name in ARCHIVE_KEYS]
+        archive_keys_serialized = [msgpack.packb(name) for name in ARCHIVE_KEYS]
         pi = ProgressIndicatorPercent(total=len(self.chunks), msg="Rebuilding manifest %6.2f%%", step=0.01,
                                       msgid='check.rebuild_manifest')
         for chunk_id, _ in self.chunks.iteritems():
@@ -1881,9 +1882,9 @@ class ArchiveChecker:
 
             Missing item chunks will be skipped and the msgpack stream will be restarted
             """
-            item_keys = frozenset(key.encode() for key in self.manifest.item_keys)
-            required_item_keys = frozenset(key.encode() for key in REQUIRED_ITEM_KEYS)
-            unpacker = RobustUnpacker(lambda item: isinstance(item, StableDict) and b'path' in item,
+            item_keys = self.manifest.item_keys
+            required_item_keys = REQUIRED_ITEM_KEYS
+            unpacker = RobustUnpacker(lambda item: isinstance(item, StableDict) and 'path' in item,
                                       self.manifest.item_keys)
             _state = 0
 
@@ -1991,7 +1992,6 @@ class ArchiveChecker:
                 archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
                 if archive.version != 2:
                     raise Exception('Unknown archive metadata version')
-                archive.cmdline = [safe_decode(arg) for arg in archive.cmdline]
                 items_buffer = ChunkBuffer(self.key)
                 items_buffer.write_chunk = add_callback
                 for item in robust_iterator(archive):
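
The dropped name.encode() calls in RobustUnpacker and rebuild_manifest rely on a wire-format detail:
for short names (under 32 bytes), the msgpack 2.0 encoding of a str is byte-identical to the old-spec
raw encoding of the equivalent bytes, so the serialized key prefixes used for resyncing still match
data from old archives. A sketch, assuming msgpack-python:

.. code-block:: python

    import msgpack

    old = msgpack.packb('path'.encode(), use_bin_type=False)  # old spec: bytes -> raw
    new = msgpack.packb('path', use_bin_type=True)            # 2.0 spec: str -> raw
    assert old == new == b'\xa4path'  # same fixstr/raw encoding either way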

+ 9 - 13
src/borg/archiver.py

@@ -55,7 +55,7 @@ try:
     from .helpers import PrefixSpec, GlobSpec, CommentSpec, SortBySpec, FilesCacheMode
     from .helpers import BaseFormatter, ItemFormatter, ArchiveFormatter
     from .helpers import format_timedelta, format_file_size, parse_file_size, format_archive
-    from .helpers import safe_encode, remove_surrogates, bin_to_hex, prepare_dump_dict, eval_escapes
+    from .helpers import remove_surrogates, bin_to_hex, prepare_dump_dict, eval_escapes
     from .helpers import interval, prune_within, prune_split, PRUNING_PATTERNS
     from .helpers import timestamp
     from .helpers import get_cache_dir, os_stat
@@ -366,10 +366,6 @@ class Archiver:
                 if chunks_healthy is not None:
                     item._dict['chunks_healthy'] = chunks
                 item._dict.pop('source')  # not used for hardlinks any more, replaced by hlid
-            for attr in 'atime', 'ctime', 'mtime', 'birthtime':
-                if attr in item:
-                    ns = getattr(item, attr)  # decode (bigint or Timestamp) --> int ns
-                    setattr(item, attr, ns)  # encode int ns --> msgpack.Timestamp only, no bigint any more
             # make sure we only have desired stuff in the new item. specifically, make sure to get rid of:
             # - 'acl' remnants of bug in attic <= 0.13
             # - 'hardlink_master' (superseded by hlid)
@@ -1359,8 +1355,8 @@ class Archiver:
             tarinfo.mode = stat.S_IMODE(item.mode)
             tarinfo.uid = item.uid
             tarinfo.gid = item.gid
-            tarinfo.uname = item.user or ''
-            tarinfo.gname = item.group or ''
+            tarinfo.uname = item.get('user', '')
+            tarinfo.gname = item.get('group', '')
             # The linkname in tar has 2 uses:
             # for symlinks it means the destination, while for hardlinks it refers to the file.
             # Since hardlinks in tar have a different type code (LNKTYPE) the format might
@@ -1944,12 +1940,12 @@ class Archiver:
                 print('This repository is not encrypted, cannot enable TAM.')
                 return EXIT_ERROR
 
-            if not manifest.tam_verified or not manifest.config.get(b'tam_required', False):
+            if not manifest.tam_verified or not manifest.config.get('tam_required', False):
                 # The standard archive listing doesn't include the archive ID like in borg 1.1.x
                 print('Manifest contents:')
                 for archive_info in manifest.archives.list(sort_by=['ts']):
                     print(format_archive(archive_info), '[%s]' % bin_to_hex(archive_info.id))
-                manifest.config[b'tam_required'] = True
+                manifest.config['tam_required'] = True
                 manifest.write()
                 repository.commit(compact=False)
             if not key.tam_required:
@@ -1972,7 +1968,7 @@ class Archiver:
                 print('Key updated')
                 if hasattr(key, 'find_key'):
                     print('Key location:', key.find_key())
-            manifest.config[b'tam_required'] = False
+            manifest.config['tam_required'] = False
             manifest.write()
             repository.commit(compact=False)
         else:
@@ -2304,7 +2300,7 @@ class Archiver:
         """dump decoded archive metadata (not: data)"""
 
         try:
-            archive_meta_orig = manifest.archives.get_raw_dict()[safe_encode(args.location.archive)]
+            archive_meta_orig = manifest.archives.get_raw_dict()[args.location.archive]
         except KeyError:
             raise Archive.DoesNotExist(args.location.archive)
 
@@ -2321,7 +2317,7 @@ class Archiver:
             fd.write(do_indent(prepare_dump_dict(archive_meta_orig)))
             fd.write(',\n')
 
-            data = key.decrypt(archive_meta_orig[b'id'], repository.get(archive_meta_orig[b'id']))
+            data = key.decrypt(archive_meta_orig['id'], repository.get(archive_meta_orig['id']))
             archive_org_dict = msgpack.unpackb(data, object_hook=StableDict)
 
             fd.write('    "_meta":\n')
@@ -2331,7 +2327,7 @@ class Archiver:
 
             unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
             first = True
-            for item_id in archive_org_dict[b'items']:
+            for item_id in archive_org_dict['items']:
                 data = key.decrypt(item_id, repository.get(item_id))
                 unpacker.feed(data)
                 for item in unpacker:
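
The debug-dump changes above follow from unpacking with raw=False: map keys of decoded metadata
arrive as str, so the b'id' / b'items' lookups go away while binary values remain bytes. A sketch
with hypothetical metadata:

.. code-block:: python

    import msgpack

    packed = msgpack.packb({'id': b'\xaa' * 32, 'items': [b'\x01' * 32]}, use_bin_type=True)
    meta = msgpack.unpackb(packed, raw=False)
    assert list(meta) == ['id', 'items']  # str keys, not b'id' / b'items'
    assert isinstance(meta['id'], bytes)  # bin values stay bytes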

+ 20 - 16
src/borg/cache_sync/unpack.h

@@ -384,19 +384,11 @@ static inline int unpack_callback_map_end(unpack_user* u)
 
 static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int length)
 {
-    /* raw = what Borg uses for binary stuff and strings as well */
+    /* raw = what Borg uses for text stuff */
     /* Note: p points to an internal buffer which contains l bytes. */
     (void)b;
 
     switch(u->expect) {
-    case expect_key:
-        if(length != 32) {
-            SET_LAST_ERROR("Incorrect key length");
-            return -1;
-        }
-        memcpy(u->current.key, p, 32);
-        u->expect = expect_size;
-        break;
     case expect_map_key:
         if(length == 6 && !memcmp("chunks", p, 6)) {
             u->expect = expect_chunks_begin;
@@ -409,19 +401,31 @@ static inline int unpack_callback_raw(unpack_user* u, const char* b, const char*
             u->expect = expect_map_item_end;
         }
         break;
-    default:
-        if(u->inside_chunks) {
-            SET_LAST_ERROR("Unexpected bytes in chunks structure");
-            return -1;
-        }
     }
     return 0;
 }
 
 static inline int unpack_callback_bin(unpack_user* u, const char* b, const char* p, unsigned int length)
 {
-    (void)u; (void)b; (void)p; (void)length;
-    UNEXPECTED("bin");
+    /* bin = what Borg uses for binary stuff */
+    /* Note: p points to an internal buffer which contains l bytes. */
+    (void)b;
+
+    switch(u->expect) {
+    case expect_key:
+        if(length != 32) {
+            SET_LAST_ERROR("Incorrect key length");
+            return -1;
+        }
+        memcpy(u->current.key, p, 32);
+        u->expect = expect_size;
+        break;
+    default:
+        if(u->inside_chunks) {
+            SET_LAST_ERROR("Unexpected bytes in chunks structure");
+            return -1;
+        }
+    }
     return 0;
 }
 

+ 1 - 1
src/borg/constants.py

@@ -128,7 +128,7 @@ class KeyType:
     # upper 4 bits are ciphersuite, 0 == legacy AES-CTR
     KEYFILE = 0x00
     # repos with PASSPHRASE mode could not be created any more since borg 1.0, see #97.
-    # in borg 1.3 all of its code and also the "borg key migrate-to-repokey" command was removed.
+    # in borg 2 all of its code and also the "borg key migrate-to-repokey" command was removed.
     # if you still need to, you can use "borg key migrate-to-repokey" with borg 1.0, 1.1 and 1.2.
     # Nowadays, we just dispatch this to RepoKey and assume the passphrase was migrated to a repokey.
     PASSPHRASE = 0x01  # legacy, attic and borg < 1.0

+ 9 - 7
src/borg/crypto/key.py

@@ -22,7 +22,7 @@ from ..helpers import bin_to_hex
 from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded, PassphraseWrong
 from ..helpers import msgpack
 from ..helpers.manifest import Manifest
-from ..item import Key, EncryptedKey
+from ..item import Key, EncryptedKey, want_bytes
 from ..platform import SaveFile
 
 from .nonces import NonceManager
@@ -232,26 +232,28 @@ class KeyBase:
         unpacker = get_limited_unpacker('manifest')
         unpacker.feed(data)
         unpacked = unpacker.unpack()
-        if b'tam' not in unpacked:
+        if 'tam' not in unpacked:
             if tam_required:
                 raise TAMRequiredError(self.repository._location.canonical_path())
             else:
                 logger.debug('TAM not found and not required')
                 return unpacked, False
-        tam = unpacked.pop(b'tam', None)
+        tam = unpacked.pop('tam', None)
         if not isinstance(tam, dict):
             raise TAMInvalid()
-        tam_type = tam.get(b'type', b'<none>').decode('ascii', 'replace')
+        tam_type = tam.get('type', '<none>')
         if tam_type != 'HKDF_HMAC_SHA512':
             if tam_required:
                 raise TAMUnsupportedSuiteError(repr(tam_type))
             else:
                 logger.debug('Ignoring TAM made with unsupported suite, since TAM is not required: %r', tam_type)
                 return unpacked, False
-        tam_hmac = tam.get(b'hmac')
-        tam_salt = tam.get(b'salt')
-        if not isinstance(tam_salt, bytes) or not isinstance(tam_hmac, bytes):
+        tam_hmac = tam.get('hmac')
+        tam_salt = tam.get('salt')
+        if not isinstance(tam_salt, (bytes, str)) or not isinstance(tam_hmac, (bytes, str)):
             raise TAMInvalid()
+        tam_hmac = want_bytes(tam_hmac)  # legacy
+        tam_salt = want_bytes(tam_salt)  # legacy
         offset = data.index(tam_hmac)
         data[offset:offset + 64] = bytes(64)
         tam_key = self._tam_key(tam_salt, context=b'manifest')
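
tam_hmac and tam_salt can come back as str when old repo data (packed without bin types) is
unpacked with raw=False; want_bytes, defined in the item.pyx hunk further down, re-encodes them
losslessly. A self-contained sketch of that conversion:

.. code-block:: python

    def want_bytes(v, *, errors='surrogateescape'):
        """re-encode legacy str back to the original bytes"""
        if isinstance(v, str):
            v = v.encode('utf-8', errors=errors)
        assert isinstance(v, bytes)
        return v

    legacy_salt = b'\xfe\xff'.decode('utf-8', 'surrogateescape')  # what raw=False yields
    assert want_bytes(legacy_salt) == b'\xfe\xff'  # lossless round trip
    assert want_bytes(b'\xfe\xff') == b'\xfe\xff'  # new-format bytes pass through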

+ 1 - 1
src/borg/helpers/__init__.py

@@ -18,7 +18,7 @@ from .progress import *  # NOQA
 from .time import *  # NOQA
 from .yes import *  # NOQA
 
-from .msgpack import is_slow_msgpack, is_supported_msgpack, int_to_bigint, bigint_to_int, get_limited_unpacker
+from .msgpack import is_slow_msgpack, is_supported_msgpack, get_limited_unpacker
 from . import msgpack
 
 # generic mechanism to enable users to invoke workarounds by setting the

+ 2 - 2
src/borg/helpers/fs.py

@@ -205,8 +205,8 @@ class HardLinkManager:
 
     def hardlink_id_from_path(self, path):
         """compute a hardlink id from a path"""
-        assert isinstance(path, bytes)
-        return hashlib.sha256(path).digest()
+        assert isinstance(path, str)
+        return hashlib.sha256(path.encode('utf-8', errors='surrogateescape')).digest()
 
     def hardlink_id_from_inode(self, *, ino, dev):
         """compute a hardlink id from an inode"""

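Since surrogateescape round-trips undecodable filename bytes exactly, hashing the re-encoded str
path yields the same hardlink ID as hashing the original bytes path did. A sketch:

.. code-block:: python

    import hashlib

    def hardlink_id_from_path(path):
        assert isinstance(path, str)
        return hashlib.sha256(path.encode('utf-8', errors='surrogateescape')).digest()

    raw_path = b'dir/caf\xe9'  # latin-1 filename, not valid utf-8
    path = raw_path.decode('utf-8', errors='surrogateescape')
    assert hardlink_id_from_path(path) == hashlib.sha256(raw_path).digest()
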
+ 21 - 25
src/borg/helpers/manifest.py

@@ -12,7 +12,7 @@ from ..logger import create_logger
 logger = create_logger()
 
 from .datastruct import StableDict
-from .parseformat import bin_to_hex, safe_encode, safe_decode
+from .parseformat import bin_to_hex
 from .time import parse_timestamp
 from .. import shellpattern
 from ..constants import *  # NOQA
@@ -39,39 +39,35 @@ class Archives(abc.MutableMapping):
     str timestamps or datetime timestamps.
     """
     def __init__(self):
-        # key: encoded archive name, value: dict(b'id': bytes_id, b'time': bytes_iso_ts)
+        # key: str archive name, value: dict('id': bytes_id, 'time': str_iso_ts)
         self._archives = {}
 
     def __len__(self):
         return len(self._archives)
 
     def __iter__(self):
-        return iter(safe_decode(name) for name in self._archives)
+        return iter(self._archives)
 
     def __getitem__(self, name):
         assert isinstance(name, str)
-        _name = safe_encode(name)
-        values = self._archives.get(_name)
+        values = self._archives.get(name)
         if values is None:
             raise KeyError
-        ts = parse_timestamp(values[b'time'].decode())
-        return ArchiveInfo(name=name, id=values[b'id'], ts=ts)
+        ts = parse_timestamp(values['time'])
+        return ArchiveInfo(name=name, id=values['id'], ts=ts)
 
     def __setitem__(self, name, info):
         assert isinstance(name, str)
-        name = safe_encode(name)
         assert isinstance(info, tuple)
         id, ts = info
         assert isinstance(id, bytes)
         if isinstance(ts, datetime):
             ts = ts.replace(tzinfo=None).strftime(ISO_FORMAT)
         assert isinstance(ts, str)
-        ts = ts.encode()
-        self._archives[name] = {b'id': id, b'time': ts}
+        self._archives[name] = {'id': id, 'time': ts}
 
     def __delitem__(self, name):
         assert isinstance(name, str)
-        name = safe_encode(name)
         del self._archives[name]
 
     def list(self, *, glob=None, match_end=r'\Z', sort_by=(), consider_checkpoints=True, first=None, last=None, reverse=False):
@@ -116,8 +112,8 @@ class Archives(abc.MutableMapping):
     def set_raw_dict(self, d):
         """set the dict we get from the msgpack unpacker"""
         for k, v in d.items():
-            assert isinstance(k, bytes)
-            assert isinstance(v, dict) and b'id' in v and b'time' in v
+            assert isinstance(k, str)
+            assert isinstance(v, dict) and 'id' in v and 'time' in v
             self._archives[k] = v
 
     def get_raw_dict(self):
@@ -196,10 +192,10 @@ class Manifest:
         manifest.timestamp = m.get('timestamp')
         manifest.config = m.config
         # valid item keys are whatever is known in the repo or every key we know
-        manifest.item_keys = ITEM_KEYS | frozenset(key.decode() for key in m.get('item_keys', []))
+        manifest.item_keys = ITEM_KEYS | frozenset(m.get('item_keys', []))
 
         if manifest.tam_verified:
-            manifest_required = manifest.config.get(b'tam_required', False)
+            manifest_required = manifest.config.get('tam_required', False)
             security_required = tam_required(repository)
             if manifest_required and not security_required:
                 logger.debug('Manifest is TAM verified and says TAM is required, updating security database...')
@@ -214,32 +210,32 @@ class Manifest:
     def check_repository_compatibility(self, operations):
         for operation in operations:
             assert isinstance(operation, self.Operation)
-            feature_flags = self.config.get(b'feature_flags', None)
+            feature_flags = self.config.get('feature_flags', None)
             if feature_flags is None:
                 return
-            if operation.value.encode() not in feature_flags:
+            if operation.value not in feature_flags:
                 continue
-            requirements = feature_flags[operation.value.encode()]
-            if b'mandatory' in requirements:
-                unsupported = set(requirements[b'mandatory']) - self.SUPPORTED_REPO_FEATURES
+            requirements = feature_flags[operation.value]
+            if 'mandatory' in requirements:
+                unsupported = set(requirements['mandatory']) - self.SUPPORTED_REPO_FEATURES
                 if unsupported:
-                    raise MandatoryFeatureUnsupported([f.decode() for f in unsupported])
+                    raise MandatoryFeatureUnsupported(list(unsupported))
 
     def get_all_mandatory_features(self):
         result = {}
-        feature_flags = self.config.get(b'feature_flags', None)
+        feature_flags = self.config.get('feature_flags', None)
         if feature_flags is None:
             return result
 
         for operation, requirements in feature_flags.items():
-            if b'mandatory' in requirements:
-                result[operation.decode()] = {feature.decode() for feature in requirements[b'mandatory']}
+            if 'mandatory' in requirements:
+                result[operation] = set(requirements['mandatory'])
         return result
 
     def write(self):
         from ..item import ManifestItem
         if self.key.tam_required:
-            self.config[b'tam_required'] = True
+            self.config['tam_required'] = True
         # self.timestamp needs to be strictly monotonically increasing. Clocks often are not set correctly
         if self.timestamp is None:
             self.timestamp = datetime.utcnow().strftime(ISO_FORMAT)
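
With these changes, the Archives raw dict holds str names and str timestamps directly, so no
safe_encode/safe_decode round trip happens on access. A sketch of the resulting shape
(hypothetical values; ISO_FORMAT assumed to mirror borg's constant):

.. code-block:: python

    from datetime import datetime

    ISO_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'  # assumption: mirrors borg's constant

    _archives = {}
    ts = datetime(2022, 5, 20, 12, 0, 0).strftime(ISO_FORMAT)
    _archives['backup-2022'] = {'id': b'\x01' * 32, 'time': ts}  # str keys, str timestamp
    assert 'backup-2022' in _archives  # str lookup, no safe_encode() needed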

+ 71 - 52
src/borg/helpers/msgpack.py

@@ -1,21 +1,53 @@
+"""
+wrapping msgpack
+================
+
+We wrap msgpack here the way we need it - to avoid having lots of clutter in the calling code.
+
+Packing
+-------
+- use_bin_type = True (used by borg since borg 2.0)
+  This is used to generate output according to new msgpack 2.0 spec.
+  This cleanly keeps bytes and str types apart.
+
+- use_bin_type = False (used by borg < 1.3)
+  This creates output according to the older msgpack spec.
+  BAD: str and bytes were packed into same "raw" representation.
+
+- unicode_errors = 'surrogateescape'
+  Backup applications are one of the rare cases where this needs to be used.
+  It is needed because borg also has to deal with data that does not cleanly encode/decode using utf-8.
+  There is a lot of such data out there, e.g. in filenames, and as a backup tool, borg must preserve it as faithfully as possible.
+
+Unpacking
+---------
+- raw = False (used by borg since borg 2.0)
+  We can already use this with borg 2.0 due to the type conversion to the desired type in the item.pyx update_internal
+  methods. This type conversion code can be removed in the future, when we no longer have to deal with data
+  that was packed the old way.
+  It will then unpack according to the msgpack 2.0 spec format and directly output bytes or str.
+
+- raw = True (the old way, used by borg < 1.3)
+
+- unicode_errors = 'surrogateescape' -> see description above (will be used when raw is False).
+
+As of borg 2.0, we have fixed most of the msgpack str/bytes mess, #968.
+Borg still needs to **read** old repos, archives, keys, ... so we cannot yet fix it completely.
+But from now on, borg only **writes** new data according to the new msgpack 2.0 spec,
+thus we can remove some legacy support in a later borg release (some places are marked with "legacy").
+
+Current way in msgpack terms
+----------------------------
+
+- pack with use_bin_type=True (according to msgpack 2.0 spec),
+  which packs str -> raw and bytes -> bin
+- unpack with raw=False (according to msgpack 2.0 spec, using unicode_errors='surrogateescape'),
+  which unpacks bin to bytes and raw to str (thus we need to convert to the desired type if we want bytes from "raw")
+"""
+
 from .datastruct import StableDict
 from ..constants import *  # NOQA
 
-# wrapping msgpack ---------------------------------------------------------------------------------------------------
-#
-# due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it -
-# to avoid having lots of clutter in the calling code. see tickets #968 and #3632.
-#
-# Packing
-# -------
-# use_bin_type = False is needed to generate the old msgpack format (not msgpack 2.0 spec) as borg always did.
-# unicode_errors = None is needed because usage of it is deprecated
-#
-# Unpacking
-# ---------
-# raw = True is needed to unpack the old msgpack format to bytes (not str, about the decoding see item.pyx).
-# unicode_errors = None is needed because usage of it is deprecated
-
 from msgpack import Packer as mp_Packer
 from msgpack import packb as mp_packb
 from msgpack import pack as mp_pack
@@ -30,6 +62,10 @@ from msgpack import OutOfData
 
 version = mp_version
 
+USE_BIN_TYPE = True
+RAW = False
+UNICODE_ERRORS = 'surrogateescape'
+
 
 class PackException(Exception):
     """Exception while msgpack packing"""
@@ -40,10 +76,10 @@ class UnpackException(Exception):
 
 
 class Packer(mp_Packer):
-    def __init__(self, *, default=None, unicode_errors=None,
-                 use_single_float=False, autoreset=True, use_bin_type=False,
+    def __init__(self, *, default=None, unicode_errors=UNICODE_ERRORS,
+                 use_single_float=False, autoreset=True, use_bin_type=USE_BIN_TYPE,
                  strict_types=False):
-        assert unicode_errors is None
+        assert unicode_errors == UNICODE_ERRORS
         super().__init__(default=default, unicode_errors=unicode_errors,
                          use_single_float=use_single_float, autoreset=autoreset, use_bin_type=use_bin_type,
                          strict_types=strict_types)
@@ -55,16 +91,16 @@ class Packer(mp_Packer):
             raise PackException(e)
 
 
-def packb(o, *, use_bin_type=False, unicode_errors=None, **kwargs):
-    assert unicode_errors is None
+def packb(o, *, use_bin_type=USE_BIN_TYPE, unicode_errors=UNICODE_ERRORS, **kwargs):
+    assert unicode_errors == UNICODE_ERRORS
     try:
         return mp_packb(o, use_bin_type=use_bin_type, unicode_errors=unicode_errors, **kwargs)
     except Exception as e:
         raise PackException(e)
 
 
-def pack(o, stream, *, use_bin_type=False, unicode_errors=None, **kwargs):
-    assert unicode_errors is None
+def pack(o, stream, *, use_bin_type=USE_BIN_TYPE, unicode_errors=UNICODE_ERRORS, **kwargs):
+    assert unicode_errors == UNICODE_ERRORS
     try:
         return mp_pack(o, stream, use_bin_type=use_bin_type, unicode_errors=unicode_errors, **kwargs)
     except Exception as e:
@@ -72,13 +108,13 @@ def pack(o, stream, *, use_bin_type=False, unicode_errors=None, **kwargs):
 
 
 class Unpacker(mp_Unpacker):
-    def __init__(self, file_like=None, *, read_size=0, use_list=True, raw=True,
+    def __init__(self, file_like=None, *, read_size=0, use_list=True, raw=RAW,
                  object_hook=None, object_pairs_hook=None, list_hook=None,
-                 unicode_errors=None, max_buffer_size=0,
+                 unicode_errors=UNICODE_ERRORS, max_buffer_size=0,
                  ext_hook=ExtType,
                  strict_map_key=False):
-        assert raw is True
-        assert unicode_errors is None
+        assert raw == RAW
+        assert unicode_errors == UNICODE_ERRORS
         kw = dict(file_like=file_like, read_size=read_size, use_list=use_list, raw=raw,
                   object_hook=object_hook, object_pairs_hook=object_pairs_hook, list_hook=list_hook,
                   unicode_errors=unicode_errors, max_buffer_size=max_buffer_size,
@@ -105,10 +141,11 @@ class Unpacker(mp_Unpacker):
     next = __next__
 
 
-def unpackb(packed, *, raw=True, unicode_errors=None,
+def unpackb(packed, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
             strict_map_key=False,
             **kwargs):
-    assert unicode_errors is None
+    assert raw == RAW
+    assert unicode_errors == UNICODE_ERRORS
     try:
         kw = dict(raw=raw, unicode_errors=unicode_errors,
                   strict_map_key=strict_map_key)
@@ -118,10 +155,11 @@ def unpackb(packed, *, raw=True, unicode_errors=None,
         raise UnpackException(e)
 
 
-def unpack(stream, *, raw=True, unicode_errors=None,
+def unpack(stream, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
            strict_map_key=False,
            **kwargs):
-    assert unicode_errors is None
+    assert raw == RAW
+    assert unicode_errors == UNICODE_ERRORS
     try:
         kw = dict(raw=raw, unicode_errors=unicode_errors,
                   strict_map_key=strict_map_key)
@@ -164,30 +202,11 @@ def get_limited_unpacker(kind):
     return Unpacker(**args)
 
 
-def bigint_to_int(mtime):  # legacy
-    """Convert bytearray to int
-    """
-    if isinstance(mtime, bytes):
-        return int.from_bytes(mtime, 'little', signed=True)
-    return mtime
-
-
-def int_to_bigint(value):  # legacy
-    """Convert integers larger than 64 bits to bytearray
-
-    Smaller integers are left alone
-    """
-    if value.bit_length() > 63:
-        return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True)
-    return value
-
-
 def int_to_timestamp(ns):
+    assert isinstance(ns, int)
     return Timestamp.from_unix_nano(ns)
 
 
 def timestamp_to_int(ts):
-    if isinstance(ts, Timestamp):
-        return ts.to_unix_nano()
-    # legacy support note: we need to keep the bigint conversion for compatibility with borg < 1.3 archives.
-    return bigint_to_int(ts)
+    assert isinstance(ts, Timestamp)
+    return ts.to_unix_nano()
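
Taken together, the new defaults behave as in the following round trip (assuming the msgpack-python
package these wrappers delegate to); note the surrogate escape in the path surviving pack and unpack:

.. code-block:: python

    import msgpack

    item = {'path': 'home/f\udce9', 'chunks': [[b'\x02' * 32, 4096, 2048]]}
    packed = msgpack.packb(item, use_bin_type=True, unicode_errors='surrogateescape')
    unpacked = msgpack.unpackb(packed, raw=False, unicode_errors='surrogateescape')
    assert unpacked == item  # str back as str (surrogates intact), bytes back as bytes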

+ 2 - 2
src/borg/helpers/parseformat.py

@@ -808,8 +808,8 @@ class ItemFormatter(BaseFormatter):
         hlid = bin_to_hex(hlid) if hlid else ''
         item_data['type'] = item_type
         item_data['mode'] = mode
-        item_data['user'] = item.user or item.uid
-        item_data['group'] = item.group or item.gid
+        item_data['user'] = item.get('user', str(item.uid))
+        item_data['group'] = item.get('group', str(item.gid))
         item_data['uid'] = item.uid
         item_data['gid'] = item.gid
         item_data['path'] = remove_surrogates(item.path)
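
Because 'user' and 'group' are now simply absent (rather than None) for numeric-ids archives,
the formatter falls back via dict .get(), and str() keeps the field type uniform. A sketch with
a plain dict standing in for an Item:

.. code-block:: python

    item = {'uid': 1000, 'gid': 1000}  # numeric-ids archive: no user/group keys
    user = item.get('user', str(item['uid']))
    assert user == '1000'  # always str, never an int fallback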

+ 235 - 66
src/borg/item.pyx

@@ -2,10 +2,9 @@ import stat
 from collections import namedtuple
 
 from .constants import ITEM_KEYS, ARCHIVE_KEYS
-from .helpers import safe_encode, safe_decode
 from .helpers import StableDict
 from .helpers import format_file_size
-from .helpers.msgpack import timestamp_to_int, int_to_timestamp
+from .helpers.msgpack import timestamp_to_int, int_to_timestamp, Timestamp
 
 
 cdef extern from "_item.c":
@@ -16,6 +15,102 @@ cdef extern from "_item.c":
 API_VERSION = '1.2_01'
 
 
+def fix_key(data, key, *, errors='strict'):
+    """if key is a bytes-typed key, migrate key/value to a str-typed key in dict data"""
+    if isinstance(key, bytes):
+        value = data.pop(key)
+        key = key.decode('utf-8', errors=errors)
+        data[key] = value
+    assert isinstance(key, str)
+    return key
+
+
+def fix_str_value(data, key, errors='surrogateescape'):
+    """makes sure that data[key] is a str (decode if it is bytes)"""
+    assert isinstance(key, str)  # fix_key must be called first
+    value = data[key]
+    value = want_str(value, errors=errors)
+    data[key] = value
+    return value
+
+
+def fix_bytes_value(data, key):
+    """makes sure that data[key] is bytes (encode if it is str)"""
+    assert isinstance(key, str)  # fix_key must be called first
+    value = data[key]
+    value = want_bytes(value)
+    data[key] = value
+    return value
+
+
+def fix_list_of_str(v):
+    """make sure we have a list of str"""
+    assert isinstance(v, (tuple, list))
+    return [want_str(e) for e in v]
+
+
+def fix_list_of_bytes(v):
+    """make sure we have a list of bytes"""
+    assert isinstance(v, (tuple, list))
+    return [want_bytes(e) for e in v]
+
+
+def fix_list_of_chunkentries(v):
+    """make sure we have a list of correct chunkentries"""
+    assert isinstance(v, (tuple, list))
+    chunks = []
+    for ce in v:
+        assert isinstance(ce, (tuple, list))
+        assert len(ce) == 3  # id, size, csize
+        assert isinstance(ce[1], int)
+        assert isinstance(ce[2], int)
+        ce_fixed = [want_bytes(ce[0]), ce[1], ce[2]]  # list!
+        chunks.append(ce_fixed)  # create a list of lists
+    return chunks
+
+
+def fix_tuple_of_str(v):
+    """make sure we have a tuple of str"""
+    assert isinstance(v, (tuple, list))
+    return tuple(want_str(e) for e in v)
+
+
+def fix_tuple_of_str_and_int(v):
+    """make sure we have a tuple of str and int"""
+    assert isinstance(v, (tuple, list))
+    t = tuple(e.decode() if isinstance(e, bytes) else e for e in v)
+    assert all(isinstance(e, (str, int)) for e in t), repr(t)
+    return t
+
+
+def fix_timestamp(v):
+    """make sure v is a Timestamp"""
+    if isinstance(v, Timestamp):
+        return v
+    # legacy support
+    if isinstance(v, bytes):  # was: bigint_to_int()
+        v = int.from_bytes(v, 'little', signed=True)
+    assert isinstance(v, int)
+    return int_to_timestamp(v)
+
+
+def want_bytes(v, *, errors='surrogateescape'):
+    """we know that we want bytes and the value should be bytes"""
+    # legacy support: it being str can be caused by msgpack unpack decoding old data that was packed with use_bin_type=False
+    if isinstance(v, str):
+        v = v.encode('utf-8', errors=errors)
+    assert isinstance(v, bytes), f'not a bytes object, but {v!r}'
+    return v
+
+
+def want_str(v, *, errors='surrogateescape'):
+    """we know that we want str and the value should be str"""
+    if isinstance(v, bytes):
+        v = v.decode('utf-8', errors=errors)
+    assert isinstance(v, str), f'not a str object, but {v!r}'
+    return v
+
+
 class PropDict:
     """
     Manage a dictionary via properties.
@@ -112,6 +207,8 @@ class PropDict:
                 raise AttributeError(attr_error_msg) from None
             if decode is not None:
                 value = decode(value)
+            if not isinstance(value, value_type):
+                raise TypeError(type_error_msg)
             return value
 
         def _set(self, value):
@@ -139,14 +236,9 @@ class Item(PropDict):
     Items are created either from msgpack unpacker output, from another dict, from kwargs or
     built step-by-step by setting attributes.
 
-    msgpack gives us a dict with bytes-typed keys, just give it to Item(internal_dict=d) and use item.key_name later.
-    msgpack gives us byte-typed values for stuff that should be str, we automatically decode when getting
-    such a property and encode when setting it.
+    msgpack unpacker gives us a dict, just give it to Item(internal_dict=d) and use item.key_name later.
 
     If an Item shall be serialized, give as_dict() method output to msgpack packer.
-
-    A bug in Attic up to and including release 0.13 added a (meaningless) 'acl' key to every item.
-    We must never re-use this key. See test_attic013_acl_bug for details.
     """
 
     VALID_KEYS = ITEM_KEYS | {'deleted', 'nlink', }  # str-typed keys
@@ -155,10 +247,10 @@ class Item(PropDict):
 
     # properties statically defined, so that IDEs can know their names:
 
-    path = PropDict._make_property('path', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    source = PropDict._make_property('source', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None', encode=safe_encode, decode=safe_decode)
-    group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None', encode=safe_encode, decode=safe_decode)
+    path = PropDict._make_property('path', str, 'surrogate-escaped str')
+    source = PropDict._make_property('source', str, 'surrogate-escaped str')
+    user = PropDict._make_property('user', str, 'surrogate-escaped str')
+    group = PropDict._make_property('group', str, 'surrogate-escaped str')
 
     acl_access = PropDict._make_property('acl_access', bytes)
     acl_default = PropDict._make_property('acl_default', bytes)
@@ -177,14 +269,13 @@ class Item(PropDict):
     birthtime = PropDict._make_property('birthtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
 
     # size is only present for items with a chunk list and then it is sum(chunk_sizes)
-    # compatibility note: this is a new feature, in old archives size will be missing.
     size = PropDict._make_property('size', int)
 
     hlid = PropDict._make_property('hlid', bytes)  # hard link id: same value means same hard link.
     hardlink_master = PropDict._make_property('hardlink_master', bool)  # legacy
 
-    chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None')
-    chunks_healthy = PropDict._make_property('chunks_healthy', (list, type(None)), 'list or None')
+    chunks = PropDict._make_property('chunks', list, 'list')
+    chunks_healthy = PropDict._make_property('chunks_healthy', list, 'list')
 
     xattrs = PropDict._make_property('xattrs', StableDict)
 
@@ -193,12 +284,10 @@ class Item(PropDict):
 
     part = PropDict._make_property('part', int)
 
-    def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False, consider_ids=None):
+    def get_size(self, memorize=False, compressed=False, from_chunks=False, consider_ids=None):
         """
         Determine the (uncompressed or compressed) size of this item.
 
-        :param hardlink_masters: If given, the size of hardlink slaves is computed via the hardlink master's chunk list,
-        otherwise size will be returned as 0.
         :param memorize: Whether the computed size value will be stored into the item.
         :param compressed: Whether the compressed or uncompressed size will be returned.
         :param from_chunks: If true, size is computed from chunks even if a precomputed value is available.
@@ -218,31 +307,14 @@ class Item(PropDict):
             # no precomputed (c)size value available, compute it:
             try:
                 chunks = getattr(self, 'chunks')
-                having_chunks = True
             except AttributeError:
-                having_chunks = False
-                # this item has no (own) chunks list, but if this is a hardlink slave
-                # and we know the master, we can still compute the size.
-                if hardlink_masters is None:
-                    chunks = None
-                else:
-                    try:
-                        master = getattr(self, 'source')
-                    except AttributeError:
-                        # not a hardlink slave, likely a directory or special file w/o chunks
-                        chunks = None
-                    else:
-                        # hardlink slave, try to fetch hardlink master's chunks list
-                        # todo: put precomputed size into hardlink_masters' values and use it, if present
-                        chunks, _ = hardlink_masters.get(master, (None, None))
-                if chunks is None:
-                    return 0
+                return 0
             if consider_ids is not None:
                 size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks if chunk.id in consider_ids)
             else:
                 size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks)
             # if requested, memorize the precomputed (c)size for items that have an own chunks list:
-            if memorize and having_chunks:
+            if memorize:
                 setattr(self, attr, size)
         return size
 
@@ -290,6 +362,31 @@ class Item(PropDict):
         except AttributeError:
             return False
 
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str),
+        # also need to fix old timestamp data types.
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k in ('path', 'source', 'user', 'group'):
+                v = fix_str_value(d, k)
+            if k in ('chunks', 'chunks_healthy'):
+                v = fix_list_of_chunkentries(v)
+            if k in ('atime', 'ctime', 'mtime', 'birthtime'):
+                v = fix_timestamp(v)
+            if k in ('acl_access', 'acl_default', 'acl_extended', 'acl_nfs4'):
+                v = fix_bytes_value(d, k)
+            if k == 'xattrs':
+                if not isinstance(v, StableDict):
+                    v = StableDict(v)
+                v_new = StableDict()
+                for xk, xv in list(v.items()):
+                    xk = want_bytes(xk)
+                    # old borg used to store None instead of a b'' value
+                    xv = b'' if xv is None else want_bytes(xv)
+                    v_new[xk] = xv
+                v = v_new  # xattrs is a StableDict(bytes keys -> bytes values)
+            self._dict[k] = v
+
 
 class EncryptedKey(PropDict):
     """
@@ -298,18 +395,18 @@ class EncryptedKey(PropDict):
     A EncryptedKey is created either from msgpack unpacker output, from another dict, from kwargs or
     built step-by-step by setting attributes.
 
-    msgpack gives us a dict with bytes-typed keys, just give it to EncryptedKey(d) and use enc_key.xxx later.
+    msgpack unpacker gives us a dict, just give it to EncryptedKey(d) and use enc_key.xxx later.
 
     If a EncryptedKey shall be serialized, give as_dict() method output to msgpack packer.
     """
 
-    VALID_KEYS = { 'version', 'algorithm', 'iterations', 'salt', 'hash', 'data',
-                   'argon2_time_cost', 'argon2_memory_cost', 'argon2_parallelism', 'argon2_type' }
+    VALID_KEYS = {'version', 'algorithm', 'iterations', 'salt', 'hash', 'data',
+                  'argon2_time_cost', 'argon2_memory_cost', 'argon2_parallelism', 'argon2_type'}
 
     __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
 
     version = PropDict._make_property('version', int)
-    algorithm = PropDict._make_property('algorithm', str, encode=str.encode, decode=bytes.decode)
+    algorithm = PropDict._make_property('algorithm', str)
     iterations = PropDict._make_property('iterations', int)
     salt = PropDict._make_property('salt', bytes)
     hash = PropDict._make_property('hash', bytes)
@@ -317,7 +414,19 @@ class EncryptedKey(PropDict):
     argon2_time_cost = PropDict._make_property('argon2_time_cost', int)
     argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int)
     argon2_parallelism = PropDict._make_property('argon2_parallelism', int)
-    argon2_type = PropDict._make_property('argon2_type', str, encode=str.encode, decode=bytes.decode)
+    argon2_type = PropDict._make_property('argon2_type', str)
+
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k == 'version':
+                assert isinstance(v, int)
+            if k in ('algorithm', 'argon2_type'):
+                v = fix_str_value(d, k)
+            if k in ('salt', 'hash', 'data'):
+                v = fix_bytes_value(d, k)
+            self._dict[k] = v
 
 
 class Key(PropDict):
@@ -327,7 +436,7 @@ class Key(PropDict):
     A Key is created either from msgpack unpacker output, from another dict, from kwargs or
     built step-by-step by setting attributes.
 
-    msgpack gives us a dict with bytes-typed keys, just give it to Key(d) and use key.xxx later.
+    msgpack unpacker gives us a dict, just give it to Key(d) and use key.xxx later.
 
     If a Key shall be serialized, give as_dict() method output to msgpack packer.
     """
@@ -344,17 +453,15 @@ class Key(PropDict):
     chunk_seed = PropDict._make_property('chunk_seed', int)
     tam_required = PropDict._make_property('tam_required', bool)
 
-
-def tuple_encode(t):
-    """encode a tuple that might contain str items"""
-    # we have str, but want to give bytes to msgpack.pack
-    return tuple(safe_encode(e) if isinstance(e, str) else e for e in t)
-
-
-def tuple_decode(t):
-    """decode a tuple that might contain bytes items"""
-    # we get bytes objects from msgpack.unpack, but want str
-    return tuple(safe_decode(e) if isinstance(e, bytes) else e for e in t)
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k == 'version':
+                assert isinstance(v, int)
+            if k in ('repository_id', 'enc_key', 'enc_hmac_key', 'id_key'):
+                v = fix_bytes_value(d, k)
+            self._dict[k] = v
 
 
 class ArchiveItem(PropDict):
@@ -364,7 +471,7 @@ class ArchiveItem(PropDict):
     An ArchiveItem is created either from msgpack unpacker output, from another dict, from kwargs or
     built step-by-step by setting attributes.
 
-    msgpack gives us a dict with bytes-typed keys, just give it to ArchiveItem(d) and use arch.xxx later.
+    msgpack unpacker gives us a dict, just give it to ArchiveItem(d) and use arch.xxx later.
 
     If a ArchiveItem shall be serialized, give as_dict() method output to msgpack packer.
     """
@@ -374,15 +481,15 @@ class ArchiveItem(PropDict):
     __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
 
     version = PropDict._make_property('version', int)
-    name = PropDict._make_property('name', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
+    name = PropDict._make_property('name', str, 'surrogate-escaped str')
     items = PropDict._make_property('items', list)
     cmdline = PropDict._make_property('cmdline', list)  # list of s-e-str
-    hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    username = PropDict._make_property('username', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    time = PropDict._make_property('time', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    time_end = PropDict._make_property('time_end', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    comment = PropDict._make_property('comment', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    chunker_params = PropDict._make_property('chunker_params', tuple, 'chunker-params tuple', encode=tuple_encode, decode=tuple_decode)
+    hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str')
+    username = PropDict._make_property('username', str, 'surrogate-escaped str')
+    time = PropDict._make_property('time', str)
+    time_end = PropDict._make_property('time_end', str)
+    comment = PropDict._make_property('comment', str, 'surrogate-escaped str')
+    chunker_params = PropDict._make_property('chunker_params', tuple)
     recreate_cmdline = PropDict._make_property('recreate_cmdline', list)  # list of s-e-str
     # recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. b2
     recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
@@ -395,6 +502,24 @@ class ArchiveItem(PropDict):
     csize_parts = PropDict._make_property('csize_parts', int)
     nfiles_parts = PropDict._make_property('nfiles_parts', int)
 
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k == 'version':
+                assert isinstance(v, int)
+            if k in ('name', 'hostname', 'username', 'comment'):
+                v = fix_str_value(d, k)
+            if k in ('time', 'time_end'):
+                v = fix_str_value(d, k, 'replace')
+            if k == 'chunker_params':
+                v = fix_tuple_of_str_and_int(v)
+            if k in ('cmdline', 'recreate_cmdline'):
+                v = fix_list_of_str(v)
+            if k == 'items':
+                v = fix_list_of_bytes(v)
+            self._dict[k] = v
+
 
 class ManifestItem(PropDict):
     """
@@ -403,7 +528,7 @@ class ManifestItem(PropDict):
     A ManifestItem is created either from msgpack unpacker output, from another dict, from kwargs or
     built step-by-step by setting attributes.
 
-    msgpack gives us a dict with bytes-typed keys, just give it to ManifestItem(d) and use manifest.xxx later.
+    msgpack unpacker gives us a dict, just give it to ManifestItem(d) and use manifest.xxx later.
 
     If a ManifestItem shall be serialized, give as_dict() method output to msgpack packer.
     """
@@ -413,10 +538,54 @@ class ManifestItem(PropDict):
     __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
 
     version = PropDict._make_property('version', int)
-    archives = PropDict._make_property('archives', dict)  # name -> dict
-    timestamp = PropDict._make_property('timestamp', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
+    archives = PropDict._make_property('archives', dict, 'dict of str -> dict')  # name -> dict
+    timestamp = PropDict._make_property('timestamp', str)
     config = PropDict._make_property('config', dict)
-    item_keys = PropDict._make_property('item_keys', tuple)
+    item_keys = PropDict._make_property('item_keys', tuple, 'tuple of str')
+
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k == 'version':
+                assert isinstance(v, int)
+            if k == 'archives':
+                ad = v
+                assert isinstance(ad, dict)
+                for ak, av in list(ad.items()):
+                    ak = fix_key(ad, ak, errors='surrogateescape')
+                    assert isinstance(av, dict)
+                    for ik, iv in list(av.items()):
+                        ik = fix_key(av, ik)
+                        if ik == 'id':
+                            fix_bytes_value(av, 'id')
+                        if ik == 'time':
+                            fix_str_value(av, 'time')
+                    assert set(av) == {'id', 'time'}
+            if k == 'timestamp':
+                v = fix_str_value(d, k, 'replace')
+            if k == 'config':
+                cd = v
+                assert isinstance(cd, dict)
+                for ck, cv in list(cd.items()):
+                    ck = fix_key(cd, ck)
+                    if ck == 'tam_required':
+                        assert isinstance(cv, bool)
+                    if ck == 'feature_flags':
+                        assert isinstance(cv, dict)
+                        ops = {'read', 'check', 'write', 'delete'}
+                        for op, specs in list(cv.items()):
+                            op = fix_key(cv, op)
+                            assert op in ops
+                            for speck, specv in list(specs.items()):
+                                speck = fix_key(specs, speck)
+                                if speck == 'mandatory':
+                                    specs[speck] = fix_tuple_of_str(specv)
+                        assert set(cv).issubset(ops)
+            if k == 'item_keys':
+                v = fix_tuple_of_str(v)
+            self._dict[k] = v
+
 
 class ItemDiff:
     """

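The fix_* helpers above normalize legacy unpacked values in place; fix_timestamp, for instance,
accepts all three historical shapes. A self-contained sketch mirroring the hunk (assuming
msgpack-python's Timestamp):

.. code-block:: python

    from msgpack import Timestamp

    def fix_timestamp(v):
        if isinstance(v, Timestamp):  # current format
            return v
        if isinstance(v, bytes):      # legacy bigint: little-endian signed bytes
            v = int.from_bytes(v, 'little', signed=True)
        assert isinstance(v, int)     # legacy plain int nanoseconds
        return Timestamp.from_unix_nano(v)

    ns = 1_650_000_000_000_000_000
    legacy = ns.to_bytes(9, 'little', signed=True)
    assert fix_timestamp(legacy).to_unix_nano() == ns
    assert fix_timestamp(ns).to_unix_nano() == ns
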
+ 43 - 49
src/borg/remote.py

@@ -38,7 +38,7 @@ logger = create_logger(__name__)
 
 RPC_PROTOCOL_VERSION = 2
 BORG_VERSION = parse_version(__version__)
-MSGID, MSG, ARGS, RESULT = b'i', b'm', b'a', b'r'
+MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r'
 
 MAX_INFLIGHT = 100
 
@@ -138,10 +138,6 @@ compatMap = {
 }
 
 
-def decode_keys(d):
-    return {k.decode(): d[k] for k in d}
-
-
 class RepositoryServer:  # pragma: no cover
     rpc_methods = (
         '__len__',
@@ -217,13 +213,12 @@ class RepositoryServer:  # pragma: no cover
                     if isinstance(unpacked, dict):
                         dictFormat = True
                         msgid = unpacked[MSGID]
-                        method = unpacked[MSG].decode()
-                        args = decode_keys(unpacked[ARGS])
+                        method = unpacked[MSG]
+                        args = unpacked[ARGS]
                     elif isinstance(unpacked, tuple) and len(unpacked) == 4:
                         dictFormat = False
                         # The first field 'type' was always 1 and has always been ignored
                         _, msgid, method, args = unpacked
-                        method = method.decode()
                         args = self.positional_to_named(method, args)
                     else:
                         if self.repository is not None:
@@ -256,21 +251,21 @@ class RepositoryServer:  # pragma: no cover
 
                             try:
                                 msg = msgpack.packb({MSGID: msgid,
-                                                    b'exception_class': e.__class__.__name__,
-                                                    b'exception_args': e.args,
-                                                    b'exception_full': ex_full,
-                                                    b'exception_short': ex_short,
-                                                    b'exception_trace': ex_trace,
-                                                    b'sysinfo': sysinfo()})
+                                                    'exception_class': e.__class__.__name__,
+                                                    'exception_args': e.args,
+                                                    'exception_full': ex_full,
+                                                    'exception_short': ex_short,
+                                                    'exception_trace': ex_trace,
+                                                    'sysinfo': sysinfo()})
                             except TypeError:
                                 msg = msgpack.packb({MSGID: msgid,
-                                                    b'exception_class': e.__class__.__name__,
-                                                    b'exception_args': [x if isinstance(x, (str, bytes, int)) else None
-                                                                        for x in e.args],
-                                                    b'exception_full': ex_full,
-                                                    b'exception_short': ex_short,
-                                                    b'exception_trace': ex_trace,
-                                                    b'sysinfo': sysinfo()})
+                                                    'exception_class': e.__class__.__name__,
+                                                    'exception_args': [x if isinstance(x, (str, bytes, int)) else None
+                                                                       for x in e.args],
+                                                    'exception_full': ex_full,
+                                                    'exception_short': ex_short,
+                                                    'exception_trace': ex_trace,
+                                                    'sysinfo': sysinfo()})
 
                             os_write(stdout_fd, msg)
                         else:
@@ -307,7 +302,7 @@ class RepositoryServer:  # pragma: no cover
         # clients since 1.1.0b3 use a dict as client_data
         # clients since 1.1.0b6 support json log format from server
         if isinstance(client_data, dict):
-            self.client_version = client_data[b'client_version']
+            self.client_version = client_data['client_version']
             level = logging.getLevelName(logging.getLogger('').level)
             setup_logging(is_serve=True, json=True, level=level)
             logger.debug('Initialized logging system for JSON-based protocol')
@@ -369,7 +364,6 @@ class RepositoryServer:  # pragma: no cover
         return self.repository.id
 
     def inject_exception(self, kind):
-        kind = kind.decode()
         s1 = 'test string'
         s2 = 'test string2'
         if kind == 'DoesNotExist':
@@ -483,35 +477,35 @@ class RemoteRepository:
 
     class RPCError(Exception):
         def __init__(self, unpacked):
-            # for borg < 1.1: unpacked only has b'exception_class' as key
-            # for borg 1.1+: unpacked has keys: b'exception_args', b'exception_full', b'exception_short', b'sysinfo'
+            # for borg < 1.1: unpacked only has 'exception_class' as key
+            # for borg 1.1+: unpacked has keys: 'exception_args', 'exception_full', 'exception_short', 'sysinfo'
             self.unpacked = unpacked
 
         def get_message(self):
-            if b'exception_short' in self.unpacked:
-                return b'\n'.join(self.unpacked[b'exception_short']).decode()
+            if 'exception_short' in self.unpacked:
+                return '\n'.join(self.unpacked['exception_short'])
             else:
                 return self.exception_class
 
         @property
         def traceback(self):
-            return self.unpacked.get(b'exception_trace', True)
+            return self.unpacked.get('exception_trace', True)
 
         @property
         def exception_class(self):
-            return self.unpacked[b'exception_class'].decode()
+            return self.unpacked['exception_class']
 
         @property
         def exception_full(self):
-            if b'exception_full' in self.unpacked:
-                return b'\n'.join(self.unpacked[b'exception_full']).decode()
+            if 'exception_full' in self.unpacked:
+                return '\n'.join(self.unpacked['exception_full'])
             else:
                 return self.get_message() + '\nRemote Exception (see remote log for the traceback)'
 
         @property
         def sysinfo(self):
-            if b'sysinfo' in self.unpacked:
-                return self.unpacked[b'sysinfo'].decode()
+            if 'sysinfo' in self.unpacked:
+                return self.unpacked['sysinfo']
             else:
                 return ''
 
@@ -570,15 +564,15 @@ class RemoteRepository:
         try:
             try:
                 version = self.call('negotiate', {'client_data': {
-                    b'client_version': BORG_VERSION,
+                    'client_version': BORG_VERSION,
                 }})
             except ConnectionClosed:
                 raise ConnectionClosedWithHint('Is borg working on the server?') from None
             if version == RPC_PROTOCOL_VERSION:
                 self.dictFormat = False
-            elif isinstance(version, dict) and b'server_version' in version:
+            elif isinstance(version, dict) and 'server_version' in version:
                 self.dictFormat = True
-                self.server_version = version[b'server_version']
+                self.server_version = version['server_version']
             else:
                 raise Exception('Server insisted on using unsupported protocol version %s' % version)
 
@@ -733,9 +727,9 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
             return msgid
 
         def handle_error(unpacked):
-            error = unpacked[b'exception_class'].decode()
-            old_server = b'exception_args' not in unpacked
-            args = unpacked.get(b'exception_args')
+            error = unpacked['exception_class']
+            old_server = 'exception_args' not in unpacked
+            args = unpacked.get('exception_args')
 
             if error == 'DoesNotExist':
                 raise Repository.DoesNotExist(self.location.processed)
@@ -747,29 +741,29 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
                 if old_server:
                     raise IntegrityError('(not available)')
                 else:
-                    raise IntegrityError(args[0].decode())
+                    raise IntegrityError(args[0])
             elif error == 'AtticRepository':
                 if old_server:
                     raise Repository.AtticRepository('(not available)')
                 else:
-                    raise Repository.AtticRepository(args[0].decode())
+                    raise Repository.AtticRepository(args[0])
             elif error == 'PathNotAllowed':
                 if old_server:
                     raise PathNotAllowed('(unknown)')
                 else:
-                    raise PathNotAllowed(args[0].decode())
+                    raise PathNotAllowed(args[0])
             elif error == 'ParentPathDoesNotExist':
-                raise Repository.ParentPathDoesNotExist(args[0].decode())
+                raise Repository.ParentPathDoesNotExist(args[0])
             elif error == 'ObjectNotFound':
                 if old_server:
                     raise Repository.ObjectNotFound('(not available)', self.location.processed)
                 else:
-                    raise Repository.ObjectNotFound(args[0].decode(), self.location.processed)
+                    raise Repository.ObjectNotFound(args[0], self.location.processed)
             elif error == 'InvalidRPCMethod':
                 if old_server:
                     raise InvalidRPCMethod('(not available)')
                 else:
-                    raise InvalidRPCMethod(args[0].decode())
+                    raise InvalidRPCMethod(args[0])
             else:
                 raise self.RPCError(unpacked)
 
@@ -788,7 +782,7 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
                 try:
                     unpacked = self.responses.pop(waiting_for[0])
                     waiting_for.pop(0)
-                    if b'exception_class' in unpacked:
+                    if 'exception_class' in unpacked:
                         handle_error(unpacked)
                     else:
                         yield unpacked[RESULT]
@@ -808,7 +802,7 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
                         else:
                             return
                     else:
-                        if b'exception_class' in unpacked:
+                        if 'exception_class' in unpacked:
                             handle_error(unpacked)
                         else:
                             yield unpacked[RESULT]
@@ -834,7 +828,7 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
                             _, msgid, error, res = unpacked
                             if error:
                                 # ignore res, because it is only a fixed string anyway.
-                                unpacked = {MSGID: msgid, b'exception_class': error}
+                                unpacked = {MSGID: msgid, 'exception_class': error}
                             else:
                                 unpacked = {MSGID: msgid, RESULT: res}
                         else:
@@ -842,7 +836,7 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
                         if msgid in self.ignore_responses:
                             self.ignore_responses.remove(msgid)
                             # async methods never return values, but may raise exceptions.
-                            if b'exception_class' in unpacked:
+                            if 'exception_class' in unpacked:
                                 self.async_responses[msgid] = unpacked
                             else:
                                 # we currently do not have async result values except "None",

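Editor's note: dropping decode_keys() and the scattered .decode() calls works because msgpack-python >= 1.0 unpacks str data back to str by default (raw=False). A small round-trip sketch with msgpack-python used directly (borg itself goes through its helpers.msgpack wrapper):

    import msgpack

    msg = {'i': 42, 'm': 'negotiate', 'a': {'client_version': [2, 0, 0]}}
    packed = msgpack.packb(msg)
    assert msgpack.unpackb(packed) == msg  # raw=False is the default: keys and values stay str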
+ 22 - 22
src/borg/repository.py

@@ -516,16 +516,16 @@ class Repository:
                 integrity = msgpack.unpack(fd)
         except FileNotFoundError:
             return
-        if integrity.get(b'version') != 2:
-            logger.warning('Unknown integrity data version %r in %s', integrity.get(b'version'), integrity_file)
+        if integrity.get('version') != 2:
+            logger.warning('Unknown integrity data version %r in %s', integrity.get('version'), integrity_file)
             return
-        return integrity[key].decode()
+        return integrity[key]
 
     def open_index(self, transaction_id, auto_recover=True):
         if transaction_id is None:
             return NSIndex()
         index_path = os.path.join(self.path, 'index.%d' % transaction_id)
-        integrity_data = self._read_integrity(transaction_id, b'index')
+        integrity_data = self._read_integrity(transaction_id, 'index')
         try:
             with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd:
                 return NSIndex.read(fd)
@@ -575,7 +575,7 @@ class Repository:
                 self.io.cleanup(transaction_id)
             hints_path = os.path.join(self.path, 'hints.%d' % transaction_id)
             index_path = os.path.join(self.path, 'index.%d' % transaction_id)
-            integrity_data = self._read_integrity(transaction_id, b'hints')
+            integrity_data = self._read_integrity(transaction_id, 'hints')
             try:
                 with IntegrityCheckedFile(hints_path, write=False, integrity_data=integrity_data) as fd:
                     hints = msgpack.unpack(fd)
@@ -588,23 +588,23 @@ class Repository:
                 self.check_transaction()
                 self.prepare_txn(transaction_id)
                 return
-            if hints[b'version'] == 1:
+            if hints['version'] == 1:
                 logger.debug('Upgrading from v1 hints.%d', transaction_id)
-                self.segments = hints[b'segments']
+                self.segments = hints['segments']
                 self.compact = FreeSpace()
                 self.storage_quota_use = 0
                 self.shadow_index = {}
-                for segment in sorted(hints[b'compact']):
+                for segment in sorted(hints['compact']):
                     logger.debug('Rebuilding sparse info for segment %d', segment)
                     self._rebuild_sparse(segment)
                 logger.debug('Upgrade to v2 hints complete')
-            elif hints[b'version'] != 2:
-                raise ValueError('Unknown hints file version: %d' % hints[b'version'])
+            elif hints['version'] != 2:
+                raise ValueError('Unknown hints file version: %d' % hints['version'])
             else:
-                self.segments = hints[b'segments']
-                self.compact = FreeSpace(hints[b'compact'])
-                self.storage_quota_use = hints.get(b'storage_quota_use', 0)
-                self.shadow_index = hints.get(b'shadow_index', {})
+                self.segments = hints['segments']
+                self.compact = FreeSpace(hints['compact'])
+                self.storage_quota_use = hints.get('storage_quota_use', 0)
+                self.shadow_index = hints.get('shadow_index', {})
             self.log_storage_quota()
             # Drop uncommitted segments in the shadow index
             for key, shadowed_segments in self.shadow_index.items():
@@ -621,16 +621,16 @@ class Repository:
             os.rename(file + '.tmp', file)
 
         hints = {
-            b'version': 2,
-            b'segments': self.segments,
-            b'compact': self.compact,
-            b'storage_quota_use': self.storage_quota_use,
-            b'shadow_index': self.shadow_index,
+            'version': 2,
+            'segments': self.segments,
+            'compact': self.compact,
+            'storage_quota_use': self.storage_quota_use,
+            'shadow_index': self.shadow_index,
         }
         integrity = {
             # Integrity version started at 2, the current hints version.
             # Thus, integrity version == hints version, for now.
-            b'version': 2,
+            'version': 2,
         }
         transaction_id = self.io.get_segments_transaction_id()
         assert transaction_id is not None
@@ -647,7 +647,7 @@ class Repository:
         with IntegrityCheckedFile(hints_file + '.tmp', filename=hints_name, write=True) as fd:
             msgpack.pack(hints, fd)
             flush_and_sync(fd)
-        integrity[b'hints'] = fd.integrity_data
+        integrity['hints'] = fd.integrity_data
 
         # Write repository index
         index_name = 'index.%d' % transaction_id
@@ -656,7 +656,7 @@ class Repository:
             # XXX: Consider using SyncFile for index write-outs.
             self.index.write(fd)
             flush_and_sync(fd)
-        integrity[b'index'] = fd.integrity_data
+        integrity['index'] = fd.integrity_data
 
         # Write integrity file, containing checksums of the hints and index files
         integrity_name = 'integrity.%d' % transaction_id

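Editor's note: the hints dict keeps int map keys (segment numbers), which msgpack-python >= 1.0 rejects on unpack unless strict_map_key=False; borg's msgpack wrapper is expected to allow this, and a direct round-trip sketch needs the flag set explicitly:

    import io
    import msgpack

    hints = {'version': 2, 'segments': {0: 1, 1: 3}, 'compact': {}}
    buf = io.BytesIO(msgpack.packb(hints))
    loaded = msgpack.unpack(buf, strict_map_key=False)  # int map keys need this flag
    assert loaded['version'] == 2 and loaded['segments'] == {0: 1, 1: 3}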
+ 14 - 14
src/borg/testsuite/archive.py

@@ -171,7 +171,7 @@ class RobustUnpackerTestCase(BaseTestCase):
         return b''.join(msgpack.packb({'path': item}) for item in items)
 
     def _validator(self, value):
-        return isinstance(value, dict) and value.get(b'path') in (b'foo', b'bar', b'boo', b'baz')
+        return isinstance(value, dict) and value.get('path') in ('foo', 'bar', 'boo', 'baz')
 
     def process(self, input):
         unpacker = RobustUnpacker(validator=self._validator, item_keys=ITEM_KEYS)
@@ -186,14 +186,14 @@ class RobustUnpackerTestCase(BaseTestCase):
         return result
 
     def test_extra_garbage_no_sync(self):
-        chunks = [(False, [self.make_chunks([b'foo', b'bar'])]),
-                  (False, [b'garbage'] + [self.make_chunks([b'boo', b'baz'])])]
+        chunks = [(False, [self.make_chunks(['foo', 'bar'])]),
+                  (False, [b'garbage'] + [self.make_chunks(['boo', 'baz'])])]
         result = self.process(chunks)
         self.assert_equal(result, [
-            {b'path': b'foo'}, {b'path': b'bar'},
+            {'path': 'foo'}, {'path': 'bar'},
             103, 97, 114, 98, 97, 103, 101,
-            {b'path': b'boo'},
-            {b'path': b'baz'}])
+            {'path': 'boo'},
+            {'path': 'baz'}])
 
     def split(self, left, length):
         parts = []
@@ -203,22 +203,22 @@ class RobustUnpackerTestCase(BaseTestCase):
         return parts
 
     def test_correct_stream(self):
-        chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 2)
+        chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 2)
         input = [(False, chunks)]
         result = self.process(input)
-        self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'bar'}, {b'path': b'boo'}, {b'path': b'baz'}])
+        self.assert_equal(result, [{'path': 'foo'}, {'path': 'bar'}, {'path': 'boo'}, {'path': 'baz'}])
 
     def test_missing_chunk(self):
-        chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 4)
+        chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4)
         input = [(False, chunks[:3]), (True, chunks[4:])]
         result = self.process(input)
-        self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}])
+        self.assert_equal(result, [{'path': 'foo'}, {'path': 'boo'}, {'path': 'baz'}])
 
     def test_corrupt_chunk(self):
-        chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 4)
+        chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4)
         input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])]
         result = self.process(input)
-        self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}])
+        self.assert_equal(result, [{'path': 'foo'}, {'path': 'boo'}, {'path': 'baz'}])
 
 
 @pytest.fixture
@@ -242,7 +242,7 @@ IK = sorted(list(ITEM_KEYS))
 
 @pytest.mark.parametrize('packed',
     [msgpack.packb(o) for o in [
-        {b'path': b'/a/b/c'},  # small (different msgpack mapping type!)
+        {'path': b'/a/b/c'},  # small (different msgpack mapping type!)
         OrderedDict((k, b'') for k in IK),  # as big (key count) as it gets
         OrderedDict((k, b'x' * 1000) for k in IK),  # as big (key count and volume) as it gets
     ]])
@@ -251,7 +251,7 @@ def test_valid_msgpacked_items(packed, item_keys_serialized):
 
 
 def test_key_length_msgpacked_items():
-    key = b'x' * 32  # 31 bytes is the limit for fixstr msgpack type
+    key = 'x' * 32  # 31 bytes is the limit for fixstr msgpack type
     data = {key: b''}
     item_keys_serialized = [msgpack.packb(key), ]
     assert valid_msgpacked_dict(msgpack.packb(data), item_keys_serialized)

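Editor's note: the 'x' * 32 key in the last test is one byte past the msgpack fixstr limit, as the comment says: fixstr encodes up to 31 bytes of string with the length in the type byte (0xa0 | length); anything longer switches to a str8 header. Quick check:

    import msgpack

    assert msgpack.packb('x' * 31)[0] == 0xa0 | 31      # fixstr: length fits in the type byte
    assert msgpack.packb('x' * 32)[:2] == b'\xd9\x20'   # str8 marker + 1-byte length (32)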
+ 10 - 10
src/borg/testsuite/archiver.py

@@ -1810,7 +1810,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
     def add_unknown_feature(self, operation):
         with Repository(self.repository_path, exclusive=True) as repository:
             manifest, key = Manifest.load(repository, Manifest.NO_OPERATION_CHECK)
-            manifest.config[b'feature_flags'] = {operation.value.encode(): {b'mandatory': [b'unknown-feature']}}
+            manifest.config['feature_flags'] = {operation.value: {'mandatory': ['unknown-feature']}}
             manifest.write()
             repository.commit(compact=False)
 
@@ -3623,14 +3623,14 @@ id: 2 / e29442 3506da 4e1ea7 / 25f62a 5a3d41 - 02
         self.cmd('init', '--encryption=repokey', self.repository_location)
         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-        assert key[b'algorithm'] == b'argon2 chacha20-poly1305'
+        assert key['algorithm'] == 'argon2 chacha20-poly1305'
 
     def test_init_with_explicit_key_algorithm(self):
         """https://github.com/borgbackup/borg/issues/747#issuecomment-1076160401"""
         self.cmd('init', '--encryption=repokey', '--key-algorithm=pbkdf2', self.repository_location)
         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-        assert key[b'algorithm'] == b'sha256'
+        assert key['algorithm'] == 'sha256'
 
     def verify_change_passphrase_does_not_change_algorithm(self, given_algorithm, expected_algorithm):
         self.cmd('init', '--encryption=repokey', '--key-algorithm', given_algorithm, self.repository_location)
@@ -3640,13 +3640,13 @@ id: 2 / e29442 3506da 4e1ea7 / 25f62a 5a3d41 - 02
 
         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-            assert key[b'algorithm'] == expected_algorithm
+            assert key['algorithm'] == expected_algorithm
 
     def test_change_passphrase_does_not_change_algorithm_argon2(self):
-        self.verify_change_passphrase_does_not_change_algorithm('argon2', b'argon2 chacha20-poly1305')
+        self.verify_change_passphrase_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305')
 
     def test_change_passphrase_does_not_change_algorithm_pbkdf2(self):
-        self.verify_change_passphrase_does_not_change_algorithm('pbkdf2', b'sha256')
+        self.verify_change_passphrase_does_not_change_algorithm('pbkdf2', 'sha256')
 
     def verify_change_location_does_not_change_algorithm(self, given_algorithm, expected_algorithm):
         self.cmd('init', '--encryption=keyfile', '--key-algorithm', given_algorithm, self.repository_location)
@@ -3655,13 +3655,13 @@ id: 2 / e29442 3506da 4e1ea7 / 25f62a 5a3d41 - 02
 
         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-            assert key[b'algorithm'] == expected_algorithm
+            assert key['algorithm'] == expected_algorithm
 
     def test_change_location_does_not_change_algorithm_argon2(self):
-        self.verify_change_location_does_not_change_algorithm('argon2', b'argon2 chacha20-poly1305')
+        self.verify_change_location_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305')
 
     def test_change_location_does_not_change_algorithm_pbkdf2(self):
-        self.verify_change_location_does_not_change_algorithm('pbkdf2', b'sha256')
+        self.verify_change_location_does_not_change_algorithm('pbkdf2', 'sha256')
 
     def test_key_change_algorithm(self):
         self.cmd('init', '--encryption=repokey', '--key-algorithm=pbkdf2', self.repository_location)
@@ -3969,7 +3969,7 @@ class ManifestAuthenticationTest(ArchiverTestCaseBase):
             key.change_passphrase(key._passphrase)
 
             manifest = msgpack.unpackb(key.decrypt(Manifest.MANIFEST_ID, repository.get(Manifest.MANIFEST_ID)))
-            del manifest[b'tam']
+            del manifest['tam']
             repository.put(Manifest.MANIFEST_ID, key.encrypt(Manifest.MANIFEST_ID, msgpack.packb(manifest)))
             repository.commit(compact=False)
         output = self.cmd('list', '--debug', self.repository_location)

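Editor's note: these assertions all follow the same pattern: the stored key blob is base64-encoded msgpack, and once unpacked its 'algorithm' value is now a str. A self-contained sketch with an illustrative (not complete) key layout:

    from binascii import a2b_base64, b2a_base64
    import msgpack

    key_meta = {'version': 1, 'algorithm': 'argon2 chacha20-poly1305'}  # illustrative fields only
    saved = b2a_base64(msgpack.packb(key_meta))
    assert msgpack.unpackb(a2b_base64(saved))['algorithm'] == 'argon2 chacha20-poly1305'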
+ 1 - 13
src/borg/testsuite/helpers.py

@@ -22,7 +22,7 @@ from ..helpers import get_base_dir, get_cache_dir, get_keys_dir, get_security_di
 from ..helpers import is_slow_msgpack
 from ..helpers import msgpack
 from ..helpers import yes, TRUISH, FALSISH, DEFAULTISH
-from ..helpers import StableDict, int_to_bigint, bigint_to_int, bin_to_hex
+from ..helpers import StableDict, bin_to_hex
 from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams
 from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless
 from ..helpers import swidth_slice
@@ -38,18 +38,6 @@ from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
 from . import BaseTestCase, FakeInputs
 
 
-class BigIntTestCase(BaseTestCase):
-
-    def test_bigint(self):
-        self.assert_equal(int_to_bigint(0), 0)
-        self.assert_equal(int_to_bigint(2**63-1), 2**63-1)
-        self.assert_equal(int_to_bigint(-2**63+1), -2**63+1)
-        self.assert_equal(int_to_bigint(2**63), b'\x00\x00\x00\x00\x00\x00\x00\x80\x00')
-        self.assert_equal(int_to_bigint(-2**63), b'\x00\x00\x00\x00\x00\x00\x00\x80\xff')
-        self.assert_equal(bigint_to_int(int_to_bigint(-2**70)), -2**70)
-        self.assert_equal(bigint_to_int(int_to_bigint(2**70)), 2**70)
-
-
 def test_bin_to_hex():
     assert bin_to_hex(b'') == ''
     assert bin_to_hex(b'\x00\x01\xff') == '0001ff'

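Editor's note: the deleted BigIntTestCase covered int_to_bigint/bigint_to_int, which existed because msgpack integers are capped at 64 bits; out-of-range values were stored as little-endian signed bytes. Reconstructed from the removed assertions (a sketch, not necessarily the exact removed code):

    def int_to_bigint(value):
        """Values beyond the 64-bit signed range become little-endian signed bytes."""
        if value.bit_length() > 63:
            return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True)
        return value

    def bigint_to_int(value):
        if isinstance(value, bytes):
            return int.from_bytes(value, 'little', signed=True)
        return value

    assert int_to_bigint(2 ** 63 - 1) == 2 ** 63 - 1            # fits, passed through
    assert int_to_bigint(2 ** 63) == b'\x00' * 7 + b'\x80\x00'  # 9 bytes, little-endian
    assert bigint_to_int(int_to_bigint(-2 ** 70)) == -2 ** 70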
+ 3 - 11
src/borg/testsuite/item.py

@@ -89,20 +89,12 @@ def test_item_mptimestamp_property():
     assert item.as_dict() == {'atime': Timestamp.from_unix_nano(big)}
 
 
-def test_item_user_group_none():
-    item = Item()
-    item.user = None
-    assert item.user is None
-    item.group = None
-    assert item.group is None
-
-
 def test_item_se_str_property():
     # start simple
     item = Item()
     item.path = '/a/b/c'
     assert item.path == '/a/b/c'
-    assert item.as_dict() == {'path': b'/a/b/c'}
+    assert item.as_dict() == {'path': '/a/b/c'}
     del item.path
     assert item.as_dict() == {}
     with pytest.raises(TypeError):
@@ -111,11 +103,11 @@ def test_item_se_str_property():
     # non-utf-8 path, needing surrogate-escaping for latin-1 u-umlaut
     item = Item(internal_dict={'path': b'/a/\xfc/c'})
     assert item.path == '/a/\udcfc/c'  # getting a surrogate-escaped representation
-    assert item.as_dict() == {'path': b'/a/\xfc/c'}
+    assert item.as_dict() == {'path': '/a/\udcfc/c'}
     del item.path
     assert 'path' not in item
     item.path = '/a/\udcfc/c'  # setting using a surrogate-escaped representation
-    assert item.as_dict() == {'path': b'/a/\xfc/c'}
+    assert item.as_dict() == {'path': '/a/\udcfc/c'}
 
 
 def test_item_list_property():

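Editor's note: the changed expectations reflect that paths now serialize as str, with non-UTF-8 bytes carried as surrogate escapes (which the surrogate-escaped expectations above imply borg's msgpack layer handles via unicode_errors='surrogateescape'). Round-trip sketch with plain msgpack-python:

    import msgpack

    raw = b'/a/\xfc/c'  # latin-1 u-umlaut, not valid UTF-8
    path = raw.decode('utf-8', 'surrogateescape')
    assert path == '/a/\udcfc/c'
    packed = msgpack.packb(path, unicode_errors='surrogateescape')
    assert msgpack.unpackb(packed, unicode_errors='surrogateescape') == path
    assert path.encode('utf-8', 'surrogateescape') == raw  # original bytes recoverable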
+ 8 - 8
src/borg/testsuite/key.py

@@ -360,23 +360,23 @@ class TestTAM:
         assert blob.startswith(b'\x82')
 
         unpacked = msgpack.unpackb(blob)
-        assert unpacked[b'tam'][b'type'] == b'HKDF_HMAC_SHA512'
+        assert unpacked['tam']['type'] == 'HKDF_HMAC_SHA512'
 
         unpacked, verified = key.unpack_and_verify_manifest(blob)
         assert verified
-        assert unpacked[b'foo'] == b'bar'
-        assert b'tam' not in unpacked
+        assert unpacked['foo'] == 'bar'
+        assert 'tam' not in unpacked
 
-    @pytest.mark.parametrize('which', (b'hmac', b'salt'))
+    @pytest.mark.parametrize('which', ('hmac', 'salt'))
     def test_tampered(self, key, which):
         data = {'foo': 'bar'}
         blob = key.pack_and_authenticate_metadata(data)
         assert blob.startswith(b'\x82')
 
         unpacked = msgpack.unpackb(blob, object_hook=StableDict)
-        assert len(unpacked[b'tam'][which]) == 64
-        unpacked[b'tam'][which] = unpacked[b'tam'][which][0:32] + bytes(32)
-        assert len(unpacked[b'tam'][which]) == 64
+        assert len(unpacked['tam'][which]) == 64
+        unpacked['tam'][which] = unpacked['tam'][which][0:32] + bytes(32)
+        assert len(unpacked['tam'][which]) == 64
         blob = msgpack.packb(unpacked)
 
         with pytest.raises(TAMInvalid):
@@ -421,4 +421,4 @@ def test_key_file_roundtrip(monkeypatch, cli_argument, expected_algorithm):
     load_me = RepoKey.detect(repository, manifest_data=None)
 
     assert to_dict(load_me) == to_dict(save_me)
-    assert msgpack.unpackb(a2b_base64(saved))[b'algorithm'] == expected_algorithm.encode()
+    assert msgpack.unpackb(a2b_base64(saved))['algorithm'] == expected_algorithm

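Editor's note: object_hook=StableDict above makes unpackb build every mapping as a StableDict, so re-packing after tampering is deterministic. The test flow in miniature, using plain dict as a stand-in for StableDict:

    import msgpack

    tam = {'type': 'HKDF_HMAC_SHA512', 'hmac': bytes(range(64)), 'salt': bytes(range(64))}
    blob = msgpack.packb({'foo': 'bar', 'tam': tam})
    unpacked = msgpack.unpackb(blob, object_hook=dict)  # borg passes StableDict here
    assert unpacked['tam']['type'] == 'HKDF_HMAC_SHA512'
    unpacked['tam']['hmac'] = unpacked['tam']['hmac'][:32] + bytes(32)  # zero the tail
    assert msgpack.packb(unpacked) != blob  # verification of the re-packed blob would fail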
+ 2 - 2
src/borg/testsuite/repository.py

@@ -655,8 +655,8 @@ class RepositoryAuxiliaryCorruptionTestCase(RepositoryTestCaseBase):
             hints = msgpack.unpack(fd)
             fd.seek(0)
             # Corrupt segment refcount
-            assert hints[b'segments'][2] == 1
-            hints[b'segments'][2] = 0
+            assert hints['segments'][2] == 1
+            hints['segments'][2] = 0
             msgpack.pack(hints, fd)
             fd.truncate()
 

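Editor's note: the corruption test edits the hints file in place: unpack, mutate, seek back, re-pack, truncate. The same pattern on an in-memory buffer (strict_map_key=False because segment numbers are int keys):

    import io
    import msgpack

    fd = io.BytesIO(msgpack.packb({'segments': {2: 1}}))
    hints = msgpack.unpack(fd, strict_map_key=False)
    hints['segments'][2] = 0  # corrupt the refcount
    fd.seek(0)
    msgpack.pack(hints, fd)
    fd.truncate()
    assert msgpack.unpack(io.BytesIO(fd.getvalue()), strict_map_key=False) == {'segments': {2: 0}}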
+ 2 - 6
src/borg/xattr.py

@@ -76,9 +76,7 @@ def get_all(path, follow_symlinks=False):
         for name in names:
             try:
                 # xattr name is a bytes object, we directly use it.
-                # if we get an empty xattr value (b''), we store None into the result dict -
-                # borg always did it like that...
-                result[name] = getxattr(path, name, follow_symlinks=follow_symlinks) or None
+                result[name] = getxattr(path, name, follow_symlinks=follow_symlinks)
             except OSError as e:
                 name_str = name.decode()
                 if isinstance(path, int):
@@ -122,9 +120,7 @@ def set_all(path, xattrs, follow_symlinks=False):
     warning = False
     for k, v in xattrs.items():
         try:
-            # the key k is a bytes object due to msgpack unpacking it as such.
-            # if we have a None value, it means "empty", so give b'' to setxattr in that case:
-            setxattr(path, k, v or b'', follow_symlinks=follow_symlinks)
+            setxattr(path, k, v, follow_symlinks=follow_symlinks)
         except OSError as e:
             warning = True
             k_str = k.decode()
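
Editor's note: the xattr change removes an asymmetric special case: an empty value b'' used to be mapped to None on read and back to b'' on write. After the cleanup, the value passes through unchanged. The old vs. new mapping in isolation:

    xattrs = {b'user.comment': b'hello', b'user.empty': b''}

    old_read = {k: (v or None) for k, v in xattrs.items()}  # old: b'' -> None
    assert old_read[b'user.empty'] is None

    new_read = dict(xattrs)                                  # new: b'' stays b''
    assert new_read[b'user.empty'] == b''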