Browse code

cleanup msgpack related str/bytes mess, see #968

see ticket and borg.helpers.msgpack docstring.
Thomas Waldmann 3 years ago
parent
commit
f8dbe5b542

+ 7 - 9
src/borg/archive.py

@@ -34,7 +34,7 @@ from .helpers import Error, IntegrityError, set_ec
 from .platform import uid2user, user2uid, gid2group, group2gid
 from .helpers import parse_timestamp, to_localtime
 from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize
-from .helpers import safe_encode, safe_decode, make_path_safe, remove_surrogates
+from .helpers import safe_encode, make_path_safe, remove_surrogates
 from .helpers import StableDict
 from .helpers import bin_to_hex
 from .helpers import safe_ns
@@ -479,7 +479,6 @@ class Archive:
     def load(self, id):
         self.id = id
         self.metadata = self._load_meta(self.id)
-        self.metadata.cmdline = [safe_decode(arg) for arg in self.metadata.cmdline]
         self.name = self.metadata.name
         self.comment = self.metadata.get('comment', '')
 
@@ -1515,7 +1514,7 @@ class RobustUnpacker:
     """
     def __init__(self, validator, item_keys):
         super().__init__()
-        self.item_keys = [msgpack.packb(name.encode()) for name in item_keys]
+        self.item_keys = [msgpack.packb(name) for name in item_keys]
         self.validator = validator
         self._buffered_data = []
         self._resync = False
@@ -1734,7 +1733,7 @@ class ArchiveChecker:
         # lost manifest on a older borg version than the most recent one that was ever used
         # within this repository (assuming that newer borg versions support more item keys).
         manifest = Manifest(self.key, self.repository)
-        archive_keys_serialized = [msgpack.packb(name.encode()) for name in ARCHIVE_KEYS]
+        archive_keys_serialized = [msgpack.packb(name) for name in ARCHIVE_KEYS]
         pi = ProgressIndicatorPercent(total=len(self.chunks), msg="Rebuilding manifest %6.2f%%", step=0.01,
                                       msgid='check.rebuild_manifest')
         for chunk_id, _ in self.chunks.iteritems():
@@ -1881,9 +1880,9 @@ class ArchiveChecker:
 
             Missing item chunks will be skipped and the msgpack stream will be restarted
             """
-            item_keys = frozenset(key.encode() for key in self.manifest.item_keys)
-            required_item_keys = frozenset(key.encode() for key in REQUIRED_ITEM_KEYS)
-            unpacker = RobustUnpacker(lambda item: isinstance(item, StableDict) and b'path' in item,
+            item_keys = self.manifest.item_keys
+            required_item_keys = REQUIRED_ITEM_KEYS
+            unpacker = RobustUnpacker(lambda item: isinstance(item, StableDict) and 'path' in item,
                                       self.manifest.item_keys)
             _state = 0
 
@@ -1905,7 +1904,7 @@ class ArchiveChecker:
             def valid_item(obj):
                 if not isinstance(obj, StableDict):
                     return False, 'not a dictionary'
-                keys = set(obj)
+                keys = set(k.decode('utf-8', errors='replace') for k in obj)
                 if not required_item_keys.issubset(keys):
                     return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys)
                 if not keys.issubset(item_keys):
@@ -1991,7 +1990,6 @@ class ArchiveChecker:
                 archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
                 if archive.version != 2:
                     raise Exception('Unknown archive metadata version')
-                archive.cmdline = [safe_decode(arg) for arg in archive.cmdline]
                 items_buffer = ChunkBuffer(self.key)
                 items_buffer.write_chunk = add_callback
                 for item in robust_iterator(archive):

+ 6 - 6
src/borg/archiver.py

@@ -55,7 +55,7 @@ try:
     from .helpers import PrefixSpec, GlobSpec, CommentSpec, SortBySpec, FilesCacheMode
     from .helpers import BaseFormatter, ItemFormatter, ArchiveFormatter
     from .helpers import format_timedelta, format_file_size, parse_file_size, format_archive
-    from .helpers import safe_encode, remove_surrogates, bin_to_hex, prepare_dump_dict, eval_escapes
+    from .helpers import remove_surrogates, bin_to_hex, prepare_dump_dict, eval_escapes
     from .helpers import interval, prune_within, prune_split, PRUNING_PATTERNS
     from .helpers import timestamp
     from .helpers import get_cache_dir, os_stat
@@ -1944,12 +1944,12 @@ class Archiver:
                 print('This repository is not encrypted, cannot enable TAM.')
                 return EXIT_ERROR
 
-            if not manifest.tam_verified or not manifest.config.get(b'tam_required', False):
+            if not manifest.tam_verified or not manifest.config.get('tam_required', False):
                 # The standard archive listing doesn't include the archive ID like in borg 1.1.x
                 print('Manifest contents:')
                 for archive_info in manifest.archives.list(sort_by=['ts']):
                     print(format_archive(archive_info), '[%s]' % bin_to_hex(archive_info.id))
-                manifest.config[b'tam_required'] = True
+                manifest.config['tam_required'] = True
                 manifest.write()
                 repository.commit(compact=False)
             if not key.tam_required:
@@ -1972,7 +1972,7 @@ class Archiver:
                 print('Key updated')
                 if hasattr(key, 'find_key'):
                     print('Key location:', key.find_key())
-            manifest.config[b'tam_required'] = False
+            manifest.config['tam_required'] = False
             manifest.write()
             repository.commit(compact=False)
         else:
@@ -2304,7 +2304,7 @@ class Archiver:
         """dump decoded archive metadata (not: data)"""
 
         try:
-            archive_meta_orig = manifest.archives.get_raw_dict()[safe_encode(args.location.archive)]
+            archive_meta_orig = manifest.archives.get_raw_dict()[args.location.archive]
         except KeyError:
             raise Archive.DoesNotExist(args.location.archive)
 
@@ -2321,7 +2321,7 @@ class Archiver:
             fd.write(do_indent(prepare_dump_dict(archive_meta_orig)))
             fd.write(',\n')
 
-            data = key.decrypt(archive_meta_orig[b'id'], repository.get(archive_meta_orig[b'id']))
+            data = key.decrypt(archive_meta_orig['id'], repository.get(archive_meta_orig['id']))
             archive_org_dict = msgpack.unpackb(data, object_hook=StableDict)
 
             fd.write('    "_meta":\n')

+ 20 - 16
src/borg/cache_sync/unpack.h

@@ -384,19 +384,11 @@ static inline int unpack_callback_map_end(unpack_user* u)
 
 static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int length)
 {
-    /* raw = what Borg uses for binary stuff and strings as well */
+    /* raw = what Borg uses for text stuff */
     /* Note: p points to an internal buffer which contains l bytes. */
     (void)b;
 
     switch(u->expect) {
-    case expect_key:
-        if(length != 32) {
-            SET_LAST_ERROR("Incorrect key length");
-            return -1;
-        }
-        memcpy(u->current.key, p, 32);
-        u->expect = expect_size;
-        break;
     case expect_map_key:
         if(length == 6 && !memcmp("chunks", p, 6)) {
             u->expect = expect_chunks_begin;
@@ -409,19 +401,31 @@ static inline int unpack_callback_raw(unpack_user* u, const char* b, const char*
             u->expect = expect_map_item_end;
         }
         break;
-    default:
-        if(u->inside_chunks) {
-            SET_LAST_ERROR("Unexpected bytes in chunks structure");
-            return -1;
-        }
     }
     return 0;
 }
 
 static inline int unpack_callback_bin(unpack_user* u, const char* b, const char* p, unsigned int length)
 {
-    (void)u; (void)b; (void)p; (void)length;
-    UNEXPECTED("bin");
+    /* bin = what Borg uses for binary stuff */
+    /* Note: p points to an internal buffer which contains l bytes. */
+    (void)b;
+
+    switch(u->expect) {
+    case expect_key:
+        if(length != 32) {
+            SET_LAST_ERROR("Incorrect key length");
+            return -1;
+        }
+        memcpy(u->current.key, p, 32);
+        u->expect = expect_size;
+        break;
+    default:
+        if(u->inside_chunks) {
+            SET_LAST_ERROR("Unexpected bytes in chunks structure");
+            return -1;
+        }
+    }
     return 0;
 }
 

+ 21 - 25
src/borg/helpers/manifest.py

@@ -12,7 +12,7 @@ from ..logger import create_logger
 logger = create_logger()
 
 from .datastruct import StableDict
-from .parseformat import bin_to_hex, safe_encode, safe_decode
+from .parseformat import bin_to_hex
 from .time import parse_timestamp
 from .. import shellpattern
 from ..constants import *  # NOQA
@@ -39,39 +39,35 @@ class Archives(abc.MutableMapping):
     str timestamps or datetime timestamps.
     """
     def __init__(self):
-        # key: encoded archive name, value: dict(b'id': bytes_id, b'time': bytes_iso_ts)
+        # key: str archive name, value: dict('id': bytes_id, 'time': str_iso_ts)
         self._archives = {}
 
     def __len__(self):
         return len(self._archives)
 
     def __iter__(self):
-        return iter(safe_decode(name) for name in self._archives)
+        return iter(self._archives)
 
     def __getitem__(self, name):
         assert isinstance(name, str)
-        _name = safe_encode(name)
-        values = self._archives.get(_name)
+        values = self._archives.get(name)
         if values is None:
             raise KeyError
-        ts = parse_timestamp(values[b'time'].decode())
-        return ArchiveInfo(name=name, id=values[b'id'], ts=ts)
+        ts = parse_timestamp(values['time'])
+        return ArchiveInfo(name=name, id=values['id'], ts=ts)
 
     def __setitem__(self, name, info):
         assert isinstance(name, str)
-        name = safe_encode(name)
         assert isinstance(info, tuple)
         id, ts = info
         assert isinstance(id, bytes)
         if isinstance(ts, datetime):
             ts = ts.replace(tzinfo=None).strftime(ISO_FORMAT)
         assert isinstance(ts, str)
-        ts = ts.encode()
-        self._archives[name] = {b'id': id, b'time': ts}
+        self._archives[name] = {'id': id, 'time': ts}
 
     def __delitem__(self, name):
         assert isinstance(name, str)
-        name = safe_encode(name)
         del self._archives[name]
 
     def list(self, *, glob=None, match_end=r'\Z', sort_by=(), consider_checkpoints=True, first=None, last=None, reverse=False):
@@ -116,8 +112,8 @@ class Archives(abc.MutableMapping):
     def set_raw_dict(self, d):
         """set the dict we get from the msgpack unpacker"""
         for k, v in d.items():
-            assert isinstance(k, bytes)
-            assert isinstance(v, dict) and b'id' in v and b'time' in v
+            assert isinstance(k, str)
+            assert isinstance(v, dict) and 'id' in v and 'time' in v
             self._archives[k] = v
 
     def get_raw_dict(self):
@@ -196,10 +192,10 @@ class Manifest:
         manifest.timestamp = m.get('timestamp')
         manifest.config = m.config
         # valid item keys are whatever is known in the repo or every key we know
-        manifest.item_keys = ITEM_KEYS | frozenset(key.decode() for key in m.get('item_keys', []))
+        manifest.item_keys = ITEM_KEYS | frozenset(m.get('item_keys', []))
 
         if manifest.tam_verified:
-            manifest_required = manifest.config.get(b'tam_required', False)
+            manifest_required = manifest.config.get('tam_required', False)
             security_required = tam_required(repository)
             if manifest_required and not security_required:
                 logger.debug('Manifest is TAM verified and says TAM is required, updating security database...')
@@ -214,32 +210,32 @@ class Manifest:
     def check_repository_compatibility(self, operations):
         for operation in operations:
             assert isinstance(operation, self.Operation)
-            feature_flags = self.config.get(b'feature_flags', None)
+            feature_flags = self.config.get('feature_flags', None)
             if feature_flags is None:
                 return
-            if operation.value.encode() not in feature_flags:
+            if operation.value not in feature_flags:
                 continue
-            requirements = feature_flags[operation.value.encode()]
-            if b'mandatory' in requirements:
-                unsupported = set(requirements[b'mandatory']) - self.SUPPORTED_REPO_FEATURES
+            requirements = feature_flags[operation.value]
+            if 'mandatory' in requirements:
+                unsupported = set(requirements['mandatory']) - self.SUPPORTED_REPO_FEATURES
                 if unsupported:
-                    raise MandatoryFeatureUnsupported([f.decode() for f in unsupported])
+                    raise MandatoryFeatureUnsupported(list(unsupported))
 
     def get_all_mandatory_features(self):
         result = {}
-        feature_flags = self.config.get(b'feature_flags', None)
+        feature_flags = self.config.get('feature_flags', None)
         if feature_flags is None:
             return result
 
         for operation, requirements in feature_flags.items():
-            if b'mandatory' in requirements:
-                result[operation.decode()] = {feature.decode() for feature in requirements[b'mandatory']}
+            if 'mandatory' in requirements:
+                result[operation] = set(requirements['mandatory'])
         return result
 
     def write(self):
         from ..item import ManifestItem
         if self.key.tam_required:
-            self.config[b'tam_required'] = True
+            self.config['tam_required'] = True
         # self.timestamp needs to be strictly monotonically increasing. Clocks often are not set correctly
         if self.timestamp is None:
             self.timestamp = datetime.utcnow().strftime(ISO_FORMAT)

+ 71 - 30
src/borg/helpers/msgpack.py

@@ -1,21 +1,56 @@
+"""
+wrapping msgpack
+================
+
+Due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it -
+to avoid having lots of clutter in the calling code. see tickets #968 and #3632.
+
+Packing
+-------
+- use_bin_type = True (used by borg since borg 1.3)
+  This is used to generate output according to new msgpack 2.0 spec.
+  This cleanly keeps bytes and str types apart.
+
+- use_bin_type = False (used by borg < 1.3)
+  This creates output according to the older msgpack spec.
+  BAD: str and bytes were packed into same "raw" representation.
+
+- unicode_errors = 'surrogateescape'
+  Guess backup applications are one of the rare cases when this needs to be used.
+  It is needed because borg also needs to deal with data that does not cleanly encode/decode using utf-8.
+  There's a lot of crap out there, e.g. in filenames and as a backup tool, we must keep them as good as possible.
+
+Unpacking
+---------
+- raw = True (the old way, used by borg <= 1.3)
+  This is currently still needed to not try to decode "raw" msgpack objects.
+  These could come either from str (new or old msgpack) or bytes (old msgpack).
+  Thus, we basically must know what we want and either keep the bytes we get
+  or decode them to str, if we want str.
+
+- raw = False (the new way)
+  This can be used in future, when we do not have to deal with data any more that was packed the old way.
+  It will then unpack according to the msgpack 2.0 spec format and directly output bytes or str.
+
+- unicode_errors = 'surrogateescape' -> see description above (will be used when raw is False).
+
+As of borg 1.3, we have the first part on the way to fix the msgpack str/bytes mess, #968.
+borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely.
+But from now on, borg only **writes** new data according to the new msgpack spec,
+thus we can complete the fix for #968 in a later borg release.
+
+current way in msgpack terms
+----------------------------
+
+- pack with use_bin_type=True (according to msgpack 2.0 spec)
+- packs str -> raw and bytes -> bin
+- unpack with raw=True (aka "the old way")
+- unpacks raw to bytes (thus we always need to decode manually if we want str)
+"""
+
 from .datastruct import StableDict
 from ..constants import *  # NOQA
 
-# wrapping msgpack ---------------------------------------------------------------------------------------------------
-#
-# due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it -
-# to avoid having lots of clutter in the calling code. see tickets #968 and #3632.
-#
-# Packing
-# -------
-# use_bin_type = False is needed to generate the old msgpack format (not msgpack 2.0 spec) as borg always did.
-# unicode_errors = None is needed because usage of it is deprecated
-#
-# Unpacking
-# ---------
-# raw = True is needed to unpack the old msgpack format to bytes (not str, about the decoding see item.pyx).
-# unicode_errors = None is needed because usage of it is deprecated
-
 from msgpack import Packer as mp_Packer
 from msgpack import packb as mp_packb
 from msgpack import pack as mp_pack
@@ -30,6 +65,10 @@ from msgpack import OutOfData
 
 version = mp_version
 
+USE_BIN_TYPE = True
+RAW = True  # should become False later when we do not need to read old stuff any more
+UNICODE_ERRORS = 'surrogateescape'  # previously done by safe_encode, safe_decode
+
 
 class PackException(Exception):
     """Exception while msgpack packing"""
@@ -40,10 +79,10 @@ class UnpackException(Exception):
 
 
 class Packer(mp_Packer):
-    def __init__(self, *, default=None, unicode_errors=None,
-                 use_single_float=False, autoreset=True, use_bin_type=False,
+    def __init__(self, *, default=None, unicode_errors=UNICODE_ERRORS,
+                 use_single_float=False, autoreset=True, use_bin_type=USE_BIN_TYPE,
                  strict_types=False):
-        assert unicode_errors is None
+        assert unicode_errors == UNICODE_ERRORS
         super().__init__(default=default, unicode_errors=unicode_errors,
                          use_single_float=use_single_float, autoreset=autoreset, use_bin_type=use_bin_type,
                          strict_types=strict_types)
@@ -55,16 +94,16 @@ class Packer(mp_Packer):
             raise PackException(e)
 
 
-def packb(o, *, use_bin_type=False, unicode_errors=None, **kwargs):
-    assert unicode_errors is None
+def packb(o, *, use_bin_type=USE_BIN_TYPE, unicode_errors=UNICODE_ERRORS, **kwargs):
+    assert unicode_errors == UNICODE_ERRORS
     try:
         return mp_packb(o, use_bin_type=use_bin_type, unicode_errors=unicode_errors, **kwargs)
     except Exception as e:
         raise PackException(e)
 
 
-def pack(o, stream, *, use_bin_type=False, unicode_errors=None, **kwargs):
-    assert unicode_errors is None
+def pack(o, stream, *, use_bin_type=USE_BIN_TYPE, unicode_errors=UNICODE_ERRORS, **kwargs):
+    assert unicode_errors == UNICODE_ERRORS
     try:
         return mp_pack(o, stream, use_bin_type=use_bin_type, unicode_errors=unicode_errors, **kwargs)
     except Exception as e:
@@ -72,13 +111,13 @@ def pack(o, stream, *, use_bin_type=False, unicode_errors=None, **kwargs):
 
 
 class Unpacker(mp_Unpacker):
-    def __init__(self, file_like=None, *, read_size=0, use_list=True, raw=True,
+    def __init__(self, file_like=None, *, read_size=0, use_list=True, raw=RAW,
                  object_hook=None, object_pairs_hook=None, list_hook=None,
-                 unicode_errors=None, max_buffer_size=0,
+                 unicode_errors=UNICODE_ERRORS, max_buffer_size=0,
                  ext_hook=ExtType,
                  strict_map_key=False):
-        assert raw is True
-        assert unicode_errors is None
+        assert raw == RAW
+        assert unicode_errors == UNICODE_ERRORS
         kw = dict(file_like=file_like, read_size=read_size, use_list=use_list, raw=raw,
                   object_hook=object_hook, object_pairs_hook=object_pairs_hook, list_hook=list_hook,
                   unicode_errors=unicode_errors, max_buffer_size=max_buffer_size,
@@ -105,10 +144,11 @@ class Unpacker(mp_Unpacker):
     next = __next__
 
 
-def unpackb(packed, *, raw=True, unicode_errors=None,
+def unpackb(packed, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
             strict_map_key=False,
             **kwargs):
-    assert unicode_errors is None
+    assert raw == RAW
+    assert unicode_errors == UNICODE_ERRORS
     try:
         kw = dict(raw=raw, unicode_errors=unicode_errors,
                   strict_map_key=strict_map_key)
@@ -118,10 +158,11 @@ def unpackb(packed, *, raw=True, unicode_errors=None,
         raise UnpackException(e)
 
 
-def unpack(stream, *, raw=True, unicode_errors=None,
+def unpack(stream, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
            strict_map_key=False,
            **kwargs):
-    assert unicode_errors is None
+    # assert raw == RAW
+    assert unicode_errors == UNICODE_ERRORS
     try:
         kw = dict(raw=raw, unicode_errors=unicode_errors,
                   strict_map_key=strict_map_key)

+ 144 - 28
src/borg/item.pyx

@@ -2,7 +2,6 @@ import stat
 from collections import namedtuple
 
 from .constants import ITEM_KEYS, ARCHIVE_KEYS
-from .helpers import safe_encode, safe_decode
 from .helpers import StableDict
 from .helpers import format_file_size
 from .helpers.msgpack import timestamp_to_int, int_to_timestamp
@@ -16,6 +15,51 @@ cdef extern from "_item.c":
 API_VERSION = '1.2_01'
 
 
+def fix_key(data, key):
+    """if key is a bytes-typed key, migrate key/value to a str-typed key in dict data"""
+    if isinstance(key, bytes):
+        value = data.pop(key)
+        key = key.decode()
+        data[key] = value
+    assert isinstance(key, str)
+    return key
+
+
+def fix_str_value(data, key, errors='surrogateescape'):
+    """makes sure that data[key] is a str (decode if it is bytes)"""
+    assert isinstance(key, str)  # fix_key must be called first
+    value = data[key]
+    if isinstance(value, bytes):
+        value = value.decode('utf-8', errors=errors)
+        data[key] = value
+    assert isinstance(value, str)
+    return value
+
+
+def fix_list_of_str(t):
+    """make sure we have a list of str"""
+    assert isinstance(t, (tuple, list))
+    l = [e.decode() if isinstance(e, bytes) else e for e in t]
+    assert all(isinstance(e, str) for e in l), repr(l)
+    return l
+
+
+def fix_tuple_of_str(t):
+    """make sure we have a tuple of str"""
+    assert isinstance(t, (tuple, list))
+    t = tuple(e.decode() if isinstance(e, bytes) else e for e in t)
+    assert all(isinstance(e, str) for e in t), repr(t)
+    return t
+
+
+def fix_tuple_of_str_and_int(t):
+    """make sure we have a tuple of str or int items"""
+    assert isinstance(t, (tuple, list))
+    t = tuple(e.decode() if isinstance(e, bytes) else e for e in t)
+    assert all(isinstance(e, (str, int)) for e in t), repr(t)
+    return t
+
+
 class PropDict:
     """
     Manage a dictionary via properties.
@@ -155,10 +199,10 @@ class Item(PropDict):
 
     # properties statically defined, so that IDEs can know their names:
 
-    path = PropDict._make_property('path', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    source = PropDict._make_property('source', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None', encode=safe_encode, decode=safe_decode)
-    group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None', encode=safe_encode, decode=safe_decode)
+    path = PropDict._make_property('path', str, 'surrogate-escaped str')
+    source = PropDict._make_property('source', str, 'surrogate-escaped str')
+    user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None')
+    group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None')
 
     acl_access = PropDict._make_property('acl_access', bytes)
     acl_default = PropDict._make_property('acl_default', bytes)
@@ -290,6 +334,14 @@ class Item(PropDict):
         except AttributeError:
             return False
 
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k in ('path', 'source', 'user', 'group'):
+                v = fix_str_value(d, k)
+            self._dict[k] = v
+
 
 class EncryptedKey(PropDict):
     """
@@ -309,7 +361,7 @@ class EncryptedKey(PropDict):
     __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
 
     version = PropDict._make_property('version', int)
-    algorithm = PropDict._make_property('algorithm', str, encode=str.encode, decode=bytes.decode)
+    algorithm = PropDict._make_property('algorithm', str)
     iterations = PropDict._make_property('iterations', int)
     salt = PropDict._make_property('salt', bytes)
     hash = PropDict._make_property('hash', bytes)
@@ -317,7 +369,17 @@ class EncryptedKey(PropDict):
     argon2_time_cost = PropDict._make_property('argon2_time_cost', int)
     argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int)
     argon2_parallelism = PropDict._make_property('argon2_parallelism', int)
-    argon2_type = PropDict._make_property('argon2_type', str, encode=str.encode, decode=bytes.decode)
+    argon2_type = PropDict._make_property('argon2_type', str)
+
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k == 'version':
+                assert isinstance(v, int)
+            if k in ('algorithm', 'argon2_type'):
+                v = fix_str_value(d, k)
+            self._dict[k] = v
 
 
 class Key(PropDict):
@@ -344,17 +406,13 @@ class Key(PropDict):
     chunk_seed = PropDict._make_property('chunk_seed', int)
     tam_required = PropDict._make_property('tam_required', bool)
 
-
-def tuple_encode(t):
-    """encode a tuple that might contain str items"""
-    # we have str, but want to give bytes to msgpack.pack
-    return tuple(safe_encode(e) if isinstance(e, str) else e for e in t)
-
-
-def tuple_decode(t):
-    """decode a tuple that might contain bytes items"""
-    # we get bytes objects from msgpack.unpack, but want str
-    return tuple(safe_decode(e) if isinstance(e, bytes) else e for e in t)
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k == 'version':
+                assert isinstance(v, int)
+            self._dict[k] = v
 
 
 class ArchiveItem(PropDict):
@@ -374,15 +432,15 @@ class ArchiveItem(PropDict):
     __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
 
     version = PropDict._make_property('version', int)
-    name = PropDict._make_property('name', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
+    name = PropDict._make_property('name', str, 'surrogate-escaped str')
     items = PropDict._make_property('items', list)
     cmdline = PropDict._make_property('cmdline', list)  # list of s-e-str
-    hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    username = PropDict._make_property('username', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    time = PropDict._make_property('time', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    time_end = PropDict._make_property('time_end', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    comment = PropDict._make_property('comment', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    chunker_params = PropDict._make_property('chunker_params', tuple, 'chunker-params tuple', encode=tuple_encode, decode=tuple_decode)
+    hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str')
+    username = PropDict._make_property('username', str, 'surrogate-escaped str')
+    time = PropDict._make_property('time', str)
+    time_end = PropDict._make_property('time_end', str)
+    comment = PropDict._make_property('comment', str, 'surrogate-escaped str')
+    chunker_params = PropDict._make_property('chunker_params', tuple)
     recreate_cmdline = PropDict._make_property('recreate_cmdline', list)  # list of s-e-str
     # recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. b2
     recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
@@ -395,6 +453,22 @@ class ArchiveItem(PropDict):
     csize_parts = PropDict._make_property('csize_parts', int)
     nfiles_parts = PropDict._make_property('nfiles_parts', int)
 
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k == 'version':
+                assert isinstance(v, int)
+            if k in ('name', 'hostname', 'username', 'comment'):
+                v = fix_str_value(d, k)
+            if k in ('time', 'time_end'):
+                v = fix_str_value(d, k, 'replace')
+            if k == 'chunker_params':
+                v = fix_tuple_of_str_and_int(v)
+            if k in ('cmdline', 'recreate_cmdline'):
+                v = fix_list_of_str(v)
+            self._dict[k] = v
+
 
 class ManifestItem(PropDict):
     """
@@ -413,10 +487,52 @@ class ManifestItem(PropDict):
     __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
 
     version = PropDict._make_property('version', int)
-    archives = PropDict._make_property('archives', dict)  # name -> dict
-    timestamp = PropDict._make_property('timestamp', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
+    archives = PropDict._make_property('archives', dict, 'dict of str -> dict')  # name -> dict
+    timestamp = PropDict._make_property('timestamp', str)
     config = PropDict._make_property('config', dict)
-    item_keys = PropDict._make_property('item_keys', tuple)
+    item_keys = PropDict._make_property('item_keys', tuple, 'tuple of str')
+
+    def update_internal(self, d):
+        # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
+        for k, v in list(d.items()):
+            k = fix_key(d, k)
+            if k == 'version':
+                assert isinstance(v, int)
+            if k == 'archives':
+                ad = v
+                assert isinstance(ad, dict)
+                for ak, av in list(ad.items()):
+                    ak = fix_key(ad, ak)
+                    assert isinstance(av, dict)
+                    for ik, iv in list(av.items()):
+                        ik = fix_key(av, ik)
+                    assert set(av) == {'id', 'time'}
+                    assert isinstance(av['id'], bytes)
+                    fix_str_value(av, 'time')
+            if k == 'timestamp':
+                v = fix_str_value(d, k, 'replace')
+            if k == 'config':
+                cd = v
+                assert isinstance(cd, dict)
+                for ck, cv in list(cd.items()):
+                    ck = fix_key(cd, ck)
+                    if ck == 'tam_required':
+                        assert isinstance(cv, bool)
+                    if ck == 'feature_flags':
+                        assert isinstance(cv, dict)
+                        ops = {'read', 'check', 'write', 'delete'}
+                        for op, specs in list(cv.items()):
+                            op = fix_key(cv, op)
+                            assert op in ops
+                            for speck, specv in list(specs.items()):
+                                speck = fix_key(specs, speck)
+                                if speck == 'mandatory':
+                                    specs[speck] = fix_tuple_of_str(specv)
+                        assert set(cv).issubset(ops)
+            if k == 'item_keys':
+                v = fix_tuple_of_str(v)
+            self._dict[k] = v
+
 
 class ItemDiff:
     """

+ 25 - 24
src/borg/remote.py

@@ -38,7 +38,8 @@ logger = create_logger(__name__)
 
 RPC_PROTOCOL_VERSION = 2
 BORG_VERSION = parse_version(__version__)
-MSGID, MSG, ARGS, RESULT = b'i', b'm', b'a', b'r'
+MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r'  # pack
+MSGIDB, MSGB, ARGSB, RESULTB = b'i', b'm', b'a', b'r'  # unpack
 
 MAX_INFLIGHT = 100
 
@@ -216,9 +217,9 @@ class RepositoryServer:  # pragma: no cover
                 for unpacked in unpacker:
                     if isinstance(unpacked, dict):
                         dictFormat = True
-                        msgid = unpacked[MSGID]
-                        method = unpacked[MSG].decode()
-                        args = decode_keys(unpacked[ARGS])
+                        msgid = unpacked[MSGIDB]
+                        method = unpacked[MSGB].decode()
+                        args = decode_keys(unpacked[ARGSB])
                     elif isinstance(unpacked, tuple) and len(unpacked) == 4:
                         dictFormat = False
                         # The first field 'type' was always 1 and has always been ignored
@@ -256,21 +257,21 @@ class RepositoryServer:  # pragma: no cover
 
                             try:
                                 msg = msgpack.packb({MSGID: msgid,
-                                                    b'exception_class': e.__class__.__name__,
-                                                    b'exception_args': e.args,
-                                                    b'exception_full': ex_full,
-                                                    b'exception_short': ex_short,
-                                                    b'exception_trace': ex_trace,
-                                                    b'sysinfo': sysinfo()})
+                                                    'exception_class': e.__class__.__name__,
+                                                    'exception_args': e.args,
+                                                    'exception_full': ex_full,
+                                                    'exception_short': ex_short,
+                                                    'exception_trace': ex_trace,
+                                                    'sysinfo': sysinfo()})
                             except TypeError:
                                 msg = msgpack.packb({MSGID: msgid,
-                                                    b'exception_class': e.__class__.__name__,
-                                                    b'exception_args': [x if isinstance(x, (str, bytes, int)) else None
-                                                                        for x in e.args],
-                                                    b'exception_full': ex_full,
-                                                    b'exception_short': ex_short,
-                                                    b'exception_trace': ex_trace,
-                                                    b'sysinfo': sysinfo()})
+                                                    'exception_class': e.__class__.__name__,
+                                                    'exception_args': [x if isinstance(x, (str, bytes, int)) else None
+                                                                       for x in e.args],
+                                                    'exception_full': ex_full,
+                                                    'exception_short': ex_short,
+                                                    'exception_trace': ex_trace,
+                                                    'sysinfo': sysinfo()})
 
                             os_write(stdout_fd, msg)
                         else:
@@ -570,7 +571,7 @@ class RemoteRepository:
         try:
             try:
                 version = self.call('negotiate', {'client_data': {
-                    b'client_version': BORG_VERSION,
+                    'client_version': BORG_VERSION,
                 }})
             except ConnectionClosed:
                 raise ConnectionClosedWithHint('Is borg working on the server?') from None
@@ -791,7 +792,7 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
                     if b'exception_class' in unpacked:
                         handle_error(unpacked)
                     else:
-                        yield unpacked[RESULT]
+                        yield unpacked[RESULTB]
                         if not waiting_for and not calls:
                             return
                 except KeyError:
@@ -811,7 +812,7 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
                         if b'exception_class' in unpacked:
                             handle_error(unpacked)
                         else:
-                            yield unpacked[RESULT]
+                            yield unpacked[RESULTB]
             if self.to_send or ((calls or self.preload_ids) and len(waiting_for) < MAX_INFLIGHT):
                 w_fds = [self.stdin_fd]
             else:
@@ -828,15 +829,15 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
                     self.unpacker.feed(data)
                     for unpacked in self.unpacker:
                         if isinstance(unpacked, dict):
-                            msgid = unpacked[MSGID]
+                            msgid = unpacked[MSGIDB]
                         elif isinstance(unpacked, tuple) and len(unpacked) == 4:
                             # The first field 'type' was always 1 and has always been ignored
                             _, msgid, error, res = unpacked
                             if error:
                                 # ignore res, because it is only a fixed string anyway.
-                                unpacked = {MSGID: msgid, b'exception_class': error}
+                                unpacked = {MSGIDB: msgid, b'exception_class': error}
                             else:
-                                unpacked = {MSGID: msgid, RESULT: res}
+                                unpacked = {MSGIDB: msgid, RESULTB: res}
                         else:
                             raise UnexpectedRPCDataFormatFromServer(data)
                         if msgid in self.ignore_responses:
@@ -847,7 +848,7 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
                             else:
                                 # we currently do not have async result values except "None",
                                 # so we do not add them into async_responses.
-                                if unpacked[RESULT] is not None:
+                                if unpacked[RESULTB] is not None:
                                     self.async_responses[msgid] = unpacked
                         else:
                             self.responses[msgid] = unpacked

+ 7 - 7
src/borg/testsuite/archive.py

@@ -186,8 +186,8 @@ class RobustUnpackerTestCase(BaseTestCase):
         return result
 
     def test_extra_garbage_no_sync(self):
-        chunks = [(False, [self.make_chunks([b'foo', b'bar'])]),
-                  (False, [b'garbage'] + [self.make_chunks([b'boo', b'baz'])])]
+        chunks = [(False, [self.make_chunks(['foo', 'bar'])]),
+                  (False, [b'garbage'] + [self.make_chunks(['boo', 'baz'])])]
         result = self.process(chunks)
         self.assert_equal(result, [
             {b'path': b'foo'}, {b'path': b'bar'},
@@ -203,19 +203,19 @@ class RobustUnpackerTestCase(BaseTestCase):
         return parts
 
     def test_correct_stream(self):
-        chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 2)
+        chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 2)
         input = [(False, chunks)]
         result = self.process(input)
         self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'bar'}, {b'path': b'boo'}, {b'path': b'baz'}])
 
     def test_missing_chunk(self):
-        chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 4)
+        chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4)
         input = [(False, chunks[:3]), (True, chunks[4:])]
         result = self.process(input)
         self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}])
 
     def test_corrupt_chunk(self):
-        chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 4)
+        chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4)
         input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])]
         result = self.process(input)
         self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}])
@@ -242,7 +242,7 @@ IK = sorted(list(ITEM_KEYS))
 
 @pytest.mark.parametrize('packed',
     [msgpack.packb(o) for o in [
-        {b'path': b'/a/b/c'},  # small (different msgpack mapping type!)
+        {'path': b'/a/b/c'},  # small (different msgpack mapping type!)
         OrderedDict((k, b'') for k in IK),  # as big (key count) as it gets
         OrderedDict((k, b'x' * 1000) for k in IK),  # as big (key count and volume) as it gets
     ]])
@@ -251,7 +251,7 @@ def test_valid_msgpacked_items(packed, item_keys_serialized):
 
 
 def test_key_length_msgpacked_items():
-    key = b'x' * 32  # 31 bytes is the limit for fixstr msgpack type
+    key = 'x' * 32  # 31 bytes is the limit for fixstr msgpack type
     data = {key: b''}
     item_keys_serialized = [msgpack.packb(key), ]
     assert valid_msgpacked_dict(msgpack.packb(data), item_keys_serialized)

+ 7 - 7
src/borg/testsuite/archiver.py

@@ -1810,7 +1810,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
     def add_unknown_feature(self, operation):
         with Repository(self.repository_path, exclusive=True) as repository:
             manifest, key = Manifest.load(repository, Manifest.NO_OPERATION_CHECK)
-            manifest.config[b'feature_flags'] = {operation.value.encode(): {b'mandatory': [b'unknown-feature']}}
+            manifest.config['feature_flags'] = {operation.value: {'mandatory': ['unknown-feature']}}
             manifest.write()
             repository.commit(compact=False)
 
@@ -3640,13 +3640,13 @@ id: 2 / e29442 3506da 4e1ea7 / 25f62a 5a3d41 - 02
 
         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-            assert key[b'algorithm'] == expected_algorithm
+            assert key[b'algorithm'] == expected_algorithm.encode()
 
     def test_change_passphrase_does_not_change_algorithm_argon2(self):
-        self.verify_change_passphrase_does_not_change_algorithm('argon2', b'argon2 chacha20-poly1305')
+        self.verify_change_passphrase_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305')
 
     def test_change_passphrase_does_not_change_algorithm_pbkdf2(self):
-        self.verify_change_passphrase_does_not_change_algorithm('pbkdf2', b'sha256')
+        self.verify_change_passphrase_does_not_change_algorithm('pbkdf2', 'sha256')
 
     def verify_change_location_does_not_change_algorithm(self, given_algorithm, expected_algorithm):
         self.cmd('init', '--encryption=keyfile', '--key-algorithm', given_algorithm, self.repository_location)
@@ -3655,13 +3655,13 @@ id: 2 / e29442 3506da 4e1ea7 / 25f62a 5a3d41 - 02
 
         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-            assert key[b'algorithm'] == expected_algorithm
+            assert key[b'algorithm'] == expected_algorithm.encode()
 
     def test_change_location_does_not_change_algorithm_argon2(self):
-        self.verify_change_location_does_not_change_algorithm('argon2', b'argon2 chacha20-poly1305')
+        self.verify_change_location_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305')
 
     def test_change_location_does_not_change_algorithm_pbkdf2(self):
-        self.verify_change_location_does_not_change_algorithm('pbkdf2', b'sha256')
+        self.verify_change_location_does_not_change_algorithm('pbkdf2', 'sha256')
 
     def test_key_change_algorithm(self):
         self.cmd('init', '--encryption=repokey', '--key-algorithm=pbkdf2', self.repository_location)

+ 3 - 3
src/borg/testsuite/item.py

@@ -102,7 +102,7 @@ def test_item_se_str_property():
     item = Item()
     item.path = '/a/b/c'
     assert item.path == '/a/b/c'
-    assert item.as_dict() == {'path': b'/a/b/c'}
+    assert item.as_dict() == {'path': '/a/b/c'}
     del item.path
     assert item.as_dict() == {}
     with pytest.raises(TypeError):
@@ -111,11 +111,11 @@ def test_item_se_str_property():
     # non-utf-8 path, needing surrogate-escaping for latin-1 u-umlaut
     item = Item(internal_dict={'path': b'/a/\xfc/c'})
     assert item.path == '/a/\udcfc/c'  # getting a surrogate-escaped representation
-    assert item.as_dict() == {'path': b'/a/\xfc/c'}
+    assert item.as_dict() == {'path': '/a/\udcfc/c'}
     del item.path
     assert 'path' not in item
     item.path = '/a/\udcfc/c'  # setting using a surrogate-escaped representation
-    assert item.as_dict() == {'path': b'/a/\xfc/c'}
+    assert item.as_dict() == {'path': '/a/\udcfc/c'}
 
 
 def test_item_list_property():