Browse Source

fix resync and msgpacked item qualifier, fixes #1135

when trying to resync and skip invalid data, borg tries to qualify a byte sequence as
valid-looking msgpacked item metadata dict (or not) before even invoking msgpack's unpack.

besides previously hard to understand code, there were 2 issues:

- a missing check for map16 - this type is what msgpack uses if the dict has more than
15 items (could happen in future, not for 1.0.x).

- missing checks for str8/16/32 - str16 is what msgpack uses if the bytestring has more than 31 bytes
(borg does not have that long key names, thus this wasn't causing any harm)

this misqualification (valid data considered invalid) could lead to a wrong resync, skipping valid items.

added more comments and tests.
Thomas Waldmann 9 years ago
parent
commit
918e0b2a52
2 changed files with 64 additions and 11 deletions
  1. 30 10
      borg/archive.py
  2. 34 1
      borg/testsuite/archive.py

+ 30 - 10
borg/archive.py

@@ -592,6 +592,34 @@ ITEM_KEYS = set([b'path', b'source', b'rdev', b'chunks',
                  b'xattrs', b'bsdflags', b'acl_nfs4', b'acl_access', b'acl_default', b'acl_extended', ])
 
 
+def valid_msgpacked_item(d, item_keys_serialized):
+    """check if the data <d> looks like a msgpacked item metadata dict"""
+    d_len = len(d)
+    if d_len == 0:
+        return False
+    if d[0] & 0xf0 == 0x80:  # object is a fixmap (up to 15 elements)
+        offs = 1
+    elif d[0] == 0xde:  # object is a map16 (up to 2^16-1 elements)
+        offs = 3
+    else:
+        # object is not a map (dict)
+        # note: we must not have item dicts with > 2^16-1 elements
+        return False
+    if d_len <= offs:
+        return False
+    # is the first dict key a bytestring?
+    if d[offs] & 0xe0 == 0xa0:  # key is a small bytestring (up to 31 chars)
+        pass
+    elif d[offs] in (0xd9, 0xda, 0xdb):  # key is a str8, str16 or str32
+        pass
+    else:
+        # key is not a bytestring
+        return False
+    # is the bytestring any of the expected key names?
+    key_serialized = d[offs:]
+    return any(key_serialized.startswith(pattern) for pattern in item_keys_serialized)
+
+
 class RobustUnpacker:
     """A restartable/robust version of the streaming msgpack unpacker
     """
@@ -622,18 +650,10 @@ class RobustUnpacker:
             while self._resync:
                 if not data:
                     raise StopIteration
-                # Abort early if the data does not look like a serialized dict
-                if len(data) < 2 or ((data[0] & 0xf0) != 0x80) or ((data[1] & 0xe0) != 0xa0):
-                    data = data[1:]
-                    continue
-                # Make sure it looks like an item dict
-                for pattern in self.item_keys:
-                    if data[1:].startswith(pattern):
-                        break
-                else:
+                # Abort early if the data does not look like a serialized item dict
+                if not valid_msgpacked_item(data, self.item_keys):
                     data = data[1:]
                     continue
-
                 self._unpacker = msgpack.Unpacker(object_hook=StableDict)
                 self._unpacker.feed(data)
                 try:

+ 34 - 1
borg/testsuite/archive.py

@@ -2,8 +2,9 @@ from datetime import datetime, timezone
 from unittest.mock import Mock
 
 import msgpack
+import pytest
 
-from ..archive import Archive, CacheChunkBuffer, RobustUnpacker
+from ..archive import Archive, CacheChunkBuffer, RobustUnpacker, valid_msgpacked_item, ITEM_KEYS
 from ..key import PlaintextKey
 from ..helpers import Manifest
 from . import BaseTestCase
@@ -112,3 +113,35 @@ class RobustUnpackerTestCase(BaseTestCase):
         input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])]
         result = self.process(input)
         self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}])
+
+
+@pytest.fixture
+def item_keys_serialized():
+    return [msgpack.packb(name) for name in ITEM_KEYS]
+
+
+@pytest.mark.parametrize('packed',
+    [b'', b'x', b'foobar', ] +
+    [msgpack.packb(o) for o in (
+        [None, 0, 0.0, False, '', {}, [], ()] +
+        [42, 23.42, True, b'foobar', {b'foo': b'bar'}, [b'foo', b'bar'], (b'foo', b'bar')]
+    )])
+def test_invalid_msgpacked_item(packed, item_keys_serialized):
+    assert not valid_msgpacked_item(packed, item_keys_serialized)
+
+
+@pytest.mark.parametrize('packed',
+    [msgpack.packb(o) for o in [
+        {b'path': b'/a/b/c'},  # small (different msgpack mapping type!)
+        dict((k, b'') for k in ITEM_KEYS),  # as big (key count) as it gets
+        dict((k, b'x' * 1000) for k in ITEM_KEYS),  # as big (key count and volume) as it gets
+    ]])
+def test_valid_msgpacked_items(packed, item_keys_serialized):
+    assert valid_msgpacked_item(packed, item_keys_serialized)
+
+
+def test_key_length_msgpacked_items():
+    key = b'x' * 32  # 31 bytes is the limit for fixstr msgpack type
+    data = {key: b''}
+    item_keys_serialized = [msgpack.packb(key), ]
+    assert valid_msgpacked_item(msgpack.packb(data), item_keys_serialized)