Browse Source

Merge pull request #7063 from ThomasWaldmann/item-more-cython-rebased

item.pyx: re-doing PR #5763
TW 2 years ago
parent
commit
4e87c1b6be
2 changed files with 187 additions and 152 deletions
  1. 172 152
      src/borg/item.pyx
  2. 15 0
      src/borg/testsuite/benchmark.py

+ 172 - 152
src/borg/item.pyx

@@ -1,6 +1,9 @@
 import stat
 from collections import namedtuple
 
+from libc.string cimport memcmp
+from cpython.bytes cimport PyBytes_AsStringAndSize
+
 from .constants import ITEM_KEYS, ARCHIVE_KEYS
 from .helpers import StableDict
 from .helpers import format_file_size
@@ -76,7 +79,7 @@ def fix_tuple_of_str(v):
 
 
 def fix_tuple_of_str_and_int(v):
-    """make sure we have a tuple of str"""
+    """make sure we have a tuple of str or int"""
     assert isinstance(v, (tuple, list))
     t = tuple(e.decode() if isinstance(e, bytes) else e for e in v)
     assert all(isinstance(e, (str, int)) for e in t), repr(t)
@@ -111,7 +114,7 @@ def want_str(v, *, errors='surrogateescape'):
     return v
 
 
-class PropDict:
+cdef class PropDict:
     """
     Manage a dictionary via properties.
 
@@ -124,18 +127,18 @@ class PropDict:
     - be safe against typos in key names: check against VALID_KEYS
     - when setting a value: check type of value
 
-    When "packing" a dict, ie. you have a dict with some data and want to convert it into an instance,
-    then use eg. Item({'a': 1, ...}). This way all keys in your dictionary are validated.
+    When "packing" a dict, i.e. you have a dict with some data and want to convert it into an instance,
+    then use e.g. Item({'a': 1, ...}). This way all keys in your dictionary are validated.
 
-    When "unpacking", that is you've read a dictionary with some data from somewhere (eg. msgpack),
-    then use eg. Item(internal_dict={...}). This does not validate the keys, therefore unknown keys
+    When "unpacking", that is you've read a dictionary with some data from somewhere (e.g. msgpack),
+    then use e.g. Item(internal_dict={...}). This does not validate the keys, therefore unknown keys
     are ignored instead of causing an error.
     """
-    VALID_KEYS = None  # override with <set of str> in child class
+    VALID_KEYS = frozenset()  # override with <set of str> in child class
 
-    __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
+    cdef object _dict
 
-    def __init__(self, data_dict=None, internal_dict=None, **kw):
+    def __cinit__(self, data_dict=None, internal_dict=None, **kw):
         self._dict = {}
         if internal_dict is None:
             pass  # nothing to do
@@ -190,46 +193,59 @@ class PropDict:
         """get value for key, return default if key does not exist"""
         return getattr(self, self._check_key(key), default)
 
-    @staticmethod
-    def _make_property(key, value_type, value_type_name=None, encode=None, decode=None):
-        """return a property that deals with self._dict[key]"""
-        assert isinstance(key, str)
-        if value_type_name is None:
-            value_type_name = value_type.__name__
-        doc = "%s (%s)" % (key, value_type_name)
-        type_error_msg = "%s value must be %s" % (key, value_type_name)
-        attr_error_msg = "attribute %s not found" % key
-
-        def _get(self):
-            try:
-                value = self._dict[key]
-            except KeyError:
-                raise AttributeError(attr_error_msg) from None
-            if decode is not None:
-                value = decode(value)
-            if not isinstance(value, value_type):
-                raise TypeError(type_error_msg)
-            return value
-
-        def _set(self, value):
-            if not isinstance(value, value_type):
-                raise TypeError(type_error_msg)
-            if encode is not None:
-                value = encode(value)
-            self._dict[key] = value
-
-        def _del(self):
-            try:
-                del self._dict[key]
-            except KeyError:
-                raise AttributeError(attr_error_msg) from None
 
-        return property(_get, _set, _del, doc=doc)
+cdef class PropDictProperty:
+    """return a property that deals with self._dict[key] of PropDict"""
+    cdef readonly str key
+    cdef readonly object value_type
+    cdef str value_type_name
+    cdef readonly str __doc__
+    cdef object encode
+    cdef object decode
+    cdef str type_error_msg
+    cdef str attr_error_msg
+
+    def __cinit__(self, value_type, value_type_name=None, encode=None, decode=None):
+       self.key = None
+       self.value_type = value_type
+       self.value_type_name = value_type_name if value_type_name is not None else value_type.__name__
+       self.encode = encode
+       self.decode = decode
+
+    def __get__(self, PropDict instance, owner):
+        try:
+            value = instance._dict[self.key]
+        except KeyError:
+            raise AttributeError(self.attr_error_msg) from None
+        if self.decode is not None:
+            value = self.decode(value)
+        if not isinstance(value, self.value_type):
+            raise TypeError(self.type_error_msg)
+        return value
+
+    def __set__(self, PropDict instance, value):
+        if not isinstance(value, self.value_type):
+            raise TypeError(self.type_error_msg)
+        if self.encode is not None:
+            value = self.encode(value)
+        instance._dict[self.key] = value
+
+    def __delete__(self, PropDict instance):
+        try:
+            del instance._dict[self.key]
+        except KeyError:
+            raise AttributeError(self.attr_error_msg) from None
+
+    cpdef __set_name__(self, name):
+       self.key = name
+       self.__doc__ = "%s (%s)" % (name, self.value_type_name)
+       self.type_error_msg = "%s value must be %s" % (name, self.value_type_name)
+       self.attr_error_msg = "attribute %s not found" % name
 
 
 ChunkListEntry = namedtuple('ChunkListEntry', 'id size')
 
-class Item(PropDict):
+cdef class Item(PropDict):
     """
     Item abstraction that deals with validation and the low-level details internally:
 
@@ -241,48 +257,46 @@ class Item(PropDict):
     If an Item shall be serialized, give as_dict() method output to msgpack packer.
     """
 
-    VALID_KEYS = ITEM_KEYS | {'deleted', 'nlink', }  # str-typed keys
-
-    __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
+    VALID_KEYS = ITEM_KEYS | {'deleted', 'nlink', }
 
     # properties statically defined, so that IDEs can know their names:
 
-    path = PropDict._make_property('path', str, 'surrogate-escaped str')
-    source = PropDict._make_property('source', str, 'surrogate-escaped str')
-    user = PropDict._make_property('user', str, 'surrogate-escaped str')
-    group = PropDict._make_property('group', str, 'surrogate-escaped str')
+    path = PropDictProperty(str, 'surrogate-escaped str')
+    source = PropDictProperty(str, 'surrogate-escaped str')
+    user = PropDictProperty(str, 'surrogate-escaped str')
+    group = PropDictProperty(str, 'surrogate-escaped str')
 
-    acl_access = PropDict._make_property('acl_access', bytes)
-    acl_default = PropDict._make_property('acl_default', bytes)
-    acl_extended = PropDict._make_property('acl_extended', bytes)
-    acl_nfs4 = PropDict._make_property('acl_nfs4', bytes)
+    acl_access = PropDictProperty(bytes)
+    acl_default = PropDictProperty(bytes)
+    acl_extended = PropDictProperty(bytes)
+    acl_nfs4 = PropDictProperty(bytes)
 
-    mode = PropDict._make_property('mode', int)
-    uid = PropDict._make_property('uid', int)
-    gid = PropDict._make_property('gid', int)
-    rdev = PropDict._make_property('rdev', int)
-    bsdflags = PropDict._make_property('bsdflags', int)
+    mode = PropDictProperty(int)
+    uid = PropDictProperty(int)
+    gid = PropDictProperty(int)
+    rdev = PropDictProperty(int)
+    bsdflags = PropDictProperty(int)
 
-    atime = PropDict._make_property('atime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
-    ctime = PropDict._make_property('ctime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
-    mtime = PropDict._make_property('mtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
-    birthtime = PropDict._make_property('birthtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    atime = PropDictProperty(int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    ctime = PropDictProperty(int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    mtime = PropDictProperty(int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
+    birthtime = PropDictProperty(int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int)
 
     # size is only present for items with a chunk list and then it is sum(chunk_sizes)
-    size = PropDict._make_property('size', int)
+    size = PropDictProperty(int)
 
-    hlid = PropDict._make_property('hlid', bytes)  # hard link id: same value means same hard link.
-    hardlink_master = PropDict._make_property('hardlink_master', bool)  # legacy
+    hlid = PropDictProperty(bytes)  # hard link id: same value means same hard link.
+    hardlink_master = PropDictProperty(bool)  # legacy
 
-    chunks = PropDict._make_property('chunks', list, 'list')
-    chunks_healthy = PropDict._make_property('chunks_healthy', list, 'list')
+    chunks = PropDictProperty(list, 'list')
+    chunks_healthy = PropDictProperty(list, 'list')
 
-    xattrs = PropDict._make_property('xattrs', StableDict)
+    xattrs = PropDictProperty(StableDict)
 
-    deleted = PropDict._make_property('deleted', bool)
-    nlink = PropDict._make_property('nlink', int)
+    deleted = PropDictProperty(bool)
+    nlink = PropDictProperty(int)
 
-    part = PropDict._make_property('part', int)
+    part = PropDictProperty(int)
 
     def get_size(self, *, memorize=False, from_chunks=False, consider_ids=None):
         """
@@ -390,7 +404,7 @@ class Item(PropDict):
             self._dict[k] = v
 
 
-class EncryptedKey(PropDict):
+cdef class EncryptedKey(PropDict):
     """
     EncryptedKey abstraction that deals with validation and the low-level details internally:
 
@@ -405,18 +419,16 @@ class EncryptedKey(PropDict):
     VALID_KEYS = {'version', 'algorithm', 'iterations', 'salt', 'hash', 'data',
                   'argon2_time_cost', 'argon2_memory_cost', 'argon2_parallelism', 'argon2_type'}
 
-    __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
-
-    version = PropDict._make_property('version', int)
-    algorithm = PropDict._make_property('algorithm', str)
-    iterations = PropDict._make_property('iterations', int)
-    salt = PropDict._make_property('salt', bytes)
-    hash = PropDict._make_property('hash', bytes)
-    data = PropDict._make_property('data', bytes)
-    argon2_time_cost = PropDict._make_property('argon2_time_cost', int)
-    argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int)
-    argon2_parallelism = PropDict._make_property('argon2_parallelism', int)
-    argon2_type = PropDict._make_property('argon2_type', str)
+    version = PropDictProperty(int)
+    algorithm = PropDictProperty(str)
+    iterations = PropDictProperty(int)
+    salt = PropDictProperty(bytes)
+    hash = PropDictProperty(bytes)
+    data = PropDictProperty(bytes)
+    argon2_time_cost = PropDictProperty(int)
+    argon2_memory_cost = PropDictProperty(int)
+    argon2_parallelism = PropDictProperty(int)
+    argon2_type = PropDictProperty(str)
 
     def update_internal(self, d):
         # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
@@ -431,7 +443,7 @@ class EncryptedKey(PropDict):
             self._dict[k] = v
 
 
-class Key(PropDict):
+cdef class Key(PropDict):
     """
     Key abstraction that deals with validation and the low-level details internally:
 
@@ -443,16 +455,14 @@ class Key(PropDict):
     If a Key shall be serialized, give as_dict() method output to msgpack packer.
     """
 
-    VALID_KEYS = {'version', 'repository_id', 'crypt_key', 'id_key', 'chunk_seed', 'tam_required'}  # str-typed keys
-
-    __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
+    VALID_KEYS = {'version', 'repository_id', 'crypt_key', 'id_key', 'chunk_seed', 'tam_required'}
 
-    version = PropDict._make_property('version', int)
-    repository_id = PropDict._make_property('repository_id', bytes)
-    crypt_key = PropDict._make_property('crypt_key', bytes)
-    id_key = PropDict._make_property('id_key', bytes)
-    chunk_seed = PropDict._make_property('chunk_seed', int)
-    tam_required = PropDict._make_property('tam_required', bool)
+    version = PropDictProperty(int)
+    repository_id = PropDictProperty(bytes)
+    crypt_key = PropDictProperty(bytes)
+    id_key = PropDictProperty(bytes)
+    chunk_seed = PropDictProperty(int)
+    tam_required = PropDictProperty(bool)
 
     def update_internal(self, d):
         # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
@@ -469,7 +479,7 @@ class Key(PropDict):
             assert len(k) in (32 + 32, 32 + 128)  # 256+256 or 256+1024 bits
             self._dict['crypt_key'] = k
 
-class ArchiveItem(PropDict):
+cdef class ArchiveItem(PropDict):
     """
     ArchiveItem abstraction that deals with validation and the low-level details internally:
 
@@ -481,30 +491,28 @@ class ArchiveItem(PropDict):
     If a ArchiveItem shall be serialized, give as_dict() method output to msgpack packer.
     """
 
-    VALID_KEYS = ARCHIVE_KEYS  # str-typed keys
-
-    __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
-
-    version = PropDict._make_property('version', int)
-    name = PropDict._make_property('name', str, 'surrogate-escaped str')
-    items = PropDict._make_property('items', list)  # list of chunk ids of item metadata stream (only in memory)
-    item_ptrs = PropDict._make_property('item_ptrs', list)  # list of blocks with list of chunk ids of ims, arch v2
-    cmdline = PropDict._make_property('cmdline', list)  # list of s-e-str
-    hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str')
-    username = PropDict._make_property('username', str, 'surrogate-escaped str')
-    time = PropDict._make_property('time', str)
-    time_end = PropDict._make_property('time_end', str)
-    comment = PropDict._make_property('comment', str, 'surrogate-escaped str')
-    chunker_params = PropDict._make_property('chunker_params', tuple)
-    recreate_cmdline = PropDict._make_property('recreate_cmdline', list)  # list of s-e-str
+    VALID_KEYS = ARCHIVE_KEYS
+
+    version = PropDictProperty(int)
+    name = PropDictProperty(str, 'surrogate-escaped str')
+    items = PropDictProperty(list)  # list of chunk ids of item metadata stream (only in memory)
+    item_ptrs = PropDictProperty(list)  # list of blocks with list of chunk ids of ims, arch v2
+    cmdline = PropDictProperty(list)  # list of s-e-str
+    hostname = PropDictProperty(str, 'surrogate-escaped str')
+    username = PropDictProperty(str, 'surrogate-escaped str')
+    time = PropDictProperty(str)
+    time_end = PropDictProperty(str)
+    comment = PropDictProperty(str, 'surrogate-escaped str')
+    chunker_params = PropDictProperty(tuple)
+    recreate_cmdline = PropDictProperty(list)  # list of s-e-str
     # recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. b2
-    recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
-    recreate_args = PropDict._make_property('recreate_args', list)  # list of s-e-str
-    recreate_partial_chunks = PropDict._make_property('recreate_partial_chunks', list)  # list of tuples
-    size = PropDict._make_property('size', int)
-    nfiles = PropDict._make_property('nfiles', int)
-    size_parts = PropDict._make_property('size_parts', int)
-    nfiles_parts = PropDict._make_property('nfiles_parts', int)
+    recreate_source_id = PropDictProperty(bytes)
+    recreate_args = PropDictProperty(list)  # list of s-e-str
+    recreate_partial_chunks = PropDictProperty(list)  # list of tuples
+    size = PropDictProperty(int)
+    nfiles = PropDictProperty(int)
+    size_parts = PropDictProperty(int)
+    nfiles_parts = PropDictProperty(int)
 
     def update_internal(self, d):
         # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
@@ -527,7 +535,7 @@ class ArchiveItem(PropDict):
             self._dict[k] = v
 
 
-class ManifestItem(PropDict):
+cdef class ManifestItem(PropDict):
     """
     ManifestItem abstraction that deals with validation and the low-level details internally:
 
@@ -539,15 +547,13 @@ class ManifestItem(PropDict):
     If a ManifestItem shall be serialized, give as_dict() method output to msgpack packer.
     """
 
-    VALID_KEYS = {'version', 'archives', 'timestamp', 'config', 'item_keys', }  # str-typed keys
+    VALID_KEYS = {'version', 'archives', 'timestamp', 'config', 'item_keys', }
 
-    __slots__ = ("_dict", )  # avoid setting attributes not supported by properties
-
-    version = PropDict._make_property('version', int)
-    archives = PropDict._make_property('archives', dict, 'dict of str -> dict')  # name -> dict
-    timestamp = PropDict._make_property('timestamp', str)
-    config = PropDict._make_property('config', dict)
-    item_keys = PropDict._make_property('item_keys', tuple, 'tuple of str')
+    version = PropDictProperty(int)
+    archives = PropDictProperty(dict, 'dict of str -> dict')  # name -> dict
+    timestamp = PropDictProperty(str)
+    config = PropDictProperty(dict)
+    item_keys = PropDictProperty(tuple, 'tuple of str')
 
     def update_internal(self, d):
         # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str)
@@ -593,6 +599,18 @@ class ManifestItem(PropDict):
             self._dict[k] = v
 
 
+cpdef _init_names():
+    """
+    re-implements python __set_name__
+    """
+    for cls in PropDict.__subclasses__():
+        for name, value in vars(cls).items():
+            if isinstance(value, PropDictProperty):
+                value.__set_name__(name)
+
+_init_names()
+
+
 class ItemDiff:
     """
     Comparison of two items from different archives.
@@ -640,7 +658,7 @@ class ItemDiff:
     def __repr__(self):
         if self.equal:
             return 'equal'
-        return ' '.join(str for d,str in self._changes)
+        return ' '.join(str for d, str in self._changes)
 
     def _equal(self, chunk_iterator1, chunk_iterator2):
         # if both are deleted, there is nothing at path regardless of what was deleted
@@ -719,33 +737,35 @@ class ItemDiff:
         return chunks_contents_equal(chunk_iterator1, chunk_iterator2)
 
 
-def chunks_contents_equal(chunks1, chunks2):
+def chunks_contents_equal(chunks_a, chunks_b):
     """
     Compare chunk content and return True if they are identical.
 
     The chunks must be given as chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`).
     """
+    cdef:
+        bytes a, b
+        char * ap
+        char * bp
+        Py_ssize_t slicelen = 0
+        Py_ssize_t alen = 0
+        Py_ssize_t blen = 0
 
-    end = object()
-    alen = ai = 0
-    blen = bi = 0
     while True:
-        if not alen - ai:
-            a = next(chunks1, end)
-            if a is end:
-                return not blen - bi and next(chunks2, end) is end
-            a = memoryview(a)
-            alen = len(a)
-            ai = 0
-        if not blen - bi:
-            b = next(chunks2, end)
-            if b is end:
-                return not alen - ai and next(chunks1, end) is end
-            b = memoryview(b)
-            blen = len(b)
-            bi = 0
-        slicelen = min(alen - ai, blen - bi)
-        if a[ai:ai + slicelen] != b[bi:bi + slicelen]:
+        if not alen:
+            a = next(chunks_a, None)
+            if a is None:
+                return not blen and next(chunks_b, None) is None
+            PyBytes_AsStringAndSize(a, &ap, &alen)
+        if not blen:
+            b = next(chunks_b, None)
+            if b is None:
+                return not alen and next(chunks_a, None) is None
+            PyBytes_AsStringAndSize(b, &bp, &blen)
+        slicelen = min(alen, blen)
+        if memcmp(ap, bp, slicelen) != 0:
             return False
-        ai += slicelen
-        bi += slicelen
+        ap += slicelen
+        bp += slicelen
+        alen -= slicelen
+        blen -= slicelen

+ 15 - 0
src/borg/testsuite/benchmark.py

@@ -11,6 +11,7 @@ import os
 import pytest
 
 from .archiver import changedir, cmd
+from .item import Item
 from ..constants import zeros
 
 
@@ -108,3 +109,17 @@ def test_check(benchmark, cmd, repo_archive):
 def test_help(benchmark, cmd):
     result, out = benchmark(cmd, "help")
     assert result == 0
+
+
+@pytest.mark.parametrize("type, key, value", [(Item, "size", 1000), (Item, "mode", 0x777), (Item, "deleted", False)])
+def test_propdict_attributes(benchmark, type, key, value):
+
+    propdict = type()
+
+    def getattr_setattr(item, key, value):
+        setattr(item, key, value)
+        assert item.get(key) == value
+        assert getattr(item, key) == value
+        item.as_dict()
+
+    benchmark(getattr_setattr, propdict, key, value)