10 years ago · 746984c33b
--- a/borg/compress.pyx
+++ b/borg/compress.pyx
@@ -1,63 +1,91 @@
 
				-"""
			
 
				-A thin liblz4 wrapper for raw LZ4 compression / decompression.
			
 
				-
			
 
				-Features:
			
 
				-    - lz4 is super fast
			
 
				-    - wrapper releases CPython's GIL to support multithreaded code
			
 
				-    - helper buffer only allocated once at instance creation and then reused
			
 
				-
			
 
				-But beware:
			
 
				-    - this is not very generic, you MUST know the maximum uncompressed input
			
 
				-      data size you will feed into the compressor / get from the decompressor!
			
 
				-    - you must not do method calls to the same LZ4 instance from different
			
 
				-      threads at the same time - create one LZ4 instance per thread!
			
 
				-    - compress returns raw compressed data without adding any frame metadata
			
 
				-      (like checksums, magics, length of data, etc.)
			
 
				-    - decompress expects such raw compressed data as input
			
 
				-"""
			
 
				+import zlib
			
 
				 
			
 
				 from libc.stdlib cimport malloc, free
			
 
				 
			
 
				 
			
 
				 cdef extern from "lz4.h":
			
 
				-    int LZ4_compressBound(int inputSize)
			
 
				-    int LZ4_compress(const char* source, char* dest, int inputSize) nogil
			
 
				+    int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
			
 
				     int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
			
 
				 
			
 
				 
			
 
				-cdef class LZ4:
			
 
				+cdef class CompressorBase:
				+    """
				+    Common ancestor of all (de)compression classes.
				+
				+    Compressed data is tagged with an ID header in front of it. This class
				+    adds that header in compress(), strips it in decompress() and looks for
				+    it in detect(), which is what enables format auto detection.
				+    """
				+    # reserved and not used here; child classes overwrite this with their
				+    # own unique 2-bytes bytestring
				+    ID = b'\xFF\xFF'
				+    name = 'baseclass'
				+
				+    @classmethod
				+    def detect(cls, data):
				+        """Return True if data begins with the ID bytes of this class."""
				+        return data.startswith(cls.ID)
				+
				+    def __init__(self, **kwargs):
				+        # keyword parameters are accepted, but not used by the base class
				+        pass
				+
				+    def compress(self, data):
				+        """Return data with the ID bytes of this class put in front of it."""
				+        header = self.ID
				+        return header + data
				+
				+    def decompress(self, data):
				+        """Return data without its first 2 bytes (the ID header, not checked here)."""
				+        payload = data[2:]
				+        return payload
			
 
				+
			
 
				+
			
 
				+class CNULL(CompressorBase):
				+    """
				+    "null" compression: the data is passed through as is.
				+
				+    Nothing is overridden besides ID and name - adding and stripping the
				+    ID bytes is inherited from CompressorBase and is all that is needed.
				+    """
				+    name = 'null'
				+    ID = b'\x00\x00'
			
 
				+
			
 
				+
			
 
				+cdef class LZ4(CompressorBase):
			
 
				+    """
			
 
				+    raw LZ4 compression / decompression (liblz4).
			
 
				+
			
 
				+    Features:
			
 
				+        - lz4 is super fast
			
 
				+        - wrapper releases CPython's GIL to support multithreaded code
			
 
				+        - buffer given by caller, avoiding frequent reallocation and buffer duplication
			
 
				+        - uses safe lz4 methods that never go beyond the end of the output buffer
			
 
				+
			
 
				+    But beware:
			
 
				+        - this is not very generic, the given buffer MUST be large enough to
			
 
				+          handle all compression or decompression output (or it will fail).
			
 
				+        - you must not do method calls to the same LZ4 instance from different
			
 
				+          threads at the same time - create one LZ4 instance per thread!
			
 
				+    """
			
 
				+    ID = b'\x01\x00'
			
 
				+    name = 'lz4'
			
 
				+
			
 
				     cdef char *buffer  # helper buffer for (de)compression output
			
 
				     cdef int bufsize  # size of this buffer
			
 
				-    cdef int max_isize  # maximum compressor input size safe for this bufsize
			
 
				 
			
 
				-    def __cinit__(self, int max_isize):
			
 
				-        self.max_isize = max_isize
			
 
				-        # compute worst case bufsize for not compressible data:
			
 
				-        self.bufsize = LZ4_compressBound(max_isize)
			
 
				-        self.buffer = <char *>malloc(self.bufsize)
			
 
				-        if not self.buffer:
			
 
				-            raise MemoryError
			
 
				-
			
 
				-    def __dealloc__(self):
			
 
				-        free(self.buffer)
			
 
				+    def __cinit__(self, **kwargs):
			
 
				+        buffer = kwargs['buffer']
			
 
				+        self.buffer = buffer
			
 
				+        self.bufsize = len(buffer)
			
 
				 
			
 
				     def compress(self, idata):
			
 
				         cdef int isize = len(idata)
			
 
				-        if isize > self.max_isize:
			
 
				-            raise Exception('lz4 buffer might be too small, increase max_isize!')
			
 
				-        cdef int osize
			
 
				+        cdef int osize = self.bufsize
			
 
				         cdef char *source = idata
			
 
				         cdef char *dest = self.buffer
			
 
				         with nogil:
			
 
				-            osize = LZ4_compress(source, dest, isize)
			
 
				+            osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
			
 
				         if not osize:
			
 
				             raise Exception('lz4 compress failed')
			
 
				-        return dest[:osize]
			
 
				+        return super().compress(dest[:osize])
			
 
				 
			
 
				     def decompress(self, idata):
			
 
				+        idata = super().decompress(idata)
			
 
				         cdef int isize = len(idata)
			
 
				         cdef int osize = self.bufsize
			
 
				-        cdef char *source = idata  # <-- does not work for memoryview idata, wants bytes
			
 
				+        cdef char *source = idata
			
 
				         cdef char *dest = self.buffer
			
 
				         with nogil:
			
 
				             osize = LZ4_decompress_safe(source, dest, isize, osize)
			
@@ -65,3 +93,64 @@ cdef class LZ4:
 
				             # malformed input data, buffer too small, ...
			
 
				             raise Exception('lz4 decompress failed')
			
 
				         return dest[:osize]
			
 
				+
			
 
				+
			
 
				+class ZLIB(CompressorBase):
				+    """
				+    zlib compression / decompression (python stdlib)
				+
				+    For compatibility, zlib data carries no extra ID bytes: compress() and
				+    decompress() do not call the base class, and detect() inspects the
				+    first 2 bytes of the zlib stream itself instead of comparing with ID.
				+    """
				+    ID = b'\x08\x00'  # not used here, see detect()
				+                      # avoid all 0x.8.. IDs elsewhere!
				+    name = 'zlib'
				+
				+    @classmethod
				+    def detect(cls, data):
				+        """
				+        Return True if the first 2 bytes of data look like a zlib header.
				+
				+        Data shorter than 2 bytes can not be zlib data and gives False
				+        (unpacking it into cmf / flg would fail otherwise).
				+        """
				+        if len(data) < 2:
				+            return False
				+        # matches misc. patterns 0x.8.. used by zlib
				+        cmf, flg = data[:2]
				+        # low 4 bits of the first byte must be 8
				+        is_deflate = cmf & 0x0f == 8
				+        # both bytes, read as one 16 bit number, must be a multiple of 31
				+        check_ok = (cmf * 256 + flg) % 31 == 0
				+        return check_ok and is_deflate
				+
				+    def __init__(self, level=6, **kwargs):
				+        """Remember the zlib compression level to use (default: 6)."""
				+        super().__init__(**kwargs)
				+        self.level = level
				+
				+    def compress(self, data):
				+        """Return data compressed by zlib with the configured level."""
				+        # note: for compatibility no super call, do not add ID bytes
				+        return zlib.compress(data, self.level)
				+
				+    def decompress(self, data):
				+        """Return the zlib-decompressed data."""
				+        # note: for compatibility no super call, do not strip ID bytes
				+        return zlib.decompress(data)
			
 
				+
			
 
				+
			
 
				+# compressor name -> compressor class, looked up by get_compressor()
				+COMPRESSOR_TABLE = {cls.name: cls for cls in (CNULL, LZ4, ZLIB)}
				+# order in which Compressor.decompress() tries detect(): check fast stuff first
				+COMPRESSOR_LIST = [LZ4, CNULL, ZLIB]
			
 
				+
			
 
				+def get_compressor(name, **kwargs):
				+    """
				+    Create an instance of the compressor class registered as name.
				+
				+    kwargs are handed over to the constructor of that class.
				+    A name that is not in COMPRESSOR_TABLE raises KeyError.
				+    """
				+    compressor_cls = COMPRESSOR_TABLE[name]
				+    compressor = compressor_cls(**kwargs)
				+    return compressor
			
 
				+
			
 
				+
			
 
				+class Compressor:
				+    """
				+    compresses using a compressor with given name and parameters
				+    decompresses everything we can handle (autodetect)
				+    """
				+    def __init__(self, name='zlib', **kwargs):
				+        """
				+        Set up the compressor called name, giving it kwargs as parameters.
				+
				+        The same kwargs are kept and also given to whatever class is
				+        picked for decompression later.
				+        """
				+        self.params = kwargs
				+        self.compressor = get_compressor(name, **self.params)
				+
				+    def compress(self, data):
				+        """Return data compressed by the configured compressor."""
				+        return self.compressor.compress(data)
				+
				+    def decompress(self, data):
				+        """
				+        Decompress data with the first class of COMPRESSOR_LIST that detects it.
				+
				+        Raises ValueError if none of the classes recognizes the data.
				+        """
				+        for cls in COMPRESSOR_LIST:
				+            if cls.detect(data):
				+                return cls(**self.params).decompress(data)
				+        # only reached if nothing matched; the first 2 bytes are formatted
				+        # into the message (1-tuple: exactly one value for the one %r)
				+        raise ValueError('No decompressor for this data found: %r.' % (data[:2],))
			
--- a/borg/testsuite/compress.py
+++ b/borg/testsuite/compress.py
@@ -0,0 +1,81 @@
 
				+import zlib
			
 
				+
			
 
				+import pytest
			
 
				+
			
 
				+from ..compress import get_compressor, Compressor, CNULL, ZLIB, LZ4
			
 
				+
			
 
				+
			
 
				+# 64 KiB buffer object, given as buffer= parameter to the compressors
				+buffer = bytes(65536)
				+# small sample input used by all tests
				+data = b'fooooooooobaaaaaaaar'
				+# parameters of the Compressor used for the autodetect checks
				+params = {'name': 'zlib', 'level': 6, 'buffer': buffer}
			
 
				+
			
 
				+
			
 
				+def test_get_compressor():
				+    """Each known name yields an instance of its class, an unknown name a KeyError."""
				+    assert isinstance(get_compressor(name='null'), CNULL)
				+    assert isinstance(get_compressor(name='lz4', buffer=buffer), LZ4)
				+    assert isinstance(get_compressor(name='zlib'), ZLIB)
				+    with pytest.raises(KeyError):
				+        get_compressor(name='foobar')
			
 
				+
			
 
				+
			
 
				+def test_cnull():
				+    """The null compressor only grows the data (ID bytes) and roundtrips it."""
				+    null = get_compressor(name='null')
				+    stored = null.compress(data)
				+    assert len(stored) > len(data)
				+    assert data in stored  # it's not compressed and just in there 1:1
				+    assert null.decompress(stored) == data
				+    assert Compressor(**params).decompress(stored) == data  # autodetect
			
 
				+
			
 
				+
			
 
				+def test_lz4():
				+    """lz4 output is smaller than the sample input and roundtrips."""
				+    lz4 = get_compressor(name='lz4', buffer=buffer)
				+    packed = lz4.compress(data)
				+    assert len(packed) < len(data)
				+    assert lz4.decompress(packed) == data
				+    assert Compressor(**params).decompress(packed) == data  # autodetect
			
 
				+
			
 
				+
			
 
				+def test_zlib():
				+    """zlib output is smaller than the sample input and roundtrips."""
				+    zl = get_compressor(name='zlib')
				+    packed = zl.compress(data)
				+    assert len(packed) < len(data)
				+    assert zl.decompress(packed) == data
				+    assert Compressor(**params).decompress(packed) == data  # autodetect
			
 
				+
			
 
				+
			
 
				+def test_autodetect_invalid():
				+    """Data that no compressor class detects must be rejected with ValueError."""
				+    undetectable = (
				+        b'\xff\xfftotalcrap',      # starts with the reserved, unused ID
				+        b'\x08\x00notreallyzlib',  # not accepted by the zlib header check
				+    )
				+    for bogus in undetectable:
				+        with pytest.raises(ValueError):
				+            Compressor(**params).decompress(bogus)
			
 
				+
			
 
				+
			
 
				+def test_zlib_compat():
				+    """Our zlib data must be interchangeable with plain zlib module data."""
				+    # for compatibility reasons, we do not add an extra header for zlib,
				+    # nor do we expect one when decompressing / autodetecting
				+    for level in range(10):
				+        ours = get_compressor(name='zlib', level=level)
				+        from_ours = ours.compress(data)
				+        from_stdlib = zlib.compress(data, level)
				+        assert from_ours == from_stdlib
				+        assert data == ours.decompress(from_stdlib)
				+        assert data == Compressor(**params).decompress(from_stdlib)
			
 
				+
			
 
				+
			
 
				+def test_compressor():
				+    """Compressor roundtrips the sample data for every tested configuration."""
				+    configurations = [
				+        dict(name='null', buffer=buffer),
				+        dict(name='lz4', buffer=buffer),
				+        dict(name='zlib', level=0, buffer=buffer),
				+        dict(name='zlib', level=6, buffer=buffer),
				+        dict(name='zlib', level=9, buffer=buffer),
				+    ]
				+    for config in configurations:
				+        c = Compressor(**config)
				+        roundtripped = c.decompress(c.compress(data))
				+        assert data == roundtripped
			
 
				+
			
 
				+