|
@@ -0,0 +1,199 @@
|
|
|
+import zlib
|
|
|
+try:
|
|
|
+ import lzma
|
|
|
+except ImportError:
|
|
|
+ lzma = None
|
|
|
+
|
|
|
+cdef extern from "lz4.h":
|
|
|
+ int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
|
|
|
+ int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
|
|
|
+
|
|
|
+
|
|
|
+cdef class CompressorBase:
|
|
|
+ """
|
|
|
+ base class for all (de)compression classes,
|
|
|
+ also handles compression format auto detection and
|
|
|
+ adding/stripping the ID header (which enable auto detection).
|
|
|
+ """
|
|
|
+ ID = b'\xFF\xFF' # reserved and not used
|
|
|
+ # overwrite with a unique 2-bytes bytestring in child classes
|
|
|
+ name = 'baseclass'
|
|
|
+
|
|
|
+ @classmethod
|
|
|
+ def detect(cls, data):
|
|
|
+ return data.startswith(cls.ID)
|
|
|
+
|
|
|
+ def __init__(self, **kwargs):
|
|
|
+ pass
|
|
|
+
|
|
|
+ def compress(self, data):
|
|
|
+ # add ID bytes
|
|
|
+ return self.ID + data
|
|
|
+
|
|
|
+ def decompress(self, data):
|
|
|
+ # strip ID bytes
|
|
|
+ return data[2:]
|
|
|
+
|
|
|
+
|
|
|
+class CNONE(CompressorBase):
|
|
|
+ """
|
|
|
+ none - no compression, just pass through data
|
|
|
+ """
|
|
|
+ ID = b'\x00\x00'
|
|
|
+ name = 'none'
|
|
|
+
|
|
|
+ def compress(self, data):
|
|
|
+ return super().compress(data)
|
|
|
+
|
|
|
+ def decompress(self, data):
|
|
|
+ data = super().decompress(data)
|
|
|
+ if not isinstance(data, bytes):
|
|
|
+ data = bytes(data)
|
|
|
+ return data
|
|
|
+
|
|
|
+
|
|
|
+cdef class LZ4(CompressorBase):
|
|
|
+ """
|
|
|
+ raw LZ4 compression / decompression (liblz4).
|
|
|
+
|
|
|
+ Features:
|
|
|
+ - lz4 is super fast
|
|
|
+ - wrapper releases CPython's GIL to support multithreaded code
|
|
|
+ - buffer given by caller, avoiding frequent reallocation and buffer duplication
|
|
|
+ - uses safe lz4 methods that never go beyond the end of the output buffer
|
|
|
+
|
|
|
+ But beware:
|
|
|
+ - this is not very generic, the given buffer MUST be large enough to
|
|
|
+ handle all compression or decompression output (or it will fail).
|
|
|
+ - you must not do method calls to the same LZ4 instance from different
|
|
|
+ threads at the same time - create one LZ4 instance per thread!
|
|
|
+ """
|
|
|
+ ID = b'\x01\x00'
|
|
|
+ name = 'lz4'
|
|
|
+
|
|
|
+ cdef char *buffer # helper buffer for (de)compression output
|
|
|
+ cdef int bufsize # size of this buffer
|
|
|
+
|
|
|
+ def __cinit__(self, **kwargs):
|
|
|
+ buffer = kwargs['buffer']
|
|
|
+ self.buffer = buffer
|
|
|
+ self.bufsize = len(buffer)
|
|
|
+
|
|
|
+ def compress(self, idata):
|
|
|
+ if not isinstance(idata, bytes):
|
|
|
+ idata = bytes(idata) # code below does not work with memoryview
|
|
|
+ cdef int isize = len(idata)
|
|
|
+ cdef int osize = self.bufsize
|
|
|
+ cdef char *source = idata
|
|
|
+ cdef char *dest = self.buffer
|
|
|
+ with nogil:
|
|
|
+ osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
|
|
|
+ if not osize:
|
|
|
+ raise Exception('lz4 compress failed')
|
|
|
+ return super().compress(dest[:osize])
|
|
|
+
|
|
|
+ def decompress(self, idata):
|
|
|
+ if not isinstance(idata, bytes):
|
|
|
+ idata = bytes(idata) # code below does not work with memoryview
|
|
|
+ idata = super().decompress(idata)
|
|
|
+ cdef int isize = len(idata)
|
|
|
+ cdef int osize = self.bufsize
|
|
|
+ cdef char *source = idata
|
|
|
+ cdef char *dest = self.buffer
|
|
|
+ with nogil:
|
|
|
+ osize = LZ4_decompress_safe(source, dest, isize, osize)
|
|
|
+ if osize < 0:
|
|
|
+ # malformed input data, buffer too small, ...
|
|
|
+ raise Exception('lz4 decompress failed')
|
|
|
+ return dest[:osize]
|
|
|
+
|
|
|
+
|
|
|
+class LZMA(CompressorBase):
|
|
|
+ """
|
|
|
+ lzma compression / decompression (python 3.3+ stdlib)
|
|
|
+ """
|
|
|
+ ID = b'\x02\x00'
|
|
|
+ name = 'lzma'
|
|
|
+
|
|
|
+ def __init__(self, level=6, **kwargs):
|
|
|
+ super().__init__(**kwargs)
|
|
|
+ self.level = level
|
|
|
+ if lzma is None:
|
|
|
+ raise ValueError('No lzma support found.')
|
|
|
+
|
|
|
+ def compress(self, data):
|
|
|
+ # we do not need integrity checks in lzma, we do that already
|
|
|
+ data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
|
|
|
+ return super().compress(data)
|
|
|
+
|
|
|
+ def decompress(self, data):
|
|
|
+ data = super().decompress(data)
|
|
|
+ return lzma.decompress(data)
|
|
|
+
|
|
|
+
|
|
|
+class ZLIB(CompressorBase):
|
|
|
+ """
|
|
|
+ zlib compression / decompression (python stdlib)
|
|
|
+ """
|
|
|
+ ID = b'\x08\x00' # not used here, see detect()
|
|
|
+ # avoid all 0x.8.. IDs elsewhere!
|
|
|
+ name = 'zlib'
|
|
|
+
|
|
|
+ @classmethod
|
|
|
+ def detect(cls, data):
|
|
|
+ # matches misc. patterns 0x.8.. used by zlib
|
|
|
+ cmf, flg = data[:2]
|
|
|
+ is_deflate = cmf & 0x0f == 8
|
|
|
+ check_ok = (cmf * 256 + flg) % 31 == 0
|
|
|
+ return check_ok and is_deflate
|
|
|
+
|
|
|
+ def __init__(self, level=6, **kwargs):
|
|
|
+ super().__init__(**kwargs)
|
|
|
+ self.level = level
|
|
|
+
|
|
|
+ def compress(self, data):
|
|
|
+ # note: for compatibility no super call, do not add ID bytes
|
|
|
+ return zlib.compress(data, self.level)
|
|
|
+
|
|
|
+ def decompress(self, data):
|
|
|
+ # note: for compatibility no super call, do not strip ID bytes
|
|
|
+ return zlib.decompress(data)
|
|
|
+
|
|
|
+
|
|
|
+COMPRESSOR_TABLE = {
|
|
|
+ CNONE.name: CNONE,
|
|
|
+ LZ4.name: LZ4,
|
|
|
+ ZLIB.name: ZLIB,
|
|
|
+ LZMA.name: LZMA,
|
|
|
+}
|
|
|
+COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ] # check fast stuff first
|
|
|
+
|
|
|
+def get_compressor(name, **kwargs):
|
|
|
+ cls = COMPRESSOR_TABLE[name]
|
|
|
+ return cls(**kwargs)
|
|
|
+
|
|
|
+
|
|
|
+class Compressor:
|
|
|
+ """
|
|
|
+ compresses using a compressor with given name and parameters
|
|
|
+ decompresses everything we can handle (autodetect)
|
|
|
+ """
|
|
|
+ def __init__(self, name='null', **kwargs):
|
|
|
+ self.params = kwargs
|
|
|
+ self.compressor = get_compressor(name, **self.params)
|
|
|
+
|
|
|
+ def compress(self, data):
|
|
|
+ return self.compressor.compress(data)
|
|
|
+
|
|
|
+ def decompress(self, data):
|
|
|
+ hdr = bytes(data[:2]) # detect() does not work with memoryview
|
|
|
+ for cls in COMPRESSOR_LIST:
|
|
|
+ if cls.detect(hdr):
|
|
|
+ return cls(**self.params).decompress(data)
|
|
|
+ else:
|
|
|
+ raise ValueError('No decompressor for this data found: %r.', data[:2])
|
|
|
+
|
|
|
+
|
|
|
+# a buffer used for (de)compression result, which can be slightly bigger
|
|
|
+# than the chunk buffer in the worst (incompressible data) case, add 10%:
|
|
|
+COMPR_BUFFER = bytes(int(1.1 * 2 ** 23)) # CHUNK_MAX_EXP == 23
|