123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199 |
- import zlib
- try:
- import lzma
- except ImportError:
- lzma = None
- cdef extern from "lz4.h":
- int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
- int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
cdef class CompressorBase:
    """
    Common ancestor of all (de)compression classes.

    Compressed data is tagged with a 2-byte ID header. This class prepends
    the header in compress(), strips it again in decompress() and recognises
    it in detect(), which is what makes format auto detection possible.
    """
    # reserved placeholder ID, not used for real data; child classes
    # overwrite it with their own unique 2-bytes bytestring
    ID = b'\xFF\xFF'
    name = 'baseclass'

    @classmethod
    def detect(cls, data):
        """Return True if *data* begins with the ID header of this class."""
        header = cls.ID
        return data.startswith(header)

    def __init__(self, **kwargs):
        """Accept arbitrary keyword parameters and ignore them."""

    def compress(self, data):
        """Return *data* with the ID header of this class prepended."""
        tagged = self.ID + data
        return tagged

    def decompress(self, data):
        """Return *data* without its first 2 bytes (the ID header)."""
        payload = data[2:]
        return payload
class CNONE(CompressorBase):
    """
    none - no compression at all, the data is passed through and only the
    ID header is added / removed
    """
    ID = b'\x00\x00'
    name = 'none'

    def compress(self, data):
        """Return *data* unchanged except for the ID header in front."""
        return super().compress(data)

    def decompress(self, data):
        """Strip the ID header and return the payload as a bytes object."""
        payload = super().decompress(data)
        # the slice made by the base class is not necessarily bytes
        # (e.g. when a memoryview was given), so convert where needed
        return payload if isinstance(payload, bytes) else bytes(payload)
cdef class LZ4(CompressorBase):
    """
    raw LZ4 compression / decompression (liblz4).

    Features:
        - lz4 is super fast
        - wrapper releases CPython's GIL to support multithreaded code
        - buffer given by caller, avoiding frequent reallocation and buffer duplication
        - uses safe lz4 methods that never go beyond the end of the output buffer

    But beware:
        - this is not very generic, the given buffer MUST be large enough to
          handle all compression or decompression output (or it will fail).
        - you must not do method calls to the same LZ4 instance from different
          threads at the same time - create one LZ4 instance per thread!

    The instance keeps a reference to the buffer object it was given, so the
    raw pointer into that object stays valid for the lifetime of the instance.
    """
    ID = b'\x01\x00'
    name = 'lz4'

    cdef object buffer_owner  # the python object that owns the memory self.buffer points to
    cdef char *buffer  # helper buffer for (de)compression output
    cdef int bufsize  # size of this buffer

    def __cinit__(self, **kwargs):
        """
        Set up the output buffer.

        :param buffer: (keyword, required) object used as output buffer for
                       compress() and decompress()
        :raises KeyError: if no ``buffer`` keyword argument was given
        """
        buffer = kwargs['buffer']
        # self.buffer is only a raw pointer into the buffer object and does
        # not keep it alive - hold a real reference, so the memory can not be
        # freed while this instance still writes to it.
        self.buffer_owner = buffer
        self.buffer = buffer
        self.bufsize = len(buffer)

    def compress(self, idata):
        """
        Compress *idata* with lz4 and return the result with the ID header prepended.

        :raises Exception: if lz4 reports a failure (result code 0)
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        cdef int isize = len(idata)
        cdef int osize = self.bufsize
        cdef char *source = idata
        cdef char *dest = self.buffer
        with nogil:
            osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
        if not osize:
            raise Exception('lz4 compress failed')
        # dest[:osize] copies the result out of the shared buffer
        return super().compress(dest[:osize])

    def decompress(self, idata):
        """
        Strip the ID header from *idata*, decompress the rest with lz4 and
        return the decompressed bytes.

        :raises Exception: if lz4 reports a failure (negative result code)
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        idata = super().decompress(idata)
        cdef int isize = len(idata)
        cdef int osize = self.bufsize
        cdef char *source = idata
        cdef char *dest = self.buffer
        with nogil:
            osize = LZ4_decompress_safe(source, dest, isize, osize)
        if osize < 0:
            # malformed input data, buffer too small, ...
            raise Exception('lz4 decompress failed')
        # dest[:osize] copies the result out of the shared buffer
        return dest[:osize]
class LZMA(CompressorBase):
    """
    lzma (de)compression, using the lzma module (python 3.3+ stdlib)
    """
    ID = b'\x02\x00'
    name = 'lzma'

    def __init__(self, level=6, **kwargs):
        """
        Remember the compression *level* (used as lzma preset).

        Raises ValueError if the lzma module could not be imported.
        """
        super().__init__(**kwargs)
        self.level = level
        if lzma is None:
            raise ValueError('No lzma support found.')

    def compress(self, data):
        """Return lzma compressed *data* with the ID header prepended."""
        # integrity is verified elsewhere already, so lzma's own check
        # is switched off
        compressed = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
        return super().compress(compressed)

    def decompress(self, data):
        """Strip the ID header from *data* and return the lzma decompressed rest."""
        return lzma.decompress(super().decompress(data))
class ZLIB(CompressorBase):
    """
    zlib compression / decompression (python stdlib)

    For compatibility, no ID header is added or stripped: the data is a plain
    zlib stream and is recognised by its own 2 header bytes, see detect().
    """
    ID = b'\x08\x00'  # not used here, see detect()
    # avoid all 0x.8.. IDs elsewhere!
    name = 'zlib'

    @classmethod
    def detect(cls, data):
        """
        Return True if *data* starts with a valid zlib header.

        Data shorter than the 2 header bytes is never zlib, so False is
        returned for it (instead of failing while unpacking the header).
        """
        if len(data) < 2:
            return False
        # matches misc. patterns 0x.8.. used by zlib
        cmf, flg = data[:2]
        is_deflate = cmf & 0x0f == 8  # low nibble of 1st byte == 8
        check_ok = (cmf * 256 + flg) % 31 == 0  # both bytes as 16bit value: multiple of 31
        return check_ok and is_deflate

    def __init__(self, level=6, **kwargs):
        """Remember the zlib compression *level* used by compress()."""
        super().__init__(**kwargs)
        self.level = level

    def compress(self, data):
        """Return zlib compressed *data* (no ID header is added)."""
        # note: for compatibility no super call, do not add ID bytes
        return zlib.compress(data, self.level)

    def decompress(self, data):
        """Return zlib decompressed *data* (no ID header is stripped)."""
        # note: for compatibility no super call, do not strip ID bytes
        return zlib.decompress(data)
# maps the name of each compressor class to the class itself
COMPRESSOR_TABLE = {cls.name: cls for cls in (CNONE, LZ4, ZLIB, LZMA)}

# compressor classes, ordered so that the fast stuff gets checked first
COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA]
def get_compressor(name, **kwargs):
    """
    Create an instance of the compressor class registered as *name*.

    All keyword arguments are handed over to the class. An unknown *name*
    raises KeyError (from the COMPRESSOR_TABLE lookup).
    """
    return COMPRESSOR_TABLE[name](**kwargs)
class Compressor:
    """
    compresses using a compressor with given name and parameters
    decompresses everything we can handle (autodetect)
    """
    def __init__(self, name='none', **kwargs):
        """
        Create the compressor registered as *name*.

        The keyword arguments are given to that compressor and are also
        reused for every decompressor created by decompress().
        The default is 'none' (pass through), the name CNONE is registered with.
        """
        self.params = kwargs
        self.compressor = get_compressor(name, **self.params)

    def compress(self, data):
        """Return *data* compressed by the compressor chosen at creation time."""
        return self.compressor.compress(data)

    def decompress(self, data):
        """
        Decompress *data* with the first class of COMPRESSOR_LIST that
        recognises its first 2 bytes.

        Raises ValueError if no class recognises the data.
        """
        hdr = bytes(data[:2])  # detect() does not work with memoryview
        for cls in COMPRESSOR_LIST:
            if cls.detect(hdr):
                return cls(**self.params).decompress(data)
        # interpolate the header bytes into the message (hdr is always
        # bytes, so its repr is readable even if data is a memoryview)
        raise ValueError('No decompressor for this data found: %r.' % hdr)
# Buffer for (de)compression results. In the worst case (incompressible
# data) a result can be slightly bigger than the chunk buffer, so 10% are
# added on top of 2 ** 23 bytes (CHUNK_MAX_EXP == 23).
COMPR_BUFFER = bytes(int(1.1 * (1 << 23)))
|