compress.pyx 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. import zlib
  2. try:
  3. import lzma
  4. except ImportError:
  5. lzma = None
  6. cdef extern from "lz4.h":
  7. int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
  8. int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
  cdef class CompressorBase:
      """
      Common ancestor of all (de)compression classes.

      It puts a 2-byte ID header in front of compressed data and removes it
      again on decompression; detect() compares the start of the data with
      that header, which is what makes format auto detection possible.
      """
      # reserved placeholder value, not used by a real format - child classes
      # overwrite it with their own unique 2-bytes bytestring
      ID = b'\xFF\xFF'
      name = 'baseclass'

      @classmethod
      def detect(cls, data):
          """Return True if data begins with the ID bytes of this class."""
          header = cls.ID
          return data.startswith(header)

      def __init__(self, **kwargs):
          """Accept arbitrary keyword arguments and ignore them."""

      def compress(self, data):
          """Return data with the ID bytes put in front of it."""
          header = self.ID
          return header + data

      def decompress(self, data):
          """Return data without its first 2 bytes (the ID header)."""
          payload = data[2:]
          return payload
  class CNONE(CompressorBase):
      """
      none - the data is passed through uncompressed, only the ID header is
      added / stripped
      """
      ID = b'\x00\x00'
      name = 'none'

      def compress(self, data):
          # nothing to compress: the base class just adds the ID bytes
          return super().compress(data)

      def decompress(self, data):
          # the base class strips the ID bytes; anything that is not bytes
          # afterwards gets converted, so the caller always receives bytes
          payload = super().decompress(data)
          return payload if isinstance(payload, bytes) else bytes(payload)
  cdef class LZ4(CompressorBase):
      """
      raw LZ4 compression / decompression (liblz4).

      Features:
          - lz4 is super fast
          - wrapper releases CPython's GIL to support multithreaded code
          - buffer given by caller, avoiding frequent reallocation and buffer duplication
          - uses safe lz4 methods that never go beyond the end of the output buffer

      But beware:
          - this is not very generic, the given buffer MUST be large enough to
            handle all compression or decompression output (or it will fail).
          - you must not do method calls to the same LZ4 instance from different
            threads at the same time - create one LZ4 instance per thread!

      The constructor requires a ``buffer`` keyword argument; the instance keeps
      a reference to that object for as long as it lives, so the raw pointer
      used for the lz4 calls can not outlive the memory it points to.
      """
      ID = b'\x01\x00'
      name = 'lz4'

      cdef object buffer_owner  # python object owning the memory `buffer` points to
      cdef char *buffer  # helper buffer for (de)compression output
      cdef int bufsize  # size of this buffer

      def __cinit__(self, **kwargs):
          # raises KeyError if the caller did not give a buffer
          buffer_owner = kwargs['buffer']
          # hold a reference: without it, the char * below would dangle as soon
          # as the caller drops its own reference to the buffer object
          self.buffer_owner = buffer_owner
          self.buffer = buffer_owner
          self.bufsize = len(buffer_owner)

      def compress(self, idata):
          """
          Compress idata into the helper buffer and return ID bytes + lz4 data.

          Raises Exception if lz4 reports failure (return value 0).
          """
          if not isinstance(idata, bytes):
              idata = bytes(idata)  # code below does not work with memoryview
          cdef int isize = len(idata)
          cdef int osize = self.bufsize
          cdef char *source = idata
          cdef char *dest = self.buffer
          with nogil:
              osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
          if not osize:
              raise Exception('lz4 compress failed')
          # dest[:osize] copies the result out of the helper buffer into new bytes
          return super().compress(dest[:osize])

      def decompress(self, idata):
          """
          Strip the ID bytes from idata, decompress the rest into the helper
          buffer and return the decompressed bytes.

          Raises Exception if lz4 reports failure (negative return value).
          """
          if not isinstance(idata, bytes):
              idata = bytes(idata)  # code below does not work with memoryview
          idata = super().decompress(idata)
          cdef int isize = len(idata)
          cdef int osize = self.bufsize
          cdef char *source = idata
          cdef char *dest = self.buffer
          with nogil:
              osize = LZ4_decompress_safe(source, dest, isize, osize)
          if osize < 0:
              # malformed input data, buffer too small, ...
              raise Exception('lz4 decompress failed')
          return dest[:osize]
  class LZMA(CompressorBase):
      """
      lzma compression / decompression (python 3.3+ stdlib)

      level is handed to lzma.compress() as its preset. Creating an instance
      raises ValueError if the lzma module could not be imported.
      """
      ID = b'\x02\x00'
      name = 'lzma'

      def __init__(self, level=6, **kwargs):
          super().__init__(**kwargs)
          self.level = level
          if lzma is None:
              raise ValueError('No lzma support found.')

      def compress(self, data):
          # lzma's own integrity check is switched off (CHECK_NONE) because
          # integrity is verified elsewhere already
          return super().compress(
              lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
          )

      def decompress(self, data):
          # strip the ID bytes first, the remainder is the lzma stream
          return lzma.decompress(super().decompress(data))
  class ZLIB(CompressorBase):
      """
      zlib compression / decompression (python stdlib)

      For compatibility, no ID bytes are added or stripped here: the stored
      data is the plain zlib stream and detect() recognizes it by looking at
      the zlib header itself.
      """
      ID = b'\x08\x00'  # not used here, see detect()
      # avoid all 0x.8.. IDs elsewhere!
      name = 'zlib'

      @classmethod
      def detect(cls, data):
          """
          Return True if data starts with a plausible zlib header.

          Data shorter than the 2 header bytes can not be zlib data, so False
          is returned for it (instead of failing on the unpacking below).
          """
          if len(data) < 2:
              return False
          # matches misc. patterns 0x.8.. used by zlib
          cmf, flg = data[:2]
          is_deflate = (cmf & 0x0f) == 8  # low nibble of CMF: compression method 8
          check_ok = (cmf * 256 + flg) % 31 == 0  # header check: multiple of 31
          return check_ok and is_deflate

      def __init__(self, level=6, **kwargs):
          super().__init__(**kwargs)
          self.level = level

      def compress(self, data):
          # note: for compatibility no super call, do not add ID bytes
          return zlib.compress(data, self.level)

      def decompress(self, data):
          # note: for compatibility no super call, do not strip ID bytes
          return zlib.decompress(data)
  # maps the name of each compressor class to the class itself
  COMPRESSOR_TABLE = {cls.name: cls for cls in (CNONE, LZ4, ZLIB, LZMA)}
  # compressor classes in the order they are tried: check fast stuff first
  COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA]
  def get_compressor(name, **kwargs):
      """
      Create a compressor instance of the class registered under name.

      kwargs are passed on to the class constructor. A name that is not in
      COMPRESSOR_TABLE raises KeyError.
      """
      return COMPRESSOR_TABLE[name](**kwargs)
  class Compressor:
      """
      compresses using a compressor with given name and parameters
      decompresses everything we can handle (autodetect)
      """
      def __init__(self, name='none', **kwargs):
          """
          Create the compressor used by compress().

          name selects the compressor class via get_compressor(); it defaults
          to 'none' (pass-through). kwargs are remembered and given to every
          compressor class that gets instantiated, also for decompression.
          """
          self.params = kwargs
          self.compressor = get_compressor(name, **self.params)

      def compress(self, data):
          """Compress data with the compressor chosen at construction time."""
          return self.compressor.compress(data)

      def decompress(self, data):
          """
          Decompress data with the first class of COMPRESSOR_LIST whose
          detect() accepts the first 2 bytes of data.

          Raises ValueError if no class accepts the data.
          """
          hdr = bytes(data[:2])  # detect() does not work with memoryview
          for cls in COMPRESSOR_LIST:
              if cls.detect(hdr):
                  return cls(**self.params).decompress(data)
          # every class rejected the header; show the offending bytes
          raise ValueError('No decompressor for this data found: %r.' % (hdr,))
  # Buffer for the (de)compression result. In the worst case (incompressible
  # data) the result can be slightly bigger than the chunk buffer, so 10% is
  # added on top of 2 ** 23 bytes (CHUNK_MAX_EXP == 23).
  COMPR_BUFFER = bytes(int((1 << 23) * 1.1))