compress.pyx 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. import zlib
  2. try:
  3. import lzma
  4. except ImportError:
  5. lzma = None
  6. from .helpers import Buffer, DecompressionError
# Version tag of this module's API.
# NOTE(review): presumably compared by the importing package to detect a stale
# compiled extension - confirm against the code that reads it.
API_VERSION = '1.0_01'

# C declarations of the liblz4 functions called by the LZ4 class in this file.
# They are declared `nogil` so they may be called inside a `with nogil:` block.
cdef extern from "lz4.h":
    # Compress inputSize bytes from source into dest (capacity maxOutputSize).
    # The caller in this file treats a return value of 0 as failure and any
    # other value as the number of bytes written to dest.
    int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
    # Decompress inputSize bytes from source into dest (capacity maxOutputSize).
    # The caller in this file treats a negative return value as failure and a
    # non-negative one as the number of bytes written to dest.
    int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
    # Output buffer size to reserve for compressing inputSize bytes.
    int LZ4_compressBound(int inputSize) nogil

# Module-level scratch buffer; the LZ4 class obtains its output space from it
# via buffer.get(size).
# NOTE(review): Buffer comes from .helpers; how it resizes or whether it is
# shared between threads is not visible here - confirm before relying on it.
buffer = Buffer(bytearray, size=0)
  13. cdef class CompressorBase:
  14. """
  15. base class for all (de)compression classes,
  16. also handles compression format auto detection and
  17. adding/stripping the ID header (which enable auto detection).
  18. """
  19. ID = b'\xFF\xFF' # reserved and not used
  20. # overwrite with a unique 2-bytes bytestring in child classes
  21. name = 'baseclass'
  22. @classmethod
  23. def detect(cls, data):
  24. return data.startswith(cls.ID)
  25. def __init__(self, **kwargs):
  26. pass
  27. def compress(self, data):
  28. # add ID bytes
  29. return self.ID + data
  30. def decompress(self, data):
  31. # strip ID bytes
  32. return data[2:]
  33. class CNONE(CompressorBase):
  34. """
  35. none - no compression, just pass through data
  36. """
  37. ID = b'\x00\x00'
  38. name = 'none'
  39. def compress(self, data):
  40. return super().compress(data)
  41. def decompress(self, data):
  42. data = super().decompress(data)
  43. if not isinstance(data, bytes):
  44. data = bytes(data)
  45. return data
class LZ4(CompressorBase):
    """
    raw LZ4 compression / decompression (liblz4).

    Features:
        - lz4 is super fast
        - wrapper releases CPython's GIL to support multithreaded code
        - uses safe lz4 methods that never go beyond the end of the output buffer

    The output space for both directions is taken from the module-level
    `buffer` object; the result is copied out of it before returning.
    """
    ID = b'\x01\x00'
    name = 'lz4'

    def __init__(self, **kwargs):
        # LZ4 has no parameters here; keyword arguments are accepted and ignored.
        pass

    def compress(self, idata):
        """
        Compress *idata* with liblz4 and return it with the LZ4 ID bytes prepended.

        *idata* is converted to bytes first if it is not a bytes object.
        Raises Exception('lz4 compress failed') if liblz4 returns 0.
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        cdef int isize = len(idata)
        cdef int osize
        cdef char *source = idata
        cdef char *dest
        # ask liblz4 how much output space to reserve for isize input bytes
        osize = LZ4_compressBound(isize)
        buf = buffer.get(osize)
        # dest is a raw pointer to buf's data, no copy is made here
        dest = <char *> buf
        # only C variables are touched inside this block, so the GIL can be released
        with nogil:
            osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
        # osize now holds the function's result; 0 is treated as failure
        if not osize:
            raise Exception('lz4 compress failed')
        # slicing the char* copies the first osize bytes into a new bytes object,
        # the base class then prepends the ID bytes
        return super().compress(dest[:osize])

    def decompress(self, idata):
        """
        Strip the ID bytes from *idata*, decompress the rest with liblz4 and
        return the result.

        *idata* is converted to bytes first if it is not a bytes object.
        The uncompressed size is not known in advance, so decompression is
        retried with a growing output buffer while liblz4 reports an error.
        Raises DecompressionError once an attempt with an output size above
        2**30 bytes has failed as well.
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        idata = super().decompress(idata)
        cdef int isize = len(idata)
        cdef int osize
        cdef int rsize
        cdef char *source = idata
        cdef char *dest
        # a bit more than 8MB is enough for the usual data sizes yielded by the chunker.
        # allocate more if isize * 3 is already bigger, to avoid having to resize often.
        osize = max(int(1.1 * 2**23), isize * 3)
        while True:
            buf = buffer.get(osize)
            # dest is a raw pointer to buf's data, no copy is made here
            dest = <char *> buf
            # only C variables are touched inside this block, so the GIL can be released
            with nogil:
                rsize = LZ4_decompress_safe(source, dest, isize, osize)
            # a non-negative result is the number of bytes written to dest
            if rsize >= 0:
                break
            if osize > 2 ** 30:
                # this is insane, get out of here
                raise DecompressionError('lz4 decompress failed')
            # likely the buffer was too small, get a bigger one:
            osize = int(1.5 * osize)
        # slicing the char* copies the first rsize bytes into a new bytes object
        return dest[:rsize]
  98. class LZMA(CompressorBase):
  99. """
  100. lzma compression / decompression
  101. """
  102. ID = b'\x02\x00'
  103. name = 'lzma'
  104. def __init__(self, level=6, **kwargs):
  105. super().__init__(**kwargs)
  106. self.level = level
  107. if lzma is None:
  108. raise ValueError('No lzma support found.')
  109. def compress(self, data):
  110. # we do not need integrity checks in lzma, we do that already
  111. data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
  112. return super().compress(data)
  113. def decompress(self, data):
  114. data = super().decompress(data)
  115. try:
  116. return lzma.decompress(data)
  117. except lzma.LZMAError as e:
  118. raise DecompressionError(str(e)) from None
  119. class ZLIB(CompressorBase):
  120. """
  121. zlib compression / decompression (python stdlib)
  122. """
  123. ID = b'\x08\x00' # not used here, see detect()
  124. # avoid all 0x.8.. IDs elsewhere!
  125. name = 'zlib'
  126. @classmethod
  127. def detect(cls, data):
  128. # matches misc. patterns 0x.8.. used by zlib
  129. cmf, flg = data[:2]
  130. is_deflate = cmf & 0x0f == 8
  131. check_ok = (cmf * 256 + flg) % 31 == 0
  132. return check_ok and is_deflate
  133. def __init__(self, level=6, **kwargs):
  134. super().__init__(**kwargs)
  135. self.level = level
  136. def compress(self, data):
  137. # note: for compatibility no super call, do not add ID bytes
  138. return zlib.compress(data, self.level)
  139. def decompress(self, data):
  140. # note: for compatibility no super call, do not strip ID bytes
  141. try:
  142. return zlib.decompress(data)
  143. except zlib.error as e:
  144. raise DecompressionError(str(e)) from None
  145. COMPRESSOR_TABLE = {
  146. CNONE.name: CNONE,
  147. LZ4.name: LZ4,
  148. ZLIB.name: ZLIB,
  149. LZMA.name: LZMA,
  150. }
  151. COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ] # check fast stuff first
  152. def get_compressor(name, **kwargs):
  153. cls = COMPRESSOR_TABLE[name]
  154. return cls(**kwargs)
  155. class Compressor:
  156. """
  157. compresses using a compressor with given name and parameters
  158. decompresses everything we can handle (autodetect)
  159. """
  160. def __init__(self, name='null', **kwargs):
  161. self.params = kwargs
  162. self.compressor = get_compressor(name, **self.params)
  163. def compress(self, data):
  164. return self.compressor.compress(data)
  165. def decompress(self, data):
  166. hdr = bytes(data[:2]) # detect() does not work with memoryview
  167. for cls in COMPRESSOR_LIST:
  168. if cls.detect(hdr):
  169. return cls(**self.params).decompress(data)
  170. else:
  171. raise ValueError('No decompressor for this data found: %r.', data[:2])