compress.pyx 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. # cython: language_level=3
  2. """
  3. borg.compress
  4. =============
  5. Compression is applied to chunks after ID hashing (so the ID is a direct function of the
  6. plain chunk, compression is irrelevant to it), and of course before encryption.
  7. The "auto" mode (e.g. --compression auto,lzma,4) is implemented as a meta Compressor,
  8. meaning that Auto acts like a Compressor, but defers actual work to others (namely
  9. LZ4 as a heuristic whether compression is worth it, and the specified Compressor
  10. for the actual compression).
  11. Decompression is normally handled through Compressor.decompress which will detect
  12. which compressor has been used to compress the data and dispatch to the correct
  13. decompressor.
  14. """
  15. import zlib
  16. try:
  17. import lzma
  18. except ImportError:
  19. lzma = None
  20. from .helpers import Buffer, DecompressionError
  21. API_VERSION = '1.1_06'
# C-level declarations of the liblz4 functions called by the LZ4 class in this module.
cdef extern from "algorithms/lz4-libselect.h":
    int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
    int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
    int LZ4_compressBound(int inputSize) nogil


# C-level declarations of the libzstd functions and constants used by the ZSTD class in this module.
cdef extern from "algorithms/zstd-libselect.h":
    size_t ZSTD_compress(void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel) nogil
    size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t compressedSize) nogil
    size_t ZSTD_compressBound(size_t srcSize) nogil
    unsigned long long ZSTD_CONTENTSIZE_UNKNOWN
    unsigned long long ZSTD_CONTENTSIZE_ERROR
    unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) nogil
    unsigned ZSTD_isError(size_t code) nogil
    const char* ZSTD_getErrorName(size_t code) nogil


# Module-level scratch buffer; the LZ4 and ZSTD wrappers request output space from it via buffer.get(size).
# NOTE(review): sizing / thread-locality semantics live in helpers.Buffer, which is not visible here.
buffer = Buffer(bytearray, size=0)
  36. cdef class CompressorBase:
  37. """
  38. base class for all (de)compression classes,
  39. also handles compression format auto detection and
  40. adding/stripping the ID header (which enable auto detection).
  41. """
  42. ID = b'\xFF\xFF' # reserved and not used
  43. # overwrite with a unique 2-bytes bytestring in child classes
  44. name = 'baseclass'
  45. @classmethod
  46. def detect(cls, data):
  47. return data.startswith(cls.ID)
  48. def __init__(self, **kwargs):
  49. pass
  50. def decide(self, data):
  51. """
  52. Return which compressor will perform the actual compression for *data*.
  53. This exists for a very specific case: If borg recreate is instructed to recompress
  54. using Auto compression it needs to determine the _actual_ target compression of a chunk
  55. in order to detect whether it should be recompressed.
  56. For all Compressors that are not Auto this always returns *self*.
  57. """
  58. return self
  59. def compress(self, data):
  60. """
  61. Compress *data* (bytes) and return bytes result. Prepend the ID bytes of this compressor,
  62. which is needed so that the correct decompressor can be used for decompression.
  63. """
  64. # add ID bytes
  65. return self.ID + data
  66. def decompress(self, data):
  67. """
  68. Decompress *data* (bytes) and return bytes result. The leading Compressor ID
  69. bytes need to be present.
  70. Only handles input generated by _this_ Compressor - for a general purpose
  71. decompression method see *Compressor.decompress*.
  72. """
  73. # strip ID bytes
  74. return data[2:]
  75. class CNONE(CompressorBase):
  76. """
  77. none - no compression, just pass through data
  78. """
  79. ID = b'\x00\x00'
  80. name = 'none'
  81. def compress(self, data):
  82. return super().compress(data)
  83. def decompress(self, data):
  84. data = super().decompress(data)
  85. if not isinstance(data, bytes):
  86. data = bytes(data)
  87. return data
  88. class LZ4(CompressorBase):
  89. """
  90. raw LZ4 compression / decompression (liblz4).
  91. Features:
  92. - lz4 is super fast
  93. - wrapper releases CPython's GIL to support multithreaded code
  94. - uses safe lz4 methods that never go beyond the end of the output buffer
  95. """
  96. ID = b'\x01\x00'
  97. name = 'lz4'
  98. def __init__(self, **kwargs):
  99. pass
  100. def compress(self, idata):
  101. if not isinstance(idata, bytes):
  102. idata = bytes(idata) # code below does not work with memoryview
  103. cdef int isize = len(idata)
  104. cdef int osize
  105. cdef char *source = idata
  106. cdef char *dest
  107. osize = LZ4_compressBound(isize)
  108. buf = buffer.get(osize)
  109. dest = <char *> buf
  110. osize = LZ4_compress_default(source, dest, isize, osize)
  111. if not osize:
  112. raise Exception('lz4 compress failed')
  113. return super().compress(dest[:osize])
  114. def decompress(self, idata):
  115. if not isinstance(idata, bytes):
  116. idata = bytes(idata) # code below does not work with memoryview
  117. idata = super().decompress(idata)
  118. cdef int isize = len(idata)
  119. cdef int osize
  120. cdef int rsize
  121. cdef char *source = idata
  122. cdef char *dest
  123. # a bit more than 8MB is enough for the usual data sizes yielded by the chunker.
  124. # allocate more if isize * 3 is already bigger, to avoid having to resize often.
  125. osize = max(int(1.1 * 2**23), isize * 3)
  126. while True:
  127. try:
  128. buf = buffer.get(osize)
  129. except MemoryError:
  130. raise DecompressionError('MemoryError')
  131. dest = <char *> buf
  132. rsize = LZ4_decompress_safe(source, dest, isize, osize)
  133. if rsize >= 0:
  134. break
  135. if osize > 2 ** 27: # 128MiB (should be enough, considering max. repo obj size and very good compression)
  136. # this is insane, get out of here
  137. raise DecompressionError('lz4 decompress failed')
  138. # likely the buffer was too small, get a bigger one:
  139. osize = int(1.5 * osize)
  140. return dest[:rsize]
  141. class LZMA(CompressorBase):
  142. """
  143. lzma compression / decompression
  144. """
  145. ID = b'\x02\x00'
  146. name = 'lzma'
  147. def __init__(self, level=6, **kwargs):
  148. super().__init__(**kwargs)
  149. self.level = level
  150. if lzma is None:
  151. raise ValueError('No lzma support found.')
  152. def compress(self, data):
  153. # we do not need integrity checks in lzma, we do that already
  154. data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
  155. return super().compress(data)
  156. def decompress(self, data):
  157. data = super().decompress(data)
  158. try:
  159. return lzma.decompress(data)
  160. except lzma.LZMAError as e:
  161. raise DecompressionError(str(e)) from None
  162. class ZSTD(CompressorBase):
  163. """zstd compression / decompression (pypi: zstandard, gh: python-zstandard)"""
  164. # This is a NOT THREAD SAFE implementation.
  165. # Only ONE python context must to be created at a time.
  166. # It should work flawlessly as long as borg will call ONLY ONE compression job at time.
  167. ID = b'\x03\x00'
  168. name = 'zstd'
  169. def __init__(self, level=3, **kwargs):
  170. super().__init__(**kwargs)
  171. self.level = level
  172. def compress(self, idata):
  173. if not isinstance(idata, bytes):
  174. idata = bytes(idata) # code below does not work with memoryview
  175. cdef int isize = len(idata)
  176. cdef size_t osize
  177. cdef char *source = idata
  178. cdef char *dest
  179. cdef int level = self.level
  180. osize = ZSTD_compressBound(isize)
  181. buf = buffer.get(osize)
  182. dest = <char *> buf
  183. with nogil:
  184. osize = ZSTD_compress(dest, osize, source, isize, level)
  185. if ZSTD_isError(osize):
  186. raise Exception('zstd compress failed: %s' % ZSTD_getErrorName(osize))
  187. return super().compress(dest[:osize])
  188. def decompress(self, idata):
  189. if not isinstance(idata, bytes):
  190. idata = bytes(idata) # code below does not work with memoryview
  191. idata = super().decompress(idata)
  192. cdef int isize = len(idata)
  193. cdef unsigned long long osize
  194. cdef unsigned long long rsize
  195. cdef char *source = idata
  196. cdef char *dest
  197. osize = ZSTD_getFrameContentSize(source, isize)
  198. if osize == ZSTD_CONTENTSIZE_ERROR:
  199. raise DecompressionError('zstd get size failed: data was not compressed by zstd')
  200. if osize == ZSTD_CONTENTSIZE_UNKNOWN:
  201. raise DecompressionError('zstd get size failed: original size unknown')
  202. try:
  203. buf = buffer.get(osize)
  204. except MemoryError:
  205. raise DecompressionError('MemoryError')
  206. dest = <char *> buf
  207. with nogil:
  208. rsize = ZSTD_decompress(dest, osize, source, isize)
  209. if ZSTD_isError(rsize):
  210. raise DecompressionError('zstd decompress failed: %s' % ZSTD_getErrorName(rsize))
  211. if rsize != osize:
  212. raise DecompressionError('zstd decompress failed: size mismatch')
  213. return dest[:osize]
  214. class ZLIB(CompressorBase):
  215. """
  216. zlib compression / decompression (python stdlib)
  217. """
  218. ID = b'\x08\x00' # not used here, see detect()
  219. # avoid all 0x.8.. IDs elsewhere!
  220. name = 'zlib'
  221. @classmethod
  222. def detect(cls, data):
  223. # matches misc. patterns 0x.8.. used by zlib
  224. cmf, flg = data[:2]
  225. is_deflate = cmf & 0x0f == 8
  226. check_ok = (cmf * 256 + flg) % 31 == 0
  227. return check_ok and is_deflate
  228. def __init__(self, level=6, **kwargs):
  229. super().__init__(**kwargs)
  230. self.level = level
  231. def compress(self, data):
  232. # note: for compatibility no super call, do not add ID bytes
  233. return zlib.compress(data, self.level)
  234. def decompress(self, data):
  235. # note: for compatibility no super call, do not strip ID bytes
  236. try:
  237. return zlib.decompress(data)
  238. except zlib.error as e:
  239. raise DecompressionError(str(e)) from None
  240. class Auto(CompressorBase):
  241. """
  242. Meta-Compressor that decides which compression to use based on LZ4's ratio.
  243. As a meta-Compressor the actual compression is deferred to other Compressors,
  244. therefore this Compressor has no ID, no detect() and no decompress().
  245. """
  246. ID = None
  247. name = 'auto'
  248. def __init__(self, compressor):
  249. super().__init__()
  250. self.compressor = compressor
  251. self.lz4 = get_compressor('lz4')
  252. self.none = get_compressor('none')
  253. def _decide(self, data):
  254. """
  255. Decides what to do with *data*. Returns (compressor, lz4_data).
  256. *lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None.
  257. """
  258. lz4_data = self.lz4.compress(data)
  259. ratio = len(lz4_data) / len(data)
  260. if ratio < 0.97:
  261. return self.compressor, lz4_data
  262. elif ratio < 1:
  263. return self.lz4, lz4_data
  264. else:
  265. return self.none, None
  266. def decide(self, data):
  267. return self._decide(data)[0]
  268. def compress(self, data):
  269. compressor, lz4_data = self._decide(data)
  270. if compressor is self.lz4:
  271. # we know that trying to compress with expensive compressor is likely pointless,
  272. # but lz4 managed to at least squeeze the data a bit.
  273. return lz4_data
  274. if compressor is self.none:
  275. # we know that trying to compress with expensive compressor is likely pointless
  276. # and also lz4 did not manage to squeeze the data (not even a bit).
  277. uncompressed_data = compressor.compress(data)
  278. return uncompressed_data
  279. # if we get here, the decider decided to try the expensive compressor.
  280. # we also know that lz4_data is smaller than uncompressed data.
  281. exp_compressed_data = compressor.compress(data)
  282. ratio = len(exp_compressed_data) / len(lz4_data)
  283. if ratio < 0.99:
  284. # the expensive compressor managed to squeeze the data significantly better than lz4.
  285. return exp_compressed_data
  286. else:
  287. # otherwise let's just store the lz4 data, which decompresses extremely fast.
  288. return lz4_data
  289. def decompress(self, data):
  290. raise NotImplementedError
  291. def detect(cls, data):
  292. raise NotImplementedError
  293. # Maps valid compressor names to their class
  294. COMPRESSOR_TABLE = {
  295. CNONE.name: CNONE,
  296. LZ4.name: LZ4,
  297. ZLIB.name: ZLIB,
  298. LZMA.name: LZMA,
  299. Auto.name: Auto,
  300. ZSTD.name: ZSTD,
  301. }
  302. # List of possible compression types. Does not include Auto, since it is a meta-Compressor.
  303. COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, LZMA, ] # check fast stuff first
  304. def get_compressor(name, **kwargs):
  305. cls = COMPRESSOR_TABLE[name]
  306. return cls(**kwargs)
  307. class Compressor:
  308. """
  309. compresses using a compressor with given name and parameters
  310. decompresses everything we can handle (autodetect)
  311. """
  312. def __init__(self, name='null', **kwargs):
  313. self.params = kwargs
  314. self.compressor = get_compressor(name, **self.params)
  315. def compress(self, data):
  316. return self.compressor.compress(data)
  317. def decompress(self, data):
  318. compressor_cls = self.detect(data)
  319. return compressor_cls(**self.params).decompress(data)
  320. @staticmethod
  321. def detect(data):
  322. hdr = bytes(data[:2]) # detect() does not work with memoryview
  323. for cls in COMPRESSOR_LIST:
  324. if cls.detect(hdr):
  325. return cls
  326. else:
  327. raise ValueError('No decompressor for this data found: %r.', data[:2])
  328. class CompressionSpec:
  329. def __init__(self, s):
  330. values = s.split(',')
  331. count = len(values)
  332. if count < 1:
  333. raise ValueError
  334. # --compression algo[,level]
  335. self.name = values[0]
  336. if self.name in ('none', 'lz4', ):
  337. return
  338. elif self.name in ('zlib', 'lzma', ):
  339. if count < 2:
  340. level = 6 # default compression level in py stdlib
  341. elif count == 2:
  342. level = int(values[1])
  343. if not 0 <= level <= 9:
  344. raise ValueError
  345. else:
  346. raise ValueError
  347. self.level = level
  348. elif self.name in ('zstd', ):
  349. if count < 2:
  350. level = 3 # default compression level in zstd
  351. elif count == 2:
  352. level = int(values[1])
  353. if not 1 <= level <= 22:
  354. raise ValueError
  355. else:
  356. raise ValueError
  357. self.level = level
  358. elif self.name == 'auto':
  359. if 2 <= count <= 3:
  360. compression = ','.join(values[1:])
  361. else:
  362. raise ValueError
  363. self.inner = CompressionSpec(compression)
  364. else:
  365. raise ValueError
  366. @property
  367. def compressor(self):
  368. if self.name in ('none', 'lz4', ):
  369. return get_compressor(self.name)
  370. elif self.name in ('zlib', 'lzma', 'zstd', ):
  371. return get_compressor(self.name, level=self.level)
  372. elif self.name == 'auto':
  373. return get_compressor(self.name, compressor=self.inner.compressor)