浏览代码

Merge pull request #4683 from elho/only-compress-if-smaller

only store compressed data if the result actually is smaller
TW 5 年之前
父节点
当前提交
914a52ac15
共有 5 个文件被更改,包括 136 次插入47 次删除
  1. 116 35
      src/borg/compress.pyx
  2. 2 2
      src/borg/remote.py
  3. 10 2
      src/borg/testsuite/archiver.py
  4. 4 2
      src/borg/testsuite/compress.py
  5. 4 6
      src/borg/testsuite/key.py

+ 116 - 35
src/borg/compress.pyx

@@ -72,7 +72,8 @@ cdef class CompressorBase:
         using Auto compression it needs to determine the _actual_ target compression of a chunk
         in order to detect whether it should be recompressed.
 
-        For all Compressors that are not Auto this always returns *self*.
+        Any compressor may return a compressor other than *self*, like e.g. the CNONE compressor,
+        and should actually do so if *data* would become larger on compression.
         """
         return self
 
@@ -95,6 +96,50 @@ cdef class CompressorBase:
         # strip ID bytes
         return data[2:]
 
+cdef class DecidingCompressor(CompressorBase):
+    """
+    base class for (de)compression classes that (based on an internal _decide
+    method) decide whether and how to compress data.
+    """
+    name = 'decidebaseclass'
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def _decide(self, data):
+        """
+        Decides what to do with *data*. Returns (compressor, compressed_data).
+
+        *compressed_data* can be the result of *data* being processed by *compressor*,
+        if that is generated as a side-effect of the decision process, or None otherwise.
+
+        This private method allows for more efficient implementation of compress()
+        and decide_compress() making use of *compressed_data*, if already generated.
+        """
+        raise NotImplementedError
+
+    def decide(self, data):
+        return self._decide(data)[0]
+
+    def decide_compress(self, data):
+        """
+        Decides what to do with *data* and handle accordingly. Returns (compressor, compressed_data).
+
+        *compressed_data* is the result of *data* being processed by *compressor*.
+        """
+        compressor, compressed_data = self._decide(data)
+
+        if compressed_data is None:
+            compressed_data = compressor.compress(data)
+
+        if compressor is self:
+            # call super class to add ID bytes
+            return self, super().compress(compressed_data)
+
+        return compressor, compressed_data
+
+    def compress(self, data):
+        return self.decide_compress(data)[1]
 
 class CNONE(CompressorBase):
     """
@@ -113,7 +158,7 @@ class CNONE(CompressorBase):
         return data
 
 
-class LZ4(CompressorBase):
+class LZ4(DecidingCompressor):
     """
     raw LZ4 compression / decompression (liblz4).
 
@@ -128,7 +173,12 @@ class LZ4(CompressorBase):
     def __init__(self, **kwargs):
         pass
 
-    def compress(self, idata):
+    def _decide(self, idata):
+        """
+        Decides what to do with *data*. Returns (compressor, lz4_data).
+
+        *lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None.
+        """
         if not isinstance(idata, bytes):
             idata = bytes(idata)  # code below does not work with memoryview
         cdef int isize = len(idata)
@@ -142,7 +192,11 @@ class LZ4(CompressorBase):
             osize = LZ4_compress_default(source, dest, isize, osize)
         if not osize:
             raise Exception('lz4 compress failed')
-        return super().compress(dest[:osize])
+        # only compress if the result actually is smaller
+        if osize < isize:
+            return self, dest[:osize]
+        else:
+            return NONE_COMPRESSOR, None
 
     def decompress(self, idata):
         if not isinstance(idata, bytes):
@@ -174,7 +228,7 @@ class LZ4(CompressorBase):
         return dest[:rsize]
 
 
-class LZMA(CompressorBase):
+class LZMA(DecidingCompressor):
     """
     lzma compression / decompression
     """
@@ -187,10 +241,18 @@ class LZMA(CompressorBase):
         if lzma is None:
             raise ValueError('No lzma support found.')
 
-    def compress(self, data):
+    def _decide(self, data):
+        """
+        Decides what to do with *data*. Returns (compressor, lzma_data).
+
+        *lzma_data* is the LZMA result if *compressor* is LZMA as well, otherwise it is None.
+        """
         # we do not need integrity checks in lzma, we do that already
-        data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
-        return super().compress(data)
+        lzma_data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
+        if len(lzma_data) < len(data):
+            return self, lzma_data
+        else:
+            return NONE_COMPRESSOR, None
 
     def decompress(self, data):
         data = super().decompress(data)
@@ -200,7 +262,7 @@ class LZMA(CompressorBase):
             raise DecompressionError(str(e)) from None
 
 
-class ZSTD(CompressorBase):
+class ZSTD(DecidingCompressor):
     """zstd compression / decompression (pypi: zstandard, gh: python-zstandard)"""
     # This is a NOT THREAD SAFE implementation.
     # Only ONE python context must to be created at a time.
@@ -212,7 +274,12 @@ class ZSTD(CompressorBase):
         super().__init__(**kwargs)
         self.level = level
 
-    def compress(self, idata):
+    def _decide(self, idata):
+        """
+        Decides what to do with *data*. Returns (compressor, zstd_data).
+
+        *zstd_data* is the ZSTD result if *compressor* is ZSTD as well, otherwise it is None.
+        """
         if not isinstance(idata, bytes):
             idata = bytes(idata)  # code below does not work with memoryview
         cdef int isize = len(idata)
@@ -227,7 +294,11 @@ class ZSTD(CompressorBase):
             osize = ZSTD_compress(dest, osize, source, isize, level)
         if ZSTD_isError(osize):
             raise Exception('zstd compress failed: %s' % ZSTD_getErrorName(osize))
-        return super().compress(dest[:osize])
+        # only compress if the result actually is smaller
+        if osize < isize:
+            return self, dest[:osize]
+        else:
+            return NONE_COMPRESSOR, None
 
     def decompress(self, idata):
         if not isinstance(idata, bytes):
@@ -306,44 +377,54 @@ class Auto(CompressorBase):
 
     def _decide(self, data):
         """
-        Decides what to do with *data*. Returns (compressor, lz4_data).
-
-        *lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None.
+        Decides what to do with *data*. Returns (compressor, compressed_data).
+
+        *compressor* is the compressor that is decided to be best suited to compress
+        *data*, *compressed_data* is the result of *data* being compressed by a
+        compressor, which may or may not be *compressor*!
+
+        There are three possible outcomes of the decision process:
+        * *data* compresses well enough for trying the more expensive compressor
+          set on instantiation to make sense.
+          In this case, (expensive_compressor_class, lz4_compressed_data) is
+          returned.
+        * *data* compresses only slightly using the LZ4 compressor, thus trying
+          the more expensive compressor for potentially little gain does not
+          make sense.
+          In this case, (LZ4_COMPRESSOR, lz4_compressed_data) is returned.
+        * *data* does not compress at all using LZ4, in this case
+          (NONE_COMPRESSOR, none_compressed_data) is returned.
+
+        Note: While it makes no sense, the expensive compressor may well be set
+        to the LZ4 compressor.
         """
-        lz4_data = LZ4_COMPRESSOR.compress(data)
-        # lz4_data includes the compression type header, while data does not yet
-        ratio = len(lz4_data) / (len(data) + 2)
+        compressor, compressed_data = LZ4_COMPRESSOR.decide_compress(data)
+        # compressed_data includes the compression type header, while data does not yet
+        ratio = len(compressed_data) / (len(data) + 2)
         if ratio < 0.97:
-            return self.compressor, lz4_data
-        elif ratio < 1:
-            return LZ4_COMPRESSOR, lz4_data
+            return self.compressor, compressed_data
         else:
-            return NONE_COMPRESSOR, None
+            return compressor, compressed_data
 
     def decide(self, data):
         return self._decide(data)[0]
 
     def compress(self, data):
-        compressor, lz4_data = self._decide(data)
-        if compressor is LZ4_COMPRESSOR:
+        compressor, cheap_compressed_data = self._decide(data)
+        if compressor in (LZ4_COMPRESSOR, NONE_COMPRESSOR):
             # we know that trying to compress with expensive compressor is likely pointless,
-            # but lz4 managed to at least squeeze the data a bit.
-            return lz4_data
-        if compressor is NONE_COMPRESSOR:
-            # we know that trying to compress with expensive compressor is likely pointless
-            # and also lz4 did not manage to squeeze the data (not even a bit).
-            uncompressed_data = compressor.compress(data)
-            return uncompressed_data
+            # so we fallback to return the cheap compressed data.
+            return cheap_compressed_data
         # if we get here, the decider decided to try the expensive compressor.
-        # we also know that lz4_data is smaller than uncompressed data.
-        exp_compressed_data = compressor.compress(data)
-        ratio = len(exp_compressed_data) / len(lz4_data)
+        # we also know that the compressed data returned by the decider is lz4 compressed.
+        expensive_compressed_data = compressor.compress(data)
+        ratio = len(expensive_compressed_data) / len(cheap_compressed_data)
         if ratio < 0.99:
             # the expensive compressor managed to squeeze the data significantly better than lz4.
-            return exp_compressed_data
+            return expensive_compressed_data
         else:
             # otherwise let's just store the lz4 data, which decompresses extremely fast.
-            return lz4_data
+            return cheap_compressed_data
 
     def decompress(self, data):
         raise NotImplementedError

+ 2 - 2
src/borg/remote.py

@@ -16,7 +16,7 @@ import traceback
 from subprocess import Popen, PIPE
 
 from . import __version__
-from .compress import LZ4
+from .compress import Compressor
 from .constants import *  # NOQA
 from .helpers import Error, IntegrityError
 from .helpers import bin_to_hex
@@ -1215,7 +1215,7 @@ def cache_if_remote(repository, *, decrypted_cache=False, pack=None, unpack=None
         key = decrypted_cache
         # 32 bit csize, 64 bit (8 byte) xxh64
         cache_struct = struct.Struct('=I8s')
-        compressor = LZ4()
+        compressor = Compressor('lz4')
 
         def pack(data):
             csize, decrypted = data

+ 10 - 2
src/borg/testsuite/archiver.py

@@ -2240,7 +2240,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
 
     def test_compression_lz4_uncompressible(self):
         size, csize = self._get_sizes('lz4', compressible=False)
-        assert csize >= size
+        assert csize == size + 3  # same as compression 'none'
 
     def test_compression_lzma_compressible(self):
         size, csize = self._get_sizes('lzma', compressible=True)
@@ -2248,7 +2248,15 @@ class ArchiverTestCase(ArchiverTestCaseBase):
 
     def test_compression_lzma_uncompressible(self):
         size, csize = self._get_sizes('lzma', compressible=False)
-        assert csize >= size
+        assert csize == size + 3  # same as compression 'none'
+
+    def test_compression_zstd_compressible(self):
+        size, csize = self._get_sizes('zstd', compressible=True)
+        assert csize < size * 0.1
+
+    def test_compression_zstd_uncompressible(self):
+        size, csize = self._get_sizes('zstd', compressible=False)
+        assert csize == size + 3  # same as compression 'none'
 
     def test_change_passphrase(self):
         self.cmd('init', '--encryption=repokey', self.repository_location)

+ 4 - 2
src/borg/testsuite/compress.py

@@ -43,11 +43,13 @@ def test_lz4():
     assert data == Compressor(**params).decompress(cdata)  # autodetect
 
 
-def test_lz4_buffer_allocation():
+def test_lz4_buffer_allocation(monkeypatch):
+    # disable fallback to no compression on incompressible data
+    monkeypatch.setattr(LZ4, 'decide', lambda always_compress: LZ4)
     # test with a rather huge data object to see if buffer allocation / resizing works
     data = os.urandom(5 * 2**20) * 10  # 50MiB badly compressible data
     assert len(data) == 50 * 2**20
-    c = get_compressor(name='lz4')
+    c = Compressor('lz4')
     cdata = c.compress(data)
     assert len(cdata) > len(data)
     assert data == c.decompress(cdata)

+ 4 - 6
src/borg/testsuite/key.py

@@ -264,9 +264,8 @@ class TestKey:
         assert len(key.id_key) == 32
         plaintext = b'123456789'
         authenticated = key.encrypt(plaintext)
-        # 0x07 is the key TYPE, 0x0100 identifies LZ4 compression, 0x90 is part of LZ4 and means that an uncompressed
-        # block of length nine follows (the plaintext).
-        assert authenticated == b'\x07\x01\x00\x90' + plaintext
+        # 0x07 is the key TYPE, \x0000 identifies no compression.
+        assert authenticated == b'\x07\x00\x00' + plaintext
 
     def test_blake2_authenticated_encrypt(self, monkeypatch):
         monkeypatch.setenv('BORG_PASSPHRASE', 'test')
@@ -275,9 +274,8 @@ class TestKey:
         assert len(key.id_key) == 128
         plaintext = b'123456789'
         authenticated = key.encrypt(plaintext)
-        # 0x06 is the key TYPE, 0x0100 identifies LZ4 compression, 0x90 is part of LZ4 and means that an uncompressed
-        # block of length nine follows (the plaintext).
-        assert authenticated == b'\x06\x01\x00\x90' + plaintext
+        # 0x06 is the key TYPE, 0x0000 identifies no compression.
+        assert authenticated == b'\x06\x00\x00' + plaintext
 
 
 class TestPassphrase: