Explorar el Código

implement compression heuristics based on lz4-compressibility, fixes #1006

also: add some tests that invoke all supported compression algorithms
Thomas Waldmann hace 9 años
padre
commit
75b3e786ed
Se han modificado 3 ficheros con 88 adiciones y 1 borrado
  1. 4 0
      borg/archiver.py
  2. 26 1
      borg/helpers.py
  3. 58 0
      borg/testsuite/archiver.py

+ 4 - 0
borg/archiver.py

@@ -1359,6 +1359,8 @@ class Archiver:
                                    type=CompressionSpec, default=dict(name='none'), metavar='COMPRESSION',
                                    help='select compression algorithm (and level):\n'
                                         'none == no compression (default),\n'
+                                        'auto,C[,L] == built-in heuristic decides between none or C[,L] - with C[,L]\n'
+                                        '              being any valid compression algorithm (and optional level),\n'
                                         'lz4 == lz4,\n'
                                         'zlib == zlib (default level 6),\n'
                                         'zlib,0 .. zlib,9 == zlib (with level 0..9),\n'
@@ -1828,6 +1830,8 @@ class Archiver:
                                    type=CompressionSpec, default=None, metavar='COMPRESSION',
                                    help='select compression algorithm (and level):\n'
                                         'none == no compression (default),\n'
+                                        'auto,C[,L] == built-in heuristic decides between none or C[,L] - with C[,L]\n'
+                                        '              being any valid compression algorithm (and optional level),\n'
                                         'lz4 == lz4,\n'
                                         'zlib == zlib (default level 6),\n'
                                         'zlib,0 .. zlib,9 == zlib (with level 0..9),\n'

+ 26 - 1
borg/helpers.py

@@ -31,7 +31,7 @@ from . import hashindex
 from . import chunker
 from .constants import *  # NOQA
 from . import crypto
-from .compress import COMPR_BUFFER
+from .compress import COMPR_BUFFER, get_compressor
 from . import shellpattern
 import msgpack
 import msgpack.fallback
@@ -530,6 +530,12 @@ def CompressionSpec(s):
         else:
             raise ValueError
         return dict(name=name, level=level)
+    if name == 'auto':
+        if 2 <= count <= 3:
+            compression = ','.join(values[1:])
+        else:
+            raise ValueError
+        return dict(name=name, spec=CompressionSpec(compression))
     raise ValueError
 
 
@@ -1497,4 +1503,23 @@ class CompressionDecider2:
         compr_spec = chunk.meta.get('compress', self.compression)
         compr_args = dict(buffer=COMPR_BUFFER)
         compr_args.update(compr_spec)
+        if compr_args['name'] == 'auto':
+            # we did not decide yet, use heuristic:
+            compr_args, chunk = self.heuristic_lz4(compr_args, chunk)
         return compr_args, chunk
+
+    def heuristic_lz4(self, compr_args, chunk):
+        """Resolve an 'auto' compression spec by trial-compressing the chunk with lz4.
+
+        When the lz4 output is shorter than the input, the spec stored under
+        compr_args['spec'] is selected; otherwise the 'none' spec is selected.
+        The selected spec is merged into compr_args (in place).
+
+        Returns a (compr_args, chunk) pair, the chunk being rebuilt from the
+        same data and metadata.
+        """
+        meta, data = chunk
+        probe = get_compressor('lz4', buffer=compr_args['buffer'])
+        plain_size, probe_size = len(data), len(probe.compress(data))
+        # if lz4 can not shrink the data, fall back to 'none'. a special
+        # "uncompressible compressor" could instead mark such data as
+        # uncompressible via compression-type metadata.
+        chosen = compr_args['spec'] if probe_size < plain_size else CompressionSpec('none')
+        compr_args.update(chosen)
+        logger.debug("len(data) == %d, len(lz4(data)) == %d, choosing %s", plain_size, probe_size, chosen)
+        return compr_args, Chunk(data, **meta)

+ 58 - 0
borg/testsuite/archiver.py

@@ -1089,6 +1089,64 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         size, csize, path = output.split("\n")[1].split(" ")
         assert int(csize) < int(size)
 
+    def _get_sizes(self, compression, compressible, size=10000):
+        """Archive one file using the given compression spec and return (size, csize).
+
+        The file holds `size` bytes: a run of b'X' when `compressible` is true,
+        os.urandom() output otherwise. Both returned numbers are parsed from
+        the `list` output of the created archive.
+        """
+        contents = b'X' * size if compressible else os.urandom(size)
+        self.create_regular_file('file', contents=contents)
+        self.cmd('init', '--encryption=none', self.repository_location)
+        archive = self.repository_location + '::test'
+        self.cmd('create', '-C', compression, archive, 'input')
+        listing = self.cmd('list', '--format', '{size} {csize} {path}{NL}', archive)
+        # only the second output line is evaluated - presumably the entry of
+        # the file created above (TODO confirm the listing order).
+        fields = listing.split("\n")[1].split(" ")
+        original, stored, _path = fields
+        return int(original), int(stored)
+
+    def test_compression_none_compressible(self):
+        # 'none' on compressible data: csize is expected to be exactly size + 3
+        original, stored = self._get_sizes('none', compressible=True)
+        assert stored >= original
+        assert stored == original + 3
+
+    def test_compression_none_uncompressible(self):
+        # 'none' on random data: csize is expected to be exactly size + 3
+        original, stored = self._get_sizes('none', compressible=False)
+        assert stored >= original
+        assert stored == original + 3
+
+    def test_compression_zlib_compressible(self):
+        # 'zlib' on compressible data: below 10% of the size, pinned to 35 bytes
+        original, stored = self._get_sizes('zlib', compressible=True)
+        assert stored < original * 0.1
+        assert stored == 35
+
+    def test_compression_zlib_uncompressible(self):
+        # 'zlib' on random data: no size reduction is expected
+        original, stored = self._get_sizes('zlib', compressible=False)
+        assert stored >= original
+
+    def test_compression_auto_compressible(self):
+        # 'auto,zlib' on compressible data is expected to behave like 'zlib'
+        original, stored = self._get_sizes('auto,zlib', compressible=True)
+        assert stored < original * 0.1
+        assert stored == 35  # same as compression 'zlib'
+
+    def test_compression_auto_uncompressible(self):
+        # 'auto,zlib' on random data is expected to behave like 'none'
+        original, stored = self._get_sizes('auto,zlib', compressible=False)
+        assert stored >= original
+        assert stored == original + 3  # same as compression 'none'
+
+    def test_compression_lz4_compressible(self):
+        # 'lz4' on compressible data: csize is expected below 10% of the size
+        original, stored = self._get_sizes('lz4', compressible=True)
+        assert stored < original * 0.1
+
+    def test_compression_lz4_uncompressible(self):
+        # 'lz4' on random data: no size reduction is expected
+        original, stored = self._get_sizes('lz4', compressible=False)
+        assert stored >= original
+
+    def test_compression_lzma_compressible(self):
+        # 'lzma' on compressible data: csize is expected below 10% of the size
+        original, stored = self._get_sizes('lzma', compressible=True)
+        assert stored < original * 0.1
+
+    def test_compression_lzma_uncompressible(self):
+        # 'lzma' on random data: no size reduction is expected
+        original, stored = self._get_sizes('lzma', compressible=False)
+        assert stored >= original
+
     def test_break_lock(self):
         self.cmd('init', self.repository_location)
         self.cmd('break-lock', self.repository_location)