Browse Source

refactor CompressionDecider2 into a meta Compressor

Marian Beermann 8 years ago
parent
commit
a27f585eaa

+ 3 - 6
src/borg/archive.py

@@ -21,7 +21,7 @@ logger = create_logger()
 from . import xattr
 from .cache import ChunkListEntry
 from .chunker import Chunker
-from .compress import Compressor
+from .compress import Compressor, CompressionSpec
 from .constants import *  # NOQA
 from .hashindex import ChunkIndex, ChunkIndexEntry
 from .helpers import Manifest
@@ -36,7 +36,7 @@ from .helpers import bin_to_hex
 from .helpers import safe_ns
 from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
 from .helpers import PathPrefixPattern, FnmatchPattern
-from .helpers import CompressionDecider1, CompressionDecider2, CompressionSpec
+from .helpers import CompressionDecider1
 from .item import Item, ArchiveItem
 from .key import key_factory
 from .platform import acl_get, acl_set, set_flags, get_flags, swidth
@@ -312,7 +312,6 @@ class Archive:
             self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
             self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
                                                             compression_files or [])
-            key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
             self.last_checkpoint = time.monotonic()
@@ -1585,7 +1584,6 @@ class ArchiveRecreater:
         self.seen_chunks = set()
         self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
                                                         compression_files or [])
-        key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))
 
         self.dry_run = dry_run
         self.stats = stats
@@ -1663,12 +1661,11 @@ class ArchiveRecreater:
         if chunk_id in self.seen_chunks:
             return self.cache.chunk_incref(chunk_id, target.stats)
         chunk = Chunk(data, compress=compress)
-        compression_spec, chunk = self.key.compression_decider2.decide(chunk)
         overwrite = self.recompress
         if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks:
             # Check if this chunk is already compressed the way we want it
             old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False)
-            if Compressor.detect(old_chunk.data).name == compression_spec.name:
+            if Compressor.detect(old_chunk.data).name == compress.name:
                 # Stored chunk has the same compression we wanted
                 overwrite = False
         chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite, wait=False)

+ 5 - 2
src/borg/archiver.py

@@ -34,10 +34,11 @@ from .archive import Archive, ArchiveChecker, ArchiveRecreater, Statistics, is_s
 from .archive import BackupOSError, backup_io
 from .cache import Cache
 from .constants import *  # NOQA
+from .compress import CompressionSpec
 from .crc32 import crc32
 from .helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR
 from .helpers import Error, NoManifestError, set_ec
-from .helpers import location_validator, archivename_validator, ChunkerParams, CompressionSpec, ComprSpec
+from .helpers import location_validator, archivename_validator, ChunkerParams
 from .helpers import PrefixSpec, SortBySpec, HUMAN_SORT_KEYS
 from .helpers import BaseFormatter, ItemFormatter, ArchiveFormatter
 from .helpers import format_time, format_timedelta, format_file_size, format_archive
@@ -107,6 +108,8 @@ def with_repository(fake=False, invert_fake=False, create=False, lock=True, excl
             with repository:
                 if manifest or cache:
                     kwargs['manifest'], kwargs['key'] = Manifest.load(repository)
+                    if args.__dict__.get('compression'):
+                        kwargs['key'].compressor = args.compression.compressor
                 if cache:
                     with Cache(repository, kwargs['key'], kwargs['manifest'],
                                do_files=getattr(args, 'cache_files', False),
@@ -2411,7 +2414,7 @@ class Archiver:
                                    help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
                                         'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS)
         archive_group.add_argument('-C', '--compression', dest='compression',
-                                   type=CompressionSpec, default=ComprSpec(name='lz4', spec=None), metavar='COMPRESSION',
+                                   type=CompressionSpec, default=CompressionSpec('lz4'), metavar='COMPRESSION',
                                    help='select compression algorithm, see the output of the '
                                         '"borg help compression" command for details.')
         archive_group.add_argument('--compression-from', dest='compression_files',

+ 75 - 0
src/borg/compress.pyx

@@ -1,9 +1,12 @@
 import zlib
+from collections import namedtuple
+
 try:
     import lzma
 except ImportError:
     lzma = None
 
+from .logger import create_logger
 from .helpers import Buffer, DecompressionError
 
 API_VERSION = '1.1_02'
@@ -179,12 +182,50 @@ class ZLIB(CompressorBase):
             raise DecompressionError(str(e)) from None
 
 
+class Auto(CompressorBase):
+    """
+    Meta-Compressor that decides which compression to use based on LZ4's ratio.
+
+    As a meta-Compressor the actual compression is deferred to other Compressors,
+    therefore this Compressor has no ID, no detect() and no decompress().
+
+    The decision made by compress() is:
+
+    - LZ4 output smaller than 97% of the input: use the wrapped compressor,
+    - LZ4 output smaller than the input (but not by that margin): keep the LZ4 output,
+    - otherwise: store the data via the 'none' compressor.
+    """
+
+    ID = None
+    name = 'auto'
+
+    # NOTE(review): this logger is declared but not used anywhere in this class.
+    logger = create_logger('borg.debug.file-compression')
+
+    def __init__(self, compressor):
+        """
+        :param compressor: compressor instance used when the LZ4 trial run
+                           shows that the data compresses well
+        """
+        super().__init__()
+        self.compressor = compressor
+        # helper compressors: lz4 for the trial run, none as the fallback
+        self.lz4 = get_compressor('lz4')
+        self.none = get_compressor('none')
+
+    def compress(self, data):
+        """
+        Compress *data* with the compressor chosen by the LZ4 heuristic.
+
+        :return: the output of either the wrapped compressor, the LZ4
+                 compressor or the 'none' compressor (see class docstring)
+        """
+        lz4_data = self.lz4.compress(data)
+        if len(lz4_data) < 0.97 * len(data):
+            # compresses well enough: spend the effort of the configured compressor
+            return self.compressor.compress(data)
+        elif len(lz4_data) < len(data):
+            # marginal gain only: re-use the LZ4 result we already computed
+            return lz4_data
+        else:
+            return self.none.compress(data)
+
+    def decompress(self, data):
+        """Not supported: a meta-Compressor never produces data of its own type."""
+        raise NotImplementedError
+
+    @classmethod
+    def detect(cls, data):
+        """
+        Not supported: a meta-Compressor has no ID to detect.
+
+        Declared as a classmethod to match the ``cls`` signature, so that calling
+        ``Auto.detect(data)`` on the class raises NotImplementedError rather than
+        a TypeError about a missing argument.
+        """
+        raise NotImplementedError
+
+
+# Maps valid compressor names to their class
 COMPRESSOR_TABLE = {
     CNONE.name: CNONE,
     LZ4.name: LZ4,
     ZLIB.name: ZLIB,
     LZMA.name: LZMA,
+    Auto.name: Auto,
 }
+# List of possible compression types. Does not include Auto, since it is a meta-Compressor.
 COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ]  # check fast stuff first
 
 def get_compressor(name, **kwargs):
@@ -216,3 +257,37 @@ class Compressor:
                 return cls
         else:
             raise ValueError('No decompressor for this data found: %r.', data[:2])
+
+
+ComprSpec = namedtuple('ComprSpec', ('name', 'spec', 'compressor'))
+
+
+def CompressionSpec(s):
+    """
+    Parse a comma-separated compression specification into a ComprSpec.
+
+    Forms handled below:
+
+    - ``none`` / ``lz4``: no level; any further fields are not inspected
+    - ``zlib[,LEVEL]`` / ``lzma[,LEVEL]``: LEVEL is an int in 0..9, default 6
+    - ``auto,SPEC``: SPEC (one or two fields) is parsed recursively and its
+      compressor is wrapped in Auto
+
+    :param s: specification string, e.g. 'zlib,9' or 'auto,lzma,6'
+    :return: ComprSpec(name, spec, compressor); spec is None for none/lz4,
+             the int level for zlib/lzma and the inner ComprSpec for auto
+    :raises ValueError: unknown name, wrong number of fields, or a level that
+                        is outside 0..9 (int() also raises it for non-numeric levels)
+    """
+    values = s.split(',')
+    count = len(values)
+    # NOTE(review): str.split() always returns at least one element, so this
+    # guard does not trigger for str input; an empty string is rejected by the
+    # final ``raise ValueError`` because '' is not a known name.
+    if count < 1:
+        raise ValueError
+    # --compression algo[,level]
+    name = values[0]
+    if name == 'none':
+        return ComprSpec(name=name, spec=None, compressor=CNONE())
+    elif name == 'lz4':
+        return ComprSpec(name=name, spec=None, compressor=LZ4())
+    if name in ('zlib', 'lzma', ):
+        if count < 2:
+            level = 6  # default compression level in py stdlib
+        elif count == 2:
+            level = int(values[1])
+            if not 0 <= level <= 9:
+                raise ValueError
+        else:
+            raise ValueError
+        return ComprSpec(name=name, spec=level, compressor=get_compressor(name, level=level))
+    if name == 'auto':
+        # everything after 'auto' is the spec of the compressor to wrap
+        if 2 <= count <= 3:
+            compression = ','.join(values[1:])
+        else:
+            raise ValueError
+        inner = CompressionSpec(compression)
+        return ComprSpec(name=name, spec=inner, compressor=Auto(inner.compressor))
+    raise ValueError

+ 5 - 70
src/borg/helpers.py

@@ -726,37 +726,6 @@ def ChunkerParams(s):
     return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size)
 
 
-ComprSpec = namedtuple('ComprSpec', ('name', 'spec'))
-
-
-def CompressionSpec(s):
-    values = s.split(',')
-    count = len(values)
-    if count < 1:
-        raise ValueError
-    # --compression algo[,level]
-    name = values[0]
-    if name in ('none', 'lz4', ):
-        return ComprSpec(name=name, spec=None)
-    if name in ('zlib', 'lzma', ):
-        if count < 2:
-            level = 6  # default compression level in py stdlib
-        elif count == 2:
-            level = int(values[1])
-            if not 0 <= level <= 9:
-                raise ValueError
-        else:
-            raise ValueError
-        return ComprSpec(name=name, spec=level)
-    if name == 'auto':
-        if 2 <= count <= 3:
-            compression = ','.join(values[1:])
-        else:
-            raise ValueError
-        return ComprSpec(name=name, spec=CompressionSpec(compression))
-    raise ValueError
-
-
 def dir_is_cachedir(path):
     """Determines whether the specified path is a cache directory (and
     therefore should potentially be excluded from the backup) according to
@@ -2136,11 +2105,12 @@ class CompressionDecider1:
         :param compression_files: list of compression config files (e.g. from --compression-from) or
                                   a list of other line iterators
         """
-        self.compression = compression
+        from .compress import CompressionSpec
+        self.compressor = compression.compressor
         if not compression_files:
             self.matcher = None
         else:
-            self.matcher = PatternMatcher(fallback=compression)
+            self.matcher = PatternMatcher(fallback=compression.compressor)
             for file in compression_files:
                 try:
                     for line in clean_lines(file):
@@ -2148,7 +2118,7 @@ class CompressionDecider1:
                             compr_spec, fn_pattern = line.split(':', 1)
                         except:
                             continue
-                        self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec))
+                        self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec).compressor)
                 finally:
                     if hasattr(file, 'close'):
                         file.close()
@@ -2156,42 +2126,7 @@ class CompressionDecider1:
     def decide(self, path):
         if self.matcher is not None:
             return self.matcher.match(path)
-        return self.compression
-
-
-class CompressionDecider2:
-    logger = create_logger('borg.debug.file-compression')
-
-    def __init__(self, compression):
-        self.compression = compression
-
-    def decide(self, chunk):
-        # nothing fancy here yet: we either use what the metadata says or the default
-        # later, we can decide based on the chunk data also.
-        # if we compress the data here to decide, we can even update the chunk data
-        # and modify the metadata as desired.
-        compr_spec = chunk.meta.get('compress', self.compression)
-        if compr_spec.name == 'auto':
-            # we did not decide yet, use heuristic:
-            compr_spec, chunk = self.heuristic_lz4(compr_spec, chunk)
-        return compr_spec, chunk
-
-    def heuristic_lz4(self, compr_args, chunk):
-        from .compress import get_compressor
-        meta, data = chunk
-        lz4 = get_compressor('lz4')
-        cdata = lz4.compress(data)
-        data_len = len(data)
-        cdata_len = len(cdata)
-        if cdata_len < 0.97 * data_len:
-            compr_spec = compr_args.spec
-        else:
-            # uncompressible - we could have a special "uncompressible compressor"
-            # that marks such data as uncompressible via compression-type metadata.
-            compr_spec = CompressionSpec('none')
-        self.logger.debug("len(data) == %d, len(lz4(data)) == %d, ratio == %.3f, choosing %s", data_len, cdata_len, cdata_len/data_len, compr_spec)
-        meta['compress'] = compr_spec
-        return compr_spec, Chunk(data, **meta)
+        return self.compressor
 
 
 class ErrorIgnoringTextIOWrapper(io.TextIOWrapper):

+ 6 - 9
src/borg/key.py

@@ -13,14 +13,13 @@ from .logger import create_logger
 logger = create_logger()
 
 from .constants import *  # NOQA
-from .compress import Compressor, get_compressor
+from .compress import Compressor
 from .crypto import AES, bytes_to_long, bytes_to_int, num_aes_blocks, hmac_sha256, blake2b_256, hkdf_hmac_sha512
 from .helpers import Chunk, StableDict
 from .helpers import Error, IntegrityError
 from .helpers import yes
 from .helpers import get_keys_dir, get_security_dir
 from .helpers import bin_to_hex
-from .helpers import CompressionDecider2, CompressionSpec
 from .item import Key, EncryptedKey
 from .platform import SaveFile
 from .nonces import NonceManager
@@ -143,8 +142,8 @@ class KeyBase:
         self.TYPE_STR = bytes([self.TYPE])
         self.repository = repository
         self.target = None  # key location file path / repo obj
-        self.compression_decider2 = CompressionDecider2(CompressionSpec('none'))
         self.compressor = Compressor('none')  # for decompression
+        self.decompress = self.compressor.decompress
         self.tam_required = True
 
     def id_hash(self, data):
@@ -152,10 +151,8 @@ class KeyBase:
         """
 
     def compress(self, chunk):
-        compr_args, chunk = self.compression_decider2.decide(chunk)
-        compressor = Compressor(name=compr_args.name, level=compr_args.spec)
         meta, data = chunk
-        data = compressor.compress(data)
+        data = meta.get('compress', self.compressor).compress(data)
         return Chunk(data, **meta)
 
     def encrypt(self, chunk):
@@ -268,7 +265,7 @@ class PlaintextKey(KeyBase):
         payload = memoryview(data)[1:]
         if not decompress:
             return Chunk(payload)
-        data = self.compressor.decompress(payload)
+        data = self.decompress(payload)
         self.assert_id(id, data)
         return Chunk(data)
 
@@ -362,7 +359,7 @@ class AESKeyBase(KeyBase):
         payload = self.dec_cipher.decrypt(data_view[41:])
         if not decompress:
             return Chunk(payload)
-        data = self.compressor.decompress(payload)
+        data = self.decompress(payload)
         self.assert_id(id, data)
         return Chunk(data)
 
@@ -757,7 +754,7 @@ class AuthenticatedKey(ID_BLAKE2b_256, RepoKey):
         payload = memoryview(data)[1:]
         if not decompress:
             return Chunk(payload)
-        data = self.compressor.decompress(payload)
+        data = self.decompress(payload)
         self.assert_id(id, data)
         return Chunk(data)
 

+ 46 - 1
src/borg/testsuite/compress.py

@@ -7,7 +7,7 @@ except ImportError:
 
 import pytest
 
-from ..compress import get_compressor, Compressor, CNONE, ZLIB, LZ4
+from ..compress import get_compressor, Compressor, CompressionSpec, ComprSpec, CNONE, ZLIB, LZ4, LZMA, Auto
 
 
 buffer = bytes(2**16)
@@ -107,3 +107,48 @@ def test_compressor():
     for params in params_list:
         c = Compressor(**params)
         assert data == c.decompress(c.compress(data))
+
+
+def test_auto():
+    """Check which compressor type the 'auto' meta-compressor emits for two inputs."""
+    compressor = CompressionSpec('auto,zlib,9').compressor
+
+    # 500 zero bytes: the output is expected to be detected as ZLIB,
+    # i.e. the wrapped compressor was used
+    compressed = compressor.compress(bytes(500))
+    assert Compressor.detect(compressed) == ZLIB
+
+    # short byte string (presumably incompressible): the output is expected
+    # to be detected as CNONE, i.e. the 'none' fallback was used
+    compressed = compressor.compress(b'\x00\xb8\xa3\xa2-O\xe1i\xb6\x12\x03\xc21\xf3\x8a\xf78\\\x01\xa5b\x07\x95\xbeE\xf8\xa3\x9ahm\xb1~')
+    assert Compressor.detect(compressed) == CNONE
+
+
+def test_compression_specs():
+    """Check that CompressionSpec builds the expected compressor type/level and rejects bad specs."""
+    # empty spec is invalid
+    with pytest.raises(ValueError):
+        CompressionSpec('')
+
+    # level-less compressors
+    assert isinstance(CompressionSpec('none').compressor, CNONE)
+    assert isinstance(CompressionSpec('lz4').compressor, LZ4)
+
+    # zlib: default level 6, explicit bounds 0 and 9, too many fields rejected
+    # NOTE(review): the local names zlib/lzma would shadow same-named module
+    # imports inside this function, if the test module has them - confirm this is harmless.
+    zlib = CompressionSpec('zlib').compressor
+    assert isinstance(zlib, ZLIB)
+    assert zlib.level == 6
+    zlib = CompressionSpec('zlib,0').compressor
+    assert isinstance(zlib, ZLIB)
+    assert zlib.level == 0
+    zlib = CompressionSpec('zlib,9').compressor
+    assert isinstance(zlib, ZLIB)
+    assert zlib.level == 9
+    with pytest.raises(ValueError):
+        CompressionSpec('zlib,9,invalid')
+
+    # lzma: same expectations as zlib
+    lzma = CompressionSpec('lzma').compressor
+    assert isinstance(lzma, LZMA)
+    assert lzma.level == 6
+    lzma = CompressionSpec('lzma,0').compressor
+    assert isinstance(lzma, LZMA)
+    assert lzma.level == 0
+    lzma = CompressionSpec('lzma,9').compressor
+    assert isinstance(lzma, LZMA)
+    assert lzma.level == 9
+
+    with pytest.raises(ValueError):
+        CompressionSpec('lzma,9,invalid')
+    # unknown compressor name
+    with pytest.raises(ValueError):
+        CompressionSpec('invalid')

+ 2 - 30
src/borg/testsuite/helpers.py

@@ -12,6 +12,7 @@ import msgpack
 import msgpack.fallback
 
 from .. import platform
+from ..compress import CompressionSpec
 from ..helpers import Location
 from ..helpers import Buffer
 from ..helpers import partial_format, format_file_size, parse_file_size, format_timedelta, format_line, PlaceholderError, replace_placeholders
@@ -24,7 +25,7 @@ from ..helpers import StableDict, int_to_bigint, bigint_to_int, bin_to_hex
 from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams, Chunk
 from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless
 from ..helpers import load_exclude_file, load_pattern_file
-from ..helpers import CompressionSpec, ComprSpec, CompressionDecider1, CompressionDecider2
+from ..helpers import CompressionDecider1
 from ..helpers import parse_pattern, PatternMatcher
 from ..helpers import PathFullPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, RegexPattern
 from ..helpers import swidth_slice
@@ -698,25 +699,6 @@ def test_pattern_matcher():
     assert PatternMatcher(fallback="hey!").fallback == "hey!"
 
 
-def test_compression_specs():
-    with pytest.raises(ValueError):
-        CompressionSpec('')
-    assert CompressionSpec('none') == ComprSpec(name='none', spec=None)
-    assert CompressionSpec('lz4') == ComprSpec(name='lz4', spec=None)
-    assert CompressionSpec('zlib') == ComprSpec(name='zlib', spec=6)
-    assert CompressionSpec('zlib,0') == ComprSpec(name='zlib', spec=0)
-    assert CompressionSpec('zlib,9') == ComprSpec(name='zlib', spec=9)
-    with pytest.raises(ValueError):
-        CompressionSpec('zlib,9,invalid')
-    assert CompressionSpec('lzma') == ComprSpec(name='lzma', spec=6)
-    assert CompressionSpec('lzma,0') == ComprSpec(name='lzma', spec=0)
-    assert CompressionSpec('lzma,9') == ComprSpec(name='lzma', spec=9)
-    with pytest.raises(ValueError):
-        CompressionSpec('lzma,9,invalid')
-    with pytest.raises(ValueError):
-        CompressionSpec('invalid')
-
-
 def test_chunkerparams():
     assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095)
     assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095)
@@ -1242,16 +1224,6 @@ none:*.zip
     assert cd.decide('test').name == 'zlib'  # no match in conf, use default
 
 
-def test_compression_decider2():
-    default = CompressionSpec('zlib')
-
-    cd = CompressionDecider2(default)
-    compr_spec, chunk = cd.decide(Chunk(None))
-    assert compr_spec.name == 'zlib'
-    compr_spec, chunk = cd.decide(Chunk(None, compress=CompressionSpec('lzma')))
-    assert compr_spec.name == 'lzma'
-
-
 def test_format_line():
     data = dict(foo='bar baz')
     assert format_line('', data) == ''