
remove --compression-from

Marian Beermann, 8 years ago
commit 69fb9bd403
6 changed files with 14 additions and 138 deletions:

  1. src/borg/archive.py (+8, -17)
  2. src/borg/archiver.py (+3, -45)
  3. src/borg/compress.pyx (+0, -12)
  4. src/borg/helpers.py (+0, -33)
  5. src/borg/key.py (+3, -7)
  6. src/borg/testsuite/helpers.py (+0, -24)

src/borg/archive.py (+8, -17)

@@ -36,7 +36,6 @@ from .helpers import bin_to_hex
 from .helpers import safe_ns
 from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
 from .helpers import PathPrefixPattern, FnmatchPattern
-from .helpers import CompressionDecider
 from .item import Item, ArchiveItem
 from .key import key_factory
 from .platform import acl_get, acl_set, set_flags, get_flags, swidth
@@ -278,7 +277,7 @@ class Archive:
 
     def __init__(self, repository, key, manifest, name, cache=None, create=False,
                  checkpoint_interval=300, numeric_owner=False, noatime=False, noctime=False, progress=False,
-                 chunker_params=CHUNKER_PARAMS, start=None, start_monotonic=None, end=None, compression=None, compression_files=None,
+                 chunker_params=CHUNKER_PARAMS, start=None, start_monotonic=None, end=None,
                  consider_part_files=False, log_json=False):
         self.cwd = os.getcwd()
         self.key = key
@@ -307,11 +306,8 @@ class Archive:
         self.pipeline = DownloadPipeline(self.repository, self.key)
         self.create = create
         if self.create:
-            self.file_compression_logger = create_logger('borg.debug.file-compression')
             self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
             self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
-            self.compression_decider = CompressionDecider(compression or CompressionSpec('none'),
-                                                          compression_files or [])
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
             self.last_checkpoint = time.monotonic()
@@ -970,12 +966,10 @@ Utilization of max. archive size: {csize_max:.0%}
         if chunks is not None:
             item.chunks = chunks
         else:
-            compressor = self.compression_decider.decide(path)
-            self.file_compression_logger.debug('%s -> compression %s', path, compressor.name)
             with backup_io('open'):
                 fh = Archive._open_rb(path)
             with os.fdopen(fh, 'rb') as fd:
-                self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh)), compressor=compressor)
+                self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh)))
             if not is_special_file:
                 # we must not memorize special files, because the contents of e.g. a
                 # block or char device will change without its mtime/size/inode changing.
@@ -1561,7 +1555,7 @@ class ArchiveRecreater:
 
     def __init__(self, repository, manifest, key, cache, matcher,
                  exclude_caches=False, exclude_if_present=None, keep_exclude_tags=False,
-                 chunker_params=None, compression=None, compression_files=None, always_recompress=False,
+                 chunker_params=None, compression=None, always_recompress=False,
                  dry_run=False, stats=False, progress=False, file_status_printer=None,
                  checkpoint_interval=1800):
         self.repository = repository
@@ -1582,8 +1576,6 @@ class ArchiveRecreater:
         self.always_recompress = always_recompress
         self.compression = compression or CompressionSpec('none')
         self.seen_chunks = set()
-        self.compression_decider = CompressionDecider(compression or CompressionSpec('none'),
-                                                      compression_files or [])
 
         self.dry_run = dry_run
         self.stats = stats
@@ -1652,11 +1644,10 @@ class ArchiveRecreater:
                 self.cache.chunk_incref(chunk_id, target.stats)
             return item.chunks
         chunk_iterator = self.iter_chunks(archive, target, list(item.chunks))
-        compressor = self.compression_decider.decide(item.path)
-        chunk_processor = partial(self.chunk_processor, target, compressor)
+        chunk_processor = partial(self.chunk_processor, target)
         target.chunk_file(item, self.cache, target.stats, chunk_iterator, chunk_processor)
 
-    def chunk_processor(self, target, compressor, data):
+    def chunk_processor(self, target, data):
         chunk_id = self.key.id_hash(data)
         if chunk_id in self.seen_chunks:
             return self.cache.chunk_incref(chunk_id, target.stats)
@@ -1664,10 +1655,10 @@ class ArchiveRecreater:
         if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks:
             # Check if this chunk is already compressed the way we want it
             old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False)
-            if Compressor.detect(old_chunk.data).name == compressor.decide(data).name:
+            if Compressor.detect(old_chunk.data).name == self.key.compressor.decide(data).name:
                 # Stored chunk has the same compression we wanted
                 overwrite = False
-        chunk = Chunk(data, compressor=compressor)
+        chunk = Chunk(data)
         chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite, wait=False)
         self.cache.repository.async_response(wait=False)
         self.seen_chunks.add(chunk_entry.id)
@@ -1753,7 +1744,7 @@ class ArchiveRecreater:
     def create_target_archive(self, name):
         target = Archive(self.repository, self.key, self.manifest, name, create=True,
                           progress=self.progress, chunker_params=self.chunker_params, cache=self.cache,
-                          checkpoint_interval=self.checkpoint_interval, compression=self.compression)
+                          checkpoint_interval=self.checkpoint_interval)
         return target
 
     def open_archive(self, name, **kwargs):
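
Note on the chunk_processor hunk above: with the per-file compressor gone, recreate now asks the key's (possibly "auto") compressor what it would pick for the chunk's data, and skips re-storing when the stored chunk already matches. A minimal, runnable sketch of that skip logic, using hypothetical stand-ins (FakeCompressor, needs_recompression) rather than Borg's real Compressor API::

    # Hypothetical stand-ins, not Borg's real classes: skip re-storing a
    # chunk when its stored compression matches what we would choose now.
    class FakeCompressor:
        def __init__(self, name):
            self.name = name

        def decide(self, data):
            # A plain compressor always picks itself; Borg's Auto compressor
            # would run its LZ4 heuristic on `data` here instead.
            return self

    def needs_recompression(stored_name, key_compressor, data):
        """True if the stored compression differs from today's choice."""
        return stored_name != key_compressor.decide(data).name

    assert not needs_recompression('lz4', FakeCompressor('lz4'), b'x' * 100)
    assert needs_recompression('zlib', FakeCompressor('lz4'), b'x' * 100)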

src/borg/archiver.py (+3, -45)

@@ -481,7 +481,6 @@ class Archiver:
                                   numeric_owner=args.numeric_owner, noatime=args.noatime, noctime=args.noctime,
                                   progress=args.progress,
                                   chunker_params=args.chunker_params, start=t0, start_monotonic=t0_monotonic,
-                                  compression=args.compression, compression_files=args.compression_files,
                                   log_json=args.log_json)
                 create_inner(archive, cache)
         else:
@@ -1335,8 +1334,7 @@ class Archiver:
         recreater = ArchiveRecreater(repository, manifest, key, cache, matcher,
                                      exclude_caches=args.exclude_caches, exclude_if_present=args.exclude_if_present,
                                      keep_exclude_tags=args.keep_exclude_tags, chunker_params=args.chunker_params,
-                                     compression=args.compression, compression_files=args.compression_files,
-                                     always_recompress=args.always_recompress,
+                                     compression=args.compression, always_recompress=args.always_recompress,
                                      progress=args.progress, stats=args.stats,
                                      file_status_printer=self.print_file_status,
                                      checkpoint_interval=args.checkpoint_interval,
@@ -1799,43 +1797,13 @@ class Archiver:
             For compressible data, it uses the given C[,L] compression - with C[,L]
             being any valid compression specifier.
 
-        The decision about which compression to use is done by borg like this:
-
-        1. find a compression specifier (per file):
-           match the path/filename against all patterns in all --compression-from
-           files (if any). If a pattern matches, use the compression spec given for
-           that pattern. If no pattern matches (and also if you do not give any
-           --compression-from option), default to the compression spec given by
-           --compression. See docs/misc/compression.conf for an example config.
-
-        2. if the found compression spec is not "auto", the decision is taken:
-           use the found compression spec.
-
-        3. if the found compression spec is "auto", test compressibility of each
-           chunk using lz4.
-           If it is compressible, use the C,[L] compression spec given within the
-           "auto" specifier. If it is not compressible, use no compression.
-
         Examples::
 
             borg create --compression lz4 REPO::ARCHIVE data
             borg create --compression zlib REPO::ARCHIVE data
             borg create --compression zlib,1 REPO::ARCHIVE data
             borg create --compression auto,lzma,6 REPO::ARCHIVE data
-            borg create --compression-from compression.conf --compression auto,lzma ...
-
-        compression.conf has entries like::
-
-            # example config file for --compression-from option
-            #
-            # Format of non-comment / non-empty lines:
-            # <compression-spec>:<path/filename pattern>
-            # compression-spec is same format as for --compression option
-            # path/filename pattern is same format as for --exclude option
-            none:*.gz
-            none:*.zip
-            none:*.mp3
-            none:*.ogg
+            borg create --compression auto,lzma ...
 
         General remarks:
 
@@ -2424,11 +2392,6 @@ class Archiver:
                                    type=CompressionSpec, default=CompressionSpec('lz4'), metavar='COMPRESSION',
                                    help='select compression algorithm, see the output of the '
                                         '"borg help compression" command for details.')
-        archive_group.add_argument('--compression-from', dest='compression_files',
-                                   type=argparse.FileType('r'), action='append',
-                                   metavar='COMPRESSIONCONFIG',
-                                   help='read compression patterns from COMPRESSIONCONFIG, see the output of the '
-                                        '"borg help compression" command for details.')
 
         subparser.add_argument('location', metavar='ARCHIVE',
                                type=location_validator(archive=True),
@@ -2964,7 +2927,7 @@ class Archiver:
         resulting archive will only contain files from these PATHs.
 
         Note that all paths in an archive are relative, therefore absolute patterns/paths
-        will *not* match (--exclude, --exclude-from, --compression-from, PATHs).
+        will *not* match (--exclude, --exclude-from, PATHs).
 
         --compression: all chunks seen will be stored using the given method.
         Due to how Borg stores compressed size information this might display
@@ -3059,11 +3022,6 @@ class Archiver:
         archive_group.add_argument('--always-recompress', dest='always_recompress', action='store_true',
                                    help='always recompress chunks, don\'t skip chunks already compressed with the same '
                                         'algorithm.')
-        archive_group.add_argument('--compression-from', dest='compression_files',
-                                   type=argparse.FileType('r'), action='append',
-                                   metavar='COMPRESSIONCONFIG',
-                                   help='read compression patterns from COMPRESSIONCONFIG, see the output of the '
-                                        '"borg help compression" command for details.')
         archive_group.add_argument('--chunker-params', dest='chunker_params',
                                    type=ChunkerParams, default=CHUNKER_PARAMS,
                                    metavar='PARAMS',

src/borg/compress.pyx (+0, -12)

@@ -5,18 +5,6 @@ borg.compress
 Compression is applied to chunks after ID hashing (so the ID is a direct function of the
 plain chunk, compression is irrelevant to it), and of course before encryption.
 
-Borg has a flexible scheme for deciding which compression to use for chunks.
-
-First, there is a global default set by the --compression command line option,
-which sets the .compressor attribute on the Key.
-
-For chunks that emanate from files CompressionDecider may set a specific
-Compressor based on patterns (this is the --compression-from option). This is stored
-as a Compressor instance in the "compressor" key in the Chunk's meta dictionary.
-
-When compressing (KeyBase.compress) either the Compressor specified in the Chunk's
-meta dictionary is used, or the default Compressor of the key.
-
 The "auto" mode (e.g. --compression auto,lzma,4) is implemented as a meta Compressor,
 meaning that Auto acts like a Compressor, but defers actual work to others (namely
 LZ4 as a heuristic whether compression is worth it, and the specified Compressor
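
A minimal sketch of that "auto" idea — a meta-compressor that probes compressibility and then either delegates or stores raw. As stand-ins, zlib level 1 plays the role of Borg's cheap LZ4 probe; the class, names, and threshold here are illustrative, not Borg's API::

    import zlib

    class AutoSketch:
        """Meta-compressor: probe compressibility, then delegate or store raw."""
        def __init__(self, inner_compress, threshold=0.97):
            self.inner_compress = inner_compress  # e.g. lzma at some level
            self.threshold = threshold            # probe must beat this ratio

        def decide(self, data):
            probe = zlib.compress(data, 1)        # stand-in for the LZ4 probe
            if len(probe) < len(data) * self.threshold:
                return self.inner_compress        # compressible: really compress
            return lambda d: d                    # incompressible: store as-is

        def compress(self, data):
            return self.decide(data)(data)

    auto = AutoSketch(lambda d: zlib.compress(d, 9))
    assert len(auto.compress(b'a' * 4096)) < 4096  # compressible input shrinks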

src/borg/helpers.py (+0, -33)

@@ -2096,39 +2096,6 @@ def clean_lines(lines, lstrip=None, rstrip=None, remove_empty=True, remove_comme
         yield line
 
 
-class CompressionDecider:
-    def __init__(self, compression, compression_files):
-        """
-        Initialize a CompressionDecider instance (and read config files, if needed).
-
-        :param compression: default CompressionSpec (e.g. from --compression option)
-        :param compression_files: list of compression config files (e.g. from --compression-from) or
-                                  a list of other line iterators
-        """
-        from .compress import CompressionSpec
-        self.compressor = compression.compressor
-        if not compression_files:
-            self.matcher = None
-        else:
-            self.matcher = PatternMatcher(fallback=compression.compressor)
-            for file in compression_files:
-                try:
-                    for line in clean_lines(file):
-                        try:
-                            compr_spec, fn_pattern = line.split(':', 1)
-                        except:
-                            continue
-                        self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec).compressor)
-                finally:
-                    if hasattr(file, 'close'):
-                        file.close()
-
-    def decide(self, path):
-        if self.matcher is not None:
-            return self.matcher.match(path)
-        return self.compressor
-
-
 class ErrorIgnoringTextIOWrapper(io.TextIOWrapper):
     def read(self, n):
         if not self.closed:

src/borg/key.py (+3, -7)

@@ -152,10 +152,6 @@ class KeyBase:
         """Return HMAC hash using the "id" HMAC key
         """
 
-    def compress(self, chunk):
-        meta, data = chunk
-        return meta.get('compressor', self.compressor).compress(data)
-
     def encrypt(self, chunk):
         pass
 
@@ -256,7 +252,7 @@ class PlaintextKey(KeyBase):
         return sha256(data).digest()
 
     def encrypt(self, chunk):
-        data = self.compress(chunk)
+        data = self.compressor.compress(chunk.data)
         return b''.join([self.TYPE_STR, data])
 
     def decrypt(self, id, data, decompress=True):
@@ -334,7 +330,7 @@ class AESKeyBase(KeyBase):
     MAC = hmac_sha256
 
     def encrypt(self, chunk):
-        data = self.compress(chunk)
+        data = self.compressor.compress(chunk.data)
         self.nonce_manager.ensure_reservation(num_aes_blocks(len(data)))
         self.enc_cipher.reset()
         data = b''.join((self.enc_cipher.iv[8:], self.enc_cipher.encrypt(data)))
@@ -746,7 +742,7 @@ class AuthenticatedKey(ID_BLAKE2b_256, RepoKey):
     STORAGE = KeyBlobStorage.REPO
 
     def encrypt(self, chunk):
-        data = self.compress(chunk)
+        data = self.compressor.compress(chunk.data)
         return b''.join([self.TYPE_STR, data])
 
     def decrypt(self, id, data, decompress=True):
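
The three encrypt() hunks above all make the same move: the removed KeyBase.compress() looked up a per-chunk compressor in the chunk's meta dict, while the new code always asks the key-level compressor. A small before/after sketch, with a zlib stand-in for the compressor and a bare namedtuple for Chunk (both hypothetical, not Borg's types)::

    import zlib
    from collections import namedtuple

    Chunk = namedtuple('Chunk', 'meta data')

    class ZlibStandIn:
        def compress(self, data):
            return zlib.compress(data)

    def compress_before(key_compressor, chunk):
        # old: a per-chunk 'compressor' meta entry overrode the key default
        return chunk.meta.get('compressor', key_compressor).compress(chunk.data)

    def compress_after(key_compressor, chunk):
        # new: the key's compressor (possibly "auto") decides alone
        return key_compressor.compress(chunk.data)

    c = Chunk(meta={}, data=b'hello' * 100)
    assert compress_before(ZlibStandIn(), c) == compress_after(ZlibStandIn(), c)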

src/borg/testsuite/helpers.py (+0, -24)

@@ -12,7 +12,6 @@ import msgpack
 import msgpack.fallback
 
 from .. import platform
-from ..compress import CompressionSpec
 from ..helpers import Location
 from ..helpers import Buffer
 from ..helpers import partial_format, format_file_size, parse_file_size, format_timedelta, format_line, PlaceholderError, replace_placeholders
@@ -25,7 +24,6 @@ from ..helpers import StableDict, int_to_bigint, bigint_to_int, bin_to_hex
 from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams, Chunk
 from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless
 from ..helpers import load_exclude_file, load_pattern_file
-from ..helpers import CompressionDecider
 from ..helpers import parse_pattern, PatternMatcher
 from ..helpers import PathFullPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, RegexPattern
 from ..helpers import swidth_slice
@@ -1202,28 +1200,6 @@ data2
     assert list(clean_lines(conf, remove_comments=False)) == ['#comment', 'data1 #data1', 'data2', 'data3', ]
 
 
-def test_compression_decider():
-    default = CompressionSpec('zlib')
-    conf = """
-# use super-fast lz4 compression on huge VM files in this path:
-lz4:/srv/vm_disks
-
-# jpeg or zip files do not compress:
-none:*.jpeg
-none:*.zip
-""".splitlines()
-
-    cd = CompressionDecider(default, [])  # no conf, always use default
-    assert cd.decide('/srv/vm_disks/linux').name == 'zlib'
-    assert cd.decide('test.zip').name == 'zlib'
-    assert cd.decide('test').name == 'zlib'
-
-    cd = CompressionDecider(default, [conf, ])
-    assert cd.decide('/srv/vm_disks/linux').name == 'lz4'
-    assert cd.decide('test.zip').name == 'none'
-    assert cd.decide('test').name == 'zlib'  # no match in conf, use default
-
-
 def test_format_line():
     data = dict(foo='bar baz')
     assert format_line('', data) == ''