Browse Source

Merge pull request #129 from ThomasWaldmann/compression

compression flexibility, new none, lz4 and lzma compression
TW 10 years ago
parent
commit
fffe509268

+ 1 - 0
.gitignore

@@ -6,6 +6,7 @@ env
 .tox
 hashindex.c
 chunker.c
+compress.c
 crypto.c
 platform_darwin.c
 platform_freebsd.c

+ 4 - 0
.travis/install.sh

@@ -14,6 +14,7 @@ if [[ "$(uname -s)" == 'Darwin' ]]; then
         eval "$(pyenv init -)"
     fi
 
+    brew install lz4
     brew outdated pyenv || brew upgrade pyenv
 
     case "${TOXENV}" in
@@ -34,6 +35,9 @@ if [[ "$(uname -s)" == 'Darwin' ]]; then
     python -m pip install --user virtualenv
 else
     pip install virtualenv
+    sudo add-apt-repository -y ppa:gezakovacs/lz4
+    sudo apt-get update
+    sudo apt-get install -y liblz4-dev
     sudo apt-get install -y libacl1-dev
 fi
 

+ 22 - 3
CHANGES.rst

@@ -5,16 +5,35 @@ Borg Changelog
 Version 0.25.0 (not released yet)
 ---------------------------------
 
-Incompatible changes (compared to 0.24):
+Compatibility notes:
 
-- none yet
+- the new compression code is very compatible: as long as you stay with zlib
+  compression, older borg releases will still be able to read data from a
+  repo/archive made with the new code (note: this is not the case for the
+  default "none" compression, use "zlib,0" if you want a "no compression" mode
+  that can be read by older borg). Also the new code is able to read repos and
+  archives made with older borg versions (for all zlib levels 0..9).
 
 Deprecations:
 
-- none yet
+- --compression N (with N being a number, as in 0.24) is deprecated.
+  We keep --compression 0..9 for now so as not to break scripts, but it is
+  deprecated and will be removed later, so better fix your scripts now:
+  --compression 0 (as in 0.24) is the same as --compression zlib,0 (now).
+  BUT: if you do not want compression, you want --compression none
+  (which is the default).
+  --compression 1 (as in 0.24) is the same as --compression zlib,1 (now).
+  --compression 9 (as in 0.24) is the same as --compression zlib,9 (now).
+
 
 New features:
 
+- create --compression none (default, means: do not compress, just pass through
+  data "as is". This is more efficient than zlib level 0 as used in borg 0.24.)
+- create --compression lz4 (super-fast, but not very high compression)
+  Please note that borgbackup needs the lz4 library as an additional requirement.
+- create --compression zlib,N (slower, higher compression, default for N is 6)
+- create --compression lzma,N (slowest, highest compression, default for N is 6)
 - honor the nodump flag (UF_NODUMP) and do not backup such items
 
 Bug fixes:
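
The zlib compatibility claim above can be checked directly: for zlib, the new code adds no extra ID header, the regular zlib header (0x.8..) doubles as the detection pattern (see ``borg/compress.pyx`` below and ``test_zlib_compat``). A minimal sketch:

    import zlib
    from borg.compress import get_compressor

    data = b'data' * 100
    c = get_compressor(name='zlib', level=6)
    # byte-identical to what borg 0.24 wrote, hence readable by older releases
    assert c.compress(data) == zlib.compress(data, 6)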

+ 2 - 1
README.rst

@@ -51,7 +51,8 @@ Main features
     authenticity is verified using HMAC-SHA256.
 
 **Compression**
-    All data can be compressed by zlib, level 0-9.
+    All data can be compressed by lz4 (super fast, low compression), zlib
+    (medium speed and compression) or lzma (low speed, high compression).
 
 **Off-site backups**
     Borg can store data on any remote host accessible over SSH.  If Borg is

+ 13 - 5
borg/archiver.py

@@ -14,6 +14,7 @@ import traceback
 
 from . import __version__
 from .archive import Archive, ArchiveChecker, CHUNKER_PARAMS
+from .compress import Compressor, COMPR_BUFFER
 from .repository import Repository
 from .cache import Cache
 from .key import key_creator
@@ -21,7 +22,7 @@ from .helpers import Error, location_validator, format_time, format_file_size, \
     format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
     get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
     Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
-    is_cachedir, bigint_to_int, ChunkerParams
+    is_cachedir, bigint_to_int, ChunkerParams, CompressionSpec
 from .remote import RepositoryServer, RemoteRepository
 
 has_lchflags = hasattr(os, 'lchflags')
@@ -104,7 +105,9 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
         t0 = datetime.now()
         repository = self.open_repository(args.archive, exclusive=True)
         manifest, key = Manifest.load(repository)
-        key.compression_level = args.compression
+        compr_args = dict(buffer=COMPR_BUFFER)
+        compr_args.update(args.compression)
+        key.compressor = Compressor(**compr_args)
         cache = Cache(repository, key, manifest, do_files=args.cache_files)
         archive = Archive(repository, key, manifest, args.archive.archive, cache=cache,
                           create=True, checkpoint_interval=args.checkpoint_interval,
@@ -670,9 +673,14 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
                                metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE',
                                help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
         subparser.add_argument('-C', '--compression', dest='compression',
-                               type=int, default=0, metavar='N',
-                               help='select compression algorithm and level. 0..9 is supported and means zlib '
-                                    'level 0 (no compression, fast, default) .. zlib level 9 (high compression, slow).')
+                               type=CompressionSpec, default=dict(name='none'), metavar='COMPRESSION',
+                               help='select compression algorithm (and level): '
+                                    'none == no compression (default), '
+                                    'lz4 == lz4, '
+                                    'zlib == zlib (default level 6), '
+                                    'zlib,0 .. zlib,9 == zlib (with level 0..9), '
+                                    'lzma == lzma (default level 6), '
+                                    'lzma,0 .. lzma,9 == lzma (with level 0..9).')
         subparser.add_argument('archive', metavar='ARCHIVE',
                                type=location_validator(archive=True),
                                help='archive to create')
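
The create path above merges the parsed compression spec with the shared output buffer before building the compressor. Reduced to its essence (a sketch of the three lines added to do_create):

    from borg.compress import Compressor, COMPR_BUFFER

    compression = dict(name='zlib', level=6)   # what CompressionSpec returns for "zlib,6"
    compr_args = dict(buffer=COMPR_BUFFER)
    compr_args.update(compression)
    compressor = Compressor(**compr_args)      # becomes key.compressor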

+ 199 - 0
borg/compress.pyx

@@ -0,0 +1,199 @@
+import zlib
+try:
+    import lzma
+except ImportError:
+    lzma = None
+
+cdef extern from "lz4.h":
+    int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
+    int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
+
+
+cdef class CompressorBase:
+    """
+    base class for all (de)compression classes,
+    also handles compression format auto detection and
+    adding/stripping the ID header (which enables auto detection).
+    """
+    ID = b'\xFF\xFF'  # reserved and not used
+                      # overwrite with a unique 2-byte bytestring in child classes
+    name = 'baseclass'
+
+    @classmethod
+    def detect(cls, data):
+        return data.startswith(cls.ID)
+
+    def __init__(self, **kwargs):
+        pass
+
+    def compress(self, data):
+        # add ID bytes
+        return self.ID + data
+
+    def decompress(self, data):
+        # strip ID bytes
+        return data[2:]
+
+
+class CNONE(CompressorBase):
+    """
+    none - no compression, just pass through data
+    """
+    ID = b'\x00\x00'
+    name = 'none'
+
+    def compress(self, data):
+        return super().compress(data)
+
+    def decompress(self, data):
+        data = super().decompress(data)
+        if not isinstance(data, bytes):
+            data = bytes(data)
+        return data
+
+
+cdef class LZ4(CompressorBase):
+    """
+    raw LZ4 compression / decompression (liblz4).
+
+    Features:
+        - lz4 is super fast
+        - wrapper releases CPython's GIL to support multithreaded code
+        - buffer given by caller, avoiding frequent reallocation and buffer duplication
+        - uses safe lz4 methods that never go beyond the end of the output buffer
+
+    But beware:
+        - this is not very generic, the given buffer MUST be large enough to
+          handle all compression or decompression output (or it will fail).
+        - you must not do method calls to the same LZ4 instance from different
+          threads at the same time - create one LZ4 instance per thread!
+    """
+    ID = b'\x01\x00'
+    name = 'lz4'
+
+    cdef char *buffer  # helper buffer for (de)compression output
+    cdef int bufsize  # size of this buffer
+
+    def __cinit__(self, **kwargs):
+        buffer = kwargs['buffer']
+        self.buffer = buffer
+        self.bufsize = len(buffer)
+
+    def compress(self, idata):
+        if not isinstance(idata, bytes):
+            idata = bytes(idata)  # code below does not work with memoryview
+        cdef int isize = len(idata)
+        cdef int osize = self.bufsize
+        cdef char *source = idata
+        cdef char *dest = self.buffer
+        with nogil:
+            osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
+        if not osize:
+            raise Exception('lz4 compress failed')
+        return super().compress(dest[:osize])
+
+    def decompress(self, idata):
+        if not isinstance(idata, bytes):
+            idata = bytes(idata)  # code below does not work with memoryview
+        idata = super().decompress(idata)
+        cdef int isize = len(idata)
+        cdef int osize = self.bufsize
+        cdef char *source = idata
+        cdef char *dest = self.buffer
+        with nogil:
+            osize = LZ4_decompress_safe(source, dest, isize, osize)
+        if osize < 0:
+            # malformed input data, buffer too small, ...
+            raise Exception('lz4 decompress failed')
+        return dest[:osize]
+
+
+class LZMA(CompressorBase):
+    """
+    lzma compression / decompression (python 3.3+ stdlib)
+    """
+    ID = b'\x02\x00'
+    name = 'lzma'
+
+    def __init__(self, level=6, **kwargs):
+        super().__init__(**kwargs)
+        self.level = level
+        if lzma is None:
+            raise ValueError('No lzma support found.')
+
+    def compress(self, data):
+        # we do not need integrity checks in lzma, we do that already
+        data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
+        return super().compress(data)
+
+    def decompress(self, data):
+        data = super().decompress(data)
+        return lzma.decompress(data)
+
+
+class ZLIB(CompressorBase):
+    """
+    zlib compression / decompression (python stdlib)
+    """
+    ID = b'\x08\x00'  # not used here, see detect()
+                      # avoid all 0x.8.. IDs elsewhere!
+    name = 'zlib'
+
+    @classmethod
+    def detect(cls, data):
+        # matches misc. patterns 0x.8.. used by zlib
+        cmf, flg = data[:2]
+        is_deflate = cmf & 0x0f == 8
+        check_ok = (cmf * 256 + flg) % 31 == 0
+        return check_ok and is_deflate
+
+    def __init__(self, level=6, **kwargs):
+        super().__init__(**kwargs)
+        self.level = level
+
+    def compress(self, data):
+        # note: for compatibility no super call, do not add ID bytes
+        return zlib.compress(data, self.level)
+
+    def decompress(self, data):
+        # note: for compatibility no super call, do not strip ID bytes
+        return zlib.decompress(data)
+
+
+COMPRESSOR_TABLE = {
+    CNONE.name: CNONE,
+    LZ4.name: LZ4,
+    ZLIB.name: ZLIB,
+    LZMA.name: LZMA,
+}
+COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ]  # check fast stuff first
+
+def get_compressor(name, **kwargs):
+    cls = COMPRESSOR_TABLE[name]
+    return cls(**kwargs)
+
+
+class Compressor:
+    """
+    compresses using a compressor with given name and parameters
+    decompresses everything we can handle (autodetect)
+    """
+    def __init__(self, name='none', **kwargs):
+        self.params = kwargs
+        self.compressor = get_compressor(name, **self.params)
+
+    def compress(self, data):
+        return self.compressor.compress(data)
+
+    def decompress(self, data):
+        hdr = bytes(data[:2])  # detect() does not work with memoryview
+        for cls in COMPRESSOR_LIST:
+            if cls.detect(hdr):
+                return cls(**self.params).decompress(data)
+        else:
+            raise ValueError('No decompressor for this data found: %r.' % (data[:2],))
+
+
+# a buffer used for (de)compression result, which can be slightly bigger
+# than the chunk buffer in the worst (incompressible data) case, add 10%:
+COMPR_BUFFER = bytes(int(1.1 * 2 ** 23))  # CHUNK_MAX_EXP == 23
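
The module above can be exercised on its own. A minimal sketch (assumes the compress extension is built; names are taken from the code above):

    from borg.compress import Compressor, COMPR_BUFFER

    data = b'fooooooooobaaaaaaaar' * 10

    # compress with lz4; the 2-byte ID header is prepended automatically
    lz4 = Compressor(name='lz4', buffer=COMPR_BUFFER)
    cdata = lz4.compress(data)

    # any Compressor instance can read it back: decompress() autodetects
    # the format from the ID header, regardless of the configured method
    other = Compressor(name='none', buffer=COMPR_BUFFER)
    assert other.decompress(cdata) == data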

+ 36 - 0
borg/helpers.py

@@ -278,9 +278,45 @@ def timestamp(s):
 
 def ChunkerParams(s):
     window_size, chunk_mask, chunk_min, chunk_max = s.split(',')
+    if int(chunk_max) > 23:
+        # do not go beyond 2**23 (8MB) chunk size now,
+        # COMPR_BUFFER can only cope with up to this size
+        raise ValueError
     return int(window_size), int(chunk_mask), int(chunk_min), int(chunk_max)
 
 
+def CompressionSpec(s):
+    values = s.split(',')
+    count = len(values)
+    if count < 1:
+        raise ValueError
+    compression = values[0]
+    try:
+        compression = int(compression)
+        if count > 1:
+            raise ValueError
+        # DEPRECATED: it is just --compression N
+        if 0 <= compression <= 9:
+            return dict(name='zlib', level=compression)
+        raise ValueError
+    except ValueError:
+        # --compression algo[,...]
+        name = compression
+        if name in ('none', 'lz4', ):
+            return dict(name=name)
+        if name in ('zlib', 'lzma', ):
+            if count < 2:
+                level = 6  # default compression level in py stdlib
+            elif count == 2:
+                level = int(values[1])
+                if not 0 <= level <= 9:
+                    raise ValueError
+            else:
+                raise ValueError
+            return dict(name=name, level=level)
+        raise ValueError
+
+
 def is_cachedir(path):
     """Determines whether the specified path is a cache directory (and
     therefore should potentially be excluded from the backup) according to
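
For illustration, the parser above maps spec strings to the keyword dicts consumed by Compressor like so (a sketch; the same mappings are asserted in the new tests further down):

    from borg.helpers import CompressionSpec

    CompressionSpec('none')    # -> dict(name='none')
    CompressionSpec('lz4')     # -> dict(name='lz4')
    CompressionSpec('zlib')    # -> dict(name='zlib', level=6), stdlib default level
    CompressionSpec('lzma,9')  # -> dict(name='lzma', level=9)
    CompressionSpec('3')       # -> dict(name='zlib', level=3), deprecated numeric form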

+ 6 - 6
borg/key.py

@@ -6,9 +6,9 @@ import msgpack
 import textwrap
 import hmac
 from hashlib import sha256
-import zlib
 
 from .crypto import pbkdf2_sha256, get_random_bytes, AES, bytes_to_long, long_to_bytes, bytes_to_int, num_aes_blocks
+from .compress import Compressor, COMPR_BUFFER
 from .helpers import IntegrityError, get_keys_dir, Error
 
 PREFIX = b'\0' * 8
@@ -68,7 +68,7 @@ class KeyBase:
         self.TYPE_STR = bytes([self.TYPE])
         self.repository = repository
         self.target = None  # key location file path / repo obj
-        self.compression_level = 0
+        self.compressor = Compressor('none', buffer=COMPR_BUFFER)
 
     def id_hash(self, data):
         """Return HMAC hash using the "id" HMAC key
@@ -99,12 +99,12 @@ class PlaintextKey(KeyBase):
         return sha256(data).digest()
 
     def encrypt(self, data):
-        return b''.join([self.TYPE_STR, zlib.compress(data, self.compression_level)])
+        return b''.join([self.TYPE_STR, self.compressor.compress(data)])
 
     def decrypt(self, id, data):
         if data[0] != self.TYPE:
             raise IntegrityError('Invalid encryption envelope')
-        data = zlib.decompress(memoryview(data)[1:])
+        data = self.compressor.decompress(memoryview(data)[1:])
         if id and sha256(data).digest() != id:
             raise IntegrityError('Chunk id verification failed')
         return data
@@ -131,7 +131,7 @@ class AESKeyBase(KeyBase):
         return HMAC(self.id_key, data, sha256).digest()
 
     def encrypt(self, data):
-        data = zlib.compress(data, self.compression_level)
+        data = self.compressor.compress(data)
         self.enc_cipher.reset()
         data = b''.join((self.enc_cipher.iv[8:], self.enc_cipher.encrypt(data)))
         hmac = HMAC(self.enc_hmac_key, data, sha256).digest()
@@ -144,7 +144,7 @@ class AESKeyBase(KeyBase):
         if memoryview(HMAC(self.enc_hmac_key, memoryview(data)[33:], sha256).digest()) != hmac:
             raise IntegrityError('Encryption envelope checksum mismatch')
         self.dec_cipher.reset(iv=PREFIX + data[33:41])
-        data = zlib.decompress(self.dec_cipher.decrypt(data[41:]))  # should use memoryview
+        data = self.compressor.decompress(self.dec_cipher.decrypt(data[41:]))
         if id and HMAC(self.id_key, data, sha256).digest() != id:
             raise IntegrityError('Chunk id verification failed')
         return data
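
The point of storing a Compressor on the key (instead of a zlib level) is that decrypt() now autodetects the compression format per chunk, so one repository can mix methods. A minimal sketch of that property, using zlib so it runs with the stdlib alone:

    from borg.compress import Compressor, COMPR_BUFFER

    writer = Compressor('zlib', level=9, buffer=COMPR_BUFFER)
    reader = Compressor('none', buffer=COMPR_BUFFER)  # configured method is irrelevant when reading
    assert reader.decompress(writer.compress(b'chunk data')) == b'chunk data'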

+ 102 - 0
borg/testsuite/compress.py

@@ -0,0 +1,102 @@
+import zlib
+try:
+    import lzma
+except ImportError:
+    lzma = None
+
+import pytest
+
+from ..compress import get_compressor, Compressor, CNONE, ZLIB, LZ4
+
+
+buffer = bytes(2**16)
+data = b'fooooooooobaaaaaaaar' * 10
+params = dict(name='zlib', level=6, buffer=buffer)
+
+
+def test_get_compressor():
+    c = get_compressor(name='none')
+    assert isinstance(c, CNONE)
+    c = get_compressor(name='lz4', buffer=buffer)
+    assert isinstance(c, LZ4)
+    c = get_compressor(name='zlib')
+    assert isinstance(c, ZLIB)
+    with pytest.raises(KeyError):
+        get_compressor(name='foobar')
+
+
+def test_cnull():
+    c = get_compressor(name='none')
+    cdata = c.compress(data)
+    assert len(cdata) > len(data)
+    assert data in cdata  # it's not compressed and just in there 1:1
+    assert data == c.decompress(cdata)
+    assert data == Compressor(**params).decompress(cdata)  # autodetect
+
+
+def test_lz4():
+    c = get_compressor(name='lz4', buffer=buffer)
+    cdata = c.compress(data)
+    assert len(cdata) < len(data)
+    assert data == c.decompress(cdata)
+    assert data == Compressor(**params).decompress(cdata)  # autodetect
+
+
+def test_zlib():
+    c = get_compressor(name='zlib')
+    cdata = c.compress(data)
+    assert len(cdata) < len(data)
+    assert data == c.decompress(cdata)
+    assert data == Compressor(**params).decompress(cdata)  # autodetect
+
+
+def test_lzma():
+    if lzma is None:
+        pytest.skip("No lzma support found.")
+    c = get_compressor(name='lzma')
+    cdata = c.compress(data)
+    assert len(cdata) < len(data)
+    assert data == c.decompress(cdata)
+    assert data == Compressor(**params).decompress(cdata)  # autodetect
+
+
+def test_autodetect_invalid():
+    with pytest.raises(ValueError):
+        Compressor(**params).decompress(b'\xff\xfftotalcrap')
+    with pytest.raises(ValueError):
+        Compressor(**params).decompress(b'\x08\x00notreallyzlib')
+
+
+def test_zlib_compat():
+    # for compatibility reasons, we do not add an extra header for zlib,
+    # nor do we expect one when decompressing / autodetecting
+    for level in range(10):
+        c = get_compressor(name='zlib', level=level)
+        cdata1 = c.compress(data)
+        cdata2 = zlib.compress(data, level)
+        assert cdata1 == cdata2
+        data2 = c.decompress(cdata2)
+        assert data == data2
+        data2 = Compressor(**params).decompress(cdata2)
+        assert data == data2
+
+
+def test_compressor():
+    params_list = [
+        dict(name='none', buffer=buffer),
+        dict(name='lz4', buffer=buffer),
+        dict(name='zlib', level=0, buffer=buffer),
+        dict(name='zlib', level=6, buffer=buffer),
+        dict(name='zlib', level=9, buffer=buffer),
+    ]
+    if lzma:
+        params_list += [
+            dict(name='lzma', level=0, buffer=buffer),
+            dict(name='lzma', level=6, buffer=buffer),
+            dict(name='lzma', level=9, buffer=buffer),
+        ]
+    for params in params_list:
+        c = Compressor(**params)
+        assert data == c.decompress(c.compress(data))
+
+

+ 26 - 1
borg/testsuite/helpers.py

@@ -2,11 +2,12 @@ import hashlib
 from time import mktime, strptime
 from datetime import datetime, timezone, timedelta
 
+import pytest
 import msgpack
 
 from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, ExcludePattern, make_path_safe, \
     prune_within, prune_split, \
-    StableDict, int_to_bigint, bigint_to_int, parse_timestamp
+    StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec
 from . import BaseTestCase
 
 
@@ -104,6 +105,30 @@ class PatternTestCase(BaseTestCase):
                           ['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg'])
 
 
+def test_compression_specs():
+    with pytest.raises(ValueError):
+        CompressionSpec('')
+    assert CompressionSpec('0') == dict(name='zlib', level=0)
+    assert CompressionSpec('1') == dict(name='zlib', level=1)
+    assert CompressionSpec('9') == dict(name='zlib', level=9)
+    with pytest.raises(ValueError):
+        CompressionSpec('10')
+    assert CompressionSpec('none') == dict(name='none')
+    assert CompressionSpec('lz4') == dict(name='lz4')
+    assert CompressionSpec('zlib') == dict(name='zlib', level=6)
+    assert CompressionSpec('zlib,0') == dict(name='zlib', level=0)
+    assert CompressionSpec('zlib,9') == dict(name='zlib', level=9)
+    with pytest.raises(ValueError):
+        CompressionSpec('zlib,9,invalid')
+    assert CompressionSpec('lzma') == dict(name='lzma', level=6)
+    assert CompressionSpec('lzma,0') == dict(name='lzma', level=0)
+    assert CompressionSpec('lzma,9') == dict(name='lzma', level=9)
+    with pytest.raises(ValueError):
+        CompressionSpec('lzma,9,invalid')
+    with pytest.raises(ValueError):
+        CompressionSpec('invalid')
+
+
 class MakePathSafeTestCase(BaseTestCase):
 
     def test(self):

+ 1 - 0
docs/global.rst.inc

@@ -13,6 +13,7 @@
 .. _PBKDF2: https://en.wikipedia.org/wiki/PBKDF2
 .. _ACL: https://en.wikipedia.org/wiki/Access_control_list
 .. _libacl: http://savannah.nongnu.org/projects/acl/
+.. _liblz4: https://github.com/Cyan4973/lz4
 .. _OpenSSL: https://www.openssl.org/
 .. _Python: http://www.python.org/
 .. _Buzhash: https://en.wikipedia.org/wiki/Buzhash

+ 10 - 2
docs/installation.rst

@@ -9,6 +9,7 @@ Installation
 * Python_ >= 3.2
 * OpenSSL_ >= 1.0.0
 * libacl_
+* liblz4_
 * some python dependencies, see install_requires in setup.py
 
 General notes
@@ -59,6 +60,9 @@ Some of the steps detailled below might be useful also for non-git installs.
     # ACL support Headers + Library
     apt-get install libacl1-dev libacl1
 
+    # lz4 super fast compression support Headers + Library
+    apt-get install liblz4-dev liblz4-1
+
     # if you do not have gcc / make / etc. yet
     apt-get install build-essential
 
@@ -106,13 +110,16 @@ Some of the steps detailled below might be useful also for non-git installs.
 
     # ACL support Headers + Library
     sudo dnf install libacl-devel libacl
-    
+
+    # lz4 super fast compression support Headers + Library
+    sudo dnf install lz4
+
     # optional: FUSE support - to mount backup archives
     sudo dnf install fuse-devel fuse
     
     # optional: for unit testing
     sudo dnf install fakeroot
-    
+
     # get |project_name| from github, install it
     git clone |git_url|
 
@@ -148,6 +155,7 @@ You'll need at least (use the cygwin installer to fetch/install these):
     gcc-core
     git
     libopenssl
+    liblz4_1 liblz4-devel  # from cygwinports.org
     make
     openssh
     openssl-devel

+ 30 - 5
docs/internals.rst

@@ -382,10 +382,35 @@ representation of the repository id.
 Compression
 -----------
 
-|project_name| currently always pipes all data through a zlib compressor which
-supports compression levels 0 (no compression, fast) to 9 (high compression, slow).
+|project_name| supports the following compression methods:
 
-See ``borg create --help`` about how to specify the compression level and its default.
+- none (no compression, pass through data 1:1)
+- lz4 (low compression, but super fast)
+- zlib (level 0-9, level 0 is no compression [but still adding zlib overhead],
+  level 1 is low, level 9 is high compression)
+- lzma (level 0-9, level 0 is low, level 9 is high compression).
+
+Speed: none > lz4 > zlib > lzma
+
+Compression: lzma > zlib > lz4 > none
+
+Be careful, higher zlib and especially lzma compression levels might take a
+lot of resources (CPU and memory).
+
+The overall speed of course also depends on the speed of your target storage.
+If that is slow, using a higher compression level might yield better overall
+performance. You need to experiment a bit. Maybe just watch your CPU load: if
+it is relatively low, increase compression until 1 core is 70-100% loaded.
 
-Note: zlib level 0 creates a little bit more output data than it gets as input,
-due to zlib protocol overhead.
+Even if your target storage is rather fast, you might see interesting effects:
+while doing no compression at all (none) is an operation that takes no time,
+it likely will need to store more data to the storage compared to using lz4.
+The time needed to transfer and store the additional data might be much more
+than if you had used lz4 (which is super fast, but still might compress your
+data about 2:1). This assumes your data is compressible (if you back up
+already compressed data, trying to compress it again at backup time is usually
+pointless).
+
+Compression is applied after deduplication, thus using different compression
+methods in one repo does not influence deduplication.
+
+See ``borg create --help`` about how to specify the compression level and its default.
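
To pick a method empirically, the compressors can be timed directly on a representative sample of your data. A rough sketch (the sample path is hypothetical; the input must not exceed the 2**23 byte chunk size COMPR_BUFFER is dimensioned for, and the lzma entry needs the stdlib lzma module):

    import time
    from borg.compress import Compressor, COMPR_BUFFER

    data = open('/path/to/representative.sample', 'rb').read()  # hypothetical sample file
    for spec in (dict(name='none'), dict(name='lz4'),
                 dict(name='zlib', level=6), dict(name='lzma', level=6)):
        c = Compressor(buffer=COMPR_BUFFER, **spec)
        t0 = time.time()
        cdata = c.compress(data)
        print('%-4s ratio=%.2f time=%.3fs' % (spec['name'], len(cdata) / len(data), time.time() - t0))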

+ 25 - 0
docs/quickstart.rst

@@ -89,6 +89,31 @@ certain number of old archives::
     # and 6 monthly archives.
     borg prune -v $REPOSITORY --keep-daily=7 --keep-weekly=4 --keep-monthly=6
 
+.. _backup_compression:
+
+Backup compression
+------------------
+
+Default is no compression, but we support several methods, trading speed
+against compression ratio:
+
+If you have fast repo storage and want a little compression::
+
+    $ borg create --compression lz4 /mnt/backup::repo ~
+
+If you have medium-fast repo storage and want a bit more compression (N=0..9,
+0 means no compression, 9 means high compression)::
+
+    $ borg create --compression zlib,N /mnt/backup::repo ~
+
+If you have very slow repo storage and want high compression (N=0..9, 0 means
+low compression, 9 means high compression)::
+
+    $ borg create --compression lzma,N /mnt/backup::repo ~
+
+You'll need to experiment a bit to find the best compression for your use case.
+Keep an eye on CPU load and throughput.
+
 .. _encrypted_repos:
 
 Repository encryption

+ 3 - 0
docs/support.rst

@@ -4,6 +4,9 @@
 Support
 =======
 
+Please read the docs (including the FAQ section) first; a lot of stuff is
+documented and explained there.
+
 Issue Tracker
 -------------
 

+ 15 - 0
docs/usage.rst

@@ -76,8 +76,11 @@ Resource Usage
 |project_name| might use a lot of resources depending on the size of the data set it is dealing with.
 
 CPU: it won't go beyond 100% of 1 core as the code is currently single-threaded.
+     Especially higher zlib and lzma compression levels use significant amounts of CPU cycles.
 
 Memory (RAM): the chunks index and the files index are read into memory for performance reasons.
+              Compression, especially lzma at high levels, might need substantial
+              amounts of memory.
 
 Temporary files: reading data and metadata from a FUSE mounted repository will consume about the same space as the
                  deduplicated chunks used to represent them in the repository.
@@ -175,6 +178,18 @@ Examples
     # Backup a raw device (must not be active/in use/mounted at that time)
     $ dd if=/dev/sda bs=10M | borg create /mnt/backup::my-sda -
 
+    # No compression (default)
+    $ borg create /mnt/backup::repo ~
+
+    # Super fast, low compression
+    $ borg create --compression lz4 /mnt/backup::repo ~
+
+    # Less fast, higher compression (N = 0..9)
+    $ borg create --compression zlib,N /mnt/backup::repo ~
+
+    # Even slower, even higher compression (N = 0..9)
+    $ borg create --compression lzma,N /mnt/backup::repo ~
+
 
 .. include:: usage/extract.rst.inc
 

+ 7 - 1
setup.py

@@ -19,6 +19,7 @@ if sys.version_info < min_python:
 
 from setuptools import setup, Extension
 
+compress_source = 'borg/compress.pyx'
 crypto_source = 'borg/crypto.pyx'
 chunker_source = 'borg/chunker.pyx'
 hashindex_source = 'borg/hashindex.pyx'
@@ -38,6 +39,7 @@ try:
 
         def make_distribution(self):
             self.filelist.extend([
+                'borg/compress.c',
                 'borg/crypto.c',
                 'borg/chunker.c', 'borg/_chunker.c',
                 'borg/hashindex.c', 'borg/_hashindex.c',
@@ -52,6 +54,7 @@ except ImportError:
         def __init__(self, *args, **kwargs):
             raise Exception('Cython is required to run sdist')
 
+    compress_source = compress_source.replace('.pyx', '.c')
     crypto_source = crypto_source.replace('.pyx', '.c')
     chunker_source = chunker_source.replace('.pyx', '.c')
     hashindex_source = hashindex_source.replace('.pyx', '.c')
@@ -59,7 +62,9 @@ except ImportError:
     platform_freebsd_source = platform_freebsd_source.replace('.pyx', '.c')
     platform_darwin_source = platform_darwin_source.replace('.pyx', '.c')
     from distutils.command.build_ext import build_ext
-    if not all(os.path.exists(path) for path in [crypto_source, chunker_source, hashindex_source, platform_linux_source, platform_freebsd_source]):
+    if not all(os.path.exists(path) for path in [
+        compress_source, crypto_source, chunker_source, hashindex_source,
+        platform_linux_source, platform_freebsd_source]):
         raise ImportError('The GIT version of Borg needs Cython. Install Cython or use a released version')
 
 
@@ -89,6 +94,7 @@ cmdclass = versioneer.get_cmdclass()
 cmdclass.update({'build_ext': build_ext, 'sdist': Sdist})
 
 ext_modules = [
+    Extension('borg.compress', [compress_source], libraries=['lz4']),
     Extension('borg.crypto', [crypto_source], libraries=['crypto'], include_dirs=include_dirs, library_dirs=library_dirs),
     Extension('borg.chunker', [chunker_source]),
     Extension('borg.hashindex', [hashindex_source])