Browse Source

Merge branch 'merge-all' of ../attic into experimental

Thomas Waldmann 10 years ago
parent
commit
3a38457def

+ 48 - 0
CHANGES-experimental.txt

@@ -0,0 +1,48 @@
+Important note about "experimental" branch
+==========================================
+
+Goal of the "experimental" branch is to merge all the stuff:
+- changesets from master branch
+- features that DO IMPACT compatibility
+- play with new technologies
+- etc.
+
+THERE IS NO GUARANTEE THAT IT IS COMPATIBLE WITH MASTER BRANCH OR PREVIOUS
+"experimental" CODE nor THAT YOU CAN SWITCH BACK AND FORTH BETWEEN BRANCHES
+WITHIN THE SAME REPOSITORY WITHOUT ENCOUNTERING SEVERE ISSUES.
+
+Please also see the LICENSE for more information.
+
+
+Stuff in "experimental" that is not in "master" minus minor changes
+===================================================================
+
+added tuning docs
+
+attic init --compression NN --cipher NN --mac NN ...
+(see attic init --help)
+
+new hashes:      sha512-256
+                 sha512
+                 sha1
+                 ghash (default)
+new MACs:        hmac-sha512-256
+                 hmac-sha512
+                 hmac-sha1
+                 gmac (default)
+new ciphers:     aes256-ctr + hmac-sha512-256
+                 aes256-gcm (default)
+new compression: no compression (default)
+                 zlib level 1..9 (previously, level 6 was hardcoded)
+                 lzma preset 0..9
+                 lz4 (and other) multi-threaded algos from blosc library
+
+source: more flexible type 0x03 header format that allows specifying the hash
+algo, compression algo and level, encryption algo, and key type.
+
+IV is stored in full length, length of stored IV/MAC/hash is flexible.
+Indexing key size (key = id_hash()) is flexible and configurable per repo.
+
+source: less hardcoding, numeric offsets / lengths
+source: flexible hashing, compression, encryption, key dispatching
+

+ 2 - 1
README.rst

@@ -34,7 +34,7 @@ Space efficient storage
 
 Optional data encryption
     All data can be protected using 256-bit AES encryption and data integrity
-    and authenticity is verified using HMAC-SHA256.
+    and authenticity is verified using a MAC (message authentication code).
 
 Off-site backups
     Borg can store data on any remote host accessible over SSH.  This is
@@ -49,6 +49,7 @@ What do I need?
 Borg requires Python 3.2 or above to work.
 Borg also requires a sufficiently recent OpenSSL (>= 1.0.0).
 In order to mount archives as filesystems, llfuse is required.
+For other Python requirements, please see install_requires in setup.py.
 
 How do I install it?
 --------------------

+ 1 - 1
borg/_hashindex.c

@@ -366,7 +366,7 @@ hashindex_summarize(HashIndex *index, long long *total_size, long long *total_cs
     void *key = NULL;
 
     while((key = hashindex_next_key(index, key))) {
-        values = key + 32;
+        values = key + index->key_size;
         unique_size += values[1];
         unique_csize += values[2];
         size += values[0] * values[1];

+ 3 - 3
borg/archive.py

@@ -616,7 +616,7 @@ class ArchiveChecker:
         self.repository = repository
         self.init_chunks()
         self.key = self.identify_key(repository)
-        if Manifest.MANIFEST_ID not in self.chunks:
+        if Manifest.manifest_id(repository) not in self.chunks:
             self.manifest = self.rebuild_manifest()
         else:
             self.manifest, _ = Manifest.load(repository, key=self.key)
@@ -635,7 +635,7 @@ class ArchiveChecker:
         # Explicity set the initial hash table capacity to avoid performance issues
         # due to hash table "resonance"
         capacity = int(len(self.repository) * 1.2)
-        self.chunks = ChunkIndex(capacity)
+        self.chunks = ChunkIndex(capacity, key_size=self.repository.key_size)
         marker = None
         while True:
             result = self.repository.list(limit=10000, marker=marker)
@@ -687,7 +687,7 @@ class ArchiveChecker:
         Missing and/or incorrect data is repaired when detected
         """
         # Exclude the manifest from chunks
-        del self.chunks[Manifest.MANIFEST_ID]
+        del self.chunks[Manifest.manifest_id(self.repository)]
 
         def mark_as_possibly_superseded(id_):
             if self.chunks.get(id_, (0,))[0] == 0:

+ 52 - 10
borg/archiver.py

@@ -16,7 +16,7 @@ from . import __version__
 from .archive import Archive, ArchiveChecker
 from .repository import Repository
 from .cache import Cache
-from .key import key_creator
+from .key import key_creator, maccer_creator, COMPR_DEFAULT, HASH_DEFAULT, MAC_DEFAULT, PLAIN_DEFAULT, CIPHER_DEFAULT
 from .helpers import Error, location_validator, format_time, format_file_size, \
     format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
     get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
@@ -30,11 +30,11 @@ class Archiver:
     def __init__(self):
         self.exit_code = 0
 
-    def open_repository(self, location, create=False, exclusive=False):
+    def open_repository(self, location, create=False, exclusive=False, key_size=None):
         if location.proto == 'ssh':
-            repository = RemoteRepository(location, create=create)
+            repository = RemoteRepository(location, create=create, key_size=key_size)
         else:
-            repository = Repository(location.path, create=create, exclusive=exclusive)
+            repository = Repository(location.path, create=create, exclusive=exclusive, key_size=key_size)
         repository._location = location
         return repository
 
@@ -59,10 +59,12 @@ class Archiver:
     def do_init(self, args):
         """Initialize an empty repository"""
         print('Initializing repository at "%s"' % args.repository.orig)
-        repository = self.open_repository(args.repository, create=True, exclusive=True)
-        key = key_creator(repository, args)
+        key_cls = key_creator(args)
+        maccer_cls = maccer_creator(args, key_cls)
+        repository = self.open_repository(args.repository, create=True, exclusive=True,
+                                          key_size=maccer_cls.digest_size)
+        key = key_cls.create(repository, args)
         manifest = Manifest(key, repository)
-        manifest.key = key
         manifest.write()
         repository.commit()
         Cache(repository, key, manifest, warn_if_unencrypted=False)
@@ -523,8 +525,39 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
         init_epilog = textwrap.dedent("""
         This command initializes an empty repository. A repository is a filesystem
         directory containing the deduplicated data from zero or more archives.
-        Encryption can be enabled at repository init time.
-        """)
+        Encryption can be enabled, compression, cipher and mac method can be chosen at
+        repository init time.
+
+        --compression METHODs (default: %02d):
+
+        - 00      no compression
+        - 01..09  zlib levels 1..9 (1 means low compression, 9 max. compression)
+        - 10..19  lzma levels 0..9 (0 means low compression, 9 max. compression)
+        - 20..29  lz4 (blosc) levels 0..9 (0 = no, 9 = max. compression)
+        - 30..39  lz4hc (blosc) levels 0..9 (0 = no, 9 = max. compression)
+        - 40..49  blosclz (blosc) levels 0..9 (0 = no, 9 = max. compression)
+        - 50..59  snappy (blosc) levels 0..9 (0 = no, 9 = max. compression)
+        - 60..69  zlib (blosc) levels 0..9 (0 = no, 9 = max. compression)
+
+        --cipher METHODs (default: %02d or %02d)
+
+        - 00      No encryption
+        - 01      AEAD: AES-CTR + HMAC-SHA256
+        - 02      AEAD: AES-GCM
+
+        --mac METHODs (default: %02d or %02d):
+
+        - 00      sha256 (simple hash, no MAC, faster on 32bit CPU)
+        - 01      sha512-256 (simple hash, no MAC, faster on 64bit CPU)
+        - 02      ghash (simple hash, no MAC, fastest on CPUs with AES-GCM support)
+        - 03      sha1 (simple hash, no MAC, fastest on CPUs without AES-GCM support)
+        - 04      sha512 (simple hash, no MAC, faster on 64bit CPU)
+        - 10      hmac-sha256 (MAC, faster on 32bit CPU)
+        - 11      hmac-sha512-256 (MAC, faster on 64bit CPU)
+        - 13      hmac-sha1 (MAC, fastest on CPUs without AES-GCM support)
+        - 14      hmac-sha512 (MAC, faster on 64bit CPU)
+        - 20      gmac (MAC, fastest on CPUs with AES-GCM support)
+        """ % (COMPR_DEFAULT, PLAIN_DEFAULT, CIPHER_DEFAULT, HASH_DEFAULT, MAC_DEFAULT))
         subparser = subparsers.add_parser('init', parents=[common_parser],
                                           description=self.do_init.__doc__, epilog=init_epilog,
                                           formatter_class=argparse.RawDescriptionHelpFormatter)
@@ -534,7 +567,16 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
                                help='repository to create')
         subparser.add_argument('-e', '--encryption', dest='encryption',
                                choices=('none', 'passphrase', 'keyfile'), default='none',
-                               help='select encryption method')
+                               help='select encryption key method')
+        subparser.add_argument('-C', '--cipher', dest='cipher',
+                               type=int, default=None, metavar='METHOD',
+                               help='select cipher (0..2)')
+        subparser.add_argument('-c', '--compression', dest='compression',
+                               type=int, default=COMPR_DEFAULT, metavar='METHOD',
+                               help='select compression method (0..19)')
+        subparser.add_argument('-m', '--mac', dest='mac',
+                               type=int, default=None, metavar='METHOD',
+                               help='select hash/mac method (0..3)')
 
         check_epilog = textwrap.dedent("""
         The check command verifies the consistency of a repository and the corresponding

+ 7 - 6
borg/cache.py

@@ -95,7 +95,7 @@ class Cache:
         config.set('cache', 'manifest', '')
         with open(os.path.join(self.path, 'config'), 'w') as fd:
             config.write(fd)
-        ChunkIndex().write(os.path.join(self.path, 'chunks').encode('utf-8'))
+        ChunkIndex(key_size=self.repository.key_size).write(os.path.join(self.path, 'chunks').encode('utf-8'))
         with open(os.path.join(self.path, 'chunks.archive'), 'wb') as fd:
             pass  # empty file
         with open(os.path.join(self.path, 'files'), 'wb') as fd:
@@ -118,7 +118,8 @@ class Cache:
         self.timestamp = self.config.get('cache', 'timestamp', fallback=None)
         self.key_type = self.config.get('cache', 'key_type', fallback=None)
         self.previous_location = self.config.get('cache', 'previous_location', fallback=None)
-        self.chunks = ChunkIndex.read(os.path.join(self.path, 'chunks').encode('utf-8'))
+        self.chunks = ChunkIndex.read(os.path.join(self.path, 'chunks').encode('utf-8'),
+                                      key_size=self.repository.key_size)
         self.files = None
 
     def open(self):
@@ -272,7 +273,7 @@ class Cache:
             return archive_name
 
         def fetch_and_build_idx(archive_id, repository, key, tmp_dir, tf_out):
-            chunk_idx = ChunkIndex()
+            chunk_idx = ChunkIndex(key_size=repository.key_size)
             cdata = repository.get(archive_id)
             data = key.decrypt(archive_id, cdata)
             add(chunk_idx, archive_id, len(data), len(cdata))
@@ -299,13 +300,13 @@ class Cache:
                 tf_out.addfile(tarinfo, f)
             os.unlink(file_tmp)
 
-        def create_master_idx(chunk_idx, tf_in, tmp_dir):
+        def create_master_idx(chunk_idx, repository, tf_in, tmp_dir):
             chunk_idx.clear()
             for tarinfo in tf_in:
                 archive_id_hex = tarinfo.name
                 tf_in.extract(archive_id_hex, tmp_dir)
                 chunk_idx_path = os.path.join(tmp_dir, archive_id_hex).encode('utf-8')
-                archive_chunk_idx = ChunkIndex.read(chunk_idx_path)
+                archive_chunk_idx = ChunkIndex.read(chunk_idx_path, key_size=repository.key_size)
                 for chunk_id, (count, size, csize) in archive_chunk_idx.iteritems():
                     add(chunk_idx, chunk_id, size, csize, incr=count)
                 os.unlink(chunk_idx_path)
@@ -334,7 +335,7 @@ class Cache:
             rename_out_archive()
             print('Merging collection into master chunks cache...')
             in_archive = open_in_archive()
-            create_master_idx(self.chunks, in_archive, tmp_dir)
+            create_master_idx(self.chunks, repository, in_archive, tmp_dir)
             close_archive(in_archive)
             print('Done.')
 

+ 81 - 17
borg/crypto.pyx

@@ -7,6 +7,12 @@ from libc.stdlib cimport malloc, free
 
 API_VERSION = 2
 
+AES_CTR_MODE = 1
+AES_GCM_MODE = 2
+
+MAC_SIZE = 16  # bytes; 128 bits is the maximum allowed value. see "hack" below.
+IV_SIZE = 16  # bytes; 128 bits
+
 cdef extern from "openssl/rand.h":
     int  RAND_bytes(unsigned char *buf, int num)
 
@@ -23,6 +29,7 @@ cdef extern from "openssl/evp.h":
         pass
     const EVP_MD *EVP_sha256()
     const EVP_CIPHER *EVP_aes_256_ctr()
+    const EVP_CIPHER *EVP_aes_256_gcm()
     void EVP_CIPHER_CTX_init(EVP_CIPHER_CTX *a)
     void EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *a)
 
@@ -36,20 +43,33 @@ cdef extern from "openssl/evp.h":
                           const unsigned char *in_, int inl)
     int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
     int EVP_DecryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
-
+    int EVP_CIPHER_CTX_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, unsigned char *ptr)
     int PKCS5_PBKDF2_HMAC(const char *password, int passwordlen,
                           const unsigned char *salt, int saltlen, int iter,
                           const EVP_MD *digest,
                           int keylen, unsigned char *out)
+    int EVP_CTRL_GCM_GET_TAG
+    int EVP_CTRL_GCM_SET_TAG
+    int EVP_CTRL_GCM_SET_IVLEN
 
 import struct
 
 _int = struct.Struct('>I')
-_long = struct.Struct('>Q')
+_2long = struct.Struct('>QQ')
 
 bytes_to_int = lambda x, offset=0: _int.unpack_from(x, offset)[0]
-bytes_to_long = lambda x, offset=0: _long.unpack_from(x, offset)[0]
-long_to_bytes = lambda x: _long.pack(x)
+
+
+def bytes16_to_int(b, offset=0):
+    h, l = _2long.unpack_from(b, offset)
+    return (h << 64) + l
+
+
+def int_to_bytes16(i):
+    max_uint64 = 0xffffffffffffffff
+    l = i & max_uint64
+    h = (i >> 64) & max_uint64
+    return _2long.pack(h, l)
 
 
 def num_aes_blocks(length):
@@ -59,6 +79,22 @@ def num_aes_blocks(length):
     return (length + 15) // 16
 
 
+def increment_iv(iv, amount):
+    """
+    increment the given IV considering that <amount> bytes of data was
+    encrypted based on it. In CTR / GCM mode, the IV is just a counter and
+    must never repeat.
+
+    :param iv: current IV, 16 bytes (128 bit)
+    :param amount: amount of data (in bytes) that was encrypted
+    :return: new IV, 16 bytes (128 bit)
+    """
+    iv = bytes16_to_int(iv)
+    iv += num_aes_blocks(amount)
+    iv = int_to_bytes16(iv)
+    return iv
+
+
 def pbkdf2_sha256(password, salt, iterations, size):
     """Password based key derivation function 2 (RFC2898)
     """
@@ -93,12 +129,19 @@ cdef class AES:
     """
     cdef EVP_CIPHER_CTX ctx
     cdef int is_encrypt
+    cdef int mode
 
-    def __cinit__(self, is_encrypt, key, iv=None):
+    def __cinit__(self, mode, is_encrypt, key, iv=None):
         EVP_CIPHER_CTX_init(&self.ctx)
+        self.mode = mode
         self.is_encrypt = is_encrypt
         # Set cipher type and mode
-        cipher_mode = EVP_aes_256_ctr()
+        if mode == AES_CTR_MODE:
+            cipher_mode = EVP_aes_256_ctr()
+        elif mode == AES_GCM_MODE:
+            cipher_mode = EVP_aes_256_gcm()
+        else:
+            raise Exception('unknown mode')
         if self.is_encrypt:
             if not EVP_EncryptInit_ex(&self.ctx, cipher_mode, NULL, NULL, NULL):
                 raise Exception('EVP_EncryptInit_ex failed')
@@ -117,6 +160,10 @@ cdef class AES:
             key2 = key
         if iv:
             iv2 = iv
+        if self.mode == AES_GCM_MODE:
+            # Set IV length (bytes)
+            if not EVP_CIPHER_CTX_ctrl(&self.ctx, EVP_CTRL_GCM_SET_IVLEN, IV_SIZE, NULL):
+                raise Exception('EVP_CIPHER_CTX_ctrl SET IVLEN failed')
         # Initialise key and IV
         if self.is_encrypt:
             if not EVP_EncryptInit_ex(&self.ctx, NULL, NULL, key2, iv2):
@@ -125,16 +172,26 @@ cdef class AES:
             if not EVP_DecryptInit_ex(&self.ctx, NULL, NULL, key2, iv2):
                 raise Exception('EVP_DecryptInit_ex failed')
 
-    @property
-    def iv(self):
-        return self.ctx.iv[:16]
+    def add(self, aad):
+        cdef int aadl = len(aad)
+        cdef int outl
+        if self.mode != AES_GCM_MODE:
+            raise Exception('additional data only supported for AES GCM mode')
+        # Zero or more calls to specify any AAD
+        if self.is_encrypt:
+            if not EVP_EncryptUpdate(&self.ctx, NULL, &outl, aad, aadl):
+                raise Exception('EVP_EncryptUpdate failed')
+        else:  # decrypt
+            if not EVP_DecryptUpdate(&self.ctx, NULL, &outl, aad, aadl):
+                raise Exception('EVP_DecryptUpdate failed')
 
-    def encrypt(self, data):
+    def compute_mac_and_encrypt(self, data):
         cdef int inl = len(data)
         cdef int ctl = 0
         cdef int outl = 0
-        # note: modes that use padding, need up to one extra AES block (16b)
+        # note: modes that use padding, need up to one extra AES block (16B)
         cdef unsigned char *out = <unsigned char *>malloc(inl+16)
+        cdef unsigned char *mac = <unsigned char *>malloc(MAC_SIZE)
         if not out:
             raise MemoryError
         try:
@@ -144,15 +201,20 @@ cdef class AES:
             if not EVP_EncryptFinal_ex(&self.ctx, out+ctl, &outl):
                 raise Exception('EVP_EncryptFinal failed')
             ctl += outl
-            return out[:ctl]
+            if self.mode == AES_GCM_MODE:
+                # Get tag (mac) - only GCM mode. for CTR, the returned mac is undefined
+                if not EVP_CIPHER_CTX_ctrl(&self.ctx, EVP_CTRL_GCM_GET_TAG, MAC_SIZE, mac):
+                    raise Exception('EVP_CIPHER_CTX_ctrl GET TAG failed')
+            return (mac[:MAC_SIZE]), out[:ctl]
         finally:
+            free(mac)
             free(out)
 
-    def decrypt(self, data):
+    def check_mac_and_decrypt(self, mac, data):
         cdef int inl = len(data)
         cdef int ptl = 0
         cdef int outl = 0
-        # note: modes that use padding, need up to one extra AES block (16b).
+        # note: modes that use padding, need up to one extra AES block (16B).
         # This is what the openssl docs say. I am not sure this is correct,
         # but OTOH it will not cause any harm if our buffer is a little bigger.
         cdef unsigned char *out = <unsigned char *>malloc(inl+16)
@@ -162,10 +224,12 @@ cdef class AES:
             if not EVP_DecryptUpdate(&self.ctx, out, &outl, data, inl):
                 raise Exception('EVP_DecryptUpdate failed')
             ptl = outl
+            if self.mode == AES_GCM_MODE:
+                # Set expected tag (mac) value.
+                if not EVP_CIPHER_CTX_ctrl(&self.ctx, EVP_CTRL_GCM_SET_TAG, MAC_SIZE, mac):
+                    raise Exception('EVP_CIPHER_CTX_ctrl SET TAG failed')
             if EVP_DecryptFinal_ex(&self.ctx, out+ptl, &outl) <= 0:
-                # this error check is very important for modes with padding or
-                # authentication. for them, a failure here means corrupted data.
-                # CTR mode does not use padding nor authentication.
+                # for GCM mode, a failure here means corrupted / tampered tag (mac) or data
                 raise Exception('EVP_DecryptFinal failed')
             ptl += outl
             return out[:ptl]

+ 27 - 21
borg/hashindex.pyx

@@ -26,9 +26,11 @@ _NoDefault = object()
 
 cdef class IndexBase:
     cdef HashIndex *index
-    key_size = 32
+    cdef int key_size
 
-    def __cinit__(self, capacity=0, path=None):
+    def __cinit__(self, capacity=0, path=None, key_size=None):
+        assert key_size is not None
+        self.key_size = key_size
         if path:
             self.index = hashindex_read(<bytes>os.fsencode(path))
             if not self.index:
@@ -43,8 +45,8 @@ cdef class IndexBase:
             hashindex_free(self.index)
 
     @classmethod
-    def read(cls, path):
-        return cls(path=path)
+    def read(cls, path, key_size=None):
+        return cls(path=path, key_size=key_size)
 
     def write(self, path):
         if not hashindex_write(self.index, <bytes>os.fsencode(path)):
@@ -61,7 +63,7 @@ cdef class IndexBase:
             self[key] = value
 
     def __delitem__(self, key):
-        assert len(key) == 32
+        assert len(key) == self.key_size
         if not hashindex_delete(self.index, <char *>key):
             raise Exception('hashindex_delete failed')
 
@@ -90,14 +92,14 @@ cdef class NSIndex(IndexBase):
     value_size = 8
 
     def __getitem__(self, key):
-        assert len(key) == 32
+        assert len(key) == self.key_size
         data = <int *>hashindex_get(self.index, <char *>key)
         if not data:
             raise KeyError
         return _le32toh(data[0]), _le32toh(data[1])
 
     def __setitem__(self, key, value):
-        assert len(key) == 32
+        assert len(key) == self.key_size
         cdef int[2] data
         data[0] = _htole32(value[0])
         data[1] = _htole32(value[1])
@@ -105,20 +107,20 @@ cdef class NSIndex(IndexBase):
             raise Exception('hashindex_set failed')
 
     def __contains__(self, key):
-        assert len(key) == 32
+        assert len(key) == self.key_size
         data = <int *>hashindex_get(self.index, <char *>key)
         return data != NULL
 
     def iteritems(self, marker=None):
         cdef const void *key
-        iter = NSKeyIterator()
+        iter = NSKeyIterator(self.key_size)
         iter.idx = self
         iter.index = self.index
         if marker:
             key = hashindex_get(self.index, <char *>marker)
             if marker is None:
                 raise IndexError
-            iter.key = key - 32
+            iter.key = key - self.key_size
         return iter
 
 
@@ -126,9 +128,11 @@ cdef class NSKeyIterator:
     cdef NSIndex idx
     cdef HashIndex *index
     cdef const void *key
+    cdef int key_size
 
-    def __cinit__(self):
+    def __cinit__(self, key_size):
         self.key = NULL
+        self.key_size = key_size
 
     def __iter__(self):
         return self
@@ -137,8 +141,8 @@ cdef class NSKeyIterator:
         self.key = hashindex_next_key(self.index, <char *>self.key)
         if not self.key:
             raise StopIteration
-        cdef int *value = <int *>(self.key + 32)
-        return (<char *>self.key)[:32], (_le32toh(value[0]), _le32toh(value[1]))
+        cdef int *value = <int *>(self.key + self.key_size)
+        return (<char *>self.key)[:self.key_size], (_le32toh(value[0]), _le32toh(value[1]))
 
 
 cdef class ChunkIndex(IndexBase):
@@ -146,14 +150,14 @@ cdef class ChunkIndex(IndexBase):
     value_size = 12
 
     def __getitem__(self, key):
-        assert len(key) == 32
+        assert len(key) == self.key_size
         data = <int *>hashindex_get(self.index, <char *>key)
         if not data:
             raise KeyError
         return _le32toh(data[0]), _le32toh(data[1]), _le32toh(data[2])
 
     def __setitem__(self, key, value):
-        assert len(key) == 32
+        assert len(key) == self.key_size
         cdef int[3] data
         data[0] = _htole32(value[0])
         data[1] = _htole32(value[1])
@@ -162,20 +166,20 @@ cdef class ChunkIndex(IndexBase):
             raise Exception('hashindex_set failed')
 
     def __contains__(self, key):
-        assert len(key) == 32
+        assert len(key) == self.key_size
         data = <int *>hashindex_get(self.index, <char *>key)
         return data != NULL
 
     def iteritems(self, marker=None):
         cdef const void *key
-        iter = ChunkKeyIterator()
+        iter = ChunkKeyIterator(self.key_size)
         iter.idx = self
         iter.index = self.index
         if marker:
             key = hashindex_get(self.index, <char *>marker)
             if marker is None:
                 raise IndexError
-            iter.key = key - 32
+            iter.key = key - self.key_size
         return iter
 
     def summarize(self):
@@ -188,9 +192,11 @@ cdef class ChunkKeyIterator:
     cdef ChunkIndex idx
     cdef HashIndex *index
     cdef const void *key
+    cdef int key_size
 
-    def __cinit__(self):
+    def __cinit__(self, key_size):
         self.key = NULL
+        self.key_size = key_size
 
     def __iter__(self):
         return self
@@ -199,5 +205,5 @@ cdef class ChunkKeyIterator:
         self.key = hashindex_next_key(self.index, <char *>self.key)
         if not self.key:
             raise StopIteration
-        cdef int *value = <int *>(self.key + 32)
-        return (<char *>self.key)[:32], (_le32toh(value[0]), _le32toh(value[1]), _le32toh(value[2]))
+        cdef int *value = <int *>(self.key + self.key_size)
+        return (<char *>self.key)[:self.key_size], (_le32toh(value[0]), _le32toh(value[1]), _le32toh(value[2]))

+ 6 - 4
borg/helpers.py

@@ -82,18 +82,20 @@ def check_extension_modules():
 
 class Manifest:
 
-    MANIFEST_ID = b'\0' * 32
-
     def __init__(self, key, repository):
         self.archives = {}
         self.config = {}
         self.key = key
         self.repository = repository
 
+    @classmethod
+    def manifest_id(cls, repository):
+        return b'\0' * repository.key_size
+
     @classmethod
     def load(cls, repository, key=None):
         from .key import key_factory
-        cdata = repository.get(cls.MANIFEST_ID)
+        cdata = repository.get(cls.manifest_id(repository))
         if not key:
             key = key_factory(repository, cdata)
         manifest = cls(key, repository)
@@ -118,7 +120,7 @@ class Manifest:
             'config': self.config,
         }))
         self.id = self.key.id_hash(data)
-        self.repository.put(self.MANIFEST_ID, self.key.encrypt(data))
+        self.repository.put(self.manifest_id(self.repository), self.key.encrypt(data))
 
     def list_archive_infos(self, sort_by=None, reverse=False):
         # inexpensive Archive.list_archives replacement if we just need .name, .id, .ts

+ 647 - 95
borg/key.py

@@ -3,14 +3,33 @@ from getpass import getpass
 import os
 import msgpack
 import textwrap
+from collections import namedtuple
 import hmac
-from hashlib import sha256
+from hashlib import sha1, sha256, sha512
 import zlib
 
-from .crypto import pbkdf2_sha256, get_random_bytes, AES, bytes_to_long, long_to_bytes, bytes_to_int, num_aes_blocks
+try:
+    import lzma  # python >= 3.3
+except ImportError:
+    try:
+        from backports import lzma  # backports.lzma from pypi
+    except ImportError:
+        lzma = None
+
+try:
+    import blosc
+except ImportError:
+    blosc = None
+
+from .crypto import pbkdf2_sha256, get_random_bytes, AES, AES_CTR_MODE, AES_GCM_MODE, \
+                    bytes_to_int, increment_iv, num_aes_blocks
 from .helpers import IntegrityError, get_keys_dir, Error
 
-PREFIX = b'\0' * 8
+# TODO fix cyclic import:
+#from .archive import CHUNK_MAX
+CHUNK_MAX = 10 * 1024 * 1024
+
+Meta = namedtuple('Meta', 'compr_type, key_type, mac_type, cipher_type, iv, legacy')
 
 
 class UnsupportedPayloadError(Error):
@@ -22,47 +41,393 @@ class KeyfileNotFoundError(Error):
     """
 
 
+class sha512_256(object):  # note: can't subclass sha512
+    """sha512, but digest truncated to 256bit - faster than sha256 on 64bit platforms
+
+    Provides the hashlib-like methods update / digest / hexdigest / copy on
+    top of a wrapped sha512 object.
+    """
+    # class attribute (not set in __init__), so that objects made by copy(),
+    # which bypasses __init__, carry the name as well
+    name = 'sha512-256'
+    digestsize = digest_size = 32
+    # NOTE(review): hashlib's sha512 reports block_size 128; 64 is kept as is,
+    # because a different value may change HMAC results computed via this class.
+    block_size = 64
+
+    def __init__(self, data=None):
+        """Create the hash object, optionally feeding initial `data`."""
+        self._h = sha512()
+        if data:
+            self.update(data)
+
+    def update(self, data):
+        """Feed more data into the wrapped sha512 object."""
+        self._h.update(data)
+
+    def digest(self):
+        """Return the first digest_size bytes of the sha512 digest."""
+        return self._h.digest()[:self.digest_size]
+
+    def hexdigest(self):
+        """Return the truncated digest as hex string (2 characters per byte)."""
+        return self._h.hexdigest()[:self.digest_size * 2]
+
+    def copy(self):
+        """Return a clone with its own copy of the current hash state."""
+        new = sha512_256.__new__(sha512_256)
+        new._h = self._h.copy()
+        return new
+
+
+# HASH / MAC stuff below all has a mac-like interface, so it can be used in the same way.
+# special case: hashes do not use keys (and thus, do not sign/authenticate)
+
+class HASH:  # note: we can't subclass sha1/sha256/sha512
+    """Unkeyed hash with a MAC-like constructor signature.
+
+    Subclasses provide TYPE, digest_size and hash_func.
+    """
+    TYPE = 0  # override in subclass
+    digest_size = 0  # override in subclass
+    hash_func = None  # override in subclass
+
+    def __init__(self, key, data=b''):
+        """Start hashing `data`; `key` must be None as a plain hash has no key."""
+        # signature is like for a MAC, we ignore the key as this is a simple hash
+        has_key = key is not None
+        if has_key:
+            raise Exception("use a HMAC if you have a key")
+        self.h = self.hash_func(data)
+
+    def update(self, data):
+        """Feed more data into the wrapped hash object."""
+        self.h.update(data)
+
+    def digest(self):
+        """Return the digest as computed by the wrapped hash object."""
+        wrapped = self.h
+        return wrapped.digest()
+
+    def hexdigest(self):
+        """Return the hex digest as computed by the wrapped hash object."""
+        wrapped = self.h
+        return wrapped.hexdigest()
+
+
+class SHA256(HASH):
+    """Unkeyed sha256 hash (type id 0, 32 bytes digest)."""
+    TYPE = 0
+    digest_size = 32
+    hash_func = sha256
+
+
+class SHA512_256(HASH):
+    """Unkeyed truncated sha512 hash via sha512_256 (type id 1, 32 bytes digest)."""
+    TYPE = 1
+    digest_size = 32
+    hash_func = sha512_256
+
+
+class GHASH:
+    """Unkeyed hash computed with AES in GCM mode, using an all-zero key and IV.
+
+    Has the same constructor signature as the MAC classes.
+    """
+    TYPE = 2
+    digest_size = 16
+
+    def __init__(self, key, data=b''):
+        """Set up the GCM cipher and feed `data` (if any); `key` must be None."""
+        # signature is like for a MAC, we ignore the key as this is a simple hash
+        if key is not None:
+            raise Exception("use a MAC if you have a key")
+        zero_key = b'\0' * 32
+        zero_iv = b'\0' * 16
+        self.mac_cipher = AES(mode=AES_GCM_MODE, is_encrypt=True, key=zero_key, iv=zero_iv)
+        if data:
+            self.update(data)
+
+    def update(self, data):
+        """Add `data` to the authenticated-only input of the cipher."""
+        # GMAC = aes-gcm with all data as AAD, no data as to-be-encrypted data
+        aad = bytes(data)
+        self.mac_cipher.add(aad)
+
+    def digest(self):
+        """Return the mac the cipher computes when nothing is encrypted."""
+        tag, _ = self.mac_cipher.compute_mac_and_encrypt(b'')
+        return tag
+
+
+class SHA1(HASH):
+    """Unkeyed sha1 hash (type id 3, 20 bytes digest)."""
+    TYPE = 3
+    digest_size = 20
+    hash_func = sha1
+
+
+class SHA512(HASH):
+    """Unkeyed sha512 hash (type id 4, 64 bytes digest)."""
+    TYPE = 4
+    digest_size = 64
+    hash_func = sha512
+
+
 class HMAC(hmac.HMAC):
-    """Workaround a bug in Python < 3.4 Where HMAC does not accept memoryviews
-    """
+    """Keyed HMAC base class; subclasses set TYPE, digest_size and hash_func.
+
+    The constructor takes (key, data) like the other hash / MAC classes here
+    and passes hash_func to hmac.HMAC as the digest to use.
+    """
+    TYPE = 0  # override in subclass
+    digest_size = 0  # override in subclass
+    hash_func = None  # override in subclass
+
+    def __init__(self, key, data):
+        # a key is mandatory here, unkeyed hashing is done by the HASH classes
+        if key is None:
+            raise Exception("do not use HMAC if you don't have a key")
+        super().__init__(key, data, self.hash_func)
+
     def update(self, msg):
+        # Workaround a bug in Python < 3.4 Where HMAC does not accept memoryviews
         self.inner.update(msg)
 
 
-def key_creator(repository, args):
-    if args.encryption == 'keyfile':
-        return KeyfileKey.create(repository, args)
-    elif args.encryption == 'passphrase':
-        return PassphraseKey.create(repository, args)
-    else:
-        return PlaintextKey.create(repository, args)
+class HMAC_SHA256(HMAC):
+    """HMAC using sha256 (type id 10, 32 bytes digest)."""
+    TYPE = 10
+    digest_size = 32
+    hash_func = sha256
 
 
-def key_factory(repository, manifest_data):
-    if manifest_data[0] == KeyfileKey.TYPE:
-        return KeyfileKey.detect(repository, manifest_data)
-    elif manifest_data[0] == PassphraseKey.TYPE:
-        return PassphraseKey.detect(repository, manifest_data)
-    elif manifest_data[0] == PlaintextKey.TYPE:
-        return PlaintextKey.detect(repository, manifest_data)
-    else:
-        raise UnsupportedPayloadError(manifest_data[0])
+class HMAC_SHA512_256(HMAC):
+    """HMAC using the truncated sha512_256 hash (type id 11, 32 bytes digest)."""
+    TYPE = 11
+    digest_size = 32
+    hash_func = sha512_256
+
+
+class HMAC_SHA1(HMAC):
+    """HMAC using sha1 (type id 13, 20 bytes digest)."""
+    TYPE = 13
+    digest_size = 20
+    hash_func = sha1
+
+
+class HMAC_SHA512(HMAC):
+    """HMAC using sha512 (type id 14, 64 bytes digest)."""
+    TYPE = 14
+    digest_size = 64
+    hash_func = sha512
+
 
+class GMAC(GHASH):
+    """Keyed variant of GHASH: same AES-GCM construction, but with a real key.
+
+    update() and digest() are inherited from GHASH.
+    """
+    TYPE = 20
+    digest_size = 16
+
+    def __init__(self, key, data=b''):
+        # unlike GHASH, a key is mandatory here
+        if key is None:
+            raise Exception("do not use GMAC if you don't have a key")
+        # the IV is all-zero, only the key differs from GHASH
+        self.mac_cipher = AES(mode=AES_GCM_MODE, is_encrypt=True, key=key, iv=b'\0' * 16)
+        if data:
+            self.update(data)
+
+
+# defaults are optimized for speed on modern CPUs with AES hw support
+HASH_DEFAULT = GHASH.TYPE
+MAC_DEFAULT = GMAC.TYPE
+
+
+# compressor classes, all same interface
+
+class NullCompressor(object):  # uses 0 in the mapping
+    """Compressor that does not compress: both directions just return bytes(data)."""
+    TYPE = 0
+
+    def compress(self, data):
+        return bytes(data)
+
+    def decompress(self, data):
+        return bytes(data)
+
+
+class ZlibCompressor(object):  # uses 1..9 in the mapping
+    """zlib compressor; the level is the offset of TYPE from ZlibCompressor.TYPE."""
+    TYPE = 0
+    LEVELS = range(10)
+
+    def compress(self, data):
+        """Return `data` compressed with zlib at this class' level."""
+        return zlib.compress(data, self.TYPE - ZlibCompressor.TYPE)
+
+    def decompress(self, data):
+        """Return the zlib-decompressed `data`."""
+        plain = zlib.decompress(data)
+        return plain
+
+
+class LzmaCompressor(object):  # uses 10..19 in the mapping
+    """lzma compressor; the preset is the offset of TYPE from LzmaCompressor.TYPE."""
+    TYPE = 10
+    PRESETS = range(10)
+
+    def __init__(self):
+        """Raise NotImplementedError if no lzma module could be imported."""
+        if lzma is None:
+            # NotImplemented is a constant and not callable, the exception
+            # class to raise is NotImplementedError
+            raise NotImplementedError("lzma compression needs Python >= 3.3 or backports.lzma from PyPi")
+
+    def compress(self, data):
+        """Return `data` compressed with lzma using this class' preset."""
+        preset = self.TYPE - LzmaCompressor.TYPE
+        return lzma.compress(data, preset=preset)
+
+    def decompress(self, data):
+        """Return the lzma-decompressed `data`."""
+        return lzma.decompress(data)
+
+
+class BLOSCCompressor(object):
+    """Base class for compressors provided by the blosc library.
+
+    Subclasses set TYPE and CNAME (the blosc compressor name) and implement
+    _get_level().
+    """
+    TYPE = 0  # override in subclass
+    LEVELS = range(10)
+    CNAME = ''  # override in subclass
 
     def __init__(self):
-        self.TYPE_STR = bytes([self.TYPE])
+        # NotImplemented is a constant and not callable, the exception class
+        # to raise is NotImplementedError
+        if blosc is None:
+            raise NotImplementedError("%s compression needs blosc from PyPi" % self.CNAME)
+        if self.CNAME not in blosc.compressor_list():
+            raise NotImplementedError("%s compression is not supported by blosc" % self.CNAME)
+        blosc.set_blocksize(16384)  # 16kiB is the minimum, so 64kiB are enough for 4 threads
+
+    def _get_level(self):
+        """Return the blosc compression level; must be overridden in subclass."""
+        raise NotImplementedError
+
+    def compress(self, data):
+        """Return `data` compressed by blosc with this class' CNAME and level."""
+        return blosc.compress(bytes(data), 1, cname=self.CNAME, clevel=self._get_level())
+
+    def decompress(self, data):
+        """Return the blosc-decompressed `data`."""
+        return blosc.decompress(data)
+
+
+class LZ4Compressor(BLOSCCompressor):
+    """blosc 'lz4' compressor, type ids start at 20."""
+    TYPE = 20
+    CNAME = 'lz4'
+
+    def _get_level(self):
+        # level = offset of the (sub)class' TYPE from this family's base TYPE
+        return self.TYPE - LZ4Compressor.TYPE
+
+
+class LZ4HCCompressor(BLOSCCompressor):
+    """blosc 'lz4hc' compressor, type ids start at 30."""
+    TYPE = 30
+    CNAME = 'lz4hc'
+
+    def _get_level(self):
+        # level = offset of the (sub)class' TYPE from this family's base TYPE
+        return self.TYPE - LZ4HCCompressor.TYPE
+
+
+class BLOSCLZCompressor(BLOSCCompressor):
+    """blosc 'blosclz' compressor, type ids start at 40."""
+    TYPE = 40
+    CNAME = 'blosclz'
+
+    def _get_level(self):
+        # level = offset of the (sub)class' TYPE from this family's base TYPE
+        return self.TYPE - BLOSCLZCompressor.TYPE
+
+
+class SnappyCompressor(BLOSCCompressor):
+    """blosc 'snappy' compressor, type ids start at 50."""
+    TYPE = 50
+    CNAME = 'snappy'
+
+    def _get_level(self):
+        # level = offset of the (sub)class' TYPE from this family's base TYPE
+        return self.TYPE - SnappyCompressor.TYPE
+
+
+class BLOSCZlibCompressor(BLOSCCompressor):
+    """blosc 'zlib' compressor, type ids start at 60."""
+    TYPE = 60
+    CNAME = 'zlib'
+
+    def _get_level(self):
+        # level = offset of the (sub)class' TYPE from this family's base TYPE
+        return self.TYPE - BLOSCZlibCompressor.TYPE
+
+
+# default is optimized for speed
+COMPR_DEFAULT = NullCompressor.TYPE # no compression
+
+
+# ciphers - AEAD (authenticated encryption with assoc. data) style interface
+# special case: PLAIN dummy does not encrypt / authenticate
+
+class PLAIN:
+    """Dummy cipher with the same interface as the AEAD ciphers.
+
+    Data passes through unchanged and no mac is computed or checked.
+    """
+    TYPE = 0
+    enc_iv = None  # dummy
+
+    def __init__(self, **kw):
+        # accepts (and ignores) the keyword arguments the real ciphers take
+        pass
+
+    def compute_mac_and_encrypt(self, meta, data):
+        # no mac (None), data unchanged
+        return None, data
+
+    def check_mac_and_decrypt(self, mac, meta, data):
+        # mac is not checked, data unchanged
+        return data
+
+
+def get_aad(meta):
+    """Return the additional authenticated data for AEAD ciphers.
+
+    For non-legacy payloads this is the msgpack-serialized `meta`, for legacy
+    payloads it is ``meta.iv[8:]``.
+    """
+    if not meta.legacy:
+        return msgpack.packb(meta)
+    # legacy format computed the mac over (iv_last8 +  data)
+    iv_tail = meta.iv[8:]
+    return iv_tail
+
+
+class AES_CTR_HMAC:
+    """AES in CTR mode, authenticated by a separate HMAC_SHA256.
+
+    Offers the same compute_mac_and_encrypt / check_mac_and_decrypt interface
+    as the other cipher classes.
+    """
+    TYPE = 1
+
+    def __init__(self, enc_key=b'\0' * 32, enc_iv=b'\0' * 16, enc_hmac_key=b'\0' * 32, **kw):
+        self.hmac_key = enc_hmac_key
+        self.enc_iv = enc_iv
+        # separate AES objects for the encrypt and the decrypt direction
+        self.enc_cipher = AES(mode=AES_CTR_MODE, is_encrypt=True, key=enc_key, iv=enc_iv)
+        self.dec_cipher = AES(mode=AES_CTR_MODE, is_encrypt=False, key=enc_key)
+
+    def compute_mac_and_encrypt(self, meta, data):
+        """Encrypt `data` using meta.iv, return (mac, encrypted data)."""
+        self.enc_cipher.reset(iv=meta.iv)
+        _, data = self.enc_cipher.compute_mac_and_encrypt(data)
+        # remember the IV to use for the next encryption
+        self.enc_iv = increment_iv(meta.iv, len(data))
+        aad = get_aad(meta)
+        # the mac covers the aad plus the output of the cipher
+        mac = HMAC_SHA256(self.hmac_key, aad + data).digest()  # XXX mac / hash flexibility
+        return mac, data
+
+    def check_mac_and_decrypt(self, mac, meta, data):
+        """Verify `mac` over aad + data, then decrypt and return the data.
+
+        Raises IntegrityError if the recomputed mac differs from `mac`.
+        """
+        aad = get_aad(meta)
+        # NOTE(review): `!=` is not a constant-time comparison - consider
+        # hmac.compare_digest if all supported Python versions have it
+        if HMAC_SHA256(self.hmac_key, aad + data).digest() != mac:  # XXX mac / hash flexibility
+            raise IntegrityError('Encryption envelope checksum mismatch')
+        self.dec_cipher.reset(iv=meta.iv)
+        # the mac was already checked above, so None is passed as mac here
+        data = self.dec_cipher.check_mac_and_decrypt(None, data)
+        return data
+
+
+class AES_GCM:
+    """AES in GCM mode: the cipher itself computes / checks the mac.
+
+    Offers the same compute_mac_and_encrypt / check_mac_and_decrypt interface
+    as the other cipher classes.
+    """
+    TYPE = 2
+
+    def __init__(self, enc_key=b'\0' * 32, enc_iv=b'\0' * 16, **kw):
+        # note: hmac_key is not used for aes-gcm, it does aes+gmac in 1 pass
+        self.enc_iv = enc_iv
+        # separate AES objects for the encrypt and the decrypt direction
+        self.enc_cipher = AES(mode=AES_GCM_MODE, is_encrypt=True, key=enc_key, iv=enc_iv)
+        self.dec_cipher = AES(mode=AES_GCM_MODE, is_encrypt=False, key=enc_key)
+
+    def compute_mac_and_encrypt(self, meta, data):
+        """Encrypt `data` using meta.iv, return (mac, encrypted data)."""
+        self.enc_cipher.reset(iv=meta.iv)
+        aad = get_aad(meta)
+        # the aad is added to the cipher before the data gets encrypted
+        self.enc_cipher.add(aad)
+        mac, data = self.enc_cipher.compute_mac_and_encrypt(data)
+        # remember the IV to use for the next encryption
+        self.enc_iv = increment_iv(meta.iv, len(data))
+        return mac, data
+
+    def check_mac_and_decrypt(self, mac, meta, data):
+        """Decrypt `data` and check `mac`, return the decrypted data.
+
+        Raises IntegrityError if the cipher raises any exception.
+        """
+        self.dec_cipher.reset(iv=meta.iv)
+        aad = get_aad(meta)
+        self.dec_cipher.add(aad)
+        try:
+            data = self.dec_cipher.check_mac_and_decrypt(mac, data)
+        except Exception:
+            # NOTE(review): this turns every failure into an IntegrityError,
+            # presumably a mac mismatch is the expected one - confirm
+            raise IntegrityError('Encryption envelope checksum mismatch')
+        return data
+
+
+# cipher default is optimized for speed on modern CPUs with AES hw support
+PLAIN_DEFAULT = PLAIN.TYPE
+CIPHER_DEFAULT = AES_GCM.TYPE
+
+
+# misc. types of keys
+# special case: no keys (thus: no encryption, no signing/authentication)
+
+class KeyBase(object):
+    """Base class of all key types: id hashing plus payload encrypt / decrypt.
+
+    Holds a compressor instance, the hasher / maccer class used for ids and a
+    cipher instance (plus its class).
+    """
+    TYPE = 0x00  # override in derived classes
+
+    def __init__(self, compressor_cls, maccer_cls, cipher_cls):
+        # compressor and cipher get instantiated here without arguments,
+        # the maccer class gets instantiated per id_hash() call
+        self.compressor = compressor_cls()
+        self.maccer_cls = maccer_cls  # hasher/maccer used by id_hash
+        self.cipher_cls = cipher_cls  # plaintext dummy or AEAD cipher
+        self.cipher = cipher_cls()
+        self.id_key = None
 
     def id_hash(self, data):
-        """Return HMAC hash using the "id" HMAC key
+        """Return a HASH (no id_key) or a MAC (using the "id_key" key)
+
+        XXX do we need a cryptographic hash function here or is a keyed hash
+        function like GMAC / GHASH good enough? See NIST SP 800-38D.
+
+        IMPORTANT: in 1 repo, there should be only 1 kind of id_hash, otherwise
+        data hashed/maced with one id_hash might result in same ID as already
+        exists in the repo for other data created with another id_hash method.
+        somehow unlikely considering 128 or 256bits, but still.
         """
+        return self.maccer_cls(self.id_key, data).digest()
 
     def encrypt(self, data):
-        pass
+        # compress, describe the payload in a Meta tuple, let the cipher
+        # compute mac and encrypted data, then serialize all of it
+        data = self.compressor.compress(data)
+        meta = Meta(compr_type=self.compressor.TYPE, key_type=self.TYPE,
+                    mac_type=self.maccer_cls.TYPE, cipher_type=self.cipher.TYPE,
+                    iv=self.cipher.enc_iv, legacy=False)
+        mac, data = self.cipher.compute_mac_and_encrypt(meta, data)
+        return generate(mac, meta, data)
 
     def decrypt(self, id, data):
-        pass
+        # parse the payload, check it matches this key's setup, decrypt and
+        # decompress it; if an id is given, verify it against the plaintext
+        mac, meta, data = parser(data)
+        compressor, keyer, maccer, cipher = get_implementations(meta)
+        # NOTE(review): asserts are removed when running with -O, so these
+        # consistency checks are not reliable input validation
+        assert isinstance(self, keyer)
+        assert self.maccer_cls is maccer
+        assert self.cipher_cls is cipher
+        data = self.cipher.check_mac_and_decrypt(mac, meta, data)
+        # NOTE(review): decompression uses this key's own compressor, not the
+        # `compressor` class looked up from meta - confirm this is intended
+        data = self.compressor.decompress(data)
+        if id and self.id_hash(data) != id:
+            raise IntegrityError('Chunk id verification failed')
+        return data
 
 
 class PlaintextKey(KeyBase):
@@ -73,71 +438,34 @@ class PlaintextKey(KeyBase):
     @classmethod
     def create(cls, repository, args):
         print('Encryption NOT enabled.\nUse the "--encryption=passphrase|keyfile" to enable encryption.')
-        return cls()
+        compressor = compressor_creator(args)
+        maccer = maccer_creator(args, cls)
+        cipher = cipher_creator(args, cls)
+        return cls(compressor, maccer, cipher)
 
     @classmethod
     def detect(cls, repository, manifest_data):
-        return cls()
-
-    def id_hash(self, data):
-        return sha256(data).digest()
-
-    def encrypt(self, data):
-        return b''.join([self.TYPE_STR, zlib.compress(data)])
-
-    def decrypt(self, id, data):
-        if data[0] != self.TYPE:
-            raise IntegrityError('Invalid encryption envelope')
-        data = zlib.decompress(memoryview(data)[1:])
-        if id and sha256(data).digest() != id:
-            raise IntegrityError('Chunk id verification failed')
-        return data
+        mac, meta, data = parser(manifest_data)
+        compressor, keyer, maccer, cipher = get_implementations(meta)
+        return cls(compressor, maccer, cipher)
 
 
 class AESKeyBase(KeyBase):
     """Common base class shared by KeyfileKey and PassphraseKey
 
-    Chunks are encrypted using 256bit AES in Counter Mode (CTR)
+    Chunks are encrypted using 256bit AES in CTR or GCM mode.
+    Chunks are authenticated by a GCM GMAC or a HMAC.
 
-    Payload layout: TYPE(1) + HMAC(32) + NONCE(8) + CIPHERTEXT
+    Payload layout: TYPE(1) + MAC(32) + NONCE(8) + CIPHERTEXT
 
     To reduce payload size only 8 bytes of the 16 bytes nonce is saved
     in the payload, the first 8 bytes are always zeros. This does not
     affect security but limits the maximum repository capacity to
     only 295 exabytes!
     """
-
-    PAYLOAD_OVERHEAD = 1 + 32 + 8  # TYPE + HMAC + NONCE
-
-    def id_hash(self, data):
-        """Return HMAC hash using the "id" HMAC key
-        """
-        return HMAC(self.id_key, data, sha256).digest()
-
-    def encrypt(self, data):
-        data = zlib.compress(data)
-        self.enc_cipher.reset()
-        data = b''.join((self.enc_cipher.iv[8:], self.enc_cipher.encrypt(data)))
-        hmac = HMAC(self.enc_hmac_key, data, sha256).digest()
-        return b''.join((self.TYPE_STR, hmac, data))
-
-    def decrypt(self, id, data):
-        if data[0] != self.TYPE:
-            raise IntegrityError('Invalid encryption envelope')
-        hmac = memoryview(data)[1:33]
-        if memoryview(HMAC(self.enc_hmac_key, memoryview(data)[33:], sha256).digest()) != hmac:
-            raise IntegrityError('Encryption envelope checksum mismatch')
-        self.dec_cipher.reset(iv=PREFIX + data[33:41])
-        data = zlib.decompress(self.dec_cipher.decrypt(data[41:]))  # should use memoryview
-        if id and HMAC(self.id_key, data, sha256).digest() != id:
-            raise IntegrityError('Chunk id verification failed')
-        return data
-
-    def extract_nonce(self, payload):
-        if payload[0] != self.TYPE:
-            raise IntegrityError('Invalid encryption envelope')
-        nonce = bytes_to_long(payload[33:41])
-        return nonce
+    def extract_iv(self, payload):
+        """Return the IV found in the meta information of `payload`."""
+        _mac, header, _body = parser(payload)
+        return header.iv
 
     def init_from_random_data(self, data):
         self.enc_key = data[0:32]
@@ -148,9 +476,13 @@ class AESKeyBase(KeyBase):
         if self.chunk_seed & 0x80000000:
             self.chunk_seed = self.chunk_seed - 0xffffffff - 1
 
-    def init_ciphers(self, enc_iv=b''):
-        self.enc_cipher = AES(is_encrypt=True, key=self.enc_key, iv=enc_iv)
-        self.dec_cipher = AES(is_encrypt=False, key=self.enc_key)
+    def init_ciphers(self, enc_iv=b'\0' * 16):
+        # (re)create the cipher object from cipher_cls with this key's
+        # enc_key / enc_hmac_key and the given IV
+        self.cipher = self.cipher_cls(enc_key=self.enc_key, enc_iv=enc_iv,
+                                      enc_hmac_key=self.enc_hmac_key)
+
+    @property
+    def enc_iv(self):
+        """The enc_iv currently held by the cipher object (read-only)."""
+        return self.cipher.enc_iv
 
 
 class PassphraseKey(AESKeyBase):
@@ -159,7 +491,10 @@ class PassphraseKey(AESKeyBase):
 
     @classmethod
     def create(cls, repository, args):
-        key = cls()
+        compressor = compressor_creator(args)
+        maccer = maccer_creator(args, cls)
+        cipher = cipher_creator(args, cls)
+        key = cls(compressor, maccer, cipher)
         passphrase = os.environ.get('BORG_PASSPHRASE')
         if passphrase is not None:
             passphrase2 = passphrase
@@ -181,7 +516,9 @@ class PassphraseKey(AESKeyBase):
     @classmethod
     def detect(cls, repository, manifest_data):
         prompt = 'Enter passphrase for %s: ' % repository._location.orig
-        key = cls()
+        mac, meta, data = parser(manifest_data)
+        compressor, keyer, maccer, cipher = get_implementations(meta)
+        key = cls(compressor, maccer, cipher)
         passphrase = os.environ.get('BORG_PASSPHRASE')
         if passphrase is None:
             passphrase = getpass(prompt)
@@ -189,8 +526,7 @@ class PassphraseKey(AESKeyBase):
             key.init(repository, passphrase)
             try:
                 key.decrypt(None, manifest_data)
-                num_blocks = num_aes_blocks(len(manifest_data) - 41)
-                key.init_ciphers(PREFIX + long_to_bytes(key.extract_nonce(manifest_data) + num_blocks))
+                key.init_ciphers(increment_iv(key.extract_iv(manifest_data), len(data)))
                 return key
             except IntegrityError:
                 passphrase = getpass(prompt)
@@ -212,14 +548,15 @@ class KeyfileKey(AESKeyBase):
 
     @classmethod
     def detect(cls, repository, manifest_data):
-        key = cls()
+        mac, meta, data = parser(manifest_data)
+        compressor, keyer, maccer, cipher = get_implementations(meta)
+        key = cls(compressor, maccer, cipher)
         path = cls.find_key_file(repository)
         prompt = 'Enter passphrase for key file %s: ' % path
         passphrase = os.environ.get('BORG_PASSPHRASE', '')
         while not key.load(path, passphrase):
             passphrase = getpass(prompt)
-        num_blocks = num_aes_blocks(len(manifest_data) - 41)
-        key.init_ciphers(PREFIX + long_to_bytes(key.extract_nonce(manifest_data) + num_blocks))
+        key.init_ciphers(increment_iv(key.extract_iv(manifest_data), len(data)))
         return key
 
     @classmethod
@@ -254,25 +591,27 @@ class KeyfileKey(AESKeyBase):
     def decrypt_key_file(self, data, passphrase):
         d = msgpack.unpackb(data)
         assert d[b'version'] == 1
-        assert d[b'algorithm'] == b'sha256'
+        assert d[b'algorithm'] == b'gmac'
         key = pbkdf2_sha256(passphrase.encode('utf-8'), d[b'salt'], d[b'iterations'], 32)
-        data = AES(is_encrypt=False, key=key).decrypt(d[b'data'])
-        if HMAC(key, data, sha256).digest() != d[b'hash']:
+        try:
+            cipher = AES(mode=AES_GCM_MODE, is_encrypt=False, key=key, iv=b'\0'*16)
+            data = cipher.check_mac_and_decrypt(d[b'hash'], d[b'data'])
+            return data
+        except Exception:
             return None
-        return data
 
     def encrypt_key_file(self, data, passphrase):
         salt = get_random_bytes(32)
         iterations = 100000
         key = pbkdf2_sha256(passphrase.encode('utf-8'), salt, iterations, 32)
-        hash = HMAC(key, data, sha256).digest()
-        cdata = AES(is_encrypt=True, key=key).encrypt(data)
+        cipher = AES(mode=AES_GCM_MODE, is_encrypt=True, key=key, iv=b'\0'*16)
+        mac, cdata = cipher.compute_mac_and_encrypt(data)
         d = {
             'version': 1,
             'salt': salt,
             'iterations': iterations,
-            'algorithm': 'sha256',
-            'hash': hash,
+            'algorithm': 'gmac',
+            'hash': mac,
             'data': cdata,
         }
         return msgpack.packb(d)
@@ -321,7 +660,10 @@ class KeyfileKey(AESKeyBase):
             passphrase2 = getpass('Enter same passphrase again: ')
             if passphrase != passphrase2:
                 print('Passphrases do not match')
-        key = cls()
+        compressor = compressor_creator(args)
+        maccer = maccer_creator(args, cls)
+        cipher = cipher_creator(args, cls)
+        key = cls(compressor, maccer, cipher)
         key.repository_id = repository.id
         key.init_from_random_data(get_random_bytes(100))
         key.init_ciphers()
@@ -329,3 +671,213 @@ class KeyfileKey(AESKeyBase):
         print('Key file "%s" created.' % key.path)
         print('Keep this file safe. Your data will be inaccessible without it.')
         return key
+
+
+# compressor_mapping: compr_type id -> compressor class.
+# For each compressor family, one subclass per level / preset is created on
+# the fly; its TYPE is the family's base TYPE plus the level / preset, which
+# is how the compress methods derive the level from TYPE again.
+# note: key 0 nicely maps to a zlib compressor with level 0 which means "no compression"
+compressor_mapping = {}
+for level in ZlibCompressor.LEVELS:
+    compressor_mapping[ZlibCompressor.TYPE + level] = \
+        type('ZlibCompressorLevel%d' % level, (ZlibCompressor, ), dict(TYPE=ZlibCompressor.TYPE + level))
+for preset in LzmaCompressor.PRESETS:
+    compressor_mapping[LzmaCompressor.TYPE + preset] = \
+        type('LzmaCompressorPreset%d' % preset, (LzmaCompressor, ), dict(TYPE=LzmaCompressor.TYPE + preset))
+for level in LZ4Compressor.LEVELS:
+    compressor_mapping[LZ4Compressor.TYPE + level] = \
+        type('LZ4CompressorLevel%d' % level, (LZ4Compressor, ), dict(TYPE=LZ4Compressor.TYPE + level))
+for level in LZ4HCCompressor.LEVELS:
+    compressor_mapping[LZ4HCCompressor.TYPE + level] = \
+        type('LZ4HCCompressorLevel%d' % level, (LZ4HCCompressor, ), dict(TYPE=LZ4HCCompressor.TYPE + level))
+for level in BLOSCLZCompressor.LEVELS:
+    compressor_mapping[BLOSCLZCompressor.TYPE + level] = \
+        type('BLOSCLZCompressorLevel%d' % level, (BLOSCLZCompressor, ), dict(TYPE=BLOSCLZCompressor.TYPE + level))
+for level in SnappyCompressor.LEVELS:
+    compressor_mapping[SnappyCompressor.TYPE + level] = \
+        type('SnappyCompressorLevel%d' % level, (SnappyCompressor, ), dict(TYPE=SnappyCompressor.TYPE + level))
+for level in BLOSCZlibCompressor.LEVELS:
+    compressor_mapping[BLOSCZlibCompressor.TYPE + level] = \
+        type('BLOSCZlibCompressorLevel%d' % level, (BLOSCZlibCompressor, ), dict(TYPE=BLOSCZlibCompressor.TYPE + level))
+# overwrite 0 with NullCompressor
+compressor_mapping[NullCompressor.TYPE] = NullCompressor
+
+
+# key_type id (as found in a payload's meta) -> key class
+keyer_mapping = {
+    KeyfileKey.TYPE: KeyfileKey,
+    PassphraseKey.TYPE: PassphraseKey,
+    PlaintextKey.TYPE: PlaintextKey,
+}
+
+
+# mac_type id (as found in a payload's meta) -> hash / MAC class
+maccer_mapping = {
+    # simple hashes, not MACs (but MAC-like class __init__ method signature):
+    SHA1.TYPE: SHA1,
+    SHA256.TYPE: SHA256,
+    SHA512_256.TYPE: SHA512_256,
+    SHA512.TYPE: SHA512,
+    GHASH.TYPE: GHASH,
+    # MACs:
+    HMAC_SHA1.TYPE: HMAC_SHA1,
+    HMAC_SHA256.TYPE: HMAC_SHA256,
+    HMAC_SHA512_256.TYPE: HMAC_SHA512_256,
+    HMAC_SHA512.TYPE: HMAC_SHA512,
+    GMAC.TYPE: GMAC,
+}
+
+
+# cipher_type id (as found in a payload's meta) -> cipher class
+cipher_mapping = {
+    # no cipher (but cipher-like class __init__ method signature):
+    PLAIN.TYPE: PLAIN,
+    # AEAD cipher implementations
+    AES_CTR_HMAC.TYPE: AES_CTR_HMAC,
+    AES_GCM.TYPE: AES_GCM,
+}
+
+
+def get_implementations(meta):
+    """Look up the classes for the type ids given in `meta`.
+
+    Returns a (compressor, keyer, maccer, cipher) tuple of classes.
+    Raises UnsupportedPayloadError if any of the type ids is unknown.
+    """
+    try:
+        implementations = (
+            compressor_mapping[meta.compr_type],
+            keyer_mapping[meta.key_type],
+            maccer_mapping[meta.mac_type],
+            cipher_mapping[meta.cipher_type],
+        )
+    except KeyError:
+        type_ids = (meta.compr_type, meta.key_type, meta.mac_type, meta.cipher_type)
+        raise UnsupportedPayloadError("compr_type %x key_type %x mac_type %x cipher_type %x" % type_ids)
+    return implementations
+
+
+def legacy_parser(all_data, key_type):  # all rather hardcoded
+    """
+    Parse a legacy payload, return (mac, meta, data).
+
+    Payload layout:
+    no encryption:   TYPE(1) + data
+    with encryption: TYPE(1) + HMAC(32) + NONCE(8) + data
+    data is compressed with zlib level 6 and (in the 2nd case) encrypted.
+
+    Only the lower 8 bytes of the 16 bytes nonce are stored in the payload,
+    the upper 8 bytes are always zeros. This does not affect security but
+    limits the maximum repository capacity to only 295 exabytes!
+    """
+    mac_start = 1  # skip the TYPE byte
+    if key_type == PlaintextKey.TYPE:
+        mac_type, cipher_type = SHA256.TYPE, PLAIN.TYPE
+        mac = iv = None
+        data = all_data[mac_start:]
+    else:
+        mac_type, cipher_type = HMAC_SHA256.TYPE, AES_CTR_HMAC.TYPE
+        nonce_start = mac_start + 32
+        data_start = mac_start + 40
+        mac = all_data[mac_start:nonce_start]
+        # legacy attic did not store the full IV on disk, as the upper 8 bytes
+        # are expected to be zero anyway as the full IV is a 128bit counter.
+        iv = b'\0' * 8 + all_data[nonce_start:data_start]
+        data = all_data[data_start:]
+    meta = Meta(compr_type=6, key_type=key_type, mac_type=mac_type,
+                cipher_type=cipher_type, iv=iv, legacy=True)
+    return mac, meta, data
+
+def parser00(all_data):
+    """Parse a legacy payload with header type 0x00 (KeyfileKey)."""
+    return legacy_parser(all_data, KeyfileKey.TYPE)
+
+def parser01(all_data):
+    """Parse a legacy payload with header type 0x01 (PassphraseKey)."""
+    return legacy_parser(all_data, PassphraseKey.TYPE)
+
+def parser02(all_data):
+    """Parse a legacy payload with header type 0x02 (PlaintextKey)."""
+    return legacy_parser(all_data, PlaintextKey.TYPE)
+
+
+def parser03(all_data):  # new & flexible
+    """
+    Parse a new-style payload, return (mac, meta, data).
+
+    Payload layout:
+    always: TYPE(1) + MSGPACK((mac, meta, data))
+
+    meta is a Meta namedtuple and contains all required information about data.
+    data is maybe compressed (see meta) and maybe encrypted (see meta).
+    """
+    # limits to avoid memory allocation issues caused by tampered input data.
+    limits = dict(
+        use_list=False,
+        max_buffer_size=CHUNK_MAX + 1000,  # does not work in 0.4.6 unpackb C implementation
+        max_array_len=10,  # meta_tuple
+        max_bin_len=CHUNK_MAX,  # data
+        max_str_len=0,  # not used yet
+        max_map_len=0,  # not used yet
+        max_ext_len=0,  # not used yet
+    )
+    unpacker = msgpack.Unpacker(**limits)
+    unpacker.feed(all_data[1:])  # everything after the TYPE byte
+    mac, meta_fields, payload = unpacker.unpack()
+    return mac, Meta(*meta_fields), payload
+
+
+def parser(data):
+    """Split a stored payload into (mac, meta, data), based on its header type.
+
+    The first byte of `data` selects the format: 0x00, 0x01 and 0x02 are the
+    legacy layouts, 0x03 is the msgpack based layout.
+
+    Raises UnsupportedPayloadError if the header type is unknown.
+    """
+    parser_mapping = {
+        0x00: parser00,
+        0x01: parser01,
+        0x02: parser02,
+        0x03: parser03,
+    }
+    header_type = data[0]
+    try:
+        parser_func = parser_mapping[header_type]
+    except KeyError:
+        # report an unknown header type as unsupported payload rather than
+        # letting a bare KeyError escape to the caller
+        raise UnsupportedPayloadError(header_type)
+    return parser_func(data)
+
+
+def key_factory(repository, manifest_data):
+    """Return a key object for `repository`, detected from the manifest payload.
+
+    The key class is chosen by the key type in the payload's meta, the actual
+    work is done by that class' detect() method.
+    """
+    _mac, meta, _data = parser(manifest_data)
+    key_cls = get_implementations(meta)[1]
+    return key_cls.detect(repository, manifest_data)
+
+
+def generate(mac, meta, data):
+    """Serialize (mac, meta, data) into a payload: type byte 0x03 + msgpack."""
+    # always create new-style 0x03 format
+    packed = msgpack.packb((mac, meta, data), use_bin_type=True)
+    return b'\x03' + packed
+
+
+def compressor_creator(args):
+    """Return the compressor class selected by ``args.compression``.
+
+    If `args` is None, COMPR_DEFAULT is used.
+    Raises NotImplementedError if there is no compressor for that type id.
+    """
+    # args == None is used by unit tests
+    compression = COMPR_DEFAULT if args is None else args.compression
+    compressor = compressor_mapping.get(compression)
+    if compressor is None:
+        # format the value that was looked up; %r can not fail here, even if
+        # the value is not an integer
+        raise NotImplementedError("no compression %r" % (compression, ))
+    return compressor
+
+
+def key_creator(args):
+    """Return the key class selected by ``args.encryption``.
+
+    'keyfile' gives KeyfileKey, 'passphrase' gives PassphraseKey and 'none'
+    gives PlaintextKey. Raises NotImplementedError for any other value.
+    """
+    if args.encryption == 'keyfile':
+        return KeyfileKey
+    if args.encryption == 'passphrase':
+        return PassphraseKey
+    if args.encryption == 'none':
+        return PlaintextKey
+    # NotImplemented is a constant and not callable, the exception class to
+    # raise is NotImplementedError
+    raise NotImplementedError("no encryption %s" % args.encryption)
+
+
+def maccer_creator(args, key_cls):
+    """Return the hash / MAC class selected by ``args.mac``.
+
+    If no mac type is given (or `args` is None), the default depends on
+    `key_cls`: HASH_DEFAULT for PlaintextKey, MAC_DEFAULT for KeyfileKey and
+    PassphraseKey.
+    Raises NotImplementedError for an unknown key class or mac type id.
+    """
+    # args == None is used by unit tests
+    mac = None if args is None else args.mac
+    if mac is None:
+        if key_cls is PlaintextKey:
+            mac = HASH_DEFAULT
+        elif key_cls in (KeyfileKey, PassphraseKey):
+            mac = MAC_DEFAULT
+        else:
+            raise NotImplementedError("unknown key class")
+    maccer = maccer_mapping.get(mac)
+    if maccer is None:
+        # format the value that was looked up; %r can not fail here, even if
+        # the value is not an integer
+        raise NotImplementedError("no mac %r" % (mac, ))
+    return maccer
+
+
+def cipher_creator(args, key_cls):
+    """Return the cipher class selected by ``args.cipher``.
+
+    If no cipher type is given (or `args` is None), the default depends on
+    `key_cls`: PLAIN_DEFAULT for PlaintextKey, CIPHER_DEFAULT for KeyfileKey
+    and PassphraseKey.
+    Raises NotImplementedError for an unknown key class or cipher type id.
+    """
+    # args == None is used by unit tests
+    cipher_type = None if args is None else args.cipher
+    if cipher_type is None:
+        if key_cls is PlaintextKey:
+            cipher_type = PLAIN_DEFAULT
+        elif key_cls in (KeyfileKey, PassphraseKey):
+            cipher_type = CIPHER_DEFAULT
+        else:
+            raise NotImplementedError("unknown key class")
+    # type id and class are kept in separate names, so the id is still
+    # available for the error message
+    cipher = cipher_mapping.get(cipher_type)
+    if cipher is None:
+        # %r can not fail here, even if the value is not an integer
+        raise NotImplementedError("no cipher %r" % (cipher_type, ))
+    return cipher

+ 7 - 6
borg/remote.py

@@ -89,7 +89,7 @@ class RepositoryServer:
     def negotiate(self, versions):
         return 1
 
-    def open(self, path, create=False):
+    def open(self, path, create=False, key_size=None):
         path = os.fsdecode(path)
         if path.startswith('/~'):
             path = path[1:]
@@ -100,8 +100,8 @@ class RepositoryServer:
                     break
             else:
                 raise PathNotAllowed(path)
-        self.repository = Repository(path, create)
-        return self.repository.id
+        self.repository = Repository(path, create, key_size=key_size)
+        return self.repository.id, self.repository.key_size
 
 
 class RemoteRepository:
@@ -112,7 +112,7 @@ class RemoteRepository:
         def __init__(self, name):
             self.name = name
 
-    def __init__(self, location, create=False):
+    def __init__(self, location, create=False, key_size=None):
         self.location = location
         self.preload_ids = []
         self.msgid = 0
@@ -144,7 +144,7 @@ class RemoteRepository:
         version = self.call('negotiate', 1)
         if version != 1:
             raise Exception('Server insisted on using unsupported protocol version %d' % version)
-        self.id = self.call('open', location.path, create)
+        self.id, self.key_size = self.call('open', location.path, create, key_size)
 
     def __del__(self):
         self.close()
@@ -303,7 +303,8 @@ class RepositoryCache:
 
     def initialize(self):
         self.tmppath = tempfile.mkdtemp()
-        self.index = NSIndex()
+        self.key_size = self.repository.key_size
+        self.index = NSIndex(key_size=self.key_size)
         self.data_fd = open(os.path.join(self.tmppath, 'data'), 'a+b')
 
     def cleanup(self):

+ 17 - 11
borg/repository.py

@@ -47,22 +47,23 @@ class Repository:
     class ObjectNotFound(Error):
         """Object with key {} not found in repository {}."""
 
-    def __init__(self, path, create=False, exclusive=False):
+    def __init__(self, path, create=False, exclusive=False, key_size=None):
         self.path = path
         self.io = None
         self.lock = None
         self.index = None
         self._active_txn = False
         if create:
-            self.create(path)
+            self.create(path, key_size)
         self.open(path, exclusive)
 
     def __del__(self):
         self.close()
 
-    def create(self, path):
+    def create(self, path, key_size):
         """Create a new empty repository at `path`
         """
+        assert key_size is not None
         if os.path.exists(path) and (not os.path.isdir(path) or os.listdir(path)):
             raise self.AlreadyExists(path)
         if not os.path.exists(path):
@@ -75,6 +76,7 @@ class Repository:
         config.set('repository', 'version', '1')
         config.set('repository', 'segments_per_dir', self.DEFAULT_SEGMENTS_PER_DIR)
         config.set('repository', 'max_segment_size', self.DEFAULT_MAX_SEGMENT_SIZE)
+        config.set('repository', 'key_size', key_size)
         config.set('repository', 'id', hexlify(os.urandom(32)).decode('ascii'))
         with open(os.path.join(path, 'config'), 'w') as fd:
             config.write(fd)
@@ -117,10 +119,12 @@ class Repository:
         if 'repository' not in self.config.sections() or self.config.getint('repository', 'version') != 1:
             raise self.InvalidRepository(path)
         self.lock = UpgradableLock(os.path.join(path, 'config'), exclusive)
+        # legacy attic repositories always have key size 32B (256b)
+        self.key_size = self.config.getint('repository', 'key_size', fallback=32)
         self.max_segment_size = self.config.getint('repository', 'max_segment_size')
         self.segments_per_dir = self.config.getint('repository', 'segments_per_dir')
         self.id = unhexlify(self.config.get('repository', 'id').strip())
-        self.io = LoggedIO(self.path, self.max_segment_size, self.segments_per_dir)
+        self.io = LoggedIO(self.path, self.max_segment_size, self.segments_per_dir, self.key_size)
 
     def close(self):
         if self.lock:
@@ -140,8 +144,9 @@ class Repository:
 
     def open_index(self, transaction_id):
         if transaction_id is None:
-            return NSIndex()
-        return NSIndex.read((os.path.join(self.path, 'index.%d') % transaction_id).encode('utf-8'))
+            return NSIndex(key_size=self.key_size)
+        return NSIndex.read((os.path.join(self.path, 'index.%d') % transaction_id).encode('utf-8'),
+                            key_size=self.key_size)
 
     def prepare_txn(self, transaction_id, do_cleanup=True):
         self._active_txn = True
@@ -397,8 +402,6 @@ class LoggedIO:
 
     header_fmt = struct.Struct('<IIB')
     assert header_fmt.size == 9
-    put_header_fmt = struct.Struct('<IIB32s')
-    assert put_header_fmt.size == 41
     header_no_crc_fmt = struct.Struct('<IB')
     assert header_no_crc_fmt.size == 5
     crc_fmt = struct.Struct('<I')
@@ -407,13 +410,16 @@ class LoggedIO:
     _commit = header_no_crc_fmt.pack(9, TAG_COMMIT)
     COMMIT = crc_fmt.pack(crc32(_commit)) + _commit
 
-    def __init__(self, path, limit, segments_per_dir, capacity=90):
+    def __init__(self, path, limit, segments_per_dir, key_size, capacity=90):
         self.path = path
         self.fds = LRUCache(capacity)
         self.segment = 0
         self.limit = limit
         self.segments_per_dir = segments_per_dir
+        self.key_size = key_size
         self.offset = 0
+        self.put_header_fmt = struct.Struct('<IIB%ds' % key_size)
+        assert self.put_header_fmt.size == self.header_fmt.size + key_size
         self._write_fd = None
 
     def close(self):
@@ -519,9 +525,9 @@ class LoggedIO:
                 raise IntegrityError('Invalid segment entry header')
             key = None
             if tag in (TAG_PUT, TAG_DELETE):
-                key = rest[:32]
+                key = rest[:self.key_size]
             if include_data:
-                yield tag, key, offset, rest[32:]
+                yield tag, key, offset, rest[self.key_size:]
             else:
                 yield tag, key, offset
             offset += size

+ 15 - 3
borg/testsuite/archive.py

@@ -3,7 +3,7 @@ from datetime import datetime, timezone
 import msgpack
 
 from ..archive import Archive, CacheChunkBuffer, RobustUnpacker
-from ..key import PlaintextKey
+from ..key import PlaintextKey, COMPR_DEFAULT
 from ..helpers import Manifest
 from . import BaseTestCase
 from .mock import Mock
@@ -21,9 +21,15 @@ class MockCache:
 
 class ArchiveTimestampTestCase(BaseTestCase):
 
+    class MockArgs(object):
+        repository = None
+        compression = COMPR_DEFAULT
+        mac = None
+        cipher = None
+
     def _test_timestamp_parsing(self, isoformat, expected):
         repository = Mock()
-        key = PlaintextKey()
+        key = PlaintextKey.create(None, self.MockArgs())
         manifest = Manifest(repository, key)
         a = Archive(repository, key, manifest, 'test', create=True)
         a.metadata = {b'time': isoformat}
@@ -42,10 +48,16 @@ class ArchiveTimestampTestCase(BaseTestCase):
 
 class ChunkBufferTestCase(BaseTestCase):
 
+    class MockArgs(object):
+        repository = None
+        compression = COMPR_DEFAULT
+        mac = None
+        cipher = None
+
     def test(self):
         data = [{b'foo': 1}, {b'bar': 2}]
         cache = MockCache()
-        key = PlaintextKey()
+        key = PlaintextKey.create(None, self.MockArgs())
         chunks = CacheChunkBuffer(cache, key, None)
         for d in data:
             chunks.add(d)

+ 7 - 5
borg/testsuite/archiver.py

@@ -15,8 +15,9 @@ from .. import xattr
 from ..archive import Archive, ChunkBuffer, CHUNK_MAX
 from ..archiver import Archiver
 from ..cache import Cache
-from ..crypto import bytes_to_long, num_aes_blocks
+from ..crypto import bytes16_to_int, num_aes_blocks
 from ..helpers import Manifest
+from ..key import parser
 from ..remote import RemoteRepository, PathNotAllowed
 from ..repository import Repository
 from . import BaseTestCase
@@ -496,8 +497,9 @@ class ArchiverTestCase(ArchiverTestCaseBase):
                 hash = sha256(data).digest()
                 if hash not in seen:
                     seen.add(hash)
-                    num_blocks = num_aes_blocks(len(data) - 41)
-                    nonce = bytes_to_long(data[33:41])
+                    mac, meta, data = parser(data)
+                    num_blocks = num_aes_blocks(len(data))
+                    nonce = bytes16_to_int(meta.iv)
                     for counter in range(nonce, nonce + num_blocks):
                         self.assert_not_in(counter, used)
                         used.add(counter)
@@ -576,7 +578,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
 
     def test_missing_manifest(self):
         archive, repository = self.open_archive('archive1')
-        repository.delete(Manifest.MANIFEST_ID)
+        repository.delete(Manifest.manifest_id(repository))
         repository.commit()
         self.cmd('check', self.repository_location, exit_code=1)
         output = self.cmd('check', '--repair', self.repository_location, exit_code=0)
@@ -587,7 +589,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
     def test_extra_chunks(self):
         self.cmd('check', self.repository_location, exit_code=0)
         repository = Repository(self.repository_location)
-        repository.put(b'01234567890123456789012345678901', b'xxxx')
+        repository.put(b'0123456789012345', b'xxxx')
         repository.commit()
         repository.close()
         self.cmd('check', self.repository_location, exit_code=1)

+ 48 - 14
borg/testsuite/crypto.py

@@ -1,6 +1,7 @@
 from binascii import hexlify
 
-from ..crypto import AES, bytes_to_long, bytes_to_int, long_to_bytes, pbkdf2_sha256, get_random_bytes
+from ..crypto import AES, AES_GCM_MODE, AES_CTR_MODE, pbkdf2_sha256, get_random_bytes, \
+                     bytes_to_int, bytes16_to_int, int_to_bytes16, increment_iv
 from . import BaseTestCase
 
 
@@ -9,9 +10,27 @@ class CryptoTestCase(BaseTestCase):
     def test_bytes_to_int(self):
         self.assert_equal(bytes_to_int(b'\0\0\0\1'), 1)
 
-    def test_bytes_to_long(self):
-        self.assert_equal(bytes_to_long(b'\0\0\0\0\0\0\0\1'), 1)
-        self.assert_equal(long_to_bytes(1), b'\0\0\0\0\0\0\0\1')
+    def test_bytes16_to_int(self):
+        i, b = 1, b'\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1'
+        self.assert_equal(bytes16_to_int(b), i)
+        self.assert_equal(int_to_bytes16(i), b)
+        i, b = (1 << 64) + 2, b'\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\2'
+        self.assert_equal(bytes16_to_int(b), i)
+        self.assert_equal(int_to_bytes16(i), b)
+
+    def test_increment_iv(self):
+        tests = [
+            # iv, amount, iv_expected
+            (0, 0, 0),
+            (0, 15, 1),
+            (0, 16, 1),
+            (0, 17, 2),
+            (0xffffffffffffffff, 32, 0x10000000000000001),
+        ]
+        for iv, amount, iv_expected in tests:
+            iv = int_to_bytes16(iv)
+            iv_expected = int_to_bytes16(iv_expected)
+            self.assert_equal(increment_iv(iv, amount), iv_expected)
 
     def test_pbkdf2_sha256(self):
         self.assert_equal(hexlify(pbkdf2_sha256(b'password', b'salt', 1, 32)),
@@ -28,18 +47,33 @@ class CryptoTestCase(BaseTestCase):
         self.assert_equal(len(bytes2), 10)
         self.assert_not_equal(bytes, bytes2)
 
-    def test_aes(self):
+    def test_aes_ctr(self):
         key = b'X' * 32
+        iv = b'\0' * 16
         data = b'foo' * 10
         # encrypt
-        aes = AES(is_encrypt=True, key=key)
-        self.assert_equal(bytes_to_long(aes.iv, 8), 0)
-        cdata = aes.encrypt(data)
+        aes = AES(mode=AES_CTR_MODE, is_encrypt=True, key=key, iv=iv)
+        _, cdata = aes.compute_mac_and_encrypt(data)
         self.assert_equal(hexlify(cdata), b'c6efb702de12498f34a2c2bbc8149e759996d08bf6dc5c610aefc0c3a466')
-        self.assert_equal(bytes_to_long(aes.iv, 8), 2)
-        # decrypt
-        aes = AES(is_encrypt=False, key=key)
-        self.assert_equal(bytes_to_long(aes.iv, 8), 0)
-        pdata = aes.decrypt(cdata)
+        # decrypt (correct mac/cdata)
+        aes = AES(mode=AES_CTR_MODE, is_encrypt=False, key=key, iv=iv)
+        pdata = aes.check_mac_and_decrypt(None, cdata)
+        self.assert_equal(data, pdata)
+
+    def test_aes_gcm(self):
+        key = b'X' * 32
+        iv = b'A' * 16
+        data = b'foo' * 10
+        # encrypt
+        aes = AES(mode=AES_GCM_MODE, is_encrypt=True, key=key, iv=iv)
+        mac, cdata = aes.compute_mac_and_encrypt(data)
+        self.assert_equal(hexlify(mac), b'c98aa10eb6b7031bcc2160878d9438fb')
+        self.assert_equal(hexlify(cdata), b'841bcce405df769d22ee9f7f012edf5dc7fb2594d924c7400ffd050f2741')
+        # decrypt (correct mac/cdata)
+        aes = AES(mode=AES_GCM_MODE, is_encrypt=False, key=key, iv=iv)
+        pdata = aes.check_mac_and_decrypt(mac, cdata)
         self.assert_equal(data, pdata)
-        self.assert_equal(bytes_to_long(aes.iv, 8), 2)
+        # decrypt (incorrect mac/cdata)
+        aes = AES(mode=AES_GCM_MODE, is_encrypt=False, key=key, iv=iv)
+        cdata = b'x' + cdata[1:]  # corrupt cdata
+        self.assertRaises(Exception, aes.check_mac_and_decrypt, mac, cdata)

+ 5 - 5
borg/testsuite/hashindex.py

@@ -9,7 +9,7 @@ from . import BaseTestCase
 class HashIndexTestCase(BaseTestCase):
 
     def _generic_test(self, cls, make_value, sha):
-        idx = cls()
+        idx = cls(key_size=32)
         self.assert_equal(len(idx), 0)
         # Test set
         for x in range(100):
@@ -34,7 +34,7 @@ class HashIndexTestCase(BaseTestCase):
         with open(idx_name.name, 'rb') as fd:
             self.assert_equal(hashlib.sha256(fd.read()).hexdigest(), sha)
         # Make sure we can open the file
-        idx = cls.read(idx_name.name)
+        idx = cls.read(idx_name.name, key_size=32)
         self.assert_equal(len(idx), 50)
         for x in range(50, 100):
             self.assert_equal(idx[bytes('%-32d' % x, 'ascii')], make_value(x * 2))
@@ -42,7 +42,7 @@ class HashIndexTestCase(BaseTestCase):
         self.assert_equal(len(idx), 0)
         idx.write(idx_name.name)
         del idx
-        self.assert_equal(len(cls.read(idx_name.name)), 0)
+        self.assert_equal(len(cls.read(idx_name.name, key_size=32)), 0)
 
     def test_nsindex(self):
         self._generic_test(NSIndex, lambda x: (x, x),
@@ -55,7 +55,7 @@ class HashIndexTestCase(BaseTestCase):
     def test_resize(self):
         n = 2000  # Must be >= MIN_BUCKETS
         idx_name = tempfile.NamedTemporaryFile()
-        idx = NSIndex()
+        idx = NSIndex(key_size=32)
         idx.write(idx_name.name)
         initial_size = os.path.getsize(idx_name.name)
         self.assert_equal(len(idx), 0)
@@ -70,7 +70,7 @@ class HashIndexTestCase(BaseTestCase):
         self.assert_equal(initial_size, os.path.getsize(idx_name.name))
 
     def test_iteritems(self):
-        idx = NSIndex()
+        idx = NSIndex(key_size=32)
         for x in range(100):
             idx[bytes('%-0.32d' % x, 'ascii')] = x, x
         all = list(idx.iteritems())

+ 44 - 27
borg/testsuite/key.py

@@ -4,8 +4,7 @@ import shutil
 import tempfile
 from binascii import hexlify
 
-from ..crypto import bytes_to_long, num_aes_blocks
-from ..key import PlaintextKey, PassphraseKey, KeyfileKey
+from ..key import PlaintextKey, PassphraseKey, KeyfileKey, COMPR_DEFAULT, increment_iv
 from ..helpers import Location, unhexlify
 from . import BaseTestCase
 
@@ -14,22 +13,26 @@ class KeyTestCase(BaseTestCase):
 
     class MockArgs:
         repository = Location(tempfile.mkstemp()[1])
+        compression = COMPR_DEFAULT
+        mac = None
+        cipher = None
 
     keyfile2_key_file = """
-        BORG_KEY 0000000000000000000000000000000000000000000000000000000000000000
-        hqppdGVyYXRpb25zzgABhqCkaGFzaNoAIMyonNI+7Cjv0qHi0AOBM6bLGxACJhfgzVD2oq
-        bIS9SFqWFsZ29yaXRobaZzaGEyNTakc2FsdNoAINNK5qqJc1JWSUjACwFEWGTdM7Nd0a5l
-        1uBGPEb+9XM9p3ZlcnNpb24BpGRhdGHaANAYDT5yfPpU099oBJwMomsxouKyx/OG4QIXK2
-        hQCG2L2L/9PUu4WIuKvGrsXoP7syemujNfcZws5jLp2UPva4PkQhQsrF1RYDEMLh2eF9Ol
-        rwtkThq1tnh7KjWMG9Ijt7/aoQtq0zDYP/xaFF8XXSJxiyP5zjH5+spB6RL0oQHvbsliSh
-        /cXJq7jrqmrJ1phd6dg4SHAM/i+hubadZoS6m25OQzYAW09wZD/phG8OVa698Z5ed3HTaT
-        SmrtgJL3EoOKgUI9d6BLE4dJdBqntifo""".strip()
+BORG_KEY 0000000000000000000000000000000000000000000000000000000000000000
+hqRzYWx02gAgA1l4jfyv22y6U/mxxDT8HodSWAcX0g3nOESrQcNnBsundmVyc2lvbgGqaX
+RlcmF0aW9uc84AAYagqWFsZ29yaXRobaRnbWFjpGhhc2iw7eaB54JssAOnM1S4S9CeTaRk
+YXRh2gDQzmuyg3iYjMeTLObY+ybI+QfngB+5mmHeEAfBa42fuEZgqM3rYyMj2XfgvamF+O
+0asvhEyy9om190FaOxQ4RiiTMNqSP0FKLmd1i5ZyDMfRyp7JbscRFs9Ryk28yXWkv0MgQy
+EAYlaycY+6lWdRSgEPxidyPl9t9dr2AI/UuiQytwqmcmXgWD6Px6wgpOS/4AcRmEvDqIIl
+Rc2xsu+RevGAxk5rnrIIRPr7WB5R2cinzEn9ylDgBDt9LZbq706ELgtwVTnjWB8FBTPwVI
+vLTTXQ==
+""".strip()
 
     keyfile2_cdata = unhexlify(re.sub('\W', '', """
-        0055f161493fcfc16276e8c31493c4641e1eb19a79d0326fad0291e5a9c98e5933
-        00000000000003e8d21eaf9b86c297a8cd56432e1915bb
+        0393c4102e5ce8f5e9477c9e4ce2de453121aa139600001402c41000000000000000000000000000000000
+        c2c407b0147a64a379d1
         """))
-    keyfile2_id = unhexlify('c3fbf14bc001ebcc3cd86e696c13482ed071740927cd7cbe1b01b4bfcee49314')
+    keyfile2_id = unhexlify('dd9451069663931c8abd85452d016733')
 
     def setUp(self):
         self.tmppath = tempfile.mkdtemp()
@@ -45,25 +48,36 @@ class KeyTestCase(BaseTestCase):
         _location = _Location()
         id = bytes(32)
 
+    def _test_make_testdata(self):
+        # modify tearDown to not kill the key file first, before using this
+        os.environ['ATTIC_PASSPHRASE'] = 'passphrase'
+        key = KeyfileKey.create(self.MockRepository(), self.MockArgs())
+        print("keyfile2_key_file: find the it in the filesystem, see location in test log output")
+        print("keyfile2_cdata:", hexlify(key.encrypt(b'payload')))
+        print("keyfile2_id:", hexlify(key.id_hash(b'payload')))
+        assert False
+
     def test_plaintext(self):
-        key = PlaintextKey.create(None, None)
+        key = PlaintextKey.create(None, self.MockArgs())
         data = b'foo'
-        self.assert_equal(hexlify(key.id_hash(data)), b'2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae')
+        self.assert_equal(hexlify(key.id_hash(data)), b'4c9137bc0dd3ddb31de4e138a49d7eb3')
         self.assert_equal(data, key.decrypt(key.id_hash(data), key.encrypt(data)))
 
     def test_keyfile(self):
         os.environ['BORG_PASSPHRASE'] = 'test'
         key = KeyfileKey.create(self.MockRepository(), self.MockArgs())
-        self.assert_equal(bytes_to_long(key.enc_cipher.iv, 8), 0)
+        self.assert_equal(key.enc_iv, b'\0'*16)
         manifest = key.encrypt(b'XXX')
-        self.assert_equal(key.extract_nonce(manifest), 0)
+        self.assert_equal(key.extract_iv(manifest), b'\0'*16)
         manifest2 = key.encrypt(b'XXX')
         self.assert_not_equal(manifest, manifest2)
         self.assert_equal(key.decrypt(None, manifest), key.decrypt(None, manifest2))
-        self.assert_equal(key.extract_nonce(manifest2), 1)
-        iv = key.extract_nonce(manifest)
+        self.assert_equal(key.extract_iv(manifest2), b'\0'*15+b'\x01')
+        iv = key.extract_iv(manifest)
         key2 = KeyfileKey.detect(self.MockRepository(), manifest)
-        self.assert_equal(bytes_to_long(key2.enc_cipher.iv, 8), iv + num_aes_blocks(len(manifest) - KeyfileKey.PAYLOAD_OVERHEAD))
+        # we assume that the payload fits into one 16B AES block (which is given for b'XXX').
+        iv_plus_1 = increment_iv(iv, 16)
+        self.assert_equal(key2.enc_iv, iv_plus_1)
         # Key data sanity check
         self.assert_equal(len(set([key2.id_key, key2.enc_key, key2.enc_hmac_key])), 3)
         self.assert_equal(key2.chunk_seed == 0, False)
@@ -79,25 +93,28 @@ class KeyTestCase(BaseTestCase):
 
     def test_passphrase(self):
         os.environ['BORG_PASSPHRASE'] = 'test'
-        key = PassphraseKey.create(self.MockRepository(), None)
-        self.assert_equal(bytes_to_long(key.enc_cipher.iv, 8), 0)
+        key = PassphraseKey.create(self.MockRepository(), self.MockArgs())
+        # XXX self.assert_equal(bytes_to_long(key.enc_cipher.iv, 8), 0)
+        self.assert_equal(key.enc_iv, b'\0'*16)
         self.assert_equal(hexlify(key.id_key), b'793b0717f9d8fb01c751a487e9b827897ceea62409870600013fbc6b4d8d7ca6')
         self.assert_equal(hexlify(key.enc_hmac_key), b'b885a05d329a086627412a6142aaeb9f6c54ab7950f996dd65587251f6bc0901')
         self.assert_equal(hexlify(key.enc_key), b'2ff3654c6daf7381dbbe718d2b20b4f1ea1e34caa6cc65f6bb3ac376b93fed2a')
         self.assert_equal(key.chunk_seed, -775740477)
         manifest = key.encrypt(b'XXX')
-        self.assert_equal(key.extract_nonce(manifest), 0)
+        self.assert_equal(key.extract_iv(manifest), b'\0'*16)
         manifest2 = key.encrypt(b'XXX')
         self.assert_not_equal(manifest, manifest2)
         self.assert_equal(key.decrypt(None, manifest), key.decrypt(None, manifest2))
-        self.assert_equal(key.extract_nonce(manifest2), 1)
-        iv = key.extract_nonce(manifest)
+        self.assert_equal(key.extract_iv(manifest2), b'\0'*15+b'\x01')
+        iv = key.extract_iv(manifest)
         key2 = PassphraseKey.detect(self.MockRepository(), manifest)
-        self.assert_equal(bytes_to_long(key2.enc_cipher.iv, 8), iv + num_aes_blocks(len(manifest) - PassphraseKey.PAYLOAD_OVERHEAD))
+        # we assume that the payload fits into one 16B AES block (which is given for b'XXX').
+        iv_plus_1 = increment_iv(iv, 16)
+        self.assert_equal(key2.enc_iv, iv_plus_1)
         self.assert_equal(key.id_key, key2.id_key)
         self.assert_equal(key.enc_hmac_key, key2.enc_hmac_key)
         self.assert_equal(key.enc_key, key2.enc_key)
         self.assert_equal(key.chunk_seed, key2.chunk_seed)
         data = b'foo'
-        self.assert_equal(hexlify(key.id_hash(data)), b'818217cf07d37efad3860766dcdf1d21e401650fed2d76ed1d797d3aae925990')
+        self.assert_equal(hexlify(key.id_hash(data)), b'a409d69859b8a07625f066e42cde0501')
         self.assert_equal(data, key2.decrypt(key2.id_hash(data), key.encrypt(data)))

+ 11 - 9
borg/testsuite/repository.py

@@ -9,16 +9,15 @@ from ..repository import Repository
 from . import BaseTestCase
 from .mock import patch
 
-
 class RepositoryTestCaseBase(BaseTestCase):
     key_size = 32
 
-    def open(self, create=False):
-        return Repository(os.path.join(self.tmppath, 'repository'), create=create)
+    def open(self, create=False, key_size=None):
+        return Repository(os.path.join(self.tmppath, 'repository'), create=create, key_size=key_size)
 
     def setUp(self):
         self.tmppath = tempfile.mkdtemp()
-        self.repository = self.open(create=True)
+        self.repository = self.open(create=True, key_size=self.key_size)
 
     def tearDown(self):
         self.repository.close()
@@ -209,7 +208,8 @@ class RepositoryCheckTestCase(RepositoryTestCaseBase):
         return sorted(int(n) for n in os.listdir(os.path.join(self.tmppath, 'repository', 'data', '0')) if n.isdigit())[-1]
 
     def open_index(self):
-        return NSIndex.read(os.path.join(self.tmppath, 'repository', 'index.{}'.format(self.get_head())))
+        return NSIndex.read(os.path.join(self.tmppath, 'repository', 'index.{}'.format(self.get_head())),
+                            key_size=self.key_size)
 
     def corrupt_object(self, id_):
         idx = self.open_index()
@@ -317,8 +317,9 @@ class RepositoryCheckTestCase(RepositoryTestCaseBase):
 
 class RemoteRepositoryTestCase(RepositoryTestCase):
 
-    def open(self, create=False):
-        return RemoteRepository(Location('__testsuite__:' + os.path.join(self.tmppath, 'repository')), create=create)
+    def open(self, create=False, key_size=None):
+        return RemoteRepository(Location('__testsuite__:' + os.path.join(self.tmppath, 'repository')),
+                                create=create, key_size=key_size)
 
     def test_invalid_rpc(self):
         self.assert_raises(InvalidRPCMethod, lambda: self.repository.call('__init__', None))
@@ -326,5 +327,6 @@ class RemoteRepositoryTestCase(RepositoryTestCase):
 
 class RemoteRepositoryCheckTestCase(RepositoryCheckTestCase):
 
-    def open(self, create=False):
-        return RemoteRepository(Location('__testsuite__:' + os.path.join(self.tmppath, 'repository')), create=create)
+    def open(self, create=False, key_size=None):
+        return RemoteRepository(Location('__testsuite__:' + os.path.join(self.tmppath, 'repository')),
+                                create=create, key_size=key_size)

+ 1 - 0
docs/index.rst

@@ -53,6 +53,7 @@ User's Guide
    quickstart
    usage
    faq
+   tuning
    internals
 
 Getting help

+ 147 - 0
docs/tuning.rst

@@ -0,0 +1,147 @@
+.. _tuning:
+.. include:: global.rst.inc
+
+Tuning
+======
+
+General hints
+-------------
+CPU load, backup speed, memory and storage usage are covered below.
+
+As performance and resource usage depend on a lot of factors, you may need to
+tweak the parameters a bit and retry until you have found the best ones for
+your setup.
+
+Usually, the default parameters are selected for best speed under the assumption
+that you run a modern machine with fast CPU, fast I/O and a good amount of RAM.
+
+If you run an older or low-resource machine or your backup target or connection
+to it is slow, tweaking parameters might give significant speedups.
+
+Exclude crap data
+-----------------
+Maybe you don't want to back up:
+
+* cache / temporary files (they can be rebuilt / are useless)
+* specific directories / filenames / file extensions you do not need
+* backups (some people make backups of backups...)
+
+You can exclude these, so they don't waste time and space.
+
+Avoid scrolling
+---------------
+If you do benchmarks, avoid creating a lot of log output, especially if it
+means scrolling text in a window on a graphical user interface.
+
+Instead, produce much less log output or at least redirect the output to a log
+file, which is also much faster than scrolling.
+
+Speed (in general)
+------------------
+Keep an eye on CPU and I/O bounds. Try to find the sweet spot in the middle
+where it is not too much I/O bound and not too much CPU bound.
+
+I/O bound
+~~~~~~~~~
+If CPU load does not sum up to 1 core fully loaded while backing up, the
+process is likely I/O bound (can't read or write data fast enough).
+
+Maybe you want to try higher compression then so it has less data to write.
+Or get faster I/O, if possible.
+
+CPU bound
+~~~~~~~~~
+If you have 1 core fully loaded most of the time, but your backup seems slow,
+the process is likely CPU bound (can't compute fast enough).
+
+Maybe you want to try lower compression then so it has less to compute.
+Using a faster MAC or cipher method might also be an option.
+Or get a faster CPU.
+
+I/O speed
+---------
+From fast to slower:
+
+* fast local filesystem, SSD or HDD, via PCIe, SATA, USB
+* ssh connection to a remote server's borg instance
+* mounted network filesystems of a remote server
+
+Timing is influenced not only by throughput, but also by latency.
+
+Backup space needed
+-------------------
+If you mostly back up the same data every time, you will often save a lot of
+space due to deduplication - this works independently of compression.
+
+To avoid running out of space, regularly prune your backup archives according
+to your needs. Backups of the same machine that are close in time are usually
+very cheap (because most of the data is the same and gets deduplicated).
+
+Compression
+-----------
+If you have a fast backup source and destination and you are not low on backup space:
+Switch off compression; your backup will run faster and with less CPU load.
+
+If you just want to save a bit of space, but stay relatively fast:
+Try zlib level 1.
+
+If you have a very slow source or destination (e.g. a remote backup space via a
+network connection that is much slower than your local and remote storage):
+Try a higher zlib level or lzma.
+
+Authentication & MAC selection
+------------------------------
+Real MACs (Message Authentication Codes) can only be used when a secret key is
+available. A MAC authenticates your backup data and can detect malicious tampering.
+Without a key, a simple hash will be used (which helps to detect accidental
+data corruption, but cannot detect malicious data tampering).
+
+Older or simple 32bit machine architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Use sha256 (no key) or hmac-sha256 (key).
+
+64bit architecture, but no AES hardware acceleration in the CPU
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Use sha512-256 (no key) or hmac-sha512-256 (key).
+
+Modern 64bit CPU with AES hardware acceleration (AES-NI, PCLMULQDQ)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Use ghash (no key) or gmac (key).
+
+Encryption & Cipher selection
+-----------------------------
+Always encrypt your backups (and keep passphrase and key file [if any] safe).
+
+The cipher selection chooses between various AEAD ciphers (authenticated
+encryption with associated data); they all work as EtM (encrypt-then-MAC):
+
+Older or simple 32bit machine architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Use aes256-ctr + hmac-sha256.
+
+64bit architecture, but no AES hardware acceleration in the CPU
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Use aes256-ctr + hmac-sha512-256.
+
+Modern 64bit CPU with AES hardware acceleration (AES-NI, PCLMULQDQ)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Use aes256-gcm (AEAD 1-pass cipher).
+
+RAM usage
+---------
+Depending on the number of files and chunks in the repository, memory usage
+varies:
+
+* about 250+B RAM per file (for "files" cache)
+* about 44B RAM per 64kiB chunk (for "chunks" cache)
+* about 40B RAM per 64kiB chunk (for repository index, if remote repo is used,
+  this will be allocated on remote side)
+
+If you run into memory usage issues, your options are:
+
+* get more RAM (or more swapspace, speed will be slower)
+* disable the "files" cache, speed will be slower
+* have fewer files / chunks per repo
+
+Note: RAM compression likely won't help as a lot of that data is using
+msgpack, which is already rather efficient.

+ 7 - 3
setup.py

@@ -102,6 +102,12 @@ elif sys.platform.startswith('freebsd'):
 elif sys.platform == 'darwin':
     ext_modules.append(Extension('borg.platform_darwin', [platform_darwin_source]))
 
+# msgpack pure python data corruption was fixed in 0.4.6.
+# Also, we might use some rather recent API features.
+install_requires=['msgpack-python>=0.4.6', 'blosc>=1.2.5']
+if sys.version_info < (3, 3):
+    install_requires.append('backports.lzma')
+
 setup(
     name='borgbackup',
     version=versioneer.get_version(),
@@ -132,7 +138,5 @@ setup(
     scripts=['scripts/borg'],
     cmdclass=cmdclass,
     ext_modules=ext_modules,
-    # msgpack pure python data corruption was fixed in 0.4.6.
-    # Also, we might use some rather recent API features.
-    install_requires=['msgpack-python>=0.4.6']
+    install_requires=install_requires,
 )