Implement IntegrityCheckedFile (#2502)

This is based on much earlier work of mine from October 2016, but is
simplified overall and uses different terminology ("hashing" and
"integrity checking" instead of "signing").

See #1688 for the full history.
enkore 8 years ago
parent
commit
820066da5d
2 changed files with 334 additions and 0 deletions
  1. src/borg/crypto/file_integrity.py (+182, -0)
  2. src/borg/testsuite/file_integrity.py (+152, -0)

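For orientation, here is a minimal usage sketch of the class this commit adds
(illustrative only, not part of the diff; the path and data are hypothetical
and mirror the test fixtures below):

    from borg.crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError

    # Writing: the hash is updated on-the-fly; on clean exit a JSON sidecar
    # ("<path>.integrity") with the collected digests is written.
    with IntegrityCheckedFile('/tmp/idx', write=True) as fd:
        fd.write(b'foo and bar')
        fd.hash_part('foopart')      # record an intermediate digest
        fd.write(b' other data')

    # Reading: reads must mirror the writes; any mismatch raises.
    try:
        with IntegrityCheckedFile('/tmp/idx', write=False) as fd:
            assert fd.read(len(b'foo and bar')) == b'foo and bar'
            fd.hash_part('foopart')  # verified against the stored digest
            fd.read()                # the rest; the 'final' digest is checked on exit
    except FileIntegrityError as exc:
        print('integrity check failed:', exc)
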
+ 182 - 0
src/borg/crypto/file_integrity.py

@@ -0,0 +1,182 @@
+import hashlib
+import io
+import json
+import os
+from hmac import compare_digest
+
+from ..helpers import IntegrityError
+from ..logger import create_logger
+
+logger = create_logger()
+
+
+class FileLikeWrapper:
+    def __enter__(self):
+        self.fd.__enter__()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.fd.__exit__(exc_type, exc_val, exc_tb)
+
+    def tell(self):
+        return self.fd.tell()
+
+    def seek(self, offset, whence=io.SEEK_SET):
+        return self.fd.seek(offset, whence)
+
+    def write(self, data):
+        return self.fd.write(data)
+
+    def read(self, n=None):
+        return self.fd.read(n)
+
+    def flush(self):
+        self.fd.flush()
+
+    def fileno(self):
+        return self.fd.fileno()
+
+
+class SHA512FileHashingWrapper(FileLikeWrapper):
+    """
+    Wrapper for file-like objects that computes a hash on-the-fly while reading/writing.
+
+    WARNING: Seeks should only be used to query the size of the file, not
+    to skip data, because skipped data isn't read and therefore not hashed into the digest.
+
+    Similarly, skipping while writing to create sparse files is not supported.
+
+    Data has to be read/written in a symmetric fashion, otherwise different
+    digests will be generated.
+
+    Note: When used as a context manager, read/write operations outside the enclosed scope
+    are illegal.
+    """
+
+    ALGORITHM = 'SHA512'
+
+    def __init__(self, backing_fd, write):
+        self.fd = backing_fd
+        self.writing = write
+        self.hash = hashlib.new(self.ALGORITHM)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is None:
+            self.hash_length()
+        super().__exit__(exc_type, exc_val, exc_tb)
+
+    def write(self, data):
+        """
+        Write *data* to backing file and update internal state.
+        """
+        n = super().write(data)
+        self.hash.update(data)
+        return n
+
+    def read(self, n=None):
+        """
+        Read *data* from backing file (*n* has the usual meaning) and update internal state.
+        """
+        data = super().read(n)
+        self.hash.update(data)
+        return data
+
+    def hexdigest(self):
+        """
+        Return the current digest as a hex string.
+
+        Note: this can be called multiple times.
+        """
+        return self.hash.hexdigest()
+
+    def update(self, data: bytes):
+        self.hash.update(data)
+
+    def hash_length(self, seek_to_end=False):
+        if seek_to_end:
+            # Add length of file to the hash to avoid problems if only a prefix is read.
+            self.seek(0, io.SEEK_END)
+        self.hash.update(str(self.tell()).encode())
+
+
+class FileIntegrityError(IntegrityError):
+    """File failed integrity check: {}"""
+
+
+class IntegrityCheckedFile(FileLikeWrapper):
+    def __init__(self, path, write, filename=None, override_fd=None):
+        self.path = path
+        self.writing = write
+        mode = 'wb' if write else 'rb'
+        self.file_fd = override_fd or open(path, mode)
+
+        self.fd = self.hasher = SHA512FileHashingWrapper(backing_fd=self.file_fd, write=write)
+
+        self.hash_filename(filename)
+
+        if write:
+            self.digests = {}
+        else:
+            self.digests = self.read_integrity_file(path, self.hasher)
+            # TODO: When we're reading but don't have any digests, i.e. no integrity file existed,
+            # TODO: then we could just short-circuit.
+
+    def hash_filename(self, filename=None):
+        # Hash the name of the file, but only the basename, i.e. not the path.
+        # In Borg the name itself encodes the context (e.g. index.N, cache, files),
+        # while the path doesn't matter, and moving e.g. a repository or cache directory is supported.
+        # Changing the name, however, implies a change of context that is not permissible.
+        filename = os.path.basename(filename or self.path)
+        self.hasher.update(('%10d' % len(filename)).encode())
+        self.hasher.update(filename.encode())
+
+    @staticmethod
+    def integrity_file_path(path):
+        return path + '.integrity'
+
+    @classmethod
+    def read_integrity_file(cls, path, hasher):
+        try:
+            with open(cls.integrity_file_path(path), 'r') as fd:
+                integrity_file = json.load(fd)
+                # Provisions for agility now, implementation later, but make sure the on-disk joint is oiled.
+                algorithm = integrity_file['algorithm']
+                if algorithm != hasher.ALGORITHM:
+                    logger.warning('Cannot verify integrity of %s: Unknown algorithm %r', path, algorithm)
+                    return
+                digests = integrity_file['digests']
+                # Require at least presence of the final digest
+                digests['final']
+                return digests
+        except FileNotFoundError:
+            logger.info('No integrity file found for %s', path)
+        except (OSError, ValueError, TypeError, KeyError) as e:
+            logger.warning('Could not read integrity file for %s: %s', path, e)
+            raise FileIntegrityError(path)
+
+    def hash_part(self, partname, is_final=False):
+        if not self.writing and not self.digests:
+            return
+        self.hasher.update(partname.encode())
+        self.hasher.hash_length(seek_to_end=is_final)
+        digest = self.hasher.hexdigest()
+        if self.writing:
+            self.digests[partname] = digest
+        elif self.digests and not compare_digest(self.digests.get(partname, ''), digest):
+            raise FileIntegrityError(self.path)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        exception = exc_type is not None
+        if not exception:
+            self.hash_part('final', is_final=True)
+        self.hasher.__exit__(exc_type, exc_val, exc_tb)
+        if exception:
+            return
+        if self.writing:
+            with open(self.integrity_file_path(self.path), 'w') as fd:
+                json.dump({
+                    'algorithm': self.hasher.ALGORITHM,
+                    'digests': self.digests,
+                }, fd)
+        elif self.digests:
+            logger.debug('Verified integrity of %s', self.path)

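As an aside (not part of the commit), the sidecar written by __exit__ above is
plain JSON and can be inspected directly; for the usage sketch near the top it
would look like this, with the hex digest values elided:

    import json

    with open('/tmp/idx.integrity') as fd:
        print(json.load(fd))
    # -> {'algorithm': 'SHA512', 'digests': {'foopart': '...', 'final': '...'}}
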
+ 152 - 0
src/borg/testsuite/file_integrity.py

@@ -0,0 +1,152 @@
+
+import pytest
+
+from ..crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError
+
+
+class TestReadIntegrityFile:
+    def test_no_integrity(self, tmpdir):
+        protected_file = tmpdir.join('file')
+        protected_file.write('1234')
+        assert IntegrityCheckedFile.read_integrity_file(str(protected_file), None) is None
+
+    def test_truncated_integrity(self, tmpdir):
+        protected_file = tmpdir.join('file')
+        protected_file.write('1234')
+        tmpdir.join('file.integrity').write('')
+        with pytest.raises(FileIntegrityError):
+            IntegrityCheckedFile.read_integrity_file(str(protected_file), None)
+
+    def test_unknown_algorithm(self, tmpdir):
+        class SomeHasher:
+            ALGORITHM = 'HMAC_FOOHASH9000'
+
+        protected_file = tmpdir.join('file')
+        protected_file.write('1234')
+        tmpdir.join('file.integrity').write('{"algorithm": "HMAC_SERIOUSHASH", "digests": "1234"}')
+        assert IntegrityCheckedFile.read_integrity_file(str(protected_file), SomeHasher()) is None
+
+    @pytest.mark.parametrize('json', (
+        '{"ALGORITHM": "HMAC_SERIOUSHASH", "digests": "1234"}',
+        '[]',
+        '1234.5',
+        '"A string"',
+        'Invalid JSON',
+    ))
+    def test_malformed(self, tmpdir, json):
+        protected_file = tmpdir.join('file')
+        protected_file.write('1234')
+        tmpdir.join('file.integrity').write(json)
+        with pytest.raises(FileIntegrityError):
+            IntegrityCheckedFile.read_integrity_file(str(protected_file), None)
+
+    def test_valid(self, tmpdir):
+        class SomeHasher:
+            ALGORITHM = 'HMAC_FOO1'
+
+        protected_file = tmpdir.join('file')
+        protected_file.write('1234')
+        tmpdir.join('file.integrity').write('{"algorithm": "HMAC_FOO1", "digests": {"final": "1234"}}')
+        assert IntegrityCheckedFile.read_integrity_file(str(protected_file), SomeHasher()) == {'final': '1234'}
+
+
+class TestIntegrityCheckedFile:
+    @pytest.fixture
+    def integrity_protected_file(self, tmpdir):
+        path = str(tmpdir.join('file'))
+        with IntegrityCheckedFile(path, write=True) as fd:
+            fd.write(b'foo and bar')
+        return path
+
+    def test_simple(self, tmpdir, integrity_protected_file):
+        assert tmpdir.join('file').check(file=True)
+        assert tmpdir.join('file.integrity').check(file=True)
+        with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
+            assert fd.read() == b'foo and bar'
+
+    def test_corrupted_file(self, integrity_protected_file):
+        with open(integrity_protected_file, 'ab') as fd:
+            fd.write(b' extra data')
+        with pytest.raises(FileIntegrityError):
+            with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
+                assert fd.read() == b'foo and bar extra data'
+
+    def test_corrupted_file_partial_read(self, integrity_protected_file):
+        with open(integrity_protected_file, 'ab') as fd:
+            fd.write(b' extra data')
+        with pytest.raises(FileIntegrityError):
+            with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
+                data = b'foo and bar'
+                assert fd.read(len(data)) == data
+
+    @pytest.mark.parametrize('new_name', (
+        'different_file',
+        'different_file.different_ext',
+    ))
+    def test_renamed_file(self, tmpdir, integrity_protected_file, new_name):
+        new_path = tmpdir.join(new_name)
+        tmpdir.join('file').move(new_path)
+        tmpdir.join('file.integrity').move(new_path + '.integrity')
+        with pytest.raises(FileIntegrityError):
+            with IntegrityCheckedFile(str(new_path), write=False) as fd:
+                assert fd.read() == b'foo and bar'
+
+    def test_moved_file(self, tmpdir, integrity_protected_file):
+        new_dir = tmpdir.mkdir('another_directory')
+        tmpdir.join('file').move(new_dir.join('file'))
+        tmpdir.join('file.integrity').move(new_dir.join('file.integrity'))
+        new_path = str(new_dir.join('file'))
+        with IntegrityCheckedFile(new_path, write=False) as fd:
+            assert fd.read() == b'foo and bar'
+
+    def test_no_integrity(self, tmpdir, integrity_protected_file):
+        tmpdir.join('file.integrity').remove()
+        with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
+            assert fd.read() == b'foo and bar'
+
+
+class TestIntegrityCheckedFileParts:
+    @pytest.fixture
+    def integrity_protected_file(self, tmpdir):
+        path = str(tmpdir.join('file'))
+        with IntegrityCheckedFile(path, write=True) as fd:
+            fd.write(b'foo and bar')
+            fd.hash_part('foopart')
+            fd.write(b' other data')
+        return path
+
+    def test_simple(self, integrity_protected_file):
+        with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
+            data1 = b'foo and bar'
+            assert fd.read(len(data1)) == data1
+            fd.hash_part('foopart')
+            assert fd.read() == b' other data'
+
+    def test_wrong_part_name(self, integrity_protected_file):
+        with pytest.raises(FileIntegrityError):
+            # Because one hash_part fails, the final digest check fails as well - again - even if we catch
+            # the failing hash_part. This is intentional: (1) it keeps the code simpler, (2) it's a good fail-safe
+            # against overly broad exception handling.
+            with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
+                data1 = b'foo and bar'
+                assert fd.read(len(data1)) == data1
+                with pytest.raises(FileIntegrityError):
+                    # This specific bit raises it directly
+                    fd.hash_part('barpart')
+                # Still explodes in the end.
+
+    @pytest.mark.parametrize('partial_read', (False, True))
+    def test_part_independence(self, integrity_protected_file, partial_read):
+        with open(integrity_protected_file, 'ab') as fd:
+            fd.write(b'some extra stuff that does not belong')
+        with pytest.raises(FileIntegrityError):
+            with IntegrityCheckedFile(integrity_protected_file, write=False) as fd:
+                data1 = b'foo and bar'
+                try:
+                    assert fd.read(len(data1)) == data1
+                    fd.hash_part('foopart')
+                except FileIntegrityError:
+                    assert False, 'This part must not raise, since this part is still valid.'
+                if not partial_read:
+                    fd.read()
+                # But overall it explodes with the final digest. Neat, eh?
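
To make the on-disk scheme concrete, this sketch (assumptions: the code as
committed above, a file named 'file' containing b'1234', no intermediate parts)
reproduces the 'final' digest by hand:

    import hashlib

    h = hashlib.new('SHA512')
    name = b'file'
    h.update(b'%10d' % len(name))         # length-prefixed basename (hash_filename)
    h.update(name)
    h.update(b'1234')                     # data as read/written through the wrapper
    h.update(b'final')                    # part name (hash_part)
    h.update(str(len(b'1234')).encode())  # stream position at EOF (hash_length)
    print(h.hexdigest())                  # equals digests['final'] in the sidecar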