
Merge pull request #1458 from ThomasWaldmann/merge-1.0-maint

Merge 1.0 maint
enkore 8 years ago
parent
commit
f8b2ffe999

+ 8 - 0
docs/changes.rst

@@ -130,6 +130,14 @@ Security fixes:
 
 - fix security issue with remote repository access, #1428
 
+Bug fixes:
+
+- do not write objects to repository that are bigger than the allowed size,
+  borg will reject reading them, #1451.
+  IMPORTANT: if you created archives with many millions of files or
+             directories, please verify if you can open them successfully,
+             e.g. try a "borg list REPO::ARCHIVE".
+
 
 Version 1.0.7rc1 (2016-08-05)
 -----------------------------

+ 38 - 0
docs/usage/debug-dump-repo-objs.rst.inc

@@ -0,0 +1,38 @@
+.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit!
+
+.. _borg_debug-dump-repo-objs:
+
+borg debug-dump-repo-objs
+-------------------------
+::
+
+    usage: borg debug-dump-repo-objs [-h] [--critical] [--error] [--warning]
+                                     [--info] [--debug] [--lock-wait N]
+                                     [--show-rc] [--no-files-cache] [--umask M]
+                                     [--remote-path PATH]
+                                     REPOSITORY
+    
+    dump (decrypted, decompressed) repo objects
+    
+    positional arguments:
+      REPOSITORY            repo to dump
+    
+    optional arguments:
+      -h, --help            show this help message and exit
+      --critical            work on log level CRITICAL
+      --error               work on log level ERROR
+      --warning             work on log level WARNING (default)
+      --info, -v, --verbose
+                            work on log level INFO
+      --debug               work on log level DEBUG
+      --lock-wait N         wait for the lock, but max. N seconds (default: 1).
+      --show-rc             show/log the return code (rc)
+      --no-files-cache      do not load/update the file metadata cache used to
+                            detect unchanged files
+      --umask M             set umask to M (local and remote, default: 0077)
+      --remote-path PATH    set remote path to executable (default: "borg")
+    
+Description
+~~~~~~~~~~~
+
+This command dumps raw (but decrypted and decompressed) repo objects to files.

+ 38 - 30
src/borg/compress.pyx

@@ -1,3 +1,4 @@
+import threading
 import zlib
 try:
     import lzma
@@ -7,6 +8,18 @@ except ImportError:
 cdef extern from "lz4.h":
     int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
     int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
+    int LZ4_compressBound(int inputSize) nogil
+
+
+thread_local = threading.local()
+thread_local.buffer = bytes()
+
+
+cdef char *get_buffer(size):
+    size = int(size)
+    if len(thread_local.buffer) < size:
+        thread_local.buffer = bytes(size)
+    return <char *> thread_local.buffer
 
 
 cdef class CompressorBase:
@@ -52,40 +65,30 @@ class CNONE(CompressorBase):
         return data
 
 
-cdef class LZ4(CompressorBase):
+class LZ4(CompressorBase):
     """
     raw LZ4 compression / decompression (liblz4).
 
     Features:
         - lz4 is super fast
         - wrapper releases CPython's GIL to support multithreaded code
-        - buffer given by caller, avoiding frequent reallocation and buffer duplication
         - uses safe lz4 methods that never go beyond the end of the output buffer
-
-    But beware:
-        - this is not very generic, the given buffer MUST be large enough to
-          handle all compression or decompression output (or it will fail).
-        - you must not do method calls to the same LZ4 instance from different
-          threads at the same time - create one LZ4 instance per thread!
     """
     ID = b'\x01\x00'
     name = 'lz4'
 
-    cdef char *buffer  # helper buffer for (de)compression output
-    cdef int bufsize  # size of this buffer
-
-    def __cinit__(self, **kwargs):
-        buffer = kwargs['buffer']
-        self.buffer = buffer
-        self.bufsize = len(buffer)
+    def __init__(self, **kwargs):
+        pass
 
     def compress(self, idata):
         if not isinstance(idata, bytes):
             idata = bytes(idata)  # code below does not work with memoryview
         cdef int isize = len(idata)
-        cdef int osize = self.bufsize
+        cdef int osize
         cdef char *source = idata
-        cdef char *dest = self.buffer
+        cdef char *dest
+        osize = LZ4_compressBound(isize)
+        dest = get_buffer(osize)
         with nogil:
             osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
         if not osize:
@@ -97,15 +100,25 @@ cdef class LZ4(CompressorBase):
             idata = bytes(idata)  # code below does not work with memoryview
         idata = super().decompress(idata)
         cdef int isize = len(idata)
-        cdef int osize = self.bufsize
+        cdef int osize
+        cdef int rsize
         cdef char *source = idata
-        cdef char *dest = self.buffer
-        with nogil:
-            osize = LZ4_decompress_safe(source, dest, isize, osize)
-        if osize < 0:
-            # malformed input data, buffer too small, ...
-            raise Exception('lz4 decompress failed')
-        return dest[:osize]
+        cdef char *dest
+        # a bit more than 8MB is enough for the usual data sizes yielded by the chunker.
+        # allocate more if isize * 3 is already bigger, to avoid having to resize often.
+        osize = max(int(1.1 * 2**23), isize * 3)
+        while True:
+            dest = get_buffer(osize)
+            with nogil:
+                rsize = LZ4_decompress_safe(source, dest, isize, osize)
+            if rsize >= 0:
+                break
+            if osize > 2 ** 30:
+                # this is insane, get out of here
+                raise Exception('lz4 decompress failed')
+            # likely the buffer was too small, get a bigger one:
+            osize = int(1.5 * osize)
+        return dest[:rsize]
 
 
 class LZMA(CompressorBase):
@@ -192,8 +205,3 @@ class Compressor:
                 return cls(**self.params).decompress(data)
         else:
             raise ValueError('No decompressor for this data found: %r.', data[:2])
-
-
-# a buffer used for (de)compression result, which can be slightly bigger
-# than the chunk buffer in the worst (incompressible data) case, add 10%:
-COMPR_BUFFER = bytes(int(1.1 * 2 ** 23))  # CHUNK_MAX_EXP == 23
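For context on the compress.pyx change: the per-instance, caller-provided buffer is replaced by a thread-local scratch buffer that is grown on demand, and LZ4 decompression now retries with a 1.5x larger buffer until the output fits (or a ~1 GiB sanity cap is exceeded). A rough pure-Python sketch of that pattern, with `decompress_into` as a hypothetical stand-in for LZ4_decompress_safe::

    import threading

    _local = threading.local()

    def get_buffer(size):
        # reuse one scratch buffer per thread, enlarging it only when needed
        buf = getattr(_local, 'buffer', None)
        if buf is None or len(buf) < size:
            buf = bytearray(size)
            _local.buffer = buf
        return buf

    def decompress_retrying(decompress_into, idata, start_size=int(1.1 * 2**23)):
        # decompress_into(src, dest) returns the output length, or a negative
        # value if dest was too small (mirroring LZ4_decompress_safe)
        osize = max(start_size, 3 * len(idata))
        while True:
            dest = get_buffer(osize)
            rsize = decompress_into(idata, dest)
            if rsize >= 0:
                return bytes(dest[:rsize])
            if osize > 2 ** 30:
                # giving up, this should never be needed for sane input
                raise Exception('lz4 decompress failed')
            osize = int(1.5 * osize)  # buffer was likely too small, grow and retry

Because the buffer now lives in thread-local storage, callers no longer pass a pre-sized buffer= argument or keep one LZ4 instance per thread, which is what the removed docstring warnings were about.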

+ 5 - 9
src/borg/helpers.py

@@ -38,7 +38,7 @@ from . import crypto
 from . import hashindex
 from . import shellpattern
 from .constants import *  # NOQA
-from .compress import COMPR_BUFFER, get_compressor
+from .compress import get_compressor
 
 # meta dict, data bytes
 _Chunk = namedtuple('_Chunk', 'meta data')
@@ -470,8 +470,6 @@ def ChunkerParams(s):
         return CHUNKER_PARAMS
     chunk_min, chunk_max, chunk_mask, window_size = s.split(',')
     if int(chunk_max) > 23:
-        # do not go beyond 2**23 (8MB) chunk size now,
-        # COMPR_BUFFER can only cope with up to this size
         raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
     return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size)
 
@@ -1538,16 +1536,14 @@ class CompressionDecider2:
         # if we compress the data here to decide, we can even update the chunk data
         # and modify the metadata as desired.
         compr_spec = chunk.meta.get('compress', self.compression)
-        compr_args = dict(buffer=COMPR_BUFFER)
-        compr_args.update(compr_spec)
-        if compr_args['name'] == 'auto':
+        if compr_spec['name'] == 'auto':
             # we did not decide yet, use heuristic:
-            compr_args, chunk = self.heuristic_lz4(compr_args, chunk)
-        return compr_args, chunk
+            compr_spec, chunk = self.heuristic_lz4(compr_spec, chunk)
+        return compr_spec, chunk
 
     def heuristic_lz4(self, compr_args, chunk):
         meta, data = chunk
-        lz4 = get_compressor('lz4', buffer=compr_args['buffer'])
+        lz4 = get_compressor('lz4')
         cdata = lz4.compress(data)
         data_len = len(data)
         cdata_len = len(cdata)
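Background for the helpers.py hunk: with the buffer argument gone, the 'auto' compression heuristic simply instantiates a buffer-less LZ4 compressor and compares compressed versus original length to decide whether the chunk is worth compressing with the configured algorithm. A hedged sketch of that kind of decision; the 0.97 ratio threshold is an illustrative assumption, not taken from this diff::

    from borg.compress import get_compressor

    def decide_by_lz4_ratio(data, configured_spec, ratio_threshold=0.97):
        # cheap trial compression with lz4; if the chunk barely shrinks,
        # treat it as incompressible and skip the heavier configured algorithm
        lz4 = get_compressor('lz4')
        cdata = lz4.compress(data)
        if len(cdata) / len(data) < ratio_threshold:
            return configured_spec    # compressible: use the configured spec
        return dict(name='none')      # incompressible: store uncompressed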

+ 2 - 2
src/borg/key.py

@@ -13,7 +13,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from .constants import *  # NOQA
-from .compress import Compressor, COMPR_BUFFER, get_compressor
+from .compress import Compressor, get_compressor
 from .crypto import AES, bytes_to_long, long_to_bytes, bytes_to_int, num_aes_blocks, hmac_sha256
 from .helpers import Chunk
 from .helpers import Error, IntegrityError
@@ -89,7 +89,7 @@ class KeyBase:
         self.repository = repository
         self.target = None  # key location file path / repo obj
         self.compression_decider2 = CompressionDecider2(CompressionSpec('none'))
-        self.compressor = Compressor('none', buffer=COMPR_BUFFER)  # for decompression
+        self.compressor = Compressor('none')  # for decompression
 
     def id_hash(self, data):
         """Return HMAC hash using the "id" HMAC key

+ 16 - 4
src/borg/repository.py

@@ -909,9 +909,14 @@ class LoggedIO:
             key = None
         else:
             raise TypeError("_read called with unsupported format")
-        if size > MAX_OBJECT_SIZE or size < fmt.size:
-            raise IntegrityError('Invalid segment entry size [segment {}, offset {}]'.format(
-                segment, offset))
+        if size > MAX_OBJECT_SIZE:
+            # if you get this on an archive made with borg < 1.0.7 and millions of files and
+            # you need to restore it, you can disable this check by using "if False:" above.
+            raise IntegrityError('Invalid segment entry size {} - too big [segment {}, offset {}]'.format(
+                size, segment, offset))
+        if size < fmt.size:
+            raise IntegrityError('Invalid segment entry size {} - too small [segment {}, offset {}]'.format(
+                size, segment, offset))
         length = size - fmt.size
         if read_data:
             data = fd.read(length)
@@ -942,8 +947,12 @@ class LoggedIO:
         return size, tag, key, data
 
     def write_put(self, id, data, raise_full=False):
+        data_size = len(data)
+        if data_size > MAX_DATA_SIZE:
+            # this would push the segment entry size beyond MAX_OBJECT_SIZE.
+            raise IntegrityError('More than allowed put data [{} > {}]'.format(data_size, MAX_DATA_SIZE))
         fd = self.get_write_fd(raise_full=raise_full)
-        size = len(data) + self.put_header_fmt.size
+        size = data_size + self.put_header_fmt.size
         offset = self.offset
         header = self.header_no_crc_fmt.pack(size, TAG_PUT)
         crc = self.crc_fmt.pack(crc32(data, crc32(id, crc32(header))) & 0xffffffff)
@@ -972,3 +981,6 @@ class LoggedIO:
         crc = self.crc_fmt.pack(crc32(header) & 0xffffffff)
         fd.write(b''.join((crc, header)))
         self.close_segment()
+
+
+MAX_DATA_SIZE = MAX_OBJECT_SIZE - LoggedIO.put_header_fmt.size
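The repository.py change pins down the relationship being enforced: a PUT segment entry is a fixed header plus the payload, so any payload larger than MAX_OBJECT_SIZE minus the header size must be rejected at write time, because _read() would refuse the resulting entry. A standalone sketch of the arithmetic; the 20 MiB MAX_OBJECT_SIZE value and the exact header struct are assumptions about borg's constants at the time, not part of this diff::

    import struct

    # assumed layout of a PUT entry header: crc32, size, tag, 32-byte chunk id
    put_header_fmt = struct.Struct('<IIB32s')             # 41 bytes before the payload
    MAX_OBJECT_SIZE = 20 * 1024 * 1024                     # assumed per-entry limit
    MAX_DATA_SIZE = MAX_OBJECT_SIZE - put_header_fmt.size

    def check_put_size(data):
        # mirrors the new write_put() guard: never write an entry that
        # _read() would later reject as "too big"
        if len(data) > MAX_DATA_SIZE:
            raise ValueError('More than allowed put data [{} > {}]'.format(
                len(data), MAX_DATA_SIZE))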

+ 19 - 10
src/borg/testsuite/compress.py

@@ -1,3 +1,4 @@
+import os
 import zlib
 try:
     import lzma
@@ -11,13 +12,13 @@ from ..compress import get_compressor, Compressor, CNONE, ZLIB, LZ4
 
 buffer = bytes(2**16)
 data = b'fooooooooobaaaaaaaar' * 10
-params = dict(name='zlib', level=6, buffer=buffer)
+params = dict(name='zlib', level=6)
 
 
 def test_get_compressor():
     c = get_compressor(name='none')
     assert isinstance(c, CNONE)
-    c = get_compressor(name='lz4', buffer=buffer)
+    c = get_compressor(name='lz4')
     assert isinstance(c, LZ4)
     c = get_compressor(name='zlib')
     assert isinstance(c, ZLIB)
@@ -35,13 +36,21 @@ def test_cnull():
 
 
 def test_lz4():
-    c = get_compressor(name='lz4', buffer=buffer)
+    c = get_compressor(name='lz4')
     cdata = c.compress(data)
     assert len(cdata) < len(data)
     assert data == c.decompress(cdata)
     assert data == Compressor(**params).decompress(cdata)  # autodetect
 
 
+def test_lz4_buffer_allocation():
+    # test with a rather huge data object to see if buffer allocation / resizing works
+    data = os.urandom(50 * 2**20)  # 50MiB incompressible data
+    c = get_compressor(name='lz4')
+    cdata = c.compress(data)
+    assert data == c.decompress(cdata)
+
+
 def test_zlib():
     c = get_compressor(name='zlib')
     cdata = c.compress(data)
@@ -83,16 +92,16 @@ def test_zlib_compat():
 
 def test_compressor():
     params_list = [
-        dict(name='none', buffer=buffer),
-        dict(name='lz4', buffer=buffer),
-        dict(name='zlib', level=0, buffer=buffer),
-        dict(name='zlib', level=6, buffer=buffer),
-        dict(name='zlib', level=9, buffer=buffer),
+        dict(name='none'),
+        dict(name='lz4'),
+        dict(name='zlib', level=0),
+        dict(name='zlib', level=6),
+        dict(name='zlib', level=9),
     ]
     if lzma:
         params_list += [
-            dict(name='lzma', level=0, buffer=buffer),
-            dict(name='lzma', level=6, buffer=buffer),
+            dict(name='lzma', level=0),
+            dict(name='lzma', level=6),
             # we do not test lzma on level 9 because of the huge memory needs
         ]
     for params in params_list:

+ 8 - 1
src/borg/testsuite/repository.py

@@ -13,7 +13,7 @@ from ..helpers import Location
 from ..helpers import IntegrityError
 from ..locking import Lock, LockFailed
 from ..remote import RemoteRepository, InvalidRPCMethod, ConnectionClosedWithHint, handle_remote_line
-from ..repository import Repository, LoggedIO, MAGIC
+from ..repository import Repository, LoggedIO, MAGIC, MAX_DATA_SIZE
 from . import BaseTestCase
 
 
@@ -142,6 +142,13 @@ class RepositoryTestCase(RepositoryTestCaseBase):
         self.assert_equal(second_half, all[50:])
         self.assert_equal(len(self.repository.list(limit=50)), 50)
 
+    def test_max_data_size(self):
+        max_data = b'x' * MAX_DATA_SIZE
+        self.repository.put(b'00000000000000000000000000000000', max_data)
+        self.assert_equal(self.repository.get(b'00000000000000000000000000000000'), max_data)
+        self.assert_raises(IntegrityError,
+                           lambda: self.repository.put(b'00000000000000000000000000000001', max_data + b'x'))
+
 
 class LocalRepositoryTestCase(RepositoryTestCaseBase):
     # test case that doesn't work with remote repositories