Pārlūkot izejas kodu

Merge pull request #7589 from ThomasWaldmann/chunker-params-1.2

relax chunker params validation, tests (1.2-maint)
TW 2 gadi atpakaļ
vecāks
revīzija
53bedfb63b

+ 12 - 12
src/borg/helpers/parseformat.py

@@ -95,10 +95,16 @@ def interval(s):
 
 
 def ChunkerParams(s):
+    def reject_or_warn(msg, reject):
+        if reject:
+            raise argparse.ArgumentTypeError(msg)
+        else:
+            logger.warning(msg)
+
     params = s.strip().split(',')
     count = len(params)
     if count == 0:
-        raise argparse.ArgumentTypeError('no chunker params given')
+        reject_or_warn('no chunker params given', True)
     algo = params[0].lower()
     if algo == CH_FIXED and 2 <= count <= 3:  # fixed, block_size[, header_size]
         block_size = int(params[1])
@@ -109,11 +115,9 @@ def ChunkerParams(s):
             # or in-memory chunk management.
             # choose the block (chunk) size wisely: if you have a lot of data and you cut
             # it into very small chunks, you are asking for trouble!
-            raise argparse.ArgumentTypeError('block_size must not be less than 64 Bytes')
+            reject_or_warn('block_size must not be less than 64 Bytes', False)
         if block_size > MAX_DATA_SIZE or header_size > MAX_DATA_SIZE:
-            raise argparse.ArgumentTypeError(
-                'block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE
-            )
+            reject_or_warn('block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE, True)
         return algo, block_size, header_size
     if algo == 'default' and count == 1:  # default
         return CHUNKER_PARAMS
@@ -121,16 +125,12 @@ def ChunkerParams(s):
     if algo == CH_BUZHASH and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
         chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4:])
         if not (chunk_min <= chunk_mask <= chunk_max):
-            raise argparse.ArgumentTypeError('required: chunk_min <= chunk_mask <= chunk_max')
+            reject_or_warn('required: chunk_min <= chunk_mask <= chunk_max', False)
         if chunk_min < 6:
             # see comment in 'fixed' algo check
-            raise argparse.ArgumentTypeError(
-                'min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)'
-            )
+            reject_or_warn('min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)', False)
         if chunk_max > 23:
-            raise argparse.ArgumentTypeError(
-                'max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)'
-            )
+            reject_or_warn('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)', True)
         return CH_BUZHASH, chunk_min, chunk_max, chunk_mask, window_size
     raise argparse.ArgumentTypeError('invalid chunker params')
 

+ 25 - 1
src/borg/testsuite/chunker_pytest.py

@@ -5,7 +5,7 @@ import tempfile
 import pytest
 
 from .chunker import cf
-from ..chunker import ChunkerFixed, sparsemap, has_seek_hole
+from ..chunker import Chunker, ChunkerFixed, sparsemap, has_seek_hole
 from ..constants import *  # NOQA
 
 BS = 4096  # fs block size
@@ -136,3 +136,27 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
     fn = str(tmpdir / fname)
     make_sparsefile(fn, sparse_map, header_size=header_size)
     get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)
+
+
+def test_buzhash_chunksize_distribution():
+    data = os.urandom(1048576)
+    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
+    chunker = Chunker(0, min_exp, max_exp, mask, 4095)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = sum((int(size == 2 ** min_exp) for size in chunk_sizes))
+    max_count = sum((int(size == 2 ** max_exp) for size in chunk_sizes))
+    print(f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+          f"min count: {min_count} max count: {max_count}")
+    # usually there will be about 64 chunks
+    assert 32 < chunks_count < 128
+    # chunks always must be between min and max (clipping must work):
+    assert min_chunksize_observed >= 2 ** min_exp
+    assert max_chunksize_observed <= 2 ** max_exp
+    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10

+ 10 - 6
src/borg/testsuite/helpers.py

@@ -349,16 +349,20 @@ def test_chunkerparams():
     assert ChunkerParams('fixed,4096') == ('fixed', 4096, 0)
     assert ChunkerParams('fixed,4096,200') == ('fixed', 4096, 200)
     # invalid values checking
+    borg2 = False  # for borg < 2, we only emit a warning, but do not raise ArgumentTypeError for some cases
     with pytest.raises(ArgumentTypeError):
         ChunkerParams('crap,1,2,3,4')  # invalid algo
-    with pytest.raises(ArgumentTypeError):
-        ChunkerParams('buzhash,5,7,6,4095')  # too small min. size
+    if borg2:
+        with pytest.raises(ArgumentTypeError):
+            ChunkerParams('buzhash,5,7,6,4095')  # too small min. size
     with pytest.raises(ArgumentTypeError):
         ChunkerParams('buzhash,19,24,21,4095')  # too big max. size
-    with pytest.raises(ArgumentTypeError):
-        ChunkerParams('buzhash,23,19,21,4095')  # violates min <= mask <= max
-    with pytest.raises(ArgumentTypeError):
-        ChunkerParams('fixed,63')  # too small block size
+    if borg2:
+        with pytest.raises(ArgumentTypeError):
+            ChunkerParams('buzhash,23,19,21,4095')  # violates min <= mask <= max
+    if borg2:
+        with pytest.raises(ArgumentTypeError):
+            ChunkerParams('fixed,63')  # too small block size
     with pytest.raises(ArgumentTypeError):
         ChunkerParams('fixed,%d,%d' % (MAX_DATA_SIZE + 1, 4096))  # too big block size
     with pytest.raises(ArgumentTypeError):