Преглед изворног кода

Merge pull request #1077 from ThomasWaldmann/do-not-chunk-small-files

chunker: speed up remainder <= min_size case
enkore пре 9 година
родитељ
комит
33f3a70cf6
3 измењена фајла са 15 додатих и 5 уклоњених линија
  1. 11 3
      borg/_chunker.c
  2. 2 0
      borg/chunker.pyx
  3. 2 2
      borg/testsuite/archiver.py

+ 11 - 3
borg/_chunker.c

@@ -96,7 +96,7 @@ buzhash(const unsigned char *data, size_t len, const uint32_t *h)
 static uint32_t
 buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
 {
-    uint32_t lenmod = len & 0x1f;
+    uint32_t lenmod = len & 0x1f;  /* Note: replace by constant to get small speedup */
     return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add];
 }
 
@@ -249,11 +249,12 @@ chunker_process(Chunker *c)
             PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
         return NULL;
     }
-    while(c->remaining <= window_size && !c->eof) {
+    while(c->remaining < min_size + window_size + 1 && !c->eof) {  /* see assert in Chunker init */
         if(!chunker_fill(c)) {
             return NULL;
         }
     }
+    /* here we either are at eof ... */
     if(c->eof) {
         c->done = 1;
         if(c->remaining) {
@@ -268,8 +269,15 @@ chunker_process(Chunker *c)
             return NULL;
         }
     }
+    /* ... or we have at least min_size + window_size + 1 bytes remaining.
+     * We do not want to "cut" a chunk smaller than min_size and the hash
+     * window starts at the potential cutting place.
+     */
+    c->position += min_size;
+    c->remaining -= min_size;
+    n += min_size;
     sum = buzhash(c->data + c->position, window_size, c->table);
-    while(c->remaining > c->window_size && ((sum & chunk_mask) || n < min_size)) {
+    while(c->remaining > c->window_size && (sum & chunk_mask)) {
         sum = buzhash_update(sum, c->data[c->position],
                              c->data[c->position + window_size],
                              window_size, c->table);

+ 2 - 0
borg/chunker.pyx

@@ -23,6 +23,8 @@ cdef class Chunker:
     def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
         min_size = 1 << chunk_min_exp
         max_size = 1 << chunk_max_exp
+        # see chunker_process, first while loop condition, first term must be able to get True:
+        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
         hash_mask = (1 << hash_mask_bits) - 1
         self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
 

+ 2 - 2
borg/testsuite/archiver.py

@@ -1515,9 +1515,9 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         self.cmd('create', self.repository_location + '::test', 'input')
         archive_before = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')
         with patch.object(Cache, 'add_chunk', self._test_recreate_chunker_interrupt_patch()):
-            self.cmd('recreate', '-pv', '--chunker-params', '10,12,11,4095', self.repository_location)
+            self.cmd('recreate', '-pv', '--chunker-params', '10,13,11,4095', self.repository_location)
         assert 'test.recreate' in self.cmd('list', self.repository_location)
-        output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,12,11,4095', self.repository_location)
+        output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,13,11,4095', self.repository_location)
         assert 'Found test.recreate, will resume' in output
         assert 'Copied 1 chunks from a partially processed item' in output
         archive_after = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')