Bläddra i källkod

detect all-zero chunks, avoid hashing them

comparing a chunk against a buffer of zeros is quicker than hashing it.
for non-zero data, the comparison fails quickly (at the first non-zero byte).
Thomas Waldmann 4 år sedan
förälder
incheckning
6d0f9a52eb
3 ändrade filer med 21 tillägg och 10 borttagningar
  1. + 3 - 2
      src/borg/archive.py
  2. + 17 - 7
      src/borg/chunker.pyx
  3. + 1 - 1
      src/borg/testsuite/chunker.py

+ 3 - 2
src/borg/archive.py

@@ -1143,7 +1143,7 @@ class ChunksProcessor:
                 if allocation == CH_DATA:
                     data = chunk.data
                     chunk_id = self.key.id_hash(data)
-                elif allocation == CH_HOLE:
+                elif allocation in (CH_HOLE, CH_ALLOC):
                     size = chunk.meta['size']
                     data = self.zeros[:size]
                     try:
@@ -2002,7 +2002,8 @@ class ArchiveRecreater:
         target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor)
 
     def chunk_processor(self, target, chunk):
-        # as this is recreate (we do not read from the fs), we never have holes here
+        # as this is recreate (we do not read from the fs), we never have CH_HOLE here,
+        # but we need to add support for CH_ALLOC - TODO!
         assert chunk.meta['allocation'] == CH_DATA
         data = chunk.data
         chunk_id = self.key.id_hash(data)

+ 17 - 7
src/borg/chunker.pyx

@@ -6,7 +6,7 @@ import errno
 import os
 from collections import namedtuple
 
-from .constants import CH_DATA, CH_HOLE
+from .constants import CH_DATA, CH_ALLOC, CH_HOLE
 
 from libc.stdlib cimport free
 
@@ -35,12 +35,16 @@ _Chunk.__doc__ = """\
 
     meta is always a dictionary, data depends on allocation.
 
-    on disk data:
-        meta = {'allocation' = CH_DATA, 'size' = size_of_data }
+    data chunk read from a DATA range of a file (not from a sparse hole):
+        meta = {'allocation' = CH_DATA, 'size' = size_of_chunk }
         data = read_data [bytes or memoryview]
 
-    hole in a sparse file:
-        meta = {'allocation' = CH_HOLE, 'size' = size_of_hole }
+    all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero):
+        meta = {'allocation' = CH_ALLOC, 'size' = size_of_chunk }
+        data = None
+
+    all-zero chunk from a HOLE range of a file (from a sparse hole):
+        meta = {'allocation' = CH_HOLE, 'size' = size_of_chunk }
         data = None
 """
 
@@ -201,15 +205,21 @@ class ChunkerFixed:
                     # read block from the range
                     data = dread(offset, wanted, fd, fh)
                     got = len(data)
+                    if data == self.zeros[:got]:
+                        data = None
+                        is_zero = True
+                    else:
+                        is_zero = False
                 else:  # hole
                     # seek over block from the range
                     pos = dseek(wanted, os.SEEK_CUR, fd, fh)
-                    data = None
                     got = pos - offset
+                    data = None
+                    is_zero = True
                 if got > 0:
                     offset += got
                     range_size -= got
-                    yield Chunk(data, size=got, allocation=CH_DATA if is_data else CH_HOLE)
+                    yield Chunk(data, size=got, allocation=(CH_ALLOC if is_zero else CH_DATA) if is_data else CH_HOLE)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return

+ 1 - 1
src/borg/testsuite/chunker.py

@@ -15,7 +15,7 @@ def cf(chunks):
         if chunk.meta['allocation'] == CH_DATA:
             assert len(chunk.data) == chunk.meta['size']
             return bytes(chunk.data)  # make sure we have bytes, not memoryview
-        if chunk.meta['allocation'] == CH_HOLE:
+        if chunk.meta['allocation'] in (CH_HOLE, CH_ALLOC):
             assert chunk.data is None
             return chunk.meta['size']
         assert False, "unexpected allocation value"