
Refactor ChunkerFixed: move file reading into FileReader class

Replaced `ChunkerFixed`'s block-reading functionality with a new `FileReader` class to streamline code and improve separation of concerns. Adjusted `ChunkerFixed` to delegate file reading to `FileReader` while focusing on chunk assembly.

`FileReader` is intended to be useful for other chunkers as well, so they can easily implement sparse file reading / fmap support.
Thomas Waldmann 1 month ago
parent commit 2818a0c26e
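
To make that intent concrete, here is a minimal sketch (not part of this commit) of how another chunker could delegate to `FileReader`; the class name `PassthroughChunker` is invented, everything else follows the signatures in the diff below:

```python
# Hypothetical sketch -- PassthroughChunker is an invented name; FileReader,
# Chunk and the CH_* constants are the module's own, as used in the diff below.
class PassthroughChunker:
    def __init__(self, read_size, sparse=False):
        # FileReader handles sparse detection / fmap building internally
        self.reader = FileReader(read_size, sparse=sparse)

    def chunkify(self, fd=None, fh=-1, fmap=None):
        for block in self.reader.blockify(fd, fh, fmap):
            # block.meta["size"] is the block length; block.meta["allocation"]
            # is CH_DATA / CH_HOLE / CH_ALLOC; hole blocks carry data=None.
            yield block
```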

+ 5 - 1
src/borg/chunker.pyi

@@ -21,9 +21,13 @@ class ChunkerFailing:
     def __init__(self, block_size: int, map: str) -> None: ...
     def chunkify(self, fd: BinaryIO = None, fh: int = -1) -> Iterator: ...
 
-class ChunkerFixed:
+class FileReader:
     def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...
     def _build_fmap(self, fd: BinaryIO = None, fh: int = -1) -> List[fmap_entry]: ...
+    def blockify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
+
+class ChunkerFixed:
+    def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...
     def chunkify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
 
 class Chunker:

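For orientation, a sketch of the `fmap_entry` format the stubs above reference: judging from `_build_fmap`'s fallback `body_map = [(0, 2 ** 62, True), ]` in the `.pyx` diff below, each entry is a `(range_start, range_size, is_data)` tuple; the concrete offsets and sizes here are invented:

```python
# Hypothetical fmap a caller could pass to blockify()/chunkify() to read
# only specific ranges -- all numbers are made up for illustration.
fmap = [
    (0, 4096, True),          # data range: read these bytes
    (4096, 1048576, False),   # hole range: seek over it, no read
    (1052672, 8192, True),    # another data range
]
```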
+ 84 - 17
src/borg/chunker.pyx

@@ -165,14 +165,9 @@ class ChunkerFailing:
                 return
 
 
-class ChunkerFixed:
+class FileReader:
     """
     """
-    This is a simple chunker for input data with data usually staying at same
-    offset and / or with known block/record sizes:
-
-    - raw disk images
-    - block devices
-    - database files with simple header + fixed-size records layout
+    This is for reading blocks from a file.
 
     It optionally supports:
 
@@ -185,16 +180,18 @@ class ChunkerFixed:
     Note: the last block of a data or hole range may be less than the block size,
           this is supported and not considered to be an error.
     """
-    def __init__(self, block_size, header_size=0, sparse=False):
-        self.block_size = block_size
-        self.header_size = header_size
-        self.chunking_time = 0.0
+    def __init__(self, read_size, header_size=0, sparse=False):
+        self.read_size = read_size  # how much data we want to read at once
+        assert read_size <= len(zeros)
+        self.header_size = header_size  # size of the first block
+        assert read_size >= header_size
+        self.reading_time = 0.0  # time spent in reading/seeking
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
-        assert block_size <= len(zeros)
 
     def _build_fmap(self, fd=None, fh=-1):
+        started_fmap = time.monotonic()
         fmap = None
         if self.try_sparse:
             try:
@@ -225,11 +222,12 @@ class ChunkerFixed:
                 header_map = []
                 body_map = [(0, 2 ** 62, True), ]
             fmap = header_map + body_map
+        self.reading_time += time.monotonic() - started_fmap
         return fmap
 
-    def chunkify(self, fd=None, fh=-1, fmap=None):
+    def blockify(self, fd=None, fh=-1, fmap=None):
         """
         """
-        Cut a file into chunks.
+        Read <read_size> sized blocks from a file, optionally supporting a differently sized header block.
 
         :param fd: Python file object
         :param fh: OS-level file handle (if available),
@@ -238,6 +236,7 @@ class ChunkerFixed:
         """
         """
         fmap =self._build_fmap(fd, fh) if fmap is None else fmap
         fmap =self._build_fmap(fd, fh) if fmap is None else fmap
         offset = 0
         offset = 0
+        # note: the optional header block is implemented via the first fmap entry
         for range_start, range_size, is_data in fmap:
             if range_start != offset:
                 # this is for the case when the fmap does not cover the file completely,
@@ -245,8 +244,8 @@ class ChunkerFixed:
                 offset = range_start
                 dseek(offset, os.SEEK_SET, fd, fh)
             while range_size:
-                started_chunking = time.monotonic()
-                wanted = min(range_size, self.block_size)
+                started_reading = time.monotonic()
+                wanted = min(range_size, self.read_size)
                 if is_data:
                     # read block from the range
                     data = dread(offset, wanted, fd, fh)
@@ -265,13 +264,81 @@ class ChunkerFixed:
                 if got > 0:
                     offset += got
                     range_size -= got
-                    self.chunking_time += time.monotonic() - started_chunking
+                    self.reading_time += time.monotonic() - started_reading
                     yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return
 
 
+class ChunkerFixed:
+    """
+    This is a simple chunker for input data with data usually staying at same
+    offset and / or with known block/record sizes:
+
+    - raw disk images
+    - block devices
+    - database files with simple header + fixed-size records layout
+
+    It optionally supports:
+
+    - a header block of different size
+    - using a sparsemap to read only data ranges and seek over hole ranges
+      for sparse files.
+    - using an externally given filemap to read only specific ranges from
+      a file.
+
+    Note: the last block of a data or hole range may be less than the block size,
+          this is supported and not considered to be an error.
+    """
+    def __init__(self, block_size, header_size=0, sparse=False):
+        self.block_size = block_size
+        self.header_size = header_size
+        self.chunking_time = 0.0  # likely will stay close to zero - not much to do here.
+        self.reader_block_size = self.block_size  # start simple
+        assert self.reader_block_size % self.block_size == 0, "reader_block_size must be N * block_size"
+        self.reader = FileReader(self.reader_block_size, header_size=self.header_size, sparse=sparse)
+
+    def chunkify(self, fd=None, fh=-1, fmap=None):
+        """
+        Cut a file into chunks.
+
+        :param fd: Python file object
+        :param fh: OS-level file handle (if available),
+                   defaults to -1 which means not to use OS-level fd.
+        :param fmap: a file map, same format as generated by sparsemap
+        """
+        in_header = self.header_size > 0  # first block is header, if header size is given
+        for block in self.reader.blockify(fd, fh, fmap):
+            if in_header:
+                assert self.header_size == block.meta["size"]
+                yield block  # just pass through the header block we get from the reader
+                in_header = False
+                continue
+            # not much to do in here
+            if self.reader_block_size == self.block_size:
+                # trivial, the reader already did all the work
+                yield block  # just pass through, avoid creating new objects
+            else:
+                # reader block size is a multiple of our block size
+                read_size = block.meta["size"]
+                allocation = block.meta["allocation"]
+                start = 0
+                while read_size:
+                    started_chunking = time.monotonic()
+                    size = min(read_size, self.block_size)
+                    if allocation == CH_DATA:
+                        data = block.data[start:start+size]  # TODO memoryview?
+                    elif allocation in (CH_ALLOC, CH_HOLE):
+                        data = None
+                    else:
+                        raise ValueError("unsupported allocation")
+                    self.chunking_time += time.monotonic() - started_chunking
+                    yield Chunk(data, size=size, allocation=allocation)
+                    start += size
+                    read_size -= size
+
+
 # Cyclic polynomial / buzhash
 #
 # https://en.wikipedia.org/wiki/Rolling_hash
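
Finally, a hedged usage sketch of the refactored `ChunkerFixed` above; the file name and the `process`/`note_hole` callables are placeholders, the rest follows the diff:

```python
# Hypothetical caller -- "disk.img", process() and note_hole() are invented.
chunker = ChunkerFixed(block_size=4096, sparse=True)
with open("disk.img", "rb") as fd:
    for chunk in chunker.chunkify(fd):
        if chunk.meta["allocation"] == CH_DATA:
            process(chunk.data)             # bytes from a data range
        else:
            note_hole(chunk.meta["size"])   # hole/alloc range, data is None
```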