Refactor ChunkerFixed: move file reading into FileReader class

Replaced `ChunkerFixed`'s block-reading functionality with a new `FileReader` class to streamline code and improve separation of concerns. Adjusted `ChunkerFixed` to delegate file reading to `FileReader` while focusing on chunk assembly.

`FileReader` is intended to be useful for other chunkers as well, so they can easily implement sparse file reading / fmap support.
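
For illustration, a minimal sketch (not part of this commit) of how another chunker could consume `FileReader` directly. The class, `blockify`, and the `Chunk` metadata keys are taken from the diff below; the input file, the example sizes, and the `process` callback are hypothetical:

    from borg.chunker import FileReader

    reader = FileReader(4096, header_size=512, sparse=True)  # example sizes
    with open("disk.img", "rb") as fd:  # hypothetical input file
        for block in reader.blockify(fd):
            # block.meta carries "size" and "allocation"; allocation is
            # CH_DATA for read data, CH_HOLE/CH_ALLOC for sparse ranges
            # that were seeked over (block.data is None for those).
            process(block)  # hypothetical chunk-assembly step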
Thomas Waldmann 1 month ago
parent commit 2818a0c26e
2 changed files with 89 additions and 18 deletions
  1. src/borg/chunker.pyi  +5 -1
  2. src/borg/chunker.pyx  +84 -17

src/borg/chunker.pyi  +5 -1

@@ -21,9 +21,13 @@ class ChunkerFailing:
     def __init__(self, block_size: int, map: str) -> None: ...
     def chunkify(self, fd: BinaryIO = None, fh: int = -1) -> Iterator: ...
 
-class ChunkerFixed:
+class FileReader:
     def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...
     def _build_fmap(self, fd: BinaryIO = None, fh: int = -1) -> List[fmap_entry]: ...
+    def blockify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
+
+class ChunkerFixed:
+    def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...
     def chunkify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
 
 class Chunker:

src/borg/chunker.pyx  +84 -17

@@ -165,14 +165,9 @@ class ChunkerFailing:
                 return
 
 
-class ChunkerFixed:
+class FileReader:
     """
-    This is a simple chunker for input data with data usually staying at same
-    offset and / or with known block/record sizes:
-
-    - raw disk images
-    - block devices
-    - database files with simple header + fixed-size records layout
+    This is for reading blocks from a file.
 
     It optionally supports:
 
@@ -185,16 +180,18 @@ class ChunkerFixed:
     Note: the last block of a data or hole range may be less than the block size,
           this is supported and not considered to be an error.
     """
-    def __init__(self, block_size, header_size=0, sparse=False):
-        self.block_size = block_size
-        self.header_size = header_size
-        self.chunking_time = 0.0
+    def __init__(self, read_size, header_size=0, sparse=False):
+        self.read_size = read_size  # how much data we want to read at once
+        assert read_size <= len(zeros)
+        self.header_size = header_size  # size of the first block
+        assert read_size >= header_size
+        self.reading_time = 0.0  # time spent in reading/seeking
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
-        assert block_size <= len(zeros)
 
     def _build_fmap(self, fd=None, fh=-1):
+        started_fmap = time.monotonic()
         fmap = None
         if self.try_sparse:
             try:
@@ -225,11 +222,12 @@ class ChunkerFixed:
                 header_map = []
                 body_map = [(0, 2 ** 62, True), ]
             fmap = header_map + body_map
+        self.reading_time += time.monotonic() - started_fmap
         return fmap
 
-    def chunkify(self, fd=None, fh=-1, fmap=None):
+    def blockify(self, fd=None, fh=-1, fmap=None):
         """
-        Cut a file into chunks.
+        Read <read_size> sized blocks from a file, optionally supporting a differently sized header block.
 
         :param fd: Python file object
         :param fh: OS-level file handle (if available),
@@ -238,6 +236,7 @@ class ChunkerFixed:
         """
         fmap = self._build_fmap(fd, fh) if fmap is None else fmap
         offset = 0
+        # note: the optional header block is implemented via the first fmap entry
         for range_start, range_size, is_data in fmap:
             if range_start != offset:
                 # this is for the case when the fmap does not cover the file completely,
@@ -245,8 +244,8 @@ class ChunkerFixed:
                 offset = range_start
                 dseek(offset, os.SEEK_SET, fd, fh)
             while range_size:
-                started_chunking = time.monotonic()
-                wanted = min(range_size, self.block_size)
+                started_reading = time.monotonic()
+                wanted = min(range_size, self.read_size)
                 if is_data:
                     # read block from the range
                     data = dread(offset, wanted, fd, fh)
@@ -265,13 +264,81 @@ class ChunkerFixed:
                 if got > 0:
                     offset += got
                     range_size -= got
-                    self.chunking_time += time.monotonic() - started_chunking
+                    self.reading_time += time.monotonic() - started_reading
                     yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return
 
 
+class ChunkerFixed:
+    """
+    This is a simple chunker for input data with data usually staying at the same
+    offset and / or with known block/record sizes:
+
+    - raw disk images
+    - block devices
+    - database files with simple header + fixed-size records layout
+
+    It optionally supports:
+
+    - a header block of different size
+    - using a sparsemap to read only data ranges and seek over hole ranges
+      for sparse files.
+    - using an externally given filemap to read only specific ranges from
+      a file.
+
+    Note: the last block of a data or hole range may be less than the block size,
+          this is supported and not considered to be an error.
+    """
+    def __init__(self, block_size, header_size=0, sparse=False):
+        self.block_size = block_size
+        self.header_size = header_size
+        self.chunking_time = 0.0  # likely will stay close to zero - not much to do here.
+        self.reader_block_size = self.block_size  # start simple
+        assert self.reader_block_size % self.block_size == 0, "reader_block_size must be N * block_size"
+        self.reader = FileReader(self.reader_block_size, header_size=self.header_size, sparse=sparse)
+
+    def chunkify(self, fd=None, fh=-1, fmap=None):
+        """
+        Cut a file into chunks.
+
+        :param fd: Python file object
+        :param fh: OS-level file handle (if available),
+                   defaults to -1 which means not to use OS-level fd.
+        :param fmap: a file map, same format as generated by sparsemap
+        """
+        in_header = self.header_size > 0  # first block is header, if header size is given
+        for block in self.reader.blockify(fd, fh, fmap):
+            if in_header:
+                assert self.header_size == block.meta["size"]
+                yield block  # just pass through the header block we get from the reader
+                in_header = False
+                continue
+            # not much to do in here
+            if self.reader_block_size == self.block_size:
+                # trivial, the reader already did all the work
+                yield block  # just pass through, avoid creating new objects
+            else:
+                # reader block size is a multiple of our block size
+                read_size = block.meta["size"]
+                allocation = block.meta["allocation"]
+                start = 0
+                while read_size:
+                    started_chunking = time.monotonic()
+                    size = min(read_size, self.block_size)
+                    if allocation == CH_DATA:
+                        data = block.data[start:start+size]  # TODO memoryview?
+                    elif allocation in (CH_ALLOC, CH_HOLE):
+                        data = None
+                    else:
+                        raise ValueError("unsupported allocation")
+                    self.chunking_time += time.monotonic() - started_chunking
+                    yield Chunk(data, size=size, allocation=allocation)
+                    start += size
+                    read_size -= size
+
+
 # Cyclic polynomial / buzhash
 #
 # https://en.wikipedia.org/wiki/Rolling_hash