
Refactor ChunkerFixed: move file reading into FileReader class

Replaced `ChunkerFixed`'s block-reading functionality with a new `FileReader` class to streamline code and improve separation of concerns. Adjusted `ChunkerFixed` to delegate file reading to `FileReader` while focusing on chunk assembly.

`FileReader` is intended to be useful for other chunkers as well, so they can easily implement sparse file reading / fmap support.
Thomas Waldmann 1 month ago
parent commit 2818a0c26e
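
To make that intent concrete, here is a minimal sketch (not part of this commit) of how another chunker could delegate to `FileReader`; the class name `PassthroughChunker` is invented, everything else follows the signatures in the diff below:

```python
# Hypothetical sketch -- PassthroughChunker is an invented name; FileReader,
# Chunk and the CH_* constants are the module's own, as used in the diff below.
class PassthroughChunker:
    def __init__(self, read_size, sparse=False):
        # FileReader handles sparse detection / fmap building internally
        self.reader = FileReader(read_size, sparse=sparse)

    def chunkify(self, fd=None, fh=-1, fmap=None):
        for block in self.reader.blockify(fd, fh, fmap):
            # block.meta["size"] is the block length; block.meta["allocation"]
            # is CH_DATA / CH_HOLE / CH_ALLOC; hole blocks carry data=None.
            yield block
```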

+ 5 - 1
src/borg/chunker.pyi

@@ -21,9 +21,13 @@ class ChunkerFailing:
     def __init__(self, block_size: int, map: str) -> None: ...
     def chunkify(self, fd: BinaryIO = None, fh: int = -1) -> Iterator: ...
 
-class ChunkerFixed:
+class FileReader:
     def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...
     def _build_fmap(self, fd: BinaryIO = None, fh: int = -1) -> List[fmap_entry]: ...
+    def blockify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
+
+class ChunkerFixed:
+    def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...
     def chunkify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
 
 class Chunker:

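For orientation, a sketch of the `fmap_entry` format the stubs above reference: judging from `_build_fmap`'s fallback `body_map = [(0, 2 ** 62, True), ]` in the `.pyx` diff below, each entry is a `(range_start, range_size, is_data)` tuple; the concrete offsets and sizes here are invented:

```python
# Hypothetical fmap a caller could pass to blockify()/chunkify() to read
# only specific ranges -- all numbers are made up for illustration.
fmap = [
    (0, 4096, True),          # data range: read these bytes
    (4096, 1048576, False),   # hole range: seek over it, no read
    (1052672, 8192, True),    # another data range
]
```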
+ 84 - 17
src/borg/chunker.pyx

@@ -165,14 +165,9 @@ class ChunkerFailing:
                 return
 
 
-class ChunkerFixed:
+class FileReader:
     """
     """
-    This is a simple chunker for input data with data usually staying at same
-    offset and / or with known block/record sizes:
-
-    - raw disk images
-    - block devices
-    - database files with simple header + fixed-size records layout
+    This is for reading blocks from a file.
 
     It optionally supports:
 
@@ -185,16 +180,18 @@ class ChunkerFixed:
     Note: the last block of a data or hole range may be less than the block size,
           this is supported and not considered to be an error.
     """
-    def __init__(self, block_size, header_size=0, sparse=False):
-        self.block_size = block_size
-        self.header_size = header_size
-        self.chunking_time = 0.0
+    def __init__(self, read_size, header_size=0, sparse=False):
+        self.read_size = read_size  # how much data we want to read at once
+        assert read_size <= len(zeros)
+        self.header_size = header_size  # size of the first block
+        assert read_size >= header_size
+        self.reading_time = 0.0  # time spent in reading/seeking
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
-        assert block_size <= len(zeros)
 
     def _build_fmap(self, fd=None, fh=-1):
+        started_fmap = time.monotonic()
         fmap = None
         if self.try_sparse:
             try:
@@ -225,11 +222,12 @@ class ChunkerFixed:
                 header_map = []
                 body_map = [(0, 2 ** 62, True), ]
             fmap = header_map + body_map
+        self.reading_time += time.monotonic() - started_fmap
         return fmap
 
-    def chunkify(self, fd=None, fh=-1, fmap=None):
+    def blockify(self, fd=None, fh=-1, fmap=None):
         """
         """
-        Cut a file into chunks.
+        Read <read_size> sized blocks from a file, optionally supporting a differently sized header block.
 
         :param fd: Python file object
         :param fh: OS-level file handle (if available),
@@ -238,6 +236,7 @@ class ChunkerFixed:
         """
         """
         fmap =self._build_fmap(fd, fh) if fmap is None else fmap
         fmap =self._build_fmap(fd, fh) if fmap is None else fmap
         offset = 0
         offset = 0
+        # note: the optional header block is implemented via the first fmap entry
         for range_start, range_size, is_data in fmap:
             if range_start != offset:
                 # this is for the case when the fmap does not cover the file completely,
@@ -245,8 +244,8 @@ class ChunkerFixed:
                 offset = range_start
                 dseek(offset, os.SEEK_SET, fd, fh)
             while range_size:
-                started_chunking = time.monotonic()
-                wanted = min(range_size, self.block_size)
+                started_reading = time.monotonic()
+                wanted = min(range_size, self.read_size)
                 if is_data:
                     # read block from the range
                     data = dread(offset, wanted, fd, fh)
@@ -265,13 +264,81 @@ class ChunkerFixed:
                 if got > 0:
                     offset += got
                     range_size -= got
-                    self.chunking_time += time.monotonic() - started_chunking
+                    self.reading_time += time.monotonic() - started_reading
                     yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return
 
 
+class ChunkerFixed:
+    """
+    This is a simple chunker for input data with data usually staying at same
+    offset and / or with known block/record sizes:
+
+    - raw disk images
+    - block devices
+    - database files with simple header + fixed-size records layout
+
+    It optionally supports:
+
+    - a header block of different size
+    - using a sparsemap to read only data ranges and seek over hole ranges
+      for sparse files.
+    - using an externally given filemap to read only specific ranges from
+      a file.
+
+    Note: the last block of a data or hole range may be less than the block size,
+          this is supported and not considered to be an error.
+    """
+    def __init__(self, block_size, header_size=0, sparse=False):
+        self.block_size = block_size
+        self.header_size = header_size
+        self.chunking_time = 0.0  # likely will stay close to zero - not much to do here.
+        self.reader_block_size = self.block_size  # start simple
+        assert self.reader_block_size % self.block_size == 0, "reader_block_size must be N * block_size"
+        self.reader = FileReader(self.reader_block_size, header_size=self.header_size, sparse=sparse)
+
+    def chunkify(self, fd=None, fh=-1, fmap=None):
+        """
+        Cut a file into chunks.
+
+        :param fd: Python file object
+        :param fh: OS-level file handle (if available),
+                   defaults to -1 which means not to use OS-level fd.
+        :param fmap: a file map, same format as generated by sparsemap
+        """
+        in_header = self.header_size > 0  # first block is header, if header size is given
+        for block in self.reader.blockify(fd, fh, fmap):
+            if in_header:
+                assert self.header_size == block.meta["size"]
+                yield block  # just pass through the header block we get from the reader
+                in_header = False
+                continue
+            # not much to do in here
+            if self.reader_block_size == self.block_size:
+                # trivial, the reader already did all the work
+                yield block  # just pass through, avoid creating new objects
+            else:
+                # reader block size is a multiple of our block size
+                read_size = block.meta["size"]
+                allocation = block.meta["allocation"]
+                start = 0
+                while read_size:
+                    started_chunking = time.monotonic()
+                    size = min(read_size, self.block_size)
+                    if allocation == CH_DATA:
+                        data = block.data[start:start+size]  # TODO memoryview?
+                    elif allocation in (CH_ALLOC, CH_HOLE):
+                        data = None
+                    else:
+                        raise ValueError("unsupported allocation")
+                    self.chunking_time += time.monotonic() - started_chunking
+                    yield Chunk(data, size=size, allocation=allocation)
+                    start += size
+                    read_size -= size
+
+
 # Cyclic polynomial / buzhash
 #
 # https://en.wikipedia.org/wiki/Rolling_hash
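
Finally, a hedged usage sketch of the refactored `ChunkerFixed` above; the file name and the `process`/`note_hole` callables are placeholders, the rest follows the diff:

```python
# Hypothetical caller -- "disk.img", process() and note_hole() are invented.
chunker = ChunkerFixed(block_size=4096, sparse=True)
with open("disk.img", "rb") as fd:
    for chunk in chunker.chunkify(fd):
        if chunk.meta["allocation"] == CH_DATA:
            process(chunk.data)             # bytes from a data range
        else:
            note_hole(chunk.meta["size"])   # hole/alloc range, data is None
```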