Chunker: split logic into FileFMAPReader and FileReader

`FileFMAPReader` deals with sparse files (data ranges vs. holes), either building a file map itself or using a given fmap, and yields blocks of a specific read_size via a generator.

`FileReader` uses a `FileFMAPReader` to fill an internal buffer and offers a `read` method for reading arbitrarily sized chunks from that buffer.

For both classes, instances now only deal with a single file.
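
For illustration, a minimal usage sketch of the two classes, based only on the signatures in this diff (the file name and all sizes are made up, not borg defaults):

```python
from borg.chunker import FileFMAPReader, FileReader

with open("example.bin", "rb") as f:
    # FileFMAPReader yields Chunk blocks of up to read_size bytes,
    # distinguishing data ranges from holes of a sparse file.
    reader = FileFMAPReader(fd=f, read_size=65536, sparse=True)
    for chunk in reader.blockify():
        print(chunk.meta["size"], chunk.meta["allocation"])

with open("example.bin", "rb") as f:
    # FileReader buffers FileFMAPReader output and serves reads of any size.
    reader = FileReader(fd=f, read_size=65536)
    while True:
        info = reader.read(4096, return_chunk_info=True)
        if info is None or info[2] == 0:
            break  # EOF
        data, allocation, size = info  # data is None for holes/zero runs
```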
Thomas Waldmann, 1 month ago · commit f036152789
2 changed files with 228 additions and 54 deletions:
  1. src/borg/chunker.pyi (+26, −3)
  2. src/borg/chunker.pyx (+202, −51)

src/borg/chunker.pyi (+26, −3)

@@ -21,10 +21,33 @@ class ChunkerFailing:
    def __init__(self, block_size: int, map: str) -> None: ...
    def chunkify(self, fd: BinaryIO = None, fh: int = -1) -> Iterator: ...

+class FileFMAPReader:
+    def __init__(
+        self,
+        *,
+        fd: BinaryIO = None,
+        fh: int = -1,
+        read_size: int = 0,
+        header_size: int = 0,
+        sparse: bool = False,
+        fmap: List[fmap_entry] = None,
+    ) -> None: ...
+    def _build_fmap(self) -> List[fmap_entry]: ...
+    def blockify(self) -> Iterator: ...
+
 class FileReader:
-    def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...
-    def _build_fmap(self, fd: BinaryIO = None, fh: int = -1) -> List[fmap_entry]: ...
-    def blockify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
+    def __init__(
+        self,
+        *,
+        fd: BinaryIO = None,
+        fh: int = -1,
+        read_size: int = 0,
+        header_size: int = 0,
+        sparse: bool = False,
+        fmap: List[fmap_entry] = None,
+    ) -> None: ...
+    def _fill_buffer(self) -> bool: ...
+    def read(self, size: int, return_chunk_info: bool = False) -> Any: ...

class ChunkerFixed:
    def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...

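For context, `fmap_entry` in the stub above is a `(start, size, is_data)` tuple, matching how `_build_fmap` constructs the header entry and how `blockify` iterates the map in the `.pyx` changes below. A hypothetical map for a file with a header and a hole:

```python
# Hypothetical fmap; the (start, size, is_data) format is inferred
# from _build_fmap and blockify in the diff below.
fmap = [
    (0, 4096, True),         # 4 KiB header range, always data
    (4096, 65536, True),     # data range
    (69632, 131072, False),  # hole: seeked over, yields CH_HOLE chunks
    (200704, 65536, True),   # data range
]
```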
src/borg/chunker.pyx (+202, −51)

@@ -165,7 +165,7 @@ class ChunkerFailing:
                return


-class FileReader:
+class FileFMAPReader:
     """
     """
     This is for reading blocks from a file.
     This is for reading blocks from a file.
 
 
@@ -180,29 +180,34 @@ class FileReader:
    Note: the last block of a data or hole range may be less than the block size,
          this is supported and not considered to be an error.
    """
-    def __init__(self, read_size, header_size=0, sparse=False):
-        self.read_size = read_size  # how much data we want to read at once
+    def __init__(self, *, fd=None, fh=-1, read_size=0, header_size=0, sparse=False, fmap=None):
+        assert fd is not None or fh >= 0
+        self.fd = fd
+        self.fh = fh
+        assert read_size > 0
        assert read_size <= len(zeros)
+        self.read_size = read_size  # how much data we want to read at once
+        assert header_size <= read_size
        self.header_size = header_size  # size of the first block
-        assert read_size >= header_size
        self.reading_time = 0.0  # time spent in reading/seeking
        # should borg try to do sparse input processing?
        # whether it actually can be done depends on the input file being seekable.
        self.try_sparse = sparse and has_seek_hole
+        self.fmap = fmap

-    def _build_fmap(self, fd=None, fh=-1):
+    def _build_fmap(self):
        started_fmap = time.monotonic()
        fmap = None
        if self.try_sparse:
            try:
                if self.header_size > 0:
                    header_map = [(0, self.header_size, True), ]
-                    dseek(self.header_size, os.SEEK_SET, fd, fh)
-                    body_map = list(sparsemap(fd, fh))
-                    dseek(0, os.SEEK_SET, fd, fh)
+                    dseek(self.header_size, os.SEEK_SET, self.fd, self.fh)
+                    body_map = list(sparsemap(self.fd, self.fh))
+                    dseek(0, os.SEEK_SET, self.fd, self.fh)
                else:
                    header_map = []
-                    body_map = list(sparsemap(fd, fh))
+                    body_map = list(sparsemap(self.fd, self.fh))
            except OSError as err:
                # seeking did not work
                pass
@@ -225,30 +230,27 @@ class FileReader:
        self.reading_time += time.monotonic() - started_fmap
        return fmap

-    def blockify(self, fd=None, fh=-1, fmap=None):
+    def blockify(self):
         """
         """
         Read <read_size> sized blocks from a file, optionally supporting a differently sized header block.
         Read <read_size> sized blocks from a file, optionally supporting a differently sized header block.
-
-        :param fd: Python file object
-        :param fh: OS-level file handle (if available),
-                   defaults to -1 which means not to use OS-level fd.
-        :param fmap: a file map, same format as generated by sparsemap
         """
         """
-        fmap =self._build_fmap(fd, fh) if fmap is None else fmap
+        if self.fmap is None:
+            self.fmap = self._build_fmap()
+
        offset = 0
        # note: the optional header block is implemented via the first fmap entry
-        for range_start, range_size, is_data in fmap:
+        for range_start, range_size, is_data in self.fmap:
            if range_start != offset:
                # this is for the case when the fmap does not cover the file completely,
                # e.g. it could be without the ranges of holes or of unchanged data.
                offset = range_start
-                dseek(offset, os.SEEK_SET, fd, fh)
+                dseek(offset, os.SEEK_SET, self.fd, self.fh)
            while range_size:
                started_reading = time.monotonic()
                wanted = min(range_size, self.read_size)
                if is_data:
                    # read block from the range
-                    data = dread(offset, wanted, fd, fh)
+                    data = dread(offset, wanted, self.fd, self.fh)
                    got = len(data)
                    if zeros.startswith(data):
                        data = None
@@ -257,20 +259,164 @@ class FileReader:
                        allocation = CH_DATA
                else:  # hole
                    # seek over block from the range
-                    pos = dseek(wanted, os.SEEK_CUR, fd, fh)
+                    pos = dseek(wanted, os.SEEK_CUR, self.fd, self.fh)
                    got = pos - offset
                    data = None
                    allocation = CH_HOLE
+                self.reading_time += time.monotonic() - started_reading
                if got > 0:
                    offset += got
                    range_size -= got
-                    self.reading_time += time.monotonic() - started_reading
                    yield Chunk(data, size=got, allocation=allocation)
                if got < wanted:
                    # we did not get enough data, looks like EOF.
                    return


+class FileReader:
+    """
+    This is a buffered reader for file data.
+
+    It maintains a buffer that is filled by using FileFMAPReader.blockify generator when needed.
+    The data in that buffer is consumed by clients calling FileReader.read.
+    """
+    def __init__(self, *, fd=None, fh=-1, read_size=0, header_size=0, sparse=False, fmap=None):
+        self.reader = FileFMAPReader(fd=fd, fh=fh, read_size=read_size, header_size=header_size, sparse=sparse, fmap=fmap)
+        self.buffer = []  # list of (data, meta) tuples
+        self.offset = 0  # offset into the first buffer object's data
+        self.remaining_bytes = 0  # total bytes available in buffer
+        self.blockify_gen = None  # generator from FileFMAPReader.blockify
+        self.fd = fd
+        self.fh = fh
+        self.fmap = fmap
+
+    def _fill_buffer(self):
+        """
+        Fill the buffer with more data from the blockify generator.
+        Returns True if more data was added, False if EOF.
+        """
+        if self.blockify_gen is None:
+            return False
+
+        try:
+            chunk = next(self.blockify_gen)
+            # Store both data and metadata in the buffer
+            self.buffer.append((chunk.data, chunk.meta))
+            self.remaining_bytes += chunk.meta["size"]
+            return True
+        except StopIteration:
+            self.blockify_gen = None
+            return False
+
+    def read(self, size, return_chunk_info=False):
+        """
+        Read up to 'size' bytes from the file.
+
+        :param size: Number of bytes to read
+        :param return_chunk_info: if True, return a tuple (data, allocation, size) instead of just data
+        :return: Bytes object containing the read data, or None if no data is available.
+                 If return_chunk_info is True, returns a tuple (data, allocation, size).
+        """
+        # Initialize if not already done
+        if self.blockify_gen is None:
+            self.buffer = []
+            self.offset = 0
+            self.remaining_bytes = 0
+            self.blockify_gen = self.reader.blockify()
+
+        # If we don't have enough data in the buffer, try to fill it
+        while self.remaining_bytes < size:
+            if not self._fill_buffer():
+                # No more data available, return what we have
+                break
+
+        # If we have no data at all, return None
+        if not self.buffer:
+            return None if not return_chunk_info else (None, None, 0)
+
+        # Get the first chunk from the buffer
+        data, meta = self.buffer[0]
+        chunk_size = meta["size"]
+        allocation = meta["allocation"]
+
+        # If we're returning chunk info and this is a non-data chunk, handle it specially
+        if return_chunk_info and (allocation != CH_DATA or data is None):
+            # For non-data chunks, we return the allocation type and size
+            size_to_return = min(size, chunk_size - self.offset)
+
+            # Update buffer state
+            if size_to_return == chunk_size - self.offset:
+                self.buffer.pop(0)
+                self.offset = 0
+            else:
+                self.offset += size_to_return
+
+            self.remaining_bytes -= size_to_return
+
+            return (None, allocation, size_to_return)
+
+        # For data chunks or when not returning chunk info, proceed as before
+        # Prepare to collect the requested data
+        result = bytearray()
+        bytes_to_read = min(size, self.remaining_bytes)
+        bytes_read = 0
+
+        # Read data from the buffer
+        while bytes_read < bytes_to_read and self.buffer:
+            data, meta = self.buffer[0]
+            chunk_size = meta["size"]
+            allocation = meta["allocation"]
+
+            # Skip non-data chunks if not returning chunk info
+            if (allocation != CH_DATA or data is None) and not return_chunk_info:
+                self.buffer.pop(0)
+                self.remaining_bytes -= chunk_size
+                continue
+
+            # If this is a non-data chunk and we're returning chunk info, break to handle it
+            if (allocation != CH_DATA or data is None) and return_chunk_info:
+                if bytes_read > 0:
+                    # We've already read some data, so return that first
+                    break
+                else:
+                    # No data read yet, return info about this non-data chunk
+                    size_to_return = min(size, chunk_size - self.offset)
+
+                    # Update buffer state
+                    if size_to_return == chunk_size - self.offset:
+                        self.buffer.pop(0)
+                        self.offset = 0
+                    else:
+                        self.offset += size_to_return
+
+                    self.remaining_bytes -= size_to_return
+
+                    return (None, allocation, size_to_return)
+
+            # Calculate how much we can read from this chunk
+            available = chunk_size - self.offset
+            to_read = min(available, bytes_to_read - bytes_read)
+
+            # Read the data
+            if to_read > 0:
+                result.extend(data[self.offset:self.offset + to_read])
+                bytes_read += to_read
+
+                # Update offset or remove chunk if fully consumed
+                if to_read < available:
+                    self.offset += to_read
+                else:
+                    self.offset = 0
+                    self.buffer.pop(0)
+
+                self.remaining_bytes -= to_read
+
+        if return_chunk_info:
+            return (bytes(result) if result else None, CH_DATA, bytes_read)
+        else:
+            return bytes(result) if result else None
+
+
class ChunkerFixed:
    """
    This is a simple chunker for input data with data usually staying at same
@@ -297,7 +443,8 @@ class ChunkerFixed:
        self.chunking_time = 0.0  # likely will stay close to zero - not much to do here.
        self.reader_block_size = self.block_size  # start simple
        assert self.reader_block_size % self.block_size == 0, "reader_block_size must be N * block_size"
-        self.reader = FileReader(self.reader_block_size, header_size=self.header_size, sparse=sparse)
+        self.reader = None
+        self.sparse = sparse

    def chunkify(self, fd=None, fh=-1, fmap=None):
        """
@@ -308,35 +455,39 @@ class ChunkerFixed:
                   defaults to -1 which means not to use OS-level fd.
        :param fmap: a file map, same format as generated by sparsemap
        """
-        in_header = self.header_size > 0  # first block is header, if header size is given
-        for block in self.reader.blockify(fd, fh, fmap):
-            if in_header:
-                assert self.header_size == block.meta["size"]
-                yield block  # just pass through the header block we get from the reader
-                in_header = False
-                continue
-            # not much to do in here
-            if self.reader_block_size == self.block_size:
-                # trivial, the reader already did all the work
-                yield block  # just pass through, avoid creating new objects
-            else:
-                # reader block size is a multiple of our block size
-                read_size = block.meta["size"]
-                allocation = block.meta["allocation"]
-                start = 0
-                while read_size:
-                    started_chunking = time.monotonic()
-                    size = min(read_size, self.block_size)
-                    if allocation == CH_DATA:
-                        data = block.data[start:start+size]  # TODO memoryview?
-                    elif allocation in (CH_ALLOC, CH_HOLE):
-                        data = None
-                    else:
-                        raise ValueError("unsupported allocation")
-                    self.chunking_time += time.monotonic() - started_chunking
-                    yield Chunk(data, size=size, allocation=allocation)
-                    start += size
-                    read_size -= size
+        # Initialize the reader with the file descriptors
+        self.reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size,
+                                header_size=self.header_size, sparse=self.sparse, fmap=fmap)
+
+        # Handle header if present
+        if self.header_size > 0:
+            # Read the header block using read
+            started_chunking = time.monotonic()
+            header_info = self.reader.read(self.header_size, return_chunk_info=True)
+            self.chunking_time += time.monotonic() - started_chunking
+
+            if header_info is not None and header_info[2] > 0:
+                # Unpack the header info
+                data, allocation, size = header_info
+                assert self.header_size == size
+                # Yield the header chunk
+                yield Chunk(data, size=size, allocation=allocation)
+
+        # Process the rest of the file using read
+        while True:
+            started_chunking = time.monotonic()
+            chunk_info = self.reader.read(self.block_size, return_chunk_info=True)
+            self.chunking_time += time.monotonic() - started_chunking
+
+            if chunk_info is None or chunk_info[2] == 0:
+                # End of file
+                break
+
+            # Unpack the chunk info
+            data, allocation, size = chunk_info
+
+            # Yield the chunk with the appropriate allocation type
+            yield Chunk(data, size=size, allocation=allocation)


# Cyclic polynomial / buzhash
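
Net effect of the refactor: `ChunkerFixed` no longer owns the buffering logic; each `chunkify` call builds a fresh `FileReader` and slices fixed-size blocks from it via `read(..., return_chunk_info=True)`. A hypothetical driver, assuming only the signatures shown above (block sizes, file name and `handle` are illustrative):

```python
from borg.chunker import ChunkerFixed

# 4 KiB fixed blocks, with a 512-byte header block; values are illustrative.
chunker = ChunkerFixed(4096, header_size=512, sparse=True)
with open("example.bin", "rb") as f:
    for chunk in chunker.chunkify(fd=f):
        # chunk.data is None for holes and all-zero blocks;
        # chunk.meta carries "size" and "allocation".
        handle(chunk)  # hypothetical consumer
```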