Chunker: split logic into FileFMAPReader and FileReader

`FileFMAPReader` deals with sparse files (data ranges vs. holes), either building a file map itself or using a given fmap, and yields blocks of a specific read_size via a generator.

`FileReader` uses a `FileFMAPReader` to fill an internal buffer and offers a `read` method for reading arbitrarily sized chunks from that buffer.

For both classes, instances now only deal with a single file.
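
For illustration, a minimal usage sketch of the two classes, based only on the signatures in this diff (the file name and all sizes are made up, not borg defaults):

```python
from borg.chunker import FileFMAPReader, FileReader

with open("example.bin", "rb") as f:
    # FileFMAPReader yields Chunk blocks of up to read_size bytes,
    # distinguishing data ranges from holes of a sparse file.
    reader = FileFMAPReader(fd=f, read_size=65536, sparse=True)
    for chunk in reader.blockify():
        print(chunk.meta["size"], chunk.meta["allocation"])

with open("example.bin", "rb") as f:
    # FileReader buffers FileFMAPReader output and serves reads of any size.
    reader = FileReader(fd=f, read_size=65536)
    while True:
        info = reader.read(4096, return_chunk_info=True)
        if info is None or info[2] == 0:
            break  # EOF
        data, allocation, size = info  # data is None for holes/zero runs
```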
Thomas Waldmann, 1 month ago · commit f036152789
2 changed files with 228 additions and 54 deletions:
  1. src/borg/chunker.pyi (+26, −3)
  2. src/borg/chunker.pyx (+202, −51)

src/borg/chunker.pyi (+26, −3)

@@ -21,10 +21,33 @@ class ChunkerFailing:
    def __init__(self, block_size: int, map: str) -> None: ...
    def chunkify(self, fd: BinaryIO = None, fh: int = -1) -> Iterator: ...

+class FileFMAPReader:
+    def __init__(
+        self,
+        *,
+        fd: BinaryIO = None,
+        fh: int = -1,
+        read_size: int = 0,
+        header_size: int = 0,
+        sparse: bool = False,
+        fmap: List[fmap_entry] = None,
+    ) -> None: ...
+    def _build_fmap(self) -> List[fmap_entry]: ...
+    def blockify(self) -> Iterator: ...
+
 class FileReader:
-    def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...
-    def _build_fmap(self, fd: BinaryIO = None, fh: int = -1) -> List[fmap_entry]: ...
-    def blockify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
+    def __init__(
+        self,
+        *,
+        fd: BinaryIO = None,
+        fh: int = -1,
+        read_size: int = 0,
+        header_size: int = 0,
+        sparse: bool = False,
+        fmap: List[fmap_entry] = None,
+    ) -> None: ...
+    def _fill_buffer(self) -> bool: ...
+    def read(self, size: int, return_chunk_info: bool = False) -> Any: ...

class ChunkerFixed:
    def __init__(self, block_size: int, header_size: int = 0, sparse: bool = False) -> None: ...

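For context, `fmap_entry` in the stub above is a `(start, size, is_data)` tuple, matching how `_build_fmap` constructs the header entry and how `blockify` iterates the map in the `.pyx` changes below. A hypothetical map for a file with a header and a hole:

```python
# Hypothetical fmap; the (start, size, is_data) format is inferred
# from _build_fmap and blockify in the diff below.
fmap = [
    (0, 4096, True),         # 4 KiB header range, always data
    (4096, 65536, True),     # data range
    (69632, 131072, False),  # hole: seeked over, yields CH_HOLE chunks
    (200704, 65536, True),   # data range
]
```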
src/borg/chunker.pyx (+202, −51)

@@ -165,7 +165,7 @@ class ChunkerFailing:
                return


-class FileReader:
+class FileFMAPReader:
     """
     """
     This is for reading blocks from a file.
     This is for reading blocks from a file.
 
 
@@ -180,29 +180,34 @@ class FileReader:
    Note: the last block of a data or hole range may be less than the block size,
          this is supported and not considered to be an error.
    """
-    def __init__(self, read_size, header_size=0, sparse=False):
-        self.read_size = read_size  # how much data we want to read at once
+    def __init__(self, *, fd=None, fh=-1, read_size=0, header_size=0, sparse=False, fmap=None):
+        assert fd is not None or fh >= 0
+        self.fd = fd
+        self.fh = fh
+        assert read_size > 0
        assert read_size <= len(zeros)
+        self.read_size = read_size  # how much data we want to read at once
+        assert header_size <= read_size
        self.header_size = header_size  # size of the first block
-        assert read_size >= header_size
        self.reading_time = 0.0  # time spent in reading/seeking
        # should borg try to do sparse input processing?
        # whether it actually can be done depends on the input file being seekable.
        self.try_sparse = sparse and has_seek_hole
+        self.fmap = fmap

-    def _build_fmap(self, fd=None, fh=-1):
+    def _build_fmap(self):
        started_fmap = time.monotonic()
        fmap = None
        if self.try_sparse:
            try:
                if self.header_size > 0:
                    header_map = [(0, self.header_size, True), ]
-                    dseek(self.header_size, os.SEEK_SET, fd, fh)
-                    body_map = list(sparsemap(fd, fh))
-                    dseek(0, os.SEEK_SET, fd, fh)
+                    dseek(self.header_size, os.SEEK_SET, self.fd, self.fh)
+                    body_map = list(sparsemap(self.fd, self.fh))
+                    dseek(0, os.SEEK_SET, self.fd, self.fh)
                else:
                    header_map = []
-                    body_map = list(sparsemap(fd, fh))
+                    body_map = list(sparsemap(self.fd, self.fh))
            except OSError as err:
                # seeking did not work
                pass
@@ -225,30 +230,27 @@ class FileReader:
        self.reading_time += time.monotonic() - started_fmap
        return fmap

-    def blockify(self, fd=None, fh=-1, fmap=None):
+    def blockify(self):
         """
         """
         Read <read_size> sized blocks from a file, optionally supporting a differently sized header block.
         Read <read_size> sized blocks from a file, optionally supporting a differently sized header block.
-
-        :param fd: Python file object
-        :param fh: OS-level file handle (if available),
-                   defaults to -1 which means not to use OS-level fd.
-        :param fmap: a file map, same format as generated by sparsemap
         """
         """
-        fmap =self._build_fmap(fd, fh) if fmap is None else fmap
+        if self.fmap is None:
+            self.fmap = self._build_fmap()
+
        offset = 0
        # note: the optional header block is implemented via the first fmap entry
-        for range_start, range_size, is_data in fmap:
+        for range_start, range_size, is_data in self.fmap:
            if range_start != offset:
                # this is for the case when the fmap does not cover the file completely,
                # e.g. it could be without the ranges of holes or of unchanged data.
                offset = range_start
-                dseek(offset, os.SEEK_SET, fd, fh)
+                dseek(offset, os.SEEK_SET, self.fd, self.fh)
            while range_size:
                started_reading = time.monotonic()
                wanted = min(range_size, self.read_size)
                if is_data:
                    # read block from the range
-                    data = dread(offset, wanted, fd, fh)
+                    data = dread(offset, wanted, self.fd, self.fh)
                    got = len(data)
                    if zeros.startswith(data):
                        data = None
@@ -257,20 +259,164 @@ class FileReader:
                        allocation = CH_DATA
                else:  # hole
                    # seek over block from the range
-                    pos = dseek(wanted, os.SEEK_CUR, fd, fh)
+                    pos = dseek(wanted, os.SEEK_CUR, self.fd, self.fh)
                    got = pos - offset
                    data = None
                    allocation = CH_HOLE
+                self.reading_time += time.monotonic() - started_reading
                if got > 0:
                    offset += got
                    range_size -= got
-                    self.reading_time += time.monotonic() - started_reading
                    yield Chunk(data, size=got, allocation=allocation)
                if got < wanted:
                    # we did not get enough data, looks like EOF.
                    return


+class FileReader:
+    """
+    This is a buffered reader for file data.
+
+    It maintains a buffer that is filled by using FileFMAPReader.blockify generator when needed.
+    The data in that buffer is consumed by clients calling FileReader.read.
+    """
+    def __init__(self, *, fd=None, fh=-1, read_size=0, header_size=0, sparse=False, fmap=None):
+        self.reader = FileFMAPReader(fd=fd, fh=fh, read_size=read_size, header_size=header_size, sparse=sparse, fmap=fmap)
+        self.buffer = []  # list of (data, meta) tuples
+        self.offset = 0  # offset into the first buffer object's data
+        self.remaining_bytes = 0  # total bytes available in buffer
+        self.blockify_gen = None  # generator from FileFMAPReader.blockify
+        self.fd = fd
+        self.fh = fh
+        self.fmap = fmap
+
+    def _fill_buffer(self):
+        """
+        Fill the buffer with more data from the blockify generator.
+        Returns True if more data was added, False if EOF.
+        """
+        if self.blockify_gen is None:
+            return False
+
+        try:
+            chunk = next(self.blockify_gen)
+            # Store both data and metadata in the buffer
+            self.buffer.append((chunk.data, chunk.meta))
+            self.remaining_bytes += chunk.meta["size"]
+            return True
+        except StopIteration:
+            self.blockify_gen = None
+            return False
+
+    def read(self, size, return_chunk_info=False):
+        """
+        Read up to 'size' bytes from the file.
+
+        :param size: Number of bytes to read
+        :param return_chunk_info: if True, return a tuple (data, allocation, size) instead of just data
+        :return: Bytes object containing the read data, or None if no data is available.
+                 If return_chunk_info is True, returns a tuple (data, allocation, size).
+        """
+        # Initialize if not already done
+        if self.blockify_gen is None:
+            self.buffer = []
+            self.offset = 0
+            self.remaining_bytes = 0
+            self.blockify_gen = self.reader.blockify()
+
+        # If we don't have enough data in the buffer, try to fill it
+        while self.remaining_bytes < size:
+            if not self._fill_buffer():
+                # No more data available, return what we have
+                break
+
+        # If we have no data at all, return None
+        if not self.buffer:
+            return None if not return_chunk_info else (None, None, 0)
+
+        # Get the first chunk from the buffer
+        data, meta = self.buffer[0]
+        chunk_size = meta["size"]
+        allocation = meta["allocation"]
+
+        # If we're returning chunk info and this is a non-data chunk, handle it specially
+        if return_chunk_info and (allocation != CH_DATA or data is None):
+            # For non-data chunks, we return the allocation type and size
+            size_to_return = min(size, chunk_size - self.offset)
+
+            # Update buffer state
+            if size_to_return == chunk_size - self.offset:
+                self.buffer.pop(0)
+                self.offset = 0
+            else:
+                self.offset += size_to_return
+
+            self.remaining_bytes -= size_to_return
+
+            return (None, allocation, size_to_return)
+
+        # For data chunks or when not returning chunk info, proceed as before
+        # Prepare to collect the requested data
+        result = bytearray()
+        bytes_to_read = min(size, self.remaining_bytes)
+        bytes_read = 0
+
+        # Read data from the buffer
+        while bytes_read < bytes_to_read and self.buffer:
+            data, meta = self.buffer[0]
+            chunk_size = meta["size"]
+            allocation = meta["allocation"]
+
+            # Skip non-data chunks if not returning chunk info
+            if (allocation != CH_DATA or data is None) and not return_chunk_info:
+                self.buffer.pop(0)
+                self.remaining_bytes -= chunk_size
+                continue
+
+            # If this is a non-data chunk and we're returning chunk info, break to handle it
+            if (allocation != CH_DATA or data is None) and return_chunk_info:
+                if bytes_read > 0:
+                    # We've already read some data, so return that first
+                    break
+                else:
+                    # No data read yet, return info about this non-data chunk
+                    size_to_return = min(size, chunk_size - self.offset)
+
+                    # Update buffer state
+                    if size_to_return == chunk_size - self.offset:
+                        self.buffer.pop(0)
+                        self.offset = 0
+                    else:
+                        self.offset += size_to_return
+
+                    self.remaining_bytes -= size_to_return
+
+                    return (None, allocation, size_to_return)
+
+            # Calculate how much we can read from this chunk
+            available = chunk_size - self.offset
+            to_read = min(available, bytes_to_read - bytes_read)
+
+            # Read the data
+            if to_read > 0:
+                result.extend(data[self.offset:self.offset + to_read])
+                bytes_read += to_read
+
+                # Update offset or remove chunk if fully consumed
+                if to_read < available:
+                    self.offset += to_read
+                else:
+                    self.offset = 0
+                    self.buffer.pop(0)
+
+                self.remaining_bytes -= to_read
+
+        if return_chunk_info:
+            return (bytes(result) if result else None, CH_DATA, bytes_read)
+        else:
+            return bytes(result) if result else None
+
+
class ChunkerFixed:
    """
    This is a simple chunker for input data with data usually staying at same
@@ -297,7 +443,8 @@ class ChunkerFixed:
        self.chunking_time = 0.0  # likely will stay close to zero - not much to do here.
        self.reader_block_size = self.block_size  # start simple
        assert self.reader_block_size % self.block_size == 0, "reader_block_size must be N * block_size"
-        self.reader = FileReader(self.reader_block_size, header_size=self.header_size, sparse=sparse)
+        self.reader = None
+        self.sparse = sparse

    def chunkify(self, fd=None, fh=-1, fmap=None):
        """
@@ -308,35 +455,39 @@ class ChunkerFixed:
                   defaults to -1 which means not to use OS-level fd.
        :param fmap: a file map, same format as generated by sparsemap
        """
-        in_header = self.header_size > 0  # first block is header, if header size is given
-        for block in self.reader.blockify(fd, fh, fmap):
-            if in_header:
-                assert self.header_size == block.meta["size"]
-                yield block  # just pass through the header block we get from the reader
-                in_header = False
-                continue
-            # not much to do in here
-            if self.reader_block_size == self.block_size:
-                # trivial, the reader already did all the work
-                yield block  # just pass through, avoid creating new objects
-            else:
-                # reader block size is a multiple of our block size
-                read_size = block.meta["size"]
-                allocation = block.meta["allocation"]
-                start = 0
-                while read_size:
-                    started_chunking = time.monotonic()
-                    size = min(read_size, self.block_size)
-                    if allocation == CH_DATA:
-                        data = block.data[start:start+size]  # TODO memoryview?
-                    elif allocation in (CH_ALLOC, CH_HOLE):
-                        data = None
-                    else:
-                        raise ValueError("unsupported allocation")
-                    self.chunking_time += time.monotonic() - started_chunking
-                    yield Chunk(data, size=size, allocation=allocation)
-                    start += size
-                    read_size -= size
+        # Initialize the reader with the file descriptors
+        self.reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size,
+                                header_size=self.header_size, sparse=self.sparse, fmap=fmap)
+
+        # Handle header if present
+        if self.header_size > 0:
+            # Read the header block using read
+            started_chunking = time.monotonic()
+            header_info = self.reader.read(self.header_size, return_chunk_info=True)
+            self.chunking_time += time.monotonic() - started_chunking
+
+            if header_info is not None and header_info[2] > 0:
+                # Unpack the header info
+                data, allocation, size = header_info
+                assert self.header_size == size
+                # Yield the header chunk
+                yield Chunk(data, size=size, allocation=allocation)
+
+        # Process the rest of the file using read
+        while True:
+            started_chunking = time.monotonic()
+            chunk_info = self.reader.read(self.block_size, return_chunk_info=True)
+            self.chunking_time += time.monotonic() - started_chunking
+
+            if chunk_info is None or chunk_info[2] == 0:
+                # End of file
+                break
+
+            # Unpack the chunk info
+            data, allocation, size = chunk_info
+
+            # Yield the chunk with the appropriate allocation type
+            yield Chunk(data, size=size, allocation=allocation)


# Cyclic polynomial / buzhash
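
Net effect of the refactor: `ChunkerFixed` no longer owns the buffering logic; each `chunkify` call builds a fresh `FileReader` and slices fixed-size blocks from it via `read(..., return_chunk_info=True)`. A hypothetical driver, assuming only the signatures shown above (block sizes, file name and `handle` are illustrative):

```python
from borg.chunker import ChunkerFixed

# 4 KiB fixed blocks, with a 512-byte header block; values are illustrative.
chunker = ChunkerFixed(4096, header_size=512, sparse=True)
with open("example.bin", "rb") as f:
    for chunk in chunker.chunkify(fd=f):
        # chunk.data is None for holes and all-zero blocks;
        # chunk.meta carries "size" and "allocation".
        handle(chunk)  # hypothetical consumer
```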