@@ -165,7 +165,7 @@ class ChunkerFailing:
             return
 
 
-class FileReader:
+class FileFMAPReader:
     """
     This is for reading blocks from a file.
 
@@ -180,29 +180,34 @@ class FileReader:
     Note: the last block of a data or hole range may be less than the block size,
     this is supported and not considered to be an error.
     """
-    def __init__(self, read_size, header_size=0, sparse=False):
-        self.read_size = read_size  # how much data we want to read at once
+    def __init__(self, *, fd=None, fh=-1, read_size=0, header_size=0, sparse=False, fmap=None):
+        assert fd is not None or fh >= 0
+        self.fd = fd
+        self.fh = fh
+        assert read_size > 0
         assert read_size <= len(zeros)
+        self.read_size = read_size  # how much data we want to read at once
+        assert header_size <= read_size
         self.header_size = header_size  # size of the first block
-        assert read_size >= header_size
         self.reading_time = 0.0  # time spent in reading/seeking
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
+        self.fmap = fmap
 
-    def _build_fmap(self, fd=None, fh=-1):
+    def _build_fmap(self):
         started_fmap = time.monotonic()
         fmap = None
         if self.try_sparse:
             try:
                 if self.header_size > 0:
                     header_map = [(0, self.header_size, True), ]
-                    dseek(self.header_size, os.SEEK_SET, fd, fh)
-                    body_map = list(sparsemap(fd, fh))
-                    dseek(0, os.SEEK_SET, fd, fh)
+                    dseek(self.header_size, os.SEEK_SET, self.fd, self.fh)
+                    body_map = list(sparsemap(self.fd, self.fh))
+                    dseek(0, os.SEEK_SET, self.fd, self.fh)
                 else:
                     header_map = []
-                    body_map = list(sparsemap(fd, fh))
+                    body_map = list(sparsemap(self.fd, self.fh))
             except OSError as err:
                 # seeking did not work
                 pass
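For review context: the fmap assembled here is a list of (range_start, range_size, is_data) tuples in file order, as produced by sparsemap, with the optional header prepended as its own entry. A minimal illustrative sketch (made-up sizes, not taken from the patch) of what _build_fmap could return for a file with a 4 KiB header, one data range, and one hole:

# Illustrative only, not part of the patch: the fmap format used by FileFMAPReader.
fmap = [
    (0, 4096, True),         # header entry, always treated as data
    (4096, 65536, True),     # data range reported by sparsemap
    (69632, 65536, False),   # hole range (reads back as zeros)
]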
@@ -225,30 +230,27 @@ class FileReader:
         self.reading_time += time.monotonic() - started_fmap
         return fmap
 
-    def blockify(self, fd=None, fh=-1, fmap=None):
+    def blockify(self):
         """
         Read <read_size> sized blocks from a file, optionally supporting a differently sized header block.
-
-        :param fd: Python file object
-        :param fh: OS-level file handle (if available),
-                   defaults to -1 which means not to use OS-level fd.
-        :param fmap: a file map, same format as generated by sparsemap
         """
-        fmap = self._build_fmap(fd, fh) if fmap is None else fmap
+        if self.fmap is None:
+            self.fmap = self._build_fmap()
+
         offset = 0
         # note: the optional header block is implemented via the first fmap entry
-        for range_start, range_size, is_data in fmap:
+        for range_start, range_size, is_data in self.fmap:
             if range_start != offset:
                 # this is for the case when the fmap does not cover the file completely,
                 # e.g. it could be without the ranges of holes or of unchanged data.
                 offset = range_start
-                dseek(offset, os.SEEK_SET, fd, fh)
+                dseek(offset, os.SEEK_SET, self.fd, self.fh)
             while range_size:
                 started_reading = time.monotonic()
                 wanted = min(range_size, self.read_size)
                 if is_data:
                     # read block from the range
-                    data = dread(offset, wanted, fd, fh)
+                    data = dread(offset, wanted, self.fd, self.fh)
                     got = len(data)
                     if zeros.startswith(data):
                         data = None
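The keyword-only constructor above moves fd/fh/fmap from blockify() into __init__. A hypothetical usage sketch of the new API (assuming a regular seekable file opened in binary mode; process() is a placeholder, not part of the patch):

# Hypothetical usage sketch, not part of the patch:
with open("input.bin", "rb") as f:
    reader = FileFMAPReader(fd=f, read_size=1024 * 1024, sparse=True)
    for chunk in reader.blockify():
        # chunk.data is None for holes and all-zero blocks;
        # chunk.meta carries "size" and "allocation" (CH_DATA / CH_HOLE / CH_ALLOC)
        process(chunk)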
@@ -257,20 +259,164 @@ class FileReader:
                         allocation = CH_DATA
                 else:  # hole
                     # seek over block from the range
-                    pos = dseek(wanted, os.SEEK_CUR, fd, fh)
+                    pos = dseek(wanted, os.SEEK_CUR, self.fd, self.fh)
                     got = pos - offset
                     data = None
                     allocation = CH_HOLE
+                self.reading_time += time.monotonic() - started_reading
                 if got > 0:
                     offset += got
                     range_size -= got
-                    self.reading_time += time.monotonic() - started_reading
                     yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return
 
 
+class FileReader:
+    """
+    This is a buffered reader for file data.
+
+    It maintains a buffer that is filled by the FileFMAPReader.blockify generator when needed.
+    The data in that buffer is consumed by clients calling FileReader.read.
+    """
+    def __init__(self, *, fd=None, fh=-1, read_size=0, header_size=0, sparse=False, fmap=None):
+        self.reader = FileFMAPReader(fd=fd, fh=fh, read_size=read_size, header_size=header_size, sparse=sparse, fmap=fmap)
+        self.buffer = []  # list of (data, meta) tuples
+        self.offset = 0  # offset into the first buffer object's data
+        self.remaining_bytes = 0  # total bytes available in buffer
+        self.blockify_gen = None  # generator from FileFMAPReader.blockify
+        self.fd = fd
+        self.fh = fh
+        self.fmap = fmap
+
+    def _fill_buffer(self):
+        """
+        Fill the buffer with more data from the blockify generator.
+        Returns True if more data was added, False if EOF.
+        """
+        if self.blockify_gen is None:
+            return False
+
+        try:
+            chunk = next(self.blockify_gen)
+            # Store both data and metadata in the buffer
+            self.buffer.append((chunk.data, chunk.meta))
+            self.remaining_bytes += chunk.meta["size"]
+            return True
+        except StopIteration:
+            self.blockify_gen = None
+            return False
+
+    def read(self, size, return_chunk_info=False):
+        """
+        Read up to <size> bytes from the file.
+
+        :param size: number of bytes to read
+        :param return_chunk_info: if True, return a tuple (data, allocation, size) instead of just data
+        :return: bytes object containing the read data, or None if no data is available.
+                 If return_chunk_info is True, returns a tuple (data, allocation, size).
+        """
+        # Initialize if not already done
+        if self.blockify_gen is None:
+            self.buffer = []
+            self.offset = 0
+            self.remaining_bytes = 0
+            self.blockify_gen = self.reader.blockify()
+
+        # If we don't have enough data in the buffer, try to fill it
+        while self.remaining_bytes < size:
+            if not self._fill_buffer():
+                # No more data available, return what we have
+                break
+
+        # If we have no data at all, return None
+        if not self.buffer:
+            return None if not return_chunk_info else (None, None, 0)
+
+        # Get the first chunk from the buffer
+        data, meta = self.buffer[0]
+        chunk_size = meta["size"]
+        allocation = meta["allocation"]
+
+        # If we're returning chunk info and this is a non-data chunk, handle it specially
+        if return_chunk_info and (allocation != CH_DATA or data is None):
+            # For non-data chunks, we return the allocation type and size
+            size_to_return = min(size, chunk_size - self.offset)
+
+            # Update buffer state
+            if size_to_return == chunk_size - self.offset:
+                self.buffer.pop(0)
+                self.offset = 0
+            else:
+                self.offset += size_to_return
+
+            self.remaining_bytes -= size_to_return
+
+            return (None, allocation, size_to_return)
+
+        # For data chunks (or when not returning chunk info), collect the requested data
+        result = bytearray()
+        bytes_to_read = min(size, self.remaining_bytes)
+        bytes_read = 0
+
+        # Read data from the buffer
+        while bytes_read < bytes_to_read and self.buffer:
+            data, meta = self.buffer[0]
+            chunk_size = meta["size"]
+            allocation = meta["allocation"]
+
+            # Skip non-data chunks if not returning chunk info
+            if (allocation != CH_DATA or data is None) and not return_chunk_info:
+                self.buffer.pop(0)
+                self.remaining_bytes -= chunk_size
+                continue
+
+            # If this is a non-data chunk and we're returning chunk info, break to handle it
+            if (allocation != CH_DATA or data is None) and return_chunk_info:
+                if bytes_read > 0:
+                    # We've already read some data, so return that first
+                    break
+                else:
+                    # No data read yet, return info about this non-data chunk
+                    size_to_return = min(size, chunk_size - self.offset)
+
+                    # Update buffer state
+                    if size_to_return == chunk_size - self.offset:
+                        self.buffer.pop(0)
+                        self.offset = 0
+                    else:
+                        self.offset += size_to_return
+
+                    self.remaining_bytes -= size_to_return
+
+                    return (None, allocation, size_to_return)
+
+            # Calculate how much we can read from this chunk
+            available = chunk_size - self.offset
+            to_read = min(available, bytes_to_read - bytes_read)
+
+            # Read the data
+            if to_read > 0:
+                result.extend(data[self.offset:self.offset + to_read])
+                bytes_read += to_read
+
+            # Update offset or remove chunk if fully consumed
+            if to_read < available:
+                self.offset += to_read
+            else:
+                self.offset = 0
+                self.buffer.pop(0)
+
+            self.remaining_bytes -= to_read
+
+        if return_chunk_info:
+            return (bytes(result) if result else None, CH_DATA, bytes_read)
+        else:
+            return bytes(result) if result else None
+
+
 class ChunkerFixed:
     """
     This is a simple chunker for input data with data usually staying at same
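The new FileReader wraps FileFMAPReader with a pull-style read() API. A minimal consumption sketch (hypothetical, not part of the patch):

# Hypothetical usage sketch, not part of the patch:
with open("input.bin", "rb") as f:
    r = FileReader(fd=f, read_size=64 * 1024)
    while True:
        data, allocation, size = r.read(4096, return_chunk_info=True)
        if size == 0:
            break  # EOF: read() returns (None, None, 0) when nothing is left
        # allocation is CH_DATA for real bytes; for sparse ranges it is
        # CH_HOLE / CH_ALLOC and data is None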
@@ -297,7 +443,8 @@ class ChunkerFixed:
         self.chunking_time = 0.0  # likely will stay close to zero - not much to do here.
         self.reader_block_size = self.block_size  # start simple
         assert self.reader_block_size % self.block_size == 0, "reader_block_size must be N * block_size"
-        self.reader = FileReader(self.reader_block_size, header_size=self.header_size, sparse=sparse)
+        self.reader = None
+        self.sparse = sparse
 
     def chunkify(self, fd=None, fh=-1, fmap=None):
         """
@@ -308,35 +455,39 @@ class ChunkerFixed:
                    defaults to -1 which means not to use OS-level fd.
         :param fmap: a file map, same format as generated by sparsemap
         """
-        in_header = self.header_size > 0  # first block is header, if header size is given
-        for block in self.reader.blockify(fd, fh, fmap):
-            if in_header:
-                assert self.header_size == block.meta["size"]
-                yield block  # just pass through the header block we get from the reader
-                in_header = False
-                continue
-            # not much to do in here
-            if self.reader_block_size == self.block_size:
-                # trivial, the reader already did all the work
-                yield block  # just pass through, avoid creating new objects
-            else:
-                # reader block size is a multiple of our block size
-                read_size = block.meta["size"]
-                allocation = block.meta["allocation"]
-                start = 0
-                while read_size:
-                    started_chunking = time.monotonic()
-                    size = min(read_size, self.block_size)
-                    if allocation == CH_DATA:
-                        data = block.data[start:start+size]  # TODO memoryview?
-                    elif allocation in (CH_ALLOC, CH_HOLE):
-                        data = None
-                    else:
-                        raise ValueError("unsupported allocation")
-                    self.chunking_time += time.monotonic() - started_chunking
-                    yield Chunk(data, size=size, allocation=allocation)
-                    start += size
-                    read_size -= size
+        # Initialize the reader with the file descriptors
+        self.reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size,
+                                 header_size=self.header_size, sparse=self.sparse, fmap=fmap)
+
+        # Handle header if present
+        if self.header_size > 0:
+            # Read the header block using read
+            started_chunking = time.monotonic()
+            header_info = self.reader.read(self.header_size, return_chunk_info=True)
+            self.chunking_time += time.monotonic() - started_chunking
+
+            if header_info is not None and header_info[2] > 0:
+                # Unpack the header info
+                data, allocation, size = header_info
+                assert self.header_size == size
+                # Yield the header chunk
+                yield Chunk(data, size=size, allocation=allocation)
+
+        # Process the rest of the file using read
+        while True:
+            started_chunking = time.monotonic()
+            chunk_info = self.reader.read(self.block_size, return_chunk_info=True)
+            self.chunking_time += time.monotonic() - started_chunking
+
+            if chunk_info is None or chunk_info[2] == 0:
+                # End of file
+                break
+
+            # Unpack the chunk info
+            data, allocation, size = chunk_info
+
+            # Yield the chunk with the appropriate allocation type
+            yield Chunk(data, size=size, allocation=allocation)
 
 
 # Cyclic polynomial / buzhash