@@ -1,781 +0,0 @@
-# cython: language_level=3
-
-API_VERSION = '1.2_01'
-
-import cython
-import os
-import errno
-import time
-from collections import namedtuple
-from cpython.bytes cimport PyBytes_AsString
-from libc.stdint cimport uint8_t, uint32_t
-from libc.stdlib cimport malloc, free
-from libc.string cimport memcpy, memmove
-
-from .constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
-from .platform import safe_fadvise
-
-# this will be True if Python's seek implementation supports data/holes seeking.
-# this does not imply that it will actually work on the filesystem,
-# because the FS also needs to support this.
-has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
-
-
-_Chunk = namedtuple('_Chunk', 'meta data')
-_Chunk.__doc__ = """\
-    Chunk namedtuple
-
-    meta is always a dictionary, data depends on allocation.
-
-    data chunk read from a DATA range of a file (not from a sparse hole):
-        meta = {'allocation': CH_DATA, 'size': size_of_chunk}
-        data = read_data [bytes or memoryview]
-
-    all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero):
-        meta = {'allocation': CH_ALLOC, 'size': size_of_chunk}
-        data = None
-
-    all-zero chunk from a HOLE range of a file (from a sparse hole):
-        meta = {'allocation': CH_HOLE, 'size': size_of_chunk}
-        data = None
-"""
-
-def Chunk(data, **meta):
-    return _Chunk(meta, data)
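-
-# Usage sketch (illustrative only; example values are assumptions, not real borg data):
-#
-#   Chunk(b'foo', size=3, allocation=CH_DATA).meta    # -> {'size': 3, 'allocation': CH_DATA}
-#   Chunk(None, size=4096, allocation=CH_ALLOC).data  # -> None (all-zero data range)
-#   Chunk(None, size=4096, allocation=CH_HOLE).data   # -> None (sparse hole, no stored bytes)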
-
-
-def dread(offset, size, fd=None, fh=-1):
-    use_fh = fh >= 0
-    if use_fh:
-        data = os.read(fh, size)
-        safe_fadvise(fh, offset, len(data), "DONTNEED")
-        return data
-    else:
-        return fd.read(size)
-
-
-def dseek(amount, whence, fd=None, fh=-1):
-    use_fh = fh >= 0
-    if use_fh:
-        return os.lseek(fh, amount, whence)
-    else:
-        return fd.seek(amount, whence)
-
-
-def dpos_curr_end(fd=None, fh=-1):
-    """
-    determine current position, file end position (== file length)
-    """
-    curr = dseek(0, os.SEEK_CUR, fd, fh)
-    end = dseek(0, os.SEEK_END, fd, fh)
-    dseek(curr, os.SEEK_SET, fd, fh)
-    return curr, end
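-
-# Usage sketch for the fd/fh convention used by the d* helpers above (illustrative;
-# 'example.bin' is a made-up name): the OS-level handle is preferred when fh >= 0,
-# otherwise the Python file object is used.
-#
-#   with open('example.bin', 'rb') as f:
-#       data = dread(0, 100, fd=f)            # buffered read via the file object
-#       data = dread(0, 100, fh=f.fileno())   # os.read, then fadvise DONTNEED
-#       curr, end = dpos_curr_end(fd=f)       # current position and file length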
-
-
-def sparsemap(fd=None, fh=-1):
-    """
-    generator yielding a (start, length, is_data) tuple for each range.
-    is_data indicates data ranges (True) or hole ranges (False).
-
-    note:
-    the map is generated starting from the current seek position (it
-    is not required to be 0 / to be at the start of the file) and
-    works from there up to the end of the file.
-    when the generator is finished, the file pointer position will be
-    reset to where it was before calling this function.
-    """
-    curr, file_len = dpos_curr_end(fd, fh)
-    start = curr
-    try:
-        whence = os.SEEK_HOLE
-        while True:
-            is_data = whence == os.SEEK_HOLE  # True: range with data, False: range is a hole
-            try:
-                end = dseek(start, whence, fd, fh)
-            except OSError as e:
-                if e.errno == errno.ENXIO:
-                    if not is_data and start < file_len:
-                        # if there is a hole at the end of a file, we can not find the file end by SEEK_DATA
-                        # (because we run into ENXIO), thus we must manually deal with this case:
-                        end = file_len
-                        yield (start, end - start, is_data)
-                    break
-                else:
-                    raise
-            # we do not want to yield zero-length ranges with start == end:
-            if end > start:
-                yield (start, end - start, is_data)
-            start = end
-            whence = os.SEEK_DATA if is_data else os.SEEK_HOLE
-    finally:
-        # seek to same position as before calling this function
-        dseek(curr, os.SEEK_SET, fd, fh)
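-
-# Usage sketch (illustrative): whether holes are actually reported depends on the
-# filesystem; on one that supports SEEK_HOLE, a 1 MiB data range followed by a
-# 1 MiB truncate-created hole could map like this:
-#
-#   with open('sparse.bin', 'wb') as f:    # 'sparse.bin' is a made-up name
-#       f.write(b'x' * 2**20)
-#       f.truncate(2 * 2**20)
-#   with open('sparse.bin', 'rb') as f:
-#       list(sparsemap(fd=f))              # e.g. [(0, 1048576, True), (1048576, 1048576, False)]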
-
-
-class ChunkerFailing:
-    """
-    This is a very simple chunker for testing purposes.
-
-    Reads block_size sized blocks; the given map controls which reads succeed and which raise a simulated I/O error.
-    """
-    def __init__(self, block_size, map):
-        self.block_size = block_size
-        # one char per block: r/R = successful read, e/E = I/O Error, e.g.: "rrrrErrrEEr"
-        # blocks beyond the map will have the same behaviour as the last map char indicates.
-        map = map.upper()
-        if not set(map).issubset({"R", "E"}):
-            raise ValueError("unsupported map character")
-        self.map = map
-        self.count = 0
-        self.chunking_time = 0.0  # not updated, just provided so that caller does not crash
-
-    def chunkify(self, fd=None, fh=-1):
-        """
-        Cut a file into chunks.
-
-        :param fd: Python file object
-        :param fh: OS-level file handle (if available),
-                   defaults to -1 which means not to use OS-level fd.
-        """
-        use_fh = fh >= 0
-        wanted = self.block_size
-        while True:
-            data = os.read(fh, wanted) if use_fh else fd.read(wanted)
-            got = len(data)
-            if got > 0:
-                idx = self.count if self.count < len(self.map) else -1
-                behaviour = self.map[idx]
-                if behaviour == "E":
-                    self.count += 1
-                    fname = None if use_fh else getattr(fd, "name", None)
-                    raise OSError(errno.EIO, "simulated I/O error", fname)
-                elif behaviour == "R":
-                    self.count += 1
-                    yield Chunk(data, size=got, allocation=CH_DATA)
-                else:
-                    raise ValueError("unsupported map character")
-            if got < wanted:
-                # we did not get enough data, looks like EOF.
-                return
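-
-# Usage sketch (illustrative): with map "RER", the 1st block reads fine, the 2nd
-# read raises a simulated EIO, and all later blocks read fine again (blocks beyond
-# the map behave like its last character).
-#
-#   chunker = ChunkerFailing(4096, "RER")
-#   with open('some.file', 'rb') as f:         # 'some.file' is a made-up name
-#       for chunk in chunker.chunkify(fd=f):   # raises OSError(EIO) on the 2nd block
-#           ...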
-
-
-class FileFMAPReader:
-    """
-    This is for reading blocks from a file.
-
-    It optionally supports:
-
-    - using a sparsemap to read only data ranges and seek over hole ranges
-      for sparse files.
-    - using an externally given filemap to read only specific ranges from
-      a file.
-
-    Note: the last block of a data or hole range may be less than the read_size;
-    this is supported and not considered to be an error.
-    """
-    def __init__(self, *, fd=None, fh=-1, read_size=0, sparse=False, fmap=None):
-        assert fd is not None or fh >= 0
-        self.fd = fd
-        self.fh = fh
-        assert 0 < read_size <= len(zeros)
-        self.read_size = read_size  # how much data we want to read at once
-        self.reading_time = 0.0  # time spent in reading/seeking
-        # should borg try to do sparse input processing?
-        # whether it actually can be done depends on the input file being seekable.
-        self.try_sparse = sparse and has_seek_hole
-        self.fmap = fmap
-
-    def _build_fmap(self):
-        started_fmap = time.monotonic()
-        fmap = None
-        if self.try_sparse:
-            try:
-                fmap = list(sparsemap(self.fd, self.fh))
-            except OSError:
-                # seeking did not work
-                pass
-
-        if fmap is None:
-            # either sparse processing (building the fmap) was not tried or it failed.
-            # in these cases, we just build a "fake fmap" that considers the whole file
-            # as range(s) of data (no holes), so we can use the same code.
-            fmap = [(0, 2 ** 62, True), ]
-        self.reading_time += time.monotonic() - started_fmap
-        return fmap
-
-    def blockify(self):
-        """
-        Read <read_size> sized blocks from a file.
-        """
-        if self.fmap is None:
-            self.fmap = self._build_fmap()
-
-        offset = 0
-        for range_start, range_size, is_data in self.fmap:
-            if range_start != offset:
-                # this is for the case when the fmap does not cover the file completely,
-                # e.g. it might omit the ranges of holes or of unchanged data.
-                offset = range_start
-                dseek(offset, os.SEEK_SET, self.fd, self.fh)
-            while range_size:
-                started_reading = time.monotonic()
-                wanted = min(range_size, self.read_size)
-                if is_data:
-                    # read block from the range
-                    data = dread(offset, wanted, self.fd, self.fh)
-                    got = len(data)
-                    if zeros.startswith(data):
-                        data = None
-                        allocation = CH_ALLOC
-                    else:
-                        allocation = CH_DATA
-                else:  # hole
-                    # seek over block from the range
-                    pos = dseek(wanted, os.SEEK_CUR, self.fd, self.fh)
-                    got = pos - offset
-                    data = None
-                    allocation = CH_HOLE
-                self.reading_time += time.monotonic() - started_reading
-                if got > 0:
-                    offset += got
-                    range_size -= got
-                    yield Chunk(data, size=got, allocation=allocation)
-                if got < wanted:
-                    # we did not get enough data, looks like EOF.
-                    return
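-
-# Usage sketch (illustrative): an externally given fmap restricts reading to the
-# listed ranges; the tuples use the same (start, length, is_data) format as sparsemap.
-#
-#   fmap = [(0, 4096, True), (4096, 4096, False)]   # 4 KiB of data, then a 4 KiB hole
-#   with open('some.file', 'rb') as f:              # 'some.file' is a made-up name
-#       reader = FileFMAPReader(fd=f, read_size=65536, fmap=fmap)
-#       for chunk in reader.blockify():
-#           print(chunk.meta)                       # {'size': ..., 'allocation': ...}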
-
-
-class FileReader:
-    """
-    This is a buffered reader for file data.
-
-    It maintains a buffer that is filled with Chunks from the FileFMAPReader.blockify generator.
-    The data in that buffer is consumed by clients calling FileReader.read, which returns a Chunk.
-
-    Most complexity in here comes from the fact that the size a user requests from FileReader.read
-    does not need to match the Chunk sizes we got from the FileFMAPReader.
-    """
-    def __init__(self, *, fd=None, fh=-1, read_size=0, sparse=False, fmap=None):
-        assert read_size > 0
-        self.reader = FileFMAPReader(fd=fd, fh=fh, read_size=read_size, sparse=sparse, fmap=fmap)
-        self.buffer = []  # list of Chunk objects
-        self.offset = 0  # offset into the first buffer object's data
-        self.remaining_bytes = 0  # total bytes available in buffer
-        self.blockify_gen = None  # generator from FileFMAPReader.blockify
-        self.fd = fd
-        self.fh = fh
-        self.fmap = fmap
-
-    def _fill_buffer(self):
-        """
-        Fill the buffer with more data from the blockify generator.
-        Returns True if more data was added, False if EOF.
-        """
-        if self.blockify_gen is None:
-            return False
-
-        try:
-            chunk = next(self.blockify_gen)
-            # Store the Chunk object directly in the buffer
-            self.buffer.append(chunk)
-            self.remaining_bytes += chunk.meta["size"]
-            return True
-        except StopIteration:
-            self.blockify_gen = None
-            return False
-
-    def read(self, size):
-        """
-        Read a Chunk of up to 'size' bytes from the file.
-
-        This method tries to yield a Chunk of the requested size, if possible, by combining
-        multiple chunks from the buffer.
-
-        The allocation type of the resulting chunk depends on the allocation types of the
-        contributing chunks (see the usage sketch after this class):
-        - If any of the chunks is CH_DATA, the result is CH_DATA and all-zero bytes are
-          materialized for the contributing chunks that are not CH_DATA
-        - If all contributing chunks are CH_HOLE, the resulting chunk will also be CH_HOLE
-        - If the contributing chunks are a mix of CH_HOLE and CH_ALLOC, the resulting chunk will be CH_HOLE
-
-        :param size: Number of bytes to read
-        :return: Chunk object containing the read data.
-                 If no data is available, returns Chunk(b"", size=0, allocation=CH_DATA).
-                 If less than the requested number of bytes were available (at EOF), the
-                 returned chunk might be smaller than requested.
-        """
-        # Initialize if not already done
-        if self.blockify_gen is None:
-            self.buffer = []
-            self.offset = 0
-            self.remaining_bytes = 0
-            self.blockify_gen = self.reader.blockify()
-
-        # If we don't have enough data in the buffer, try to fill it
-        while self.remaining_bytes < size:
-            if not self._fill_buffer():
-                # No more data available, return what we have
-                break
-
-        # If we have no data at all, return an empty Chunk
-        if not self.buffer:
-            return Chunk(b"", size=0, allocation=CH_DATA)
-
-        # Prepare to collect the requested data
-        result = bytearray()
-        bytes_to_read = min(size, self.remaining_bytes)
-        bytes_read = 0
-
-        # Track if we've seen different allocation types
-        has_data = False
-        has_hole = False
-        has_alloc = False
-
-        # Read data from the buffer, combining chunks as needed
-        while bytes_read < bytes_to_read and self.buffer:
-            chunk = self.buffer[0]
-            chunk_size = chunk.meta["size"]
-            allocation = chunk.meta["allocation"]
-            data = chunk.data
-
-            # Track allocation types
-            if allocation == CH_DATA:
-                has_data = True
-            elif allocation == CH_HOLE:
-                has_hole = True
-            elif allocation == CH_ALLOC:
-                has_alloc = True
-            else:
-                raise ValueError(f"Invalid allocation type: {allocation}")
-
-            # Calculate how much we can read from this chunk
-            available = chunk_size - self.offset
-            to_read = min(available, bytes_to_read - bytes_read)
-
-            # Process the chunk based on its allocation type
-            if allocation == CH_DATA:
-                assert data is not None
-                # materialize zeros for any non-data bytes we skipped earlier,
-                # while has_data was still False:
-                if len(result) < bytes_read:
-                    result.extend(b'\0' * (bytes_read - len(result)))
-                # For data chunks, add the actual data
-                result.extend(data[self.offset:self.offset + to_read])
-            else:
-                # For non-data chunks, add zeros if we've seen a data chunk
-                if has_data:
-                    result.extend(b'\0' * to_read)
-                # Otherwise, we'll just track the size without adding data
-
-            bytes_read += to_read
-
-            # Update offset or remove chunk if fully consumed
-            if to_read < available:
-                self.offset += to_read
-            else:
-                self.offset = 0
-                self.buffer.pop(0)
-
-            self.remaining_bytes -= to_read
-
-        # Determine the allocation type of the resulting chunk
-        if has_data:
-            # If any chunk was CH_DATA, the result is CH_DATA
-            return Chunk(bytes(result), size=bytes_read, allocation=CH_DATA)
-        elif has_hole:
-            # If any chunk was CH_HOLE (and none were CH_DATA), the result is CH_HOLE
-            return Chunk(None, size=bytes_read, allocation=CH_HOLE)
-        else:
-            # Otherwise, all chunks were CH_ALLOC
-            return Chunk(None, size=bytes_read, allocation=CH_ALLOC)
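-
-# Allocation-combination sketch for FileReader.read (illustrative; f is some open
-# binary file): reading across a CH_DATA chunk followed by hole/all-zero chunks
-# yields one CH_DATA chunk whose non-data parts are materialized as zero bytes,
-# while a CH_HOLE + CH_ALLOC mix without any data stays CH_HOLE.
-#
-#   reader = FileReader(fd=f, read_size=65536, sparse=True)
-#   c = reader.read(8192)           # may combine several buffered chunks
-#   c.meta["allocation"]            # CH_DATA if any contributing chunk had real data
-#   c.meta["size"]                  # <= 8192, smaller only at EOF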
-
-
-class ChunkerFixed:
-    """
-    This is a simple chunker for input data with data usually staying at the same
-    offset and/or with known block/record sizes:
-
-    - raw disk images
-    - block devices
-    - database files with simple header + fixed-size records layout
-
-    It optionally supports:
-
-    - a header block of a different size
-    - using a sparsemap to read only data ranges and seek over hole ranges
-      for sparse files.
-    - using an externally given filemap to read only specific ranges from
-      a file.
-
-    Note: the last block of a data or hole range may be less than the block size;
-    this is supported and not considered to be an error.
-    """
-    def __init__(self, block_size, header_size=0, sparse=False):
-        self.block_size = block_size
-        self.header_size = header_size
-        self.chunking_time = 0.0  # likely will stay close to zero - not much to do here.
-        self.reader_block_size = 1024 * 1024
-        self.reader = None
-        self.sparse = sparse
-
-    def chunkify(self, fd=None, fh=-1, fmap=None):
-        """
-        Cut a file into chunks.
-
-        :param fd: Python file object
-        :param fh: OS-level file handle (if available),
-                   defaults to -1 which means not to use OS-level fd.
-        :param fmap: a file map, same format as generated by sparsemap
-        """
-        # Initialize the reader with the file descriptors
-        self.reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size,
-                                 sparse=self.sparse, fmap=fmap)
-
-        # Handle header if present
-        if self.header_size > 0:
-            # Read the header block using read
-            started_chunking = time.monotonic()
-            header_chunk = self.reader.read(self.header_size)
-            self.chunking_time += time.monotonic() - started_chunking
-
-            if header_chunk.meta["size"] > 0:
-                # Yield the header chunk
-                yield header_chunk
-
-        # Process the rest of the file using read
-        while True:
-            started_chunking = time.monotonic()
-            chunk = self.reader.read(self.block_size)
-            self.chunking_time += time.monotonic() - started_chunking
-            size = chunk.meta["size"]
-            if size == 0:
-                break  # EOF
-            assert size <= self.block_size
-            yield chunk
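-
-# Usage sketch (illustrative parameters): a 4 KiB header followed by 64 KiB blocks.
-#
-#   chunker = ChunkerFixed(65536, header_size=4096, sparse=True)
-#   with open('disk.img', 'rb') as f:                  # 'disk.img' is a made-up name
-#       for chunk in chunker.chunkify(fd=f, fh=f.fileno()):
-#           ...   # 1st chunk: 4096 bytes, later chunks: 65536 bytes (less at EOF)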
-
-
-# Cyclic polynomial / buzhash
-#
-# https://en.wikipedia.org/wiki/Rolling_hash
-#
-# http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor)
-#
-# http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide)
-#
-# Some properties of buzhash / of this implementation:
-#
-# (1) the hash is designed for inputs <= 32 bytes, but the chunker uses it on a 4095 byte window;
-#     any repeating bytes at distance 32 within those 4095 bytes can cause cancellation within
-#     the hash function, e.g. in "X <any 31 bytes> X", the last X would cancel out the influence
-#     of the first X on the hash value.
-#
-# (2) the hash table is supposed to have (according to the BUZ) exactly a 50% distribution of
-#     0/1 bit values per position, but the hard-coded table below doesn't fit that property.
-#
-# (3) if the window size were divisible by 64, the seed would cancel itself out completely.
-#     this is why we use a window size of 4095 bytes.
-#
-# Another quirk is that, even with the 4095 byte window, XORing the entire table by a constant
-# is equivalent to XORing the hash output with a different constant. But since the seed is
-# stored encrypted, I think it still serves its purpose.
-
-cdef uint32_t table_base[256]
-table_base = [
-    0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4,
-    0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436,
-    0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7,
-    0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a,
-    0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613,
-    0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0,
-    0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205,
-    0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7,
-    0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035,
-    0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d,
-    0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355,
-    0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f,
-    0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424,
-    0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86,
-    0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f,
-    0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf,
-    0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9,
-    0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186,
-    0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0,
-    0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35,
-    0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd,
-    0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4,
-    0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f,
-    0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec,
-    0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889,
-    0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4,
-    0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512,
-    0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009,
-    0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2,
-    0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c,
-    0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9,
-    0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b
-]
-
-# This seems to be the most reliable way to inline this code, using a C preprocessor macro:
-cdef extern from *:
-    """
-    #define BARREL_SHIFT(v, shift) (((v) << (shift)) | ((v) >> (((32 - (shift)) & 0x1f))))
-    """
-    uint32_t BARREL_SHIFT(uint32_t v, uint32_t shift)
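-
-# For reference, a pure-Python equivalent of the macro above (a 32-bit rotate-left;
-# the "& 0x1f" keeps shift == 0 well-defined by avoiding an undefined shift by 32):
-#
-#   def barrel_shift(v, shift):
-#       return ((v << shift) | (v >> ((32 - shift) & 0x1f))) & 0xffffffff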
-
-
-@cython.boundscheck(False)  # Deactivate bounds checking
-@cython.wraparound(False)  # Deactivate negative indexing.
-cdef uint32_t* buzhash_init_table(uint32_t seed):
-    """Initialize the buzhash table with the given seed."""
-    cdef int i
-    cdef uint32_t* table = <uint32_t*>malloc(256 * sizeof(uint32_t))
-    for i in range(256):
-        table[i] = table_base[i] ^ seed
-    return table
-
-
-@cython.boundscheck(False)  # Deactivate bounds checking
-@cython.wraparound(False)  # Deactivate negative indexing.
-@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
-cdef uint32_t _buzhash(const unsigned char* data, size_t len, const uint32_t* h):
-    """Calculate the buzhash of the given data."""
-    cdef uint32_t i
-    cdef uint32_t sum = 0, imod
-    for i in range(len - 1, 0, -1):
-        imod = i & 0x1f
-        sum ^= BARREL_SHIFT(h[data[0]], imod)
-        data += 1
-    return sum ^ h[data[0]]
-
-
-@cython.boundscheck(False)  # Deactivate bounds checking
-@cython.wraparound(False)  # Deactivate negative indexing.
-@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
-cdef uint32_t _buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t* h):
-    """Update the buzhash with a new byte."""
-    cdef uint32_t lenmod = len & 0x1f
-    return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add]
-
-
-cdef class Chunker:
-    """
-    Content-Defined Chunker, variable chunk sizes.
-
-    This chunker makes quite some effort to mostly cut chunks with the same content, even if
-    the content moves to a different offset inside the file. It uses the buzhash
-    rolling-hash algorithm to identify the chunk cutting places by looking at the
-    content inside the moving window and computing the rolling hash value over the
-    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
-    Additionally it obeys some more criteria, like a minimum and maximum chunk size.
-    It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
-    """
-    cdef uint32_t chunk_mask
-    cdef uint32_t* table
-    cdef uint8_t* data
-    cdef object _fd  # Python object for file descriptor
-    cdef int fh
-    cdef int done, eof
-    cdef size_t min_size, buf_size, window_size, remaining, position, last
-    cdef long long bytes_read, bytes_yielded  # off_t in C, using long long for compatibility
-    cdef readonly float chunking_time
-    cdef object file_reader  # FileReader instance
-    cdef size_t reader_block_size
-    cdef bint sparse
-
-    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
-        min_size = 1 << chunk_min_exp
-        max_size = 1 << chunk_max_exp
-        assert max_size <= len(zeros)
-        # see process(), first while loop condition, first term must be able to get True:
-        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
-
-        self.window_size = hash_window_size
-        self.chunk_mask = (1 << hash_mask_bits) - 1
-        self.min_size = min_size
-        self.table = buzhash_init_table(seed & 0xffffffff)
-        self.buf_size = max_size
-        self.data = <uint8_t*>malloc(self.buf_size)
-        self.fh = -1
-        self.done = 0
-        self.eof = 0
-        self.remaining = 0
-        self.position = 0
-        self.last = 0
-        self.bytes_read = 0
-        self.bytes_yielded = 0
-        self._fd = None
-        self.chunking_time = 0.0
-        self.reader_block_size = 1024 * 1024
-        self.sparse = sparse
-
-    def __dealloc__(self):
-        """Free the chunker's resources."""
-        if self.table != NULL:
-            free(self.table)
-            self.table = NULL
-        if self.data != NULL:
-            free(self.data)
-            self.data = NULL
-
-    cdef int fill(self) except 0:
-        """Fill the chunker's buffer with more data."""
-        cdef ssize_t n
-        cdef object chunk
-
-        # Move remaining data to the beginning of the buffer
-        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
-        self.position -= self.last
-        self.last = 0
-        n = self.buf_size - self.position - self.remaining
-
-        if self.eof or n == 0:
-            return 1
-
-        # Use FileReader to read data
-        chunk = self.file_reader.read(n)
-        n = chunk.meta["size"]
-
-        if n > 0:
-            # Only copy data if it's not a hole
-            if chunk.meta["allocation"] == CH_DATA:
-                # Copy data from chunk to our buffer
-                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(chunk.data), n)
-            else:
-                # For holes, fill with zeros
-                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(zeros[:n]), n)
-
-            self.remaining += n
-            self.bytes_read += n
-        else:
-            self.eof = 1
-
-        return 1
-
-    cdef object process(self) except *:
-        """Process the chunker's buffer and return the next chunk."""
-        cdef uint32_t sum, chunk_mask = self.chunk_mask
-        cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size
-        cdef uint8_t* p
-        cdef uint8_t* stop_at
-        cdef size_t did_bytes
-
-        if self.done:
-            if self.bytes_read == self.bytes_yielded:
-                raise StopIteration
-            else:
-                raise Exception("chunkifier byte count mismatch")
-
-        while self.remaining < min_size + window_size + 1 and not self.eof:  # see assert in Chunker init
-            if not self.fill():
-                return None
-
-        # Here we either are at eof...
-        if self.eof:
-            self.done = 1
-            if self.remaining:
-                self.bytes_yielded += self.remaining
-                # Return a memory view of the remaining data
-                return memoryview((self.data + self.position)[:self.remaining])
-            else:
-                if self.bytes_read == self.bytes_yielded:
-                    raise StopIteration
-                else:
-                    raise Exception("chunkifier byte count mismatch")
-
-        # ... or we have at least min_size + window_size + 1 bytes remaining.
-        # We do not want to "cut" a chunk smaller than min_size and the hash
-        # window starts at the potential cutting place.
-        self.position += min_size
-        self.remaining -= min_size
-        sum = _buzhash(self.data + self.position, window_size, self.table)
-
-        while self.remaining > self.window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size):
-            p = self.data + self.position
-            stop_at = p + self.remaining - window_size
-
-            while p < stop_at and (sum & chunk_mask):
-                sum = _buzhash_update(sum, p[0], p[window_size], window_size, self.table)
-                p += 1
-
-            did_bytes = p - (self.data + self.position)
-            self.position += did_bytes
-            self.remaining -= did_bytes
-
-            if self.remaining <= window_size:
-                if not self.fill():
-                    return None
-
-        if self.remaining <= window_size:
-            self.position += self.remaining
-            self.remaining = 0
-
-        old_last = self.last
-        self.last = self.position
-        n = self.last - old_last
-        self.bytes_yielded += n
-
-        # Return a memory view of the chunk
-        return memoryview((self.data + old_last)[:n])
-
-    def chunkify(self, fd, fh=-1, fmap=None):
-        """
-        Cut a file into chunks.
-
-        :param fd: Python file object
-        :param fh: OS-level file handle (if available),
-                   defaults to -1 which means not to use OS-level fd.
-        :param fmap: a file map, same format as generated by sparsemap
-        """
-        self._fd = fd
-        self.fh = fh
-        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
-        self.done = 0
-        self.remaining = 0
-        self.bytes_read = 0
-        self.bytes_yielded = 0
-        self.position = 0
-        self.last = 0
-        self.eof = 0
-        return self
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        started_chunking = time.monotonic()
-        data = self.process()
-        got = len(data)
-        # we do not have SEEK_DATA/SEEK_HOLE support in the process() code,
-        # but we can just check if data was all-zero (and either came from a hole
-        # or from stored zeros - we can not detect that here).
-        if zeros.startswith(data):
-            data = None
-            allocation = CH_ALLOC
-        else:
-            allocation = CH_DATA
-        self.chunking_time += time.monotonic() - started_chunking
-        return Chunk(data, size=got, allocation=allocation)
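-
-# Usage sketch (illustrative parameters, in the order seed, chunk_min_exp,
-# chunk_max_exp, hash_mask_bits, hash_window_size; 21 mask bits target chunks
-# of about 2 MiB between the 512 KiB minimum and 8 MiB maximum):
-#
-#   chunker = Chunker(0, 19, 23, 21, 4095)
-#   with open('some.file', 'rb') as f:        # 'some.file' is a made-up name
-#       for chunk in chunker.chunkify(f, f.fileno()):
-#           handle(chunk.data, chunk.meta)    # handle() is a hypothetical consumer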
-
-
-def buzhash(data, unsigned long seed):
-    cdef uint32_t *table
-    cdef uint32_t sum
-    table = buzhash_init_table(seed & 0xffffffff)
-    sum = _buzhash(<const unsigned char *> data, len(data), table)
-    free(table)
-    return sum
-
-
-def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
-    cdef uint32_t *table
-    table = buzhash_init_table(seed & 0xffffffff)
-    sum = _buzhash_update(sum, remove, add, len, table)
-    free(table)
-    return sum
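-
-# Rolling-property sketch (illustrative): the hash of a window shifted by one byte
-# can be computed incrementally from the previous hash, which is what the chunker
-# relies on. With an 8-byte window and seed 0:
-#
-#   data = b'abcdefghi'
-#   h1 = buzhash(data[0:8], 0)
-#   h2 = buzhash_update(h1, data[0], data[8], 8, 0)
-#   assert h2 == buzhash(data[1:9], 0)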
-
-
-def get_chunker(algo, *params, **kw):
-    if algo == 'buzhash':
-        seed = kw['seed']
-        sparse = kw['sparse']
-        return Chunker(seed, *params, sparse=sparse)
-    if algo == 'fixed':
-        sparse = kw['sparse']
-        return ChunkerFixed(*params, sparse=sparse)
-    if algo == 'fail':
-        return ChunkerFailing(*params)
-    raise TypeError('unsupported chunker algo %r' % algo)
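-
-# Usage sketch (illustrative parameters): how callers construct a chunker from an algo spec.
-#
-#   get_chunker('buzhash', 19, 23, 21, 4095, seed=0, sparse=False)   # -> Chunker
-#   get_chunker('fixed', 4096, 512, sparse=False)                    # -> ChunkerFixed
-#   get_chunker('fail', 4096, 'RRRE')                                # -> ChunkerFailing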