|
@@ -2,6 +2,7 @@
|
|
|
|
|
|
# NOTE(review): presumably compared by the importing code to detect a stale
# compiled extension module - confirm against the caller.
API_VERSION = '1.2_01'
|
|
|
|
|
|
|
|
+import errno
|
|
import os
|
|
import os
|
|
|
|
|
|
from libc.stdlib cimport free
|
|
from libc.stdlib cimport free
|
|
@@ -19,65 +20,176 @@ cdef extern from "_chunker.c":
|
|
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
|
|
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
|
|
|
|
|
|
|
|
|
|
-class ChunkerFixed:
|
|
|
|
|
|
# this will be True if Python's seek implementation supports data/holes seeking
# (i.e. the os module exposes both SEEK_DATA and SEEK_HOLE).
# this does not imply that it will actually work on the filesystem,
# because the FS also needs to support this.
has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def dread(offset, size, fd=None, fh=-1):
    """
    read up to size bytes from the current file position.

    :param offset: file offset the read starts at; only used to tell the OS
                   which range it may drop from its cache (it is NOT used
                   to position the read).
    :param size: maximum amount of bytes to read
    :param fd: Python file object, only used if fh is negative
    :param fh: OS-level file handle, a negative value (default: -1) means
               not to use the OS-level fd, but fd.read().
    :return: the data that was read (may be shorter than size, empty at EOF)
    """
    if fh < 0:
        return fd.read(size)
    data = os.read(fh, size)
    # an empty read has nothing to drop from the cache - and a length of 0
    # would mean "everything after offset" to posix_fadvise, which is not
    # what we want to say here.
    if data and hasattr(os, 'posix_fadvise'):
        # UNIX only and, in case of block sizes that are not a multiple of the
        # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
        # see comment/workaround in _chunker.c and borgbackup issue #907.
        os.posix_fadvise(fh, offset, len(data), os.POSIX_FADV_DONTNEED)
    return data
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def dseek(amount, whence, fd=None, fh=-1):
    """
    seek using the OS-level file handle if given, else using the Python file object.

    :param amount: offset, interpreted according to whence
    :param whence: one of the os.SEEK_* constants
    :param fd: Python file object, only used if fh is negative
    :param fh: OS-level file handle, a negative value (default: -1) means
               not to use the OS-level fd, but fd.seek().
    :return: the position as returned by os.lseek() / fd.seek()
    """
    if fh >= 0:
        return os.lseek(fh, amount, whence)
    return fd.seek(amount, whence)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def dpos_curr_end(fd=None, fh=-1):
    """
    determine current position, file end position (== file length)

    the end is found by seeking to it, afterwards the file position is set
    back to the current position again.

    :param fd: Python file object, only used if fh is negative
    :param fh: OS-level file handle, defaults to -1 which means to use fd
    :return: (curr, end) tuple
    """
    curr = dseek(0, os.SEEK_CUR, fd, fh)
    end = dseek(0, os.SEEK_END, fd, fh)
    # undo the seek to the end
    dseek(curr, os.SEEK_SET, fd, fh)
    return curr, end
|
|
|
|
|
|
- This is a very simple chunker for input data with known block/record sizes:
|
|
|
|
|
|
+
|
|
|
|
def sparsemap(fd=None, fh=-1):
    """
    generator yielding a (start, length, is_data) tuple for each range.
    is_data is indicating data ranges (True) or hole ranges (False).

    note:
    the map is generated starting from the current seek position (it
    is not required to be 0 / to be at the start of the file) and
    work from there up to the end of the file.
    when the generator is finished, the file pointer position will be
    reset to where it was before calling this function.

    :param fd: Python file object, only used if fh is negative
    :param fh: OS-level file handle, defaults to -1 which means to use fd
    :raises OSError: any seek error other than ENXIO is passed on to the caller
    """
    curr, file_len = dpos_curr_end(fd, fh)
    start = curr
    try:
        # we alternate between looking for the next hole (which ends a data
        # range) and looking for the next data (which ends a hole range).
        whence = os.SEEK_HOLE
        while True:
            is_data = whence == os.SEEK_HOLE  # True: range with data, False: range is a hole
            try:
                end = dseek(start, whence, fd, fh)
            except OSError as e:
                if e.errno != errno.ENXIO:
                    raise
                if not is_data and start < file_len:
                    # if there is a hole at the end of a file, we can not find the file end by SEEK_DATA
                    # (because we run into ENXIO), thus we must manually deal with this case:
                    end = file_len
                    yield (start, end - start, is_data)
                # ENXIO: there is nothing to find after start any more, we are done.
                break
            # we do not want to yield zero-length ranges with start == end:
            if end > start:
                yield (start, end - start, is_data)
            start = end
            whence = os.SEEK_DATA if is_data else os.SEEK_HOLE
    finally:
        # seek to same position as before calling this function
        dseek(curr, os.SEEK_SET, fd, fh)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class ChunkerFixed:
    """
    This is a simple chunker for input data with data usually staying at same
    offset and / or with known block/record sizes:

    - raw disk images
    - block devices
    - database files with simple header + fixed-size records layout

    It optionally supports:

    - a header block of different size
    - using a sparsemap to only read data ranges and seek over hole ranges
      for sparse files.
    - using an externally given filemap to only read specific ranges from
      a file.

    Note: the last block of a data or hole range may be less than the block size,
          this is supported and not considered to be an error.
    """
    def __init__(self, block_size, header_size=0, sparse=False):
        """
        :param block_size: size of the blocks the input gets cut into
        :param header_size: size of the header block, 0 (default) means no header
        :param sparse: whether to try sparse input processing
        """
        self.block_size = block_size
        self.header_size = header_size
        # should borg try to do sparse input processing?
        # whether it actually can be done depends on the input file being seekable.
        self.try_sparse = sparse and has_seek_hole
        # one block of zero bytes, hole ranges are served as slices of it.
        self.zeros = memoryview(bytes(block_size))

    def chunkify(self, fd=None, fh=-1, fmap=None):
        """
        Cut a file into chunks.

        This is a generator yielding one chunk per block: data that was read
        for data ranges, a slice of zero bytes for hole ranges.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        :param fmap: a file map, same format as generated by sparsemap
        """
        if fmap is None and self.try_sparse:
            try:
                if self.header_size > 0:
                    header_map = [(0, self.header_size, True), ]
                    # map the body starting right behind the header, so the
                    # body blocks are aligned to the end of the header.
                    dseek(self.header_size, os.SEEK_SET, fd, fh)
                    body_map = list(sparsemap(fd, fh))
                    dseek(0, os.SEEK_SET, fd, fh)
                else:
                    header_map = []
                    body_map = list(sparsemap(fd, fh))
            except OSError:
                # seeking did not work, fmap stays None and we use the
                # fallback below.
                pass
            else:
                fmap = header_map + body_map

        if fmap is None:
            # either sparse processing (building the fmap) was not tried or it failed.
            # in these cases, we just build a "fake fmap" that considers the whole file
            # as range(s) of data (no holes), so we can use the same code.
            # we build different fmaps here for the purpose of correct block alignment
            # with or without a header block (of potentially different size).
            # 2 ** 62 is just "more than any file has", reading ends at EOF.
            if self.header_size > 0:
                header_map = [(0, self.header_size, True), ]
                body_map = [(self.header_size, 2 ** 62, True), ]
            else:
                header_map = []
                body_map = [(0, 2 ** 62, True), ]
            fmap = header_map + body_map

        offset = 0
        for range_start, range_size, is_data in fmap:
            if range_start != offset:
                # this is for the case when the fmap does not cover the file completely,
                # e.g. it could be without the ranges of holes or of unchanged data.
                offset = range_start
                dseek(offset, os.SEEK_SET, fd, fh)
            while range_size:
                wanted = min(range_size, self.block_size)
                if is_data:
                    # read block from the range
                    data = dread(offset, wanted, fd, fh)
                else:  # hole
                    # seek over block from the range
                    pos = dseek(wanted, os.SEEK_CUR, fd, fh)
                    data = self.zeros[:pos - offset]  # for now, create zero-bytes here
                got = len(data)
                if got > 0:
                    offset += got
                    range_size -= got
                    yield data  # later, use a better api that tags data vs. hole
                if got < wanted:
                    # we did not get enough data, looks like EOF.
                    return
|
|
|
|
|
|
|
|
|
|
cdef class Chunker:
|
|
cdef class Chunker:
|
|
@@ -129,7 +241,8 @@ def get_chunker(algo, *params, **kw):
|
|
seed = kw['seed']
|
|
seed = kw['seed']
|
|
return Chunker(seed, *params)
|
|
return Chunker(seed, *params)
|
|
if algo == 'fixed':
|
|
if algo == 'fixed':
|
|
- return ChunkerFixed(*params)
|
|
|
|
|
|
+ sparse = kw['sparse']
|
|
|
|
+ return ChunkerFixed(*params, sparse=sparse)
|
|
raise TypeError('unsupported chunker algo %r' % algo)
|
|
raise TypeError('unsupported chunker algo %r' % algo)
|
|
|
|
|
|
|
|
|