123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- # -*- coding: utf-8 -*-
- API_VERSION = '1.1_03'
- import os
- from libc.stdlib cimport free
# C-level API of the chunker, pulled in directly from the C source file "_chunker.c".
cdef extern from "_chunker.c":
    # NOTE(review): declared as plain int here; Cython's extern ctypedefs are documented to
    # need only the general kind of the type, the real definition comes from the C side -
    # confirm that the C code uses the standard unsigned 32-bit uint32_t.
    ctypedef int uint32_t

    # Opaque handle: the C struct is named "Chunker", it is exposed to Cython as _Chunker
    # so the name Chunker stays free for the extension class defined in this module.
    ctypedef struct _Chunker "Chunker":
        pass

    # Lifecycle and processing of a chunker handle (used by the Chunker extension class).
    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
    void chunker_set_fd(_Chunker *chunker, object f, int fd)
    void chunker_free(_Chunker *chunker)
    object chunker_process(_Chunker *chunker)

    # Returns a table that the callers in this module release with free().
    uint32_t *buzhash_init_table(uint32_t seed)

    # The C functions "buzhash" / "buzhash_update" are bound to c_-prefixed names, because
    # this module defines Python-level functions called buzhash / buzhash_update itself.
    uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
    uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
class ChunkerFixed:
    """
    Fixed blocksize Chunker, optionally supporting a header block of different size.

    This is a very simple chunker for input data with known block/record sizes:

    - raw disk images
    - block devices
    - database files with simple header + fixed-size records layout

    Note: the last block of the input data may be less than the block size,
          this is supported and not considered to be an error.
    """
    def __init__(self, block_size, header_size=0):
        """
        :param block_size: size (in bytes) requested for every regular chunk
        :param header_size: size (in bytes) of an optional first chunk,
                            0 (the default) means there is no header chunk.
        """
        self.block_size = block_size
        self.header_size = header_size

    def chunkify(self, fd, fh=-1):
        """
        Cut a file into chunks.

        This is a generator: it yields the (optional) header chunk first and then
        one chunk per block until a read returns no data (EOF).

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        """
        if fh >= 0:
            # platform capability does not change while we read, so check it only once.
            can_fadvise = hasattr(os, 'posix_fadvise')
            # amount of data consumed so far, needed to tell posix_fadvise which range we are done with.
            offset = 0

            def read(size):
                nonlocal offset
                data = os.read(fh, size)
                amount = len(data)
                # skip the call for an empty read (EOF): a length of 0 does not mean "nothing"
                # for posix_fadvise, but "everything from offset to the end of the file".
                if amount and can_fadvise:
                    # UNIX only and, in case of block sizes that are not a multiple of the
                    # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
                    # see comment/workaround in _chunker.c and borgbackup issue #907.
                    os.posix_fadvise(fh, offset, amount, os.POSIX_FADV_DONTNEED)
                offset += amount
                return data
        else:
            # no OS-level handle: read via the Python file object, no offset bookkeeping needed.
            read = fd.read

        if self.header_size > 0:
            header = read(self.header_size)
            if not header:
                # empty input: there is neither a header nor any block.
                return
            yield header

        while True:
            block = read(self.block_size)
            if not block:
                # empty data means we are at EOF and we terminate the generator.
                return
            yield block
cdef class Chunker:
    """
    Content-Defined Chunker, variable chunk sizes.

    The cutting places are derived from the content itself, so chunks with the same
    content are mostly cut the same way even if that content moved to another offset
    inside the file. A buzhash rolling hash is computed over the contents of a moving
    window; a chunk is cut when the last n bits of that hash are 0. On top of that,
    a minimum and a maximum chunk size are obeyed.

    A per-repo random seed is used to avoid some chunk length fingerprinting attacks.
    """
    # handle of the C-level chunker, created in __cinit__ and released in __dealloc__
    cdef _Chunker *chunker

    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
        # sizes are given as exponents of 2
        smallest = 1 << chunk_min_exp
        largest = 1 << chunk_max_exp
        # see chunker_process, first while loop condition, first term must be able to get True:
        assert hash_window_size + smallest + 1 <= largest, "too small max_size"
        # mask with the lowest hash_mask_bits bits set
        mask = (1 << hash_mask_bits) - 1
        # the seed is reduced to its lowest 32 bits before it is handed to the C code
        self.chunker = chunker_init(hash_window_size, mask, smallest, largest, seed & 0xffffffff)

    def __dealloc__(self):
        # only release a handle that was actually created
        if self.chunker != NULL:
            chunker_free(self.chunker)

    def chunkify(self, fd, fh=-1):
        """
        Cut a file into chunks.

        Attaches the given file to the C-level chunker and returns this object,
        which is then iterated to get the chunks.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        """
        chunker_set_fd(self.chunker, fd, fh)
        return self

    def __iter__(self):
        # the chunker is its own iterator
        return self

    def __next__(self):
        # hand back whatever the C code produces for the next step
        # (presumably it signals the end of the input by raising - not visible from here).
        return chunker_process(self.chunker)
def get_chunker(algo, *params, **kw):
    """
    Create a chunker object for the given chunking algorithm.

    :param algo: 'buzhash' (gives a Chunker) or 'fixed' (gives a ChunkerFixed)
    :param params: positional arguments handed to the chunker's constructor
    :param kw: must contain 'seed' for 'buzhash' (passed as first constructor argument)
    :raises TypeError: if algo is not a supported algorithm
    """
    if algo == 'buzhash':
        chunker = Chunker(kw['seed'], *params)
    elif algo == 'fixed':
        chunker = ChunkerFixed(*params)
    else:
        raise TypeError('unsupported chunker algo %r' % algo)
    return chunker
def max_chunk_size(algo, *params):
    """
    Return the size of the biggest chunk a chunker with these parameters can produce.

    :param algo: 'buzhash' or 'fixed'
    :param params: the chunker parameters following algo:
                   for 'buzhash', the second one is the exponent (of 2) of the maximum chunk size;
                   for 'fixed', block_size and (optionally) header_size.
    :raises TypeError: if algo is not a supported algorithm
    """
    # see also parseformat.ChunkerParams return values
    if algo == 'buzhash':
        return 1 << params[1]
    if algo == 'fixed':
        block_size = params[0]
        # header_size is optional for ChunkerFixed (defaults to 0), so accept it being absent here, too.
        header_size = params[1] if len(params) > 1 else 0
        return max(block_size, header_size)
    raise TypeError('unsupported chunker algo %r' % algo)
def buzhash(data, unsigned long seed):
    """
    Compute the buzhash of data, using a hash table initialized from seed.

    :param data: the data to hash (handed to the C code as an unsigned char pointer plus len(data))
    :param seed: table seed, only its lowest 32 bits are used
    :return: the hash value as computed by the C function buzhash
    """
    cdef uint32_t *table = buzhash_init_table(seed & 0xffffffff)
    cdef uint32_t digest
    try:
        # len(data) and the conversion of data to a C pointer can raise for unsuitable
        # objects - the table must be released in that case as well.
        digest = c_buzhash(<const unsigned char *> data, len(data), table)
    finally:
        free(table)
    return digest
def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
    """
    Update a buzhash value via the C function buzhash_update.

    All arguments except seed are passed through unchanged to the C function,
    together with a hash table that is initialized from seed for this call.

    :param sum: current hash value
    :param remove: byte value handed to the C code as "remove"
    :param add: byte value handed to the C code as "add"
    :param len: length value handed to the C code
    :param seed: table seed, only its lowest 32 bits are used
    :return: the hash value returned by the C code
    """
    cdef uint32_t *table = buzhash_init_table(seed & 0xffffffff)
    cdef uint32_t rolled = c_buzhash_update(sum, remove, add, len, table)
    # the table only lives for the duration of this call
    free(table)
    return rolled
|