chunker.pyx 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. # -*- coding: utf-8 -*-
  2. API_VERSION = '1.1_03'
  3. import os
  4. from libc.stdlib cimport free
  5. cdef extern from "_chunker.c":
  6. ctypedef int uint32_t
  7. ctypedef struct _Chunker "Chunker":
  8. pass
  9. _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
  10. void chunker_set_fd(_Chunker *chunker, object f, int fd)
  11. void chunker_free(_Chunker *chunker)
  12. object chunker_process(_Chunker *chunker)
  13. uint32_t *buzhash_init_table(uint32_t seed)
  14. uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
  15. uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
  16. class ChunkerFixed:
  17. """
  18. Fixed blocksize Chunker, optionally supporting a header block of different size.
  19. This is a very simple chunker for input data with known block/record sizes:
  20. - raw disk images
  21. - block devices
  22. - database files with simple header + fixed-size records layout
  23. Note: the last block of the input data may be less than the block size,
  24. this is supported and not considered to be an error.
  25. """
  26. def __init__(self, block_size, header_size=0):
  27. self.block_size = block_size
  28. self.header_size = header_size
  29. def chunkify(self, fd, fh=-1):
  30. """
  31. Cut a file into chunks.
  32. :param fd: Python file object
  33. :param fh: OS-level file handle (if available),
  34. defaults to -1 which means not to use OS-level fd.
  35. """
  36. offset = 0
  37. use_fh = fh >= 0
  38. if use_fh:
  39. def read(size):
  40. nonlocal offset
  41. data = os.read(fh, size)
  42. amount = len(data)
  43. if hasattr(os, 'posix_fadvise'):
  44. # UNIX only and, in case of block sizes that are not a multiple of the
  45. # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
  46. # see comment/workaround in _chunker.c and borgbackup issue #907.
  47. os.posix_fadvise(fh, offset, amount, os.POSIX_FADV_DONTNEED)
  48. offset += amount
  49. return data
  50. else:
  51. def read(size):
  52. nonlocal offset
  53. data = fd.read(size)
  54. amount = len(data)
  55. offset += amount
  56. return data
  57. if self.header_size > 0:
  58. data = read(self.header_size)
  59. if data:
  60. yield data
  61. else:
  62. data = True # get into next while loop
  63. while data:
  64. data = read(self.block_size)
  65. if data:
  66. yield data
  67. # empty data means we are at EOF and we terminate the generator.
  68. cdef class Chunker:
  69. """
  70. Content-Defined Chunker, variable chunk sizes.
  71. This chunker does quite some effort to mostly cut the same-content chunks, even if
  72. the content moves to a different offset inside the file. It uses the buzhash
  73. rolling-hash algorithm to identify the chunk cutting places by looking at the
  74. content inside the moving window and computing the rolling hash value over the
  75. window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
  76. Additionally it obeys some more criteria, like a minimum and maximum chunk size.
  77. It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
  78. """
  79. cdef _Chunker *chunker
  80. def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
  81. min_size = 1 << chunk_min_exp
  82. max_size = 1 << chunk_max_exp
  83. # see chunker_process, first while loop condition, first term must be able to get True:
  84. assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
  85. hash_mask = (1 << hash_mask_bits) - 1
  86. self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
  87. def chunkify(self, fd, fh=-1):
  88. """
  89. Cut a file into chunks.
  90. :param fd: Python file object
  91. :param fh: OS-level file handle (if available),
  92. defaults to -1 which means not to use OS-level fd.
  93. """
  94. chunker_set_fd(self.chunker, fd, fh)
  95. return self
  96. def __dealloc__(self):
  97. if self.chunker:
  98. chunker_free(self.chunker)
  99. def __iter__(self):
  100. return self
  101. def __next__(self):
  102. return chunker_process(self.chunker)
  103. def get_chunker(algo, *params, **kw):
  104. if algo == 'buzhash':
  105. seed = kw['seed']
  106. return Chunker(seed, *params)
  107. if algo == 'fixed':
  108. return ChunkerFixed(*params)
  109. raise TypeError('unsupported chunker algo %r' % algo)
  110. def max_chunk_size(algo, *params):
  111. # see also parseformat.ChunkerParams return values
  112. if algo == 'buzhash':
  113. return 1 << params[1]
  114. if algo == 'fixed':
  115. return max(params[0], params[1])
  116. raise TypeError('unsupported chunker algo %r' % algo)
  117. def buzhash(data, unsigned long seed):
  118. cdef uint32_t *table
  119. cdef uint32_t sum
  120. table = buzhash_init_table(seed & 0xffffffff)
  121. sum = c_buzhash(<const unsigned char *> data, len(data), table)
  122. free(table)
  123. return sum
  124. def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
  125. cdef uint32_t *table
  126. table = buzhash_init_table(seed & 0xffffffff)
  127. sum = c_buzhash_update(sum, remove, add, len, table)
  128. free(table)
  129. return sum