# chunker_pytest_test.py

from io import BytesIO
import os
import tempfile

import pytest

from .chunker_test import cf
from ..chunker import Chunker, ChunkerFixed, sparsemap, has_seek_hole, ChunkerFailing, FileReader, Chunk
from ..constants import *  # NOQA

BS = 4096  # fs block size

# some sparse files. X = content blocks, _ = sparse blocks.
# X__XXX____
map_sparse1 = [(0 * BS, 1 * BS, True), (1 * BS, 2 * BS, False), (3 * BS, 3 * BS, True), (6 * BS, 4 * BS, False)]

# _XX___XXXX
map_sparse2 = [(0 * BS, 1 * BS, False), (1 * BS, 2 * BS, True), (3 * BS, 3 * BS, False), (6 * BS, 4 * BS, True)]

# XXX
map_notsparse = [(0 * BS, 3 * BS, True)]

# ___
map_onlysparse = [(0 * BS, 3 * BS, False)]
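
# Each map entry is an (offset, size, is_data) tuple: is_data=True ranges get filled
# with b"X", is_data=False ranges are left as holes. map_sparse1, for example,
# describes the X__XXX____ layout drawn above: 1 data block, a 2-block hole,
# 3 data blocks, and a trailing 4-block hole.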


def make_sparsefile(fname, sparsemap, header_size=0):
    with open(fname, "wb") as fd:
        total = 0
        if header_size:
            fd.write(b"H" * header_size)
            total += header_size
        for offset, size, is_data in sparsemap:
            if is_data:
                fd.write(b"X" * size)
            else:
                fd.seek(size, os.SEEK_CUR)
            total += size
        fd.truncate(total)
    assert os.path.getsize(fname) == total
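
# Note: seeking past the written data and then truncating, as above, leaves the
# skipped ranges unwritten; on filesystems with sparse file support those ranges
# become holes that read back as zeros.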


def make_content(sparsemap, header_size=0):
    result = []
    total = 0
    if header_size:
        result.append(b"H" * header_size)
        total += header_size
    for offset, size, is_data in sparsemap:
        if is_data:
            result.append(b"X" * size)  # bytes!
        else:
            result.append(size)  # int!
        total += size
    return result
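
# For illustration (derived from the maps above): make_content(map_notsparse) is
# [b"X" * 3 * BS] and make_content(map_onlysparse) is [3 * BS] -- data ranges become
# bytes, hole ranges become plain int lengths, matching the normalized form that the
# cf() helper produces from chunker output.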


def fs_supports_sparse():
    if not has_seek_hole:
        return False
    with tempfile.TemporaryDirectory() as tmpdir:
        fn = os.path.join(tmpdir, "test_sparse")
        make_sparsefile(fn, [(0, BS, False), (BS, BS, True)])
        with open(fn, "rb") as f:
            try:
                offset_hole = f.seek(0, os.SEEK_HOLE)
                offset_data = f.seek(0, os.SEEK_DATA)
            except OSError:
                # no sparse support if these seeks do not work
                return False
        return offset_hole == 0 and offset_data == BS
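
# Background: os.SEEK_HOLE / os.SEEK_DATA are lseek() extensions that return the
# offset of the next hole / data region at or after the given offset. For the
# hole-then-data probe file above, that is offset 0 for the hole and BS for the
# data, which is exactly what the check expects.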


@pytest.mark.skipif(not fs_supports_sparse(), reason="fs does not support sparse files")
@pytest.mark.parametrize(
    "fname, sparse_map",
    [("sparse1", map_sparse1), ("sparse2", map_sparse2), ("onlysparse", map_onlysparse), ("notsparse", map_notsparse)],
)
def test_sparsemap(tmpdir, fname, sparse_map):
    def get_sparsemap_fh(fname):
        fh = os.open(fname, flags=os.O_RDONLY)
        try:
            return list(sparsemap(fh=fh))
        finally:
            os.close(fh)

    def get_sparsemap_fd(fname):
        with open(fname, "rb") as fd:
            return list(sparsemap(fd=fd))

    fn = str(tmpdir / fname)
    make_sparsefile(fn, sparse_map)
    assert get_sparsemap_fh(fn) == sparse_map
    assert get_sparsemap_fd(fn) == sparse_map


@pytest.mark.skipif(not fs_supports_sparse(), reason="fs does not support sparse files")
@pytest.mark.parametrize(
    "fname, sparse_map, header_size, sparse",
    [
        ("sparse1", map_sparse1, 0, False),
        ("sparse1", map_sparse1, 0, True),
        ("sparse1", map_sparse1, BS, False),
        ("sparse1", map_sparse1, BS, True),
        ("sparse2", map_sparse2, 0, False),
        ("sparse2", map_sparse2, 0, True),
        ("sparse2", map_sparse2, BS, False),
        ("sparse2", map_sparse2, BS, True),
        ("onlysparse", map_onlysparse, 0, False),
        ("onlysparse", map_onlysparse, 0, True),
        ("onlysparse", map_onlysparse, BS, False),
        ("onlysparse", map_onlysparse, BS, True),
        ("notsparse", map_notsparse, 0, False),
        ("notsparse", map_notsparse, 0, True),
        ("notsparse", map_notsparse, BS, False),
        ("notsparse", map_notsparse, BS, True),
    ],
)
def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
    def get_chunks(fname, sparse, header_size):
        chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
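        # header_size > 0 makes the chunker treat the first header_size bytes as a
        # separate header chunk before fixed-size chunking of the rest begins (this
        # is why make_content() prepends b"H" * header_size as its own list element).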
        with open(fname, "rb") as fd:
            return cf(chunker.chunkify(fd))

    fn = str(tmpdir / fname)
    make_sparsefile(fn, sparse_map, header_size=header_size)
    assert get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)


def test_chunker_failing():
    SIZE = 4096
    data = bytes(2 * SIZE + 1000)
    chunker = ChunkerFailing(SIZE, "rEErrr")  # cut <SIZE> chunks, start failing at block 1, fail 2 times
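    # The map string is consumed one character per block read: "r" = successful
    # read, "E" = raise an I/O error. The position in the map persists across
    # chunkify() calls, which is why the fresh readers below keep stepping
    # through "rEErrr" (blocks 0..5).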
    with BytesIO(data) as fd:
        ch = chunker.chunkify(fd)
        c1 = next(ch)  # block 0: ok
        assert c1.meta["allocation"] == CH_DATA
        assert c1.data == data[:SIZE]
        with pytest.raises(OSError):  # block 1: failure 1
            next(ch)
    with BytesIO(data) as fd:
        ch = chunker.chunkify(fd)
        with pytest.raises(OSError):  # block 2: failure 2
            next(ch)
    with BytesIO(data) as fd:
        ch = chunker.chunkify(fd)
        c1 = next(ch)  # block 3: success!
        c2 = next(ch)  # block 4: success!
        c3 = next(ch)  # block 5: success!
        assert c1.meta["allocation"] == c2.meta["allocation"] == c3.meta["allocation"] == CH_DATA
        assert c1.data == data[:SIZE]
        assert c2.data == data[SIZE : 2 * SIZE]
        assert c3.data == data[2 * SIZE :]


def test_buzhash_chunksize_distribution():
    data = os.urandom(1048576)
    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
    chunker = Chunker(0, min_exp, max_exp, mask, 4095)
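    # Chunker args, as used here: seed, min/max chunk size exponents, the number of
    # hash mask bits (mask=14 means a cut triggers on average every 2**14 = 16 KiB),
    # and the buzhash window size (4095 bytes).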
    f = BytesIO(data)
    chunks = cf(chunker.chunkify(f))
    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
    chunk_sizes = [len(chunk) for chunk in chunks]
    chunks_count = len(chunks)
    min_chunksize_observed = min(chunk_sizes)
    max_chunksize_observed = max(chunk_sizes)
    min_count = sum(int(size == 2**min_exp) for size in chunk_sizes)
    max_count = sum(int(size == 2**max_exp) for size in chunk_sizes)
    print(
        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
        f"min count: {min_count} max count: {max_count}"
    )
    # usually there will be about 64 chunks (1 MiB of data / 16 kiB target size)
    assert 32 < chunks_count < 128
    # chunks must always be between min and max size (clipping must work):
    assert min_chunksize_observed >= 2**min_exp
    assert max_chunksize_observed <= 2**max_exp
    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
    assert min_count < 10
    assert max_count < 10


@pytest.mark.parametrize(
    "file_content, read_size, expected_data, expected_allocation, expected_size",
    [
        # Empty file
        (b"", 1024, b"", CH_DATA, 0),
        # Small data
        (b"data", 1024, b"data", CH_DATA, 4),
        # More data than read_size
        (b"data", 2, b"da", CH_DATA, 2),
    ],
)
def test_filereader_read_simple(file_content, read_size, expected_data, expected_allocation, expected_size):
    """Test read with different file contents."""
    reader = FileReader(fd=BytesIO(file_content), fh=-1, read_size=1024, sparse=False, fmap=None)
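    # fh=-1 means no OS-level file descriptor is available, so FileReader reads from
    # the Python file object passed as fd; fmap=None means no precomputed file map
    # is supplied and the reader determines the ranges itself.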
    chunk = reader.read(read_size)
    assert chunk.data == expected_data
    assert chunk.meta["allocation"] == expected_allocation
    assert chunk.meta["size"] == expected_size


@pytest.mark.parametrize(
    "file_content, read_sizes, expected_results",
    [
        # Partial data read
        (
            b"data1234",
            [4, 4],
            [{"data": b"data", "allocation": CH_DATA, "size": 4}, {"data": b"1234", "allocation": CH_DATA, "size": 4}],
        ),
        # Multiple calls with EOF
        (
            b"0123456789",
            [4, 4, 4, 4],
            [
                {"data": b"0123", "allocation": CH_DATA, "size": 4},
                {"data": b"4567", "allocation": CH_DATA, "size": 4},
                {"data": b"89", "allocation": CH_DATA, "size": 2},
                {"data": b"", "allocation": CH_DATA, "size": 0},
            ],
        ),
    ],
)
def test_filereader_read_multiple(file_content, read_sizes, expected_results):
    """Test multiple read calls with different file contents."""
    reader = FileReader(fd=BytesIO(file_content), fh=-1, read_size=1024, sparse=False, fmap=None)
    for i, read_size in enumerate(read_sizes):
        chunk = reader.read(read_size)
        assert chunk.data == expected_results[i]["data"]
        assert chunk.meta["allocation"] == expected_results[i]["allocation"]
        assert chunk.meta["size"] == expected_results[i]["size"]


@pytest.mark.parametrize(
    "mock_chunks, read_size, expected_data, expected_allocation, expected_size",
    [
        # Multiple chunks with mixed types
        (
            [
                Chunk(b"chunk1", size=6, allocation=CH_DATA),
                Chunk(None, size=4, allocation=CH_HOLE),
                Chunk(b"chunk2", size=6, allocation=CH_DATA),
            ],
            16,
            b"chunk1" + b"\0" * 4 + b"chunk2",
            CH_DATA,
            16,
        ),
        # Mixed allocation types (hole and alloc)
        ([Chunk(None, size=4, allocation=CH_HOLE), Chunk(None, size=4, allocation=CH_ALLOC)], 8, None, CH_HOLE, 8),
        # All alloc chunks
        ([Chunk(None, size=4, allocation=CH_ALLOC), Chunk(None, size=4, allocation=CH_ALLOC)], 8, None, CH_ALLOC, 8),
        # All hole chunks
        ([Chunk(None, size=4, allocation=CH_HOLE), Chunk(None, size=4, allocation=CH_HOLE)], 8, None, CH_HOLE, 8),
    ],
)
def test_filereader_read_with_mock(mock_chunks, read_size, expected_data, expected_allocation, expected_size):
    """Test read with a mock FileFMAPReader."""

    # Create a mock FileFMAPReader that yields specific chunks
    class MockFileFMAPReader:
        def __init__(self, chunks):
            self.chunks = chunks
            self.index = 0
            # Add required attributes to satisfy FileReader
            self.reading_time = 0.0

        def blockify(self):
            for chunk in self.chunks:
                yield chunk

    # Create a FileReader with a dummy BytesIO to satisfy the assertion
    reader = FileReader(fd=BytesIO(b""), fh=-1, read_size=1024, sparse=False, fmap=None)
    # Replace the reader with our mock
    reader.reader = MockFileFMAPReader(mock_chunks)
    reader.blockify_gen = reader.reader.blockify()

    # Read all chunks at once
    chunk = reader.read(read_size)

    # Check the result
    assert chunk.data == expected_data
    assert chunk.meta["allocation"] == expected_allocation
    assert chunk.meta["size"] == expected_size