hashindex.py

import base64
import hashlib
import os
import tempfile
import zlib

from ..hashindex import NSIndex, ChunkIndex
from .. import hashindex
from . import BaseTestCase

# Note: these tests are part of the self test, do not use or import py.test functionality here.
# See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT


def H(x):
    # make some 32byte long thing that depends on x
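    # e.g. H(1) == b'00000000000000000000000000000001' (32 ASCII digits)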
    return bytes('%-0.32d' % x, 'ascii')


class HashIndexTestCase(BaseTestCase):
    def _generic_test(self, cls, make_value, sha):
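        # Generic set/update/delete/serialize round-trip for the given index class.
        # `sha` is the expected SHA-256 hex digest of the written index file and
        # pins the on-disk format (see the "Verify file contents" check below).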
        idx = cls()
        self.assert_equal(len(idx), 0)
        # Test set
        for x in range(100):
            idx[bytes('%-32d' % x, 'ascii')] = make_value(x)
        self.assert_equal(len(idx), 100)
        for x in range(100):
            self.assert_equal(idx[bytes('%-32d' % x, 'ascii')], make_value(x))
        # Test update
        for x in range(100):
            idx[bytes('%-32d' % x, 'ascii')] = make_value(x * 2)
        self.assert_equal(len(idx), 100)
        for x in range(100):
            self.assert_equal(idx[bytes('%-32d' % x, 'ascii')], make_value(x * 2))
        # Test delete
        for x in range(50):
            del idx[bytes('%-32d' % x, 'ascii')]
        self.assert_equal(len(idx), 50)
        idx_name = tempfile.NamedTemporaryFile()
        idx.write(idx_name.name)
        del idx
        # Verify file contents
        with open(idx_name.name, 'rb') as fd:
            self.assert_equal(hashlib.sha256(fd.read()).hexdigest(), sha)
        # Make sure we can open the file
        idx = cls.read(idx_name.name)
        self.assert_equal(len(idx), 50)
        for x in range(50, 100):
            self.assert_equal(idx[bytes('%-32d' % x, 'ascii')], make_value(x * 2))
        idx.clear()
        self.assert_equal(len(idx), 0)
        idx.write(idx_name.name)
        del idx
        self.assert_equal(len(cls.read(idx_name.name)), 0)

    def test_nsindex(self):
        self._generic_test(NSIndex, lambda x: (x, x),
                           '80fba5b40f8cf12f1486f1ba33c9d852fb2b41a5b5961d3b9d1228cf2aa9c4c9')

    def test_chunkindex(self):
        self._generic_test(ChunkIndex, lambda x: (x, x, x),
                           '1d71865e72e3c3af18d3c7216b6fa7b014695eaa3ed7f14cf9cd02fba75d1c95')

    def test_resize(self):
        n = 2000  # Must be >= MIN_BUCKETS
        idx_name = tempfile.NamedTemporaryFile()
        idx = NSIndex()
        idx.write(idx_name.name)
        initial_size = os.path.getsize(idx_name.name)
        self.assert_equal(len(idx), 0)
        for x in range(n):
            idx[bytes('%-32d' % x, 'ascii')] = x, x
        idx.write(idx_name.name)
        self.assert_true(initial_size < os.path.getsize(idx_name.name))
        for x in range(n):
            del idx[bytes('%-32d' % x, 'ascii')]
        self.assert_equal(len(idx), 0)
        idx.write(idx_name.name)
        self.assert_equal(initial_size, os.path.getsize(idx_name.name))

    def test_iteritems(self):
        idx = NSIndex()
        for x in range(100):
            idx[bytes('%-0.32d' % x, 'ascii')] = x, x
        all = list(idx.iteritems())
        self.assert_equal(len(all), 100)
        second_half = list(idx.iteritems(marker=all[49][0]))
        self.assert_equal(len(second_half), 50)
        self.assert_equal(second_half, all[50:])

    def test_chunkindex_merge(self):
        idx1 = ChunkIndex()
        idx1[H(1)] = 1, 100, 100
        idx1[H(2)] = 2, 200, 200
        idx1[H(3)] = 3, 300, 300
        # no H(4) entry
        idx2 = ChunkIndex()
        idx2[H(1)] = 4, 100, 100
        idx2[H(2)] = 5, 200, 200
        # no H(3) entry
        idx2[H(4)] = 6, 400, 400
        idx1.merge(idx2)
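        # merging sums the refcounts of entries present in both indices; size/csize stay unchanged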
        assert idx1[H(1)] == (5, 100, 100)
        assert idx1[H(2)] == (7, 200, 200)
        assert idx1[H(3)] == (3, 300, 300)
        assert idx1[H(4)] == (6, 400, 400)

    def test_chunkindex_summarize(self):
        idx = ChunkIndex()
        idx[H(1)] = 1, 1000, 100
        idx[H(2)] = 2, 2000, 200
        idx[H(3)] = 3, 3000, 300
        size, csize, unique_size, unique_csize, unique_chunks, chunks = idx.summarize()
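        # size/csize/chunks weight each chunk by its refcount, the unique_* values count each chunk once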
        assert size == 1000 + 2 * 2000 + 3 * 3000
        assert csize == 100 + 2 * 200 + 3 * 300
        assert unique_size == 1000 + 2000 + 3000
        assert unique_csize == 100 + 200 + 300
        assert chunks == 1 + 2 + 3
        assert unique_chunks == 3


class HashIndexRefcountingTestCase(BaseTestCase):
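    # ChunkIndex refcounts saturate at hashindex.MAX_VALUE: once an entry is at the
    # limit, further incref/decref/merge operations must leave it there.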
    def test_chunkindex_limit(self):
        idx = ChunkIndex()
        idx[H(1)] = hashindex.MAX_VALUE - 1, 1, 2
        # 5 is arbitrary, any number of incref/decrefs shouldn't move it once it's limited
        for i in range(5):
            # first incref to move it to the limit
            refcount, *_ = idx.incref(H(1))
            assert refcount == hashindex.MAX_VALUE
        for i in range(5):
            refcount, *_ = idx.decref(H(1))
            assert refcount == hashindex.MAX_VALUE

    def _merge(self, refcounta, refcountb):
        def merge(refcount1, refcount2):
            idx1 = ChunkIndex()
            idx1[H(1)] = refcount1, 1, 2
            idx2 = ChunkIndex()
            idx2[H(1)] = refcount2, 1, 2
            idx1.merge(idx2)
            refcount, *_ = idx1[H(1)]
            return refcount

        result = merge(refcounta, refcountb)
        # check for commutativity
        assert result == merge(refcountb, refcounta)
        return result

    def test_chunkindex_merge_limit1(self):
        # Check that it does *not* limit at MAX_VALUE - 1
        # (MAX_VALUE is odd)
        half = hashindex.MAX_VALUE // 2
        assert self._merge(half, half) == hashindex.MAX_VALUE - 1

    def test_chunkindex_merge_limit2(self):
        # 3000000000 + 2000000000 > MAX_VALUE
        assert self._merge(3000000000, 2000000000) == hashindex.MAX_VALUE

    def test_chunkindex_merge_limit3(self):
        # Crossover point: both addition and limit semantics will yield the same result
        half = hashindex.MAX_VALUE // 2
        assert self._merge(half + 1, half) == hashindex.MAX_VALUE

    def test_chunkindex_merge_limit4(self):
        # Beyond crossover, result of addition would be 2**31
        half = hashindex.MAX_VALUE // 2
        assert self._merge(half + 2, half) == hashindex.MAX_VALUE
        assert self._merge(half + 1, half + 1) == hashindex.MAX_VALUE

    def test_chunkindex_add(self):
        idx1 = ChunkIndex()
        idx1.add(H(1), 5, 6, 7)
        assert idx1[H(1)] == (5, 6, 7)
        idx1.add(H(1), 1, 2, 3)
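        # add() sums the refcount (5 + 1) and replaces size/csize with the new values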
        assert idx1[H(1)] == (6, 2, 3)

    def test_incref_limit(self):
        idx1 = ChunkIndex()
        idx1[H(1)] = (hashindex.MAX_VALUE, 6, 7)
        idx1.incref(H(1))
        refcount, *_ = idx1[H(1)]
        assert refcount == hashindex.MAX_VALUE

    def test_decref_limit(self):
        idx1 = ChunkIndex()
        idx1[H(1)] = hashindex.MAX_VALUE, 6, 7
        idx1.decref(H(1))
        refcount, *_ = idx1[H(1)]
        assert refcount == hashindex.MAX_VALUE

    def test_decref_zero(self):
        idx1 = ChunkIndex()
        idx1[H(1)] = 0, 0, 0
        with self.assert_raises(AssertionError):
            idx1.decref(H(1))

    def test_incref_decref(self):
        idx1 = ChunkIndex()
        idx1.add(H(1), 5, 6, 7)
        assert idx1[H(1)] == (5, 6, 7)
        idx1.incref(H(1))
        assert idx1[H(1)] == (6, 6, 7)
        idx1.decref(H(1))
        assert idx1[H(1)] == (5, 6, 7)

    def test_setitem_raises(self):
        idx1 = ChunkIndex()
        with self.assert_raises(AssertionError):
            idx1[H(1)] = hashindex.MAX_VALUE + 1, 0, 0

    def test_keyerror(self):
        idx = ChunkIndex()
        with self.assert_raises(KeyError):
            idx.incref(H(1))
        with self.assert_raises(KeyError):
            idx.decref(H(1))
        with self.assert_raises(KeyError):
            idx[H(1)]
        with self.assert_raises(OverflowError):
            idx.add(H(1), -1, 0, 0)


class HashIndexDataTestCase(BaseTestCase):
    # This bytestring was created with 1.0-maint at c2f9533
    HASHINDEX = b'eJzt0L0NgmAUhtHLT0LDEI6AuAEhMVYmVnSuYefC7AB3Aj9KNedJbnfyFne6P67P27w0EdG1Eac+Cm1ZybAsy7Isy7Isy7Isy7I' \
                b'sy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7LsL9nhc+cqTZ' \
                b'3XlO2Ys++Du5fX+l1/YFmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVn2/+0O2rYccw=='

    def _serialize_hashindex(self, idx):
        with tempfile.TemporaryDirectory() as tempdir:
            file = os.path.join(tempdir, 'idx')
            idx.write(file)
            with open(file, 'rb') as f:
                return self._pack(f.read())

    def _deserialize_hashindex(self, bytestring):
        with tempfile.TemporaryDirectory() as tempdir:
            file = os.path.join(tempdir, 'idx')
            with open(file, 'wb') as f:
                f.write(self._unpack(bytestring))
            return ChunkIndex.read(file)

    def _pack(self, bytestring):
        return base64.b64encode(zlib.compress(bytestring))

    def _unpack(self, bytestring):
        return zlib.decompress(base64.b64decode(bytestring))
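    # _pack/_unpack make the binary index printable (zlib + base64) so a known-good
    # serialization can be kept inline as the HASHINDEX constant above.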
    def test_identical_creation(self):
        idx1 = ChunkIndex()
        idx1[H(1)] = 1, 2, 3
        idx1[H(2)] = 2**31 - 1, 0, 0
        idx1[H(3)] = 4294962296, 0, 0  # 4294962296 is -5000 interpreted as a uint32_t
        assert self._serialize_hashindex(idx1) == self.HASHINDEX

    def test_read_known_good(self):
        idx1 = self._deserialize_hashindex(self.HASHINDEX)
        assert idx1[H(1)] == (1, 2, 3)
        assert idx1[H(2)] == (2**31 - 1, 0, 0)
        assert idx1[H(3)] == (4294962296, 0, 0)

        idx2 = ChunkIndex()
        idx2[H(3)] = 2**32 - 123456, 6, 7
        idx1.merge(idx2)
        assert idx1[H(3)] == (hashindex.MAX_VALUE, 6, 7)


class NSIndexTestCase(BaseTestCase):
    def test_nsindex_segment_limit(self):
        idx = NSIndex()
        with self.assert_raises(AssertionError):
            idx[H(1)] = hashindex.MAX_VALUE + 1, 0
        assert H(1) not in idx
        idx[H(2)] = hashindex.MAX_VALUE, 0
        assert H(2) in idx