hashindex.pyx 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. # -*- coding: utf-8 -*-
  2. import os
  3. cimport cython
  4. from libc.stdint cimport uint32_t, UINT32_MAX, uint64_t
  5. API_VERSION = 3
  6. cdef extern from "_hashindex.c":
  7. ctypedef struct HashIndex:
  8. pass
  9. HashIndex *hashindex_read(char *path)
  10. HashIndex *hashindex_init(int capacity, int key_size, int value_size)
  11. void hashindex_free(HashIndex *index)
  12. int hashindex_get_size(HashIndex *index)
  13. int hashindex_write(HashIndex *index, char *path)
  14. void *hashindex_get(HashIndex *index, void *key)
  15. void *hashindex_next_key(HashIndex *index, void *key)
  16. int hashindex_delete(HashIndex *index, void *key)
  17. int hashindex_set(HashIndex *index, void *key, void *value)
  18. uint32_t _htole32(uint32_t v)
  19. uint32_t _le32toh(uint32_t v)
  20. double HASH_MAX_LOAD
  21. cdef _NoDefault = object()
  22. """
  23. The HashIndex is *not* a general purpose data structure. The value size must be at least 4 bytes, and these
  24. first bytes are used for in-band signalling in the data structure itself.
  25. The constant MAX_VALUE defines the valid range for these 4 bytes when interpreted as an uint32_t from 0
  26. to MAX_VALUE (inclusive). The following reserved values beyond MAX_VALUE are currently in use
  27. (byte order is LE)::
  28. 0xffffffff marks empty entries in the hashtable
  29. 0xfffffffe marks deleted entries in the hashtable
  30. None of the publicly available classes in this module will accept nor return a reserved value;
  31. AssertionError is raised instead.
  32. """
  33. assert UINT32_MAX == 2**32-1
  34. # module-level constant because cdef's in classes can't have default values
  35. cdef uint32_t _MAX_VALUE = 2**32-1025
  36. assert _MAX_VALUE % 2 == 1
  37. @cython.internal
  38. cdef class IndexBase:
  39. cdef HashIndex *index
  40. cdef int key_size
  41. MAX_LOAD_FACTOR = HASH_MAX_LOAD
  42. MAX_VALUE = _MAX_VALUE
  43. def __cinit__(self, capacity=0, path=None, key_size=32):
  44. self.key_size = key_size
  45. if path:
  46. path = os.fsencode(path)
  47. self.index = hashindex_read(path)
  48. if not self.index:
  49. raise Exception('hashindex_read failed')
  50. else:
  51. self.index = hashindex_init(capacity, self.key_size, self.value_size)
  52. if not self.index:
  53. raise Exception('hashindex_init failed')
  54. def __dealloc__(self):
  55. if self.index:
  56. hashindex_free(self.index)
  57. @classmethod
  58. def read(cls, path):
  59. return cls(path=path)
  60. def write(self, path):
  61. path = os.fsencode(path)
  62. if not hashindex_write(self.index, path):
  63. raise Exception('hashindex_write failed')
  64. def clear(self):
  65. hashindex_free(self.index)
  66. self.index = hashindex_init(0, self.key_size, self.value_size)
  67. if not self.index:
  68. raise Exception('hashindex_init failed')
  69. def setdefault(self, key, value):
  70. if not key in self:
  71. self[key] = value
  72. def __delitem__(self, key):
  73. assert len(key) == self.key_size
  74. if not hashindex_delete(self.index, <char *>key):
  75. raise Exception('hashindex_delete failed')
  76. def get(self, key, default=None):
  77. try:
  78. return self[key]
  79. except KeyError:
  80. return default
  81. def pop(self, key, default=_NoDefault):
  82. try:
  83. value = self[key]
  84. del self[key]
  85. return value
  86. except KeyError:
  87. if default != _NoDefault:
  88. return default
  89. raise
  90. def __len__(self):
  91. return hashindex_get_size(self.index)
  92. cdef class NSIndex(IndexBase):
  93. value_size = 8
  94. def __getitem__(self, key):
  95. assert len(key) == self.key_size
  96. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  97. if not data:
  98. raise KeyError(key)
  99. cdef uint32_t segment = _le32toh(data[0])
  100. assert segment <= _MAX_VALUE, "maximum number of segments reached"
  101. return segment, _le32toh(data[1])
  102. def __setitem__(self, key, value):
  103. assert len(key) == self.key_size
  104. cdef uint32_t[2] data
  105. cdef uint32_t segment = value[0]
  106. assert segment <= _MAX_VALUE, "maximum number of segments reached"
  107. data[0] = _htole32(segment)
  108. data[1] = _htole32(value[1])
  109. if not hashindex_set(self.index, <char *>key, data):
  110. raise Exception('hashindex_set failed')
  111. def __contains__(self, key):
  112. cdef uint32_t segment
  113. assert len(key) == self.key_size
  114. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  115. if data != NULL:
  116. segment = _le32toh(data[0])
  117. assert segment <= _MAX_VALUE, "maximum number of segments reached"
  118. return data != NULL
  119. def iteritems(self, marker=None):
  120. cdef const void *key
  121. iter = NSKeyIterator(self.key_size)
  122. iter.idx = self
  123. iter.index = self.index
  124. if marker:
  125. key = hashindex_get(self.index, <char *>marker)
  126. if marker is None:
  127. raise IndexError
  128. iter.key = key - self.key_size
  129. return iter
  130. cdef class NSKeyIterator:
  131. cdef NSIndex idx
  132. cdef HashIndex *index
  133. cdef const void *key
  134. cdef int key_size
  135. cdef int exhausted
  136. def __cinit__(self, key_size):
  137. self.key = NULL
  138. self.key_size = key_size
  139. self.exhausted = 0
  140. def __iter__(self):
  141. return self
  142. def __next__(self):
  143. if self.exhausted:
  144. raise StopIteration
  145. self.key = hashindex_next_key(self.index, <char *>self.key)
  146. if not self.key:
  147. self.exhausted = 1
  148. raise StopIteration
  149. cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
  150. cdef uint32_t segment = _le32toh(value[0])
  151. assert segment <= _MAX_VALUE, "maximum number of segments reached"
  152. return (<char *>self.key)[:self.key_size], (segment, _le32toh(value[1]))
  153. cdef class ChunkIndex(IndexBase):
  154. """
  155. Mapping of 32 byte keys to (refcount, size, csize), which are all 32-bit unsigned.
  156. The reference count cannot overflow. If an overflow would occur, the refcount
  157. is fixed to MAX_VALUE and will neither increase nor decrease by incref(), decref()
  158. or add().
  159. Prior signed 32-bit overflow is handled correctly for most cases: All values
  160. from UINT32_MAX (2**32-1, inclusive) to MAX_VALUE (exclusive) are reserved and either
  161. cause silent data loss (-1, -2) or will raise an AssertionError when accessed.
  162. Other values are handled correctly. Note that previously the refcount could also reach
  163. 0 by *increasing* it.
  164. Assigning refcounts in this reserved range is an invalid operation and raises AssertionError.
  165. """
  166. value_size = 12
  167. def __getitem__(self, key):
  168. assert len(key) == self.key_size
  169. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  170. if not data:
  171. raise KeyError(key)
  172. cdef uint32_t refcount = _le32toh(data[0])
  173. assert refcount <= _MAX_VALUE
  174. return refcount, _le32toh(data[1]), _le32toh(data[2])
  175. def __setitem__(self, key, value):
  176. assert len(key) == self.key_size
  177. cdef uint32_t[3] data
  178. cdef uint32_t refcount = value[0]
  179. assert refcount <= _MAX_VALUE, "invalid reference count"
  180. data[0] = _htole32(refcount)
  181. data[1] = _htole32(value[1])
  182. data[2] = _htole32(value[2])
  183. if not hashindex_set(self.index, <char *>key, data):
  184. raise Exception('hashindex_set failed')
  185. def __contains__(self, key):
  186. assert len(key) == self.key_size
  187. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  188. if data != NULL:
  189. assert data[0] <= _MAX_VALUE
  190. return data != NULL
  191. def incref(self, key):
  192. """Increase refcount for 'key', return (refcount, size, csize)"""
  193. assert len(key) == self.key_size
  194. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  195. if not data:
  196. raise KeyError(key)
  197. cdef uint32_t refcount = _le32toh(data[0])
  198. assert refcount <= _MAX_VALUE, "invalid reference count"
  199. if refcount != _MAX_VALUE:
  200. refcount += 1
  201. data[0] = _htole32(refcount)
  202. return refcount, _le32toh(data[1]), _le32toh(data[2])
  203. def decref(self, key):
  204. """Decrease refcount for 'key', return (refcount, size, csize)"""
  205. assert len(key) == self.key_size
  206. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  207. if not data:
  208. raise KeyError(key)
  209. cdef uint32_t refcount = _le32toh(data[0])
  210. # Never decrease a reference count of zero
  211. assert 0 < refcount <= _MAX_VALUE, "invalid reference count"
  212. if refcount != _MAX_VALUE:
  213. refcount -= 1
  214. data[0] = _htole32(refcount)
  215. return refcount, _le32toh(data[1]), _le32toh(data[2])
  216. def iteritems(self, marker=None):
  217. cdef const void *key
  218. iter = ChunkKeyIterator(self.key_size)
  219. iter.idx = self
  220. iter.index = self.index
  221. if marker:
  222. key = hashindex_get(self.index, <char *>marker)
  223. if marker is None:
  224. raise IndexError
  225. iter.key = key - self.key_size
  226. return iter
  227. def summarize(self):
  228. cdef uint64_t size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0
  229. cdef uint32_t *values
  230. cdef uint32_t refcount
  231. cdef void *key = NULL
  232. while True:
  233. key = hashindex_next_key(self.index, key)
  234. if not key:
  235. break
  236. unique_chunks += 1
  237. values = <uint32_t*> (key + self.key_size)
  238. refcount = _le32toh(values[0])
  239. assert refcount <= _MAX_VALUE, "invalid reference count"
  240. chunks += refcount
  241. unique_size += _le32toh(values[1])
  242. unique_csize += _le32toh(values[2])
  243. size += <uint64_t> _le32toh(values[1]) * _le32toh(values[0])
  244. csize += <uint64_t> _le32toh(values[2]) * _le32toh(values[0])
  245. return size, csize, unique_size, unique_csize, unique_chunks, chunks
  246. def add(self, key, refs, size, csize):
  247. assert len(key) == self.key_size
  248. cdef uint32_t[3] data
  249. data[0] = _htole32(refs)
  250. data[1] = _htole32(size)
  251. data[2] = _htole32(csize)
  252. self._add(<char*> key, data)
  253. cdef _add(self, void *key, uint32_t *data):
  254. cdef uint64_t refcount1, refcount2, result64
  255. values = <uint32_t*> hashindex_get(self.index, key)
  256. if values:
  257. refcount1 = _le32toh(values[0])
  258. refcount2 = _le32toh(data[0])
  259. assert refcount1 <= _MAX_VALUE
  260. assert refcount2 <= _MAX_VALUE
  261. result64 = refcount1 + refcount2
  262. values[0] = _htole32(min(result64, _MAX_VALUE))
  263. else:
  264. if not hashindex_set(self.index, key, data):
  265. raise Exception('hashindex_set failed')
  266. def merge(self, ChunkIndex other):
  267. cdef void *key = NULL
  268. while True:
  269. key = hashindex_next_key(other.index, key)
  270. if not key:
  271. break
  272. self._add(key, <uint32_t*> (key + self.key_size))
  273. cdef class ChunkKeyIterator:
  274. cdef ChunkIndex idx
  275. cdef HashIndex *index
  276. cdef const void *key
  277. cdef int key_size
  278. cdef int exhausted
  279. def __cinit__(self, key_size):
  280. self.key = NULL
  281. self.key_size = key_size
  282. self.exhausted = 0
  283. def __iter__(self):
  284. return self
  285. def __next__(self):
  286. if self.exhausted:
  287. raise StopIteration
  288. self.key = hashindex_next_key(self.index, <char *>self.key)
  289. if not self.key:
  290. self.exhausted = 1
  291. raise StopIteration
  292. cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
  293. cdef uint32_t refcount = _le32toh(value[0])
  294. assert refcount <= _MAX_VALUE, "invalid reference count"
  295. return (<char *>self.key)[:self.key_size], (refcount, _le32toh(value[1]), _le32toh(value[2]))