hashindex.pyx 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. # -*- coding: utf-8 -*-
  2. from collections import namedtuple
  3. import locale
  4. import os
  5. cimport cython
  6. from libc.stdint cimport uint32_t, UINT32_MAX, uint64_t
  7. API_VERSION = 2
  8. cdef extern from "_hashindex.c":
  9. ctypedef struct HashIndex:
  10. pass
  11. HashIndex *hashindex_read(char *path)
  12. HashIndex *hashindex_init(int capacity, int key_size, int value_size)
  13. void hashindex_free(HashIndex *index)
  14. void hashindex_merge(HashIndex *index, HashIndex *other)
  15. void hashindex_add(HashIndex *index, void *key, void *value)
  16. int hashindex_get_size(HashIndex *index)
  17. int hashindex_write(HashIndex *index, char *path)
  18. void *hashindex_get(HashIndex *index, void *key)
  19. void *hashindex_next_key(HashIndex *index, void *key)
  20. int hashindex_delete(HashIndex *index, void *key)
  21. int hashindex_set(HashIndex *index, void *key, void *value)
  22. uint32_t _htole32(uint32_t v)
  23. uint32_t _le32toh(uint32_t v)
  24. cdef extern from "errno.h":
  25. int errno
  26. cdef extern from "string.h":
  27. char *strerror(int errnum)
  28. cdef _NoDefault = object()
  29. """
  30. The HashIndex is *not* a general purpose data structure. The value size must be at least 4 bytes, and these
  31. first bytes are used for in-band signalling in the data structure itself.
  32. The constant MAX_VALUE defines the valid range for these 4 bytes when interpreted as an uint32_t from 0
  33. to MAX_VALUE (inclusive). The following reserved values beyond MAX_VALUE are currently in use
  34. (byte order is LE)::
  35. 0xffffffff marks empty entries in the hashtable
  36. 0xfffffffe marks deleted entries in the hashtable
  37. None of the publicly available classes in this module will accept nor return a reserved value;
  38. AssertionError is raised instead.
  39. """
  40. assert UINT32_MAX == 2**32-1
  41. # module-level constant because cdef's in classes can't have default values
  42. cdef uint32_t _MAX_VALUE = 2**32-1025
  43. MAX_VALUE = _MAX_VALUE
  44. assert _MAX_VALUE % 2 == 1
  45. def decoded_strerror(errno):
  46. return strerror(errno).decode(locale.getpreferredencoding(), 'surrogateescape')
  47. @cython.internal
  48. cdef class IndexBase:
  49. cdef HashIndex *index
  50. cdef int key_size
  51. def __cinit__(self, capacity=0, path=None, key_size=32):
  52. self.key_size = key_size
  53. if path:
  54. path = os.fsencode(path)
  55. self.index = hashindex_read(path)
  56. if not self.index:
  57. if errno:
  58. raise OSError(errno, decoded_strerror(errno), os.fsdecode(path))
  59. raise RuntimeError('hashindex_read failed')
  60. else:
  61. self.index = hashindex_init(capacity, self.key_size, self.value_size)
  62. if not self.index:
  63. raise Exception('hashindex_init failed')
  64. def __dealloc__(self):
  65. if self.index:
  66. hashindex_free(self.index)
  67. @classmethod
  68. def read(cls, path):
  69. return cls(path=path)
  70. def write(self, path):
  71. path = os.fsencode(path)
  72. if not hashindex_write(self.index, path):
  73. raise Exception('hashindex_write failed')
  74. def clear(self):
  75. hashindex_free(self.index)
  76. self.index = hashindex_init(0, self.key_size, self.value_size)
  77. if not self.index:
  78. raise Exception('hashindex_init failed')
  79. def setdefault(self, key, value):
  80. if not key in self:
  81. self[key] = value
  82. def __delitem__(self, key):
  83. assert len(key) == self.key_size
  84. if not hashindex_delete(self.index, <char *>key):
  85. raise Exception('hashindex_delete failed')
  86. def get(self, key, default=None):
  87. try:
  88. return self[key]
  89. except KeyError:
  90. return default
  91. def pop(self, key, default=_NoDefault):
  92. try:
  93. value = self[key]
  94. del self[key]
  95. return value
  96. except KeyError:
  97. if default != _NoDefault:
  98. return default
  99. raise
  100. def __len__(self):
  101. return hashindex_get_size(self.index)
  102. cdef class NSIndex(IndexBase):
  103. value_size = 8
  104. def __getitem__(self, key):
  105. assert len(key) == self.key_size
  106. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  107. if not data:
  108. raise KeyError(key)
  109. cdef uint32_t segment = _le32toh(data[0])
  110. assert segment <= _MAX_VALUE, "maximum number of segments reached"
  111. return segment, _le32toh(data[1])
  112. def __setitem__(self, key, value):
  113. assert len(key) == self.key_size
  114. cdef uint32_t[2] data
  115. cdef uint32_t segment = value[0]
  116. assert segment <= _MAX_VALUE, "maximum number of segments reached"
  117. data[0] = _htole32(segment)
  118. data[1] = _htole32(value[1])
  119. if not hashindex_set(self.index, <char *>key, data):
  120. raise Exception('hashindex_set failed')
  121. def __contains__(self, key):
  122. cdef uint32_t segment
  123. assert len(key) == self.key_size
  124. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  125. if data != NULL:
  126. segment = _le32toh(data[0])
  127. assert segment <= _MAX_VALUE, "maximum number of segments reached"
  128. return data != NULL
  129. def iteritems(self, marker=None):
  130. cdef const void *key
  131. iter = NSKeyIterator(self.key_size)
  132. iter.idx = self
  133. iter.index = self.index
  134. if marker:
  135. key = hashindex_get(self.index, <char *>marker)
  136. if marker is None:
  137. raise IndexError
  138. iter.key = key - self.key_size
  139. return iter
  140. cdef class NSKeyIterator:
  141. cdef NSIndex idx
  142. cdef HashIndex *index
  143. cdef const void *key
  144. cdef int key_size
  145. def __cinit__(self, key_size):
  146. self.key = NULL
  147. self.key_size = key_size
  148. def __iter__(self):
  149. return self
  150. def __next__(self):
  151. self.key = hashindex_next_key(self.index, <char *>self.key)
  152. if not self.key:
  153. raise StopIteration
  154. cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
  155. cdef uint32_t segment = _le32toh(value[0])
  156. assert segment <= _MAX_VALUE, "maximum number of segments reached"
  157. return (<char *>self.key)[:self.key_size], (segment, _le32toh(value[1]))
  158. ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size csize')
  159. cdef class ChunkIndex(IndexBase):
  160. """
  161. Mapping of 32 byte keys to (refcount, size, csize), which are all 32-bit unsigned.
  162. The reference count cannot overflow. If an overflow would occur, the refcount
  163. is fixed to MAX_VALUE and will neither increase nor decrease by incref(), decref()
  164. or add().
  165. Prior signed 32-bit overflow is handled correctly for most cases: All values
  166. from UINT32_MAX (2**32-1, inclusive) to MAX_VALUE (exclusive) are reserved and either
  167. cause silent data loss (-1, -2) or will raise an AssertionError when accessed.
  168. Other values are handled correctly. Note that previously the refcount could also reach
  169. 0 by *increasing* it.
  170. Assigning refcounts in this reserved range is an invalid operation and raises AssertionError.
  171. """
  172. value_size = 12
  173. def __getitem__(self, key):
  174. assert len(key) == self.key_size
  175. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  176. if not data:
  177. raise KeyError(key)
  178. cdef uint32_t refcount = _le32toh(data[0])
  179. assert refcount <= _MAX_VALUE
  180. return ChunkIndexEntry(refcount, _le32toh(data[1]), _le32toh(data[2]))
  181. def __setitem__(self, key, value):
  182. assert len(key) == self.key_size
  183. cdef uint32_t[3] data
  184. cdef uint32_t refcount = value[0]
  185. assert refcount <= _MAX_VALUE, "invalid reference count"
  186. data[0] = _htole32(refcount)
  187. data[1] = _htole32(value[1])
  188. data[2] = _htole32(value[2])
  189. if not hashindex_set(self.index, <char *>key, data):
  190. raise Exception('hashindex_set failed')
  191. def __contains__(self, key):
  192. assert len(key) == self.key_size
  193. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  194. if data != NULL:
  195. assert data[0] <= _MAX_VALUE
  196. return data != NULL
  197. def incref(self, key):
  198. """Increase refcount for 'key', return (refcount, size, csize)"""
  199. assert len(key) == self.key_size
  200. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  201. if not data:
  202. raise KeyError(key)
  203. cdef uint32_t refcount = _le32toh(data[0])
  204. assert refcount <= _MAX_VALUE, "invalid reference count"
  205. if refcount != _MAX_VALUE:
  206. refcount += 1
  207. data[0] = _htole32(refcount)
  208. return refcount, _le32toh(data[1]), _le32toh(data[2])
  209. def decref(self, key):
  210. """Decrease refcount for 'key', return (refcount, size, csize)"""
  211. assert len(key) == self.key_size
  212. data = <uint32_t *>hashindex_get(self.index, <char *>key)
  213. if not data:
  214. raise KeyError(key)
  215. cdef uint32_t refcount = _le32toh(data[0])
  216. # Never decrease a reference count of zero
  217. assert 0 < refcount <= _MAX_VALUE, "invalid reference count"
  218. if refcount != _MAX_VALUE:
  219. refcount -= 1
  220. data[0] = _htole32(refcount)
  221. return refcount, _le32toh(data[1]), _le32toh(data[2])
  222. def iteritems(self, marker=None):
  223. cdef const void *key
  224. iter = ChunkKeyIterator(self.key_size)
  225. iter.idx = self
  226. iter.index = self.index
  227. if marker:
  228. key = hashindex_get(self.index, <char *>marker)
  229. if marker is None:
  230. raise IndexError
  231. iter.key = key - self.key_size
  232. return iter
  233. def summarize(self):
  234. cdef uint64_t size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0
  235. cdef uint32_t *values
  236. cdef uint32_t refcount
  237. cdef void *key = NULL
  238. while True:
  239. key = hashindex_next_key(self.index, key)
  240. if not key:
  241. break
  242. unique_chunks += 1
  243. values = <uint32_t*> (key + self.key_size)
  244. refcount = _le32toh(values[0])
  245. assert refcount <= MAX_VALUE, "invalid reference count"
  246. chunks += refcount
  247. unique_size += _le32toh(values[1])
  248. unique_csize += _le32toh(values[2])
  249. size += <uint64_t> _le32toh(values[1]) * _le32toh(values[0])
  250. csize += <uint64_t> _le32toh(values[2]) * _le32toh(values[0])
  251. return size, csize, unique_size, unique_csize, unique_chunks, chunks
  252. def add(self, key, refs, size, csize):
  253. assert len(key) == self.key_size
  254. cdef uint32_t[3] data
  255. data[0] = _htole32(refs)
  256. data[1] = _htole32(size)
  257. data[2] = _htole32(csize)
  258. self._add(<char*> key, data)
  259. cdef _add(self, void *key, uint32_t *data):
  260. cdef uint64_t refcount1, refcount2, result64
  261. values = <uint32_t*> hashindex_get(self.index, key)
  262. if values:
  263. refcount1 = _le32toh(values[0])
  264. refcount2 = _le32toh(data[0])
  265. assert refcount1 <= _MAX_VALUE
  266. assert refcount2 <= _MAX_VALUE
  267. result64 = refcount1 + refcount2
  268. values[0] = _htole32(min(result64, _MAX_VALUE))
  269. values[1] = data[1]
  270. values[2] = data[2]
  271. else:
  272. hashindex_set(self.index, key, data)
  273. def merge(self, ChunkIndex other):
  274. cdef void *key = NULL
  275. while True:
  276. key = hashindex_next_key(other.index, key)
  277. if not key:
  278. break
  279. self._add(key, <uint32_t*> (key + self.key_size))
  280. cdef class ChunkKeyIterator:
  281. cdef ChunkIndex idx
  282. cdef HashIndex *index
  283. cdef const void *key
  284. cdef int key_size
  285. def __cinit__(self, key_size):
  286. self.key = NULL
  287. self.key_size = key_size
  288. def __iter__(self):
  289. return self
  290. def __next__(self):
  291. self.key = hashindex_next_key(self.index, <char *>self.key)
  292. if not self.key:
  293. raise StopIteration
  294. cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
  295. cdef uint32_t refcount = _le32toh(value[0])
  296. assert refcount <= MAX_VALUE, "invalid reference count"
  297. return (<char *>self.key)[:self.key_size], ChunkIndexEntry(refcount, _le32toh(value[1]), _le32toh(value[2]))