hashindex.pyx 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. # -*- coding: utf-8 -*-
  2. import os
  # Version number of this extension module's interface.
  # NOTE(review): presumably compared by importing code to detect a stale
  # compiled build -- confirm against the callers.
  API_VERSION = 2
  cdef extern from "_hashindex.c":
      # Opaque handle: the struct layout is only known to the C side.
      ctypedef struct HashIndex:
          pass

      # Creation, loading, persistence and teardown of a table.
      # Callers in this file treat a NULL / zero result as failure.
      HashIndex *hashindex_init(int capacity, int key_size, int value_size)
      HashIndex *hashindex_read(char *path)
      int hashindex_write(HashIndex *index, char *path)
      void hashindex_free(HashIndex *index)

      # Lookup, traversal and size query.
      void *hashindex_get(HashIndex *index, void *key)
      void *hashindex_next_key(HashIndex *index, void *key)
      int hashindex_get_size(HashIndex *index)

      # Modification of entries.
      int hashindex_set(HashIndex *index, void *key, void *value)
      int hashindex_delete(HashIndex *index, void *key)
      void hashindex_add(HashIndex *index, void *key, void *value)
      void hashindex_merge(HashIndex *index, HashIndex *other)

      # Aggregate statistics, written through the six output pointers.
      void hashindex_summarize(HashIndex *index, long long *total_size, long long *total_csize,
                               long long *unique_size, long long *unique_csize,
                               long long *total_unique_chunks, long long *total_chunks)

      # 32-bit byte-order helpers (host <-> little-endian, going by their names).
      int _htole32(int v)
      int _le32toh(int v)
  # Unique sentinel used as the default of IndexBase.pop(), so that "no default
  # supplied" can be told apart from any value a caller might pass (even None).
  cdef _NoDefault = object()
  24. cimport cython
  @cython.internal
  cdef class IndexBase:
      """Dict-like base wrapper around a C ``HashIndex`` table.

      Subclasses must provide a ``value_size`` class attribute and the item
      protocol (``__getitem__``, ``__setitem__``, ``__contains__``) that the
      generic helpers defined here (``get``, ``pop``, ``setdefault``) rely on.
      """
      cdef HashIndex *index
      cdef int key_size

      def __cinit__(self, capacity=0, path=None, key_size=32):
          """Load the table from ``path`` if given, otherwise create a new one.

          Raises ``Exception`` when the C layer returns a NULL table.
          """
          self.key_size = key_size
          if path:
              path = os.fsencode(path)
              self.index = hashindex_read(path)
              if not self.index:
                  raise Exception('hashindex_read failed')
          else:
              self.index = hashindex_init(capacity, self.key_size, self.value_size)
              if not self.index:
                  raise Exception('hashindex_init failed')

      def __dealloc__(self):
          # The pointer is NULL when construction failed; nothing to free then.
          if self.index:
              hashindex_free(self.index)

      @classmethod
      def read(cls, path):
          """Alternate constructor: build an instance from the file at ``path``."""
          return cls(path=path)

      def write(self, path):
          """Write the table to ``path``; raises ``Exception`` on failure."""
          path = os.fsencode(path)
          if not hashindex_write(self.index, path):
              raise Exception('hashindex_write failed')

      def clear(self):
          """Drop all entries by replacing the table with a fresh one."""
          hashindex_free(self.index)
          self.index = hashindex_init(0, self.key_size, self.value_size)
          if not self.index:
              raise Exception('hashindex_init failed')

      def setdefault(self, key, value):
          """Store ``value`` under ``key`` if absent and return the stored value.

          Mirrors ``dict.setdefault``: the value now associated with ``key`` is
          returned whether or not it was just inserted.
          """
          if key not in self:
              self[key] = value
          return self[key]

      def __delitem__(self, key):
          assert len(key) == self.key_size
          if not hashindex_delete(self.index, <char *>key):
              raise Exception('hashindex_delete failed')

      def get(self, key, default=None):
          """Return the value for ``key``, or ``default`` if it is missing."""
          try:
              return self[key]
          except KeyError:
              return default

      def pop(self, key, default=_NoDefault):
          """Remove ``key`` and return its value.

          If ``key`` is missing, return ``default`` when one was supplied,
          otherwise re-raise the ``KeyError``.
          """
          try:
              value = self[key]
              del self[key]
              return value
          except KeyError:
              # Identity test against the sentinel: ``!=`` would dispatch to the
              # comparison method of whatever object the caller passed in.
              if default is not _NoDefault:
                  return default
              raise

      def __len__(self):
          return hashindex_get_size(self.index)
  cdef class NSIndex(IndexBase):
      """Index whose values are pairs of 32-bit integers, stored little-endian."""

      # Two 32-bit integers per entry.
      value_size = 8

      def __getitem__(self, key):
          """Return the value pair for ``key``; raises ``KeyError`` if absent."""
          assert len(key) == self.key_size
          data = <int *>hashindex_get(self.index, <char *>key)
          if not data:
              raise KeyError(key)
          return _le32toh(data[0]), _le32toh(data[1])

      def __setitem__(self, key, value):
          """Store the two integers ``value[0]``, ``value[1]`` under ``key``."""
          assert len(key) == self.key_size
          cdef int[2] data
          data[0] = _htole32(value[0])
          data[1] = _htole32(value[1])
          if not hashindex_set(self.index, <char *>key, data):
              raise Exception('hashindex_set failed')

      def __contains__(self, key):
          assert len(key) == self.key_size
          data = <int *>hashindex_get(self.index, <char *>key)
          return data != NULL

      def iteritems(self, marker=None):
          """Return an iterator over ``(key, (v0, v1))`` entries.

          When ``marker`` is given, iteration is positioned at that key so the
          iterator continues from it (presumably yielding the entries that
          follow it -- this depends on ``hashindex_next_key``).

          Raises ``IndexError`` if ``marker`` is not present in the index.
          """
          cdef const void *key
          it = NSKeyIterator(self.key_size)
          # Keep a reference to this object next to the raw pointer, so the
          # table is not freed by __dealloc__ while the iterator is in use.
          it.idx = self
          it.index = self.index
          if marker:
              assert len(marker) == self.key_size
              key = hashindex_get(self.index, <char *>marker)
              # A missing marker yields NULL; doing pointer arithmetic on it
              # would hand a wild pointer to the iterator.
              if key == NULL:
                  raise IndexError
              # hashindex_get returns the value slot, which sits key_size bytes
              # after the start of the key (see NSKeyIterator.__next__).
              it.key = key - self.key_size
          return it
  cdef class NSKeyIterator:
      """Iterator over the entries of an ``NSIndex``.

      Yields ``(key_bytes, (v0, v1))`` tuples. ``idx`` and ``index`` are filled
      in by ``NSIndex.iteritems`` after construction.
      """
      cdef NSIndex idx
      cdef HashIndex *index
      cdef const void *key
      cdef int key_size

      def __cinit__(self, key_size):
          self.key_size = key_size
          # NULL is the position passed to hashindex_next_key on the first call.
          self.key = NULL

      def __iter__(self):
          return self

      def __next__(self):
          cdef int *entry
          self.key = hashindex_next_key(self.index, <char *>self.key)
          if self.key == NULL:
              raise StopIteration
          # The value is read from directly behind the key bytes.
          entry = <int *>(self.key + self.key_size)
          key_bytes = (<char *>self.key)[:self.key_size]
          first = _le32toh(entry[0])
          second = _le32toh(entry[1])
          return key_bytes, (first, second)
  cdef class ChunkIndex(IndexBase):
      """Index whose values are triples of 32-bit integers, stored little-endian.

      ``add`` names the three fields ``refs``, ``size`` and ``csize``.
      """

      # Three 32-bit integers per entry.
      value_size = 12

      def __getitem__(self, key):
          """Return the value triple for ``key``; raises ``KeyError`` if absent."""
          assert len(key) == self.key_size
          data = <int *>hashindex_get(self.index, <char *>key)
          if not data:
              raise KeyError(key)
          return _le32toh(data[0]), _le32toh(data[1]), _le32toh(data[2])

      def __setitem__(self, key, value):
          """Store the three integers ``value[0..2]`` under ``key``."""
          assert len(key) == self.key_size
          cdef int[3] data
          data[0] = _htole32(value[0])
          data[1] = _htole32(value[1])
          data[2] = _htole32(value[2])
          if not hashindex_set(self.index, <char *>key, data):
              raise Exception('hashindex_set failed')

      def __contains__(self, key):
          assert len(key) == self.key_size
          data = <int *>hashindex_get(self.index, <char *>key)
          return data != NULL

      def iteritems(self, marker=None):
          """Return an iterator over ``(key, (v0, v1, v2))`` entries.

          When ``marker`` is given, iteration is positioned at that key so the
          iterator continues from it (presumably yielding the entries that
          follow it -- this depends on ``hashindex_next_key``).

          Raises ``IndexError`` if ``marker`` is not present in the index.
          """
          cdef const void *key
          it = ChunkKeyIterator(self.key_size)
          # Keep a reference to this object next to the raw pointer, so the
          # table is not freed by __dealloc__ while the iterator is in use.
          it.idx = self
          it.index = self.index
          if marker:
              assert len(marker) == self.key_size
              key = hashindex_get(self.index, <char *>marker)
              # A missing marker yields NULL; doing pointer arithmetic on it
              # would hand a wild pointer to the iterator.
              if key == NULL:
                  raise IndexError
              # hashindex_get returns the value slot, which sits key_size bytes
              # after the start of the key (see ChunkKeyIterator.__next__).
              it.key = key - self.key_size
          return it

      def summarize(self):
          """Return the six totals computed by ``hashindex_summarize``.

          The tuple is ``(total_size, total_csize, unique_size, unique_csize,
          total_unique_chunks, total_chunks)``.
          """
          cdef long long total_size, total_csize, unique_size, unique_csize, total_unique_chunks, total_chunks
          hashindex_summarize(self.index, &total_size, &total_csize,
                              &unique_size, &unique_csize,
                              &total_unique_chunks, &total_chunks)
          return total_size, total_csize, unique_size, unique_csize, total_unique_chunks, total_chunks

      def add(self, key, refs, size, csize):
          """Pass ``(refs, size, csize)`` for ``key`` to ``hashindex_add``."""
          assert len(key) == self.key_size
          cdef int[3] data
          data[0] = _htole32(refs)
          data[1] = _htole32(size)
          data[2] = _htole32(csize)
          hashindex_add(self.index, <char *>key, data)

      def merge(self, ChunkIndex other not None):
          """Merge the entries of ``other`` into this index via ``hashindex_merge``.

          ``other`` must be a real ``ChunkIndex``. ``None`` is rejected with a
          ``TypeError`` because reading its ``index`` field would be invalid.
          """
          hashindex_merge(self.index, other.index)
  cdef class ChunkKeyIterator:
      """Iterator over the entries of a ``ChunkIndex``.

      Yields ``(key_bytes, (v0, v1, v2))`` tuples. ``idx`` and ``index`` are
      filled in by ``ChunkIndex.iteritems`` after construction.
      """
      cdef ChunkIndex idx
      cdef HashIndex *index
      cdef const void *key
      cdef int key_size

      def __cinit__(self, key_size):
          self.key_size = key_size
          # NULL is the position passed to hashindex_next_key on the first call.
          self.key = NULL

      def __iter__(self):
          return self

      def __next__(self):
          cdef int *entry
          self.key = hashindex_next_key(self.index, <char *>self.key)
          if self.key == NULL:
              raise StopIteration
          # The value is read from directly behind the key bytes.
          entry = <int *>(self.key + self.key_size)
          key_bytes = (<char *>self.key)[:self.key_size]
          first = _le32toh(entry[0])
          second = _le32toh(entry[1])
          third = _le32toh(entry[2])
          return key_bytes, (first, second, third)