_hashindex.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. #include <assert.h>
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <stdint.h>
  5. #include <string.h>
  6. #include <sys/types.h>
  7. #include <sys/stat.h>
  8. #include <fcntl.h>
  9. #include <unistd.h>
  10. #if defined(BYTE_ORDER)&&(BYTE_ORDER == BIG_ENDIAN)
  11. #define _le32toh(x) __builtin_bswap32(x)
  12. #define _htole32(x) __builtin_bswap32(x)
  13. #elif defined(BYTE_ORDER)&&(BYTE_ORDER == LITTLE_ENDIAN)
  14. #define _le32toh(x) (x)
  15. #define _htole32(x) (x)
  16. #else
  17. #error Unknown byte order
  18. #endif
  19. #define MAGIC "BORG_IDX"
  20. #define MAGIC_LEN 8
  21. typedef struct {
  22. char magic[MAGIC_LEN];
  23. int32_t num_entries;
  24. int32_t num_buckets;
  25. int8_t key_size;
  26. int8_t value_size;
  27. } __attribute__((__packed__)) HashHeader;
  28. typedef struct {
  29. void *buckets;
  30. int num_entries;
  31. int num_buckets;
  32. int key_size;
  33. int value_size;
  34. off_t bucket_size;
  35. int lower_limit;
  36. int upper_limit;
  37. } HashIndex;
  38. #define EMPTY _htole32(0xffffffff)
  39. #define DELETED _htole32(0xfffffffe)
  40. #define MAX_BUCKET_SIZE 512
  41. #define BUCKET_LOWER_LIMIT .25
  42. #define BUCKET_UPPER_LIMIT .75 /* don't go higher than 0.75, otherwise performance severely suffers! */
  43. #define MIN_BUCKETS 1031 /* must be prime, otherwise performance breaks down! */
  44. #define MAX(x, y) ((x) > (y) ? (x): (y))
  45. #define BUCKET_ADDR(index, idx) (index->buckets + (idx * index->bucket_size))
  46. #define BUCKET_IS_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == DELETED)
  47. #define BUCKET_IS_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == EMPTY)
  48. #define BUCKET_MATCHES_KEY(index, idx, key) (memcmp(key, BUCKET_ADDR(index, idx), index->key_size) == 0)
  49. #define BUCKET_MARK_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = DELETED)
  50. #define BUCKET_MARK_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = EMPTY)
  51. #define EPRINTF_MSG(msg, ...) fprintf(stderr, "hashindex: " msg "\n", ##__VA_ARGS__)
  52. #define EPRINTF_MSG_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg "\n", path, ##__VA_ARGS__)
  53. #define EPRINTF(msg, ...) fprintf(stderr, "hashindex: " msg "(%s)\n", ##__VA_ARGS__, strerror(errno))
  54. #define EPRINTF_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg " (%s)\n", path, ##__VA_ARGS__, strerror(errno))
  55. static HashIndex *hashindex_read(const char *path);
  56. static int hashindex_write(HashIndex *index, const char *path);
  57. static HashIndex *hashindex_init(int capacity, int key_size, int value_size);
  58. static const void *hashindex_get(HashIndex *index, const void *key);
  59. static int hashindex_set(HashIndex *index, const void *key, const void *value);
  60. static int hashindex_delete(HashIndex *index, const void *key);
  61. static void *hashindex_next_key(HashIndex *index, const void *key);
  62. /* Private API */
  63. static int
  64. hashindex_index(HashIndex *index, const void *key)
  65. {
  66. return _le32toh(*((uint32_t *)key)) % index->num_buckets;
  67. }
  68. static int
  69. hashindex_lookup(HashIndex *index, const void *key)
  70. {
  71. int didx = -1;
  72. int start = hashindex_index(index, key);
  73. int idx = start;
  74. for(;;) {
  75. if(BUCKET_IS_EMPTY(index, idx))
  76. {
  77. return -1;
  78. }
  79. if(BUCKET_IS_DELETED(index, idx)) {
  80. if(didx == -1) {
  81. didx = idx;
  82. }
  83. }
  84. else if(BUCKET_MATCHES_KEY(index, idx, key)) {
  85. if (didx != -1) {
  86. memcpy(BUCKET_ADDR(index, didx), BUCKET_ADDR(index, idx), index->bucket_size);
  87. BUCKET_MARK_DELETED(index, idx);
  88. idx = didx;
  89. }
  90. return idx;
  91. }
  92. idx = (idx + 1) % index->num_buckets;
  93. if(idx == start) {
  94. return -1;
  95. }
  96. }
  97. }
  98. static int
  99. hashindex_resize(HashIndex *index, int capacity)
  100. {
  101. HashIndex *new;
  102. void *key = NULL;
  103. int32_t key_size = index->key_size;
  104. if(!(new = hashindex_init(capacity, key_size, index->value_size))) {
  105. return 0;
  106. }
  107. while((key = hashindex_next_key(index, key))) {
  108. hashindex_set(new, key, key + key_size);
  109. }
  110. free(index->buckets);
  111. index->buckets = new->buckets;
  112. index->num_buckets = new->num_buckets;
  113. index->lower_limit = new->lower_limit;
  114. index->upper_limit = new->upper_limit;
  115. free(new);
  116. return 1;
  117. }
  118. /* Public API */
  119. static HashIndex *
  120. hashindex_read(const char *path)
  121. {
  122. FILE *fd;
  123. off_t length, buckets_length, bytes_read;
  124. HashHeader header;
  125. HashIndex *index = NULL;
  126. if((fd = fopen(path, "rb")) == NULL) {
  127. EPRINTF_PATH(path, "fopen for reading failed");
  128. return NULL;
  129. }
  130. bytes_read = fread(&header, 1, sizeof(HashHeader), fd);
  131. if(bytes_read != sizeof(HashHeader)) {
  132. if(ferror(fd)) {
  133. EPRINTF_PATH(path, "fread header failed (expected %ju, got %ju)",
  134. (uintmax_t) sizeof(HashHeader), (uintmax_t) bytes_read);
  135. }
  136. else {
  137. EPRINTF_MSG_PATH(path, "fread header failed (expected %ju, got %ju)",
  138. (uintmax_t) sizeof(HashHeader), (uintmax_t) bytes_read);
  139. }
  140. goto fail;
  141. }
  142. if(fseek(fd, 0, SEEK_END) < 0) {
  143. EPRINTF_PATH(path, "fseek failed");
  144. goto fail;
  145. }
  146. if((length = ftell(fd)) < 0) {
  147. EPRINTF_PATH(path, "ftell failed");
  148. goto fail;
  149. }
  150. if(fseek(fd, sizeof(HashHeader), SEEK_SET) < 0) {
  151. EPRINTF_PATH(path, "fseek failed");
  152. goto fail;
  153. }
  154. if(memcmp(header.magic, MAGIC, MAGIC_LEN)) {
  155. EPRINTF_MSG_PATH(path, "Unknown MAGIC in header");
  156. goto fail;
  157. }
  158. buckets_length = (off_t)_le32toh(header.num_buckets) * (header.key_size + header.value_size);
  159. if((size_t) length != sizeof(HashHeader) + buckets_length) {
  160. EPRINTF_MSG_PATH(path, "Incorrect file length (expected %ju, got %ju)",
  161. (uintmax_t) sizeof(HashHeader) + buckets_length, (uintmax_t) length);
  162. goto fail;
  163. }
  164. if(!(index = malloc(sizeof(HashIndex)))) {
  165. EPRINTF_PATH(path, "malloc header failed");
  166. goto fail;
  167. }
  168. if(!(index->buckets = malloc(buckets_length))) {
  169. EPRINTF_PATH(path, "malloc buckets failed");
  170. free(index);
  171. index = NULL;
  172. goto fail;
  173. }
  174. bytes_read = fread(index->buckets, 1, buckets_length, fd);
  175. if(bytes_read != buckets_length) {
  176. if(ferror(fd)) {
  177. EPRINTF_PATH(path, "fread buckets failed (expected %ju, got %ju)",
  178. (uintmax_t) buckets_length, (uintmax_t) bytes_read);
  179. }
  180. else {
  181. EPRINTF_MSG_PATH(path, "fread buckets failed (expected %ju, got %ju)",
  182. (uintmax_t) buckets_length, (uintmax_t) bytes_read);
  183. }
  184. free(index->buckets);
  185. free(index);
  186. index = NULL;
  187. goto fail;
  188. }
  189. index->num_entries = _le32toh(header.num_entries);
  190. index->num_buckets = _le32toh(header.num_buckets);
  191. index->key_size = header.key_size;
  192. index->value_size = header.value_size;
  193. index->bucket_size = index->key_size + index->value_size;
  194. index->lower_limit = index->num_buckets > MIN_BUCKETS ? ((int)(index->num_buckets * BUCKET_LOWER_LIMIT)) : 0;
  195. index->upper_limit = (int)(index->num_buckets * BUCKET_UPPER_LIMIT);
  196. fail:
  197. if(fclose(fd) < 0) {
  198. EPRINTF_PATH(path, "fclose failed");
  199. }
  200. return index;
  201. }
  202. static HashIndex *
  203. hashindex_init(int capacity, int key_size, int value_size)
  204. {
  205. HashIndex *index;
  206. int i;
  207. capacity = MAX(MIN_BUCKETS, capacity);
  208. if(!(index = malloc(sizeof(HashIndex)))) {
  209. EPRINTF("malloc header failed");
  210. return NULL;
  211. }
  212. if(!(index->buckets = calloc(capacity, key_size + value_size))) {
  213. EPRINTF("malloc buckets failed");
  214. free(index);
  215. return NULL;
  216. }
  217. index->num_entries = 0;
  218. index->key_size = key_size;
  219. index->value_size = value_size;
  220. index->num_buckets = capacity;
  221. index->bucket_size = index->key_size + index->value_size;
  222. index->lower_limit = index->num_buckets > MIN_BUCKETS ? ((int)(index->num_buckets * BUCKET_LOWER_LIMIT)) : 0;
  223. index->upper_limit = (int)(index->num_buckets * BUCKET_UPPER_LIMIT);
  224. for(i = 0; i < capacity; i++) {
  225. BUCKET_MARK_EMPTY(index, i);
  226. }
  227. return index;
  228. }
  229. static void
  230. hashindex_free(HashIndex *index)
  231. {
  232. free(index->buckets);
  233. free(index);
  234. }
  235. static int
  236. hashindex_write(HashIndex *index, const char *path)
  237. {
  238. off_t buckets_length = (off_t)index->num_buckets * index->bucket_size;
  239. FILE *fd;
  240. HashHeader header = {
  241. .magic = MAGIC,
  242. .num_entries = _htole32(index->num_entries),
  243. .num_buckets = _htole32(index->num_buckets),
  244. .key_size = index->key_size,
  245. .value_size = index->value_size
  246. };
  247. int ret = 1;
  248. if((fd = fopen(path, "wb")) == NULL) {
  249. EPRINTF_PATH(path, "fopen for writing failed");
  250. return 0;
  251. }
  252. if(fwrite(&header, 1, sizeof(header), fd) != sizeof(header)) {
  253. EPRINTF_PATH(path, "fwrite header failed");
  254. ret = 0;
  255. }
  256. if(fwrite(index->buckets, 1, buckets_length, fd) != (size_t) buckets_length) {
  257. EPRINTF_PATH(path, "fwrite buckets failed");
  258. ret = 0;
  259. }
  260. if(fclose(fd) < 0) {
  261. EPRINTF_PATH(path, "fclose failed");
  262. }
  263. return ret;
  264. }
  265. static const void *
  266. hashindex_get(HashIndex *index, const void *key)
  267. {
  268. int idx = hashindex_lookup(index, key);
  269. if(idx < 0) {
  270. return NULL;
  271. }
  272. return BUCKET_ADDR(index, idx) + index->key_size;
  273. }
  274. static int
  275. hashindex_set(HashIndex *index, const void *key, const void *value)
  276. {
  277. int idx = hashindex_lookup(index, key);
  278. uint8_t *ptr;
  279. if(idx < 0)
  280. {
  281. if(index->num_entries > index->upper_limit) {
  282. if(!hashindex_resize(index, index->num_buckets * 2)) {
  283. return 0;
  284. }
  285. }
  286. idx = hashindex_index(index, key);
  287. while(!BUCKET_IS_EMPTY(index, idx) && !BUCKET_IS_DELETED(index, idx)) {
  288. idx = (idx + 1) % index->num_buckets;
  289. }
  290. ptr = BUCKET_ADDR(index, idx);
  291. memcpy(ptr, key, index->key_size);
  292. memcpy(ptr + index->key_size, value, index->value_size);
  293. index->num_entries += 1;
  294. }
  295. else
  296. {
  297. memcpy(BUCKET_ADDR(index, idx) + index->key_size, value, index->value_size);
  298. }
  299. return 1;
  300. }
  301. static int
  302. hashindex_delete(HashIndex *index, const void *key)
  303. {
  304. int idx = hashindex_lookup(index, key);
  305. if (idx < 0) {
  306. return 1;
  307. }
  308. BUCKET_MARK_DELETED(index, idx);
  309. index->num_entries -= 1;
  310. if(index->num_entries < index->lower_limit) {
  311. if(!hashindex_resize(index, index->num_buckets / 2)) {
  312. return 0;
  313. }
  314. }
  315. return 1;
  316. }
  317. static void *
  318. hashindex_next_key(HashIndex *index, const void *key)
  319. {
  320. int idx = 0;
  321. if(key) {
  322. idx = 1 + (key - index->buckets) / index->bucket_size;
  323. }
  324. if (idx == index->num_buckets) {
  325. return NULL;
  326. }
  327. while(BUCKET_IS_EMPTY(index, idx) || BUCKET_IS_DELETED(index, idx)) {
  328. idx ++;
  329. if (idx == index->num_buckets) {
  330. return NULL;
  331. }
  332. }
  333. return BUCKET_ADDR(index, idx);
  334. }
  335. static int
  336. hashindex_get_size(HashIndex *index)
  337. {
  338. return index->num_entries;
  339. }
  340. static void
  341. hashindex_summarize(HashIndex *index, long long *total_size, long long *total_csize,
  342. long long *total_unique_size, long long *total_unique_csize,
  343. long long *total_unique_chunks, long long *total_chunks)
  344. {
  345. int64_t size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0;
  346. const int32_t *values;
  347. void *key = NULL;
  348. while((key = hashindex_next_key(index, key))) {
  349. values = key + index->key_size;
  350. unique_chunks++;
  351. chunks += values[0];
  352. unique_size += values[1];
  353. unique_csize += values[2];
  354. size += (int64_t) values[0] * values[1];
  355. csize += (int64_t) values[0] * values[2];
  356. }
  357. *total_size = size;
  358. *total_csize = csize;
  359. *total_unique_size = unique_size;
  360. *total_unique_csize = unique_csize;
  361. *total_unique_chunks = unique_chunks;
  362. *total_chunks = chunks;
  363. }
  364. static void
  365. hashindex_add(HashIndex *index, const void *key, int32_t *other_values)
  366. {
  367. int32_t *my_values = (int32_t *)hashindex_get(index, key);
  368. if(my_values == NULL) {
  369. hashindex_set(index, key, other_values);
  370. } else {
  371. *my_values += *other_values;
  372. }
  373. }
  374. static void
  375. hashindex_merge(HashIndex *index, HashIndex *other)
  376. {
  377. int32_t key_size = index->key_size;
  378. void *key = NULL;
  379. while((key = hashindex_next_key(other, key))) {
  380. hashindex_add(index, key, key + key_size);
  381. }
  382. }