_hashindex.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. #include <assert.h>
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <stdint.h>
  5. #include <string.h>
  6. #include <sys/types.h>
  7. #include <sys/stat.h>
  8. #include <fcntl.h>
  9. #include <unistd.h>
  10. #if defined(BYTE_ORDER)&&(BYTE_ORDER == BIG_ENDIAN)
  11. #define _le32toh(x) __builtin_bswap32(x)
  12. #define _htole32(x) __builtin_bswap32(x)
  13. #elif defined(BYTE_ORDER)&&(BYTE_ORDER == LITTLE_ENDIAN)
  14. #define _le32toh(x) (x)
  15. #define _htole32(x) (x)
  16. #else
  17. #error Unknown byte order
  18. #endif
  19. #define MAGIC "BORG_IDX"
  20. #define MAGIC_LEN 8
  21. typedef struct {
  22. char magic[MAGIC_LEN];
  23. int32_t num_entries;
  24. int32_t num_buckets;
  25. int8_t key_size;
  26. int8_t value_size;
  27. } __attribute__((__packed__)) HashHeader;
  28. typedef struct {
  29. void *buckets;
  30. int num_entries;
  31. int num_buckets;
  32. int key_size;
  33. int value_size;
  34. off_t bucket_size;
  35. int lower_limit;
  36. int upper_limit;
  37. } HashIndex;
  38. #define EMPTY _htole32(0xffffffff)
  39. #define DELETED _htole32(0xfffffffe)
  40. #define MAX_BUCKET_SIZE 512
  41. #define BUCKET_LOWER_LIMIT .25
  42. #define BUCKET_UPPER_LIMIT .90
  43. #define MIN_BUCKETS 1024
  44. #define MAX(x, y) ((x) > (y) ? (x): (y))
  45. #define BUCKET_ADDR(index, idx) (index->buckets + (idx * index->bucket_size))
  46. #define BUCKET_IS_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == DELETED)
  47. #define BUCKET_IS_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == EMPTY)
  48. #define BUCKET_MATCHES_KEY(index, idx, key) (memcmp(key, BUCKET_ADDR(index, idx), index->key_size) == 0)
  49. #define BUCKET_MARK_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = DELETED)
  50. #define BUCKET_MARK_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = EMPTY)
  51. #define EPRINTF_MSG(msg, ...) fprintf(stderr, "hashindex: " msg "\n", ##__VA_ARGS__)
  52. #define EPRINTF_MSG_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg "\n", path, ##__VA_ARGS__)
  53. #define EPRINTF(msg, ...) fprintf(stderr, "hashindex: " msg "(%s)\n", ##__VA_ARGS__, strerror(errno))
  54. #define EPRINTF_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg " (%s)\n", path, ##__VA_ARGS__, strerror(errno))
  55. static HashIndex *hashindex_read(const char *path);
  56. static int hashindex_write(HashIndex *index, const char *path);
  57. static HashIndex *hashindex_init(int capacity, int key_size, int value_size);
  58. static const void *hashindex_get(HashIndex *index, const void *key);
  59. static int hashindex_set(HashIndex *index, const void *key, const void *value);
  60. static int hashindex_delete(HashIndex *index, const void *key);
  61. static void *hashindex_next_key(HashIndex *index, const void *key);
  62. /* Private API */
  63. static int
  64. hashindex_index(HashIndex *index, const void *key)
  65. {
  66. return _le32toh(*((uint32_t *)key)) % index->num_buckets;
  67. }
  68. static int
  69. hashindex_lookup(HashIndex *index, const void *key)
  70. {
  71. int didx = -1;
  72. int start = hashindex_index(index, key);
  73. int idx = start;
  74. for(;;) {
  75. if(BUCKET_IS_EMPTY(index, idx))
  76. {
  77. return -1;
  78. }
  79. if(BUCKET_IS_DELETED(index, idx)) {
  80. if(didx == -1) {
  81. didx = idx;
  82. }
  83. }
  84. else if(BUCKET_MATCHES_KEY(index, idx, key)) {
  85. if (didx != -1) {
  86. memcpy(BUCKET_ADDR(index, didx), BUCKET_ADDR(index, idx), index->bucket_size);
  87. BUCKET_MARK_DELETED(index, idx);
  88. idx = didx;
  89. }
  90. return idx;
  91. }
  92. idx = (idx + 1) % index->num_buckets;
  93. if(idx == start) {
  94. return -1;
  95. }
  96. }
  97. }
  98. static int
  99. hashindex_resize(HashIndex *index, int capacity)
  100. {
  101. HashIndex *new;
  102. void *key = NULL;
  103. if(!(new = hashindex_init(capacity, index->key_size, index->value_size))) {
  104. return 0;
  105. }
  106. while((key = hashindex_next_key(index, key))) {
  107. hashindex_set(new, key, hashindex_get(index, key));
  108. }
  109. free(index->buckets);
  110. index->buckets = new->buckets;
  111. index->num_buckets = new->num_buckets;
  112. index->lower_limit = new->lower_limit;
  113. index->upper_limit = new->upper_limit;
  114. free(new);
  115. return 1;
  116. }
  117. /* Public API */
  118. static HashIndex *
  119. hashindex_read(const char *path)
  120. {
  121. FILE *fd;
  122. off_t length, buckets_length, bytes_read;
  123. HashHeader header;
  124. HashIndex *index = NULL;
  125. if((fd = fopen(path, "rb")) == NULL) {
  126. EPRINTF_PATH(path, "fopen for reading failed");
  127. return NULL;
  128. }
  129. bytes_read = fread(&header, 1, sizeof(HashHeader), fd);
  130. if(bytes_read != sizeof(HashHeader)) {
  131. if(ferror(fd)) {
  132. EPRINTF_PATH(path, "fread header failed (expected %ju, got %ju)",
  133. (uintmax_t) sizeof(HashHeader), (uintmax_t) bytes_read);
  134. }
  135. else {
  136. EPRINTF_MSG_PATH(path, "fread header failed (expected %ju, got %ju)",
  137. (uintmax_t) sizeof(HashHeader), (uintmax_t) bytes_read);
  138. }
  139. goto fail;
  140. }
  141. if(fseek(fd, 0, SEEK_END) < 0) {
  142. EPRINTF_PATH(path, "fseek failed");
  143. goto fail;
  144. }
  145. if((length = ftell(fd)) < 0) {
  146. EPRINTF_PATH(path, "ftell failed");
  147. goto fail;
  148. }
  149. if(fseek(fd, sizeof(HashHeader), SEEK_SET) < 0) {
  150. EPRINTF_PATH(path, "fseek failed");
  151. goto fail;
  152. }
  153. if(memcmp(header.magic, MAGIC, MAGIC_LEN)) {
  154. EPRINTF_MSG_PATH(path, "Unknown MAGIC in header");
  155. goto fail;
  156. }
  157. buckets_length = (off_t)_le32toh(header.num_buckets) * (header.key_size + header.value_size);
  158. if(length != sizeof(HashHeader) + buckets_length) {
  159. EPRINTF_MSG_PATH(path, "Incorrect file length (expected %ju, got %ju)",
  160. (uintmax_t) sizeof(HashHeader) + buckets_length, (uintmax_t) length);
  161. goto fail;
  162. }
  163. if(!(index = malloc(sizeof(HashIndex)))) {
  164. EPRINTF_PATH(path, "malloc header failed");
  165. goto fail;
  166. }
  167. if(!(index->buckets = malloc(buckets_length))) {
  168. EPRINTF_PATH(path, "malloc buckets failed");
  169. free(index);
  170. index = NULL;
  171. goto fail;
  172. }
  173. bytes_read = fread(index->buckets, 1, buckets_length, fd);
  174. if(bytes_read != buckets_length) {
  175. if(ferror(fd)) {
  176. EPRINTF_PATH(path, "fread buckets failed (expected %ju, got %ju)",
  177. (uintmax_t) buckets_length, (uintmax_t) bytes_read);
  178. }
  179. else {
  180. EPRINTF_MSG_PATH(path, "fread buckets failed (expected %ju, got %ju)",
  181. (uintmax_t) buckets_length, (uintmax_t) bytes_read);
  182. }
  183. free(index->buckets);
  184. free(index);
  185. index = NULL;
  186. goto fail;
  187. }
  188. index->num_entries = _le32toh(header.num_entries);
  189. index->num_buckets = _le32toh(header.num_buckets);
  190. index->key_size = header.key_size;
  191. index->value_size = header.value_size;
  192. index->bucket_size = index->key_size + index->value_size;
  193. index->lower_limit = index->num_buckets > MIN_BUCKETS ? ((int)(index->num_buckets * BUCKET_LOWER_LIMIT)) : 0;
  194. index->upper_limit = (int)(index->num_buckets * BUCKET_UPPER_LIMIT);
  195. fail:
  196. if(fclose(fd) < 0) {
  197. EPRINTF_PATH(path, "fclose failed");
  198. }
  199. return index;
  200. }
  201. static HashIndex *
  202. hashindex_init(int capacity, int key_size, int value_size)
  203. {
  204. off_t buckets_length;
  205. HashIndex *index;
  206. int i;
  207. capacity = MAX(MIN_BUCKETS, capacity);
  208. if(!(index = malloc(sizeof(HashIndex)))) {
  209. EPRINTF("malloc header failed");
  210. return NULL;
  211. }
  212. buckets_length = (off_t)capacity * (key_size + value_size);
  213. if(!(index->buckets = calloc(buckets_length, 1))) {
  214. EPRINTF("malloc buckets failed");
  215. free(index);
  216. return NULL;
  217. }
  218. index->num_entries = 0;
  219. index->key_size = key_size;
  220. index->value_size = value_size;
  221. index->num_buckets = capacity;
  222. index->bucket_size = index->key_size + index->value_size;
  223. index->lower_limit = index->num_buckets > MIN_BUCKETS ? ((int)(index->num_buckets * BUCKET_LOWER_LIMIT)) : 0;
  224. index->upper_limit = (int)(index->num_buckets * BUCKET_UPPER_LIMIT);
  225. for(i = 0; i < capacity; i++) {
  226. BUCKET_MARK_EMPTY(index, i);
  227. }
  228. return index;
  229. }
  230. static void
  231. hashindex_free(HashIndex *index)
  232. {
  233. free(index->buckets);
  234. free(index);
  235. }
  236. static int
  237. hashindex_write(HashIndex *index, const char *path)
  238. {
  239. off_t buckets_length = (off_t)index->num_buckets * index->bucket_size;
  240. FILE *fd;
  241. HashHeader header = {
  242. .magic = MAGIC,
  243. .num_entries = _htole32(index->num_entries),
  244. .num_buckets = _htole32(index->num_buckets),
  245. .key_size = index->key_size,
  246. .value_size = index->value_size
  247. };
  248. int ret = 1;
  249. if((fd = fopen(path, "wb")) == NULL) {
  250. EPRINTF_PATH(path, "fopen for writing failed");
  251. return 0;
  252. }
  253. if(fwrite(&header, 1, sizeof(header), fd) != sizeof(header)) {
  254. EPRINTF_PATH(path, "fwrite header failed");
  255. ret = 0;
  256. }
  257. if(fwrite(index->buckets, 1, buckets_length, fd) != buckets_length) {
  258. EPRINTF_PATH(path, "fwrite buckets failed");
  259. ret = 0;
  260. }
  261. if(fclose(fd) < 0) {
  262. EPRINTF_PATH(path, "fclose failed");
  263. }
  264. return ret;
  265. }
  266. static const void *
  267. hashindex_get(HashIndex *index, const void *key)
  268. {
  269. int idx = hashindex_lookup(index, key);
  270. if(idx < 0) {
  271. return NULL;
  272. }
  273. return BUCKET_ADDR(index, idx) + index->key_size;
  274. }
  275. static int
  276. hashindex_set(HashIndex *index, const void *key, const void *value)
  277. {
  278. int idx = hashindex_lookup(index, key);
  279. uint8_t *ptr;
  280. if(idx < 0)
  281. {
  282. if(index->num_entries > index->upper_limit) {
  283. if(!hashindex_resize(index, index->num_buckets * 2)) {
  284. return 0;
  285. }
  286. }
  287. idx = hashindex_index(index, key);
  288. while(!BUCKET_IS_EMPTY(index, idx) && !BUCKET_IS_DELETED(index, idx)) {
  289. idx = (idx + 1) % index->num_buckets;
  290. }
  291. ptr = BUCKET_ADDR(index, idx);
  292. memcpy(ptr, key, index->key_size);
  293. memcpy(ptr + index->key_size, value, index->value_size);
  294. index->num_entries += 1;
  295. }
  296. else
  297. {
  298. memcpy(BUCKET_ADDR(index, idx) + index->key_size, value, index->value_size);
  299. }
  300. return 1;
  301. }
  302. static int
  303. hashindex_delete(HashIndex *index, const void *key)
  304. {
  305. int idx = hashindex_lookup(index, key);
  306. if (idx < 0) {
  307. return 1;
  308. }
  309. BUCKET_MARK_DELETED(index, idx);
  310. index->num_entries -= 1;
  311. if(index->num_entries < index->lower_limit) {
  312. if(!hashindex_resize(index, index->num_buckets / 2)) {
  313. return 0;
  314. }
  315. }
  316. return 1;
  317. }
  318. static void *
  319. hashindex_next_key(HashIndex *index, const void *key)
  320. {
  321. int idx = 0;
  322. if(key) {
  323. idx = 1 + (key - index->buckets) / index->bucket_size;
  324. }
  325. if (idx == index->num_buckets) {
  326. return NULL;
  327. }
  328. while(BUCKET_IS_EMPTY(index, idx) || BUCKET_IS_DELETED(index, idx)) {
  329. idx ++;
  330. if (idx == index->num_buckets) {
  331. return NULL;
  332. }
  333. }
  334. return BUCKET_ADDR(index, idx);
  335. }
  336. static int
  337. hashindex_get_size(HashIndex *index)
  338. {
  339. return index->num_entries;
  340. }
  341. static void
  342. hashindex_summarize(HashIndex *index, long long *total_size, long long *total_csize,
  343. long long *total_unique_size, long long *total_unique_csize,
  344. long long *total_unique_chunks, long long *total_chunks)
  345. {
  346. int64_t size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0;
  347. const int32_t *values;
  348. void *key = NULL;
  349. while((key = hashindex_next_key(index, key))) {
  350. values = key + index->key_size;
  351. unique_chunks++;
  352. chunks += values[0];
  353. unique_size += values[1];
  354. unique_csize += values[2];
  355. size += (int64_t) values[0] * values[1];
  356. csize += (int64_t) values[0] * values[2];
  357. }
  358. *total_size = size;
  359. *total_csize = csize;
  360. *total_unique_size = unique_size;
  361. *total_unique_csize = unique_csize;
  362. *total_unique_chunks = unique_chunks;
  363. *total_chunks = chunks;
  364. }
  365. static void
  366. hashindex_merge(HashIndex *index, HashIndex *other)
  367. {
  368. int32_t key_size = index->key_size;
  369. const int32_t *other_values;
  370. int32_t *my_values;
  371. void *key = NULL;
  372. while((key = hashindex_next_key(other, key))) {
  373. other_values = key + key_size;
  374. my_values = (int32_t *)hashindex_get(index, key);
  375. if(my_values == NULL) {
  376. hashindex_set(index, key, other_values);
  377. } else {
  378. *my_values += *other_values;
  379. }
  380. }
  381. }