|
@@ -1,14 +1,13 @@
|
|
|
from ConfigParser import RawConfigParser
|
|
|
import fcntl
|
|
|
-import numpy
|
|
|
import os
|
|
|
import shutil
|
|
|
import struct
|
|
|
import tempfile
|
|
|
import unittest
|
|
|
-from UserDict import DictMixin
|
|
|
|
|
|
from .lrucache import LRUCache
|
|
|
+from .hashindex import NSIndex, BandIndex
|
|
|
|
|
|
|
|
|
class Store(object):
|
|
@@ -20,7 +19,7 @@ class Store(object):
|
|
|
dir/bands/<X / BANDS_PER_DIR>/<X>
|
|
|
dir/indexes/<NS>
|
|
|
"""
|
|
|
- DEFAULT_MAX_BAND_SIZE = 10 * 1024 * 1024
|
|
|
+ DEFAULT_MAX_BAND_SIZE = 5 * 1024 * 1024
|
|
|
DEFAULT_BANDS_PER_DIR = 10000
|
|
|
|
|
|
class DoesNotExist(KeyError):
|
|
@@ -44,6 +43,7 @@ class Store(object):
|
|
|
fd.write('This is a DARC store')
|
|
|
os.mkdir(os.path.join(path, 'bands'))
|
|
|
os.mkdir(os.path.join(path, 'indexes'))
|
|
|
+ BandIndex.create(os.path.join(path, 'indexes', 'bands'))
|
|
|
config = RawConfigParser()
|
|
|
config.add_section('store')
|
|
|
config.set('store', 'version', '1')
|
|
@@ -85,6 +85,7 @@ class Store(object):
|
|
|
os.path.join(self.path, 'txn.active'))
|
|
|
self.compact = set()
|
|
|
self.txn_active = True
|
|
|
+ self.bands = BandIndex(os.path.join(self.path, 'indexes', 'bands'))
|
|
|
|
|
|
def close(self):
|
|
|
self.rollback()
|
|
@@ -102,6 +103,7 @@ class Store(object):
|
|
|
self.config.write(fd)
|
|
|
for i in self.indexes.values():
|
|
|
i.flush()
|
|
|
+ self.bands.flush()
|
|
|
os.rename(os.path.join(self.path, 'txn.active'),
|
|
|
os.path.join(self.path, 'txn.tmp'))
|
|
|
shutil.rmtree(os.path.join(self.path, 'txn.tmp'))
|
|
@@ -115,9 +117,16 @@ class Store(object):
|
|
|
def lookup(ns, key):
|
|
|
return key in self.indexes[ns]
|
|
|
for band in self.compact:
|
|
|
- for ns, key, data in self.io.iter_objects(band, lookup):
|
|
|
- self.indexes[ns][key] = self.io.write(ns, key, data)
|
|
|
+ if self.bands[band] > 0:
|
|
|
+ for ns, key, data in self.io.iter_objects(band, lookup):
|
|
|
+ new_band, offset = self.io.write(ns, key, data)
|
|
|
+ self.indexes[ns][key] = new_band, offset
|
|
|
+ self.bands[band] -= 1
|
|
|
+ self.bands.setdefault(new_band, 0)
|
|
|
+ self.bands[new_band] += 1
|
|
|
+
|
|
|
for band in self.compact:
|
|
|
+ assert self.bands.pop(band) == 0
|
|
|
self.io.delete_band(band)
|
|
|
|
|
|
def rollback(self):
|
|
@@ -141,11 +150,11 @@ class Store(object):
|
|
|
try:
|
|
|
return self.indexes[ns]
|
|
|
except KeyError:
|
|
|
- filename = os.path.join(self.path, 'indexes', str(ns))
|
|
|
+ filename = os.path.join(self.path, 'indexes', 'ns%d' % ns)
|
|
|
if os.path.exists(filename):
|
|
|
- self.indexes[ns] = HashIndex(filename)
|
|
|
+ self.indexes[ns] = NSIndex(filename)
|
|
|
else:
|
|
|
- self.indexes[ns] = HashIndex.create(filename)
|
|
|
+ self.indexes[ns] = NSIndex.create(filename)
|
|
|
return self.indexes[ns]
|
|
|
|
|
|
def get(self, ns, id):
|
|
@@ -159,6 +168,8 @@ class Store(object):
|
|
|
if not self.txn_active:
|
|
|
self.begin_txn()
|
|
|
band, offset = self.io.write(ns, id, data)
|
|
|
+ self.bands.setdefault(band, 0)
|
|
|
+ self.bands[band] += 1
|
|
|
self.get_index(ns)[id] = band, offset
|
|
|
|
|
|
def delete(self, ns, id):
|
|
@@ -166,6 +177,7 @@ class Store(object):
|
|
|
self.begin_txn()
|
|
|
try:
|
|
|
band, offset = self.get_index(ns).pop(id)
|
|
|
+ self.bands[band] -= 1
|
|
|
self.compact.add(band)
|
|
|
except KeyError:
|
|
|
raise self.DoesNotExist
|
|
@@ -175,130 +187,6 @@ class Store(object):
|
|
|
self.get_index(ns).iteritems(marker=marker, limit=limit)]
|
|
|
|
|
|
|
|
|
-class HashIndex(DictMixin):
|
|
|
- """Hash Table with open addressing and lazy deletes
|
|
|
- """
|
|
|
- EMPTY, DELETED = -1, -2
|
|
|
- FREE = (EMPTY, DELETED)
|
|
|
-
|
|
|
- i_fmt = struct.Struct('<i')
|
|
|
- assert i_fmt.size == 4
|
|
|
- idx_type = numpy.dtype('V32,<i,<i')
|
|
|
- assert idx_type.itemsize == 40
|
|
|
-
|
|
|
- def __init__(self, path):
|
|
|
- self.path = path
|
|
|
- self.fd = open(path, 'r+')
|
|
|
- assert self.fd.read(8) == 'DARCHASH'
|
|
|
- self.num_entries = self.i_fmt.unpack(self.fd.read(4))[0]
|
|
|
- self.buckets = numpy.memmap(self.fd, self.idx_type, offset=12)
|
|
|
- self.limit = 3 * self.buckets.size / 4 # 75% fill rate
|
|
|
-
|
|
|
- def flush(self):
|
|
|
- self.fd.seek(8)
|
|
|
- self.fd.write(self.i_fmt.pack(self.num_entries))
|
|
|
- self.fd.flush()
|
|
|
- self.buckets.flush()
|
|
|
-
|
|
|
- @classmethod
|
|
|
- def create(cls, path, capacity=1024):
|
|
|
- with open(path, 'wb') as fd:
|
|
|
- fd.write('DARCHASH\0\0\0\0')
|
|
|
- a = numpy.zeros(capacity, cls.idx_type)
|
|
|
- for i in xrange(capacity):
|
|
|
- a[i][1] = cls.EMPTY
|
|
|
- a.tofile(fd)
|
|
|
- return cls(path)
|
|
|
-
|
|
|
- def index(self, key):
|
|
|
- hash = self.i_fmt.unpack(key[:4])[0]
|
|
|
- return hash % self.buckets.size
|
|
|
-
|
|
|
- def lookup(self, key):
|
|
|
- didx = -1
|
|
|
- idx = self.index(key)
|
|
|
- while True:
|
|
|
- while self.buckets[idx][1] == self.DELETED:
|
|
|
- if didx == -1:
|
|
|
- didx = idx
|
|
|
- idx = (idx + 1) % self.buckets.size
|
|
|
- if self.buckets[idx][1] == self.EMPTY:
|
|
|
- raise KeyError
|
|
|
- if str(self.buckets[idx][0]) == key:
|
|
|
- if didx != -1:
|
|
|
- self.buckets[didx] = self.buckets[idx]
|
|
|
- self.buckets[idx][1] = self.DELETED
|
|
|
- idx = didx
|
|
|
- return idx
|
|
|
- idx = (idx + 1) % self.buckets.size
|
|
|
-
|
|
|
- def __contains__(self, key):
|
|
|
- try:
|
|
|
- self[key]
|
|
|
- return True
|
|
|
- except KeyError:
|
|
|
- return False
|
|
|
-
|
|
|
- def pop(self, key):
|
|
|
- idx = self.lookup(key)
|
|
|
- band = self.buckets[idx][1]
|
|
|
- self.buckets[idx][1] = self.DELETED
|
|
|
- self.num_entries -= 1
|
|
|
- return band, self.buckets[idx][2]
|
|
|
-
|
|
|
- def __getitem__(self, key):
|
|
|
- idx = self.lookup(key)
|
|
|
- return self.buckets[idx][1], self.buckets[idx][2]
|
|
|
-
|
|
|
- def __delitem__(self, key):
|
|
|
- self.buckets[self.lookup(key)][1] = self.DELETED
|
|
|
- self.num_entries -= 1
|
|
|
-
|
|
|
- def __setitem__(self, key, value):
|
|
|
- if self.num_entries >= self.limit:
|
|
|
- self.resize()
|
|
|
- try:
|
|
|
- idx = self.lookup(key)
|
|
|
- self.buckets[idx][1], self.buckets[idx][2] = value
|
|
|
- return
|
|
|
- except KeyError:
|
|
|
- idx = self.index(key)
|
|
|
- while self.buckets[idx][1] not in self.FREE:
|
|
|
- idx = (idx + 1) % self.buckets.size
|
|
|
- self.buckets[idx][1], self.buckets[idx][2] = value
|
|
|
- self.buckets[idx][0] = key
|
|
|
- self.num_entries += 1
|
|
|
-
|
|
|
- def iteritems(self, limit=0, marker=None):
|
|
|
- n = 0
|
|
|
- for idx in xrange(self.buckets.size):
|
|
|
- if self.buckets[idx][1] in self.FREE:
|
|
|
- continue
|
|
|
- key = str(self.buckets[idx][0])
|
|
|
- if marker and key != marker:
|
|
|
- continue
|
|
|
- elif marker:
|
|
|
- marker = None
|
|
|
- yield key, (self.buckets[idx][1], self.buckets[idx][2])
|
|
|
- n += 1
|
|
|
- if n == limit:
|
|
|
- return
|
|
|
-
|
|
|
- def resize(self, capacity=0):
|
|
|
- capacity = capacity or self.buckets.size * 2
|
|
|
- if capacity < self.num_entries:
|
|
|
- raise ValueError('HashIndex full')
|
|
|
- new = HashIndex.create(self.path + '.tmp', capacity)
|
|
|
- for key, value in self.iteritems():
|
|
|
- new[key] = value
|
|
|
- new.flush()
|
|
|
- os.unlink(self.path)
|
|
|
- os.rename(self.path + '.tmp', self.path)
|
|
|
- self.fd = new.fd
|
|
|
- self.buckets = new.buckets
|
|
|
- self.limit = 3 * self.buckets.size / 4
|
|
|
-
|
|
|
-
|
|
|
class BandIO(object):
|
|
|
|
|
|
header_fmt = struct.Struct('<iBB32s')
|