Procházet zdrojové kódy

New filesystem based store implementation

Jonas Borgström před 14 roky
rodič
revize
a04ad26975
7 změněných souborů: 519 přidání a 230 odebrání
  1. 4 4
      darc/__init__.py
  2. 31 13
      darc/archive.py
  3. 0 3
      darc/archiver.py
  4. 6 6
      darc/cache.py
  5. 115 0
      darc/lrucache.py
  6. 361 203
      darc/store.py
  7. 2 1
      darc/test.py

+ 4 - 4
darc/__init__.py

@@ -1,7 +1,7 @@
 # This is a python package
 
-NS_ARCHIVE_ITEMS = 'AI'
-NS_ARCHIVE_CHUNKS = 'AC'
-NS_ARCHIVE_METADATA = 'AM'
-NS_CHUNK = 'C'
+NS_CHUNK = 0
+NS_ARCHIVE_METADATA = 1
+NS_ARCHIVE_CHUNKS = 2
+NS_ARCHIVE_ITEMS = 3
 

+ 31 - 13
darc/archive.py

@@ -42,11 +42,13 @@ class Archive(object):
         assert self.metadata['version'] == 1
 
     def get_chunks(self):
-        data, chunks_hash = self.keychain.decrypt(self.store.get(NS_ARCHIVE_CHUNKS, self.id))
-        chunks = msgpack.unpackb(data)
-        assert chunks['version'] == 1
-        assert self.metadata['chunks_hash'] == chunks_hash
-        return chunks['chunks']
+        for id in self.metadata['chunks_ids']:
+            data, items_hash = self.keychain.decrypt(self.store.get(NS_ARCHIVE_CHUNKS, id))
+            assert items_hash == id
+            items = msgpack.unpackb(data)
+            assert items['version'] == 1
+            for item in items['chunks']:
+                yield item
 
     def get_items(self):
         for id in self.metadata['items_ids']:
@@ -69,17 +71,31 @@ class Archive(object):
         self.items = []
         self.items_ids.append(items_hash)
 
+    def save_chunks(self, cache):
+        chunks = []
+        ids = []
+        def flush(chunks):
+            data = { 'version': 1, 'chunks': chunks }
+            data, chunks_hash = self.keychain.encrypt_create(msgpack.packb(data))
+            self.store.put(NS_ARCHIVE_CHUNKS, chunks_hash, data)
+            ids.append(chunks_hash)
+        for id, (count, size) in cache.chunk_counts.iteritems():
+            if count > 1000000:
+                chunks.append((id, size))
+            if len(chunks) > 100000:
+                flush(chunks)
+                chunks = []
+        flush(chunks)
+        return ids
+
     def save(self, name, cache):
         self.id = self.keychain.id_hash(name)
-        self.chunks = [(id, size) for (id, (count, size)) in cache.chunk_counts.iteritems() if count > 1000000]
-        chunks = {'version': 1, 'chunks': self.chunks}
-        data, chunks_hash = self.keychain.encrypt_create(msgpack.packb(chunks))
-        self.store.put(NS_ARCHIVE_CHUNKS, self.id, data)
+        chunks_ids = self.save_chunks(cache)
         self.flush_items()
         metadata = {
             'version': 1,
             'name': name,
-            'chunks_hash': chunks_hash,
+            'chunks_ids': chunks_ids,
             'items_ids': self.items_ids,
             'cmdline': sys.argv,
             'hostname': socket.gethostname(),
@@ -181,11 +197,13 @@ class Archive(object):
         return True
 
     def delete(self, cache):
-        self.store.delete(NS_ARCHIVE_CHUNKS, self.id)
-        self.store.delete(NS_ARCHIVE_ITEMS, self.id)
-        self.store.delete(NS_ARCHIVE_METADATA, self.id)
         for id, size in self.get_chunks():
             cache.chunk_decref(id)
+        self.store.delete(NS_ARCHIVE_METADATA, self.id)
+        for id in self.metadata['chunks_ids']:
+            self.store.delete(NS_ARCHIVE_CHUNKS, id)
+        for id in self.metadata['items_ids']:
+            self.store.delete(NS_ARCHIVE_ITEMS, id)
         self.store.commit()
         cache.save()
 

+ 0 - 3
darc/archiver.py

@@ -156,8 +156,6 @@ class Archiver(object):
         return self.exit_code
 
     def do_verify(self, args):
-        import ipdb
-        ipdb.set_trace()
         store = self.open_store(args.archive)
         keychain = Keychain(args.keychain)
         archive = Archive(store, keychain, args.archive.archive)
@@ -182,7 +180,6 @@ class Archiver(object):
         print 'Username:', archive.metadata['username']
         print 'Time:', archive.metadata['time']
         print 'Command line:', ' '.join(archive.metadata['cmdline'])
-        print 'Number of Files:', len(archive.get_items())
         print 'Original size:', format_file_size(osize)
         print 'Compressed size:', format_file_size(csize)
         print 'Unique data:', format_file_size(usize)

+ 6 - 6
darc/cache.py

@@ -9,13 +9,13 @@ class Cache(object):
     """
 
     def __init__(self, store, keychain):
+        self.tid = -1
         self.store = store
         self.keychain = keychain
         self.path = os.path.join(Cache.cache_dir_path(),
                                  '%s.cache' % self.store.id.encode('hex'))
-        self.tid = -1
         self.open()
-        if self.tid != self.store.tid:
+        if self.tid != store.tid:
             self.init()
 
     @staticmethod
@@ -40,10 +40,10 @@ class Cache(object):
         print 'Initializing cache...'
         self.chunk_counts = {}
         self.file_chunks = {}
-        self.tid = self.store.tid
-        if self.store.tid == 0:
-            return
-        for id in list(self.store.list(NS_ARCHIVE_CHUNKS)):
+        for id in self.store.list(NS_ARCHIVE_CHUNKS):
+            if len(id) != 32:
+                import ipdb
+                ipdb.set_trace()
             data, hash = self.keychain.decrypt(self.store.get(NS_ARCHIVE_CHUNKS, id))
             cindex = msgpack.unpackb(data)
             for id, size in cindex['chunks']:

+ 115 - 0
darc/lrucache.py

@@ -0,0 +1,115 @@
+from UserDict import DictMixin
+from heapq import heappush, heapify, heapreplace, heappop
+import unittest
+
+
+class LRUCache(DictMixin):
+    """Heap queue based Least Recently Used Cache implementation
+    """
+
+    class Node(object):
+        """Internal cache node
+        """
+        __slots__ = ('key', 'value', 't')
+
+        def __init__(self, key, value, t):
+            self.key = key
+            self.value = value
+            self.t = t
+
+        def __cmp__(self, other):
+            return cmp(self.t, other.t)
+
+
+    def __init__(self, size):
+        self._heap = []
+        self._dict = {}
+        self.size = size
+        self._t = 0
+
+    def __setitem__(self, key, value):
+        self._t += 1
+        try:
+            node = self._dict[key]
+            node.value = value
+            node.t = self._t
+            heapify(self._heap)
+        except KeyError:
+            node = self.Node(key, value, self._t)
+            self._dict[key] = node
+            if len(self) < self.size:
+                heappush(self._heap, node)
+            else:
+                old = heapreplace(self._heap, node)
+                del self._dict[old.key]
+
+    def __getitem__(self, key):
+        node = self._dict[key]
+        self[key] = node.value
+        return node.value
+
+    def __delitem__(self, key):
+        node = self._dict[key]
+        del self._dict[key]
+        self._heap.remove(node)
+        heapify(self._heap)
+
+    def __iter__(self):
+        copy = self._heap[:]
+        while copy:
+            yield heappop(copy).key
+
+    def iteritems(self):
+        copy = self._heap[:]
+        while copy:
+            node = heappop(copy)
+            yield node.key, node.value
+
+    def keys(self):
+        return self._dict.keys()
+
+    def __contains__(self, key):
+        return key in self._dict
+
+    def __len__(self):
+        return len(self._heap)
+
+
+class LRUCacheTestCase(unittest.TestCase):
+
+    def test(self):
+        c = LRUCache(2)
+        self.assertEqual(len(c), 0)
+        for i, x in enumerate('abc'):
+            c[x] = i
+        self.assertEqual(len(c), 2)
+        self.assertEqual(list(c), ['b', 'c'])
+        self.assertEqual(list(c.iteritems()), [('b', 1), ('c', 2)])
+        self.assertEqual(False, 'a' in c)
+        self.assertEqual(True, 'b' in c)
+        self.assertRaises(KeyError, lambda: c['a'])
+        self.assertEqual(c['b'], 1)
+        self.assertEqual(c['c'], 2)
+        c['d'] = 3
+        self.assertEqual(len(c), 2)
+        self.assertEqual(c['c'], 2)
+        self.assertEqual(c['d'], 3)
+        c['c'] = 22
+        c['e'] = 4
+        self.assertEqual(len(c), 2)
+        self.assertRaises(KeyError, lambda: c['d'])
+        self.assertEqual(c['c'], 22)
+        self.assertEqual(c['e'], 4)
+        del c['c']
+        self.assertEqual(len(c), 1)
+        self.assertRaises(KeyError, lambda: c['c'])
+        self.assertEqual(c['e'], 4)
+
+
+def suite():
+    return unittest.TestLoader().loadTestsFromTestCase(LRUCacheTestCase)
+
+
+if __name__ == '__main__':
+    unittest.main()
+

+ 361 - 203
darc/store.py

@@ -1,212 +1,380 @@
+from ConfigParser import RawConfigParser
+import fcntl
+import numpy
 import os
-import tempfile
 import shutil
+import struct
+import tempfile
 import unittest
-import sqlite3
-import fcntl
+from UserDict import DictMixin
 
-Binary = sqlite3.Binary
+from .lrucache import LRUCache
 
 
 class Store(object):
+    """Filesystem based transactional key value store
+
+    On disk layout:
+    dir/README
+    dir/config
+    dir/bands/<X / BANDS_PER_DIR>/<X>
+    dir/indexes/<NS>
     """
-    """
+    DEFAULT_MAX_BAND_SIZE = 10 * 1024 * 1024
+    DEFAULT_BANDS_PER_DIR = 10000
 
     class DoesNotExist(KeyError):
-        """"""
+        """Requested key does not exist"""
 
-    class AlreadyExists(KeyError):
-        """"""
-
-    IDLE = 'Idle'
-    OPEN = 'Open'
-    ACTIVE = 'Active'
-    BAND_LIMIT = 1 * 1024 * 1024
 
     def __init__(self, path, create=False):
-        self.read_fd = None
-        self.write_fd = None
+        self.txn_active = False
         if create:
             self.create(path)
         self.open(path)
 
-    def get_option(self, key):
-        return self.cursor.execute('SELECT value FROM system WHERE key=?', (key,)) \
-            .fetchone()[0]
-
-    def set_option(self, key, value):
-        return self.cursor.execute('UPDATE system SET value=? WHERE key=?',
-                                   (value, key))
-
-    def open(self, path):
-        if not os.path.isdir(path):
-            raise Exception('%s Does not look like a darc store' % path)
-        db_path = os.path.join(path, 'darcstore.db')
-        if not os.path.exists(db_path):
-            raise Exception('%s Does not look like a darc store')
-        self.lock_fd = open(os.path.join(path, 'lock'), 'w')
-        fcntl.flock(self.lock_fd, fcntl.LOCK_EX)
-        self.path = path
-        self.cnx = sqlite3.connect(db_path)
-        self.cnx.text_factory = str
-        self.cursor = self.cnx.cursor()
-        self._begin()
-
-    def _begin(self):
-        if self.read_fd:
-            self.read_fd.close()
-            self.read_fd = None
-        if self.write_fd:
-            self.write_fd.close()
-            self.write_fd = None
-        self.version = self.get_option('version')
-        self.id = self.get_option('id').decode('hex')
-        self.tid = self.get_option('tid')
-        self.nextband = self.get_option('nextband')
-        self.bandlimit = self.get_option('bandlimit')
-        assert self.version == 1
-        self.state = self.OPEN
-        self.read_band = None
-        self.write_band = None
-        self.to_delete = set()
-        band = self.nextband
-        while os.path.exists(self.band_filename(band)):
-            os.unlink(self.band_filename(band))
-            band += 1
-
     def create(self, path):
+        """Create a new empty store at `path`
+        """
         if os.path.exists(path) and (not os.path.isdir(path) or os.listdir(path)):
             raise Exception('Path "%s" already exists' % path)
         if not os.path.exists(path):
             os.mkdir(path)
+        with open(os.path.join(path, 'README'), 'wb') as fd:
+            fd.write('This is a DARC store')
         os.mkdir(os.path.join(path, 'bands'))
-        cnx = sqlite3.connect(os.path.join(path, 'darcstore.db'))
-        cnx.execute('CREATE TABLE objects(ns BINARY NOT NULL, id BINARY NOT NULL, '
-                    'band NOT NULL, offset NOT NULL, size NOT NULL)')
-        cnx.execute('CREATE UNIQUE INDEX objects_pk ON objects(ns, id)')
-        cnx.execute('CREATE TABLE system(key UNIQUE NOT NULL, value)')
-        cnx.executemany('INSERT INTO system VALUES(?, ?)',
-                        (('id', os.urandom(32).encode('hex')),
-                         ('version', 1),
-                         ('tid', 0),
-                         ('nextband', 0),
-                         ('bandlimit', self.BAND_LIMIT)))
-        cnx.commit()
+        os.mkdir(os.path.join(path, 'indexes'))
+        config = RawConfigParser()
+        config.add_section('store')
+        config.set('store', 'version', '1')
+        config.set('store', 'id', os.urandom(32).encode('hex'))
+        config.set('store', 'bands_per_dir', self.DEFAULT_BANDS_PER_DIR)
+        config.set('store', 'max_band_size', self.DEFAULT_MAX_BAND_SIZE)
+        config.add_section('state')
+        config.set('state', 'next_band', '0')
+        config.set('state', 'tid', '0')
+        with open(os.path.join(path, 'config'), 'w') as fd:
+            config.write(fd)
+
+    def open(self, path):
+        self.path = path
+        if not os.path.isdir(path):
+            raise Exception('%s Does not look like a darc store' % path)
+        self.lock_fd = open(os.path.join(path, 'README'), 'r+')
+        fcntl.flock(self.lock_fd, fcntl.LOCK_EX)
+        self.config = RawConfigParser()
+        self.config.read(os.path.join(path, 'config'))
+        if self.config.getint('store', 'version') != 1:
+            raise Exception('%s Does not look like a darc store')
+        self.id = self.config.get('store', 'id')
+        self.tid = self.config.getint('state', 'tid')
+        next_band = self.config.getint('state', 'next_band')
+        max_band_size = self.config.getint('store', 'max_band_size')
+        bands_per_dir = self.config.getint('store', 'bands_per_dir')
+        self.rollback()
+        self.io = BandIO(self.path, next_band, max_band_size, bands_per_dir)
+
+    def begin_txn(self):
+        txn_dir = os.path.join(self.path, 'txn.tmp')
+        # Initialize transaction snapshot
+        os.mkdir(txn_dir)
+        shutil.copytree(os.path.join(self.path, 'indexes'),
+                        os.path.join(txn_dir, 'indexes'))
+        shutil.copy(os.path.join(self.path, 'config'), txn_dir)
+        os.rename(os.path.join(self.path, 'txn.tmp'),
+                  os.path.join(self.path, 'txn.active'))
+        self.compact = set()
+        self.txn_active = True
 
     def close(self):
         self.rollback()
-        self.cnx.close()
         self.lock_fd.close()
-        os.unlink(os.path.join(self.path, 'lock'))
 
     def commit(self):
+        """Commit transaction, `tid` will be increased by 1
         """
-        """
-        self.band = None
-        self.delete_bands()
+        self.compact_bands()
+        self.io.close()
         self.tid += 1
-        self._begin()
-
-    def delete_bands(self):
-        for b in self.to_delete:
-            objects = self.cursor.execute('SELECT ns, id, offset, size '
-                                          'FROM objects WHERE band=? ORDER BY offset',
-                                          (b,)).fetchall()
-            for o in objects:
-                band, offset, size = self.store_data(self.retrieve_data(b, *o[2:]))
-                self.cursor.execute('UPDATE objects SET band=?, offset=?, size=? '
-                                    'WHERE ns=? AND id=?', (band, offset, size,
-                                    Binary(o[0]), Binary(o[1])))
-        self.set_option('tid', self.tid + 1)
-        self.set_option('nextband', self.nextband)
-        self.cnx.commit()
-        for b in self.to_delete:
-            os.unlink(self.band_filename(b))
+        self.config.set('state', 'tid', self.tid)
+        self.config.set('state', 'next_band', self.io.band + 1)
+        with open(os.path.join(self.path, 'config'), 'w') as fd:
+            self.config.write(fd)
+        for i in self.indexes.values():
+            i.flush()
+        os.rename(os.path.join(self.path, 'txn.active'),
+                  os.path.join(self.path, 'txn.tmp'))
+        shutil.rmtree(os.path.join(self.path, 'txn.tmp'))
+        self.indexes = {}
+        self.txn_active = False
+
+    def compact_bands(self):
+        if not self.compact:
+            return
+        self.io.close_band()
+        for band in self.compact:
+            for ns, key, offset, size in self.io.iter_objects(band):
+                if key in self.indexes[ns]:
+                    del self.indexes[ns][key]
+                    data = self.io.read(band, offset)
+                    self.indexes[ns][key] = self.io.write(ns, key, data)
+        for band in self.compact:
+            self.io.delete_band(band)
 
     def rollback(self):
         """
         """
-        self.cnx.rollback()
-        self._begin()
+        # Remove partial transaction
+        if os.path.exists(os.path.join(self.path, 'txn.tmp')):
+            shutil.rmtree(os.path.join(self.path, 'txn.tmp'))
+        # Roll back active transaction
+        txn_dir = os.path.join(self.path, 'txn.active')
+        if os.path.exists(txn_dir):
+            shutil.rmtree(os.path.join(self.path, 'indexes'))
+            shutil.copytree(os.path.join(txn_dir, 'indexes'),
+                            os.path.join(self.path, 'indexes'))
+            shutil.copy(os.path.join(txn_dir, 'config'), self.path)
+            shutil.rmtree(txn_dir)
+        self.indexes = {}
+        self.txn_active = False
+
+    def get_index(self, ns):
+        try:
+            return self.indexes[ns]
+        except KeyError:
+            filename = os.path.join(self.path, 'indexes', str(ns))
+            if os.path.exists(filename):
+                self.indexes[ns] = HashIndex(filename)
+            else:
+                self.indexes[ns] = HashIndex.create(filename)
+            return self.indexes[ns]
 
     def get(self, ns, id):
-        """
-        """
-        self.cursor.execute('SELECT band, offset, size FROM objects WHERE ns=? and id=?',
-                            (Binary(ns), Binary(id)))
-        row = self.cursor.fetchone()
-        if row:
-            return self.retrieve_data(*row)
-        else:
+        try:
+            band, offset = self.get_index(ns)[id]
+            return self.io.read(band, offset)
+        except KeyError:
             raise self.DoesNotExist
 
-    def band_filename(self, band):
-        return os.path.join(self.path, 'bands', str(band / 1000), str(band))
-
-    def retrieve_data(self, band, offset, size):
-        if self.write_band == band:
-            self.write_fd.flush()
-        if self.read_band != band:
-            self.read_band = band
-            if self.read_fd:
-                self.read_fd.close()
-            self.read_fd = open(self.band_filename(band), 'rb')
-        self.read_fd.seek(offset)
-        return self.read_fd.read(size)
-
-    def store_data(self, data):
-        if self.write_band is None:
-            self.write_band = self.nextband
-            self.nextband += 1
-            if self.write_band % 1000 == 0:
-                path = os.path.join(self.path, 'bands', str(self.write_band / 1000))
-                if not os.path.exists(path):
-                    os.mkdir(path)
-            assert not os.path.exists(self.band_filename(self.write_band))
-            self.write_fd = open(self.band_filename(self.write_band), 'ab')
-        band = self.write_band
-        offset = self.write_fd.tell()
-        self.write_fd.write(data)
-        if offset + len(data) > self.bandlimit:
-            self.write_band = None
-        return band, offset, len(data)
-
     def put(self, ns, id, data):
-        """
-        """
-        try:
-            band, offset, size = self.store_data(data)
-            self.cursor.execute('INSERT INTO objects (ns, id, band, offset, size) '
-                                'VALUES(?, ?, ?, ?, ?)',
-                                (Binary(ns), Binary(id), band, offset, size))
-        except sqlite3.IntegrityError:
-            raise self.AlreadyExists
+        if not self.txn_active:
+            self.begin_txn()
+        band, offset = self.io.write(ns, id, data)
+        self.get_index(ns)[id] = band, offset
 
     def delete(self, ns, id):
-        """
-        """
-        self.cursor.execute('SELECT band FROM objects WHERE ns=? and id=?',
-                            (Binary(ns), Binary(id)))
-        row = self.cursor.fetchone()
-        if not row:
+        if not self.txn_active:
+            self.begin_txn()
+        try:
+            band, offset = self.get_index(ns).pop(id)
+            self.compact.add(band)
+        except KeyError:
             raise self.DoesNotExist
-        self.cursor.execute('DELETE FROM objects WHERE ns=? AND id=?',
-                           (Binary(ns), Binary(id)))
-        self.to_delete.add(row[0])
 
-    def list(self, ns, prefix='', marker=None, max_keys=1000000):
-        """
-        """
-        sql = 'SELECT id FROM objects WHERE ns=:ns'
-        args = dict(ns=Binary(ns))
-        if prefix:
-            args['prefix'] = Binary(prefix)
-            args['end'] = Binary(prefix + chr(255))
-            sql += ' AND id >= :prefix AND id < :end'
-        if marker:
-            sql += ' AND id >= :marker'
-            args['marker'] = Binary(marker)
-        return list(str(row[0]) for row in self.cursor.execute(sql + ' LIMIT ' + str(max_keys), args))
+    def list(self, ns, marker=None, limit=1000000):
+        return [key for (key, value) in
+                self.get_index(ns).iteritems(marker=marker, limit=limit)]
+
+
+class HashIndex(DictMixin):
+    """Hash Table with open addressing and lazy deletes
+    """
+    EMPTY, DELETED = -1, -2
+    FREE = (EMPTY, DELETED)
+
+    i_fmt    = struct.Struct('<i')
+    assert i_fmt.size == 4
+    idx_type = numpy.dtype('<i,<i,V32')
+    assert idx_type.itemsize == 40
+
+    def __init__(self, path):
+        self.path = path
+        self.fd = open(path, 'r+')
+        assert self.fd.read(8) == 'DARCHASH'
+        self.num_entries = self.i_fmt.unpack(self.fd.read(4))[0]
+        self.buckets = numpy.memmap(self.fd, self.idx_type, offset=12)
+        self.limit = 3 * self.buckets.size / 4  # 75% fill rate
+
+    def flush(self):
+        self.fd.seek(8)
+        self.fd.write(self.i_fmt.pack(self.num_entries))
+        self.fd.flush()
+        self.buckets.flush()
+
+    @classmethod
+    def create(cls, path, capacity=1024):
+        with open(path, 'wb') as fd:
+            fd.write('DARCHASH\0\0\0\0')
+            a = numpy.zeros(capacity, cls.idx_type)
+            for i in xrange(capacity):
+                a[i][0] = cls.EMPTY
+            a.tofile(fd)
+        return cls(path)
+
+    def index(self, key):
+        hash = self.i_fmt.unpack(key[:4])[0]
+        return hash % self.buckets.size
+
+    def lookup(self, key):
+        didx = -1
+        idx = self.index(key)
+        while True:
+            while self.buckets[idx][0] == self.DELETED:
+                if didx == -1:
+                    didx = idx
+                idx = (idx + 1) % self.buckets.size
+            if self.buckets[idx][0] == self.EMPTY:
+                raise KeyError
+            if str(self.buckets[idx][2]) == key:
+                if didx != -1:
+                    self.buckets[didx] = self.buckets[idx]
+                    self.buckets[idx][0] = self.DELETED
+                    idx = didx
+                return idx
+            idx = (idx + 1) % self.buckets.size
+
+    def __contains__(self, key):
+        try:
+            self[key]
+            return True
+        except KeyError:
+            return False
+
+    def pop(self, key):
+        idx = self.lookup(key)
+        band = self.buckets[idx][0]
+        self.buckets[idx][0] = self.DELETED
+        self.num_entries -= 1
+        return band, self.buckets[idx][1]
+
+    def __getitem__(self, key):
+        idx = self.lookup(key)
+        return self.buckets[idx][0], self.buckets[idx][1]
+
+    def __delitem__(self, key):
+        self.buckets[self.lookup(key)][0] = self.DELETED
+        self.num_entries -= 1
+
+    def __setitem__(self, key, value):
+        if self.num_entries >= self.limit:
+            self.resize()
+        try:
+            idx = self.lookup(key)
+            self.buckets[idx][0], self.buckets[idx][1] = value
+            return
+        except KeyError:
+            idx = self.index(key)
+            while self.buckets[idx][0] not in self.FREE:
+                idx = (idx + 1) % self.buckets.size
+            self.buckets[idx][0], self.buckets[idx][1] = value
+            self.buckets[idx][2] = key
+            self.num_entries += 1
+
+    def iteritems(self, limit=0, marker=None):
+        n = 0
+        for idx in xrange(self.buckets.size):
+            if self.buckets[idx][0] in self.FREE:
+                continue
+            key = str(self.buckets[idx][2])
+            if marker and key != marker:
+                continue
+            elif marker:
+                marker = None
+            yield key, (self.buckets[idx][0], self.buckets[idx][1])
+            n += 1
+            if n == limit:
+                return
+
+    def resize(self, capacity=0):
+        capacity = capacity or self.buckets.size * 2
+        print 'resizing to', capacity
+        if capacity < self.num_entries:
+            raise ValueError('HashIndex full')
+        new = HashIndex.create(self.path + '.tmp', capacity)
+        for key, value in self.iteritems():
+            new[key] = value
+        new.flush()
+        os.unlink(self.path)
+        os.rename(self.path + '.tmp', self.path)
+        self.fd = new.fd
+        self.buckets = new.buckets
+        self.limit = 3 * self.buckets.size / 4
+
+
+class BandIO(object):
+
+    header_fmt = struct.Struct('<iBB32s')
+    assert header_fmt.size == 38
+
+    def __init__(self, path, nextband, limit, bands_per_dir, capacity=100):
+        self.path = path
+        self.fds = LRUCache(capacity)
+        self.band = nextband
+        self.limit = limit
+        self.bands_per_dir = bands_per_dir
+        self.offset = 0
+
+    def close(self):
+        for band in self.fds.keys():
+            self.fds.pop(band).close()
+
+    def band_filename(self, band):
+        return os.path.join(self.path, 'bands', str(band / self.bands_per_dir), str(band))
+
+    def get_fd(self, band, write=False):
+        try:
+            return self.fds[band]
+        except KeyError:
+            if write and band % 1000 == 0:
+                dirname = os.path.join(self.path, 'bands', str(band / self.bands_per_dir))
+                if not os.path.exists(dirname):
+                    os.mkdir(dirname)
+            fd = open(self.band_filename(band), write and 'w+' or 'rb')
+            self.fds[band] = fd
+            return fd
+
+    def delete_band(self, band):
+        os.unlink(self.band_filename(band))
+
+    def read(self, band, offset):
+        fd = self.get_fd(band)
+        fd.seek(offset)
+        data = fd.read(self.header_fmt.size)
+        size, magic, ns, id = self.header_fmt.unpack(data)
+        return fd.read(size - self.header_fmt.size)
+
+    def iter_objects(self, band):
+        fd = self.get_fd(band)
+        fd.seek(0)
+        assert fd.read(8) == 'DARCBAND'
+        offset = 8
+        data = fd.read(self.header_fmt.size)
+        while data:
+            size, magic, ns, key = self.header_fmt.unpack(data)
+            size -= self.header_fmt.size
+            yield ns, key, offset, size
+            offset += size + self.header_fmt.size
+            fd.seek(offset)
+            data = fd.read(self.header_fmt.size)
+
+    def write(self, ns, id, data):
+        size = len(data) + self.header_fmt.size
+        if self.offset and self.offset + size > self.limit:
+            self.close_band()
+        fd = self.get_fd(self.band, write=True)
+        fd.seek(self.offset)
+        if self.offset == 0:
+            fd.write('DARCBAND')
+            self.offset = 8
+        offset = self.offset
+        fd.write(self.header_fmt.pack(size, 0, ns, id))
+        fd.write(data)
+        self.offset += size
+        return self.band, offset
+
+    def close_band(self):
+        self.band += 1
+        self.offset = 0
 
 
 class StoreTestCase(unittest.TestCase):
@@ -220,46 +388,36 @@ class StoreTestCase(unittest.TestCase):
 
     def test1(self):
         self.assertEqual(self.store.tid, 0)
-        self.assertEqual(self.store.state, self.store.OPEN)
-        self.store.put('SOMENS', 'SOMEID', 'SOMEDATA')
-        self.assertRaises(self.store.AlreadyExists, lambda: self.store.put('SOMENS', 'SOMEID', 'SOMEDATA'))
-        self.assertEqual(self.store.get('SOMENS', 'SOMEID'), 'SOMEDATA')
-        self.store.rollback()
-        self.assertRaises(self.store.DoesNotExist, lambda: self.store.get('SOMENS', 'SOMEID'))
-        self.assertEqual(self.store.tid, 0)
-
-    def test2(self):
-        self.assertEqual(self.store.tid, 0)
-        self.assertEqual(self.store.state, self.store.OPEN)
-        self.store.put('SOMENS', 'SOMEID', 'SOMEDATA')
-        self.assertEqual(self.store.get('SOMENS', 'SOMEID'), 'SOMEDATA')
+        for x in range(100):
+            self.store.put(0, '%-32d' % x, 'SOMEDATA')
+        key50 = '%-32d' % 50
+        self.assertEqual(self.store.get(0, key50), 'SOMEDATA')
+        self.store.delete(0, key50)
+        self.assertRaises(self.store.DoesNotExist, lambda: self.store.get(0, key50))
         self.store.commit()
         self.assertEqual(self.store.tid, 1)
-        self.assertEqual(self.store.get('SOMENS', 'SOMEID'), 'SOMEDATA')
-        self.store.delete('SOMENS', 'SOMEID')
-        self.assertRaises(self.store.DoesNotExist, lambda: self.store.get('SOMENS', 'SOMEID'))
-        self.store.rollback()
-        self.assertEqual(self.store.get('SOMENS', 'SOMEID'), 'SOMEDATA')
-        self.store.delete('SOMENS', 'SOMEID')
-        self.assertRaises(self.store.DoesNotExist, lambda: self.store.get('SOMENS', 'SOMEID'))
-        self.store.commit()
-        self.assertEqual(self.store.tid, 2)
-        self.assertRaises(self.store.DoesNotExist, lambda: self.store.get('SOMENS', 'SOMEID'))
-
-    def test_list(self):
-        self.store.put('SOMENS', 'SOMEID12', 'SOMEDATA')
-        self.store.put('SOMENS', 'SOMEID', 'SOMEDATA')
-        self.store.put('SOMENS', 'SOMEID1', 'SOMEDATA')
-        self.store.put('SOMENS', 'SOMEID123', 'SOMEDATA')
-        self.store.commit()
-        self.assertEqual(list(self.store.list('SOMENS', max_keys=3)),
-            ['SOMEID', 'SOMEID1', 'SOMEID12'])
-        self.assertEqual(list(self.store.list('SOMENS', marker='SOMEID12')),
-            ['SOMEID12', 'SOMEID123'])
-        self.assertEqual(list(self.store.list('SOMENS', prefix='SOMEID1', max_keys=2)),
-            ['SOMEID1', 'SOMEID12'])
-        self.assertEqual(list(self.store.list('SOMENS', prefix='SOMEID1', marker='SOMEID12')),
-            ['SOMEID12', 'SOMEID123'])
+        self.store.close()
+        store2 = Store(os.path.join(self.tmppath, 'store'))
+        self.assertEqual(store2.tid, 1)
+        keys = store2.list(0)
+        for x in range(50):
+            key = '%-32d' % x
+            self.assertEqual(store2.get(0, key), 'SOMEDATA')
+        self.assertRaises(store2.DoesNotExist, lambda: store2.get(0, key50))
+        assert key50 not in keys
+        for x in range(51, 100):
+            key = '%-32d' % x
+            assert key in keys
+            self.assertEqual(store2.get(0, key), 'SOMEDATA')
+        self.assertEqual(len(keys), 99)
+        for x in range(50):
+            key = '%-32d' % x
+            store2.delete(0, key)
+        self.assertEqual(len(store2.list(0)), 49)
+        for x in range(51, 100):
+            key = '%-32d' % x
+            store2.delete(0, key)
+        self.assertEqual(len(store2.list(0)), 0)
 
 
 def suite():

+ 2 - 1
darc/test.py

@@ -11,7 +11,7 @@ from xattr import xattr, XATTR_NOFOLLOW
 import getpass
 getpass.getpass = lambda m: 'abc123'
 
-from . import store, helpers
+from . import store, helpers, lrucache
 from .archiver import Archiver
 
 
@@ -123,6 +123,7 @@ def suite():
     suite.addTest(unittest.TestLoader().loadTestsFromTestCase(Test))
     suite.addTest(store.suite())
     suite.addTest(doctest.DocTestSuite(helpers))
+    suite.addTest(lrucache.suite())
     return suite
 
 if __name__ == '__main__':