
Various code refactoring: the client-side Cache class moves from archiver.py into a new cache.py module, the CHUNK_SIZE/NS_ARCHIVES/NS_CHUNKS constants move into store.py (with the chunk size raised from 64 KiB to 256 KiB), and Store.foo is renamed to the more descriptive Store._walker.

Jonas Borgström, 15 years ago
parent
commit
723c636f06
4 changed files with 128 additions and 118 deletions
  1. dedupestore/__init__.py (+1 -0)
  2. dedupestore/archiver.py (+7 -112)
  3. dedupestore/cache.py (+112 -0)
  4. dedupestore/store.py (+8 -6)

+ 1 - 0
dedupestore/__init__.py

@@ -0,0 +1 @@
+# This is a python package

+ 7 - 112
dedupestore/archiver.py

@@ -2,118 +2,12 @@ import os
 import sys
 import hashlib
 import zlib
-import struct
 import cPickle
 from optparse import OptionParser
 
-from chunkifier import chunkify, checksum
-from store import Store
-
-
-CHUNKSIZE = 64 * 1024
-NS_ARCHIVES = 'ARCHIVES'
-NS_CHUNKS  = 'CHUNKS'
-
-class Cache(object):
-    """Client Side cache
-    """
-    def __init__(self, store):
-        self.store = store
-        self.path = os.path.join(os.path.expanduser('~'), '.dedupestore', 'cache', 
-                                 '%s.cache' % self.store.uuid)
-        self.tid = -1
-        self.open()
-        if self.tid != self.store.tid:
-            self.init()
-
-    def open(self):
-        if not os.path.exists(self.path):
-            return
-        print 'Loading cache: ', self.path, '...'
-        data = cPickle.loads(zlib.decompress(open(self.path, 'rb').read()))
-        if data['uuid'] != self.store.uuid:
-            print >> sys.stderr, 'Cache UUID mismatch'
-            return
-        self.chunkmap = data['chunkmap']
-        self.summap = data['summap']
-        self.archives = data['archives']
-        self.tid = data['tid']
-        print 'done'
-
-    def init(self):
-        """Initializes cache by fetching and reading all archive indicies
-        """
-        self.summap = {}
-        self.chunkmap = {}
-        self.archives = []
-        self.tid = self.store.tid
-        if self.store.tid == 0:
-            return
-        print 'Recreating cache...'
-        for id in self.store.list(NS_ARCHIVES):
-            archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, id)))
-            self.archives.append(archive['name'])
-            for item in archive['items']:
-                if item['type'] == 'FILE':
-                    for c in item['chunks']:
-                        self.chunk_incref(c)
-        print 'done'
-
-    def save(self):
-        assert self.store.state == Store.OPEN
-        print 'saving cache'
-        data = {'uuid': self.store.uuid, 
-                'chunkmap': self.chunkmap, 'summap': self.summap,
-                'tid': self.store.tid, 'archives': self.archives}
-        print 'Saving cache as:', self.path
-        cachedir = os.path.dirname(self.path)
-        if not os.path.exists(cachedir):
-            os.makedirs(cachedir)
-        with open(self.path, 'wb') as fd:
-            fd.write(zlib.compress(cPickle.dumps(data)))
-        print 'done'
-
-    def add_chunk(self, data):
-        sum = checksum(data)
-        data = zlib.compress(data)
-        #print 'chunk %d: %d' % (len(data), sum)
-        id = struct.pack('I', sum) + hashlib.sha1(data).digest()
-        if not self.seen_chunk(id):
-            size = len(data)
-            self.store.put(NS_CHUNKS, id, data)
-        else:
-            size = 0
-            #print 'seen chunk', hash.encode('hex')
-        self.chunk_incref(id)
-        return id, size
-
-    def seen_chunk(self, hash):
-        return self.chunkmap.get(hash, 0) > 0
-
-    def chunk_incref(self, id):
-        sum = struct.unpack('I', id[:4])[0]
-        self.chunkmap.setdefault(id, 0)
-        self.summap.setdefault(sum, 0)
-        self.chunkmap[id] += 1
-        self.summap[sum] += 1
-
-    def chunk_decref(self, id):
-        sum = struct.unpack('I', id[:4])[0]
-        sumcount = self.summap[sum] - 1
-        count = self.chunkmap[id] - 1
-        assert sumcount >= 0
-        assert count >= 0
-        if sumcount:
-            self.summap[sum] = sumcount
-        else:
-            del self.summap[sum]
-        if count:
-            self.chunkmap[id] = count
-        else:
-            del self.chunkmap[id]
-            print 'deleting chunk: ', id.encode('hex')
-            self.store.delete(NS_CHUNKS, id)
-        return count
+from chunkifier import chunkify
+from cache import Cache
+from store import Store, NS_ARCHIVES, NS_CHUNKS, CHUNK_SIZE
 
 
 class Archiver(object):
@@ -135,7 +29,7 @@ class Archiver(object):
                     name = os.path.join(root, f)
                     items.append(self.process_file(name, self.cache))
         archive = {'name': archive_name, 'items': items}
-        hash = self.store.put(NS_ARCHIVES, archive_name, zlib.compress(cPickle.dumps(archive)))
+        self.store.put(NS_ARCHIVES, archive_name, zlib.compress(cPickle.dumps(archive)))
         self.store.commit()
         self.cache.archives.append(archive_name)
         self.cache.save()
@@ -214,7 +108,7 @@ class Archiver(object):
             origsize = 0
             compsize = 0
             chunks = []
-            for chunk in chunkify(fd, CHUNKSIZE, self.cache.summap):
+            for chunk in chunkify(fd, CHUNK_SIZE, self.cache.summap):
                 origsize += len(chunk)
                 id, size = cache.add_chunk(chunk)
                 compsize += size
@@ -260,9 +154,10 @@ class Archiver(object):
         else:
             self.create_archive(options.create_archive, args)
 
+
 def main():
     archiver = Archiver()
     archiver.run()
 
 if __name__ == '__main__':
-    main()
+    main()
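
For orientation, here is a minimal sketch of how an Archiver-style caller drives the relocated Cache. This is hypothetical driver code: Store's constructor signature does not appear in this diff, so the path argument is an assumption.

    from cache import Cache
    from store import Store

    store = Store('/tmp/example-store')   # assumption: Store opens a filesystem path
    cache = Cache(store)                  # loads the ~/.dedupestore cache, or rebuilds it
    id, size = cache.add_chunk('example chunk data')
    print 'stored chunk', id.encode('hex'), 'in', size, 'compressed bytes'
    cache.save()                          # persist chunkmap/summap/tid for the next run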

+ 112 - 0
dedupestore/cache.py

@@ -0,0 +1,112 @@
+import cPickle
+import hashlib
+import os
+import sys
+import struct
+import zlib
+
+from chunkifier import checksum
+from store import Store, NS_ARCHIVES, NS_CHUNKS
+
+
+class Cache(object):
+    """Client Side cache
+    """
+
+    def __init__(self, store):
+        self.store = store
+        self.path = os.path.join(os.path.expanduser('~'), '.dedupestore', 'cache', 
+                                 '%s.cache' % self.store.uuid)
+        self.tid = -1
+        self.open()
+        if self.tid != self.store.tid:
+            self.init()
+
+    def open(self):
+        if not os.path.exists(self.path):
+            return
+        print 'Loading cache: ', self.path, '...'
+        data = cPickle.loads(zlib.decompress(open(self.path, 'rb').read()))
+        if data['uuid'] != self.store.uuid:
+            print >> sys.stderr, 'Cache UUID mismatch'
+            return
+        self.chunkmap = data['chunkmap']
+        self.summap = data['summap']
+        self.archives = data['archives']
+        self.tid = data['tid']
+        print 'done'
+
+    def init(self):
+        """Initializes cache by fetching and reading all archive indicies
+        """
+        self.summap = {}
+        self.chunkmap = {}
+        self.archives = []
+        self.tid = self.store.tid
+        if self.store.tid == 0:
+            return
+        print 'Recreating cache...'
+        for id in self.store.list(NS_ARCHIVES):
+            archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, id)))
+            self.archives.append(archive['name'])
+            for item in archive['items']:
+                if item['type'] == 'FILE':
+                    for c in item['chunks']:
+                        self.chunk_incref(c)
+        print 'done'
+
+    def save(self):
+        assert self.store.state == Store.OPEN
+        print 'saving cache'
+        data = {'uuid': self.store.uuid, 
+                'chunkmap': self.chunkmap, 'summap': self.summap,
+                'tid': self.store.tid, 'archives': self.archives}
+        print 'Saving cache as:', self.path
+        cachedir = os.path.dirname(self.path)
+        if not os.path.exists(cachedir):
+            os.makedirs(cachedir)
+        with open(self.path, 'wb') as fd:
+            fd.write(zlib.compress(cPickle.dumps(data)))
+        print 'done'
+
+    def add_chunk(self, data):
+        sum = checksum(data)
+        data = zlib.compress(data)
+        #print 'chunk %d: %d' % (len(data), sum)
+        id = struct.pack('I', sum) + hashlib.sha1(data).digest()
+        if not self.seen_chunk(id):
+            size = len(data)
+            self.store.put(NS_CHUNKS, id, data)
+        else:
+            size = 0
+            #print 'seen chunk', hash.encode('hex')
+        self.chunk_incref(id)
+        return id, size
+
+    def seen_chunk(self, hash):
+        return self.chunkmap.get(hash, 0) > 0
+
+    def chunk_incref(self, id):
+        sum = struct.unpack('I', id[:4])[0]
+        self.chunkmap.setdefault(id, 0)
+        self.summap.setdefault(sum, 0)
+        self.chunkmap[id] += 1
+        self.summap[sum] += 1
+
+    def chunk_decref(self, id):
+        sum = struct.unpack('I', id[:4])[0]
+        sumcount = self.summap[sum] - 1
+        count = self.chunkmap[id] - 1
+        assert sumcount >= 0
+        assert count >= 0
+        if sumcount:
+            self.summap[sum] = sumcount
+        else:
+            del self.summap[sum]
+        if count:
+            self.chunkmap[id] = count
+        else:
+            del self.chunkmap[id]
+            print 'deleting chunk: ', id.encode('hex')
+            self.store.delete(NS_CHUNKS, id)
+        return count
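
As an aside, the chunk ID scheme in add_chunk can be reproduced standalone. A minimal sketch, with one assumption: chunkifier.checksum is not shown in this diff, so zlib.crc32 stands in for the 32-bit weak checksum here.

    import hashlib
    import struct
    import zlib

    def weak_checksum(data):
        # Assumed stand-in for chunkifier.checksum (a 32-bit weak checksum).
        return zlib.crc32(data) & 0xffffffff

    def chunk_id(data):
        # Mirrors Cache.add_chunk: 4 bytes of weak checksum over the raw data,
        # followed by the SHA-1 digest of the zlib-compressed data.
        compressed = zlib.compress(data)
        return struct.pack('I', weak_checksum(data)) + hashlib.sha1(compressed).digest()

    print chunk_id('hello world').encode('hex')

The 4-byte prefix is what summap counts per weak checksum; chunkify receives self.cache.summap, presumably so it can recognize already-stored data cheaply before any SHA-1 work.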

+ 8 - 6
dedupestore/store.py

@@ -1,16 +1,21 @@
 #!/usr/bin/env python
 import os
 import fcntl
-import hashlib
 import tempfile
 import shutil
 import unittest
 import uuid
 
 
+CHUNK_SIZE = 256 * 1024
+NS_ARCHIVES = 'ARCHIVES'
+NS_CHUNKS = 'CHUNKS'
+
+
 class Store(object):
     """
     """
+
     class DoesNotExist(KeyError):
         """"""
 
@@ -173,18 +178,15 @@ class Store(object):
                 raise Store.DoesNotExist('Object does not exist: %s' % hash.encode('hex'))
 
     def list(self, ns, prefix='', marker=None, max_keys=1000000):
-        for x in self.foo(os.path.join(self.path, 'data', ns.encode('hex')), 
+        for x in self._walker(os.path.join(self.path, 'data', ns.encode('hex')), 
                           prefix, marker, '', max_keys):
             yield x
         
-
-    def foo(self, path, prefix, marker, base, max_keys):
+    def _walker(self, path, prefix, marker, base, max_keys):
         n = 0
         for name in sorted(os.listdir(path)):
             if n >= max_keys:
                 return
-            dirs = []
-            names = []
             id = name.decode('hex')
             if os.path.isdir(os.path.join(path, name)):
                 if prefix and not id.startswith(prefix[:len(id)]):
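
Finally, a hypothetical illustration of the on-disk layout that list() and _walker traverse. The data/<hex(namespace)> prefix comes from the hunk above; the store root and nesting depth are assumptions.

    import os

    NS_CHUNKS = 'CHUNKS'
    store_path = '/tmp/example-store'    # assumed store root
    ns_dir = os.path.join(store_path, 'data', NS_CHUNKS.encode('hex'))
    print 'chunk objects live under:', ns_dir
    # -> /tmp/example-store/data/4348554e4b53
    # _walker hex-decodes each directory/file name back into the original id bytes.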