
Various code refactoring: the client-side Cache class moves from archiver.py into a new cache.py module, the CHUNK_SIZE/NS_ARCHIVES/NS_CHUNKS constants move into store.py (with the chunk size raised from 64 KiB to 256 KiB), and Store.foo is renamed to the more descriptive Store._walker.

Jonas Borgström, 15 years ago
parent
commit
723c636f06
4 changed files with 128 additions and 118 deletions
  1. dedupestore/__init__.py (+1 -0)
  2. dedupestore/archiver.py (+7 -112)
  3. dedupestore/cache.py (+112 -0)
  4. dedupestore/store.py (+8 -6)

+ 1 - 0
dedupestore/__init__.py

@@ -0,0 +1 @@
+# This is a python package

+ 7 - 112
dedupestore/archiver.py

@@ -2,118 +2,12 @@ import os
 import sys
 import hashlib
 import zlib
-import struct
 import cPickle
 from optparse import OptionParser
 
-from chunkifier import chunkify, checksum
-from store import Store
-
-
-CHUNKSIZE = 64 * 1024
-NS_ARCHIVES = 'ARCHIVES'
-NS_CHUNKS  = 'CHUNKS'
-
-class Cache(object):
-    """Client Side cache
-    """
-    def __init__(self, store):
-        self.store = store
-        self.path = os.path.join(os.path.expanduser('~'), '.dedupestore', 'cache', 
-                                 '%s.cache' % self.store.uuid)
-        self.tid = -1
-        self.open()
-        if self.tid != self.store.tid:
-            self.init()
-
-    def open(self):
-        if not os.path.exists(self.path):
-            return
-        print 'Loading cache: ', self.path, '...'
-        data = cPickle.loads(zlib.decompress(open(self.path, 'rb').read()))
-        if data['uuid'] != self.store.uuid:
-            print >> sys.stderr, 'Cache UUID mismatch'
-            return
-        self.chunkmap = data['chunkmap']
-        self.summap = data['summap']
-        self.archives = data['archives']
-        self.tid = data['tid']
-        print 'done'
-
-    def init(self):
-        """Initializes cache by fetching and reading all archive indicies
-        """
-        self.summap = {}
-        self.chunkmap = {}
-        self.archives = []
-        self.tid = self.store.tid
-        if self.store.tid == 0:
-            return
-        print 'Recreating cache...'
-        for id in self.store.list(NS_ARCHIVES):
-            archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, id)))
-            self.archives.append(archive['name'])
-            for item in archive['items']:
-                if item['type'] == 'FILE':
-                    for c in item['chunks']:
-                        self.chunk_incref(c)
-        print 'done'
-
-    def save(self):
-        assert self.store.state == Store.OPEN
-        print 'saving cache'
-        data = {'uuid': self.store.uuid, 
-                'chunkmap': self.chunkmap, 'summap': self.summap,
-                'tid': self.store.tid, 'archives': self.archives}
-        print 'Saving cache as:', self.path
-        cachedir = os.path.dirname(self.path)
-        if not os.path.exists(cachedir):
-            os.makedirs(cachedir)
-        with open(self.path, 'wb') as fd:
-            fd.write(zlib.compress(cPickle.dumps(data)))
-        print 'done'
-
-    def add_chunk(self, data):
-        sum = checksum(data)
-        data = zlib.compress(data)
-        #print 'chunk %d: %d' % (len(data), sum)
-        id = struct.pack('I', sum) + hashlib.sha1(data).digest()
-        if not self.seen_chunk(id):
-            size = len(data)
-            self.store.put(NS_CHUNKS, id, data)
-        else:
-            size = 0
-            #print 'seen chunk', hash.encode('hex')
-        self.chunk_incref(id)
-        return id, size
-
-    def seen_chunk(self, hash):
-        return self.chunkmap.get(hash, 0) > 0
-
-    def chunk_incref(self, id):
-        sum = struct.unpack('I', id[:4])[0]
-        self.chunkmap.setdefault(id, 0)
-        self.summap.setdefault(sum, 0)
-        self.chunkmap[id] += 1
-        self.summap[sum] += 1
-
-    def chunk_decref(self, id):
-        sum = struct.unpack('I', id[:4])[0]
-        sumcount = self.summap[sum] - 1
-        count = self.chunkmap[id] - 1
-        assert sumcount >= 0
-        assert count >= 0
-        if sumcount:
-            self.summap[sum] = sumcount
-        else:
-            del self.summap[sum]
-        if count:
-            self.chunkmap[id] = count
-        else:
-            del self.chunkmap[id]
-            print 'deleting chunk: ', id.encode('hex')
-            self.store.delete(NS_CHUNKS, id)
-        return count
+from chunkifier import chunkify
+from cache import Cache
+from store import Store, NS_ARCHIVES, NS_CHUNKS, CHUNK_SIZE
 
 
 class Archiver(object):
@@ -135,7 +29,7 @@ class Archiver(object):
                     name = os.path.join(root, f)
                     items.append(self.process_file(name, self.cache))
         archive = {'name': archive_name, 'items': items}
-        hash = self.store.put(NS_ARCHIVES, archive_name, zlib.compress(cPickle.dumps(archive)))
+        self.store.put(NS_ARCHIVES, archive_name, zlib.compress(cPickle.dumps(archive)))
         self.store.commit()
         self.cache.archives.append(archive_name)
         self.cache.save()
@@ -214,7 +108,7 @@ class Archiver(object):
             origsize = 0
             compsize = 0
             chunks = []
-            for chunk in chunkify(fd, CHUNKSIZE, self.cache.summap):
+            for chunk in chunkify(fd, CHUNK_SIZE, self.cache.summap):
                 origsize += len(chunk)
                 id, size = cache.add_chunk(chunk)
                 compsize += size
@@ -260,9 +154,10 @@ class Archiver(object):
         else:
             self.create_archive(options.create_archive, args)
 
+
 def main():
     archiver = Archiver()
     archiver.run()
 
 if __name__ == '__main__':
-    main()
+    main()
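
For orientation, here is a minimal sketch of how an Archiver-style caller drives the relocated Cache. This is hypothetical driver code: Store's constructor signature does not appear in this diff, so the path argument is an assumption.

    from cache import Cache
    from store import Store

    store = Store('/tmp/example-store')   # assumption: Store opens a filesystem path
    cache = Cache(store)                  # loads the ~/.dedupestore cache, or rebuilds it
    id, size = cache.add_chunk('example chunk data')
    print 'stored chunk', id.encode('hex'), 'in', size, 'compressed bytes'
    cache.save()                          # persist chunkmap/summap/tid for the next run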

+ 112 - 0
dedupestore/cache.py

@@ -0,0 +1,112 @@
+import cPickle
+import hashlib
+import os
+import sys
+import struct
+import zlib
+
+from chunkifier import checksum
+from store import Store, NS_ARCHIVES, NS_CHUNKS
+
+
+class Cache(object):
+    """Client Side cache
+    """
+
+    def __init__(self, store):
+        self.store = store
+        self.path = os.path.join(os.path.expanduser('~'), '.dedupestore', 'cache', 
+                                 '%s.cache' % self.store.uuid)
+        self.tid = -1
+        self.open()
+        if self.tid != self.store.tid:
+            self.init()
+
+    def open(self):
+        if not os.path.exists(self.path):
+            return
+        print 'Loading cache: ', self.path, '...'
+        data = cPickle.loads(zlib.decompress(open(self.path, 'rb').read()))
+        if data['uuid'] != self.store.uuid:
+            print >> sys.stderr, 'Cache UUID mismatch'
+            return
+        self.chunkmap = data['chunkmap']
+        self.summap = data['summap']
+        self.archives = data['archives']
+        self.tid = data['tid']
+        print 'done'
+
+    def init(self):
+        """Initializes cache by fetching and reading all archive indicies
+        """
+        self.summap = {}
+        self.chunkmap = {}
+        self.archives = []
+        self.tid = self.store.tid
+        if self.store.tid == 0:
+            return
+        print 'Recreating cache...'
+        for id in self.store.list(NS_ARCHIVES):
+            archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, id)))
+            self.archives.append(archive['name'])
+            for item in archive['items']:
+                if item['type'] == 'FILE':
+                    for c in item['chunks']:
+                        self.chunk_incref(c)
+        print 'done'
+
+    def save(self):
+        assert self.store.state == Store.OPEN
+        print 'saving cache'
+        data = {'uuid': self.store.uuid, 
+                'chunkmap': self.chunkmap, 'summap': self.summap,
+                'tid': self.store.tid, 'archives': self.archives}
+        print 'Saving cache as:', self.path
+        cachedir = os.path.dirname(self.path)
+        if not os.path.exists(cachedir):
+            os.makedirs(cachedir)
+        with open(self.path, 'wb') as fd:
+            fd.write(zlib.compress(cPickle.dumps(data)))
+        print 'done'
+
+    def add_chunk(self, data):
+        sum = checksum(data)
+        data = zlib.compress(data)
+        #print 'chunk %d: %d' % (len(data), sum)
+        id = struct.pack('I', sum) + hashlib.sha1(data).digest()
+        if not self.seen_chunk(id):
+            size = len(data)
+            self.store.put(NS_CHUNKS, id, data)
+        else:
+            size = 0
+            #print 'seen chunk', hash.encode('hex')
+        self.chunk_incref(id)
+        return id, size
+
+    def seen_chunk(self, hash):
+        return self.chunkmap.get(hash, 0) > 0
+
+    def chunk_incref(self, id):
+        sum = struct.unpack('I', id[:4])[0]
+        self.chunkmap.setdefault(id, 0)
+        self.summap.setdefault(sum, 0)
+        self.chunkmap[id] += 1
+        self.summap[sum] += 1
+
+    def chunk_decref(self, id):
+        sum = struct.unpack('I', id[:4])[0]
+        sumcount = self.summap[sum] - 1
+        count = self.chunkmap[id] - 1
+        assert sumcount >= 0
+        assert count >= 0
+        if sumcount:
+            self.summap[sum] = sumcount
+        else:
+            del self.summap[sum]
+        if count:
+            self.chunkmap[id] = count
+        else:
+            del self.chunkmap[id]
+            print 'deleting chunk: ', id.encode('hex')
+            self.store.delete(NS_CHUNKS, id)
+        return count
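
As an aside, the chunk ID scheme in add_chunk can be reproduced standalone. A minimal sketch, with one assumption: chunkifier.checksum is not shown in this diff, so zlib.crc32 stands in for the 32-bit weak checksum here.

    import hashlib
    import struct
    import zlib

    def weak_checksum(data):
        # Assumed stand-in for chunkifier.checksum (a 32-bit weak checksum).
        return zlib.crc32(data) & 0xffffffff

    def chunk_id(data):
        # Mirrors Cache.add_chunk: 4 bytes of weak checksum over the raw data,
        # followed by the SHA-1 digest of the zlib-compressed data.
        compressed = zlib.compress(data)
        return struct.pack('I', weak_checksum(data)) + hashlib.sha1(compressed).digest()

    print chunk_id('hello world').encode('hex')

The 4-byte prefix is what summap counts per weak checksum; chunkify receives self.cache.summap, presumably so it can recognize already-stored data cheaply before any SHA-1 work.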

+ 8 - 6
dedupestore/store.py

@@ -1,16 +1,21 @@
 #!/usr/bin/env python
 import os
 import fcntl
-import hashlib
 import tempfile
 import shutil
 import unittest
 import uuid
 
 
+CHUNK_SIZE = 256 * 1024
+NS_ARCHIVES = 'ARCHIVES'
+NS_CHUNKS = 'CHUNKS'
+
+
 class Store(object):
     """
     """
+
     class DoesNotExist(KeyError):
         """"""
 
@@ -173,18 +178,15 @@ class Store(object):
                 raise Store.DoesNotExist('Object does not exist: %s' % hash.encode('hex'))
 
     def list(self, ns, prefix='', marker=None, max_keys=1000000):
-        for x in self.foo(os.path.join(self.path, 'data', ns.encode('hex')), 
+        for x in self._walker(os.path.join(self.path, 'data', ns.encode('hex')), 
                           prefix, marker, '', max_keys):
             yield x
         
-
-    def foo(self, path, prefix, marker, base, max_keys):
+    def _walker(self, path, prefix, marker, base, max_keys):
         n = 0
         for name in sorted(os.listdir(path)):
             if n >= max_keys:
                 return
-            dirs = []
-            names = []
             id = name.decode('hex')
             if os.path.isdir(os.path.join(path, name)):
                 if prefix and not id.startswith(prefix[:len(id)]):
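
Finally, a hypothetical illustration of the on-disk layout that list() and _walker traverse. The data/<hex(namespace)> prefix comes from the hunk above; the store root and nesting depth are assumptions.

    import os

    NS_CHUNKS = 'CHUNKS'
    store_path = '/tmp/example-store'    # assumed store root
    ns_dir = os.path.join(store_path, 'data', NS_CHUNKS.encode('hex'))
    print 'chunk objects live under:', ns_dir
    # -> /tmp/example-store/data/4348554e4b53
    # _walker hex-decodes each directory/file name back into the original id bytes.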