Przeglądaj źródła

Store more chunk metadata in the archive.

Jonas Borgström 15 lat temu
rodzic
commit
42ff0a850b
2 zmienione pliki: 82 dodania i 77 usunięć
  1. +53 -45 dedupestore/archiver.py
  2. +29 -32 dedupestore/cache.py

+ 53 - 45
dedupestore/archiver.py

@@ -15,15 +15,28 @@ class Archive(object):
     def __init__(self, store, name=None):
         self.store = store
         self.items = []
+        self.chunks = []
+        self.chunk_idx = {}
         if name:
             self.open(name)
 
+    def add_chunk(self, id, sum, csize, osize):
+        try:
+            return self.chunk_idx[id]
+        except KeyError:
+            idx = len(self.chunks)
+            self.chunks.append((id, sum, csize, osize))
+            self.chunk_idx[id] = idx
+            return idx
+        
     def open(self, name):
         archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, name)))
         self.items = archive['items']
+        for i, (id, sum, csize, osize) in enumerate(archive['chunks']):
+            self.chunk_idx[i] = id
 
     def save(self, name):
-        archive = {'name': name, 'items': self.items}
+        archive = {'name': name, 'items': self.items, 'chunks': self.chunks}
         self.store.put(NS_ARCHIVES, name, zlib.compress(cPickle.dumps(archive)))
         self.store.commit()
 
@@ -39,10 +52,14 @@ class Archive(object):
                 if not os.path.exists(item['path']):
                     os.makedirs(item['path'])
             if item['type'] == 'FILE':
+                path = item['path']
+                if not os.path.exists(os.path.dirname(path)):
+                    os.makedirs(os.path.dirname(path))
                 with open(item['path'], 'wb') as fd:
                     for chunk in item['chunks']:
-                        data = self.store.get(NS_CHUNKS, chunk)
-                        if hashlib.sha1(data).digest() != chunk[4:]:
+                        id = self.chunk_idx[chunk]
+                        data = self.store.get(NS_CHUNKS, id)
+                        if hashlib.sha1(data).digest() != id:
                             raise Exception('Invalid chunk checksum')
                         fd.write(zlib.decompress(data))
 
@@ -50,9 +67,11 @@ class Archive(object):
         for item in self.items:
             if item['type'] == 'FILE':
                 print item['path'], '...',
+                print self.chunk_idx[0].encode('hex')
                 for chunk in item['chunks']:
-                    data = self.store.get(NS_CHUNKS, chunk)
-                    if hashlib.sha1(data).digest() != chunk[4:]:
+                    id = self.chunk_idx[chunk]
+                    data = self.store.get(NS_CHUNKS, id)
+                    if hashlib.sha1(data).digest() != id:
                         print 'ERROR'
                         break
                 else:
@@ -68,28 +87,39 @@ class Archive(object):
         cache.archives.remove(self.name)
         cache.save()
 
-
-class Archiver(object):
-
-    def create_archive(self, archive_name, paths):
-        try:
-            self.store.get(NS_ARCHIVES, archive_name)
-        except Store.DoesNotExist:
-            pass
-        else:
-            raise Exception('Archive "%s" already exists' % archive_name)
-        archive = Archive(self.store)
+    def create(self, name, paths, cache):
         for path in paths:
             for root, dirs, files in os.walk(path):
                 for d in dirs:
-                    name = os.path.join(root, d)
-                    archive.items.append(self.process_dir(name, self.cache))
+                    p = os.path.join(root, d)
+                    self.items.append(self.process_dir(p, cache))
                 for f in files:
-                    name = os.path.join(root, f)
-                    archive.items.append(self.process_file(name, self.cache))
-        archive.save(archive_name)
-        self.cache.archives.append(archive_name)
-        self.cache.save()
+                    p = os.path.join(root, f)
+                    self.items.append(self.process_file(p, cache))
+        self.save(name)
+        cache.archives.append(name)
+        cache.save()
+
+    def process_dir(self, path, cache):
+        path = path.lstrip('/\\:')
+        print 'Directory: %s' % (path)
+        return {'type': 'DIR', 'path': path}
+
+    def process_file(self, path, cache):
+        with open(path, 'rb') as fd:
+            path = path.lstrip('/\\:')
+            print 'Adding: %s...' % path
+            chunks = []
+            for chunk in chunkify(fd, CHUNK_SIZE, cache.summap):
+                chunks.append(self.add_chunk(*cache.add_chunk(chunk)))
+        return {'type': 'FILE', 'path': path, 'chunks': chunks}
+
+
+class Archiver(object):
+
+    def create_archive(self, name, paths):
+        archive = Archive(self.store)
+        archive.create(name, paths, self.cache)
 
     def delete_archive(self, archive_name):
         archive = Archive(self.store, archive_name)
@@ -112,28 +142,6 @@ class Archiver(object):
         archive = Archive(self.store, archive_name)
         archive.extract()
 
-    def process_dir(self, path, cache):
-        path = path.lstrip('/\\:')
-        print 'Directory: %s' % (path)
-        return {'type': 'DIR', 'path': path}
-
-    def process_file(self, path, cache):
-        with open(path, 'rb') as fd:
-            path = path.lstrip('/\\:')
-            print 'Adding: %s...' % path,
-            sys.stdout.flush()
-            origsize = 0
-            compsize = 0
-            chunks = []
-            for chunk in chunkify(fd, CHUNK_SIZE, self.cache.summap):
-                origsize += len(chunk)
-                id, size = cache.add_chunk(chunk)
-                compsize += size
-                chunks.append(id)
-        ratio = origsize and compsize * 100 / origsize or 0
-        print '(%d chunks: %d%%)' % (len(chunks), ratio)
-        return {'type': 'FILE', 'path': path, 'size': origsize, 'chunks': chunks}
-
     def run(self):
         parser = OptionParser()
         parser.add_option("-s", "--store", dest="store",

+ 29 - 32
dedupestore/cache.py

@@ -2,7 +2,6 @@ import cPickle
 import hashlib
 import os
 import sys
-import struct
 import zlib
 
 from chunkifier import checksum
@@ -49,10 +48,11 @@ class Cache(object):
         for id in self.store.list(NS_ARCHIVES):
             archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, id)))
             self.archives.append(archive['name'])
-            for item in archive['items']:
-                if item['type'] == 'FILE':
-                    for c in item['chunks']:
-                        self.chunk_incref(c)
+            for id, sum, csize, osize in archive['chunks']:
+                if self.seen_chunk(id):
+                    self.chunk_incref(id)
+                else:
+                    self.init_chunk(id, sum, csize, osize)
         print 'done'
 
     def save(self):
@@ -71,42 +71,39 @@ class Cache(object):
 
     def add_chunk(self, data):
         sum = checksum(data)
+        osize = len(data)
         data = zlib.compress(data)
-        #print 'chunk %d: %d' % (len(data), sum)
-        id = struct.pack('I', sum) + hashlib.sha1(data).digest()
-        if not self.seen_chunk(id):
-            size = len(data)
-            self.store.put(NS_CHUNKS, id, data)
-        else:
-            size = 0
-            #print 'seen chunk', hash.encode('hex')
-        self.chunk_incref(id)
-        return id, size
+        id = hashlib.sha1(data).digest()
+        if self.seen_chunk(id):
+            return self.chunk_incref(id)
+        csize = len(data)
+        self.store.put(NS_CHUNKS, id, data)
+        return self.init_chunk(id, sum, csize, osize)
+
+    def init_chunk(self, id, sum, csize, osize):
+        self.chunkmap[id] = (1, sum, osize, csize)
+        self.summap.setdefault(sum, 1)
+        return id, sum, csize, osize
 
-    def seen_chunk(self, hash):
-        return self.chunkmap.get(hash, 0) > 0
+    def seen_chunk(self, id):
+        return id in self.chunkmap
 
     def chunk_incref(self, id):
-        sum = struct.unpack('I', id[:4])[0]
-        self.chunkmap.setdefault(id, 0)
-        self.summap.setdefault(sum, 0)
-        self.chunkmap[id] += 1
+        count, sum, csize, osize = self.chunkmap[id]
+        self.chunkmap[id] = (count + 1, sum, osize, csize)
         self.summap[sum] += 1
+        return id, sum, csize, osize
 
     def chunk_decref(self, id):
-        sum = struct.unpack('I', id[:4])[0]
-        sumcount = self.summap[sum] - 1
-        count = self.chunkmap[id] - 1
-        assert sumcount >= 0
-        assert count >= 0
-        if sumcount:
-            self.summap[sum] = sumcount
-        else:
+        count, sum, csize, osize = self.chunkmap[id]
+        sumcount = self.summap[sum] 
+        if sumcount == 1:
             del self.summap[sum]
-        if count:
-            self.chunkmap[id] = count
         else:
+            self.summap[sum] = sumcount - 1
+        if count == 1:
             del self.chunkmap[id]
             print 'deleting chunk: ', id.encode('hex')
             self.store.delete(NS_CHUNKS, id)
-        return count
+        else:
+            self.chunkmap[id] = (count - 1, sum, csize, osize)