
Working variable length file chunking.

Jonas Borgström, 15 years ago
parent
commit 9f955bf9fb
2 changed files with 17 additions and 6 deletions
  1. dedupstore/archiver.py  +15 -5
  2. dedupstore/chunker.py  +2 -1
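
This commit gives each stored chunk an id whose first four bytes are the chunk's 32-bit rolling checksum, followed by the SHA-1 digest of its data, and adds a second reference-count map (summap) keyed by that checksum alone, so the chunker can recognize sums of chunks that are already stored. A minimal Python 3 sketch of the id layout (the repository code is Python 2, and rolling_checksum here is only a stand-in for the chunker's checksum(), whose body is not part of this diff):

import hashlib
import struct
import zlib

def make_chunk_id(data, rolling_checksum):
    # Chunk id: packed 32-bit checksum (native 'I', 4 bytes on common
    # platforms) followed by the 20-byte SHA-1 digest of the chunk data.
    csum = rolling_checksum(data) & 0xFFFFFFFF
    return struct.pack('I', csum) + hashlib.sha1(data).digest()

# zlib.adler32 stands in for the real weak checksum, purely for illustration.
chunk_id = make_chunk_id(b'some data', zlib.adler32)
recovered_sum = struct.unpack('I', chunk_id[:4])[0]

Packing the checksum in front means it can be recovered from the id alone later, which is exactly what chunk_incref and chunk_decref do below with struct.unpack('I', hash[:4]).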

dedupstore/archiver.py  +15 -5

@@ -2,6 +2,7 @@ import os
 import sys
 import hashlib
 import zlib
+import struct
 import cPickle
 from optparse import OptionParser
 
@@ -34,11 +35,13 @@ class Cache(object):
         print 'Loading cache: ', filename, '...'
         data = cPickle.loads(zlib.decompress(open(filename, 'rb').read()))
         self.chunkmap = data['chunkmap']
+        self.summap = data['summap']
         self.archives = data['archives']
         self.tid = data['tid']
         print 'done'
 
     def create(self):
+        self.summap = {}
         self.chunkmap = {}
         self.archives = []
         self.tid = self.store.tid
@@ -57,7 +60,8 @@ class Cache(object):
     def save(self):
         assert self.store.state == Store.OPEN
         print 'saving cache'
-        data = {'chunkmap': self.chunkmap, 'tid': self.store.tid, 'archives': self.archives}
+        data = {'chunkmap': self.chunkmap, 'summap': self.summap,
+                'tid': self.store.tid, 'archives': self.archives}
         filename = os.path.join(self.path, '%s.cache' % self.store.uuid)
         print 'Saving cache as:', filename
         with open(filename, 'wb') as fd:
@@ -65,9 +69,11 @@ class Cache(object):
         print 'done'
 
     def add_chunk(self, data):
-        hash = hashlib.sha1(data).digest()
+        sum = checksum(data)
+        #print 'chunk %d: %d' % (len(data), sum)
+        hash = struct.pack('I', sum) + hashlib.sha1(data).digest()
         if not self.seen_chunk(hash):
-            self.store.put(NS_CHUNKS, hash, data)
+            self.store.put(NS_CHUNKS, hash, zlib.compress(data))
         else:
             print 'seen chunk', hash.encode('hex')
         self.chunk_incref(hash)
@@ -77,10 +83,14 @@ class Cache(object):
         return self.chunkmap.get(hash, 0) > 0
 
     def chunk_incref(self, hash):
+        sum = struct.unpack('I', hash[:4])[0]
         self.chunkmap.setdefault(hash, 0)
+        self.summap.setdefault(sum, 0)
         self.chunkmap[hash] += 1
+        self.summap[sum] += 1
 
     def chunk_decref(self, hash):
+        self.summap[struct.unpack('I', hash[:4])[0]] -= 1
         count = self.chunkmap.get(hash, 0) - 1
         assert count >= 0
         self.chunkmap[hash] = count
@@ -182,9 +192,9 @@ class Archiver(object):
         with open(path, 'rb') as fd:
             size = 0
             chunks = []
-            for chunk in chunker(fd, CHUNKSIZE, {}):
+            for chunk in chunker(fd, CHUNKSIZE, self.cache.summap):
                 size += len(chunk)
-                chunks.append(cache.add_chunk(zlib.compress(chunk)))
+                chunks.append(cache.add_chunk(chunk))
         path = path.lstrip('/\\:')
         print 'File: %s (%d chunks)' % (path, len(chunks))
         return {'type': 'FILE', 'path': path, 'size': size, 'chunks': chunks}
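
Cache now keeps two reference counts per chunk: chunkmap, keyed by the full id, and summap, keyed by the 32-bit checksum alone. summap is what process_file hands to the chunker, so a checksum that is already referenced can trigger a chunk boundary. A stripped-down sketch of that bookkeeping (hypothetical class, Python 3; the real Cache also persists both maps with cPickle and writes zlib-compressed chunk data into the store):

import struct

class RefCounts:
    def __init__(self):
        self.chunkmap = {}  # full chunk id (checksum + SHA-1) -> refcount
        self.summap = {}    # 32-bit rolling checksum alone    -> refcount

    def incref(self, chunk_id):
        csum = struct.unpack('I', chunk_id[:4])[0]
        self.chunkmap[chunk_id] = self.chunkmap.get(chunk_id, 0) + 1
        self.summap[csum] = self.summap.get(csum, 0) + 1

    def decref(self, chunk_id):
        csum = struct.unpack('I', chunk_id[:4])[0]
        self.summap[csum] -= 1
        count = self.chunkmap.get(chunk_id, 0) - 1
        assert count >= 0
        self.chunkmap[chunk_id] = count

Note that compression also moved from process_file into add_chunk: the SHA-1 and the dedup check are now computed over the uncompressed chunk, and only store.put receives zlib-compressed data.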

dedupstore/chunker.py  +2 -1

@@ -68,7 +68,8 @@ def chunker(fd, chunk_size, chunks):
             return
         sum = roll_checksum(sum, data[i - 1], data[i - 1 + chunk_size], chunk_size)
         #print data[i:i + chunk_size], sum
-        if sum in chunks:
+        if chunks.get(sum):
+            print 'Woot', i
             if i > 1:
                 yield data[1:i]
             yield data[i:i + chunk_size]
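
For context, here is a self-contained sketch of the kind of rolling-checksum chunker this loop belongs to. The bodies of checksum() and roll_checksum() are outside this diff, so an rsync-style weak checksum is used purely as an illustration; the control flow mirrors the hunk above: slide a fixed-size window over the buffer and, whenever the window's checksum appears in the map of known sums, emit the unmatched prefix and then the window itself as chunks (Python 3):

def weak_checksum(data):
    # rsync-style weak checksum over one window (illustrative stand-in).
    a = sum(data) & 0xFFFF
    b = sum((len(data) - i) * c for i, c in enumerate(data)) & 0xFFFF
    return (b << 16) | a

def roll(csum, removed, added, window):
    # Slide the window one byte: drop `removed`, take in `added`.
    a = csum & 0xFFFF
    b = (csum >> 16) & 0xFFFF
    a = (a - removed + added) & 0xFFFF
    b = (b - window * removed + a) & 0xFFFF
    return (b << 16) | a

def chunkify(data, window, known_sums):
    # Yield variable-length chunks; cut whenever the rolling checksum of the
    # current window is found in known_sums (the cache's summap).
    if len(data) <= window:
        yield data
        return
    i = start = 0
    csum = weak_checksum(data[:window])
    while i + window <= len(data):
        if known_sums.get(csum):
            if i > start:
                yield data[start:i]        # unmatched bytes before the hit
            yield data[i:i + window]       # window with a recognized checksum
            start = i = i + window
            if i + window > len(data):
                break
            csum = weak_checksum(data[i:i + window])
            continue
        if i + window == len(data):
            break
        csum = roll(csum, data[i], data[i + window], window)
        i += 1
    if start < len(data):
        yield data[start:]                 # trailing remainder

A checksum hit only decides where to cut; deduplication itself still goes through add_chunk, which keys on the checksum plus the SHA-1, so a colliding sum at worst produces an extra boundary, never a wrong match.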