Browse source

Some chunker fixes.

Jonas Borgström 15 years ago
parent
commit
0cca830981
2 changed files with 19 additions and 11 deletions
  1. dedupstore/archiver.py (+16 -7)
  2. dedupstore/chunker.py (+3 -4)

dedupstore/archiver.py (+16 -7)

@@ -73,11 +73,14 @@ class Cache(object):
         #print 'chunk %d: %d' % (len(data), sum)
         hash = struct.pack('I', sum) + hashlib.sha1(data).digest()
         if not self.seen_chunk(hash):
-            self.store.put(NS_CHUNKS, hash, zlib.compress(data))
+            zdata = zlib.compress(data)
+            size = len(zdata)
+            self.store.put(NS_CHUNKS, hash, zdata)
         else:
-            print 'seen chunk', hash.encode('hex')
+            size = 0
+            #print 'seen chunk', hash.encode('hex')
         self.chunk_incref(hash)
-        return hash
+        return hash, size
 
     def seen_chunk(self, hash):
         return self.chunkmap.get(hash, 0) > 0
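
For context, a minimal sketch (Python 3 syntax; the project itself is Python 2) of the new add_chunk contract: it now returns a (hash, size) pair, where size is the number of compressed bytes actually written, or 0 when the chunk was already present. TinyCache and its in-memory dicts are hypothetical stand-ins for the real Cache and store, not the project's API:

import hashlib
import struct
import zlib

class TinyCache:
    """Hypothetical stand-in for Cache; everything lives in plain dicts."""

    def __init__(self):
        self.chunkmap = {}   # hash -> reference count
        self.chunks = {}     # hash -> compressed payload (stand-in for store.put)

    def add_chunk(self, data, sum=0):
        # Same key layout as the patch: 32-bit rolling checksum + SHA-1 of the data.
        hash = struct.pack('I', sum) + hashlib.sha1(data).digest()
        if self.chunkmap.get(hash, 0) == 0:
            zdata = zlib.compress(data)
            size = len(zdata)            # bytes actually written
            self.chunks[hash] = zdata
        else:
            size = 0                     # duplicate chunk: nothing new written
        self.chunkmap[hash] = self.chunkmap.get(hash, 0) + 1
        return hash, size

Returning the written size from the lowest layer is what lets process_file report a compression figure without querying the store a second time.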
@@ -189,14 +192,20 @@ class Archiver(object):
         return {'type': 'DIR', 'path': path}
 
     def process_file(self, path, cache):
+        print 'Adding: %s...' % path,
+        sys.stdout.flush()
         with open(path, 'rb') as fd:
-            size = 0
+            origsize = 0
+            compsize = 0
             chunks = []
             for chunk in chunker(fd, CHUNKSIZE, self.cache.summap):
-                size += len(chunk)
-                chunks.append(cache.add_chunk(chunk))
+                origsize += len(chunk)
+                id, size = cache.add_chunk(chunk)
+                compsize += size
+                chunks.append(id)
         path = path.lstrip('/\\:')
-        print 'File: %s (%d chunks)' % (path, len(chunks))
+        ratio = origsize and compsize * 100 / origsize or 0
+        print '(%d chunks: %d%%)' % (len(chunks), ratio)
         return {'type': 'FILE', 'path': path, 'size': size, 'chunks': chunks}
 
     def run(self):
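
A sketch of the caller side, mirroring the new accounting in process_file: origsize counts raw bytes, compsize counts bytes actually written, and the Python 2 idiom `origsize and compsize * 100 / origsize or 0` is just an integer percentage guarded against empty files. The ingest helper and the plain fd.read loop below are illustrative stand-ins for chunker(), and the example reuses the hypothetical TinyCache from the previous sketch:

import io

def ingest(fd, cache, chunk_size=4096):
    # Accumulate raw vs. written bytes, as the patched process_file does.
    origsize = 0
    compsize = 0
    chunks = []
    for chunk in iter(lambda: fd.read(chunk_size), b''):  # stand-in for chunker()
        origsize += len(chunk)
        chunk_id, size = cache.add_chunk(chunk)
        compsize += size
        chunks.append(chunk_id)
    ratio = compsize * 100 // origsize if origsize else 0  # 0 for an empty file
    return chunks, ratio

cache = TinyCache()  # hypothetical cache from the sketch above
chunks, ratio = ingest(io.BytesIO(b'hello world' * 1000), cache)
print('%d chunks: %d%%' % (len(chunks), ratio))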

dedupstore/chunker.py (+3 -4)

@@ -49,11 +49,11 @@ def chunker(fd, chunk_size, chunks):
     >>> list(chunker(fd, 4, chunks))
     ['ABCD', 'EFGH', 'IJ', 'KLMN']
     """
-    data = 'X' + fd.read(chunk_size * 2)
+    data = 'X' + fd.read(chunk_size * 3)
     i = 1
     sum = checksum(data[:chunk_size])
     while True:
-        if len(data) - i - 2 <= chunk_size:
+        if len(data) - i <= chunk_size * 2:
             data += fd.read(chunk_size * 2)
         if i == chunk_size + 1:
             yield data[1:chunk_size + 1]
@@ -62,14 +62,13 @@ def chunker(fd, chunk_size, chunks):
         if len(data) - i <= chunk_size:  # EOF?
             if len(data) > chunk_size + 1:
                 yield data[1:len(data) - chunk_size]
-                yield data[-chunk_size:]
+                yield data[:chunk_size]
             else:
                 yield data[1:]
             return
         sum = roll_checksum(sum, data[i - 1], data[i - 1 + chunk_size], chunk_size)
         #print data[i:i + chunk_size], sum
         if chunks.get(sum):
-            print 'Woot', i
             if i > 1:
                 yield data[1:i]
             yield data[i:i + chunk_size]
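
On the chunker.py side, the patch enlarges the initial read to chunk_size * 3 and tops up the buffer earlier, as soon as fewer than 2 * chunk_size bytes remain past the current offset, so the end-of-file test and the rolling window always see a freshly refilled buffer rather than one that is about to run dry. A toy illustration of that refill pattern, using fixed-size pieces instead of the rolling-checksum cut logic (fixed_chunker is hypothetical, not the project's chunker):

import io

def fixed_chunker(fd, chunk_size):
    # Yield successive chunk_size pieces, topping up the buffer lazily.
    data = fd.read(chunk_size * 3)            # generous initial read, as in the patch
    i = 0
    while True:
        if len(data) - i <= chunk_size * 2:   # low on look-ahead: refill
            data = data[i:] + fd.read(chunk_size * 2)
            i = 0
        if i >= len(data):
            return
        yield data[i:i + chunk_size]
        i += chunk_size

assert list(fixed_chunker(io.BytesIO(b'ABCDEFGHIJ'), 4)) == [b'ABCD', b'EFGH', b'IJ']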