| 
														
															@@ -2,10 +2,14 @@ import os 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import hashlib 
														 | 
														
														 | 
														
															 import hashlib 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import logging 
														 | 
														
														 | 
														
															 import logging 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import zlib 
														 | 
														
														 | 
														
															 import zlib 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-import cPickle 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import argparse 
														 | 
														
														 | 
														
															 import argparse 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import sys 
														 | 
														
														 | 
														
															 import sys 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+from cStringIO import StringIO 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+from datetime import datetime 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+from avro import io 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+from dedupestore import archive_schema 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from chunkifier import chunkify 
														 | 
														
														 | 
														
															 from chunkifier import chunkify 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from cache import Cache, NS_ARCHIVES, NS_CHUNKS 
														 | 
														
														 | 
														
															 from cache import Cache, NS_ARCHIVES, NS_CHUNKS 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from bandstore import BandStore 
														 | 
														
														 | 
														
															 from bandstore import BandStore 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -41,26 +45,39 @@ class Archive(object): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         data = self.store.get(NS_ARCHIVES, id) 
														 | 
														
														 | 
														
															         data = self.store.get(NS_ARCHIVES, id) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         if hashlib.sha256(data).digest() != id: 
														 | 
														
														 | 
														
															         if hashlib.sha256(data).digest() != id: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             raise Exception('Archive hash did not match') 
														 | 
														
														 | 
														
															             raise Exception('Archive hash did not match') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        archive = cPickle.loads(zlib.decompress(data)) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        buffer = StringIO(zlib.decompress(data)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        reader = io.DatumReader(archive_schema) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        decoder = io.BinaryDecoder(buffer) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        archive = reader.read(decoder) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         self.items = archive['items'] 
														 | 
														
														 | 
														
															         self.items = archive['items'] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         self.name = archive['name'] 
														 | 
														
														 | 
														
															         self.name = archive['name'] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         self.chunks = archive['chunks'] 
														 | 
														
														 | 
														
															         self.chunks = archive['chunks'] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        for i, (id, csize, osize) in enumerate(archive['chunks']): 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            self.chunk_idx[i] = id 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        for i, chunk in enumerate(archive['chunks']): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            self.chunk_idx[i] = chunk['id'] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     def save(self, name): 
														 | 
														
														 | 
														
															     def save(self, name): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        archive = {'name': name, 'items': self.items, 'chunks': self.chunks} 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        data = zlib.compress(cPickle.dumps(archive)) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        archive = { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            'name': name, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            'ts': datetime.utcnow().isoformat(), 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            'items': self.items, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            'chunks': self.chunks 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        } 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        writer = StringIO() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        encoder = io.BinaryEncoder(writer) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        datum_writer = io.DatumWriter(archive_schema) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        datum_writer.write(archive, encoder) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        data = zlib.compress(writer.getvalue()) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        print 'archive size: %d' % len(data) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         self.id = hashlib.sha256(data).digest() 
														 | 
														
														 | 
														
															         self.id = hashlib.sha256(data).digest() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         self.store.put(NS_ARCHIVES, self.id, data) 
														 | 
														
														 | 
														
															         self.store.put(NS_ARCHIVES, self.id, data) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         self.store.commit() 
														 | 
														
														 | 
														
															         self.store.commit() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    def add_chunk(self, id, csize, osize): 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    def add_chunk(self, id, size): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         try: 
														 | 
														
														 | 
														
															         try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             return self.chunk_idx[id] 
														 | 
														
														 | 
														
															             return self.chunk_idx[id] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         except KeyError: 
														 | 
														
														 | 
														
															         except KeyError: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             idx = len(self.chunks) 
														 | 
														
														 | 
														
															             idx = len(self.chunks) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            self.chunks.append((id, csize, osize)) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            self.chunks.append(dict(id=id, size=size)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             self.chunk_idx[id] = idx 
														 | 
														
														 | 
														
															             self.chunk_idx[id] = idx 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             return idx 
														 | 
														
														 | 
														
															             return idx 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -77,10 +94,10 @@ class Archive(object): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                     chunk_count.setdefault(id, 0) 
														 | 
														
														 | 
														
															                     chunk_count.setdefault(id, 0) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                     chunk_count[id] += 1 
														 | 
														
														 | 
														
															                     chunk_count[id] += 1 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         for id, c in chunk_count.items(): 
														 | 
														
														 | 
														
															         for id, c in chunk_count.items(): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            count, csize, osize = cache.chunkmap[id] 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            total_csize += csize 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            count, size = cache.chunkmap[id] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            total_csize += size 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             if  c == count: 
														 | 
														
														 | 
														
															             if  c == count: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                total_usize += csize 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                total_usize += size 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         return dict(osize=total_osize, csize=total_csize, usize=total_usize) 
														 | 
														
														 | 
														
															         return dict(osize=total_osize, csize=total_csize, usize=total_usize) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     def list(self): 
														 | 
														
														 | 
														
															     def list(self): 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -93,7 +110,7 @@ class Archive(object): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             assert item['path'][0] not in ('/', '\\', ':') 
														 | 
														
														 | 
														
															             assert item['path'][0] not in ('/', '\\', ':') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             path = os.path.join(dest, item['path']) 
														 | 
														
														 | 
														
															             path = os.path.join(dest, item['path']) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             logging.info(path) 
														 | 
														
														 | 
														
															             logging.info(path) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            if item['type'] == 'DIR': 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            if item['type'] == 'DIRECTORY': 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                 if not os.path.exists(path): 
														 | 
														
														 | 
														
															                 if not os.path.exists(path): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                     os.makedirs(path) 
														 | 
														
														 | 
														
															                     os.makedirs(path) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             if item['type'] == 'FILE': 
														 | 
														
														 | 
														
															             if item['type'] == 'FILE': 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -142,7 +159,7 @@ class Archive(object): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         if name in cache.archives: 
														 | 
														
														 | 
														
															         if name in cache.archives: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             raise NameError('Archive already exists') 
														 | 
														
														 | 
														
															             raise NameError('Archive already exists') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         for path in paths: 
														 | 
														
														 | 
														
															         for path in paths: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            for root, dirs, files in os.walk(path): 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            for root, dirs, files in os.walk(unicode(path)): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                 for d in dirs: 
														 | 
														
														 | 
														
															                 for d in dirs: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                     p = os.path.join(root, d) 
														 | 
														
														 | 
														
															                     p = os.path.join(root, d) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                     self.items.append(self.process_dir(p, cache)) 
														 | 
														
														 | 
														
															                     self.items.append(self.process_dir(p, cache)) 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -158,7 +175,7 @@ class Archive(object): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     def process_dir(self, path, cache): 
														 | 
														
														 | 
														
															     def process_dir(self, path, cache): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         path = path.lstrip('/\\:') 
														 | 
														
														 | 
														
															         path = path.lstrip('/\\:') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         logging.info(path) 
														 | 
														
														 | 
														
															         logging.info(path) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        return {'type': 'DIR', 'path': path} 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        return {'type': 'DIRECTORY', 'path': path} 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     def process_file(self, path, cache): 
														 | 
														
														 | 
														
															     def process_file(self, path, cache): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         try: 
														 | 
														
														 | 
														
															         try: 
														 |