Memory usage improvements

Jonas Borgström committed 198b3f90fc · 14 years ago
4 changed files with 20 additions and 42 deletions

  1. darc/archive.py  +9 -37
  2. darc/archiver.py  +1 -1
  3. darc/cache.py  +9 -4
  4. darc/test.py  +1 -0
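
In brief: Archive no longer keeps a per-archive chunks list and chunk_idx dict in memory while backing up. Items reference chunks by id directly, and the chunk cache's reference counts double as a "touched during the current run" flag by temporarily offsetting them by 1000000; Archive.save() derives the archive's chunk list from the flagged entries, and Cache.save() strips the offset again. A minimal runnable sketch of the offset trick (FLAG and the standalone helpers are illustrative names, not darc API):

    FLAG = 1000000  # counts above this mark chunks touched in the current run
    chunk_counts = {}

    def add_chunk(id, size):
        # A brand new chunk: refcount 1 plus the in-progress marker.
        chunk_counts[id] = (FLAG + 1, size)

    def chunk_incref(id):
        # Reusing a known chunk: count it and mark it, once per run
        # (anything already >= FLAG was marked earlier this run).
        count, size = chunk_counts[id]
        if count < FLAG:
            chunk_counts[id] = (count + FLAG + 1, size)

    def archive_chunk_list():
        # What Archive.save() records: every chunk touched this run.
        return [(id, size) for id, (count, size) in chunk_counts.items()
                if count > FLAG]

    def cache_save():
        # What Cache.save() does: strip the marker, leaving plain refcounts.
        for id, (count, size) in chunk_counts.items():
            if count > FLAG:
                chunk_counts[id] = (count - FLAG, size)

The marker replaces the per-run membership tracking that the chunks/chunk_idx structures used to provide, which is where the memory saving comes from.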

darc/archive.py  +9 -37

@@ -27,8 +27,6 @@ class Archive(object):
         self.keychain = keychain
         self.store = store
         self.items = []
-        self.chunks = []
-        self.chunk_idx = {}
         self.hard_links = {}
         if name:
             self.load(self.keychain.id_hash(name))
@@ -53,11 +51,10 @@ class Archive(object):
         assert items['version'] == 1
         assert self.metadata['items_hash'] == items_hash
         self.items = items['items']
-        for i, chunk in enumerate(self.chunks):
-            self.chunk_idx[i] = chunk[0]
 
-    def save(self, name):
+    def save(self, name, cache):
         self.id = self.keychain.id_hash(name)
+        self.chunks = [(id, size) for (id, (count, size)) in cache.chunk_counts.iteritems() if count > 1000000]
         chunks = {'version': 1, 'chunks': self.chunks}
         data, chunks_hash = self.keychain.encrypt_create(msgpack.packb(chunks))
         self.store.put(NS_ARCHIVE_CHUNKS, self.id, data)
@@ -124,12 +121,11 @@ class Archive(object):
                 os.link(source, path)
             else:
                 with open(path, 'wb') as fd:
-                    for chunk in item['chunks']:
-                        id = self.chunk_idx[chunk]
+                    for id in item['chunks']:
                         try:
                             data, hash = self.keychain.decrypt(self.store.get(NS_CHUNK, id))
                             if self.keychain.id_hash(data) != id:
-                                raise IntegrityError('chunk id did not match')
+                                raise IntegrityError('chunk hash did not match')
                             fd.write(data)
                         except ValueError:
                             raise Exception('Invalid chunk checksum')
@@ -161,8 +157,7 @@ class Archive(object):
             os.utime(path, (item['atime'], item['mtime']))
 
     def verify_file(self, item):
-        for chunk in item['chunks']:
-            id = self.chunk_idx[chunk]
+        for id in item['chunks']:
             try:
                 data, hash = self.keychain.decrypt(self.store.get(NS_CHUNK, id))
                 if self.keychain.id_hash(data) != id:
@@ -239,45 +234,22 @@ class Archive(object):
                     ids = None
                     break
             else:
-                chunks = [self.process_chunk2(id, cache) for id in ids]
+                for id in ids:
+                    cache.chunk_incref(id)
         # Only chunkify the file if needed
         if ids is None:
-            fd = open(path, 'rb')
             with open(path, 'rb') as fd:
                 size = 0
                 ids = []
-                chunks = []
                 for chunk in chunkify(fd, CHUNK_SIZE, WINDOW_SIZE,
                                       self.keychain.get_chunkify_seed()):
-                    id = self.keychain.id_hash(chunk)
-                    ids.append(id)
-                    try:
-                        chunks.append(self.chunk_idx[id])
-                    except KeyError:
-                        chunks.append(self.process_chunk(id, chunk, cache))
+                    ids.append(cache.add_chunk(self.keychain.id_hash(chunk), chunk))
                     size += len(chunk)
             cache.memorize_file_chunks(path_hash, st, ids)
-        item = {'path': safe_path, 'chunks': chunks, 'size': size}
+        item = {'path': safe_path, 'chunks': ids, 'size': size}
         item.update(self.stat_attrs(st, path))
         self.items.append(item)
 
-    def process_chunk2(self, id, cache):
-        try:
-            return self.chunk_idx[id]
-        except KeyError:
-            idx = len(self.chunks)
-            id, size = cache.chunk_incref(id)
-            self.chunks.append((id, size))
-            self.chunk_idx[id] = idx
-            return idx
-
-    def process_chunk(self, id, data, cache):
-        idx = len(self.chunks)
-        id, size = cache.add_chunk(id, data)
-        self.chunks.append((id, size))
-        self.chunk_idx[id] = idx
-        return idx
-
     @staticmethod
     def list_archives(store, keychain):
         for id in list(store.list(NS_ARCHIVE_METADATA)):
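
The bulk of the savings in archive.py comes from dropping that indirection: an item's 'chunks' list used to hold small integers resolved through chunk_idx (index to id on extract, id to index on create), and both self.chunks and self.chunk_idx had to stay resident; it now holds the chunk ids themselves. Roughly, with made-up ids and sizes:

    id0, id1 = 'a' * 32, 'b' * 32  # stand-ins for real chunk id hashes

    # Before: per-archive index structures plus index-valued items.
    chunks = [(id0, 1234), (id1, 5678)]
    chunk_idx = {0: id0, 1: id1}
    item = {'path': 'input/file', 'chunks': [0, 1], 'size': 6912}

    # After: ids stored inline; no per-archive index is built at all.
    item = {'path': 'input/file', 'chunks': [id0, id1], 'size': 6912}

The trade-off is slightly larger item metadata (full ids instead of small integers) in exchange for never materializing the index.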

darc/archiver.py  +1 -1

@@ -75,7 +75,7 @@ class Archiver(object):
                 pass
         for path in args.paths:
             self._process(archive, cache, args.patterns, unicode(path))
-        archive.save(args.archive.archive)
+        archive.save(args.archive.archive, cache)
         cache.save()
         return self.exit_code
 

darc/cache.py  +9 -4

@@ -60,6 +60,10 @@ class Cache(object):
                 yield key, (value[0] + 1,) + value[1:]
 
     def save(self):
+        for id, (count, size) in self.chunk_counts.iteritems():
+            if count > 1000000:
+                self.chunk_counts[id] = count - 1000000, size
+
         cache = {'version': 1,
                 'tid': self.store.tid,
                 'chunk_counts': self.chunk_counts,
@@ -78,16 +82,17 @@ class Cache(object):
         data, hash = self.keychain.encrypt_read(data)
         csize = len(data)
         self.store.put(NS_CHUNK, id, data)
-        self.chunk_counts[id] = (1, csize)
-        return id, csize
+        self.chunk_counts[id] = (1000001, csize)
+        return id
 
     def seen_chunk(self, id):
         return self.chunk_counts.get(id, (0, 0))[0]
 
     def chunk_incref(self, id):
         count, size = self.chunk_counts[id]
-        self.chunk_counts[id] = (count + 1, size)
-        return id, size
+        if count < 1000000:
+            self.chunk_counts[id] = (count + 1000001, size)
+        return id
 
     def chunk_decref(self, id):
         count, size = self.chunk_counts[id]
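
With the marker scheme, add_chunk() and chunk_incref() no longer need to return a size for the archive to record, so both now return just the id. A worked trace of one chunk's (count, size) entry across two backup runs:

    run 1:  add_chunk(id)     -> (1000001, size)   new chunk, marked
            Archive.save()       lists id (count > 1000000)
            Cache.save()      -> (1, size)          marker stripped
    run 2:  chunk_incref(id)  -> (1000002, size)    reused, marked again
            Archive.save()       lists id again
            Cache.save()      -> (2, size)          two archives share it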

darc/test.py  +1 -0

@@ -97,6 +97,7 @@ class Test(unittest.TestCase):
         os.symlink('somewhere', os.path.join(self.input_path, 'link1'))
         os.mkfifo(os.path.join(self.input_path, 'fifo1'))
         self.darc('create', self.store_path + '::test', 'input')
+        self.darc('create', self.store_path + '::test.2', 'input')
         self.darc('extract', self.store_path + '::test', 'output')
         self.diff_dirs('input', 'output/input')