Преглед изворног кода

Initial rough implementation of chunkification cache

Jonas Borgström пре 14 година
родитељ
комит
e181829365
2 измењена фајла са 59 додато и 16 уклоњено
  1. 39 12
      dedupestore/archive.py
  2. 20 4
      dedupestore/cache.py

+ 39 - 12
dedupestore/archive.py

@@ -243,6 +243,7 @@ class Archive(object):
         })
     def process_file(self, path, st, cache):
         safe_path = path.lstrip('/\\:')
+        # Is it a hard link?
         if st.st_nlink > 1:
             source = self.hard_links.get((st.st_ino, st.st_dev))
             if (st.st_ino, st.st_dev) in self.hard_links:
@@ -252,18 +253,34 @@ class Archive(object):
                 return
             else:
                 self.hard_links[st.st_ino, st.st_dev] = safe_path
-        try:
-            fd = open(path, 'rb')
-        except IOError, e:
-            logging.error(e)
-            return
-        with fd:
-            logging.info(safe_path)
-            chunks = []
-            size = 0
-            for chunk in chunkify(fd, CHUNK_SIZE, 30):
-                chunks.append(self.process_chunk(chunk, cache))
-                size += len(chunk)
+        logging.info(safe_path)
+        path_hash = self.crypto.id_hash(path.encode('utf-8'))
+        ids, size = cache.file_known_and_unchanged(path_hash, st)
+        if ids is not None:
+            # Make sure all ids are available
+            for id in ids:
+                if not cache.seen_chunk(id):
+                    ids = None
+                    break
+            else:
+                chunks = [self.process_chunk2(id, cache) for id in ids]
+        # Only chunkify the file if needed
+        if ids is None:
+            try:
+                fd = open(path, 'rb')
+            except IOError, e:
+                logging.error(e)
+                return
+            with fd:
+                size = 0
+                ids = []
+                chunks = []
+                for chunk in chunkify(fd, CHUNK_SIZE, 30):
+                    ids.append(self.crypto.id_hash(chunk))
+                    chunks.append(chunk)
+                    size += len(chunk)
+            cache.memorize_file_chunks(path_hash, st, ids)
+            chunks = [self.process_chunk(chunk, cache) for chunk in chunks]
         self.items.append({
             'type': 'FILE', 'path': safe_path, 'chunks': chunks, 'size': size,
             'mode': st.st_mode,
@@ -272,6 +289,16 @@ class Archive(object):
             'ctime': st.st_ctime, 'mtime': st.st_mtime,
         })
 
+    def process_chunk2(self, id, cache):
+        try:
+            return self.chunk_idx[id]
+        except KeyError:
+            idx = len(self.chunks)
+            size = cache.chunk_incref(id)
+            self.chunks.append((id, size))
+            self.chunk_idx[id] = idx
+            return idx
+
     def process_chunk(self, data, cache):
         id = self.crypto.id_hash(data)
         try:

+ 20 - 4
dedupestore/cache.py

@@ -14,6 +14,7 @@ class Cache(object):
         self.path = os.path.join(os.path.expanduser('~'), '.dedupestore', 'cache',
                                  '%s.cache' % self.store.uuid)
         self.tid = -1
+        self.file_chunks = {}
         self.open()
         if self.tid != self.store.tid:
             self.init(crypto)
@@ -22,13 +23,15 @@ class Cache(object):
         if not os.path.exists(self.path):
             return
         cache = msgpack.unpackb(open(self.path, 'rb').read())
-        version = cache.get('version')
-        if version != 1:
-            logging.error('Unsupported cache version %r' % version)
-            return
+        assert cache['version'] == 1
         if cache['store'] != self.store.uuid:
             raise Exception('Cache UUID mismatch')
         self.chunkmap = cache['chunkmap']
+        # Discard old file_chunks entries
+        for hash, entry in cache['file_chunks'].iteritems():
+            count = entry[0]
+            if count < 8:
+                self.file_chunks[hash] = [count + 1] + list(entry[1:])
         self.tid = cache['tid']
 
     def init(self, crypto):
@@ -56,6 +59,7 @@ class Cache(object):
                 'store': self.store.uuid,
                 'chunkmap': self.chunkmap,
                 'tid': self.store.tid,
+                'file_chunks': self.file_chunks,
         }
         data = msgpack.packb(cache)
         cachedir = os.path.dirname(self.path)
@@ -90,4 +94,16 @@ class Cache(object):
         else:
             self.chunkmap[id] = (count - 1, size)
 
+    def file_known_and_unchanged(self, path_hash, st):
+        entry = self.file_chunks.get(path_hash)
+        if (entry and entry[1] == st.st_ino
+            and entry[2] == st.st_size and entry[3] == st.st_mtime):
+            entry[0] = 0 # reset entry age
+            return entry[4], entry[2]
+        else:
+            return None, 0
+
+    def memorize_file_chunks(self, path_hash, st, ids):
+        # Entry: Age, inode, size, mtime, chunk ids
+        self.file_chunks[path_hash] = 0, st.st_ino, st.st_size, st.st_mtime, ids