archiver.py
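
# A proof-of-concept deduplicating archiver. Archive metadata is stored as
# a zlib-compressed pickle in the NS_ARCHIVES namespace of a BandStore;
# file contents are split into chunks that are stored compressed in
# NS_CHUNKS, keyed by the SHA-1 digest of the stored data. The Cache
# (cache.py, not shown here) tracks the archive list and per-chunk
# reference counts.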

import os
import hashlib
import zlib
import cPickle
from optparse import OptionParser

from chunkifier import chunkify
from cache import Cache, NS_ARCHIVES, NS_CHUNKS
#from sqlitestore import SqliteStore
from bandstore import BandStore

CHUNK_SIZE = 55001
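# CHUNK_SIZE is the target chunk size passed to chunkify() in
# process_file() below. 55001 is prime, and the 30 passed alongside it
# looks like the window size of a rolling checksum; chunkifier is not
# shown here, so both readings are assumptions.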


class Archive(object):

    def __init__(self, store, name=None):
        self.store = store
        self.items = []
        self.chunks = []
        # Maps chunk id -> index while creating an archive; open() fills it
        # the other way around (index -> id) for reading an existing one.
        self.chunk_idx = {}
        if name:
            self.open(name)

    def add_chunk(self, id, csize, osize):
        # Return the archive-local index for this chunk id, registering the
        # (id, csize, osize) triple first if it has not been seen yet.
        try:
            return self.chunk_idx[id]
        except KeyError:
            idx = len(self.chunks)
            self.chunks.append((id, csize, osize))
            self.chunk_idx[id] = idx
            return idx

    def open(self, name):
        # Archive metadata is stored as a zlib-compressed pickle in the
        # NS_ARCHIVES namespace of the store.
        archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, name)))
        self.items = archive['items']
        self.name = archive['name']
        self.chunks = archive['chunks']
        # Build the index -> id mapping used when chunks are read back.
        for i, (id, csize, osize) in enumerate(archive['chunks']):
            self.chunk_idx[i] = id

    def save(self, name):
        archive = {'name': name, 'items': self.items, 'chunks': self.chunks}
        self.store.put(NS_ARCHIVES, name, zlib.compress(cPickle.dumps(archive)))
        self.store.commit()

    def stats(self, cache):
        # osize: original file size, csize: compressed/deduplicated size of
        # the referenced chunks, usize: compressed size of the chunks that
        # only this archive references.
        total_osize = 0
        total_csize = 0
        total_usize = 0
        chunk_count = {}
        for item in self.items:
            if item['type'] == 'FILE':
                total_osize += item['size']
                for idx in item['chunks']:
                    id = self.chunk_idx[idx]
                    chunk_count.setdefault(id, 0)
                    chunk_count[id] += 1
        for id, c in chunk_count.items():
            count, csize, osize = cache.chunkmap[id]
            total_csize += csize
            # If this archive accounts for every reference, the chunk is
            # unique to it.
            if c == count:
                total_usize += csize
        return dict(osize=total_osize, csize=total_csize, usize=total_usize)

    def list(self):
        for item in self.items:
            print item['path']

    def extract(self):
        for item in self.items:
            # Stored paths are relative (see process_dir/process_file), so
            # extraction happens below the current working directory.
            assert item['path'][0] not in ('/', '\\', ':')
            print item['path']
            if item['type'] == 'DIR':
                if not os.path.exists(item['path']):
                    os.makedirs(item['path'])
            if item['type'] == 'FILE':
                path = item['path']
                if not os.path.exists(os.path.dirname(path)):
                    os.makedirs(os.path.dirname(path))
                with open(path, 'wb') as fd:
                    for chunk in item['chunks']:
                        id = self.chunk_idx[chunk]
                        data = self.store.get(NS_CHUNKS, id)
                        # The chunk id doubles as a checksum of the stored
                        # (compressed) data.
                        if hashlib.sha1(data).digest() != id:
                            raise Exception('Invalid chunk checksum')
                        fd.write(zlib.decompress(data))

    def verify(self):
        for item in self.items:
            if item['type'] == 'FILE':
                print item['path'], '...',
                for chunk in item['chunks']:
                    id = self.chunk_idx[chunk]
                    data = self.store.get(NS_CHUNKS, id)
                    if hashlib.sha1(data).digest() != id:
                        print 'ERROR'
                        break
                # The for/else clause runs only when no chunk failed (no break).
                else:
                    print 'OK'

    def delete(self, cache):
        self.store.delete(NS_ARCHIVES, self.name)
        for item in self.items:
            if item['type'] == 'FILE':
                for c in item['chunks']:
                    id = self.chunk_idx[c]
                    # Drop this archive's reference; the cache is expected
                    # to delete chunks whose reference count reaches zero.
                    cache.chunk_decref(id)
        self.store.commit()
        cache.archives.remove(self.name)
        cache.save()

    def create(self, name, paths, cache):
        for path in paths:
            for root, dirs, files in os.walk(path):
                for d in dirs:
                    p = os.path.join(root, d)
                    print p
                    self.items.append(self.process_dir(p, cache))
                for f in files:
                    p = os.path.join(root, f)
                    print p
                    self.items.append(self.process_file(p, cache))
        self.save(name)
        cache.archives.append(name)
        cache.save()

    def process_dir(self, path, cache):
        # Strip leading separators so stored paths are always relative.
        path = path.lstrip('/\\:')
        return {'type': 'DIR', 'path': path}

    def process_file(self, path, cache):
        with open(path, 'rb') as fd:
            path = path.lstrip('/\\:')
            chunks = []
            size = 0
            for chunk in chunkify(fd, CHUNK_SIZE, 30):
                size += len(chunk)
                # cache.add_chunk() returns (id, csize, osize); add_chunk()
                # maps that to an archive-local chunk index.
                chunks.append(self.add_chunk(*cache.add_chunk(chunk)))
        return {'type': 'FILE', 'path': path, 'chunks': chunks, 'size': size}
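

# Deduplication happens at two levels: cache.add_chunk() (defined in
# cache.py, not shown here) is assumed to store each distinct chunk only
# once and reference-count it across archives, while Archive.add_chunk()
# above deduplicates chunk records within a single archive so that items
# can reference chunks by a small integer index.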


class Archiver(object):

    def pretty_size(self, v):
        if v > 1024 * 1024 * 1024:
            return '%.2f GB' % (v / 1024. / 1024. / 1024.)
        elif v > 1024 * 1024:
            return '%.2f MB' % (v / 1024. / 1024.)
        elif v > 1024:
            return '%.2f kB' % (v / 1024.)
        else:
            return str(v)
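    # For example: pretty_size(2048) -> '2.00 kB',
    # pretty_size(3 * 1024 ** 2) -> '3.00 MB', pretty_size(512) -> '512'.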

    def create_archive(self, name, paths):
        archive = Archive(self.store)
        archive.create(name, paths, self.cache)

    def delete_archive(self, archive_name):
        archive = Archive(self.store, archive_name)
        archive.delete(self.cache)

    def list_archives(self):
        print 'Archives:'
        for archive in sorted(self.cache.archives):
            print archive

    def list_archive(self, archive_name):
        archive = Archive(self.store, archive_name)
        archive.list()

    def verify_archive(self, archive_name):
        archive = Archive(self.store, archive_name)
        archive.verify()

    def extract_archive(self, archive_name):
        archive = Archive(self.store, archive_name)
        archive.extract()

    def archive_stats(self, archive_name):
        archive = Archive(self.store, archive_name)
        stats = archive.stats(self.cache)
        print 'Original size:', self.pretty_size(stats['osize'])
        print 'Compressed size:', self.pretty_size(stats['csize'])
        print 'Unique data:', self.pretty_size(stats['usize'])

    def run(self):
        parser = OptionParser()
        parser.add_option("-s", "--store", dest="store",
                          help="path to dedupe store", metavar="STORE")
        parser.add_option("-c", "--create", dest="create_archive",
                          help="create ARCHIVE", metavar="ARCHIVE")
        parser.add_option("-d", "--delete", dest="delete_archive",
                          help="delete ARCHIVE", metavar="ARCHIVE")
        parser.add_option("-l", "--list-archives", dest="list_archives",
                          action="store_true", default=False,
                          help="list archives")
        parser.add_option("-V", "--verify", dest="verify_archive",
                          help="verify ARCHIVE consistency", metavar="ARCHIVE")
        parser.add_option("-e", "--extract", dest="extract_archive",
                          help="extract ARCHIVE", metavar="ARCHIVE")
        parser.add_option("-L", "--list-archive", dest="list_archive",
                          help="list ARCHIVE contents", metavar="ARCHIVE")
        parser.add_option("-S", "--stats", dest="archive_stats",
                          help="display ARCHIVE statistics", metavar="ARCHIVE")
        (options, args) = parser.parse_args()
        if options.store:
            self.store = BandStore(options.store)
        else:
            parser.error('No store path specified')
        self.cache = Cache(self.store)
        if options.list_archives:
            self.list_archives()
        elif options.list_archive:
            self.list_archive(options.list_archive)
        elif options.verify_archive:
            self.verify_archive(options.verify_archive)
        elif options.extract_archive:
            self.extract_archive(options.extract_archive)
        elif options.delete_archive:
            self.delete_archive(options.delete_archive)
        elif options.create_archive:
            self.create_archive(options.create_archive, args)
        elif options.archive_stats:
            self.archive_stats(options.archive_stats)


def main():
    archiver = Archiver()
    archiver.run()

if __name__ == '__main__':
    main()
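
# Example invocations (store path and archive name are illustrative only;
# remaining positional arguments to -c are the paths to archive):
#
#   python archiver.py -s /tmp/store -c monday ./projects
#   python archiver.py -s /tmp/store -l
#   python archiver.py -s /tmp/store -L monday
#   python archiver.py -s /tmp/store -S monday
#   python archiver.py -s /tmp/store -e monday
#   python archiver.py -s /tmp/store -V monday
#   python archiver.py -s /tmp/store -d monday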