archiver.py

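"""Proof-of-concept deduplicating archiver.

Files are split into chunks which are compressed and stored in a Store,
keyed by checksum, so identical chunks are only stored once.  A client
side Cache keeps per-chunk reference counts so chunks can be deleted
again once no archive references them.
"""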
import os
import sys
import hashlib
import zlib
import struct
import cPickle
from optparse import OptionParser

from chunkifier import chunkify, checksum
from store import Store
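
# CHUNKSIZE is the chunk size passed to chunkify(); archive metadata and
# chunk data are kept under separate store namespaces.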
CHUNKSIZE = 64 * 1024

NS_ARCHIVES = 'ARCHIVES'
NS_CHUNKS = 'CHUNKS'


class Cache(object):
    """Client side cache of archive names and chunk reference counts."""

    def __init__(self, path, store):
        self.store = store
        self.path = path
        self.tid = -1
        self.open()
        if self.tid != self.store.tid:
            print 'Cache out of date (cache tid %r, store tid %r)' % (self.tid, self.store.tid)
            self.create()

    def open(self):
        if self.store.tid == -1:
            return
        filename = os.path.join(self.path, '%s.cache' % self.store.uuid)
        if not os.path.exists(filename):
            return
        print 'Loading cache:', filename, '...'
        with open(filename, 'rb') as fd:
            data = cPickle.loads(zlib.decompress(fd.read()))
        self.chunkmap = data['chunkmap']
        self.summap = data['summap']
        self.archives = data['archives']
        self.tid = data['tid']
        print 'done'

    def create(self):
        self.summap = {}
        self.chunkmap = {}
        self.archives = []
        self.tid = self.store.tid
        if self.store.tid == 0:
            return
        print 'Recreating cache...'
        for archive_id in self.store.list(NS_ARCHIVES):
            archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, archive_id)))
            self.archives.append(archive['name'])
            for item in archive['items']:
                if item['type'] == 'FILE':
                    for c in item['chunks']:
                        self.chunk_incref(c)
        print 'done'

    def save(self):
        assert self.store.state == Store.OPEN
        data = {'chunkmap': self.chunkmap, 'summap': self.summap,
                'tid': self.store.tid, 'archives': self.archives}
        filename = os.path.join(self.path, '%s.cache' % self.store.uuid)
        print 'Saving cache as:', filename
        with open(filename, 'wb') as fd:
            fd.write(zlib.compress(cPickle.dumps(data)))
        print 'done'

    def add_chunk(self, data):
        sum = checksum(data)
        data = zlib.compress(data)
        # A chunk id is the 32 bit rolling checksum followed by the
        # SHA-1 digest of the compressed chunk data
        id = struct.pack('I', sum) + hashlib.sha1(data).digest()
        if not self.seen_chunk(id):
            size = len(data)
            self.store.put(NS_CHUNKS, id, data)
        else:
            # Chunk already stored; only its reference count changes
            size = 0
        self.chunk_incref(id)
        return id, size

    def seen_chunk(self, hash):
        return self.chunkmap.get(hash, 0) > 0

    def chunk_incref(self, hash):
        sum = struct.unpack('I', hash[:4])[0]
        self.chunkmap.setdefault(hash, 0)
        self.summap.setdefault(sum, 0)
        self.chunkmap[hash] += 1
        self.summap[sum] += 1

    def chunk_decref(self, hash):
        self.summap[struct.unpack('I', hash[:4])[0]] -= 1
        count = self.chunkmap.get(hash, 0) - 1
        assert count >= 0
        self.chunkmap[hash] = count
        if not count:
            print 'deleting chunk:', hash.encode('hex')
            self.store.delete(NS_CHUNKS, hash)
        return count
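

# An archive is stored as a zlib compressed pickle of
# {'name': ..., 'items': [...]}, where each FILE item carries the
# ordered list of chunk ids needed to reassemble the file contents.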
class Archiver(object):

    def create_archive(self, archive_name, paths):
        try:
            self.store.get(NS_ARCHIVES, archive_name)
        except Store.DoesNotExist:
            pass
        else:
            raise Exception('Archive "%s" already exists' % archive_name)
        items = []
        for path in paths:
            for root, dirs, files in os.walk(path):
                for d in dirs:
                    name = os.path.join(root, d)
                    items.append(self.process_dir(name, self.cache))
                for f in files:
                    name = os.path.join(root, f)
                    items.append(self.process_file(name, self.cache))
        archive = {'name': archive_name, 'items': items}
        self.store.put(NS_ARCHIVES, archive_name, zlib.compress(cPickle.dumps(archive)))
        self.store.commit()
        self.cache.archives.append(archive_name)
        self.cache.save()

    def delete_archive(self, archive_name):
        try:
            archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, archive_name)))
        except Store.DoesNotExist:
            raise Exception('Archive "%s" does not exist' % archive_name)
        self.store.delete(NS_ARCHIVES, archive_name)
        for item in archive['items']:
            if item['type'] == 'FILE':
                for c in item['chunks']:
                    self.cache.chunk_decref(c)
        self.store.commit()
        self.cache.archives.remove(archive_name)
        self.cache.save()

    def list_archives(self):
        print 'Archives:'
        for archive in sorted(self.cache.archives):
            print archive

    def list_archive(self, archive_name):
        try:
            archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, archive_name)))
        except Store.DoesNotExist:
            raise Exception('Archive "%s" does not exist' % archive_name)
        for item in archive['items']:
            print item['path']

    def verify_archive(self, archive_name):
        try:
            archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, archive_name)))
        except Store.DoesNotExist:
            raise Exception('Archive "%s" does not exist' % archive_name)
        for item in archive['items']:
            if item['type'] == 'FILE':
                print item['path'], '...',
                for chunk in item['chunks']:
                    data = self.store.get(NS_CHUNKS, chunk)
                    # chunk[4:] is the SHA-1 digest of the stored compressed data
                    if hashlib.sha1(data).digest() != chunk[4:]:
                        print 'ERROR'
                        break
                else:
                    print 'OK'

    def extract_archive(self, archive_name):
        try:
            archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, archive_name)))
        except Store.DoesNotExist:
            raise Exception('Archive "%s" does not exist' % archive_name)
        for item in archive['items']:
            assert item['path'][0] not in ('/', '\\', ':')
            print item['path']
            if item['type'] == 'DIR':
                if not os.path.exists(item['path']):
                    os.makedirs(item['path'])
            if item['type'] == 'FILE':
                with open(item['path'], 'wb') as fd:
                    for chunk in item['chunks']:
                        data = self.store.get(NS_CHUNKS, chunk)
                        if hashlib.sha1(data).digest() != chunk[4:]:
                            raise Exception('Invalid chunk checksum')
                        fd.write(zlib.decompress(data))

    def process_dir(self, path, cache):
        path = path.lstrip('/\\:')
        print 'Directory: %s' % path
        return {'type': 'DIR', 'path': path}

    def process_file(self, path, cache):
        print 'Adding: %s...' % path,
        sys.stdout.flush()
        with open(path, 'rb') as fd:
            origsize = 0
            compsize = 0
            chunks = []
            for chunk in chunkify(fd, CHUNKSIZE, self.cache.summap):
                origsize += len(chunk)
                id, size = cache.add_chunk(chunk)
                compsize += size
                chunks.append(id)
        path = path.lstrip('/\\:')
        ratio = origsize and compsize * 100 / origsize or 0
        print '(%d chunks: %d%%)' % (len(chunks), ratio)
        return {'type': 'FILE', 'path': path, 'size': origsize, 'chunks': chunks}

    def run(self):
        parser = OptionParser()
        parser.add_option("-C", "--cache", dest="cache",
                          help="cache directory to use", metavar="CACHE")
        parser.add_option("-s", "--store", dest="store",
                          help="path to dedupe store", metavar="STORE")
        parser.add_option("-c", "--create", dest="create_archive",
                          help="create ARCHIVE", metavar="ARCHIVE")
        parser.add_option("-d", "--delete", dest="delete_archive",
                          help="delete ARCHIVE", metavar="ARCHIVE")
        parser.add_option("-l", "--list-archives", dest="list_archives",
                          action="store_true", default=False,
                          help="list archives")
        parser.add_option("-V", "--verify", dest="verify_archive",
                          help="verify archive consistency", metavar="ARCHIVE")
        parser.add_option("-e", "--extract", dest="extract_archive",
                          help="extract ARCHIVE", metavar="ARCHIVE")
        parser.add_option("-L", "--list-archive", dest="list_archive",
                          help="list ARCHIVE contents", metavar="ARCHIVE")
        (options, args) = parser.parse_args()
        if options.store:
            self.store = Store(options.store)
        else:
            parser.error('No store path specified')
        if options.cache:
            self.cache = Cache(options.cache, self.store)
        else:
            parser.error('No cache path specified')
        if options.list_archives:
            self.list_archives()
        elif options.list_archive:
            self.list_archive(options.list_archive)
        elif options.verify_archive:
            self.verify_archive(options.verify_archive)
        elif options.extract_archive:
            self.extract_archive(options.extract_archive)
        elif options.delete_archive:
            self.delete_archive(options.delete_archive)
        elif options.create_archive:
            self.create_archive(options.create_archive, args)
        else:
            parser.error('No operation specified')


def main():
    archiver = Archiver()
    archiver.run()

if __name__ == '__main__':
    main()
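
# Illustrative invocations (paths and archive names are examples):
#   python archiver.py -s /tmp/store -C /tmp/cache -c nightly /home/user
#   python archiver.py -s /tmp/store -C /tmp/cache -l
#   python archiver.py -s /tmp/store -C /tmp/cache -V nightly
#   python archiver.py -s /tmp/store -C /tmp/cache -e nightly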