archiver.py

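# A minimal proof-of-concept archiver: it walks a directory tree, splits
# files into fixed-size chunks, deduplicates the chunks by SHA-1 digest,
# and stores compressed chunks plus a pickled archive listing in a
# Repository. Python 2 only (print statements, cPickle, str.encode('hex')).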
import os
import sys
import hashlib
import zlib
import cPickle

from repository import Repository

CHUNKSIZE = 256 * 1024
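
# Cache persists chunk reference counts ('chunkmap'), the last seen
# repository transaction id ('tid') and the list of archive names as a
# zlib-compressed cPickle file named after the repository uuid.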
class Cache(object):
    """Client-side cache of chunk reference counts and archive names."""

    def __init__(self, path, repo):
        self.repo = repo
        self.path = path
        self.chunkmap = {}
        self.archives = []
        self.tid = -1
        self.open()
        # A transaction id mismatch means the cache is stale or missing,
        # so rebuild it from the archives stored in the repository.
        if self.tid != self.repo.tid:
            print self.tid, self.repo.tid
            self.create()

    def open(self):
        """Load the saved cache for this repository, if any."""
        if self.repo.tid == 0:
            return
        filename = os.path.join(self.path, '%s.cache' % self.repo.uuid)
        if not os.path.exists(filename):
            return
        print 'Reading cache:', filename, '...'
        with open(filename, 'rb') as fd:
            data = cPickle.loads(zlib.decompress(fd.read()))
        self.chunkmap = data['chunkmap']
        self.tid = data['tid']
        self.archives = data['archives']
        print 'done'

    def create(self):
        """Rebuild chunk reference counts by scanning every archive."""
        print 'Recreating cache...'
        # Start from a clean slate in case open() loaded a stale cache.
        self.chunkmap = {}
        self.archives = []
        for archive in self.repo.listdir('archives'):
            self.archives.append(archive)
            data = self.repo.get_file(os.path.join('archives', archive))
            a = cPickle.loads(zlib.decompress(data))
            for item in a['items']:
                if item['type'] == 'FILE':
                    for c in item['chunks']:
                        self.chunk_incref(c)
        self.tid = self.repo.tid
        print 'done'

    def save(self):
        assert self.repo.state == Repository.OPEN
        print 'saving', self.tid, self.repo.tid
        data = {'chunkmap': self.chunkmap, 'tid': self.repo.tid, 'archives': self.archives}
        filename = os.path.join(self.path, '%s.cache' % self.repo.uuid)
        print 'Saving cache as:', filename
        with open(filename, 'wb') as fd:
            fd.write(zlib.compress(cPickle.dumps(data)))
        print 'done'

    def chunk_filename(self, sha):
        # Fan chunks out over two directory levels (first four hex digits
        # of the SHA-1) so no single directory grows too large.
        hex = sha.encode('hex')
        return 'chunks/%s/%s/%s' % (hex[:2], hex[2:4], hex[4:])

    def add_chunk(self, data):
        # Chunks are content-addressed by SHA-1 digest: an already seen
        # chunk is only reference counted, never stored twice.
        sha = hashlib.sha1(data).digest()
        if not self.seen_chunk(sha):
            self.repo.put_file(self.chunk_filename(sha), data)
        else:
            print 'seen chunk', sha.encode('hex')
        self.chunk_incref(sha)
        return sha

    def seen_chunk(self, sha):
        return self.chunkmap.get(sha, 0) > 0

    def chunk_incref(self, sha):
        self.chunkmap.setdefault(sha, 0)
        self.chunkmap[sha] += 1

    def chunk_decref(self, sha):
        assert self.chunkmap.get(sha, 0) > 0
        self.chunkmap[sha] -= 1
        return self.chunkmap[sha]
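
# Archiver walks a directory tree and stores it as a single archive: a
# zlib-compressed cPickle of the item list. chunk_decref is unused so
# far; a delete operation would presumably drop chunk files whose
# reference count reaches zero.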
class Archiver(object):

    def __init__(self):
        self.repo = Repository('/tmp/repo')
        self.cache = Cache('/tmp/cache', self.repo)

    def create_archive(self, archive_name, path):
        if archive_name in self.cache.archives:
            raise Exception('Archive "%s" already exists' % archive_name)
        items = []
        for root, dirs, files in os.walk(path):
            for d in dirs:
                name = os.path.join(root, d)
                items.append(self.process_dir(name, self.cache))
            for f in files:
                name = os.path.join(root, f)
                items.append(self.process_file(name, self.cache))
        archive = {'name': archive_name, 'items': items}
        zdata = zlib.compress(cPickle.dumps(archive))
        self.repo.put_file(os.path.join('archives', archive_name), zdata)
        self.cache.archives.append(archive_name)
        print 'Archive file size: %d' % len(zdata)
        self.repo.commit()
        self.cache.save()

    def process_dir(self, path, cache):
        print 'Directory: %s' % path
        return {'type': 'DIR', 'path': path}

    def process_file(self, path, cache):
        size = 0
        chunks = []
        with open(path, 'rb') as fd:
            while True:
                data = fd.read(CHUNKSIZE)
                if not data:
                    break
                size += len(data)
                # Each chunk is compressed before it is deduplicated/stored.
                chunks.append(cache.add_chunk(zlib.compress(data)))
        print 'File: %s (%d chunks)' % (path, len(chunks))
        return {'type': 'FILE', 'path': path, 'size': size, 'chunks': chunks}


def main():
    archiver = Archiver()
    archiver.create_archive(sys.argv[1], sys.argv[2])


if __name__ == '__main__':
    main()
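
# Example invocation (archive name and source path are illustrative):
#   python archiver.py my-backup /home/user/data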