# archiver.py
import binascii
import hashlib
import os
import sys
import zlib

from repository import Repository
  6. CHUNKSIZE = 256 * 1024
  7. class FileItem(object):
  8. def __init__(self):
  9. """"""
  10. def process_file(self, filename, cache):
  11. self.filename = filename
  12. fd = open(filename, 'rb')
  13. self.size = 0
  14. self.chunks = []
  15. while True:
  16. data = fd.read(CHUNKSIZE)
  17. if not data:
  18. break
  19. self.size += len(data)
  20. self.chunks.append(cache.add_chunk(zlib.compress(data)))
  21. print '%s: %d chunks' % (filename, len(self.chunks))
  22. class Cache(object):
  23. """Client Side cache
  24. """
  25. def __init__(self, repo):
  26. self.repo = repo
  27. self.chunkmap = {}
  28. def chunk_filename(self, sha):
  29. hex = sha.encode('hex')
  30. return 'chunks/%s/%s/%s' % (hex[:2], hex[2:4], hex[4:])
  31. def add_chunk(self, data):
  32. sha = hashlib.sha1(data).digest()
  33. if not self.seen_chunk(sha):
  34. self.repo.put_file(self.chunk_filename(sha), data)
  35. else:
  36. print 'seen chunk', sha.encode('hex')
  37. self.chunk_incref(sha)
  38. return sha
  39. def seen_chunk(self, sha):
  40. return self.chunkmap.get(sha, 0) > 0
  41. def chunk_incref(self, sha):
  42. self.chunkmap.setdefault(sha, 0)
  43. self.chunkmap[sha] += 1
  44. def chunk_decref(self, sha):
  45. assert self.chunkmap.get(sha, 0) > 0
  46. self.chunkmap[sha] -= 1
  47. return self.chunkmap[sha]
  48. class Archive(object):
  49. """
  50. """
  51. def __init__(self):
  52. self.items = []
  53. def add_item(self, item):
  54. self.items.append(item)
  55. class Archiver(object):
  56. def __init__(self):
  57. self.cache = Cache(Repository('/tmp/repo'))
  58. self.archive = Archive()
  59. def run(self, path):
  60. for root, dirs, files in os.walk(path):
  61. for f in files:
  62. filename = os.path.join(root, f)
  63. item = FileItem()
  64. item.process_file(filename, self.cache)
  65. self.archive.add_item(item)
  66. self.cache.repo.commit()
  67. def main():
  68. archiver = Archiver()
  69. archiver.run(sys.argv[1])
  70. if __name__ == '__main__':
  71. main()