# chunkifier.py -- rolling checksum helpers and a checksum-driven stream chunker.

  1. def checksum(data, sum=0):
  2. """Simple but fast checksum that can be updated at either end.
  3. >>> checksum('FOOBAR')
  4. 102367679
  5. >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO'))
  6. True
  7. """
  8. s1 = sum & 0xffff
  9. s2 = sum >> 16
  10. for c in data:
  11. s1 += ord(c) + 1
  12. s2 += s1
  13. return ((s2 & 0xffff) << 16) + (s1 & 0xffff)
  14. def roll_checksum(sum, remove, add, len):
  15. """
  16. >>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR')
  17. True
  18. """
  19. s1 = sum & 0xffff
  20. s2 = sum >> 16
  21. add = ord(add)
  22. remove = ord(remove)
  23. s1 -= remove - add
  24. s2 -= len * (remove + 1) - s1
  25. return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
  26. class ChunkifyIter(object):
  27. def __init__(self, fd, chunk_size, chunks):
  28. self.fd = fd
  29. self.chunk_size = chunk_size
  30. self.chunks = chunks
  31. def __iter__(self):
  32. self.data = ''
  33. self.i = 0
  34. self.full_sum = True
  35. self.extra = None
  36. self.done = False
  37. self.buf_size = self.chunk_size * 10
  38. return self
  39. def next(self):
  40. o = 0
  41. if self.done:
  42. raise StopIteration
  43. if self.extra:
  44. self.done = True
  45. return self.extra
  46. while True:
  47. if self.i > self.buf_size - self.chunk_size:
  48. self.data = self.data[self.i - o:]
  49. self.i = o
  50. if len(self.data) - self.i < self.chunk_size:
  51. self.data += self.fd.read(self.buf_size - len(self.data))
  52. if len(self.data) == self.i:
  53. raise StopIteration
  54. if len(self.data) - self.i < self.chunk_size: # EOF?
  55. if o == 1:
  56. self.done = True
  57. return self.data[self.i - 1:]
  58. elif o > 1:
  59. self.extra = self.data[-self.chunk_size:]
  60. return self.data[-self.chunk_size - o + 1:-self.chunk_size]
  61. else:
  62. self.done = True
  63. return self.data[self.i:]
  64. elif o == self.chunk_size:
  65. return self.data[self.i-self.chunk_size:self.i]
  66. if self.full_sum or len(self.data) - self.i < self.chunk_size:
  67. self.sum = checksum(self.data[self.i:self.i + self.chunk_size])
  68. self.full_sum = False
  69. self.remove = self.data[self.i]
  70. else:
  71. self.sum = roll_checksum(self.sum, self.remove, self.data[self.i + self.chunk_size - 1],
  72. self.chunk_size)
  73. self.remove = self.data[self.i]
  74. if self.sum in self.chunks:
  75. if o > 0:
  76. chunk = self.data[self.i - o:self.i]
  77. else:
  78. chunk = self.data[self.i:self.i + self.chunk_size]
  79. self.i += self.chunk_size
  80. self.full_sum = True
  81. return chunk
  82. else:
  83. self.i += 1
  84. o += 1
  85. def chunkify(fd, chunk_size, chunks):
  86. """
  87. >>> list(chunkify(StringIO.StringIO('A'), 4, {}))
  88. ['A']
  89. >>> list(chunkify(StringIO.StringIO('AB'), 4, {}))
  90. ['AB']
  91. >>> list(chunkify(StringIO.StringIO('ABC'), 4, {}))
  92. ['ABC']
  93. >>> list(chunkify(StringIO.StringIO('ABCD'), 4, {}))
  94. ['ABCD']
  95. >>> list(chunkify(StringIO.StringIO('ABCDE'), 4, {}))
  96. ['A', 'BCDE']
  97. >>> list(chunkify(StringIO.StringIO('ABCDEF'), 4, {}))
  98. ['AB', 'CDEF']
  99. >>> list(chunkify(StringIO.StringIO('ABCDEFG'), 4, {}))
  100. ['ABC', 'DEFG']
  101. >>> list(chunkify(StringIO.StringIO('ABCDEFGH'), 4, {}))
  102. ['ABCD', 'EFGH']
  103. >>> list(chunkify(StringIO.StringIO('ABCDEFGHI'), 4, {}))
  104. ['ABCD', 'E', 'FGHI']
  105. >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, {}))
  106. ['ABCD', 'EFGH', 'IJ', 'KLMN']
  107. >>> chunks = {44564754: True} # 'BCDE'
  108. >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
  109. ['A', 'BCDE', 'FGHI', 'J', 'KLMN']
  110. >>> chunks = {44564754: True, 48496938: True} # 'BCDE', 'HIJK'
  111. >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
  112. ['A', 'BCDE', 'FG', 'HIJK', 'LMN']
  113. >>> chunks = {43909390: True, 50463030: True} # 'ABCD', 'KLMN'
  114. >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
  115. ['ABCD', 'EFGH', 'IJ', 'KLMN']
  116. """
  117. return ChunkifyIter(fd, chunk_size, chunks)
  118. try:
  119. import _speedups
  120. checksum = _speedups.checksum
  121. roll_checksum = _speedups.roll_checksum
  122. py_chunkify = chunkify
  123. chunkify = _speedups.chunkify
  124. except ImportError:
  125. print 'Failed to load _speedups module, things will be slow'
  126. if __name__ == '__main__':
  127. import StringIO
  128. import doctest
  129. doctest.testmod()