# chunkifier.py
  1. def checksum(data, sum=0):
  2. """Simple but fast checksum that can be updated at either end.
  3. >>> checksum('FOOBAR')
  4. 102367679
  5. >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO'))
  6. True
  7. """
  8. s1 = sum & 0xffff
  9. s2 = sum >> 16
  10. for c in data:
  11. s1 += ord(c) + 1
  12. s2 += s1
  13. return ((s2 & 0xffff) << 16) + (s1 & 0xffff)
  14. def roll_checksum(sum, remove, add, len):
  15. """
  16. >>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR')
  17. True
  18. """
  19. s1 = sum & 0xffff
  20. s2 = sum >> 16
  21. add = ord(add)
  22. remove = ord(remove)
  23. s1 -= remove - add
  24. s2 -= len * (remove + 1) - s1
  25. return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
  26. class ChunkifyIter(object):
  27. def __init__(self, fd, chunk_size, window_size):
  28. self.fd = fd
  29. self.chunk_size = chunk_size
  30. self.window_size = window_size
  31. self.buf_size = self.chunk_size * 10
  32. def __iter__(self):
  33. self.data = ''
  34. self.done = False
  35. self.i = 0
  36. self.sum = 0
  37. self.last = -1
  38. self.initial = self.window_size
  39. return self
  40. def next(self):
  41. if self.done:
  42. raise StopIteration
  43. while True:
  44. if self.i == self.buf_size:
  45. diff = self.last + 1 - self.window_size
  46. if diff < 0:
  47. import ipdb
  48. ipdb.set_trace()
  49. self.data = self.data[diff:]
  50. self.last -= diff
  51. self.i -= diff
  52. if self.i == len(self.data):
  53. self.data += self.fd.read(self.buf_size - len(self.data))
  54. if self.i == len(self.data):
  55. if self.last < self.i - 1:
  56. self.done = True
  57. return self.data[self.last + 1:]
  58. raise StopIteration
  59. if self.initial:
  60. self.initial -= 1
  61. self.sum = checksum(self.data[self.i], self.sum)
  62. else:
  63. self.sum = roll_checksum(self.sum,
  64. self.data[self.i - self.window_size],
  65. self.data[self.i],
  66. self.window_size)
  67. self.i += 1
  68. if self.i == self.buf_size and self.last == -1:
  69. old_last = self.last
  70. self.last = self.i - 1
  71. return self.data[old_last + 1:self.last + 1]
  72. elif self.sum % self.chunk_size == 0:
  73. old_last = self.last
  74. self.last = self.i - 1
  75. return self.data[old_last + 1:self.last + 1]
  76. def chunkify(fd, chunk_size, chunks):
  77. """
  78. >>> list(chunkify(StringIO.StringIO(''), 5, 3))
  79. []
  80. >>> list(chunkify(StringIO.StringIO('A'), 5, 3))
  81. ['A']
  82. >>> list(chunkify(StringIO.StringIO('AB'), 5, 3))
  83. ['AB']
  84. >>> list(chunkify(StringIO.StringIO('1B'), 5, 3))
  85. ['1', 'B']
  86. >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMNOPQ'), 5, 3))
  87. ['ABCD', 'EFGHI', 'JKLMN', 'OPQ']
  88. >>> list(chunkify(StringIO.StringIO('1ABCDEFGHIJKLMNOPQ'), 5, 3))
  89. ['1', 'ABCD', 'EFGHI', 'JKLMN', 'OPQ']
  90. >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQ'), 5, 3))
  91. ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQ']
  92. >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
  93. ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
  94. >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
  95. ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
  96. """
  97. return ChunkifyIter(fd, chunk_size, chunks)
  98. try:
  99. import _speedups
  100. checksum = _speedups.checksum
  101. roll_checksum = _speedups.roll_checksum
  102. py_chunkify = chunkify
  103. chunkify = _speedups.chunkify
  104. except ImportError:
  105. print 'Failed to load _speedups module, things will be slow'
  106. if __name__ == '__main__':
  107. import doctest
  108. import StringIO
  109. doctest.testmod()