# chunkifier.py 2.4 KB

  1. def checksum(data, sum=0):
  2. """Simple but fast checksum that can be updated at either end.
  3. >>> checksum('FOOBAR')
  4. 102367679
  5. >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO'))
  6. True
  7. """
  8. s1 = sum & 0xffff
  9. s2 = sum >> 16
  10. for c in data:
  11. s1 += ord(c) + 1
  12. s2 += s1
  13. return ((s2 & 0xffff) << 16) + (s1 & 0xffff)
  14. def roll_checksum(sum, remove, add, len):
  15. """
  16. >>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR')
  17. True
  18. """
  19. s1 = sum & 0xffff
  20. s2 = sum >> 16
  21. add = ord(add)
  22. remove = ord(remove)
  23. s1 -= remove - add
  24. s2 -= len * (remove + 1) - s1
  25. return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
  26. def chunkify(fd, chunk_size, chunks):
  27. """
  28. >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
  29. >>> list(chunkify(fd, 4, {}))
  30. ['ABCD', 'EFGH', 'IJ', 'KLMN']
  31. >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
  32. >>> chunks = {44564754: True} # 'BCDE'
  33. >>> list(chunkify(fd, 4, chunks))
  34. ['A', 'BCDE', 'FGHI', 'J', 'KLMN']
  35. >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
  36. >>> chunks = {44564754: True, 48496938: True} # 'BCDE', 'HIJK'
  37. >>> list(chunkify(fd, 4, chunks))
  38. ['A', 'BCDE', 'FG', 'HIJK', 'LMN']
  39. >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
  40. >>> chunks = {43909390: True, 50463030: True} # 'ABCD', 'KLMN'
  41. >>> list(chunkify(fd, 4, chunks))
  42. ['ABCD', 'EFGH', 'IJ', 'KLMN']
  43. """
  44. data = 'X' + fd.read(chunk_size * 3)
  45. i = 1
  46. sum = checksum(data[:chunk_size])
  47. while True:
  48. if len(data) - i <= chunk_size * 2:
  49. data += fd.read(chunk_size * 2)
  50. if i == chunk_size + 1:
  51. yield data[1:chunk_size + 1]
  52. i = 1
  53. data = data[chunk_size:]
  54. if len(data) - i <= chunk_size: # EOF?
  55. if len(data) > chunk_size + 1:
  56. yield data[1:len(data) - chunk_size]
  57. yield data[-chunk_size:]
  58. else:
  59. yield data[1:]
  60. return
  61. sum = roll_checksum(sum, data[i - 1], data[i - 1 + chunk_size], chunk_size)
  62. #print data[i:i + chunk_size], sum
  63. if chunks.get(sum):
  64. if i > 1:
  65. yield data[1:i]
  66. yield data[i:i + chunk_size]
  67. data = data[i + chunk_size - 1:]
  68. i = 0
  69. sum = checksum(data[:chunk_size])
  70. i += 1
  71. if __name__ == '__main__':
  72. import StringIO
  73. import doctest
  74. doctest.testmod()