123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 |
def checksum(data, sum=0):
    """Simple but fast checksum that can be updated at either end.

    Two running totals are kept: ``s1`` is the sum of ``byte + 1`` over the
    input and ``s2`` is the sum of every intermediate value of ``s1``.  The
    low 16 bits of each are packed into one integer, ``s2`` in the upper
    half and ``s1`` in the lower half.

    :param data: the bytes to add, either as an iterable of one-character
        strings (a ``str``) or of integer byte values (a ``bytearray``, or
        ``bytes`` on Python 3).
    :param sum: a checksum previously returned by this function for the
        data immediately preceding ``data``; ``0`` starts a fresh one.
        (The name shadows the builtin but is kept for keyword callers.)
    :returns: the packed checksum as an integer.

    >>> checksum('FOOBAR')
    102367679
    >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO'))
    True
    >>> checksum(bytearray(b'FOOBAR'))
    102367679
    """
    s1 = sum & 0xffff
    s2 = sum >> 16
    for c in data:
        # Iterating a bytearray (or Python 3 bytes) already yields ints;
        # only one-character strings need converting with ord().
        if not isinstance(c, int):
            c = ord(c)
        s1 += c + 1
        s2 += s1
    # The totals are only truncated to 16 bits here, when packing.
    return ((s2 & 0xffff) << 16) + (s1 & 0xffff)
def roll_checksum(sum, remove, add, len):
    """Slide a ``checksum()`` window forward by one byte.

    Given the checksum of a window of ``len`` bytes, return the checksum of
    the window obtained by dropping its first byte and appending one new
    byte, without re-reading the bytes in between.

    :param sum: packed checksum of the current window.
    :param remove: the byte leaving the window (its first byte), as a
        one-character string or an integer byte value.
    :param add: the byte entering the window at the end, as a
        one-character string or an integer byte value.
    :param len: the window length in bytes.  (The name shadows the builtin
        but is kept for keyword callers.)
    :returns: the packed checksum of the shifted window.

    >>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR')
    True
    >>> roll_checksum(checksum('XFOOBA'), 88, 82, 6) == checksum('FOOBAR')
    True
    """
    s1 = sum & 0xffff
    s2 = sum >> 16
    # Accept integer byte values (e.g. from indexing a bytearray) as well
    # as one-character strings.
    if not isinstance(add, int):
        add = ord(add)
    if not isinstance(remove, int):
        remove = ord(remove)
    # s1 is a plain sum of (byte + 1), so the "+ 1" terms cancel.
    s1 -= remove - add
    # The removed byte was counted in all `len` partial sums making up s2;
    # the shifted window contributes its complete (already updated) s1.
    s2 -= len * (remove + 1) - s1
    return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
class ChunkifyIter(object):
    """Iterator that splits the data read from ``fd`` into chunks.

    A window of ``chunk_size`` bytes is slid over the data one byte at a
    time.  When the checksum of the window is found in ``chunks``, the bytes
    skipped to reach it (if any) are returned first and the window itself is
    returned as the following chunk.  When ``chunk_size`` bytes have been
    skipped without a match, those bytes are returned as a chunk.  At the
    end of the data the last ``chunk_size`` bytes form the final chunk,
    preceded by any shorter remainder.

    The class implements the Python 2 iterator protocol (``next``).  The
    iteration state is created by ``__iter__``, so ``iter()`` has to be
    called before ``next()``.

    >>> list(ChunkifyIter(StringIO.StringIO('AB'), 1, {}))
    ['A', 'B']
    """

    def __init__(self, fd, chunk_size, chunks):
        """Store the inputs; no data is read here.

        :param fd: object whose ``read(n)`` method returns a string.
        :param chunk_size: size of the checksum window.
        :param chunks: container of known window checksums; it is only
            queried with ``in``.
        """
        self.fd = fd
        self.chunk_size = chunk_size
        self.chunks = chunks

    def __iter__(self):
        """Reset the iteration state and return ``self``."""
        self.data = ''         # buffered input
        self.i = 0             # start of the current window in self.data
        self.full_sum = True   # compute the next checksum from scratch
        self.extra = None      # final chunk held back for the next call
        self.done = False      # True once the last chunk has been handed out
        self.buf_size = self.chunk_size * 10
        return self

    def next(self):
        """Return the next chunk.

        :returns: a slice of the buffered data.
        :raises StopIteration: when all data has been returned.
        """
        # Number of bytes stepped over during this call; they are still
        # unreturned and sit at self.data[self.i - o:self.i].
        o = 0
        if self.done:
            raise StopIteration
        if self.extra:
            # The previous call held back the final chunk.
            self.done = True
            return self.extra
        while True:
            if self.i > self.buf_size - self.chunk_size:
                # Close to the end of the buffer: discard what has already
                # been returned, keeping the `o` pending bytes.
                self.data = self.data[self.i - o:]
                self.i = o
            if len(self.data) - self.i < self.chunk_size:
                # Not enough data for a full window: try to refill.
                self.data += self.fd.read(self.buf_size - len(self.data))
            # Only stop when nothing is pending.  With chunk_size == 1 the
            # buffer can run dry while one skipped byte is still waiting to
            # be returned; that byte is handled by the o == 1 branch below.
            if o == 0 and len(self.data) == self.i:
                raise StopIteration
            if len(self.data) - self.i < self.chunk_size:  # EOF?
                # A single read that still leaves less than a full window
                # is treated as the end of the input.
                if o == 1:
                    # The pending byte plus the remainder is exactly the
                    # last full window.
                    self.done = True
                    return self.data[self.i - 1:]
                elif o > 1:
                    # Return the pending bytes in front of the last full
                    # window now and the window itself on the next call.
                    self.extra = self.data[-self.chunk_size:]
                    return self.data[-self.chunk_size - o + 1:-self.chunk_size]
                else:
                    # Nothing pending: the short remainder is the last chunk.
                    self.done = True
                    return self.data[self.i:]
            elif o == self.chunk_size:
                # A full chunk's worth of bytes was skipped without a match.
                return self.data[self.i - self.chunk_size:self.i]
            # From here on a complete window is available (every EOF branch
            # above returns), so the former extra length test was dropped.
            if self.full_sum:
                window = self.data[self.i:self.i + self.chunk_size]
                self.sum = checksum(window)
                self.full_sum = False
                self.remove = self.data[self.i]
            else:
                # self.remove is the first byte of the previous window; the
                # byte entering is the last byte of the current one.
                self.sum = roll_checksum(
                    self.sum, self.remove,
                    self.data[self.i + self.chunk_size - 1],
                    self.chunk_size)
                self.remove = self.data[self.i]
            if self.sum in self.chunks:
                if o > 0:
                    # Flush the skipped bytes first.  self.i stays put, so
                    # the matching window is returned by the next call.
                    chunk = self.data[self.i - o:self.i]
                else:
                    chunk = self.data[self.i:self.i + self.chunk_size]
                    self.i += self.chunk_size
                # Either way the rolling state no longer matches the next
                # window to be examined.
                self.full_sum = True
                return chunk
            else:
                self.i += 1
                o += 1
def chunkify(fd, chunk_size, chunks):
    """Return an iterator over the chunks of the data read from ``fd``.

    ``fd`` needs a ``read(n)`` method, ``chunk_size`` is the size of the
    checksum window and ``chunks`` holds the checksums of already known
    chunks (it is only queried with ``in``).

    >>> list(chunkify(StringIO.StringIO('A'), 4, {}))
    ['A']
    >>> list(chunkify(StringIO.StringIO('AB'), 4, {}))
    ['AB']
    >>> list(chunkify(StringIO.StringIO('ABC'), 4, {}))
    ['ABC']
    >>> list(chunkify(StringIO.StringIO('ABCD'), 4, {}))
    ['ABCD']
    >>> list(chunkify(StringIO.StringIO('ABCDE'), 4, {}))
    ['A', 'BCDE']
    >>> list(chunkify(StringIO.StringIO('ABCDEF'), 4, {}))
    ['AB', 'CDEF']
    >>> list(chunkify(StringIO.StringIO('ABCDEFG'), 4, {}))
    ['ABC', 'DEFG']
    >>> list(chunkify(StringIO.StringIO('ABCDEFGH'), 4, {}))
    ['ABCD', 'EFGH']
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHI'), 4, {}))
    ['ABCD', 'E', 'FGHI']
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, {}))
    ['ABCD', 'EFGH', 'IJ', 'KLMN']
    >>> chunks = {44564754: True} # 'BCDE'
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
    ['A', 'BCDE', 'FGHI', 'J', 'KLMN']
    >>> chunks = {44564754: True, 48496938: True} # 'BCDE', 'HIJK'
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
    ['A', 'BCDE', 'FG', 'HIJK', 'LMN']
    >>> chunks = {43909390: True, 50463030: True} # 'ABCD', 'KLMN'
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
    ['ABCD', 'EFGH', 'IJ', 'KLMN']
    """
    chunk_iter = ChunkifyIter(fd, chunk_size, chunks)
    return chunk_iter
# Replace the pure-Python functions with the ones from the optional
# _speedups module when it can be imported (presumably a faster
# implementation, judging by the fallback message below).
try:
    import _speedups
    checksum = _speedups.checksum
    roll_checksum = _speedups.roll_checksum
    # Keep the Python implementation reachable before rebinding the name.
    py_chunkify = chunkify
    chunkify = _speedups.chunkify
except ImportError:
    # Keep the definitions above.  Note that this prints at import time.
    print 'Failed to load _speedups module, things will be slow'
# Running the file as a script executes the doctests embedded in the
# docstrings of this module.
if __name__ == '__main__':
    # The chunkify doctests refer to StringIO.StringIO; no other visible
    # code uses this import.
    import StringIO
    import doctest
    doctest.testmod()
|