|  | @@ -30,103 +30,79 @@ def roll_checksum(sum, remove, add, len):
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  class ChunkifyIter(object):
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    def __init__(self, fd, chunk_size, chunks):
 | 
	
		
			
				|  |  | +    def __init__(self, fd, chunk_size, window_size):
 | 
	
		
			
				|  |  |          self.fd = fd
 | 
	
		
			
				|  |  |          self.chunk_size = chunk_size
 | 
	
		
			
				|  |  | -        self.chunks = chunks
 | 
	
		
			
				|  |  | +        self.window_size = window_size
 | 
	
		
			
				|  |  | +        self.buf_size = self.chunk_size * 10
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      def __iter__(self):
 | 
	
		
			
				|  |  |          self.data = ''
 | 
	
		
			
				|  |  | -        self.i = 0
 | 
	
		
			
				|  |  | -        self.full_sum = True
 | 
	
		
			
				|  |  | -        self.extra = None
 | 
	
		
			
				|  |  |          self.done = False
 | 
	
		
			
				|  |  | -        self.buf_size = self.chunk_size * 10
 | 
	
		
			
				|  |  | +        self.i = 0
 | 
	
		
			
				|  |  | +        self.sum = 0
 | 
	
		
			
				|  |  | +        self.last = -1
 | 
	
		
			
				|  |  | +        self.initial = self.window_size
 | 
	
		
			
				|  |  |          return self
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      def next(self):
 | 
	
		
			
				|  |  | -        o = 0
 | 
	
		
			
				|  |  |          if self.done:
 | 
	
		
			
				|  |  |              raise StopIteration
 | 
	
		
			
				|  |  | -        if self.extra:
 | 
	
		
			
				|  |  | -            self.done = True
 | 
	
		
			
				|  |  | -            return self.extra
 | 
	
		
			
				|  |  |          while True:
 | 
	
		
			
				|  |  | -            if self.i >  self.buf_size - self.chunk_size:
 | 
	
		
			
				|  |  | -                self.data = self.data[self.i - o:]
 | 
	
		
			
				|  |  | -                self.i = o
 | 
	
		
			
				|  |  | -            if len(self.data) - self.i < self.chunk_size:
 | 
	
		
			
				|  |  | +            if self.i == self.buf_size:
 | 
	
		
			
				|  |  | +                diff = self.last + 1 - self.window_size
 | 
	
		
			
				|  |  | +                if diff < 0:
 | 
	
		
			
				|  |  | +                    import ipdb
 | 
	
		
			
				|  |  | +                    ipdb.set_trace()
 | 
	
		
			
				|  |  | +                self.data = self.data[diff:]
 | 
	
		
			
				|  |  | +                self.last -= diff
 | 
	
		
			
				|  |  | +                self.i -= diff
 | 
	
		
			
				|  |  | +            if self.i == len(self.data):
 | 
	
		
			
				|  |  |                  self.data += self.fd.read(self.buf_size - len(self.data))
 | 
	
		
			
				|  |  | -            if len(self.data) == self.i:
 | 
	
		
			
				|  |  | -                raise StopIteration
 | 
	
		
			
				|  |  | -            if len(self.data) - self.i < self.chunk_size:  # EOF?
 | 
	
		
			
				|  |  | -                if o == 1:
 | 
	
		
			
				|  |  | -                    self.done = True
 | 
	
		
			
				|  |  | -                    return self.data[self.i - 1:]
 | 
	
		
			
				|  |  | -                elif o > 1:
 | 
	
		
			
				|  |  | -                    self.extra = self.data[-self.chunk_size:]
 | 
	
		
			
				|  |  | -                    return self.data[-self.chunk_size - o + 1:-self.chunk_size]
 | 
	
		
			
				|  |  | -                else:
 | 
	
		
			
				|  |  | +            if self.i == len(self.data):
 | 
	
		
			
				|  |  | +                if self.last < self.i - 1:
 | 
	
		
			
				|  |  |                      self.done = True
 | 
	
		
			
				|  |  | -                    return self.data[self.i:]
 | 
	
		
			
				|  |  | -            elif o == self.chunk_size:
 | 
	
		
			
				|  |  | -                return self.data[self.i-self.chunk_size:self.i]
 | 
	
		
			
				|  |  | -            if self.full_sum or len(self.data) - self.i < self.chunk_size:
 | 
	
		
			
				|  |  | -                self.sum = checksum(self.data[self.i:self.i + self.chunk_size])
 | 
	
		
			
				|  |  | -                self.full_sum = False
 | 
	
		
			
				|  |  | -                self.remove = self.data[self.i]
 | 
	
		
			
				|  |  | -            else:
 | 
	
		
			
				|  |  | -                self.sum = roll_checksum(self.sum, self.remove, self.data[self.i + self.chunk_size - 1], 
 | 
	
		
			
				|  |  | -                                         self.chunk_size)
 | 
	
		
			
				|  |  | -                self.remove = self.data[self.i]
 | 
	
		
			
				|  |  | -            if self.sum in self.chunks:
 | 
	
		
			
				|  |  | -                if o > 0:
 | 
	
		
			
				|  |  | -                    chunk = self.data[self.i - o:self.i]
 | 
	
		
			
				|  |  | -                else:
 | 
	
		
			
				|  |  | -                    chunk = self.data[self.i:self.i + self.chunk_size]
 | 
	
		
			
				|  |  | -                    self.i += self.chunk_size
 | 
	
		
			
				|  |  | -                self.full_sum = True
 | 
	
		
			
				|  |  | -                return chunk
 | 
	
		
			
				|  |  | +                    return self.data[self.last + 1:]
 | 
	
		
			
				|  |  | +                raise StopIteration
 | 
	
		
			
				|  |  | +            if self.initial:
 | 
	
		
			
				|  |  | +                self.initial -= 1
 | 
	
		
			
				|  |  | +                self.sum = checksum(self.data[self.i], self.sum)
 | 
	
		
			
				|  |  |              else:
 | 
	
		
			
				|  |  | -                self.i += 1
 | 
	
		
			
				|  |  | -                o += 1
 | 
	
		
			
				|  |  | +                self.sum = roll_checksum(self.sum,
 | 
	
		
			
				|  |  | +                                         self.data[self.i - self.window_size],
 | 
	
		
			
				|  |  | +                                         self.data[self.i],
 | 
	
		
			
				|  |  | +                                         self.window_size)
 | 
	
		
			
				|  |  | +            self.i += 1
 | 
	
		
			
				|  |  | +            if self.i == self.buf_size and self.last == self.window_size - 1:
 | 
	
		
			
				|  |  | +                old_last = self.last
 | 
	
		
			
				|  |  | +                self.last = self.i - 1
 | 
	
		
			
				|  |  | +                return self.data[old_last + 1:self.last + 1]
 | 
	
		
			
				|  |  | +            elif self.sum % self.chunk_size == 0:
 | 
	
		
			
				|  |  | +                old_last = self.last
 | 
	
		
			
				|  |  | +                self.last = self.i - 1
 | 
	
		
			
				|  |  | +                return self.data[old_last + 1:self.last + 1]
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  def chunkify(fd, chunk_size, chunks):
 | 
	
		
			
				|  |  |      """
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('A'), 4, {}))
 | 
	
		
			
				|  |  | +    >>> list(chunkify(StringIO.StringIO(''), 5, 3))
 | 
	
		
			
				|  |  | +    []
 | 
	
		
			
				|  |  | +    >>> list(chunkify(StringIO.StringIO('A'), 5, 3))
 | 
	
		
			
				|  |  |      ['A']
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('AB'), 4, {}))
 | 
	
		
			
				|  |  | +    >>> list(chunkify(StringIO.StringIO('AB'), 5, 3))
 | 
	
		
			
				|  |  |      ['AB']
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABC'), 4, {}))
 | 
	
		
			
				|  |  | -    ['ABC']
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCD'), 4, {}))
 | 
	
		
			
				|  |  | -    ['ABCD']
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCDE'), 4, {}))
 | 
	
		
			
				|  |  | -    ['A', 'BCDE']
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCDEF'), 4, {}))
 | 
	
		
			
				|  |  | -    ['AB', 'CDEF']
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCDEFG'), 4, {}))
 | 
	
		
			
				|  |  | -    ['ABC', 'DEFG']
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCDEFGH'), 4, {}))
 | 
	
		
			
				|  |  | -    ['ABCD', 'EFGH']
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCDEFGHI'), 4, {}))
 | 
	
		
			
				|  |  | -    ['ABCD', 'E', 'FGHI']
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, {}))
 | 
	
		
			
				|  |  | -    ['ABCD', 'EFGH', 'IJ', 'KLMN']
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -    >>> chunks = {44564754: True} # 'BCDE'
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
 | 
	
		
			
				|  |  | -    ['A', 'BCDE', 'FGHI', 'J', 'KLMN']
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -    >>> chunks = {44564754: True, 48496938: True} # 'BCDE', 'HIJK'
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
 | 
	
		
			
				|  |  | -    ['A', 'BCDE', 'FG', 'HIJK', 'LMN']
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -    >>> chunks = {43909390: True, 50463030: True} # 'ABCD', 'KLMN'
 | 
	
		
			
				|  |  | -    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
 | 
	
		
			
				|  |  | -    ['ABCD', 'EFGH', 'IJ', 'KLMN']
 | 
	
		
			
				|  |  | +    >>> list(chunkify(StringIO.StringIO('1B'), 5, 3))
 | 
	
		
			
				|  |  | +    ['1', 'B']
 | 
	
		
			
				|  |  | +    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMNOPQ'), 5, 3))
 | 
	
		
			
				|  |  | +    ['ABCD', 'EFGHI', 'JKLMN', 'OPQ']
 | 
	
		
			
				|  |  | +    >>> list(chunkify(StringIO.StringIO('1ABCDEFGHIJKLMNOPQ'), 5, 3))
 | 
	
		
			
				|  |  | +    ['1', 'ABCD', 'EFGHI', 'JKLMN', 'OPQ']
 | 
	
		
			
				|  |  | +    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQ'), 5, 3))
 | 
	
		
			
				|  |  | +    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQ']
 | 
	
		
			
				|  |  | +    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
 | 
	
		
			
				|  |  | +    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
 | 
	
		
			
				|  |  | +    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
 | 
	
		
			
				|  |  | +    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
 | 
	
		
			
				|  |  |      """
 | 
	
		
			
				|  |  |      return ChunkifyIter(fd, chunk_size, chunks)
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -142,4 +118,5 @@ except ImportError:
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  if __name__ == '__main__':
 | 
	
		
			
				|  |  |      import doctest
 | 
	
		
			
				|  |  | +    import StringIO
 | 
	
		
			
				|  |  |      doctest.testmod()
 |