#!/usr/local/bin/python
# encoding: utf-8
"""
*Iterate through large line-based files in batches of lines*

:Author:
    David Young
"""
from builtins import range
from builtins import object
import sys
import os
os.environ['TERM'] = 'vt100'
from fundamentals import tools
import codecs

class fileChunker(object):
    """
    *The fileChunker iterator - iterate over large line-based files to reduce memory footprint*

    **Key Arguments**

    - ``filepath`` -- path to the large file to iterate over
    - ``batchSize`` -- the size of each chunk to return, in lines

    **Usage**

    To set up your logger, settings and database connections, please use the ``fundamentals`` package (`see tutorial here <http://fundamentals.readthedocs.io/en/latest/#tutorial>`_).

    To initiate a fileChunker iterator and then process the file in batches of 100000 lines, use the following:

    ```python
    from fundamentals.files import fileChunker
    fc = fileChunker(
        filepath="/path/to/large/file.csv",
        batchSize=100000
    )
    for i in fc:
        print(len(i))
    ```
    """

    def __init__(self, filepath, batchSize):
        self.filepath = filepath
        self.batchSize = batchSize

        # open the file for reading; fail early with a clear message if the
        # path cannot be opened
        try:
            self.readFile = codecs.open(
                self.filepath, encoding='utf-8', mode='r')
        except IOError:
            message = 'could not open the file %s' % (self.filepath,)
            raise IOError(message)

    def __iter__(self):
        return self

    def __next__(self):
        # read up to `batchSize` lines from the file; stop early once the end
        # of the file is reached
        batch = []
        for i in range(self.batchSize):
            line = self.readFile.readline()
            if len(line):
                batch.append(line)
            else:
                break
        # no lines left to read - the iteration is complete
        if len(batch) == 0:
            raise StopIteration

        return batch
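
# A minimal usage sketch, not part of the fileChunker source: it mirrors the
# docstring example above, assuming a hypothetical line-based file at
# "/path/to/large/file.csv", and counts the lines processed batch by batch
# without ever holding the whole file in memory.
if __name__ == "__main__":
    fc = fileChunker(
        filepath="/path/to/large/file.csv",
        batchSize=100000
    )
    totalLines = 0
    for batch in fc:
        # each `batch` is a list of at most 100000 lines
        totalLines += len(batch)
    print("processed %s lines" % (totalLines,))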