Coverage for fundamentals/files/fileChunker.py: 0%

#!/usr/local/bin/python
# encoding: utf-8
"""
*Iterate through large line-based files in batches of lines*

:Author:
    David Young
"""
from builtins import range
from builtins import object
import sys
import os
os.environ['TERM'] = 'vt100'
from fundamentals import tools
import codecs


class fileChunker(object):
    """
    *The fileChunker iterator - iterate over large line-based files to reduce memory footprint*

    **Key Arguments**

    - ``filepath`` -- path to the large file to iterate over
    - ``batchSize`` -- size of each chunk to return, in lines

    **Usage**

    To set up your logger, settings and database connections, please use the ``fundamentals`` package (`see tutorial here <http://fundamentals.readthedocs.io/en/latest/#tutorial>`_).

    To initiate a fileChunker iterator and then process the file in batches of 100000 lines, use the following:

    ```python
    from fundamentals.files import fileChunker
    fc = fileChunker(
        filepath="/path/to/large/file.csv",
        batchSize=100000
    )
    for i in fc:
        print(len(i))
    ```
    """

    def __init__(self, filepath, batchSize):
        self.filepath = filepath
        self.batchSize = batchSize
        try:
            self.readFile = codecs.open(
                self.filepath, encoding='utf-8', mode='r')
        except IOError:
            message = 'could not open the file %s' % (self.filepath,)
            raise IOError(message)

    def __iter__(self):
        return self

    def __next__(self):
        # collect up to `batchSize` lines, stopping early at end-of-file
        batch = []
        for _ in range(self.batchSize):
            line = self.readFile.readline()
            if not line:
                break
            batch.append(line)
        if len(batch) == 0:
            raise StopIteration
        return batch
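
A minimal, self-contained sketch of driving the iterator end-to-end. The temporary-file path and the 1000-line fixture below are illustrative assumptions, not part of the module:

```python
import codecs
import os
import tempfile

from fundamentals.files import fileChunker

# write a small line-based fixture file (hypothetical demo data)
path = os.path.join(tempfile.gettempdir(), "fileChunker_demo.csv")
with codecs.open(path, encoding='utf-8', mode='w') as writeFile:
    for i in range(1000):
        writeFile.write("row %d\n" % (i,))

# iterate in batches of 300 lines; the final, partial chunk is 100 lines
fc = fileChunker(filepath=path, batchSize=300)
for batch in fc:
    print(len(batch))  # prints 300, 300, 300, 100
```

Because each chunk is a plain list of lines, memory use is bounded by ``batchSize`` rather than by the size of the file.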