Skip to content

Commit 55adfb7

Browse files
committed
ENH: add chunksize parameter to DataFrame.to_csv to enable constant memory usage
by writing in chunks
1 parent d78f4f6 commit 55adfb7

File tree

3 files changed

+63
-25
lines changed

3 files changed

+63
-25
lines changed

RELEASE.rst

+2
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ pandas 0.11.0
6262
strings that can be parsed with datetime.strptime
6363
- Add ``axes`` property to ``Series`` for compatibility
6464
- Add ``xs`` function to ``Series`` for compatibility
65+
- Add ``chunksize`` parameter to ``to_csv`` to allow writing in chunks
66+
to enable constant memory usage
6567

6668
**API Changes**
6769

pandas/core/frame.py

+45-25
Original file line numberDiff line numberDiff line change
@@ -1291,31 +1291,11 @@ def to_panel(self):
12911291

12921292
def _helper_csv(self, writer, na_rep=None, cols=None,
12931293
header=True, index=True,
1294-
index_label=None, float_format=None):
1294+
index_label=None, float_format=None,
1295+
chunksize=None):
12951296
if cols is None:
12961297
cols = self.columns
12971298

1298-
series = {}
1299-
for k, v in self._series.iteritems():
1300-
mask = isnull(v)
1301-
imask = -mask
1302-
if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]':
1303-
values = np.empty(len(v),dtype=object)
1304-
values[mask] = 'NaT'
1305-
1306-
if v.dtype == 'datetime64[ns]':
1307-
values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object)
1308-
elif v.dtype == 'timedelta64[ns]':
1309-
values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object)
1310-
else:
1311-
values = np.array(v.values,dtype=object)
1312-
values[mask] = na_rep
1313-
if issubclass(v.dtype.type,np.floating):
1314-
if float_format:
1315-
values[imask] = np.array([ float_format % val for val in v[imask] ])
1316-
1317-
series[k] = values.tolist()
1318-
13191299
has_aliases = isinstance(header, (tuple, list, np.ndarray))
13201300
if has_aliases or header:
13211301
if index:
@@ -1365,12 +1345,50 @@ def _helper_csv(self, writer, na_rep=None, cols=None,
13651345
if not index:
13661346
nlevels = 0
13671347

1368-
lib.write_csv_rows(series, list(data_index), nlevels, list(cols), writer)
1348+
rows = len(data_index)
1349+
1350+
# write in chunksize bites
1351+
if chunksize is None:
1352+
chunksize = 100000
1353+
chunks = int(rows / chunksize)+1
1354+
1355+
for i in xrange(chunks):
1356+
start_i = i * chunksize
1357+
end_i = min((i + 1) * chunksize, rows)
1358+
if start_i == end_i:
1359+
continue
1360+
1361+
# create the data for a chunk
1362+
chunk = self.iloc[start_i:end_i]
1363+
1364+
series = {}
1365+
for k, v in chunk.iteritems():
1366+
mask = isnull(v)
1367+
imask = -mask
1368+
1369+
if v.dtype == 'datetime64[ns]' or v.dtype == 'timedelta64[ns]':
1370+
values = np.empty(len(v),dtype=object)
1371+
values[mask] = 'NaT'
1372+
1373+
if v.dtype == 'datetime64[ns]':
1374+
values[imask] = np.array([ val._repr_base for val in v[imask] ],dtype=object)
1375+
elif v.dtype == 'timedelta64[ns]':
1376+
values[imask] = np.array([ lib.repr_timedelta64(val) for val in v[imask] ],dtype=object)
1377+
else:
1378+
values = np.array(v.values,dtype=object)
1379+
values[mask] = na_rep
1380+
if issubclass(v.dtype.type,np.floating):
1381+
if float_format:
1382+
values[imask] = np.array([ float_format % val for val in v[imask] ])
1383+
1384+
series[k] = values.tolist()
1385+
1386+
lib.write_csv_rows(series, list(data_index[start_i:end_i]), nlevels, list(cols), writer)
13691387

13701388
def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
13711389
cols=None, header=True, index=True, index_label=None,
13721390
mode='w', nanRep=None, encoding=None, quoting=None,
1373-
line_terminator='\n'):
1391+
line_terminator='\n', chunksize=None):
13741392
"""
13751393
Write DataFrame to a comma-separated values (csv) file
13761394
@@ -1407,6 +1425,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
14071425
file
14081426
quoting : optional constant from csv module
14091427
defaults to csv.QUOTE_MINIMAL
1428+
chunksize : rows to write at a time
14101429
"""
14111430
if nanRep is not None: # pragma: no cover
14121431
import warnings
@@ -1435,7 +1454,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
14351454
self._helper_csv(csvout, na_rep=na_rep,
14361455
float_format=float_format, cols=cols,
14371456
header=header, index=index,
1438-
index_label=index_label)
1457+
index_label=index_label,
1458+
chunksize=chunksize)
14391459

14401460
finally:
14411461
if close:

pandas/tests/test_frame.py

+16
Original file line numberDiff line numberDiff line change
@@ -4593,6 +4593,22 @@ def create_cols(name):
45934593
assert_frame_equal(rs, df)
45944594
os.remove(filename)
45954595

4596+
def test_to_csv_chunking(self):
4597+
filename = '__tmp_to_csv_chunking__.csv'
4598+
4599+
aa=DataFrame({'A':range(100000)})
4600+
4601+
aa['B'] = aa.A + 1.0
4602+
aa['C'] = aa.A + 2.0
4603+
aa['D'] = aa.A + 3.0
4604+
4605+
for chunksize in [10000,50000,100000]:
4606+
aa.to_csv(filename,chunksize=chunksize)
4607+
rs = pan.read_csv(filename,index_col=0)
4608+
assert_frame_equal(rs, aa)
4609+
4610+
os.remove(filename)
4611+
45964612
def test_to_csv_bug(self):
45974613
path = '__tmp_to_csv_bug__.csv'
45984614
f1 = StringIO('a,1.0\nb,2.0')

0 commit comments

Comments
 (0)