Skip to content

Commit 1a2ee8f

Browse files
qinghao1Chu Qinghao
authored and
Chu Qinghao
committed
ENH: Add strings_as_bytes option for df.to_records() (#18146)
This option changes DataFrame.to_records() dtype for string arrays to 'Sx', where x is the length of the longest string, instead of 'O'.
1 parent 2156431 commit 1a2ee8f

File tree

3 files changed

+50
-2
lines changed

3 files changed

+50
-2
lines changed

Diff for: doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ Other Enhancements
181181
The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
182182
- :func:`to_timedelta` now supports ISO-formatted timedelta strings (:issue:`21877`)
183183
- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
184+
- :meth:`DataFrame.to_records` now accepts a ``strings_as_bytes`` parameter to efficiently store strings as bytes dtype (``S``) instead of object dtype (``O``) (:issue:`18146`)
184185

185186
.. _whatsnew_0240.api_breaking:
186187

Diff for: pandas/core/frame.py

+37-2
Original file line numberDiff line numberDiff line change
@@ -1335,7 +1335,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
13351335

13361336
return cls(mgr)
13371337

1338-
def to_records(self, index=True, convert_datetime64=None):
1338+
def to_records(self, index=True, convert_datetime64=None,
1339+
strings_as_bytes=False):
13391340
"""
13401341
Convert DataFrame to a NumPy record array.
13411342
@@ -1351,6 +1352,11 @@ def to_records(self, index=True, convert_datetime64=None):
13511352
13521353
Whether to convert the index to datetime.datetime if it is a
13531354
DatetimeIndex.
1355+
strings_as_bytes : boolean, default False
1356+
.. versionadded:: 0.24.0
1357+
1358+
Store strings as bytes (``S`` dtype) instead of Python objects
1359+
(``O`` dtype).
13541360
13551361
Returns
13561362
-------
@@ -1401,6 +1407,24 @@ def to_records(self, index=True, convert_datetime64=None):
14011407
rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
14021408
('2018-01-01T09:01:00.000000000', 2, 0.75)],
14031409
dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
1410+
1411+
By default, strings are recorded as dtype `O` for object:
1412+
1413+
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1414+
... index=['a', 'b'])
1415+
>>> df.to_records()
1416+
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1417+
dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
1418+
1419+
This can be inefficient (e.g. for short strings, or when storing with
1420+
`np.save()`). They can be recorded as dtype `S` for zero-terminated
1421+
bytes instead:
1422+
1423+
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1424+
... index=['a', 'b'])
1425+
>>> df.to_records(strings_as_bytes=True)
1426+
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1427+
dtype=[('index', 'S1'), ('A', '<i8'), ('B', 'S4')])
14041428
"""
14051429

14061430
if convert_datetime64 is not None:
@@ -1436,7 +1460,18 @@ def to_records(self, index=True, convert_datetime64=None):
14361460
arrays = [self[c].get_values() for c in self.columns]
14371461
names = lmap(compat.text_type, self.columns)
14381462

1439-
formats = [v.dtype for v in arrays]
1463+
if strings_as_bytes:
1464+
is_string = np.vectorize(
1465+
lambda s: isinstance(s, compat.string_types))
1466+
# GH18146
1467+
# for string arrays, set dtype as zero-terminated bytes with max
1468+
# length equal to that of the longest string
1469+
formats = ['S{}'.format(max(map(len, v)))
1470+
if v.dtype == '|O' and is_string(v).all()
1471+
else v.dtype
1472+
for v in arrays]
1473+
else:
1474+
formats = [v.dtype for v in arrays]
14401475
return np.rec.fromarrays(
14411476
arrays,
14421477
dtype={'names': names, 'formats': formats}

Diff for: pandas/tests/frame/test_convert_to.py

+12
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,18 @@ def test_to_records_with_categorical(self):
186186
dtype=[('index', '=i8'), ('0', 'O')])
187187
tm.assert_almost_equal(result, expected)
188188

189+
def test_to_records_with_strings_as_bytes(self):
190+
191+
# GH18146
192+
193+
df = DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
194+
index=['a', 'b'])
195+
result = df.to_records(strings_as_bytes=True)
196+
expected = np.rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
197+
dtype=[('index', 'S1'), ('A', '<i8'),
198+
('B', 'S4')])
199+
tm.assert_almost_equal(result, expected)
200+
189201
@pytest.mark.parametrize('mapping', [
190202
dict,
191203
collections.defaultdict(list),

0 commit comments

Comments
 (0)