ENH: Add strings_as_bytes option for df.to_records() (#18146)

qinghao1 · Chu Qinghao · commit 1a2ee8ff75af · 2018-08-08T11:01:57.000+08:00
This option changes DataFrame.to_records() dtype for string arrays
to 'Sx', where x is the length of the longest string, instead of 'O'.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -181,6 +181,7 @@ Other Enhancements
   The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
 - :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`)
 - :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
+- :meth:`DataFrame.to_records` now accepts a ``strings_as_bytes`` parameter to efficiently store strings as bytes dtype (``S``) instead of object dtype (``O``) (:issue:`18146`)
 
 .. _whatsnew_0240.api_breaking:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1335,7 +1335,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
 
         return cls(mgr)
 
-    def to_records(self, index=True, convert_datetime64=None):
+    def to_records(self, index=True, convert_datetime64=None,
+                   strings_as_bytes=False):
         """
         Convert DataFrame to a NumPy record array.
 
@@ -1351,6 +1352,11 @@ def to_records(self, index=True, convert_datetime64=None):
 
             Whether to convert the index to datetime.datetime if it is a
             DatetimeIndex.
+        strings_as_bytes : boolean, default False
+            .. versionadded:: 0.24.0
+
+            Store strings as bytes (``S`` dtype) instead of Python objects
+            (``O`` dtype).
 
         Returns
         -------
@@ -1401,6 +1407,24 @@ def to_records(self, index=True, convert_datetime64=None):
         rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
                    ('2018-01-01T09:01:00.000000000', 2, 0.75)],
                   dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
+
+        By default, strings are recorded as dtype `O` for object:
+
+        >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
+        ...                   index=['a', 'b'])
+        >>> df.to_records()
+        rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
+
+        This can be inefficient (e.g. for short strings, or when storing with
+        `np.save()`). They can be recorded as dtype `S` for zero-terminated
+        bytes instead:
+
+        >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
+        ...                   index=['a', 'b'])
+        >>> df.to_records(strings_as_bytes=True)
+        rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
+                  dtype=[('index', 'S1'), ('A', '<i8'), ('B', 'S4')])
         """
 
         if convert_datetime64 is not None:
@@ -1436,7 +1460,18 @@ def to_records(self, index=True, convert_datetime64=None):
             arrays = [self[c].get_values() for c in self.columns]
             names = lmap(compat.text_type, self.columns)
 
-        formats = [v.dtype for v in arrays]
+        if strings_as_bytes:
+            is_string = np.vectorize(
+                lambda s: isinstance(s, compat.string_types))
+            # GH18146
+            # for string arrays, set dtype as zero-terminated bytes with max
+            # length equal to that of the longest string
+            formats = ['S{}'.format(max(map(len, v)))
+                       if v.dtype == '|O' and is_string(v).all()
+                       else v.dtype
+                       for v in arrays]
+        else:
+            formats = [v.dtype for v in arrays]
         return np.rec.fromarrays(
             arrays,
             dtype={'names': names, 'formats': formats}
diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
@@ -186,6 +186,18 @@ def test_to_records_with_categorical(self):
                                 dtype=[('index', '=i8'), ('0', 'O')])
         tm.assert_almost_equal(result, expected)
 
+    def test_to_records_with_strings_as_bytes(self):
+
+        # GH18146
+
+        df = DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
+                       index=['a', 'b'])
+        result = df.to_records(strings_as_bytes=True)
+        expected = np.rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
+                                dtype=[('index', 'S1'), ('A', '<i8'),
+                                       ('B', 'S4')])
+        tm.assert_almost_equal(result, expected)
+
     @pytest.mark.parametrize('mapping', [
         dict,
         collections.defaultdict(list),