Skip to content

Commit 5cf3d85

Browse files
Merge pull request #8663 from billletson/strsplit
ENH: Series.str.split can return a DataFrame instead of Series of lists
2 parents 6bbb39e + 9b45d74 commit 5cf3d85

File tree

3 files changed

+42
-5
lines changed

3 files changed

+42
-5
lines changed

Diff for: doc/source/whatsnew/v0.15.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ Enhancements
109109

110110
- Added support for 3-character ISO and non-standard country codes in :func:`io.wb.download()` (:issue:`8482`)
111111
- :ref:`World Bank data requests <remote_data.wb>` now will warn/raise based on an ``errors`` argument, as well as a list of hard-coded country codes and the World Bank's JSON response. In prior versions, the error messages didn't look at the World Bank's JSON response. Problem-inducing inputs were simply dropped prior to the request. The issue was that many good countries were cropped in the hard-coded approach. All countries will work now, but some bad countries will raise exceptions because some edge cases break the entire response. (:issue:`8482`)
112+
- Added option to ``Series.str.split()`` to return a ``DataFrame`` rather than a ``Series`` (:issue:`8428`)
112113

113114
.. _whatsnew_0151.performance:
114115

Diff for: pandas/core/strings.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -621,7 +621,7 @@ def str_center(arr, width):
621621
return str_pad(arr, width, side='both')
622622

623623

624-
def str_split(arr, pat=None, n=None):
624+
def str_split(arr, pat=None, n=None, return_type='series'):
625625
"""
626626
Split each string (a la re.split) in array by given pattern, propagating NA
627627
values
@@ -631,6 +631,9 @@ def str_split(arr, pat=None, n=None):
631631
pat : string, default None
632632
String or regular expression to split on. If None, splits on whitespace
633633
n : int, default None (all)
634+
return_type : {'series', 'frame'}, default 'series'
635+
If frame, returns a DataFrame (elements are strings)
636+
If series, returns a Series (elements are lists of strings).
634637
635638
Notes
636639
-----
@@ -640,6 +643,8 @@ def str_split(arr, pat=None, n=None):
640643
-------
641644
split : array
642645
"""
646+
if return_type not in ('series', 'frame'):
647+
raise ValueError("return_type must be {'series', 'frame'}")
643648
if pat is None:
644649
if n is None or n == 0:
645650
n = -1
@@ -654,8 +659,11 @@ def str_split(arr, pat=None, n=None):
654659
n = 0
655660
regex = re.compile(pat)
656661
f = lambda x: regex.split(x, maxsplit=n)
657-
658-
return _na_map(f, arr)
662+
if return_type == 'frame':
663+
res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
664+
else:
665+
res = _na_map(f, arr)
666+
return res
659667

660668

661669
def str_slice(arr, start=None, stop=None, step=1):
@@ -937,8 +945,8 @@ def cat(self, others=None, sep=None, na_rep=None):
937945
return self._wrap_result(result)
938946

939947
@copy(str_split)
940-
def split(self, pat=None, n=-1):
941-
result = str_split(self.series, pat, n=n)
948+
def split(self, pat=None, n=-1, return_type='series'):
949+
result = str_split(self.series, pat, n=n, return_type=return_type)
942950
return self._wrap_result(result)
943951

944952
@copy(str_get)

Diff for: pandas/tests/test_strings.py

+28
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,34 @@ def test_split_no_pat_with_nonzero_n(self):
873873
expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']})
874874
tm.assert_series_equal(expected, result)
875875

876+
def test_split_to_dataframe(self):
877+
s = Series(['nosplit', 'alsonosplit'])
878+
result = s.str.split('_', return_type='frame')
879+
exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
880+
tm.assert_frame_equal(result, exp)
881+
882+
s = Series(['some_equal_splits', 'with_no_nans'])
883+
result = s.str.split('_', return_type='frame')
884+
exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
885+
2: ['splits', 'nans']})
886+
tm.assert_frame_equal(result, exp)
887+
888+
s = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
889+
result = s.str.split('_', return_type='frame')
890+
exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'],
891+
2: ['splits', 'these'], 3: [NA, 'things'],
892+
4: [NA, 'is'], 5: [NA, 'not']})
893+
tm.assert_frame_equal(result, exp)
894+
895+
s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
896+
result = s.str.split('_', return_type='frame')
897+
exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
898+
index=['preserve', 'me'])
899+
tm.assert_frame_equal(result, exp)
900+
901+
with tm.assertRaisesRegexp(ValueError, "return_type must be"):
902+
s.str.split('_', return_type="some_invalid_type")
903+
876904
def test_pipe_failures(self):
877905
# #2119
878906
s = Series(['A|B|C'])

0 commit comments

Comments
 (0)