Merge pull request #8663 from billletson/strsplit

jorisvandenbossche · jorisvandenbossche · commit 5cf3d85a7d4c · 2014-10-30T00:45:11.000+01:00
ENH: Series.str.split can return a DataFrame instead of Series of lists
diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt
@@ -109,6 +109,7 @@ Enhancements
 
 - Added support for 3-character ISO and non-standard country codes in :func:``io.wb.download()`` (:issue:`8482`)
 - :ref:`World Bank data requests <remote_data.wb>` now will warn/raise based on an ``errors`` argument, as well as a list of hard-coded country codes and the World Bank's JSON response.  In prior versions, the error messages didn't look at the World Bank's JSON response.  Problem-inducing input were simply dropped prior to the request.  The issue was that many good countries were cropped in the hard-coded approach.  All countries will work now, but some bad countries will raise exceptions because some edge cases break the entire response. (:issue:`8482`)
+- Added ``return_type`` option to ``Series.str.split()`` to return a ``DataFrame`` (``return_type='frame'``) rather than a ``Series`` of lists (:issue:`8428`)
 
 .. _whatsnew_0151.performance:
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -621,7 +621,7 @@ def str_center(arr, width):
     return str_pad(arr, width, side='both')
 
 
-def str_split(arr, pat=None, n=None):
+def str_split(arr, pat=None, n=None, return_type='series'):
     """
     Split each string (a la re.split) in array by given pattern, propagating NA
     values
@@ -631,6 +631,9 @@ def str_split(arr, pat=None, n=None):
     pat : string, default None
         String or regular expression to split on. If None, splits on whitespace
     n : int, default None (all)
+    return_type : {'series', 'frame'}, default 'series'
+        If frame, returns a DataFrame (elements are strings)
+        If series, returns a Series (elements are lists of strings).
 
     Notes
     -----
@@ -640,6 +643,8 @@ def str_split(arr, pat=None, n=None):
     -------
     split : array
     """
+    if return_type not in ('series', 'frame'):
+        raise ValueError("return_type must be {'series', 'frame'}")
     if pat is None:
         if n is None or n == 0:
             n = -1
@@ -654,8 +659,11 @@ def str_split(arr, pat=None, n=None):
                 n = 0
             regex = re.compile(pat)
             f = lambda x: regex.split(x, maxsplit=n)
-
-    return _na_map(f, arr)
+    if return_type == 'frame':
+        res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
+    else:
+        res = _na_map(f, arr)
+    return res
 
 
 def str_slice(arr, start=None, stop=None, step=1):
@@ -937,8 +945,8 @@ def cat(self, others=None, sep=None, na_rep=None):
         return self._wrap_result(result)
 
     @copy(str_split)
-    def split(self, pat=None, n=-1):
-        result = str_split(self.series, pat, n=n)
+    def split(self, pat=None, n=-1, return_type='series'):
+        result = str_split(self.series, pat, n=n, return_type=return_type)
         return self._wrap_result(result)
 
     @copy(str_get)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -873,6 +873,34 @@ def test_split_no_pat_with_nonzero_n(self):
         expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']})
         tm.assert_series_equal(expected, result)
 
+    def test_split_to_dataframe(self):
+        s = Series(['nosplit', 'alsonosplit'])
+        result = s.str.split('_', return_type='frame')
+        exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
+        tm.assert_frame_equal(result, exp)
+
+        s = Series(['some_equal_splits', 'with_no_nans'])
+        result = s.str.split('_', return_type='frame')
+        exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
+                         2: ['splits', 'nans']})
+        tm.assert_frame_equal(result, exp)
+
+        s = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
+        result = s.str.split('_', return_type='frame')
+        exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'],
+                         2: ['splits', 'these'], 3: [NA, 'things'],
+                         4: [NA, 'is'], 5: [NA, 'not']})
+        tm.assert_frame_equal(result, exp)
+
+        s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
+        result = s.str.split('_', return_type='frame')
+        exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
+                        index=['preserve', 'me'])
+        tm.assert_frame_equal(result, exp)
+
+        with tm.assertRaisesRegexp(ValueError, "return_type must be"):
+            s.str.split('_', return_type="some_invalid_type")
+
     def test_pipe_failures(self):
         # #2119
         s = Series(['A|B|C'])