diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index fa7b945492d5d..2348f23a48955 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -23,7 +23,7 @@ Enhancements
 
 .. _whatsnew_1000.enhancements.other:
 
--
+- :class:`Series` gained a :meth:`Series.set_index` method, which facilitates the use of method chaining.
 -
 
 Other enhancements
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 418b3fc8c57d0..15cecb119b1ea 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1,7 +1,7 @@
 """
 Data structure for 1-dimensional cross-sectional and time series data
 """
-from collections import OrderedDict
+from collections import OrderedDict, abc
 from io import StringIO
 from shutil import get_terminal_size
 from textwrap import dedent
@@ -42,6 +42,8 @@
     ABCDataFrame,
     ABCDatetimeArray,
     ABCDatetimeIndex,
+    ABCIndexClass,
+    ABCMultiIndex,
     ABCSeries,
     ABCSparseArray,
     ABCSparseSeries,
@@ -67,6 +69,7 @@
     InvalidIndexError,
     MultiIndex,
     ensure_index,
+    ensure_index_from_sequences,
 )
 from pandas.core.indexers import maybe_convert_indices
 from pandas.core.indexes.accessors import CombinedDatetimelikeProperties
@@ -1417,6 +1420,137 @@ def _set_value(self, label, value, takeable: bool = False):
 
         return self
 
+    def set_index(self, labels, append=False, inplace=False, verify_integrity=False):
+        """
+        Set a new index for the Series.
+
+        This method can take either:
+
+        - an array-like to be used as labels for the new index. Its length
+          must match the length of the Series.
+        - a list of array-likes. The new index will be a MultiIndex and each
+          item in the list will serve as a level in it.
+
+        Parameters
+        ----------
+        labels : array-like or list of array-likes
+            Each array must have the same length as the calling Series.
+            If a list of array-likes is passed, the new index will be a
+            MultiIndex. Array-like in this context means a 1D pandas object
+            like Index/MultiIndex/Series, an ndarray or an abc.Iterator.
+            A plain list is always read as a list of array-likes, so a flat
+            list of scalar labels is rejected.
+        append : bool, default False
+            If True, convert the existing index to a MultiIndex and
+            add the new labels to it as a new level.
+        inplace : bool, default False
+            Modify the Series in place (do not create a new object).
+        verify_integrity : bool, default False
+            Check the new index for duplicates. Otherwise defer the check until
+            necessary. Setting to False will improve the performance of this
+            method.
+
+        Returns
+        -------
+        Series or None
+            Series with the new index, or None if ``inplace=True``.
+
+        Raises
+        ------
+        ValueError
+            If an item of `labels` is not a one-dimensional array-like, if
+            its length differs from the length of the Series, or if
+            ``verify_integrity=True`` and the new index has duplicates.
+
+        See Also
+        --------
+        Series.reset_index : Opposite of set_index.
+        Series.reindex : Change to new indices or expand indices.
+        Series.reindex_like : Change to same indices as another FrameOrSeries.
+
+        Examples
+        --------
+        >>> ser = pd.Series([1, 2, 3])
+        >>> ser.set_index(pd.Index(['D', 'E', 'F']))
+        D    1
+        E    2
+        F    3
+        dtype: int64
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if not isinstance(labels, list):
+            labels = [labels]
+
+        err_msg = (
+            'The parameter "labels" may be a one-dimensional array, '
+            "or a list containing only one-dimensional arrays."
+        )
+
+        for col in labels:
+            if isinstance(
+                col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator)
+            ):
+                # arrays are fine as long as they are one-dimensional
+                # iterators get converted to list below
+                if getattr(col, "ndim", 1) != 1:
+                    raise ValueError(err_msg)
+
+        arrays = []
+        names = []
+        if append:
+            # keep every level of the existing index in front of the new ones
+            names = list(self.index.names)
+            if isinstance(self.index, ABCMultiIndex):
+                arrays.extend(
+                    self.index._get_level_values(i) for i in range(self.index.nlevels)
+                )
+            else:
+                arrays.append(self.index)
+
+        for col in labels:
+            if isinstance(col, ABCMultiIndex):
+                arrays.extend(col._get_level_values(n) for n in range(col.nlevels))
+                names.extend(col.names)
+            elif isinstance(col, (ABCIndexClass, ABCSeries)):
+                # if Index then not MultiIndex (treated above)
+                arrays.append(col)
+                names.append(col.name)
+            elif isinstance(col, (list, np.ndarray)):
+                arrays.append(col)
+                names.append(None)
+            elif isinstance(col, abc.Iterator):
+                arrays.append(list(col))
+                names.append(None)
+            else:
+                # anything else (e.g. a scalar label) cannot serve as a level
+                raise ValueError("MultiIndex Levels must be array-like")
+
+            if len(arrays[-1]) != len(self):
+                # check newest element against length of calling ser, since
+                # ensure_index_from_sequences would not raise for append=False.
+                raise ValueError(
+                    "Length mismatch: Expected {len_self} rows, "
+                    "received array of length {len_col}".format(
+                        len_self=len(self), len_col=len(arrays[-1])
+                    )
+                )
+
+        index = ensure_index_from_sequences(arrays, names)
+
+        if verify_integrity and not index.is_unique:
+            duplicates = index[index.duplicated()].unique()
+            raise ValueError("Index has duplicate keys: {dup}".format(dup=duplicates))
+
+        # clear up memory usage
+        index._cleanup()
+
+        # copy only after the new index has been built and validated
+        ser = self if inplace else self.copy()
+        ser.index = index
+
+        if not inplace:
+            return ser
+
     def reset_index(self, level=None, drop=False, name=None, inplace=False):
         """
         Generate a new DataFrame or Series with the index reset.
diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py
index d204d7d2a1d7c..c86e71076e558 100644
--- a/pandas/tests/series/test_api.py
+++ b/pandas/tests/series/test_api.py
@@ -757,3 +757,69 @@ def test_dt_accessor_api_for_categorical(self):
         with pytest.raises(AttributeError, match=msg):
             invalid.dt
         assert not hasattr(invalid, "str")
+
+    @pytest.mark.parametrize("arrayklass", [iter, np.array, pd.Series, pd.Index])
+    def test_set_index(self, arrayklass):
+        ser = pd.Series([0, 1, 2])
+
+        res = ser.set_index(arrayklass(["A", "B", "C"]))
+        exp = pd.Series([0, 1, 2], index=["A", "B", "C"])
+        tm.assert_series_equal(res, exp)
+
+    @pytest.mark.parametrize("arrayklass", [iter, np.array, pd.Series, pd.Index])
+    def test_set_index_inplace(self, arrayklass):
+        ser = pd.Series([0, 1, 2])
+
+        # in-place modification returns None and mutates the caller
+        assert ser.set_index(arrayklass(["A", "B", "C"]), inplace=True) is None
+        exp = pd.Series([0, 1, 2], index=pd.Index(["A", "B", "C"]))
+        tm.assert_series_equal(ser, exp)
+
+    @pytest.mark.parametrize("arrayklass", [iter, np.array, pd.Series, pd.Index])
+    def test_set_index_verify_integrity(self, arrayklass):
+        ser = pd.Series([0, 1, 2])
+
+        with pytest.raises(ValueError, match="duplicate keys"):
+            ser.set_index(arrayklass(["A", "B", "B"]), verify_integrity=True)
+
+    def test_set_index_multiindex(self):
+        ser = pd.Series([0, 1, 2])
+        levels = [pd.Series(["A", "B", "C"]), pd.Series(["x", "y", "z"])]
+        ix = pd.MultiIndex.from_arrays(levels)
+
+        res = ser.set_index(levels)
+        exp = pd.Series([0, 1, 2], index=ix)
+        tm.assert_series_equal(res, exp)
+
+    def test_set_index_append(self):
+        ser = pd.Series([0, 1, 2])
+        labels = pd.Series(["A", "B", "C"])
+        ix = pd.MultiIndex.from_arrays([ser.index, labels])
+
+        res = ser.set_index(labels, append=True)
+        exp = pd.Series([0, 1, 2], index=ix)
+        tm.assert_series_equal(res, exp)
+
+    def test_set_index_append_multiindex(self):
+        ser = pd.Series([0, 1, 2])
+        level1 = pd.Series(["A", "B", "C"])
+        level2 = pd.Series(["X", "Y", "Z"])
+        ix = pd.MultiIndex.from_arrays([level1, level2])
+        exp_ix = pd.MultiIndex.from_arrays([ser.index, level1, level2])
+
+        res = ser.set_index(ix, append=True)
+        exp = pd.Series([0, 1, 2], index=exp_ix)
+        tm.assert_series_equal(res, exp)
+
+    def test_set_index_raises(self):
+        ser = pd.Series([0, 1, 2])
+
+        # a flat list is read as a list of levels, so scalars are rejected
+        with pytest.raises(ValueError, match="must be array"):
+            ser.set_index(["A", "B", "C"])
+
+        with pytest.raises(ValueError, match="Length mismatch"):
+            ser.set_index(pd.Index(["A", "B"]))
+
+        with pytest.raises(ValueError, match="one-dimensional"):
+            ser.set_index(np.zeros((3, 2)))