diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 9c4b408a1d24b..2df10592ab1af 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -53,6 +53,7 @@ Fixed Regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) +- Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`). - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) - diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f9f3041bef073..a2322348e1caa 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -671,30 +671,18 @@ def _set_names(self, names, level=None, validate=True): if level is None: level = range(self.nlevels) - used = {} else: level = [self._get_level_number(l) for l in level] - used = {self.levels[l].name: l - for l in set(range(self.nlevels)) - set(level)} # set the name for l, name in zip(level, names): if name is not None: - # GH 20527 # All items in 'names' need to be hashable: if not is_hashable(name): raise TypeError('{}.name must be a hashable type' .format(self.__class__.__name__)) - - if name in used: - raise ValueError( - 'Duplicated level name: "{}", assigned to ' - 'level {}, is already used for level ' - '{}.'.format(name, l, used[name])) - self.levels[l].rename(name, inplace=True) - used[name] = l names = property(fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") @@ -2893,6 +2881,13 @@ def isin(self, values, level=None): else: return np.lib.arraysetops.in1d(labs, sought_labels) + def _reference_duplicate_name(self, name): + """ + Returns True if the name refered to in self.names is duplicated. + """ + # count the times name equals an element in self.names. + return sum(name == n for n in self.names) > 1 + MultiIndex._add_numeric_methods_disabled() MultiIndex._add_numeric_methods_add_sub_disabled() diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2757e0797a410..3d9e84954a63b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -115,6 +115,12 @@ def __init__(self, values, index, level=-1, value_columns=None, self.index = index.remove_unused_levels() + if isinstance(self.index, MultiIndex): + if index._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The index " + "names are not unique.".format(level=level)) + raise ValueError(msg) + self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 @@ -528,6 +534,12 @@ def factorize(index): N, K = frame.shape + if isinstance(frame.columns, MultiIndex): + if frame.columns._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The column " + "names are not unique.".format(level=level)) + raise ValueError(msg) + # Will also convert negative level numbers and check if out of bounds. level_num = frame.columns._get_level_number(level) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 164d6746edec0..21961906c39bb 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -130,19 +130,27 @@ def test_set_index2(self): result = df.set_index(df.C) assert result.index.name == 'C' - @pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')]) + @pytest.mark.parametrize( + 'level', ['a', pd.Series(range(0, 8, 2), name='a')]) def test_set_index_duplicate_names(self, level): - # GH18872 + # GH18872 - GH19029 df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b']) # Pass an existing level name: df.index.name = 'a' - pytest.raises(ValueError, df.set_index, level, append=True) - pytest.raises(ValueError, df.set_index, [level], append=True) - - # Pass twice the same level name: - df.index.name = 'c' - pytest.raises(ValueError, df.set_index, [level, level]) + expected = pd.MultiIndex.from_tuples([(0, 0), (1, 2), (2, 4), (3, 6)], + names=['a', 'a']) + result = df.set_index(level, append=True) + tm.assert_index_equal(result.index, expected) + result = df.set_index([level], append=True) + tm.assert_index_equal(result.index, expected) + + # Pass twice the same level name (only works with passing actual data) + if isinstance(level, pd.Series): + result = df.set_index([level, level]) + expected = pd.MultiIndex.from_tuples( + [(0, 0), (2, 2), (4, 4), (6, 6)], names=['a', 'a']) + tm.assert_index_equal(result.index, expected) def test_set_index_nonuniq(self): df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], @@ -617,6 +625,19 @@ def test_reorder_levels(self): index=e_idx) assert_frame_equal(result, expected) + result = df.reorder_levels([0, 0, 0]) + e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], + labels=[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], + names=['L0', 'L0', 'L0']) + expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, + index=e_idx) + assert_frame_equal(result, expected) + + result = df.reorder_levels(['L0', 'L0', 'L0']) + assert_frame_equal(result, expected) + def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index d05321abefca6..ebf6c5e37b916 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -560,6 +560,16 @@ def test_unstack_dtypes(self): assert left.shape == (3, 2) tm.assert_frame_equal(left, right) + def test_unstack_non_unique_index_names(self): + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], + names=['c1', 'c1']) + df = DataFrame([1, 2], index=idx) + with pytest.raises(ValueError): + df.unstack('c1') + + with pytest.raises(ValueError): + df.T.stack('c1') + def test_unstack_unused_levels(self): # GH 17845: unused labels in index make unstack() cast int to float idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1] diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0fec6a8f96a24..cb76195eacf40 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -555,15 +555,11 @@ def test_as_index(): columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) - # another not in-axis grouper - s = Series(['a', 'b', 'b'], name='cat2') + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], name='cat') result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) - # GH18872: conflicting names in desired index - with pytest.raises(ValueError): - df.groupby(['cat', s.rename('cat')], observed=True).sum() - # is original index dropped? group_columns = ['cat', 'A'] expected = DataFrame( diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index c925c4c403960..1dc44677ab3ad 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -656,22 +656,27 @@ def test_constructor_nonhashable_names(self): # With .set_names() tm.assert_raises_regex(TypeError, message, mi.set_names, names=renamed) - @pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'], - ['1', 'a', '1']]) + @pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2], + [1, 'a', 1]]) def test_duplicate_level_names(self, names): - # GH18872 - pytest.raises(ValueError, pd.MultiIndex.from_product, - [[0, 1]] * 3, names=names) + # GH18872, GH19029 + mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names) + assert mi.names == names # With .rename() mi = pd.MultiIndex.from_product([[0, 1]] * 3) - tm.assert_raises_regex(ValueError, "Duplicated level name:", - mi.rename, names) + mi = mi.rename(names) + assert mi.names == names # With .rename(., level=) - mi.rename(names[0], level=1, inplace=True) - tm.assert_raises_regex(ValueError, "Duplicated level name:", - mi.rename, names[:2], level=[0, 2]) + mi.rename(names[1], level=1, inplace=True) + mi = mi.rename([names[0], names[2]], level=[0, 2]) + assert mi.names == names + + def test_duplicate_level_names_access_raises(self): + self.index.names = ['foo', 'foo'] + tm.assert_raises_regex(KeyError, 'Level foo not found', + self.index._get_level_number, 'foo') def assert_multiindex_copied(self, copy, original): # Levels should be (at least, shallow copied) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 29063b64221c1..865cab7a1596e 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1893,6 +1893,12 @@ def make_index(names=None): 'a', 'b'], index=make_index(['date', 'a', 't'])) pytest.raises(ValueError, store.append, 'df', df) + # dup within level + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'], + index=make_index(['date', 'date', 'date'])) + pytest.raises(ValueError, store.append, 'df', df) + # fully names _maybe_remove(store, 'df') df = DataFrame(np.zeros((12, 2)), columns=[ diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index ca95dde1a20c9..7e7e081408534 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1747,9 +1747,15 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): - # GH 13279, GH 18872 + # GH 13279 s = pd.Series(range(3), name='foo') - pytest.raises(ValueError, pd.crosstab, s, s) + + result = pd.crosstab(s, s) + expected_index = pd.Index(range(3), name='foo') + expected = pd.DataFrame(np.eye(3, dtype=np.int64), + index=expected_index, + columns=expected_index) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("names", [['a', ('b', 'c')], [('a', 'b'), 'c']])