Skip to content

Commit 66b517c

Browse files
jorisvandenbossche authored and jreback committed
API: re-allow duplicate index level names (#21423)
1 parent 8cb6be0 commit 66b517c

File tree

9 files changed

+90
-38
lines changed

9 files changed

+90
-38
lines changed

Diff for: doc/source/whatsnew/v0.23.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Fixed Regressions
5353
~~~~~~~~~~~~~~~~~
5454

5555
- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
56+
- Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`).
5657
- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`)
5758
- Fixed regression in unary negative operations with object dtype (:issue:`21380`)
5859
- Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`)

Diff for: pandas/core/indexes/multi.py

+7-12
Original file line numberDiff line numberDiff line change
@@ -671,30 +671,18 @@ def _set_names(self, names, level=None, validate=True):
671671

672672
if level is None:
673673
level = range(self.nlevels)
674-
used = {}
675674
else:
676675
level = [self._get_level_number(l) for l in level]
677-
used = {self.levels[l].name: l
678-
for l in set(range(self.nlevels)) - set(level)}
679676

680677
# set the name
681678
for l, name in zip(level, names):
682679
if name is not None:
683-
684680
# GH 20527
685681
# All items in 'names' need to be hashable:
686682
if not is_hashable(name):
687683
raise TypeError('{}.name must be a hashable type'
688684
.format(self.__class__.__name__))
689-
690-
if name in used:
691-
raise ValueError(
692-
'Duplicated level name: "{}", assigned to '
693-
'level {}, is already used for level '
694-
'{}.'.format(name, l, used[name]))
695-
696685
self.levels[l].rename(name, inplace=True)
697-
used[name] = l
698686

699687
names = property(fset=_set_names, fget=_get_names,
700688
doc="Names of levels in MultiIndex")
@@ -2893,6 +2881,13 @@ def isin(self, values, level=None):
28932881
else:
28942882
return np.lib.arraysetops.in1d(labs, sought_labels)
28952883

2884+
def _reference_duplicate_name(self, name):
2885+
"""
2886+
Returns True if the name referred to in self.names is duplicated.
2887+
"""
2888+
# count the times name equals an element in self.names.
2889+
return sum(name == n for n in self.names) > 1
2890+
28962891

28972892
MultiIndex._add_numeric_methods_disabled()
28982893
MultiIndex._add_numeric_methods_add_sub_disabled()

Diff for: pandas/core/reshape/reshape.py

+12
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ def __init__(self, values, index, level=-1, value_columns=None,
115115

116116
self.index = index.remove_unused_levels()
117117

118+
if isinstance(self.index, MultiIndex):
119+
if index._reference_duplicate_name(level):
120+
msg = ("Ambiguous reference to {level}. The index "
121+
"names are not unique.".format(level=level))
122+
raise ValueError(msg)
123+
118124
self.level = self.index._get_level_number(level)
119125

120126
# when index includes `nan`, need to lift levels/strides by 1
@@ -528,6 +534,12 @@ def factorize(index):
528534

529535
N, K = frame.shape
530536

537+
if isinstance(frame.columns, MultiIndex):
538+
if frame.columns._reference_duplicate_name(level):
539+
msg = ("Ambiguous reference to {level}. The column "
540+
"names are not unique.".format(level=level))
541+
raise ValueError(msg)
542+
531543
# Will also convert negative level numbers and check if out of bounds.
532544
level_num = frame.columns._get_level_number(level)
533545

Diff for: pandas/tests/frame/test_alter_axes.py

+29-8
Original file line numberDiff line numberDiff line change
@@ -130,19 +130,27 @@ def test_set_index2(self):
130130
result = df.set_index(df.C)
131131
assert result.index.name == 'C'
132132

133-
@pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')])
133+
@pytest.mark.parametrize(
134+
'level', ['a', pd.Series(range(0, 8, 2), name='a')])
134135
def test_set_index_duplicate_names(self, level):
135-
# GH18872
136+
# GH18872 - GH19029
136137
df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b'])
137138

138139
# Pass an existing level name:
139140
df.index.name = 'a'
140-
pytest.raises(ValueError, df.set_index, level, append=True)
141-
pytest.raises(ValueError, df.set_index, [level], append=True)
142-
143-
# Pass twice the same level name:
144-
df.index.name = 'c'
145-
pytest.raises(ValueError, df.set_index, [level, level])
141+
expected = pd.MultiIndex.from_tuples([(0, 0), (1, 2), (2, 4), (3, 6)],
142+
names=['a', 'a'])
143+
result = df.set_index(level, append=True)
144+
tm.assert_index_equal(result.index, expected)
145+
result = df.set_index([level], append=True)
146+
tm.assert_index_equal(result.index, expected)
147+
148+
# Pass twice the same level name (only works with passing actual data)
149+
if isinstance(level, pd.Series):
150+
result = df.set_index([level, level])
151+
expected = pd.MultiIndex.from_tuples(
152+
[(0, 0), (2, 2), (4, 4), (6, 6)], names=['a', 'a'])
153+
tm.assert_index_equal(result.index, expected)
146154

147155
def test_set_index_nonuniq(self):
148156
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
@@ -617,6 +625,19 @@ def test_reorder_levels(self):
617625
index=e_idx)
618626
assert_frame_equal(result, expected)
619627

628+
result = df.reorder_levels([0, 0, 0])
629+
e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']],
630+
labels=[[0, 0, 0, 0, 0, 0],
631+
[0, 0, 0, 0, 0, 0],
632+
[0, 0, 0, 0, 0, 0]],
633+
names=['L0', 'L0', 'L0'])
634+
expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)},
635+
index=e_idx)
636+
assert_frame_equal(result, expected)
637+
638+
result = df.reorder_levels(['L0', 'L0', 'L0'])
639+
assert_frame_equal(result, expected)
640+
620641
def test_reset_index(self):
621642
stacked = self.frame.stack()[::2]
622643
stacked = DataFrame({'foo': stacked, 'bar': stacked})

Diff for: pandas/tests/frame/test_reshape.py

+10
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,16 @@ def test_unstack_dtypes(self):
560560
assert left.shape == (3, 2)
561561
tm.assert_frame_equal(left, right)
562562

563+
def test_unstack_non_unique_index_names(self):
564+
idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
565+
names=['c1', 'c1'])
566+
df = DataFrame([1, 2], index=idx)
567+
with pytest.raises(ValueError):
568+
df.unstack('c1')
569+
570+
with pytest.raises(ValueError):
571+
df.T.stack('c1')
572+
563573
def test_unstack_unused_levels(self):
564574
# GH 17845: unused labels in index make unstack() cast int to float
565575
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]

Diff for: pandas/tests/groupby/test_categorical.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -555,15 +555,11 @@ def test_as_index():
555555
columns=['cat', 'A', 'B'])
556556
tm.assert_frame_equal(result, expected)
557557

558-
# another not in-axis grouper
559-
s = Series(['a', 'b', 'b'], name='cat2')
558+
# another not in-axis grouper (conflicting names in index)
559+
s = Series(['a', 'b', 'b'], name='cat')
560560
result = df.groupby(['cat', s], as_index=False, observed=True).sum()
561561
tm.assert_frame_equal(result, expected)
562562

563-
# GH18872: conflicting names in desired index
564-
with pytest.raises(ValueError):
565-
df.groupby(['cat', s.rename('cat')], observed=True).sum()
566-
567563
# is original index dropped?
568564
group_columns = ['cat', 'A']
569565
expected = DataFrame(

Diff for: pandas/tests/indexes/test_multi.py

+15-10
Original file line numberDiff line numberDiff line change
@@ -656,22 +656,27 @@ def test_constructor_nonhashable_names(self):
656656
# With .set_names()
657657
tm.assert_raises_regex(TypeError, message, mi.set_names, names=renamed)
658658

659-
@pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'],
660-
['1', 'a', '1']])
659+
@pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
660+
[1, 'a', 1]])
661661
def test_duplicate_level_names(self, names):
662-
# GH18872
663-
pytest.raises(ValueError, pd.MultiIndex.from_product,
664-
[[0, 1]] * 3, names=names)
662+
# GH18872, GH19029
663+
mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names)
664+
assert mi.names == names
665665

666666
# With .rename()
667667
mi = pd.MultiIndex.from_product([[0, 1]] * 3)
668-
tm.assert_raises_regex(ValueError, "Duplicated level name:",
669-
mi.rename, names)
668+
mi = mi.rename(names)
669+
assert mi.names == names
670670

671671
# With .rename(., level=)
672-
mi.rename(names[0], level=1, inplace=True)
673-
tm.assert_raises_regex(ValueError, "Duplicated level name:",
674-
mi.rename, names[:2], level=[0, 2])
672+
mi.rename(names[1], level=1, inplace=True)
673+
mi = mi.rename([names[0], names[2]], level=[0, 2])
674+
assert mi.names == names
675+
676+
def test_duplicate_level_names_access_raises(self):
677+
self.index.names = ['foo', 'foo']
678+
tm.assert_raises_regex(KeyError, 'Level foo not found',
679+
self.index._get_level_number, 'foo')
675680

676681
def assert_multiindex_copied(self, copy, original):
677682
# Levels should be (at least, shallow copied)

Diff for: pandas/tests/io/test_pytables.py

+6
Original file line numberDiff line numberDiff line change
@@ -1893,6 +1893,12 @@ def make_index(names=None):
18931893
'a', 'b'], index=make_index(['date', 'a', 't']))
18941894
pytest.raises(ValueError, store.append, 'df', df)
18951895

1896+
# dup within level
1897+
_maybe_remove(store, 'df')
1898+
df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'],
1899+
index=make_index(['date', 'date', 'date']))
1900+
pytest.raises(ValueError, store.append, 'df', df)
1901+
18961902
# fully names
18971903
_maybe_remove(store, 'df')
18981904
df = DataFrame(np.zeros((12, 2)), columns=[

Diff for: pandas/tests/reshape/test_pivot.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1747,9 +1747,15 @@ def test_crosstab_with_numpy_size(self):
17471747
tm.assert_frame_equal(result, expected)
17481748

17491749
def test_crosstab_dup_index_names(self):
1750-
# GH 13279, GH 18872
1750+
# GH 13279
17511751
s = pd.Series(range(3), name='foo')
1752-
pytest.raises(ValueError, pd.crosstab, s, s)
1752+
1753+
result = pd.crosstab(s, s)
1754+
expected_index = pd.Index(range(3), name='foo')
1755+
expected = pd.DataFrame(np.eye(3, dtype=np.int64),
1756+
index=expected_index,
1757+
columns=expected_index)
1758+
tm.assert_frame_equal(result, expected)
17531759

17541760
@pytest.mark.parametrize("names", [['a', ('b', 'c')],
17551761
[('a', 'b'), 'c']])

0 commit comments

Comments
 (0)