Skip to content

API: re-allow duplicate index level names #21423

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Fixed Regressions
~~~~~~~~~~~~~~~~~

- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
- Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`).
- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`)
-

Expand Down
19 changes: 7 additions & 12 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,30 +671,18 @@ def _set_names(self, names, level=None, validate=True):

if level is None:
level = range(self.nlevels)
used = {}
else:
level = [self._get_level_number(l) for l in level]
used = {self.levels[l].name: l
for l in set(range(self.nlevels)) - set(level)}

# set the name
for l, name in zip(level, names):
if name is not None:

# GH 20527
# All items in 'names' need to be hashable:
if not is_hashable(name):
raise TypeError('{}.name must be a hashable type'
.format(self.__class__.__name__))

if name in used:
raise ValueError(
'Duplicated level name: "{}", assigned to '
'level {}, is already used for level '
'{}.'.format(name, l, used[name]))

self.levels[l].rename(name, inplace=True)
used[name] = l

names = property(fset=_set_names, fget=_get_names,
doc="Names of levels in MultiIndex")
Expand Down Expand Up @@ -2893,6 +2881,13 @@ def isin(self, values, level=None):
else:
return np.lib.arraysetops.in1d(labs, sought_labels)

def _reference_duplicate_name(self, name):
"""
Returns True if the name refered to in self.names is duplicated.
"""
# count the times name equals an element in self.names.
return sum(name == n for n in self.names) > 1


MultiIndex._add_numeric_methods_disabled()
MultiIndex._add_numeric_methods_add_sub_disabled()
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,12 @@ def __init__(self, values, index, level=-1, value_columns=None,

self.index = index.remove_unused_levels()

if isinstance(self.index, MultiIndex):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather not add API here. why don't you just do this check inside _get_level_number itself, and then also add to Index so you don't also have to add an instance check?

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Jun 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you read my comment in the main thread?
This code above is exactly how it was before. As I said, I also think this is better in get_level_number, but propose to do a follow-up PR that can be merged for 0.24.0, as it gives an additional API change (change in error type)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then let's just wait till 0.24.0 to do this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like a good fix for the regression, why not merge it? It's no different that reverting a previous commit.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it is how it was for a long time, and this is fixing a regression so it is the point to do it now and not wait for 0.24.0.

I can certainly do the PR for 0.24.0 to further clean this up, but that will involve an API change so I will not do that in this PR.

if index._reference_duplicate_name(level):
msg = ("Ambiguous reference to {level}. The index "
"names are not unique.".format(level=level))
raise ValueError(msg)

self.level = self.index._get_level_number(level)

# when index includes `nan`, need to lift levels/strides by 1
Expand Down Expand Up @@ -528,6 +534,12 @@ def factorize(index):

N, K = frame.shape

if isinstance(frame.columns, MultiIndex):
if frame.columns._reference_duplicate_name(level):
msg = ("Ambiguous reference to {level}. The column "
"names are not unique.".format(level=level))
raise ValueError(msg)

# Will also convert negative level numbers and check if out of bounds.
level_num = frame.columns._get_level_number(level)

Expand Down
37 changes: 29 additions & 8 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,19 +130,27 @@ def test_set_index2(self):
result = df.set_index(df.C)
assert result.index.name == 'C'

@pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')])
@pytest.mark.parametrize(
'level', ['a', pd.Series(range(0, 8, 2), name='a')])
def test_set_index_duplicate_names(self, level):
# GH18872
# GH18872 - GH19029
df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b'])

# Pass an existing level name:
df.index.name = 'a'
pytest.raises(ValueError, df.set_index, level, append=True)
pytest.raises(ValueError, df.set_index, [level], append=True)

# Pass twice the same level name:
df.index.name = 'c'
pytest.raises(ValueError, df.set_index, [level, level])
expected = pd.MultiIndex.from_tuples([(0, 0), (1, 2), (2, 4), (3, 6)],
names=['a', 'a'])
result = df.set_index(level, append=True)
tm.assert_index_equal(result.index, expected)
result = df.set_index([level], append=True)
tm.assert_index_equal(result.index, expected)

# Pass twice the same level name (only works with passing actual data)
if isinstance(level, pd.Series):
result = df.set_index([level, level])
expected = pd.MultiIndex.from_tuples(
[(0, 0), (2, 2), (4, 4), (6, 6)], names=['a', 'a'])
tm.assert_index_equal(result.index, expected)

def test_set_index_nonuniq(self):
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
Expand Down Expand Up @@ -617,6 +625,19 @@ def test_reorder_levels(self):
index=e_idx)
assert_frame_equal(result, expected)

result = df.reorder_levels([0, 0, 0])
e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']],
labels=[[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]],
names=['L0', 'L0', 'L0'])
expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)},
index=e_idx)
assert_frame_equal(result, expected)

result = df.reorder_levels(['L0', 'L0', 'L0'])
assert_frame_equal(result, expected)

def test_reset_index(self):
stacked = self.frame.stack()[::2]
stacked = DataFrame({'foo': stacked, 'bar': stacked})
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/frame/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,16 @@ def test_unstack_dtypes(self):
assert left.shape == (3, 2)
tm.assert_frame_equal(left, right)

def test_unstack_non_unique_index_names(self):
idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
names=['c1', 'c1'])
df = DataFrame([1, 2], index=idx)
with pytest.raises(ValueError):
df.unstack('c1')

with pytest.raises(ValueError):
df.T.stack('c1')

def test_unstack_unused_levels(self):
# GH 17845: unused labels in index make unstack() cast int to float
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
Expand Down
8 changes: 2 additions & 6 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,15 +555,11 @@ def test_as_index():
columns=['cat', 'A', 'B'])
tm.assert_frame_equal(result, expected)

# another not in-axis grouper
s = Series(['a', 'b', 'b'], name='cat2')
# another not in-axis grouper (conflicting names in index)
s = Series(['a', 'b', 'b'], name='cat')
result = df.groupby(['cat', s], as_index=False, observed=True).sum()
tm.assert_frame_equal(result, expected)

# GH18872: conflicting names in desired index
with pytest.raises(ValueError):
df.groupby(['cat', s.rename('cat')], observed=True).sum()

# is original index dropped?
group_columns = ['cat', 'A']
expected = DataFrame(
Expand Down
25 changes: 15 additions & 10 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,22 +656,27 @@ def test_constructor_nonhashable_names(self):
# With .set_names()
tm.assert_raises_regex(TypeError, message, mi.set_names, names=renamed)

@pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'],
['1', 'a', '1']])
@pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
[1, 'a', 1]])
def test_duplicate_level_names(self, names):
# GH18872
pytest.raises(ValueError, pd.MultiIndex.from_product,
[[0, 1]] * 3, names=names)
# GH18872, GH19029
mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names)
assert mi.names == names

# With .rename()
mi = pd.MultiIndex.from_product([[0, 1]] * 3)
tm.assert_raises_regex(ValueError, "Duplicated level name:",
mi.rename, names)
mi = mi.rename(names)
assert mi.names == names

# With .rename(., level=)
mi.rename(names[0], level=1, inplace=True)
tm.assert_raises_regex(ValueError, "Duplicated level name:",
mi.rename, names[:2], level=[0, 2])
mi.rename(names[1], level=1, inplace=True)
mi = mi.rename([names[0], names[2]], level=[0, 2])
assert mi.names == names

def test_duplicate_level_names_access_raises(self):
self.index.names = ['foo', 'foo']
tm.assert_raises_regex(KeyError, 'Level foo not found',
self.index._get_level_number, 'foo')

def assert_multiindex_copied(self, copy, original):
# Levels should be (at least, shallow copied)
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/io/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1893,6 +1893,12 @@ def make_index(names=None):
'a', 'b'], index=make_index(['date', 'a', 't']))
pytest.raises(ValueError, store.append, 'df', df)

# dup within level
_maybe_remove(store, 'df')
df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'],
index=make_index(['date', 'date', 'date']))
pytest.raises(ValueError, store.append, 'df', df)

# fully names
_maybe_remove(store, 'df')
df = DataFrame(np.zeros((12, 2)), columns=[
Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1747,9 +1747,15 @@ def test_crosstab_with_numpy_size(self):
tm.assert_frame_equal(result, expected)

def test_crosstab_dup_index_names(self):
# GH 13279, GH 18872
# GH 13279
s = pd.Series(range(3), name='foo')
pytest.raises(ValueError, pd.crosstab, s, s)

result = pd.crosstab(s, s)
expected_index = pd.Index(range(3), name='foo')
expected = pd.DataFrame(np.eye(3, dtype=np.int64),
index=expected_index,
columns=expected_index)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("names", [['a', ('b', 'c')],
[('a', 'b'), 'c']])
Expand Down