Skip to content

API: disallow duplicate level names #18882

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 29, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
@@ -184,6 +184,7 @@ Other API Changes
- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`)
- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`)
- All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`).
- Levels names of a ``MultiIndex`` (when not None) are now required to be unique: trying to create a ``MultiIndex`` with repeated names will raise a ``ValueError`` (:issue:`18872`)
- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`)
- :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`)
- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`)
15 changes: 8 additions & 7 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
@@ -579,23 +579,24 @@ def _set_names(self, names, level=None, validate=True):

if level is None:
level = range(self.nlevels)
used = {}
else:
level = [self._get_level_number(l) for l in level]
used = {self.levels[l].name: l
for l in set(range(self.nlevels)) - set(level)}

# set the name
for l, name in zip(level, names):
if name is not None and name in used:
raise ValueError('Duplicated level name: "{}", assigned to '
'level {}, is already used for level '
'{}.'.format(name, l, used[name]))
self.levels[l].rename(name, inplace=True)
used[name] = l

names = property(fset=_set_names, fget=_get_names,
doc="Names of levels in MultiIndex")

def _reference_duplicate_name(self, name):
"""
Returns True if the name refered to in self.names is duplicated.
"""
# count the times name equals an element in self.names.
return sum(name == n for n in self.names) > 1

def _format_native_types(self, na_rep='nan', **kwargs):
new_levels = []
new_labels = []
11 changes: 0 additions & 11 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
@@ -91,12 +91,6 @@ def __init__(self, values, index, level=-1, value_columns=None,

self.index = index

if isinstance(self.index, MultiIndex):
if index._reference_duplicate_name(level):
msg = ("Ambiguous reference to {level}. The index "
"names are not unique.".format(level=level))
raise ValueError(msg)

self.level = self.index._get_level_number(level)

# when index includes `nan`, need to lift levels/strides by 1
@@ -502,11 +496,6 @@ def factorize(index):
return categories, codes

N, K = frame.shape
if isinstance(frame.columns, MultiIndex):
if frame.columns._reference_duplicate_name(level):
msg = ("Ambiguous reference to {level}. The column "
"names are not unique.".format(level=level))
raise ValueError(msg)

# Will also convert negative level numbers and check if out of bounds.
level_num = frame.columns._get_level_number(level)
36 changes: 19 additions & 17 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
@@ -130,6 +130,20 @@ def test_set_index2(self):
result = df.set_index(df.C)
assert result.index.name == 'C'

@pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')])
def test_set_index_duplicate_names(self, level):
# GH18872
df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b'])

# Pass an existing level name:
df.index.name = 'a'
pytest.raises(ValueError, df.set_index, level, append=True)
pytest.raises(ValueError, df.set_index, [level], append=True)

# Pass twice the same level name:
df.index.name = 'c'
pytest.raises(ValueError, df.set_index, [level, level])

def test_set_index_nonuniq(self):
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
'B': ['one', 'two', 'three', 'one', 'two'],
@@ -591,19 +605,6 @@ def test_reorder_levels(self):
index=e_idx)
assert_frame_equal(result, expected)

result = df.reorder_levels([0, 0, 0])
e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']],
labels=[[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]],
names=['L0', 'L0', 'L0'])
expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)},
index=e_idx)
assert_frame_equal(result, expected)

result = df.reorder_levels(['L0', 'L0', 'L0'])
assert_frame_equal(result, expected)

def test_reset_index(self):
stacked = self.frame.stack()[::2]
stacked = DataFrame({'foo': stacked, 'bar': stacked})
@@ -831,7 +832,7 @@ def test_set_index_names(self):

mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B'])
mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values,
names=['A', 'B', 'A', 'B'])
names=['A', 'B', 'C', 'D'])

df = df.set_index(['A', 'B'])

@@ -843,13 +844,14 @@ def test_set_index_names(self):
# Check actual equality
tm.assert_index_equal(df.set_index(df.index).index, mi)

idx2 = df.index.rename(['C', 'D'])

# Check that [MultiIndex, MultiIndex] yields a MultiIndex rather
# than a pair of tuples
assert isinstance(df.set_index(
[df.index, df.index]).index, MultiIndex)
assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)

# Check equality
tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2)
tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)

def test_rename_objects(self):
renamed = self.mixed_frame.rename(columns=str.upper)
10 changes: 0 additions & 10 deletions pandas/tests/frame/test_reshape.py
Original file line number Diff line number Diff line change
@@ -560,16 +560,6 @@ def test_unstack_dtypes(self):
assert left.shape == (3, 2)
tm.assert_frame_equal(left, right)

def test_unstack_non_unique_index_names(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does this do now?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It raises when you try to create the MultiIndex with duplicated name.

idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
names=['c1', 'c1'])
df = DataFrame([1, 2], index=idx)
with pytest.raises(ValueError):
df.unstack('c1')

with pytest.raises(ValueError):
df.T.stack('c1')

def test_unstack_nan_index(self): # GH7466
cast = lambda val: '{0:1}'.format('' if val != val else val)
nan = np.nan
8 changes: 6 additions & 2 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
@@ -388,15 +388,19 @@ def test_groupby_multi_categorical_as_index(self):
columns=['cat', 'A', 'B'])
tm.assert_frame_equal(result, expected)

# another not in-axis grouper (conflicting names in index)
s = Series(['a', 'b', 'b'], name='cat')
# another not in-axis grouper
s = Series(['a', 'b', 'b'], name='cat2')
result = df.groupby(['cat', s], as_index=False).sum()
expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
'A': [10.0, nan, nan, 22.0, nan, nan],
'B': [101.0, nan, nan, 205.0, nan, nan]},
columns=['cat', 'A', 'B'])
tm.assert_frame_equal(result, expected)

# GH18872: conflicting names in desired index
pytest.raises(ValueError, lambda: df.groupby(['cat',
s.rename('cat')]).sum())

# is original index dropped?
expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
'A': [10, 11, 10, 11, 10, 11],
31 changes: 17 additions & 14 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
@@ -536,15 +536,6 @@ def test_names(self):
level_names = [level.name for level in index.levels]
assert ind_names == level_names

def test_reference_duplicate_name(self):
idx = MultiIndex.from_tuples(
[('a', 'b'), ('c', 'd')], names=['x', 'x'])
assert idx._reference_duplicate_name('x')

idx = MultiIndex.from_tuples(
[('a', 'b'), ('c', 'd')], names=['x', 'y'])
assert not idx._reference_duplicate_name('x')

def test_astype(self):
expected = self.index.copy()
actual = self.index.astype('O')
@@ -609,6 +600,23 @@ def test_constructor_mismatched_label_levels(self):
with tm.assert_raises_regex(ValueError, label_error):
self.index.copy().set_labels([[0, 0, 0, 0], [0, 0]])

@pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
[1, 'a', 1]])
def test_duplicate_level_names(self, names):
# GH18872
pytest.raises(ValueError, pd.MultiIndex.from_product,
[[0, 1]] * 3, names=names)

# With .rename()
mi = pd.MultiIndex.from_product([[0, 1]] * 3)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you check the error messages here, use tm.assert_raises_regex

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(done)

tm.assert_raises_regex(ValueError, "Duplicated level name:",
mi.rename, names)

# With .rename(., level=)
mi.rename(names[0], level=1, inplace=True)
tm.assert_raises_regex(ValueError, "Duplicated level name:",
mi.rename, names[:2], level=[0, 2])

def assert_multiindex_copied(self, copy, original):
# Levels should be (at least, shallow copied)
tm.assert_copy(copy.levels, original.levels)
@@ -667,11 +675,6 @@ def test_changing_names(self):
shallow_copy.names = [name + "c" for name in shallow_copy.names]
self.check_level_names(self.index, new_names)

def test_duplicate_names(self):
self.index.names = ['foo', 'foo']
tm.assert_raises_regex(KeyError, 'Level foo not found',
self.index._get_level_number, 'foo')

def test_get_level_number_integer(self):
self.index.names = [1, 0]
assert self.index._get_level_number(1) == 0
4 changes: 2 additions & 2 deletions pandas/tests/io/formats/test_to_latex.py
Original file line number Diff line number Diff line change
@@ -589,8 +589,8 @@ def test_to_latex_no_bold_rows(self):
"""
assert observed == expected

@pytest.mark.parametrize('name0', [None, 'named'])
@pytest.mark.parametrize('name1', [None, 'named'])
@pytest.mark.parametrize('name0', [None, 'named0'])
@pytest.mark.parametrize('name1', [None, 'named1'])
@pytest.mark.parametrize('axes', [[0], [1], [0, 1]])
def test_to_latex_multiindex_names(self, name0, name1, axes):
# GH 18667
6 changes: 0 additions & 6 deletions pandas/tests/io/test_pytables.py
Original file line number Diff line number Diff line change
@@ -1908,12 +1908,6 @@ def make_index(names=None):
'a', 'b'], index=make_index(['date', 'a', 't']))
pytest.raises(ValueError, store.append, 'df', df)

# dup within level
_maybe_remove(store, 'df')
df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'],
index=make_index(['date', 'date', 'date']))
pytest.raises(ValueError, store.append, 'df', df)

# fully names
_maybe_remove(store, 'df')
df = DataFrame(np.zeros((12, 2)), columns=[
9 changes: 2 additions & 7 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
@@ -1623,14 +1623,9 @@ def test_crosstab_with_numpy_size(self):
tm.assert_frame_equal(result, expected)

def test_crosstab_dup_index_names(self):
# GH 13279
# GH 13279, GH 18872
s = pd.Series(range(3), name='foo')
result = pd.crosstab(s, s)
expected_index = pd.Index(range(3), name='foo')
expected = pd.DataFrame(np.eye(3, dtype=np.int64),
index=expected_index,
columns=expected_index)
tm.assert_frame_equal(result, expected)
pytest.raises(ValueError, pd.crosstab, s, s)

@pytest.mark.parametrize("names", [['a', ('b', 'c')],
[('a', 'b'), 'c']])
11 changes: 0 additions & 11 deletions pandas/tests/series/test_alter_axes.py
Original file line number Diff line number Diff line change
@@ -214,17 +214,6 @@ def test_reorder_levels(self):
expected = Series(np.arange(6), index=e_idx)
assert_series_equal(result, expected)

result = s.reorder_levels([0, 0, 0])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test that raises

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is the new test_set_index_duplicate_names (the constructor itself raises).

e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']],
labels=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]],
names=['L0', 'L0', 'L0'])
expected = Series(np.arange(6), index=e_idx)
assert_series_equal(result, expected)

result = s.reorder_levels(['L0', 'L0', 'L0'])
assert_series_equal(result, expected)

def test_rename_axis_inplace(self):
# GH 15704
series = self.ts.copy()