Skip to content

Commit 8ab863b

Browse files
committed
Rebased version of #22486
1 parent 99bae05 commit 8ab863b

File tree

7 files changed

+76
-37
lines changed

7 files changed

+76
-37
lines changed

Diff for: doc/source/whatsnew/v0.24.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,8 @@ Other API Changes
712712
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
713713
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
714714
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
715+
- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message,
716+
and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
715717
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
716718
- :class:`DateOffset` attribute ``_cacheable`` and method ``_should_cache`` have been removed (:issue:`23118`)
717719

Diff for: pandas/core/frame.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
from pandas.core.accessor import CachedAccessor
8383
from pandas.core.arrays import Categorical, ExtensionArray
8484
from pandas.core.config import get_option
85+
8586
from pandas.core.generic import NDFrame, _shared_docs
8687
from pandas.core.index import (Index, MultiIndex, ensure_index,
8788
ensure_index_from_sequences)
@@ -3963,7 +3964,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
39633964
7 2013 84
39643965
10 2014 31
39653966
3966-
Create a multi-index using columns 'year' and 'month':
3967+
Create a MultiIndex using columns 'year' and 'month':
39673968
39683969
>>> df.set_index(['year', 'month'])
39693970
sale
@@ -3973,7 +3974,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
39733974
2013 7 84
39743975
2014 10 31
39753976
3976-
Create a multi-index using a set of values and a column:
3977+
Create a MultiIndex using a set of values and a column:
39773978
39783979
>>> df.set_index([[1, 2, 3, 4], 'year'])
39793980
month sale
@@ -3986,6 +3987,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
39863987
if not isinstance(keys, list):
39873988
keys = [keys]
39883989

3990+
missing = []
3991+
for col in keys:
3992+
if (is_scalar(col) or isinstance(col, tuple)) and col in self:
3993+
# tuples can be either column keys or list-likes
3994+
# if they are valid column keys, everything is fine
3995+
continue
3996+
elif is_scalar(col) and col not in self:
3997+
# tuples that are not column keys are considered list-like,
3998+
# not considered missing
3999+
missing.append(col)
4000+
elif (not is_list_like(col) or isinstance(col, set)
4001+
or getattr(col, 'ndim', 1) > 1):
4002+
raise TypeError('The parameter "keys" may only contain a '
4003+
'combination of valid column keys and '
4004+
'one-dimensional list-likes')
4005+
4006+
if missing:
4007+
raise KeyError('{}'.format(missing))
4008+
39894009
vi = verify_integrity
39904010
return super(DataFrame, self).set_index(keys=keys, drop=drop,
39914011
append=append, inplace=inplace,

Diff for: pandas/core/generic.py

+13-10
Original file line numberDiff line numberDiff line change
@@ -699,7 +699,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
699699
7 2013 84
700700
10 2014 31
701701
702-
Create a multi-index using columns 'year' and 'month':
702+
Create a MultiIndex using columns 'year' and 'month':
703703
704704
>>> df.set_index(['year', 'month'])
705705
sale
@@ -709,7 +709,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
709709
2013 7 84
710710
2014 10 31
711711
712-
Create a multi-index using a set of values and a column:
712+
Create a MultiIndex using a set of values and a column:
713713
714714
>>> df.set_index([[1, 2, 3, 4], 'year'])
715715
month sale
@@ -741,18 +741,20 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
741741
for n in range(col.nlevels):
742742
arrays.append(col._get_level_values(n))
743743
names.extend(col.names)
744-
elif isinstance(col, ABCIndexClass):
745-
# Index but not MultiIndex (treated above)
744+
elif isinstance(col, (ABCIndexClass, ABCSeries)):
745+
# if Index then not MultiIndex (treated above)
746746
arrays.append(col)
747747
names.append(col.name)
748-
elif isinstance(col, ABCSeries):
749-
arrays.append(col._values)
750-
names.append(col.name)
751748
elif isinstance(col, (list, np.ndarray)):
752749
arrays.append(col)
753750
names.append(None)
754-
# from here, col can only be a column label (and obj a DataFrame);
755-
# see checks in Series.set_index and DataFrame.set_index
751+
elif (is_list_like(col)
752+
and not (isinstance(col, tuple) and col in self)):
753+
# all other list-likes (but avoid valid column keys)
754+
col = list(col) # ensure iterator do not get read twice etc.
755+
arrays.append(col)
756+
names.append(None)
757+
# from here, col can only be a column label
756758
else:
757759
arrays.append(obj[col]._values)
758760
names.append(col)
@@ -766,7 +768,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
766768
raise ValueError('Index has duplicate keys: {dup}'.format(
767769
dup=duplicates))
768770

769-
for c in to_remove:
771+
# use set to handle duplicate column names gracefully in case of drop
772+
for c in set(to_remove):
770773
del obj[c]
771774

772775
# clear up memory usage

Diff for: pandas/core/series.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -1136,15 +1136,15 @@ def set_index(self, arrays, append=False, inplace=False,
11361136
c 12
11371137
dtype: int64
11381138
1139-
Create a multi-index by appending to the existing index:
1139+
Create a MultiIndex by appending to the existing index:
11401140
11411141
>>> s.set_index(['a', 'b', 'c'], append=True)
11421142
0 a 10
11431143
1 b 11
11441144
2 c 12
11451145
dtype: int64
11461146
1147-
Create a multi-index by passing a list of arrays:
1147+
Create a MultiIndex by passing a list of arrays:
11481148
11491149
>>> t = (s ** 2).set_index([['a', 'b', 'c'], ['I', 'II', 'III']])
11501150
>>> t
@@ -1166,11 +1166,11 @@ def set_index(self, arrays, append=False, inplace=False,
11661166
elif all(is_scalar(x) for x in arrays):
11671167
arrays = [arrays]
11681168

1169-
if any(not isinstance(x, (ABCSeries, ABCIndexClass, list, np.ndarray))
1170-
for x in arrays):
1171-
raise TypeError('arrays must be Series, Index, MultiIndex, list, '
1172-
'np.ndarray or list containing only Series, '
1173-
'Index, MultiIndex, list, np.ndarray')
1169+
if any(not is_list_like(x) or isinstance(x, set)
1170+
or getattr(x, 'ndim', 1) > 1 for x in arrays):
1171+
raise TypeError('The parameter "arrays" may only contain a '
1172+
'combination of valid column keys and '
1173+
'one-dimensional list-likes')
11741174

11751175
return super(Series, self).set_index(keys=arrays, drop=False,
11761176
append=append, inplace=inplace,

Diff for: pandas/tests/frame/conftest.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -211,12 +211,13 @@ def frame_of_index_cols():
211211
"""
212212
Fixture for DataFrame of columns that can be used for indexing
213213
214-
Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but
215-
are jointly unique), the rest are unique.
214+
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
215+
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
216216
"""
217217
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
218218
'B': ['one', 'two', 'three', 'one', 'two'],
219219
'C': ['a', 'b', 'c', 'd', 'e'],
220220
'D': np.random.randn(5),
221-
'E': np.random.randn(5)})
221+
'E': np.random.randn(5),
222+
('tuple', 'as', 'label'): np.random.randn(5)})
222223
return df

Diff for: pandas/tests/frame/test_alter_axes.py

+23-11
Original file line numberDiff line numberDiff line change
@@ -186,18 +186,19 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
186186

187187
# == gives ambiguous Boolean for Series
188188
if drop and keys[0] is 'A' and keys[1] is 'A':
189-
with tm.assert_raises_regex(KeyError, '.*'):
190-
df.set_index(keys, drop=drop, append=append)
189+
# can't drop same column twice
190+
first_drop = False
191191
else:
192-
result = df.set_index(keys, drop=drop, append=append)
192+
first_drop = drop
193193

194-
# to test against already-tested behavior, we add sequentially,
195-
# hence second append always True; must wrap in list, otherwise
196-
# list-box will be illegal
197-
expected = df.set_index([keys[0]], drop=drop, append=append)
198-
expected = expected.set_index([keys[1]], drop=drop, append=True)
194+
# to test against already-tested behaviour, we add sequentially,
195+
# hence second append always True; must wrap in list, otherwise
196+
# list-box will be illegal
197+
expected = df.set_index([keys[0]], drop=first_drop, append=append)
198+
expected = expected.set_index([keys[1]], drop=drop, append=True)
199199

200-
tm.assert_frame_equal(result, expected)
200+
result = df.set_index(keys, drop=drop, append=append)
201+
tm.assert_frame_equal(result, expected)
201202

202203
@pytest.mark.parametrize('append', [True, False])
203204
@pytest.mark.parametrize('drop', [True, False])
@@ -229,13 +230,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
229230
def test_set_index_raise(self, frame_of_index_cols, drop, append):
230231
df = frame_of_index_cols
231232

232-
with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E
233+
with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
234+
# column names are A-E, as well as one tuple
233235
df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)
234236

235237
# non-existent key in list with arrays
236-
with tm.assert_raises_regex(KeyError, '.*'):
238+
with tm.assert_raises_regex(KeyError, 'X'):
237239
df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
238240

241+
msg = 'The parameter "keys" may only contain a combination of.*'
242+
# forbidden type, e.g. set
243+
with tm.assert_raises_regex(TypeError, msg):
244+
df.set_index(set(df['A']), drop=drop, append=append)
245+
246+
# forbidden type in list, e.g. set
247+
with tm.assert_raises_regex(TypeError, msg):
248+
df.set_index(['A', df['A'], set(df['A'])],
249+
drop=drop, append=append)
250+
239251
def test_construction_with_categorical_index(self):
240252
ci = tm.makeCategoricalIndex(10)
241253
ci.name = 'B'

Diff for: pandas/tests/series/test_alter_axes.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -131,13 +131,14 @@ def test_set_index_verify_integrity(self, string_series):
131131
string_series.set_index([idx, idx], verify_integrity=True)
132132

133133
def test_set_index_raise(self, string_series):
134-
# wrong type: iterator
135-
with tm.assert_raises_regex(TypeError, 'arrays must be.*'):
136-
string_series.set_index(iter(string_series.index),
134+
msg = 'The parameter "arrays" may only contain a combination.*'
135+
# forbidden type, e.g. set
136+
with tm.assert_raises_regex(TypeError, msg):
137+
string_series.set_index(set(string_series.index),
137138
verify_integrity=True)
138139

139140
# wrong type in list with arrays
140-
with tm.assert_raises_regex(TypeError, 'arrays must be.*'):
141+
with tm.assert_raises_regex(TypeError, msg):
141142
string_series.set_index([string_series.index, 'X'],
142143
verify_integrity=True)
143144

0 commit comments

Comments
 (0)