Rebased version of #22486

h-vetinari · h-vetinari · commit 8ab863b8453e · 2018-10-16T08:27:52.000+02:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -712,6 +712,8 @@ Other API Changes
 - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
 - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
 - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
+- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message,
+  and no longer fails when the same column name is passed more than once with ``drop=True`` (:issue:`22484`)
 - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -82,6 +82,7 @@
 from pandas.core.accessor import CachedAccessor
 from pandas.core.arrays import Categorical, ExtensionArray
 from pandas.core.config import get_option
+
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.index import (Index, MultiIndex, ensure_index,
                                ensure_index_from_sequences)
@@ -3963,7 +3964,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         7      2013    84
         10     2014    31
 
-        Create a multi-index using columns 'year' and 'month':
+        Create a MultiIndex using columns 'year' and 'month':
 
         >>> df.set_index(['year', 'month'])
                     sale
@@ -3973,7 +3974,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         2013  7     84
         2014  10    31
 
-        Create a multi-index using a set of values and a column:
+        Create a MultiIndex using a set of values and a column:
 
         >>> df.set_index([[1, 2, 3, 4], 'year'])
                  month  sale
@@ -3986,6 +3987,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         if not isinstance(keys, list):
             keys = [keys]
 
+        missing = []
+        for col in keys:
+            if (is_scalar(col) or isinstance(col, tuple)) and col in self:
+                # tuples can be either column keys or list-likes;
+                # if they are valid column keys, everything is fine
+                continue
+            elif is_scalar(col) and col not in self:
+                # only scalars are reported as missing; tuples that are
+                # not column keys are considered list-like instead
+                missing.append(col)
+            elif (not is_list_like(col) or isinstance(col, set)
+                  or getattr(col, 'ndim', 1) > 1):
+                raise TypeError('The parameter "keys" may only contain a '
+                                'combination of valid column keys and '
+                                'one-dimensional list-likes')
+
+        if missing:
+            raise KeyError('{}'.format(missing))
+
         vi = verify_integrity
         return super(DataFrame, self).set_index(keys=keys, drop=drop,
                                                 append=append, inplace=inplace,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -699,7 +699,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         7      2013    84
         10     2014    31
 
-        Create a multi-index using columns 'year' and 'month':
+        Create a MultiIndex using columns 'year' and 'month':
 
         >>> df.set_index(['year', 'month'])
                     sale
@@ -709,7 +709,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         2013  7     84
         2014  10    31
 
-        Create a multi-index using a set of values and a column:
+        Create a MultiIndex using a set of values and a column:
 
         >>> df.set_index([[1, 2, 3, 4], 'year'])
                  month  sale
@@ -741,18 +741,20 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
                 for n in range(col.nlevels):
                     arrays.append(col._get_level_values(n))
                 names.extend(col.names)
-            elif isinstance(col, ABCIndexClass):
-                # Index but not MultiIndex (treated above)
+            elif isinstance(col, (ABCIndexClass, ABCSeries)):
+                # if Index then not MultiIndex (treated above)
                 arrays.append(col)
                 names.append(col.name)
-            elif isinstance(col, ABCSeries):
-                arrays.append(col._values)
-                names.append(col.name)
             elif isinstance(col, (list, np.ndarray)):
                 arrays.append(col)
                 names.append(None)
-            # from here, col can only be a column label (and obj a DataFrame);
-            # see checks in Series.set_index and DataFrame.set_index
+            elif (is_list_like(col)
+                  and not (isinstance(col, tuple) and col in self)):
+                # all other list-likes (but avoid valid column keys)
+                col = list(col)  # ensure iterators are not consumed twice
+                arrays.append(col)
+                names.append(None)
+            # from here, col can only be a column label
             else:
                 arrays.append(obj[col]._values)
                 names.append(col)
@@ -766,7 +768,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
             raise ValueError('Index has duplicate keys: {dup}'.format(
                 dup=duplicates))
 
-        for c in to_remove:
+        # use set to handle duplicate column names gracefully in case of drop
+        for c in set(to_remove):
             del obj[c]
 
         # clear up memory usage
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1136,15 +1136,15 @@ def set_index(self, arrays, append=False, inplace=False,
         c    12
         dtype: int64
 
-        Create a multi-index by appending to the existing index:
+        Create a MultiIndex by appending to the existing index:
 
         >>> s.set_index(['a', 'b', 'c'], append=True)
         0  a    10
         1  b    11
         2  c    12
         dtype: int64
 
-        Create a multi-index by passing a list of arrays:
+        Create a MultiIndex by passing a list of arrays:
 
         >>> t = (s ** 2).set_index([['a', 'b', 'c'], ['I', 'II', 'III']])
         >>> t
@@ -1166,11 +1166,11 @@ def set_index(self, arrays, append=False, inplace=False,
         elif all(is_scalar(x) for x in arrays):
             arrays = [arrays]
 
-        if any(not isinstance(x, (ABCSeries, ABCIndexClass, list, np.ndarray))
-               for x in arrays):
-            raise TypeError('arrays must be Series, Index, MultiIndex, list, '
-                            'np.ndarray or list containing only Series, '
-                            'Index, MultiIndex, list, np.ndarray')
+        if any(not is_list_like(x) or isinstance(x, set)
+               or getattr(x, 'ndim', 1) > 1 for x in arrays):
+            raise TypeError('The parameter "arrays" may only contain a '
+                            'combination of valid column keys and '
+                            'one-dimensional list-likes')
 
         return super(Series, self).set_index(keys=arrays, drop=False,
                                              append=append, inplace=inplace,
diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py
@@ -211,12 +211,13 @@ def frame_of_index_cols():
     """
     Fixture for DataFrame of columns that can be used for indexing
 
-    Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but
-    are jointly unique), the rest are unique.
+    Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
+    'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
     """
     df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
                     'B': ['one', 'two', 'three', 'one', 'two'],
                     'C': ['a', 'b', 'c', 'd', 'e'],
                     'D': np.random.randn(5),
-                    'E': np.random.randn(5)})
+                    'E': np.random.randn(5),
+                    ('tuple', 'as', 'label'): np.random.randn(5)})
     return df
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
@@ -186,18 +186,19 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
 
         # == gives ambiguous Boolean for Series
         if drop and keys[0] is 'A' and keys[1] is 'A':
-            with tm.assert_raises_regex(KeyError, '.*'):
-                df.set_index(keys, drop=drop, append=append)
+            # can't drop same column twice
+            first_drop = False
         else:
-            result = df.set_index(keys, drop=drop, append=append)
+            first_drop = drop
 
-            # to test against already-tested behavior, we add sequentially,
-            # hence second append always True; must wrap in list, otherwise
-            # list-box will be illegal
-            expected = df.set_index([keys[0]], drop=drop, append=append)
-            expected = expected.set_index([keys[1]], drop=drop, append=True)
+        # to test against already-tested behaviour, we add sequentially,
+        # hence second append always True; must wrap in list, otherwise
+        # list-box will be illegal
+        expected = df.set_index([keys[0]], drop=first_drop, append=append)
+        expected = expected.set_index([keys[1]], drop=drop, append=True)
 
-            tm.assert_frame_equal(result, expected)
+        result = df.set_index(keys, drop=drop, append=append)
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize('append', [True, False])
     @pytest.mark.parametrize('drop', [True, False])
@@ -229,13 +230,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
     def test_set_index_raise(self, frame_of_index_cols, drop, append):
         df = frame_of_index_cols
 
-        with tm.assert_raises_regex(KeyError, '.*'):  # column names are A-E
+        with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
+            # column names are A-E, as well as one tuple
             df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)
 
         # non-existent key in list with arrays
-        with tm.assert_raises_regex(KeyError, '.*'):
+        with tm.assert_raises_regex(KeyError, 'X'):
             df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
 
+        msg = 'The parameter "keys" may only contain a combination of.*'
+        # forbidden type, e.g. set
+        with tm.assert_raises_regex(TypeError, msg):
+            df.set_index(set(df['A']), drop=drop, append=append)
+
+        # forbidden type in list, e.g. set
+        with tm.assert_raises_regex(TypeError, msg):
+            df.set_index(['A', df['A'], set(df['A'])],
+                         drop=drop, append=append)
+
     def test_construction_with_categorical_index(self):
         ci = tm.makeCategoricalIndex(10)
         ci.name = 'B'
diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py
@@ -131,13 +131,14 @@ def test_set_index_verify_integrity(self, string_series):
             string_series.set_index([idx, idx], verify_integrity=True)
 
     def test_set_index_raise(self, string_series):
-        # wrong type: iterator
-        with tm.assert_raises_regex(TypeError, 'arrays must be.*'):
-            string_series.set_index(iter(string_series.index),
+        msg = 'The parameter "arrays" may only contain a combination.*'
+        # forbidden type, e.g. set
+        with tm.assert_raises_regex(TypeError, msg):
+            string_series.set_index(set(string_series.index),
                                     verify_integrity=True)
 
         # wrong type in list with arrays
-        with tm.assert_raises_regex(TypeError, 'arrays must be.*'):
+        with tm.assert_raises_regex(TypeError, msg):
             string_series.set_index([string_series.index, 'X'],
                                     verify_integrity=True)