BUG: GH3468 Fix assigning a new index to a duplicate index in a DataFrame would fail #3483

Closed · wants to merge 4 commits
14 changes: 14 additions & 0 deletions RELEASE.rst
@@ -61,8 +61,21 @@ pandas 0.11.1
- Fix regression in a DataFrame apply with axis=1, objects were not being converted back
  to base dtypes correctly (GH3480_)
- Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
- Fix assigning a new index to a duplicate index in a DataFrame would fail (GH3468_)
- ref_locs support to allow duplicative indices across dtypes (GH3468_)
- Non-unique index support clarified (GH3468_)

  - Fix assigning a new index to a duplicate index in a DataFrame would fail
  - Fix construction of a DataFrame with a duplicate index
  - ref_locs support to allow duplicative indices across dtypes (GH2194_)
  - applymap on a DataFrame with a non-unique index now works
    (removed warning) (GH2786_), and fix (GH3230_)

.. _GH3164: https://github.com/pydata/pandas/issues/3164
.. _GH2786: https://github.com/pydata/pandas/issues/2786
.. _GH2194: https://github.com/pydata/pandas/issues/2194
.. _GH3230: https://github.com/pydata/pandas/issues/3230
.. _GH3251: https://github.com/pydata/pandas/issues/3251
.. _GH3379: https://github.com/pydata/pandas/issues/3379
.. _GH3480: https://github.com/pydata/pandas/issues/3480
@@ -75,6 +88,7 @@ pandas 0.11.1
.. _GH3455: https://github.com/pydata/pandas/issues/3455
.. _GH3457: https://github.com/pydata/pandas/issues/3457
.. _GH3461: https://github.com/pydata/pandas/issues/3461
.. _GH3468: https://github.com/pydata/pandas/issues/3468
.. _GH3448: https://github.com/pydata/pandas/issues/3448
.. _GH3449: https://github.com/pydata/pandas/issues/3449
.. _GH3493: https://github.com/pydata/pandas/issues/3493
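For context, the core behavior these release notes describe, as a minimal hypothetical session (assumes a pandas build that includes this patch):

    import pandas as pd

    # a DataFrame carrying a duplicate column index ...
    df = pd.DataFrame([[1, 2]], columns=['a', 'a'])

    # ... can now be assigned a fresh set of labels (GH3468); previously
    # this assignment raised inside the block manager
    df.columns = ['a', 'a.1']
    print(df)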
3 changes: 0 additions & 3 deletions pandas/core/frame.py
@@ -4261,9 +4261,6 @@ def infer(x):
            if com.is_datetime64_dtype(x):
                x = lib.map_infer(x, lib.Timestamp)
            return lib.map_infer(x, func)
-        #GH2786
-        if not self.columns.is_unique:
-            raise ValueError("applymap does not support dataframes having duplicate column labels")
        return self.apply(infer)

#----------------------------------------------------------------------
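With the guard removed, applymap simply delegates to apply, which now handles duplicate column labels itself. A short sketch of the newly supported call (hypothetical session, assuming this branch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.random((3, 4)))
    df.columns = ['a', 'a', 'a', 'a']

    # formerly raised ValueError("applymap does not support ..."); now
    # returns an object-dtype frame with the same duplicate labels
    result = df.applymap(str)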
124 changes: 95 additions & 29 deletions pandas/core/internals.py
@@ -61,6 +61,7 @@ def ref_locs(self):
        if (indexer == -1).any():
            raise AssertionError('Some block items were not in block '
                                 'ref_items')

        self._ref_locs = indexer
        return self._ref_locs

@@ -164,6 +165,9 @@ def get(self, item):
        loc = self.items.get_loc(item)
        return self.values[loc]

+    def iget(self, i):
+        return self.values[i]

    def set(self, item, value):
        """
        Modify Block in-place with new item value
@@ -710,7 +714,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True):
        # attempt to create new type blocks
        blocks = []
        for i, c in enumerate(self.items):
-            values = self.get(c)
+            values = self.iget(i)

            values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric)
            values = _block_shape(values)
@@ -879,7 +883,7 @@ class BlockManager(object):
    -----
    This is *not* a public API class
    """
-    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated']
+    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs']

    def __init__(self, blocks, axes, do_integrity_check=True):
        self.axes = [_ensure_index(ax) for ax in axes]
@@ -915,12 +919,82 @@ def set_axis(self, axis, value):
        if len(value) != len(cur_axis):
            raise Exception('Length mismatch (%d vs %d)'
                            % (len(value), len(cur_axis)))

        self.axes[axis] = value

        if axis == 0:

            # we have a non-unique index, so setup the ref_locs
            if not cur_axis.is_unique:
                self.set_ref_locs(cur_axis)

            # take via ref_locs
            for block in self.blocks:
                block.set_ref_items(self.items, maybe_rename=True)

    def set_ref_locs(self, labels = None):
        # if we have a non-unique index on this axis, set the indexers
        # we need to set an absolute indexer for the blocks
        # return the indexer if we are not unique
        if labels is None:
            labels = self.items

        if labels.is_unique:
            return None

        #### THIS IS POTENTIALLY VERY SLOW #####

        # if we have already computed this, then we are done
        rl = getattr(self, '_ref_locs', None)
        if rl is not None:
            return rl

        blocks = self.blocks

        # initialize: each block gets a -1-filled ref_locs array
        blockmap = dict()
        for b in blocks:
            arr = np.empty(len(b.items), dtype='int64')
            arr.fill(-1)
            b._ref_locs = arr

            # add this block to the blockmap for each
            # of the items in the block
            for item in b.items:
                if item not in blockmap:
                    blockmap[item] = []
                blockmap[item].append(b)

        rl = np.empty(len(labels), dtype=object)
        for i, item in enumerate(labels.values):

            try:
                block = blockmap[item].pop(0)
            except (KeyError, IndexError):
                raise Exception("not enough items in set_ref_locs")

            indexer = np.arange(len(block.items))
            mask = (block.items == item) & (block._ref_locs == -1)
            if not mask.any():

                # this case will catch a comparison of an index of tuples
                mask = np.empty(len(block.items), dtype=bool)
                mask.fill(False)
                for j, (bitem, brl) in enumerate(zip(block.items, block._ref_locs)):
                    mask[j] = bitem == item and brl == -1

            indices = indexer[mask]
            if len(indices):
                idx = indices[0]
            else:
                raise Exception("already set too many items in set_ref_locs")

            block._ref_locs[idx] = i
            rl[i] = (block, idx)

        self._ref_locs = rl
        return rl

    # make items read only for now
    def _get_items(self):
        return self.axes[0]
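The mapping that set_ref_locs builds is easier to see in isolation. The following is a toy re-implementation (plain Python, not pandas internals; all names here are made up) of the same first-come-first-served assignment of label positions to (block, slot) pairs:

    from collections import defaultdict

    def build_ref_locs(labels, blocks):
        # blocks: dict of block name -> list of item labels in that block
        claimed = {name: [False] * len(items) for name, items in blocks.items()}

        # queue each block once per item it holds, mirroring `blockmap`
        queue = defaultdict(list)
        for name, items in blocks.items():
            for item in items:
                queue[item].append(name)

        ref_locs = []
        for label in labels:
            name = queue[label].pop(0)      # next block holding this label
            items = blocks[name]
            # first unclaimed slot in that block holding this label, like
            # the (block.items == item) & (block._ref_locs == -1) mask
            slot = next(j for j, it in enumerate(items)
                        if it == label and not claimed[name][j])
            claimed[name][slot] = True
            ref_locs.append((name, slot))
        return ref_locs

    # ['a', 'a', 'b'] over a float block ['a', 'b'] and an int block ['a']
    # resolves to [('float', 0), ('int', 0), ('float', 1)]
    print(build_ref_locs(['a', 'a', 'b'],
                         {'float': ['a', 'b'], 'int': ['a']}))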
@@ -1387,26 +1461,11 @@ def iget(self, i):
        item = self.items[i]
        if self.items.is_unique:
            return self.get(item)
-        else:
-            # ugh
-            try:
-                inds, = (self.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                inds, = self.items.map(lambda x: x == item).nonzero()
-
-            _, block = self._find_block(item)
-
-            try:
-                binds, = (block.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                binds, = block.items.map(lambda x: x == item).nonzero()
-
-            for j, (k, b) in enumerate(zip(inds, binds)):
-                if i == k:
-                    return block.values[b]
-
-            raise Exception('Cannot have duplicate column names '
-                            'split across dtypes')
+
+        # compute the duplicative indexer if needed
+        ref_locs = self.set_ref_locs()
+        b, loc = ref_locs[i]
+        return b.values[loc]

    def get_scalar(self, tup):
        """
@@ -1582,6 +1641,8 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
        # keep track of what items aren't found anywhere
        mask = np.zeros(len(item_order), dtype=bool)

+        new_axes = [new_items] + self.axes[1:]

        new_blocks = []
        for blk in self.blocks:
            blk_indexer = blk.items.get_indexer(item_order)
@@ -1605,7 +1666,7 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
            new_blocks.append(na_block)
        new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

    def reindex_items(self, new_items, copy=True, fill_value=np.nan):
        """
@@ -1619,6 +1680,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):

        # TODO: this part could be faster (!)
        new_items, indexer = self.items.reindex(new_items)
+        new_axes = [new_items] + self.axes[1:]

        # could have some pathological (MultiIndex) issues here
        new_blocks = []
@@ -1643,7 +1705,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):
            new_blocks.append(na_block)
        new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

    def _make_na_block(self, items, ref_items, fill_value=np.nan):
        # TODO: infer dtypes other than float64 from fill_value
@@ -1685,11 +1747,11 @@ def merge(self, other, lsuffix=None, rsuffix=None):
        this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

        cons_items = this.items + other.items
-        consolidated = _consolidate(this.blocks + other.blocks, cons_items)

        new_axes = list(this.axes)
        new_axes[0] = cons_items

+        consolidated = _consolidate(this.blocks + other.blocks, cons_items)

        return BlockManager(consolidated, new_axes)

    def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
@@ -1902,7 +1964,6 @@ def form_blocks(arrays, names, axes):

        na_block = make_block(block_values, extra_items, items)
        blocks.append(na_block)
-        blocks = _consolidate(blocks, items)

    return blocks

@@ -1953,16 +2014,21 @@ def _shape_compat(x):

    names, arrays = zip(*tuples)

-    # index may box values
-    items = ref_items[ref_items.isin(names)]

    first = arrays[0]
    shape = (len(arrays),) + _shape_compat(first)

    stacked = np.empty(shape, dtype=dtype)
    for i, arr in enumerate(arrays):
        stacked[i] = _asarray_compat(arr)

+    # index may box values
+    if ref_items.is_unique:
+        items = ref_items[ref_items.isin(names)]
+    else:
+        items = _ensure_index([ n for n in names if n in ref_items ])
+        if len(items) != len(stacked):
+            raise Exception("invalid names passed _stack_arrays")

    return items, stacked


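The test below exercises this path end to end; the shape of the scenario, as a standalone sketch (assumes a build with this patch):

    import numpy as np
    import pandas as pd

    df_float = pd.DataFrame(np.random.randn(10, 3), dtype='float64')
    df_int = pd.DataFrame(np.random.randn(10, 3), dtype='int64')

    # columns 0, 1, 2 repeat across a float block and an int block
    df = pd.concat([df_float, df_int], axis=1)

    # every position stays individually reachable even though the labels
    # duplicate across dtypes -- this is what ref_locs/iget guarantee
    for i in range(len(df.columns)):
        df.iloc[:, i]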
71 changes: 65 additions & 6 deletions pandas/tests/test_frame.py
@@ -7492,12 +7492,15 @@ def test_applymap(self):
        self.assert_(result.dtypes[0] == object)

        # GH2786
-        df = DataFrame(np.random.random((3,4)))
-        df.columns = ['a','a','a','a']
-        try:
-            df.applymap(str)
-        except ValueError as e:
-            self.assertTrue("support" in str(e))
+        df = DataFrame(np.random.random((3,4)))
+        df2 = df.copy()
+        cols = ['a','a','a','a']
+        df.columns = cols
+
+        expected = df2.applymap(str)
+        expected.columns = cols
+        result = df.applymap(str)
+        assert_frame_equal(result, expected)

    def test_filter(self):
        # items
@@ -9201,6 +9204,62 @@ def test_assign_columns(self):
        assert_series_equal(self.frame['C'], frame['baz'])
        assert_series_equal(self.frame['hi'], frame['foo2'])

    def test_columns_with_dups(self):

        # GH 3468 related

        # basic
        df = DataFrame([[1,2]], columns=['a','a'])
        df.columns = ['a','a.1']
        str(df)
        expected = DataFrame([[1,2]], columns=['a','a.1'])
        assert_frame_equal(df, expected)

        df = DataFrame([[1,2,3]], columns=['b','a','a'])
        df.columns = ['b','a','a.1']
        str(df)
        expected = DataFrame([[1,2,3]], columns=['b','a','a.1'])
        assert_frame_equal(df, expected)

        # with a dup index
        df = DataFrame([[1,2]], columns=['a','a'])
        df.columns = ['b','b']
        str(df)
        expected = DataFrame([[1,2]], columns=['b','b'])
        assert_frame_equal(df, expected)

        # multi-dtype
        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=['a','a','b','b','d','c','c'])
        df.columns = list('ABCDEFG')
        str(df)
        expected = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('ABCDEFG'))
        assert_frame_equal(df, expected)

        # this is an error because we cannot disambiguate the dup columns
        self.assertRaises(Exception, lambda: DataFrame([[1,2,'foo','bar']], columns=['a','a','a','a']))

        # dups across blocks
        df_float  = DataFrame(np.random.randn(10, 3), dtype='float64')
        df_int    = DataFrame(np.random.randn(10, 3), dtype='int64')
        df_bool   = DataFrame(True, index=df_float.index, columns=df_float.columns)
        df_object = DataFrame('foo', index=df_float.index, columns=df_float.columns)
        df_dt     = DataFrame(Timestamp('20010101'), index=df_float.index, columns=df_float.columns)
        df = pan.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

        result = df._data.set_ref_locs()
        self.assert_(len(result) == len(df.columns))

        # testing iget
        for i in range(len(df.columns)):
            df.iloc[:, i]

        # dup columns across dtype GH 2079/2194
        vals = [[1, -1, 2.], [2, -2, 3.]]
        rs = DataFrame(vals, columns=['A', 'A', 'B'])
        xp = DataFrame(vals)
        xp.columns = ['A', 'A', 'B']
        assert_frame_equal(rs, xp)
    def test_cast_internals(self):
        casted = DataFrame(self.frame._data, dtype=int)
        expected = DataFrame(self.frame._series, dtype=int)
7 changes: 7 additions & 0 deletions pandas/tests/test_indexing.py
@@ -772,6 +772,13 @@ def test_dups_fancy_indexing(self):
        expected = Index(['b','a','a'])
        self.assert_(result.equals(expected))

        # across dtypes
        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa'))
        result = DataFrame([[1,2,1.,2.,3.,'foo','bar']])
        result.columns = list('aaaaaaa')
        assert_frame_equal(df, result)


if __name__ == '__main__':
    import nose
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
2 changes: 1 addition & 1 deletion pandas/tests/test_internals.py
@@ -268,7 +268,7 @@ def test_duplicate_item_failure(self):
            b.ref_items = items

        mgr = BlockManager(blocks, [items, np.arange(N)])
-        self.assertRaises(Exception, mgr.iget, 1)
+        mgr.iget(1)

    def test_contains(self):
        self.assert_('a' in self.mgr)