BUG: GH3468 Fix assigning a new index to a duplicate index in a DataFrame would fail #3483

Closed · wants to merge 4 commits
14 changes: 14 additions & 0 deletions RELEASE.rst
@@ -61,8 +61,21 @@ pandas 0.11.1
- Fix regression in a DataFrame apply with axis=1, objects were not being converted back
  to base dtypes correctly (GH3480_)
- Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
- Fix assigning a new index to a duplicate index in a DataFrame would fail (GH3468_)
- ref_locs support to allow duplicative indices across dtypes (GH3468_)
- Non-unique index support clarified (GH3468_)

  - Fix assigning a new index to a duplicate index in a DataFrame would fail
  - Fix construction of a DataFrame with a duplicate index
  - ref_locs support to allow duplicative indices across dtypes (GH2194_)
  - applymap on a DataFrame with a non-unique index now works
    (removed warning) (GH2786_), and fix (GH3230_)

.. _GH3164: https://github.com/pydata/pandas/issues/3164
.. _GH2786: https://github.com/pydata/pandas/issues/2786
.. _GH2194: https://github.com/pydata/pandas/issues/2194
.. _GH3230: https://github.com/pydata/pandas/issues/3230
.. _GH3251: https://github.com/pydata/pandas/issues/3251
.. _GH3379: https://github.com/pydata/pandas/issues/3379
.. _GH3480: https://github.com/pydata/pandas/issues/3480
@@ -75,6 +88,7 @@ pandas 0.11.1
.. _GH3455: https://github.com/pydata/pandas/issues/3455
.. _GH3457: https://github.com/pydata/pandas/issues/3457
.. _GH3461: https://github.com/pydata/pandas/issues/3461
.. _GH3468: https://github.com/pydata/pandas/issues/3468
.. _GH3448: https://github.com/pydata/pandas/issues/3448
.. _GH3449: https://github.com/pydata/pandas/issues/3449
.. _GH3493: https://github.com/pydata/pandas/issues/3493
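For context, the core behavior these release notes describe, as a minimal hypothetical session (assumes a pandas build that includes this patch):

    import pandas as pd

    # a DataFrame carrying a duplicate column index ...
    df = pd.DataFrame([[1, 2]], columns=['a', 'a'])

    # ... can now be assigned a fresh set of labels (GH3468); previously
    # this assignment raised inside the block manager
    df.columns = ['a', 'a.1']
    print(df)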
3 changes: 0 additions & 3 deletions pandas/core/frame.py
@@ -4261,9 +4261,6 @@ def infer(x):
            if com.is_datetime64_dtype(x):
                x = lib.map_infer(x, lib.Timestamp)
            return lib.map_infer(x, func)
-        #GH2786
-        if not self.columns.is_unique:
-            raise ValueError("applymap does not support dataframes having duplicate column labels")
        return self.apply(infer)

#----------------------------------------------------------------------
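With the guard removed, applymap simply delegates to apply, which now handles duplicate column labels itself. A short sketch of the newly supported call (hypothetical session, assuming this branch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.random((3, 4)))
    df.columns = ['a', 'a', 'a', 'a']

    # formerly raised ValueError("applymap does not support ..."); now
    # returns an object-dtype frame with the same duplicate labels
    result = df.applymap(str)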
124 changes: 95 additions & 29 deletions pandas/core/internals.py
@@ -61,6 +61,7 @@ def ref_locs(self):
        if (indexer == -1).any():
            raise AssertionError('Some block items were not in block '
                                 'ref_items')

        self._ref_locs = indexer
        return self._ref_locs

@@ -164,6 +165,9 @@ def get(self, item):
        loc = self.items.get_loc(item)
        return self.values[loc]

+    def iget(self, i):
+        return self.values[i]

    def set(self, item, value):
        """
        Modify Block in-place with new item value
@@ -710,7 +714,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True):
        # attempt to create new type blocks
        blocks = []
        for i, c in enumerate(self.items):
-            values = self.get(c)
+            values = self.iget(i)

            values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric)
            values = _block_shape(values)
@@ -879,7 +883,7 @@ class BlockManager(object):
    -----
    This is *not* a public API class
    """
-    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated']
+    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs']

    def __init__(self, blocks, axes, do_integrity_check=True):
        self.axes = [_ensure_index(ax) for ax in axes]
@@ -915,12 +919,82 @@ def set_axis(self, axis, value):
        if len(value) != len(cur_axis):
            raise Exception('Length mismatch (%d vs %d)'
                            % (len(value), len(cur_axis)))

        self.axes[axis] = value

        if axis == 0:

            # we have a non-unique index, so setup the ref_locs
            if not cur_axis.is_unique:
                self.set_ref_locs(cur_axis)

            # take via ref_locs
            for block in self.blocks:
                block.set_ref_items(self.items, maybe_rename=True)

    def set_ref_locs(self, labels = None):
        # if we have a non-unique index on this axis, set the indexers
        # we need to set an absolute indexer for the blocks
        # return the indexer if we are not unique
        if labels is None:
            labels = self.items

        if labels.is_unique:
            return None

        #### THIS IS POTENTIALLY VERY SLOW #####

        # if we have already computed this, then we are done
        rl = getattr(self, '_ref_locs', None)
        if rl is not None:
            return rl

        blocks = self.blocks

        # initialize: each block gets a -1-filled ref_locs array
        blockmap = dict()
        for b in blocks:
            arr = np.empty(len(b.items), dtype='int64')
            arr.fill(-1)
            b._ref_locs = arr

            # add this block to the blockmap for each
            # of the items in the block
            for item in b.items:
                if item not in blockmap:
                    blockmap[item] = []
                blockmap[item].append(b)

        rl = np.empty(len(labels), dtype=object)
        for i, item in enumerate(labels.values):

            try:
                block = blockmap[item].pop(0)
            except (KeyError, IndexError):
                raise Exception("not enough items in set_ref_locs")

            indexer = np.arange(len(block.items))
            mask = (block.items == item) & (block._ref_locs == -1)
            if not mask.any():

                # this case will catch a comparison of an index of tuples
                mask = np.empty(len(block.items), dtype=bool)
                mask.fill(False)
                for j, (bitem, brl) in enumerate(zip(block.items, block._ref_locs)):
                    mask[j] = bitem == item and brl == -1

            indices = indexer[mask]
            if len(indices):
                idx = indices[0]
            else:
                raise Exception("already set too many items in set_ref_locs")

            block._ref_locs[idx] = i
            rl[i] = (block, idx)

        self._ref_locs = rl
        return rl

    # make items read only for now
    def _get_items(self):
        return self.axes[0]
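The mapping that set_ref_locs builds is easier to see in isolation. The following is a toy re-implementation (plain Python, not pandas internals; all names here are made up) of the same first-come-first-served assignment of label positions to (block, slot) pairs:

    from collections import defaultdict

    def build_ref_locs(labels, blocks):
        # blocks: dict of block name -> list of item labels in that block
        claimed = {name: [False] * len(items) for name, items in blocks.items()}

        # queue each block once per item it holds, mirroring `blockmap`
        queue = defaultdict(list)
        for name, items in blocks.items():
            for item in items:
                queue[item].append(name)

        ref_locs = []
        for label in labels:
            name = queue[label].pop(0)      # next block holding this label
            items = blocks[name]
            # first unclaimed slot in that block holding this label, like
            # the (block.items == item) & (block._ref_locs == -1) mask
            slot = next(j for j, it in enumerate(items)
                        if it == label and not claimed[name][j])
            claimed[name][slot] = True
            ref_locs.append((name, slot))
        return ref_locs

    # ['a', 'a', 'b'] over a float block ['a', 'b'] and an int block ['a']
    # resolves to [('float', 0), ('int', 0), ('float', 1)]
    print(build_ref_locs(['a', 'a', 'b'],
                         {'float': ['a', 'b'], 'int': ['a']}))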
@@ -1387,26 +1461,11 @@ def iget(self, i):
        item = self.items[i]
        if self.items.is_unique:
            return self.get(item)
-        else:
-            # ugh
-            try:
-                inds, = (self.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                inds, = self.items.map(lambda x: x == item).nonzero()
-
-            _, block = self._find_block(item)
-
-            try:
-                binds, = (block.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                binds, = block.items.map(lambda x: x == item).nonzero()
-
-            for j, (k, b) in enumerate(zip(inds, binds)):
-                if i == k:
-                    return block.values[b]
-
-            raise Exception('Cannot have duplicate column names '
-                            'split across dtypes')
+
+        # compute the duplicative indexer if needed
+        ref_locs = self.set_ref_locs()
+        b, loc = ref_locs[i]
+        return b.values[loc]

    def get_scalar(self, tup):
        """
@@ -1582,6 +1641,8 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
        # keep track of what items aren't found anywhere
        mask = np.zeros(len(item_order), dtype=bool)

+        new_axes = [new_items] + self.axes[1:]

        new_blocks = []
        for blk in self.blocks:
            blk_indexer = blk.items.get_indexer(item_order)
@@ -1605,7 +1666,7 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
            new_blocks.append(na_block)
        new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

    def reindex_items(self, new_items, copy=True, fill_value=np.nan):
        """
@@ -1619,6 +1680,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):

        # TODO: this part could be faster (!)
        new_items, indexer = self.items.reindex(new_items)
+        new_axes = [new_items] + self.axes[1:]

        # could have some pathological (MultiIndex) issues here
        new_blocks = []
@@ -1643,7 +1705,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):
            new_blocks.append(na_block)
        new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

    def _make_na_block(self, items, ref_items, fill_value=np.nan):
        # TODO: infer dtypes other than float64 from fill_value
@@ -1685,11 +1747,11 @@ def merge(self, other, lsuffix=None, rsuffix=None):
        this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

        cons_items = this.items + other.items
-        consolidated = _consolidate(this.blocks + other.blocks, cons_items)

        new_axes = list(this.axes)
        new_axes[0] = cons_items

+        consolidated = _consolidate(this.blocks + other.blocks, cons_items)

        return BlockManager(consolidated, new_axes)

    def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
@@ -1902,7 +1964,6 @@ def form_blocks(arrays, names, axes):

        na_block = make_block(block_values, extra_items, items)
        blocks.append(na_block)
-        blocks = _consolidate(blocks, items)

    return blocks

@@ -1953,16 +2014,21 @@ def _shape_compat(x):

    names, arrays = zip(*tuples)

-    # index may box values
-    items = ref_items[ref_items.isin(names)]

    first = arrays[0]
    shape = (len(arrays),) + _shape_compat(first)

    stacked = np.empty(shape, dtype=dtype)
    for i, arr in enumerate(arrays):
        stacked[i] = _asarray_compat(arr)

+    # index may box values
+    if ref_items.is_unique:
+        items = ref_items[ref_items.isin(names)]
+    else:
+        items = _ensure_index([ n for n in names if n in ref_items ])
+        if len(items) != len(stacked):
+            raise Exception("invalid names passed _stack_arrays")

    return items, stacked


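The test below exercises this path end to end; the shape of the scenario, as a standalone sketch (assumes a build with this patch):

    import numpy as np
    import pandas as pd

    df_float = pd.DataFrame(np.random.randn(10, 3), dtype='float64')
    df_int = pd.DataFrame(np.random.randn(10, 3), dtype='int64')

    # columns 0, 1, 2 repeat across a float block and an int block
    df = pd.concat([df_float, df_int], axis=1)

    # every position stays individually reachable even though the labels
    # duplicate across dtypes -- this is what ref_locs/iget guarantee
    for i in range(len(df.columns)):
        df.iloc[:, i]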
71 changes: 65 additions & 6 deletions pandas/tests/test_frame.py
@@ -7492,12 +7492,15 @@ def test_applymap(self):
        self.assert_(result.dtypes[0] == object)

        # GH2786
-        df = DataFrame(np.random.random((3,4)))
-        df.columns = ['a','a','a','a']
-        try:
-            df.applymap(str)
-        except ValueError as e:
-            self.assertTrue("support" in str(e))
+        df = DataFrame(np.random.random((3,4)))
+        df2 = df.copy()
+        cols = ['a','a','a','a']
+        df.columns = cols
+
+        expected = df2.applymap(str)
+        expected.columns = cols
+        result = df.applymap(str)
+        assert_frame_equal(result, expected)

    def test_filter(self):
        # items
@@ -9201,6 +9204,62 @@ def test_assign_columns(self):
        assert_series_equal(self.frame['C'], frame['baz'])
        assert_series_equal(self.frame['hi'], frame['foo2'])

    def test_columns_with_dups(self):

        # GH 3468 related

        # basic
        df = DataFrame([[1,2]], columns=['a','a'])
        df.columns = ['a','a.1']
        str(df)
        expected = DataFrame([[1,2]], columns=['a','a.1'])
        assert_frame_equal(df, expected)

        df = DataFrame([[1,2,3]], columns=['b','a','a'])
        df.columns = ['b','a','a.1']
        str(df)
        expected = DataFrame([[1,2,3]], columns=['b','a','a.1'])
        assert_frame_equal(df, expected)

        # with a dup index
        df = DataFrame([[1,2]], columns=['a','a'])
        df.columns = ['b','b']
        str(df)
        expected = DataFrame([[1,2]], columns=['b','b'])
        assert_frame_equal(df, expected)

        # multi-dtype
        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=['a','a','b','b','d','c','c'])
        df.columns = list('ABCDEFG')
        str(df)
        expected = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('ABCDEFG'))
        assert_frame_equal(df, expected)

        # this is an error because we cannot disambiguate the dup columns
        self.assertRaises(Exception, lambda: DataFrame([[1,2,'foo','bar']], columns=['a','a','a','a']))

        # dups across blocks
        df_float  = DataFrame(np.random.randn(10, 3), dtype='float64')
        df_int    = DataFrame(np.random.randn(10, 3), dtype='int64')
        df_bool   = DataFrame(True, index=df_float.index, columns=df_float.columns)
        df_object = DataFrame('foo', index=df_float.index, columns=df_float.columns)
        df_dt     = DataFrame(Timestamp('20010101'), index=df_float.index, columns=df_float.columns)
        df = pan.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

        result = df._data.set_ref_locs()
        self.assert_(len(result) == len(df.columns))

        # testing iget
        for i in range(len(df.columns)):
            df.iloc[:, i]

        # dup columns across dtype GH 2079/2194
        vals = [[1, -1, 2.], [2, -2, 3.]]
        rs = DataFrame(vals, columns=['A', 'A', 'B'])
        xp = DataFrame(vals)
        xp.columns = ['A', 'A', 'B']
        assert_frame_equal(rs, xp)
    def test_cast_internals(self):
        casted = DataFrame(self.frame._data, dtype=int)
        expected = DataFrame(self.frame._series, dtype=int)
7 changes: 7 additions & 0 deletions pandas/tests/test_indexing.py
@@ -772,6 +772,13 @@ def test_dups_fancy_indexing(self):
        expected = Index(['b','a','a'])
        self.assert_(result.equals(expected))

        # across dtypes
        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa'))
        result = DataFrame([[1,2,1.,2.,3.,'foo','bar']])
        result.columns = list('aaaaaaa')
        assert_frame_equal(df, result)


if __name__ == '__main__':
    import nose
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
2 changes: 1 addition & 1 deletion pandas/tests/test_internals.py
@@ -268,7 +268,7 @@ def test_duplicate_item_failure(self):
            b.ref_items = items

        mgr = BlockManager(blocks, [items, np.arange(N)])
-        self.assertRaises(Exception, mgr.iget, 1)
+        mgr.iget(1)

    def test_contains(self):
        self.assert_('a' in self.mgr)