Skip to content

Commit c03f0ca

Browse files
committed
Merge pull request #3509 from jreback/dup_columns2
BUG/CLN: Allow the BlockManager to have a non-unique items (axis 0)
2 parents 15bca1c + 8c08aca commit c03f0ca

File tree

10 files changed

+349
-111
lines changed

10 files changed

+349
-111
lines changed

Diff for: RELEASE.rst

+14
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,20 @@ pandas 0.11.1
6161
- Fix regression in a DataFrame apply with axis=1, objects were not being converted back
6262
to base dtypes correctly (GH3480_)
6363
- Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
64+
- Non-unique index support clarified (GH3468_)
65+
66+
- Fix an issue where assigning a new index to a DataFrame that has a duplicate index would fail (GH3468_)
67+
- Fix construction of a DataFrame with a duplicate index
68+
- ref_locs support to allow duplicate indices across dtypes,
69+
allows iget support to always find the index (even across dtypes) (GH2194_)
70+
- applymap on a DataFrame with a non-unique index now works
71+
(removed warning) (GH2786_), and fix (GH3230_)
72+
- Fix to_csv to handle non-unique columns (GH3495_)
6473

6574
.. _GH3164: https://github.com/pydata/pandas/issues/3164
75+
.. _GH2786: https://github.com/pydata/pandas/issues/2786
76+
.. _GH2194: https://github.com/pydata/pandas/issues/2194
77+
.. _GH3230: https://github.com/pydata/pandas/issues/3230
6678
.. _GH3251: https://github.com/pydata/pandas/issues/3251
6779
.. _GH3379: https://github.com/pydata/pandas/issues/3379
6880
.. _GH3480: https://github.com/pydata/pandas/issues/3480
@@ -75,8 +87,10 @@ pandas 0.11.1
7587
.. _GH3455: https://github.com/pydata/pandas/issues/3455
7688
.. _GH3457: https://github.com/pydata/pandas/issues/3457
7789
.. _GH3461: https://github.com/pydata/pandas/issues/3461
90+
.. _GH3468: https://github.com/pydata/pandas/issues/3468
7891
.. _GH3448: https://github.com/pydata/pandas/issues/3448
7992
.. _GH3449: https://github.com/pydata/pandas/issues/3449
93+
.. _GH3495: https://github.com/pydata/pandas/issues/3495
8094
.. _GH3493: https://github.com/pydata/pandas/issues/3493
8195

8296

Diff for: pandas/core/common.py

+1
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,7 @@ def _default_index(n):
11561156
values = np.arange(n, dtype=np.int64)
11571157
result = values.view(Int64Index)
11581158
result.name = None
1159+
result.is_unique = True
11591160
return result
11601161

11611162

Diff for: pandas/core/format.py

+8-27
Original file line numberDiff line numberDiff line change
@@ -820,21 +820,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
820820
self.blocks = self.obj._data.blocks
821821
ncols = sum(len(b.items) for b in self.blocks)
822822
self.data =[None] * ncols
823-
824-
if self.obj.columns.is_unique:
825-
self.colname_map = dict((k,i) for i,k in enumerate(self.obj.columns))
826-
else:
827-
ks = [set(x.items) for x in self.blocks]
828-
u = len(reduce(lambda a,x: a.union(x),ks,set()))
829-
t = sum(map(len,ks))
830-
if u != t:
831-
if len(set(self.cols)) != len(self.cols):
832-
raise NotImplementedError("duplicate columns with differing dtypes are unsupported")
833-
else:
834-
# if columns are not unique and we access this,
835-
# we're doing it wrong
836-
pass
837-
823+
self.column_map = self.obj._data.get_items_map()
838824

839825
if chunksize is None:
840826
chunksize = (100000/ (len(self.cols) or 1)) or 1
@@ -1034,18 +1020,13 @@ def _save_chunk(self, start_i, end_i):
10341020

10351021
# create the data for a chunk
10361022
slicer = slice(start_i,end_i)
1037-
if self.obj.columns.is_unique:
1038-
for i in range(len(self.blocks)):
1039-
b = self.blocks[i]
1040-
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
1041-
for j, k in enumerate(b.items):
1042-
# self.data is a preallocated list
1043-
self.data[self.colname_map[k]] = d[j]
1044-
else:
1045-
# self.obj should contain a proper view of the dataframes
1046-
# with the specified ordering of cols if cols was specified
1047-
for i in range(len(self.obj.columns)):
1048-
self.data[i] = self.obj.icol(i).values[slicer].tolist()
1023+
for i in range(len(self.blocks)):
1024+
b = self.blocks[i]
1025+
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
1026+
for i, item in enumerate(b.items):
1027+
1028+
# self.data is a preallocated list
1029+
self.data[self.column_map[b][i]] = d[i]
10491030

10501031
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
10511032

Diff for: pandas/core/frame.py

-3
Original file line numberDiff line numberDiff line change
@@ -4261,9 +4261,6 @@ def infer(x):
42614261
if com.is_datetime64_dtype(x):
42624262
x = lib.map_infer(x, lib.Timestamp)
42634263
return lib.map_infer(x, func)
4264-
#GH2786
4265-
if not self.columns.is_unique:
4266-
raise ValueError("applymap does not support dataframes having duplicate column labels")
42674264
return self.apply(infer)
42684265

42694266
#----------------------------------------------------------------------

Diff for: pandas/core/index.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ def is_monotonic(self):
278278
def is_lexsorted_for_tuple(self, tup):
279279
return True
280280

281-
@cache_readonly
281+
@cache_readonly(allow_setting=True)
282282
def is_unique(self):
283283
return self._engine.is_unique
284284

0 commit comments

Comments
 (0)