diff --git a/doc/source/release.rst b/doc/source/release.rst index 8c6cf34b0dbbe..198948259be15 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -83,6 +83,10 @@ pandas 0.13 - In ``to_json``, raise if a passed ``orient`` would cause loss of data because of a duplicate index (:issue:`4359`) - Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`) + - Fixed bug with ``values`` raising an error on a DataFrame with duplicate columns and mixed + dtypes, surfaced in (:issue:`4377`) + - Fixed bug with duplicate columns and type conversion in ``read_json`` when + ``orient='split'`` (:issue:`4377`) pandas 0.12 =========== diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 9f2f7c870f849..11c5ef5fe80b9 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -61,9 +61,6 @@ Bug Fixes - Fixed bug where ``network`` testing was throwing ``NameError`` because a local variable was undefined (:issue:`4381`) - - In ``to_json``, raise if a passed ``orient`` would cause loss of data because - of a duplicate index (:issue:`4359`) - - Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`) See the :ref:`full release notes diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 2d09bbec85ffa..abe70e9037264 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1538,23 +1538,23 @@ def _interleave(self, items): # By construction, all of the item should be covered by one of the # blocks if items.is_unique: + for block in self.blocks: indexer = items.get_indexer(block.items) if (indexer == -1).any(): raise AssertionError('Items must contain all block items') result[indexer] = block.get_values(dtype) itemmask[indexer] = 1 + + if not itemmask.all(): + raise AssertionError('Some items were not contained in blocks') + else: - for block in self.blocks: - mask = items.isin(block.items) - indexer = mask.nonzero()[0] - if (len(indexer) != len(block.items)): - raise AssertionError('All items must be in block items') - result[indexer] = block.get_values(dtype) - itemmask[indexer] = 1 - if not itemmask.all(): - raise AssertionError('Some items were not contained in blocks') + # non-unique, must use ref_locs + rl = self._set_ref_locs() + for i, (block, idx) in enumerate(rl): + result[i] = block.iget(idx) return result diff --git a/pandas/io/json.py b/pandas/io/json.py index 7b6c97be21393..78d1bc83d6107 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -52,19 +52,24 @@ def __init__(self, obj, orient, date_format, double_precision, ensure_ascii): self._format_axes() self._format_dates() + def _needs_to_date(self, obj): + return obj.dtype == 'datetime64[ns]' + def _format_dates(self): raise NotImplementedError def _format_axes(self): raise NotImplementedError - def _needs_to_date(self, data): - return self.date_format == 'iso' and data.dtype == 'datetime64[ns]' - def _format_to_date(self, data): - if self._needs_to_date(data): + + # iso + if self.date_format == 'iso': return data.apply(lambda x: x.isoformat()) - return data + + # int64 + else: + return data.astype(np.int64) def copy_if_needed(self): """ copy myself if necessary """ @@ -87,13 +92,11 @@ def _format_axes(self): self.obj.index = self._format_to_date(self.obj.index.to_series()) def _format_dates(self): - if self._needs_to_date(self.obj): - self.copy_if_needed() + if self.obj.dtype == 'datetime64[ns]': self.obj = self._format_to_date(self.obj) def _format_bools(self): if self._needs_to_bool(self.obj): - self.copy_if_needed() self.obj = self._format_to_bool(self.obj) class FrameWriter(Writer): @@ -123,13 +126,22 @@ def _format_axes(self): setattr(self.obj,axis,self._format_to_date(a.to_series())) def _format_dates(self): - if self.date_format == 'iso': - dtypes = self.obj.dtypes - dtypes = dtypes[dtypes == 'datetime64[ns]'] - if len(dtypes): - self.copy_if_needed() - for c in dtypes.index: - self.obj[c] = self._format_to_date(self.obj[c]) + dtypes = self.obj.dtypes + if len(dtypes[dtypes == 'datetime64[ns]']): + + # need to create a new object + d = {} + + for i, (col, c) in enumerate(self.obj.iteritems()): + + if c.dtype == 'datetime64[ns]': + c = self._format_to_date(c) + + d[i] = c + + d = DataFrame(d,index=self.obj.index) + d.columns = self.obj.columns + self.obj = d def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, @@ -291,14 +303,16 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): except: pass - if data.dtype == 'float': + if data.dtype.kind == 'f': - # coerce floats to 64 - try: - data = data.astype('float64') - result = True - except: - pass + if data.dtype != 'float64': + + # coerce floats to 64 + try: + data = data.astype('float64') + result = True + except: + pass # do't coerce 0-len data if len(data) and (data.dtype == 'float' or data.dtype == 'object'): @@ -448,14 +462,35 @@ def _parse_no_numpy(self): self.obj = DataFrame( loads(json, precise_float=self.precise_float), dtype=None) + def _process_converter(self, f, filt=None): + """ take a conversion function and possibly recreate the frame """ + + if filt is None: + filt = lambda col, c: True + + needs_new_obj = False + new_obj = dict() + for i, (col, c) in enumerate(self.obj.iteritems()): + if filt(col, c): + new_data, result = f(col, c) + if result: + c = new_data + needs_new_obj = True + new_obj[i] = c + + if needs_new_obj: + + # possibly handle dup columns + new_obj = DataFrame(new_obj,index=self.obj.index) + new_obj.columns = self.obj.columns + self.obj = new_obj + def _try_convert_types(self): if self.obj is None: return if self.convert_dates: self._try_convert_dates() - for col in self.obj.columns: - new_data, result = self._try_convert_data(col, self.obj[col], convert_dates=False) - if result: - self.obj[col] = new_data + + self._process_converter(lambda col, c: self._try_convert_data(col, c, convert_dates=False)) def _try_convert_dates(self): if self.obj is None: return @@ -478,9 +513,6 @@ def is_ok(col): return True return False + self._process_converter(lambda col, c: self._try_convert_to_date(c), + lambda col, c: (self.keep_default_dates and is_ok(col)) or col in convert_dates) - for col in self.obj.columns: - if (self.keep_default_dates and is_ok(col)) or col in convert_dates: - new_data, result = self._try_convert_to_date(self.obj[col]) - if result: - self.obj[col] = new_data diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 893243d148618..cd0e56db84256 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -83,6 +83,21 @@ def test_frame_non_unique_columns(self): unser = read_json(df.to_json(orient='values'), orient='values') np.testing.assert_equal(df.values, unser.values) + # GH4377; duplicate columns not processing correctly + df = DataFrame([['a','b'],['c','d']], index=[1,2], columns=['x','y']) + result = read_json(df.to_json(orient='split'), orient='split') + assert_frame_equal(result, df) + + def _check(df): + result = read_json(df.to_json(orient='split'), orient='split', convert_dates=['x']) + assert_frame_equal(result, df) + + for o in [[['a','b'],['c','d']], + [[1.5,2.5],[3.5,4.5]], + [[1,2.5],[3,4.5]], + [[Timestamp('20130101'),3.5],[Timestamp('20130102'),4.5]]]: + _check(DataFrame(o, index=[1,2], columns=['x','x'])) + def test_frame_from_json_to_json(self): def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_dtype=True, raise_ok=None): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index e08f3552382c2..842f114090a50 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2950,6 +2950,12 @@ def check(result, expected=None): expected = DataFrame([[1],[1],[1]],columns=['bar']) check(df,expected) + # values + df = DataFrame([[1,2.5],[3,4.5]], index=[1,2], columns=['x','x']) + result = df.values + expected = np.array([[1,2.5],[3,4.5]]) + self.assert_((result == expected).all().all()) + def test_insert_benchmark(self): # from the vb_suite/frame_methods/frame_insert_columns N = 10