BUG: fixes for GH4377 #4388

Merged: 1 commit, Jul 30, 2013

4 changes: 4 additions & 0 deletions doc/source/release.rst
@@ -83,6 +83,10 @@ pandas 0.13
   - In ``to_json``, raise if a passed ``orient`` would cause loss of data because
     of a duplicate index (:issue:`4359`)
   - Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`)
+  - Fixed bug with ``values`` raising an error on a DataFrame with duplicate columns and mixed
+    dtypes, surfaced in (:issue:`4377`)
+  - Fixed bug with duplicate columns and type conversion in ``read_json`` when
+    ``orient='split'`` (:issue:`4377`)

 pandas 0.12
 ===========
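
For readers outside the diff, a minimal sketch of the ``read_json`` entry above, written against a recent pandas (the ``StringIO`` wrapper is a modern requirement; 0.13-era pandas also accepted the raw JSON string):

    from io import StringIO
    import pandas as pd

    # Round trip that used to fail: duplicate columns plus type
    # conversion under orient='split' (GH4377).
    df = pd.DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
    out = pd.read_json(StringIO(df.to_json(orient='split')), orient='split')
    print(out)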
3 changes: 0 additions & 3 deletions doc/source/v0.13.0.txt
@@ -61,9 +61,6 @@ Bug Fixes
   - Fixed bug where ``network`` testing was throwing ``NameError`` because a
     local variable was undefined (:issue:`4381`)

-  - In ``to_json``, raise if a passed ``orient`` would cause loss of data because
-    of a duplicate index (:issue:`4359`)
-
   - Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`)

 See the :ref:`full release notes
18 changes: 9 additions & 9 deletions pandas/core/internals.py
@@ -1538,23 +1538,23 @@ def _interleave(self, items):
         # By construction, all of the item should be covered by one of the
         # blocks
         if items.is_unique:
+
             for block in self.blocks:
                 indexer = items.get_indexer(block.items)
                 if (indexer == -1).any():
                     raise AssertionError('Items must contain all block items')
                 result[indexer] = block.get_values(dtype)
                 itemmask[indexer] = 1
+
+            if not itemmask.all():
+                raise AssertionError('Some items were not contained in blocks')
+
         else:
-            for block in self.blocks:
-                mask = items.isin(block.items)
-                indexer = mask.nonzero()[0]
-                if (len(indexer) != len(block.items)):
-                    raise AssertionError('All items must be in block items')
-                result[indexer] = block.get_values(dtype)
-                itemmask[indexer] = 1
-
-            if not itemmask.all():
-                raise AssertionError('Some items were not contained in blocks')
+            # non-unique, must use ref_locs
+            rl = self._set_ref_locs()
+            for i, (block, idx) in enumerate(rl):
+                result[i] = block.iget(idx)

         return result
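
The non-unique branch above works because ``_set_ref_locs()`` hands back one (block, offset) pair per item, in layout order, so duplicate labels never have to be looked up by name. A standalone sketch of the idea, with hypothetical stand-ins (``Block``, ``interleave``) rather than the real pandas internals:

    import numpy as np

    class Block(object):
        """Hypothetical stand-in for a pandas block: a 2-D array of values."""
        def __init__(self, values):
            self.values = np.asarray(values)

        def iget(self, i):
            return self.values[i]

    def interleave(ref_locs, n, dtype):
        # ref_locs: (block, row-within-block) pairs in final item order,
        # mirroring what _set_ref_locs() produces for duplicate items.
        result = np.empty((len(ref_locs), n), dtype=dtype)
        for i, (block, idx) in enumerate(ref_locs):
            result[i] = block.iget(idx)
        return result

    int_block = Block([[1, 3]])        # values of the int64 'x' item
    float_block = Block([[2.5, 4.5]])  # values of the float64 'x' item
    print(interleave([(int_block, 0), (float_block, 0)], 2, np.float64))
    # [[ 1.   3. ]
    #  [ 2.5  4.5]]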

94 changes: 63 additions & 31 deletions pandas/io/json.py
@@ -52,19 +52,24 @@ def __init__(self, obj, orient, date_format, double_precision, ensure_ascii):
         self._format_axes()
         self._format_dates()

+    def _needs_to_date(self, obj):
+        return obj.dtype == 'datetime64[ns]'
+
     def _format_dates(self):
         raise NotImplementedError

     def _format_axes(self):
         raise NotImplementedError

-    def _needs_to_date(self, data):
-        return self.date_format == 'iso' and data.dtype == 'datetime64[ns]'
-
     def _format_to_date(self, data):
-        if self._needs_to_date(data):
+
+        # iso
+        if self.date_format == 'iso':
             return data.apply(lambda x: x.isoformat())
-        return data
+
+        # int64
+        else:
+            return data.astype(np.int64)

     def copy_if_needed(self):
         """ copy myself if necessary """
@@ -87,13 +92,11 @@ def _format_axes(self):
             self.obj.index = self._format_to_date(self.obj.index.to_series())

     def _format_dates(self):
-        if self._needs_to_date(self.obj):
-            self.copy_if_needed()
+        if self.obj.dtype == 'datetime64[ns]':
             self.obj = self._format_to_date(self.obj)

     def _format_bools(self):
         if self._needs_to_bool(self.obj):
-            self.copy_if_needed()
             self.obj = self._format_to_bool(self.obj)

 class FrameWriter(Writer):
@@ -123,13 +126,22 @@ def _format_axes(self):
             setattr(self.obj,axis,self._format_to_date(a.to_series()))

     def _format_dates(self):
-        if self.date_format == 'iso':
-            dtypes = self.obj.dtypes
-            dtypes = dtypes[dtypes == 'datetime64[ns]']
-            if len(dtypes):
-                self.copy_if_needed()
-                for c in dtypes.index:
-                    self.obj[c] = self._format_to_date(self.obj[c])
+        dtypes = self.obj.dtypes
+        if len(dtypes[dtypes == 'datetime64[ns]']):
+
+            # need to create a new object
+            d = {}
+
+            for i, (col, c) in enumerate(self.obj.iteritems()):
+
+                if c.dtype == 'datetime64[ns]':
+                    c = self._format_to_date(c)
+
+                d[i] = c
+
+            d = DataFrame(d,index=self.obj.index)
+            d.columns = self.obj.columns
+            self.obj = d

 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
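
The dict built in ``_format_dates`` above is keyed by column *position*, not label; positions are always unique, so duplicate column labels cannot collide, and the original labels are restored afterwards. The same trick in isolation (``.items()`` is the modern spelling of the ``.iteritems()`` in the diff):

    import pandas as pd

    df = pd.DataFrame([[1, 2.5], [3, 4.5]], columns=['x', 'x'])

    # Assigning through df[col] would hit both 'x' columns at once;
    # positional keys sidestep that.
    d = {i: c for i, (col, c) in enumerate(df.items())}
    out = pd.DataFrame(d, index=df.index)
    out.columns = df.columns  # restore the duplicated labels
    print(out)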
@@ -291,14 +303,16 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
             except:
                 pass

-        if data.dtype == 'float':
-
-            # coerce floats to 64
-            try:
-                data = data.astype('float64')
-                result = True
-            except:
-                pass
+        if data.dtype.kind == 'f':
+
+            if data.dtype != 'float64':
+
+                # coerce floats to 64
+                try:
+                    data = data.astype('float64')
+                    result = True
+                except:
+                    pass

         # don't coerce 0-len data
         if len(data) and (data.dtype == 'float' or data.dtype == 'object'):
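
The ``dtype.kind == 'f'`` guard plus the inner ``!= 'float64'`` check means narrower floats get upcast while data already at float64 is left alone, avoiding a pointless ``astype``. A quick sketch of that behavior:

    import numpy as np

    for arr in (np.array([1.5, 2.5], dtype='float32'),
                np.array([1.5, 2.5], dtype='float64')):
        if arr.dtype.kind == 'f' and arr.dtype != 'float64':
            arr = arr.astype('float64')  # upcast narrower floats only
        print(arr.dtype)  # float64 both times, once via upcast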
@@ -448,14 +462,35 @@ def _parse_no_numpy(self):
         self.obj = DataFrame(
             loads(json, precise_float=self.precise_float), dtype=None)

+    def _process_converter(self, f, filt=None):
+        """ take a conversion function and possibly recreate the frame """
+
+        if filt is None:
+            filt = lambda col, c: True
+
+        needs_new_obj = False
+        new_obj = dict()
+        for i, (col, c) in enumerate(self.obj.iteritems()):
+            if filt(col, c):
+                new_data, result = f(col, c)
+                if result:
+                    c = new_data
+                    needs_new_obj = True
+            new_obj[i] = c
+
+        if needs_new_obj:
+
+            # possibly handle dup columns
+            new_obj = DataFrame(new_obj,index=self.obj.index)
+            new_obj.columns = self.obj.columns
+            self.obj = new_obj
+
     def _try_convert_types(self):
         if self.obj is None: return
         if self.convert_dates:
             self._try_convert_dates()
-        for col in self.obj.columns:
-            new_data, result = self._try_convert_data(col, self.obj[col], convert_dates=False)
-            if result:
-                self.obj[col] = new_data
+
+        self._process_converter(lambda col, c: self._try_convert_data(col, c, convert_dates=False))

     def _try_convert_dates(self):
         if self.obj is None: return
@@ -478,9 +513,6 @@ def is_ok(col):
                     return True
             return False

-        for col in self.obj.columns:
-            if (self.keep_default_dates and is_ok(col)) or col in convert_dates:
-                new_data, result = self._try_convert_to_date(self.obj[col])
-                if result:
-                    self.obj[col] = new_data
+        self._process_converter(lambda col, c: self._try_convert_to_date(c),
+                                lambda col, c: (self.keep_default_dates and is_ok(col)) or col in convert_dates)
15 changes: 15 additions & 0 deletions pandas/io/tests/test_json/test_pandas.py
@@ -83,6 +83,21 @@ def test_frame_non_unique_columns(self):
         unser = read_json(df.to_json(orient='values'), orient='values')
         np.testing.assert_equal(df.values, unser.values)

+        # GH4377; duplicate columns not processing correctly
+        df = DataFrame([['a','b'],['c','d']], index=[1,2], columns=['x','y'])
+        result = read_json(df.to_json(orient='split'), orient='split')
+        assert_frame_equal(result, df)
+
+        def _check(df):
+            result = read_json(df.to_json(orient='split'), orient='split', convert_dates=['x'])
+            assert_frame_equal(result, df)
+
+        for o in [[['a','b'],['c','d']],
+                  [[1.5,2.5],[3.5,4.5]],
+                  [[1,2.5],[3,4.5]],
+                  [[Timestamp('20130101'),3.5],[Timestamp('20130102'),4.5]]]:
+            _check(DataFrame(o, index=[1,2], columns=['x','x']))
+
     def test_frame_from_json_to_json(self):

         def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_dtype=True, raise_ok=None):
6 changes: 6 additions & 0 deletions pandas/tests/test_frame.py
@@ -2950,6 +2950,12 @@ def check(result, expected=None):
         expected = DataFrame([[1],[1],[1]],columns=['bar'])
         check(df,expected)

+        # values
+        df = DataFrame([[1,2.5],[3,4.5]], index=[1,2], columns=['x','x'])
+        result = df.values
+        expected = np.array([[1,2.5],[3,4.5]])
+        self.assert_((result == expected).all().all())
+
     def test_insert_benchmark(self):
         # from the vb_suite/frame_methods/frame_insert_columns
         N = 10
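
The ``values`` fix in a nutshell, runnable against a current pandas:

    import pandas as pd

    # Two columns share the label 'x' but differ in dtype; .values must
    # interleave them into one float64 array instead of raising.
    df = pd.DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
    print(df.dtypes)   # int64 and float64 under the same label
    print(df.values)   # [[1.  2.5]
                       #  [3.  4.5]]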