Skip to content

Commit c293fd7

Browse files
committed
BUG: Bug in to_json with certain orients and a CategoricalIndex would segfault pandas-dev#10307
1 parent 2619889 commit c293fd7

File tree

3 files changed

+96
-34
lines changed

3 files changed

+96
-34
lines changed

doc/source/whatsnew/v0.16.2.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ Bug Fixes
120120
- Bug where read_hdf store.select modifies the passed columns list when
121121
multi-indexed (:issue:`7212`)
122122
- Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
123-
123+
- Bug in ``to_json`` with certain orients and a ``CategoricalIndex`` would segfault (:issue:`10307`)
124124
- Bug where some of the nan funcs do not have consistent return dtypes (:issue:`10251`)
125125

126126
- Bug in ``DataFrame.quantile`` on checking that a valid axis was passed (:issue:`9543`)

pandas/io/json.py

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from pandas import compat, isnull
1212
from pandas import Series, DataFrame, to_datetime
1313
from pandas.io.common import get_filepath_or_buffer
14-
from pandas.core.common import AbstractMethodError
14+
from pandas.core.common import AbstractMethodError, is_categorical_dtype
1515
import pandas.core.common as com
1616

1717
loads = _json.loads
@@ -60,11 +60,32 @@ def __init__(self, obj, orient, date_format, double_precision,
6060
self.ensure_ascii = ensure_ascii
6161
self.date_unit = date_unit
6262
self.default_handler = default_handler
63+
self._coerce_axes()
64+
self._coerce_data()
6365

64-
self.is_copy = None
65-
self._format_axes()
66+
def _coerce_axes(self):
67+
for i in range(self.obj._AXIS_LEN):
68+
self._coerce_axis(i)
6669

67-
def _format_axes(self):
70+
def _coerce_axis(self, axis):
71+
"""
72+
Parameters
73+
----------
74+
axis : axis number
75+
76+
if the axis needs coercion, then copy the .obj
77+
and set that axis to a plain (non-categorical) array
78+
79+
"""
80+
81+
# GH 10317
82+
# coerce CategoricalIndexes to Index dtypes
83+
ax = self.obj._get_axis(axis)
84+
if is_categorical_dtype(ax):
85+
self.obj = self.obj.copy()
86+
self.obj.set_axis(axis, np.array(ax))
87+
88+
def _coerce_data(self):
6889
raise AbstractMethodError(self)
6990

7091
def write(self):
@@ -81,16 +102,20 @@ def write(self):
81102
class SeriesWriter(Writer):
82103
_default_orient = 'index'
83104

84-
def _format_axes(self):
105+
def _coerce_axes(self):
85106
if not self.obj.index.is_unique and self.orient == 'index':
86107
raise ValueError("Series index must be unique for orient="
87108
"'%s'" % self.orient)
109+
super(SeriesWriter, self)._coerce_axes()
88110

111+
def _coerce_data(self):
112+
if is_categorical_dtype(self.obj):
113+
self.obj = np.array(self.obj)
89114

90115
class FrameWriter(Writer):
91116
_default_orient = 'columns'
92117

93-
def _format_axes(self):
118+
def _coerce_axes(self):
94119
""" check that the axes are unique where the orient requires it, then coerce them """
95120
if not self.obj.index.is_unique and self.orient in (
96121
'index', 'columns'):
@@ -100,7 +125,16 @@ def _format_axes(self):
100125
'index', 'columns', 'records'):
101126
raise ValueError("DataFrame columns must be unique for orient="
102127
"'%s'." % self.orient)
128+
super(FrameWriter, self)._coerce_axes()
129+
130+
def _coerce_data(self):
103131

132+
is_copy = False
133+
for c, col in self.obj.iteritems():
134+
if is_categorical_dtype(col):
135+
if not is_copy:
136+
is_copy, self.obj = True, self.obj.copy()
137+
self.obj[c] = np.array(col)
104138

105139
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
106140
convert_axes=True, convert_dates=True, keep_default_dates=True,

pandas/io/tests/test_json/test_pandas.py

Lines changed: 55 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import os
55

66
import numpy as np
7-
from pandas import Series, DataFrame, DatetimeIndex, Timestamp
7+
from pandas import Series, DataFrame, DatetimeIndex, Timestamp, CategoricalIndex
88
from datetime import timedelta
99
import pandas as pd
1010
read_json = pd.read_json
@@ -23,6 +23,11 @@
2323
for k, v in compat.iteritems(_seriesd)))
2424

2525
_tsframe = DataFrame(_tsd)
26+
_cat_frame = _frame.copy()
27+
cat = ['bah']*5 + ['bar']*5 + ['baz']*5 + ['foo']*(len(_cat_frame)-15)
28+
_cat_frame.index = pd.CategoricalIndex(cat,name='E')
29+
_cat_frame['E'] = list(reversed(cat))
30+
_cat_frame['sort'] = np.arange(len(_cat_frame))
2631

2732
_mixed_frame = _frame.copy()
2833

@@ -48,6 +53,7 @@ def setUp(self):
4853
self.intframe = _intframe.copy()
4954
self.tsframe = _tsframe.copy()
5055
self.mixed_frame = _mixed_frame.copy()
56+
self.categorical = _cat_frame.copy()
5157

5258
def tearDown(self):
5359
del self.dirpath
@@ -128,8 +134,22 @@ def _check(df):
128134

129135
def test_frame_from_json_to_json(self):
130136
def _check_orient(df, orient, dtype=None, numpy=False,
131-
convert_axes=True, check_dtype=True, raise_ok=None):
132-
df = df.sort()
137+
convert_axes=True, check_dtype=True, raise_ok=None,
138+
sort=None):
139+
if sort is not None:
140+
df = df.sort(sort)
141+
else:
142+
df = df.sort()
143+
144+
# if we are not unique, then check that we are raising ValueError
145+
# for the appropriate orients
146+
if not df.index.is_unique and orient in ['index','columns']:
147+
self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
148+
return
149+
if not df.columns.is_unique and orient in ['index','columns','records']:
150+
self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
151+
return
152+
133153
dfjson = df.to_json(orient=orient)
134154

135155
try:
@@ -141,7 +161,10 @@ def _check_orient(df, orient, dtype=None, numpy=False,
141161
return
142162
raise
143163

144-
unser = unser.sort()
164+
if sort is not None and sort in unser.columns:
165+
unser = unser.sort(sort)
166+
else:
167+
unser = unser.sort()
145168

146169
if dtype is False:
147170
check_dtype=False
@@ -160,7 +183,9 @@ def _check_orient(df, orient, dtype=None, numpy=False,
160183
# index and col labels might not be strings
161184
unser.index = [str(i) for i in unser.index]
162185
unser.columns = [str(i) for i in unser.columns]
163-
unser = unser.sort()
186+
187+
if sort is None:
188+
unser = unser.sort()
164189
assert_almost_equal(df.values, unser.values)
165190
else:
166191
if convert_axes:
@@ -169,45 +194,45 @@ def _check_orient(df, orient, dtype=None, numpy=False,
169194
assert_frame_equal(df, unser, check_less_precise=False,
170195
check_dtype=check_dtype)
171196

172-
def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
197+
def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, sort=None):
173198

174199
# numpy=False
175200
if convert_axes:
176-
_check_orient(df, "columns", dtype=dtype)
177-
_check_orient(df, "records", dtype=dtype)
178-
_check_orient(df, "split", dtype=dtype)
179-
_check_orient(df, "index", dtype=dtype)
180-
_check_orient(df, "values", dtype=dtype)
181-
182-
_check_orient(df, "columns", dtype=dtype, convert_axes=False)
183-
_check_orient(df, "records", dtype=dtype, convert_axes=False)
184-
_check_orient(df, "split", dtype=dtype, convert_axes=False)
185-
_check_orient(df, "index", dtype=dtype, convert_axes=False)
186-
_check_orient(df, "values", dtype=dtype ,convert_axes=False)
201+
_check_orient(df, "columns", dtype=dtype, sort=sort)
202+
_check_orient(df, "records", dtype=dtype, sort=sort)
203+
_check_orient(df, "split", dtype=dtype, sort=sort)
204+
_check_orient(df, "index", dtype=dtype, sort=sort)
205+
_check_orient(df, "values", dtype=dtype, sort=sort)
206+
207+
_check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort)
208+
_check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort)
209+
_check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort)
210+
_check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort)
211+
_check_orient(df, "values", dtype=dtype ,convert_axes=False, sort=sort)
187212

188213
# numpy=True: raise_ok may be set (not None), in which case the expected error is ignored
189214
if convert_axes:
190215
_check_orient(df, "columns", dtype=dtype, numpy=True,
191-
raise_ok=raise_ok)
216+
raise_ok=raise_ok, sort=sort)
192217
_check_orient(df, "records", dtype=dtype, numpy=True,
193-
raise_ok=raise_ok)
218+
raise_ok=raise_ok, sort=sort)
194219
_check_orient(df, "split", dtype=dtype, numpy=True,
195-
raise_ok=raise_ok)
220+
raise_ok=raise_ok, sort=sort)
196221
_check_orient(df, "index", dtype=dtype, numpy=True,
197-
raise_ok=raise_ok)
222+
raise_ok=raise_ok, sort=sort)
198223
_check_orient(df, "values", dtype=dtype, numpy=True,
199-
raise_ok=raise_ok)
224+
raise_ok=raise_ok, sort=sort)
200225

201226
_check_orient(df, "columns", dtype=dtype, numpy=True,
202-
convert_axes=False, raise_ok=raise_ok)
227+
convert_axes=False, raise_ok=raise_ok, sort=sort)
203228
_check_orient(df, "records", dtype=dtype, numpy=True,
204-
convert_axes=False, raise_ok=raise_ok)
229+
convert_axes=False, raise_ok=raise_ok, sort=sort)
205230
_check_orient(df, "split", dtype=dtype, numpy=True,
206-
convert_axes=False, raise_ok=raise_ok)
231+
convert_axes=False, raise_ok=raise_ok, sort=sort)
207232
_check_orient(df, "index", dtype=dtype, numpy=True,
208-
convert_axes=False, raise_ok=raise_ok)
233+
convert_axes=False, raise_ok=raise_ok, sort=sort)
209234
_check_orient(df, "values", dtype=dtype, numpy=True,
210-
convert_axes=False, raise_ok=raise_ok)
235+
convert_axes=False, raise_ok=raise_ok, sort=sort)
211236

212237
# basic
213238
_check_all_orients(self.frame)
@@ -233,6 +258,9 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
233258
_check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
234259
convert_axes=False, raise_ok=ValueError)
235260

261+
# categorical
262+
_check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)
263+
236264
# empty
237265
_check_all_orients(self.empty_frame)
238266

0 commit comments

Comments
 (0)