Skip to content

Commit e0680ec

Browse files
committed
Merge pull request #8793 from jreback/cat_hdf
ENH: serialization of categorical to HDF5 (GH7621)
2 parents d881ba2 + fa378ab commit e0680ec

File tree

8 files changed

+317
-68
lines changed

8 files changed

+317
-68
lines changed

Diff for: doc/source/categorical.rst

+6-3
Original file line numberDiff line numberDiff line change
@@ -541,8 +541,12 @@ The same applies to ``df.append(df_different)``.
541541
Getting Data In/Out
542542
-------------------
543543

544-
Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently
545-
raise ``NotImplementedError``.
544+
.. versionadded:: 0.15.2
545+
546+
Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype was implemented
547+
in 0.15.2. See :ref:`here <io.hdf5-categorical>` for an example and caveats.
548+
549+
Writing data to/from Stata format files was implemented in 0.15.2.
546550

547551
Writing to a CSV file will convert the data, effectively removing any information about the
548552
categorical (categories and ordering). So if you read back the CSV file you have to convert the
@@ -805,4 +809,3 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categorical
805809
This also happens in some cases when you supply a `numpy` array instead of a `Categorical`:
806810
using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, while using
807811
a string array (e.g. ``np.array(["a","b","c","a"])``) will not.
808-

Diff for: doc/source/io.rst

+46
Original file line numberDiff line numberDiff line change
@@ -3091,6 +3091,50 @@ conversion may not be necessary in future versions of pandas)
30913091
df
30923092
df.dtypes
30933093
3094+
.. _io.hdf5-categorical:
3095+
3096+
Categorical Data
3097+
~~~~~~~~~~~~~~~~
3098+
3099+
.. versionadded:: 0.15.2
3100+
3101+
Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype was implemented
3102+
in 0.15.2. Queries work the same as if it were an object array (but the ``Categorical`` is stored in a more efficient manner).
3103+
3104+
.. ipython:: python
3105+
3106+
dfcat = DataFrame({ 'A' : Series(list('aabbcdba')).astype('category'),
3107+
'B' : np.random.randn(8) })
3108+
cstore = pd.HDFStore('cats.h5', mode='w')
3109+
cstore.append('dfcat', dfcat, format='table', data_columns=['A'])
3110+
result = cstore.select('dfcat', where="A in ['b','c']")
3111+
result
3112+
result.dtypes
3113+
3114+
.. warning::
3115+
3116+
The format of the ``Categoricals`` is readable by prior versions of pandas (< 0.15.2), but will retrieve
3117+
the data as an integer based column (e.g. the ``codes``). However, the ``categories`` *can* be retrieved
3118+
but require the user to select them manually using the explicit meta path.
3119+
3120+
The data is stored like so:
3121+
3122+
.. ipython:: python
3123+
3124+
cstore
3125+
3126+
# to get the categories
3127+
cstore.select('dfcat/meta/A/meta')
3128+
3129+
.. ipython:: python
3130+
:suppress:
3131+
:okexcept:
3132+
3133+
cstore.close()
3134+
import os
3135+
os.remove('cats.h5')
3136+
3137+
30943138
String Columns
30953139
~~~~~~~~~~~~~~
30963140

@@ -3660,6 +3704,8 @@ outside of this range, the data is cast to ``int16``.
36603704
data frames containing categorical data will convert non-string categorical values
36613705
to strings.
36623706

3707+
Writing data to/from Stata format files with a ``category`` dtype was implemented in 0.15.2.
3708+
36633709
.. _io.stata_reader:
36643710

36653711
Reading from STATA format

Diff for: doc/source/whatsnew/v0.15.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Enhancements
4242
~~~~~~~~~~~~
4343

4444
- Added ability to export Categorical data to Stata (:issue:`8633`).
45+
- Added ability to export Categorical data to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array (but a ``Categorical`` is stored in a much more efficient manner). See :ref:`here <io.hdf5-categorical>` for an example and caveats w.r.t. prior versions of pandas.
4546

4647
.. _whatsnew_0152.performance:
4748

Diff for: pandas/computation/pytables.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,17 @@ def is_in_table(self):
147147
@property
148148
def kind(self):
149149
""" the kind of my field """
150-
return self.queryables.get(self.lhs)
150+
return getattr(self.queryables.get(self.lhs),'kind',None)
151+
152+
@property
153+
def meta(self):
154+
""" the meta of my field """
155+
return getattr(self.queryables.get(self.lhs),'meta',None)
156+
157+
@property
158+
def metadata(self):
159+
""" the metadata of my field """
160+
return getattr(self.queryables.get(self.lhs),'metadata',None)
151161

152162
def generate(self, v):
153163
""" create and return the op string for this TermValue """
@@ -167,6 +177,7 @@ def stringify(value):
167177
return encoder(value)
168178

169179
kind = _ensure_decoded(self.kind)
180+
meta = _ensure_decoded(self.meta)
170181
if kind == u('datetime64') or kind == u('datetime'):
171182
if isinstance(v, (int, float)):
172183
v = stringify(v)
@@ -182,6 +193,10 @@ def stringify(value):
182193
elif kind == u('timedelta64') or kind == u('timedelta'):
183194
v = _coerce_scalar_to_timedelta_type(v, unit='s').value
184195
return TermValue(int(v), v, kind)
196+
elif meta == u('category'):
197+
metadata = com._values_from_object(self.metadata)
198+
result = metadata.searchsorted(v,side='left')
199+
return TermValue(result, result, u('integer'))
185200
elif kind == u('integer'):
186201
v = int(float(v))
187202
return TermValue(v, v, kind)

Diff for: pandas/core/categorical.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,15 @@ def ndim(self):
319319
"""Number of dimensions of the Categorical """
320320
return self._codes.ndim
321321

322+
def reshape(self, new_shape, **kwargs):
323+
""" compat with .reshape """
324+
return self
325+
326+
@property
327+
def base(self):
328+
""" compat, we are always our own object """
329+
return None
330+
322331
@classmethod
323332
def from_array(cls, data, **kwargs):
324333
"""
@@ -363,10 +372,9 @@ def from_codes(cls, codes, categories, ordered=False, name=None):
363372

364373
categories = cls._validate_categories(categories)
365374

366-
if codes.max() >= len(categories) or codes.min() < -1:
375+
if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
367376
raise ValueError("codes need to be between -1 and len(categories)-1")
368377

369-
370378
return Categorical(codes, categories=categories, ordered=ordered, name=name, fastpath=True)
371379

372380
_codes = None

Diff for: pandas/core/internals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4381,7 +4381,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
43814381
else:
43824382
fill_value = upcasted_na
43834383

4384-
if self.is_null:
4384+
if self.is_null and not getattr(self.block,'is_categorical',None):
43854385
missing_arr = np.empty(self.shape, dtype=empty_dtype)
43864386
if np.prod(self.shape):
43874387
# NumPy 1.6 workaround: this statement gets strange if all

0 commit comments

Comments
 (0)