Merge pull request #8793 from jreback/cat_hdf

jreback · jreback · commit e0680eca0931 · 2014-11-16T19:34:49.000-05:00
ENH: serialization of categorical to HDF5 (GH7621)
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -541,8 +541,12 @@ The same applies to ``df.append(df_different)``.
 Getting Data In/Out
 -------------------
 
-Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently
-raise ``NotImplementedError``.
+.. versionadded:: 0.15.2
+
+Writing data (`Series`, `Frames`) that contains a ``category`` dtype to an HDF store was implemented
+in 0.15.2. See :ref:`here <io.hdf5-categorical>` for an example and caveats.
+
+Reading and writing categorical data from/to Stata format files was implemented in 0.15.2.
 
 Writing to a CSV file will convert the data, effectively removing any information about the
 categorical (categories and ordering). So if you read back the CSV file you have to convert the
@@ -805,4 +809,3 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categorical
     This also happens in some cases when you supply a `numpy` array instead of a `Categorical`:
     using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, while using
     a string array (e.g. ``np.array(["a","b","c","a"])``) will not.
-
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -3091,6 +3091,50 @@ conversion may not be necessary in future versions of pandas)
        df
        df.dtypes
 
+.. _io.hdf5-categorical:
+
+Categorical Data
+~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.15.2
+
+Writing data (`Series`, `Frames`) that contains a ``category`` dtype to an HDF store was implemented
+in 0.15.2. Queries work the same as if it were an object array (but the ``Categorical`` is stored in a more efficient manner).
+
+.. ipython:: python
+
+   dfcat = DataFrame({ 'A' : Series(list('aabbcdba')).astype('category'),
+                       'B' : np.random.randn(8) })
+   cstore = pd.HDFStore('cats.h5', mode='w')
+   cstore.append('dfcat', dfcat, format='table', data_columns=['A'])
+   result = cstore.select('dfcat', where="A in ['b','c']")
+   result
+   result.dtypes
+
+.. warning::
+
+   The format of the ``Categoricals`` is readable by prior versions of pandas (< 0.15.2), but they will retrieve
+   the data as an integer based column (i.e. the ``codes``). However, the ``categories`` *can* be retrieved,
+   but this requires the user to select them manually using the explicit meta path.
+
+   The data is stored like so:
+
+   .. ipython:: python
+
+      cstore
+
+      # to get the categories
+      cstore.select('dfcat/meta/A/meta')
+
+.. ipython:: python
+   :suppress:
+   :okexcept:
+
+   cstore.close()
+   import os
+   os.remove('cats.h5')
+
+
 String Columns
 ~~~~~~~~~~~~~~
 
@@ -3660,6 +3704,8 @@ outside of this range, the data is cast to ``int16``.
   data frames containing categorical data will convert non-string categorical values
   to strings.
 
+Reading and writing data with a ``category`` dtype from/to Stata format files was implemented in 0.15.2.
+
 .. _io.stata_reader:
 
 Reading from STATA format
diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
@@ -42,6 +42,7 @@ Enhancements
 ~~~~~~~~~~~~
 
 - Added ability to export Categorical data to Stata (:issue:`8633`).
+- Added ability to read and write Categorical data from/to HDF5 (:issue:`7621`). Queries work the same as if it were an object array (but a ``Categorical`` is stored in a much more efficient manner). See :ref:`here <io.hdf5-categorical>` for an example and caveats w.r.t. prior versions of pandas.
 
 .. _whatsnew_0152.performance:
 
diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py
@@ -147,7 +147,17 @@ def is_in_table(self):
     @property
     def kind(self):
         """ the kind of my field """
-        return self.queryables.get(self.lhs)
+        return getattr(self.queryables.get(self.lhs),'kind',None)
+
+    @property
+    def meta(self):
+        """ the meta of my field """
+        return getattr(self.queryables.get(self.lhs),'meta',None)
+
+    @property
+    def metadata(self):
+        """ the metadata of my field """
+        return getattr(self.queryables.get(self.lhs),'metadata',None)
 
     def generate(self, v):
         """ create and return the op string for this TermValue """
@@ -167,6 +177,7 @@ def stringify(value):
             return encoder(value)
 
         kind = _ensure_decoded(self.kind)
+        meta = _ensure_decoded(self.meta)
         if kind == u('datetime64') or kind == u('datetime'):
             if isinstance(v, (int, float)):
                 v = stringify(v)
@@ -182,6 +193,10 @@ def stringify(value):
         elif kind == u('timedelta64') or kind == u('timedelta'):
             v = _coerce_scalar_to_timedelta_type(v, unit='s').value
             return TermValue(int(v), v, kind)
+        elif meta == u('category'):
+            metadata = com._values_from_object(self.metadata)
+            result = metadata.searchsorted(v,side='left')
+            return TermValue(result, result, u('integer'))
         elif kind == u('integer'):
             v = int(float(v))
             return TermValue(v, v, kind)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -319,6 +319,15 @@ def ndim(self):
         """Number of dimensions of the Categorical """
         return self._codes.ndim
 
+    def reshape(self, new_shape, **kwargs):
+        """ compat with .reshape """
+        return self
+
+    @property
+    def base(self):
+        """ compat, we are always our own object """
+        return None
+
     @classmethod
     def from_array(cls, data, **kwargs):
         """
@@ -363,10 +372,9 @@ def from_codes(cls, codes, categories, ordered=False, name=None):
 
         categories = cls._validate_categories(categories)
 
-        if codes.max() >= len(categories) or codes.min() < -1:
+        if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
             raise ValueError("codes need to be between -1 and len(categories)-1")
 
-
         return Categorical(codes, categories=categories, ordered=ordered, name=name, fastpath=True)
 
     _codes = None
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -4381,7 +4381,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
         else:
             fill_value = upcasted_na
 
-            if self.is_null:
+            if self.is_null and not getattr(self.block,'is_categorical',None):
                 missing_arr = np.empty(self.shape, dtype=empty_dtype)
                 if np.prod(self.shape):
                     # NumPy 1.6 workaround: this statement gets strange if all
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py