Skip to content

ENH: allow saving of meta-data via CArrays to support wide tables #11788

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 71 additions & 4 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
import pandas.algos as algos
import pandas.tslib as tslib

from tables.exceptions import NoSuchNodeError, NodeError

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update the version info

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

0.18?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep

from contextlib import contextmanager
from distutils.version import LooseVersion

Expand Down Expand Up @@ -1455,6 +1457,7 @@ def infer(self, handler):
"""infer this column from the table: create and return a new object"""
table = handler.table
new_self = self.copy()
new_self._handle = handler._handle
new_self.set_table(table)
new_self.get_attr()
new_self.read_metadata(handler)
Expand Down Expand Up @@ -1511,6 +1514,10 @@ def cvalues(self):
""" return my cython values """
return self.values

@property
def handle(self):
    """ return the ``_handle`` copied from the handler by ``infer`` or
    ``validate_and_set`` """
    current_handle = self._handle
    return current_handle

def __iter__(self):
    """ iterate over the values held by this object """
    current_values = self.values
    return iter(current_values)

Expand All @@ -1534,6 +1541,7 @@ def validate_names(self):
pass

def validate_and_set(self, handler, append, **kwargs):
self._handle = handler._handle
self.set_table(handler.table)
self.validate_col()
self.validate_attr(append)
Expand Down Expand Up @@ -2043,13 +2051,35 @@ def convert(self, values, nan_rep, encoding):
def get_attr(self):
    """ get the data for this column

    The values are taken from the ``kind_attr`` attribute when it is set.
    Otherwise they are read from the node named ``kind_attr`` under the
    parent of the attribute node, which is where ``set_attr`` creates the
    carray. If that node does not exist either, ``values`` stays None.
    The dtype and meta attributes are then loaded and ``set_kind`` is called.
    """
    self.values = getattr(self.attrs, self.kind_attr, None)
    if self.values is None:
        try:
            data = self.handle.get_node(self.attrs._v_node._v_parent,
                                        self.kind_attr)[:]
        except NoSuchNodeError:
            # nothing stored under this name: leave values as None
            pass
        else:
            if len(data.shape) > 1 and data.shape[1] > 1:  # multiIndex
                # build a real list of tuples; a bare map() would be a
                # one-shot iterator on Python 3
                self.values = [tuple(row) for row in data.tolist()]
            else:
                self.values = data.tolist()
    self.dtype = getattr(self.attrs, self.dtype_attr, None)
    self.meta = getattr(self.attrs, self.meta_attr, None)
    self.set_kind()

def set_attr(self):
    """ set the data for this column

    The values are written as a carray named ``kind_attr`` under the parent
    of the attribute node, not as a node attribute; ``get_attr`` falls back
    to that node when the attribute is missing. The meta attribute is always
    written and the dtype attribute only when a dtype is set.
    """
    parent = self.attrs._v_node._v_parent
    data = np.array(self.values)
    try:
        self.handle.create_carray(parent, self.kind_attr, obj=data)
    except NodeError:
        # presumably a node of this name already exists (TODO confirm that
        # this is the only NodeError raised here): replace it
        self.handle.remove_node(parent, self.kind_attr)
        self.handle.create_carray(parent, self.kind_attr, obj=data)
    setattr(self.attrs, self.meta_attr, self.meta)
    if self.dtype is not None:
        setattr(self.attrs, self.dtype_attr, self.dtype)
Expand Down Expand Up @@ -3020,12 +3050,50 @@ def set_info(self):
""" update our table index info """
self.attrs.info = self.info

def set_non_index_axes(self):
    """ Write the axes to carrays

    Each ``(dim, flds)`` pair in ``self.non_index_axes`` is written as a
    carray named ``non_index_axes_<dim>`` under the attribute node. The
    ``non_index_axes`` attribute then records ``(dim, <carray name>)`` so
    that ``get_non_index_axes`` loads the fields back from the carray
    instead of from the attribute. ``self.non_index_axes`` is not modified.
    """
    # NOTE(review): the reviewer asked for _convert_index on the data here
    # (and _unconvert_index when reading) to get proper type conversions.
    # The author reported that the round trip is not yet correct for an
    # array of Timestamps, so the plain np.array conversion is kept.
    # NOTE(review): a ValueError from create_carray propagates unchanged;
    # the author's note mentions "unknown type: 'object'" as the message seen
    # for an axis with dtype 'object'. A clearer message could be raised.
    parent = self.attrs._v_node
    stored = []
    for dim, flds in self.non_index_axes:
        name = "non_index_axes_%d" % dim
        data = np.array(flds)
        try:
            self._handle.create_carray(parent, name, obj=data)
        except NodeError:
            # presumably the carray already exists (TODO confirm): replace
            self._handle.remove_node(parent, name)
            self._handle.create_carray(parent, name, obj=data)
        stored.append((dim, name))
    self.attrs.non_index_axes = stored

def get_non_index_axes(self):
    """Load the non-index axes from their carrays.

    The ``non_index_axes`` attribute holds ``(dim, flds)`` pairs. When
    ``flds`` is a string it is used as the name of a node under the
    attribute node and the fields are read from that node. Any other value
    is presumably the list written by an earlier version and is passed
    through unchanged. A missing or empty attribute gives an empty list.
    """
    def load(flds):
        if not isinstance(flds, string_types):
            # not a node name: presumably a legacy in-attribute list
            return flds
        data = self._handle.get_node(self.attrs._v_node, flds)[:]
        if len(data.shape) > 1 and data.shape[1] > 1:
            # build a real list of tuples; a bare map() would be a
            # one-shot iterator on Python 3
            return [tuple(row) for row in data.tolist()]
        return data.tolist()

    # ``or []`` also covers an attribute that is present but None
    stored = getattr(self.attrs, 'non_index_axes', None) or []
    return [(dim, load(flds)) for dim, flds in stored]

def set_attrs(self):
""" set our table type & indexables """
self.attrs.table_type = str(self.table_type)
self.attrs.index_cols = self.index_cols()
self.attrs.values_cols = self.values_cols()
self.attrs.non_index_axes = self.non_index_axes

#self.attrs.non_index_axes = self.non_index_axes
self.set_non_index_axes()
self.attrs.data_columns = self.data_columns
self.attrs.nan_rep = self.nan_rep
self.attrs.encoding = self.encoding
Expand All @@ -3035,8 +3103,7 @@ def set_attrs(self):

def get_attrs(self):
""" retrieve our attributes """
self.non_index_axes = getattr(
self.attrs, 'non_index_axes', None) or []
self.non_index_axes = self.get_non_index_axes()
self.data_columns = getattr(
self.attrs, 'data_columns', None) or []
self.info = getattr(
Expand Down
Binary file not shown.
18 changes: 18 additions & 0 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4606,6 +4606,22 @@ def test_read_nokey(self):
df.to_hdf(path, 'df2', mode='a')
self.assertRaises(ValueError, read_hdf, path)

def test_legacy_non_index_axes(self):
    # A file whose non_index_axes were stored by an earlier version must
    # still be readable and give the expected frame.
    # NOTE(review): the fixture was presumably produced with
    # df_new.to_hdf(filename, "df", format="table") -- confirm before
    # regenerating it.
    filename = tm.get_data_path(
        'legacy_hdf/legacy_non_index_axes_0.17.1.h5')
    df_legacy = read_hdf(filename, 'df')

    index = pd.date_range(start=Timestamp("2015-11-01 0:00"), freq="H",
                          periods=3, tz=None)
    columns = MultiIndex(levels=[['A', 'B', 'C', 'D'],
                                 [1, 2, 3]],
                         labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                 [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                         names=['alpha', 'num'])
    data = np.array([index.asi8 + i for i in range(10)])
    df_new = DataFrame(data.T, columns=columns, index=index)

    tm.assert_frame_equal(df_legacy, df_new)

class TestHDFComplexValues(Base):
# GH10447
Expand Down Expand Up @@ -5025,5 +5041,7 @@ def _test_sort(obj):

if __name__ == '__main__':
    import nose

    # run this module's tests under nose with the flags listed below
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)