diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index da9264557931d..93d27a8bd4374 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -1759,6 +1759,7 @@ def infer(self, handler):
         """infer this column from the table: create and return a new object"""
         table = handler.table
         new_self = self.copy()
+        new_self._handle = handler._handle
         new_self.set_table(table)
         new_self.get_attr()
         new_self.read_metadata(handler)
@@ -1816,6 +1817,10 @@ def cvalues(self):
         """ return my cython values """
         return self.values
 
+    @property
+    def handle(self):
+        return self._handle
+
     def __iter__(self):
         return iter(self.values)
 
@@ -1838,6 +1843,7 @@ def validate_names(self):
         pass
 
     def validate_and_set(self, handler, append):
+        self._handle = handler._handle
         self.set_table(handler.table)
         self.validate_col()
         self.validate_attr(append)
@@ -2431,14 +2437,37 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None):
 
     def get_attr(self):
         """ get the data for this column """
+        # reading tables prior to 0.x.x
         self.values = getattr(self.attrs, self.kind_attr, None)
+
+        if self.values is None:
+            try:
+                data = self.handle.get_node(
+                    self.attrs._v_node._v_parent, self.kind_attr
+                )[:]
+                data = np.array(data, dtype="object")
+                # check for multiindex
+                if len(data.shape) > 1 and data.shape[1] > 1:
+                    self.values = list(map(tuple, data.tolist()))
+                else:
+                    self.values = data.tolist()
+            except _table_mod.exceptions.NoSuchNodeError:
+                pass
+
         self.dtype = getattr(self.attrs, self.dtype_attr, None)
         self.meta = getattr(self.attrs, self.meta_attr, None)
         self.set_kind()
 
     def set_attr(self):
         """ set the data for this column """
-        setattr(self.attrs, self.kind_attr, self.values)
+        group, key = self.attrs._v_node._v_parent, self.kind_attr
+        if key in group:
+            self.handle.remove_node(group, key)
+
+        vlarray = self.handle.create_vlarray(group, key, _tables().ObjectAtom())
+        for fld in self.values:
+            vlarray.append(fld)
+
         setattr(self.attrs, self.meta_attr, self.meta)
         if self.dtype is not None:
             setattr(self.attrs, self.dtype_attr, self.dtype)
@@ -3488,12 +3517,48 @@ def set_info(self):
         """ update our table index info """
         self.attrs.info = self.info
 
+    def set_non_index_axes(self):
+        """ Write the axes to vlarrays """
+        group = self.attrs._v_node
+
+        def f(dim, flds):
+            key = "non_index_axes_%d" % dim
+            if key in group:
+                self.handle.remove_node(group, key)
+
+            vlarray = self._handle.create_vlarray(group, key, _tables().ObjectAtom())
+            for fld in flds:
+                vlarray.append(fld)
+            return dim, key
+
+        replacement = [f(dim, flds) for dim, flds in self.non_index_axes]
+        self.attrs.non_index_axes = replacement
+
+    def get_non_index_axes(self):
+        """Load the non-index axes from their vlarrays."""
+
+        def f(dim, flds):
+            if isinstance(flds, str):
+                flds = self._handle.get_node(self.attrs._v_node, flds)[:]
+                flds = np.array(flds, dtype="object")
+                if len(flds.shape) > 1 and flds.shape[1] > 1:
+                    flds = list(map(tuple, flds.tolist()))
+                else:
+                    flds = flds.tolist()
+                return dim, flds
+            else:
+                return dim, flds  # if not a string presumably pre v0.xx list
+
+        non_index_axes = getattr(self.attrs, "non_index_axes", [])
+        new = [f(dim, flds) for dim, flds in non_index_axes]
+        return new
+
     def set_attrs(self):
         """ set our table type & indexables """
         self.attrs.table_type = str(self.table_type)
         self.attrs.index_cols = self.index_cols()
         self.attrs.values_cols = self.values_cols()
-        self.attrs.non_index_axes = self.non_index_axes
+        self.set_non_index_axes()
         self.attrs.data_columns = self.data_columns
         self.attrs.nan_rep = self.nan_rep
         self.attrs.encoding = self.encoding
@@ -3504,7 +3569,6 @@ def set_attrs(self):
 
     def get_attrs(self):
         """ retrieve our attributes """
-        self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
         self.data_columns = getattr(self.attrs, "data_columns", None) or []
         self.info = getattr(self.attrs, "info", None) or dict()
         self.nan_rep = getattr(self.attrs, "nan_rep", None)
@@ -3516,6 +3580,7 @@ def get_attrs(self):
             a.infer(self) for a in self.indexables if not a.is_an_indexable
         ]
         self.metadata = getattr(self.attrs, "metadata", None) or []
+        self.non_index_axes = self.get_non_index_axes()
 
     def validate_version(self, where=None):
         """ are we trying to operate on an old version? """
diff --git a/pandas/tests/io/data/legacy_hdf/legacy_table_table_format.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_table_format.h5
new file mode 100644
index 0000000000000..44c8e795c0b8d
Binary files /dev/null and b/pandas/tests/io/data/legacy_hdf/legacy_table_table_format.h5 differ
diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py
index d67f2c3b7bd66..c337cfd8e944d 100644
--- a/pandas/tests/io/pytables/test_pytables.py
+++ b/pandas/tests/io/pytables/test_pytables.py
@@ -874,8 +874,10 @@ def test_complibs_default_settings(self):
                     assert node.filters.complevel == 0
                     assert node.filters.complib is None
                 for node in h5file.walk_nodes(where="/dfc", classname="Leaf"):
-                    assert node.filters.complevel == 9
-                    assert node.filters.complib == "blosc"
+                    # only check table, skip column
+                    if node.name == "table":
+                        assert node.filters.complevel == 9
+                        assert node.filters.complib == "blosc"
 
     def test_complibs(self):
         # GH14478
@@ -4928,6 +4930,62 @@ def test_to_hdf_multiindex_extension_dtype(self, idx):
             with pytest.raises(NotImplementedError, match="Saving a MultiIndex"):
                 df.to_hdf(path, "df")
 
+    def test_wide_table_format(self):
+        # GH 26135
+        # test storing wide dataframes with in table format
+
+        df = DataFrame(np.random.random((10, 10000)))
+
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, "df", format="table")
+            reread = read_hdf(path, "df")
+            assert_frame_equal(df, reread)
+
+    def test_append_wide_table_format(self):
+        # GH 26135
+        # test append to hdf with wide dataframe
+
+        df1 = DataFrame(np.random.random((10, 10000)))
+        df2 = DataFrame(np.random.random((10, 10000)))
+
+        with ensure_clean_path(self.path) as path:
+            df1.to_hdf(path, "df", format="table")
+            df2.to_hdf(path, "df", append=True)
+            reread = read_hdf(path)
+            assert_frame_equal(pd.concat([df1, df2]), reread)
+
+    def test_legacy_table_table_format_read(self, datapath):
+        # GH 26135
+        # test read of legacy table with table format and column
+        # saved as pytables metadata
+
+        column_numeric = [1, 2, 3, 4]
+        column_str_1 = ["A", "B", "C", "D"]
+        column_str_2 = ["Ä", "Ö", "Â", "é"]
+        column_dt = pd.date_range("19700101", "19700104")
+        column_multi_1 = pd.MultiIndex.from_tuples(zip(column_numeric, column_str_1))
+        column_multi_2 = pd.MultiIndex.from_tuples(zip(column_str_2, column_dt))
+
+        columns = [
+            column_numeric,
+            column_str_1,
+            column_str_2,
+            column_dt,
+            column_multi_1,
+            column_multi_2,
+        ]
+
+        data = np.arange(0, 16).reshape(4, 4).astype("int64")
+
+        with ensure_clean_store(
+            datapath("io", "data", "legacy_hdf", "legacy_table_table_format.h5"),
+            mode="r",
+        ) as store:
+            for i, column in enumerate(columns):
+                table_name = "table_{}".format(i)
+                df = pd.DataFrame(data, columns=column)
+                tm.assert_frame_equal(store[table_name], df)
+
 
 class TestHDFComplexValues(Base):
     # GH10447