CLN: Python 2 pickle/hdf support #57387

Merged
merged 5 commits on Feb 14, 2024
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -88,6 +88,7 @@ Other API changes
^^^^^^^^^^^^^^^^^
- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
- :attr:`MultiIndex.codes`, :attr:`MultiIndex.levels`, and :attr:`MultiIndex.names` now return a ``tuple`` instead of a ``FrozenList`` (:issue:`53531`)
+- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
-

.. ---------------------------------------------------------------------------
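For users still holding Python 2 era files, a one-time migration remains possible on an environment that predates this change. A minimal sketch, assuming pandas < 3.0 is installed; the file names are illustrative:

```python
# Run once under pandas < 3.0, which still ships the Python 2 compat
# readers; the rewritten files then load cleanly on pandas 3.0+.
import pandas as pd

df = pd.read_pickle("written_by_py2.pkl")  # hypothetical legacy pickle
df.to_pickle("migrated.pkl")  # re-written with a current protocol

hdf = pd.read_hdf("written_by_py2.h5", "df")  # hypothetical legacy HDF5 file
hdf.to_hdf("migrated.h5", key="df", mode="w")
```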
25 changes: 10 additions & 15 deletions pandas/io/pickle.py
@@ -184,6 +184,7 @@ def read_pickle(
    3    3    8
    4    4    9
    """
+    # TypeError for Cython complaints about object.__new__ vs Tick.__new__
    excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
    with get_handle(
        filepath_or_buffer,
@@ -194,20 +195,14 @@
    ) as handles:
        # 1) try standard library Pickle
        # 2) try pickle_compat (older pandas version) to handle subclass changes
-        # 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError

        try:
-            # TypeError for Cython complaints about object.__new__ vs Tick.__new__
-            try:
-                with warnings.catch_warnings(record=True):
-                    # We want to silence any warnings about, e.g. moved modules.
-                    warnings.simplefilter("ignore", Warning)
-                    return pickle.load(handles.handle)
-            except excs_to_catch:
-                # e.g.
-                #  "No module named 'pandas.core.sparse.series'"
-                #  "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
-                return pc.load(handles.handle, encoding=None)
-        except UnicodeDecodeError:
-            # e.g. can occur for files written in py27; see GH#28645 and GH#31988
-            return pc.load(handles.handle, encoding="latin-1")
+            with warnings.catch_warnings(record=True):
+                # We want to silence any warnings about, e.g. moved modules.
+                warnings.simplefilter("ignore", Warning)
+                return pickle.load(handles.handle)
+        except excs_to_catch:
+            # e.g.
+            #  "No module named 'pandas.core.sparse.series'"
+            #  "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
+            return pc.load(handles.handle, encoding=None)
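With the latin-1 retry removed, a `UnicodeDecodeError` raised while unpickling a Python 2 file now propagates to the caller instead of triggering a second `pc.load` attempt. A sketch of what calling code would observe; the file name is hypothetical:

```python
import pandas as pd

try:
    df = pd.read_pickle("maybe_written_by_py2.pkl")
except UnicodeDecodeError:
    # Before this change, read_pickle retried internally with
    # pc.load(..., encoding="latin-1") (see GH#28645, GH#31988); now the
    # error surfaces here, and the file needs a one-time migration under
    # an older pandas release.
    raise
```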
75 changes: 29 additions & 46 deletions pandas/io/pytables.py
@@ -132,13 +132,6 @@
_default_encoding = "UTF-8"


-def _ensure_decoded(s):
-    """if we have bytes, decode them to unicode"""
-    if isinstance(s, np.bytes_):
-        s = s.decode("UTF-8")
-    return s
-
-
def _ensure_encoding(encoding: str | None) -> str:
    # set the encoding if we need
    if encoding is None:
@@ -1730,8 +1723,8 @@ def _create_storer(
        if value is not None and not isinstance(value, (Series, DataFrame)):
            raise TypeError("value must be None, Series, or DataFrame")

-        pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
-        tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
+        pt = getattr(group._v_attrs, "pandas_type", None)
+        tt = getattr(group._v_attrs, "table_type", None)

# infer the pt from the passed value
if pt is None:
@@ -1798,7 +1791,7 @@ def _create_storer(
            "worm": WORMTable,
        }
        try:
-            cls = _TABLE_MAP[tt]
+            cls = _TABLE_MAP[tt]  # type: ignore[index]
except KeyError as err:
raise TypeError(
f"cannot properly create the storer for: [_TABLE_MAP] [group->"
@@ -2145,13 +2138,13 @@ def convert(
            # preventing the original recarray from being freed
            values = values[self.cname].copy()

-        val_kind = _ensure_decoded(self.kind)
+        val_kind = self.kind
        values = _maybe_convert(values, val_kind, encoding, errors)
        kwargs = {}
-        kwargs["name"] = _ensure_decoded(self.index_name)
+        kwargs["name"] = self.index_name

        if self.freq is not None:
-            kwargs["freq"] = _ensure_decoded(self.freq)
+            kwargs["freq"] = self.freq

factory: type[Index | DatetimeIndex] = Index
if lib.is_np_dtype(values.dtype, "M") or isinstance(
@@ -2210,7 +2203,7 @@ def maybe_set_size(self, min_itemsize=None) -> None:
        min_itemsize can be an integer or a dict with this column's name
        with an integer size
        """
-        if _ensure_decoded(self.kind) == "string":
+        if self.kind == "string":
if isinstance(min_itemsize, dict):
min_itemsize = min_itemsize.get(self.name)

@@ -2231,7 +2224,7 @@ def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
    def validate_col(self, itemsize=None):
        """validate this column: return the compared against itemsize"""
        # validate this column for string truncation (or reset to the max size)
-        if _ensure_decoded(self.kind) == "string":
+        if self.kind == "string":
c = self.col
if c is not None:
if itemsize is None:
@@ -2561,14 +2554,14 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
        assert isinstance(converted, np.ndarray)  # for mypy

        # use the meta if needed
-        meta = _ensure_decoded(self.meta)
+        meta = self.meta
        metadata = self.metadata
        ordered = self.ordered
        tz = self.tz

        assert dtype_name is not None
        # convert to the correct dtype
-        dtype = _ensure_decoded(dtype_name)
+        dtype = dtype_name

# reverse converts
if dtype.startswith("datetime64"):
@@ -2618,7 +2611,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
        converted = converted.astype("O", copy=False)

        # convert nans / decode
-        if _ensure_decoded(kind) == "string":
+        if kind == "string":
converted = _unconvert_string_array(
converted, nan_rep=nan_rep, encoding=encoding, errors=errors
)
@@ -2706,18 +2699,19 @@ def is_old_version(self) -> bool:
    @property
    def version(self) -> tuple[int, int, int]:
        """compute and set our version"""
-        version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
-        try:
-            version = tuple(int(x) for x in version.split("."))
-            if len(version) == 2:
-                version = version + (0,)
-        except AttributeError:
-            version = (0, 0, 0)
-        return version
+        version = getattr(self.group._v_attrs, "pandas_version", None)
+        if isinstance(version, str):
+            version_tup = tuple(int(x) for x in version.split("."))
+            if len(version_tup) == 2:
+                version_tup = version_tup + (0,)
+            assert len(version_tup) == 3  # needed for mypy
+            return version_tup
+        else:
+            return (0, 0, 0)

@property
def pandas_type(self):
-        return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
+        return getattr(self.group._v_attrs, "pandas_type", None)

def __repr__(self) -> str:
"""return a pretty representation of myself"""
@@ -2854,9 +2848,7 @@ def _alias_to_class(self, alias):
        return self._reverse_index_map.get(alias, Index)

    def _get_index_factory(self, attrs):
-        index_class = self._alias_to_class(
-            _ensure_decoded(getattr(attrs, "index_class", ""))
-        )
+        index_class = self._alias_to_class(getattr(attrs, "index_class", ""))

factory: Callable

@@ -2892,12 +2884,7 @@ def f(values, freq=None, tz=None):
            factory = TimedeltaIndex

        if "tz" in attrs:
-            if isinstance(attrs["tz"], bytes):
-                # created by python2
-                kwargs["tz"] = attrs["tz"].decode("utf-8")
-            else:
-                # created by python3
-                kwargs["tz"] = attrs["tz"]
+            kwargs["tz"] = attrs["tz"]
assert index_class is DatetimeIndex # just checking

return factory, kwargs
@@ -2929,9 +2916,9 @@ def set_attrs(self) -> None:
    def get_attrs(self) -> None:
        """retrieve our attributes"""
        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
-        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
+        self.errors = getattr(self.attrs, "errors", "strict")
        for n in self.attributes:
-            setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
+            setattr(self, n, getattr(self.attrs, n, None))

def write(self, obj, **kwargs) -> None:
self.set_attrs()
@@ -2948,7 +2935,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
        if isinstance(node, tables.VLArray):
            ret = node[0][start:stop]
        else:
-            dtype = _ensure_decoded(getattr(attrs, "value_type", None))
+            dtype = getattr(attrs, "value_type", None)
shape = getattr(attrs, "shape", None)

if shape is not None:
@@ -2973,7 +2960,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
    def read_index(
        self, key: str, start: int | None = None, stop: int | None = None
    ) -> Index:
-        variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
+        variety = getattr(self.attrs, f"{key}_variety")

if variety == "multi":
return self.read_multi_index(key, start=start, stop=stop)
@@ -3063,12 +3050,11 @@ def read_index_node(
        # have written a sentinel. Here we replace it with the original.
        if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
            data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
-        kind = _ensure_decoded(node._v_attrs.kind)
+        kind = node._v_attrs.kind
        name = None

        if "name" in node._v_attrs:
            name = _ensure_str(node._v_attrs.name)
-            name = _ensure_decoded(name)

attrs = node._v_attrs
factory, kwargs = self._get_index_factory(attrs)
@@ -3584,7 +3570,7 @@ def get_attrs(self) -> None:
        self.info = getattr(self.attrs, "info", None) or {}
        self.nan_rep = getattr(self.attrs, "nan_rep", None)
        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
-        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
+        self.errors = getattr(self.attrs, "errors", "strict")
self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
self.index_axes = [a for a in self.indexables if a.is_an_indexable]
self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
@@ -4926,7 +4912,6 @@ def _set_tz(
        name = None
        values = values.ravel()

-        tz = _ensure_decoded(tz)
values = DatetimeIndex(values, name=name)
values = values.tz_localize("UTC").tz_convert(tz)
elif coerce:
@@ -5228,8 +5213,6 @@ def _dtype_to_kind(dtype_str: str) -> str:
    """
    Find the "kind" string describing the given dtype name.
    """
-    dtype_str = _ensure_decoded(dtype_str)
-
if dtype_str.startswith(("string", "bytes")):
kind = "string"
elif dtype_str.startswith("float"):
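The `version` property above illustrates the invariant behind all of these edits: attributes read back from files written under Python 3 are already `str`, so the `np.bytes_` decode step is dead code. A standalone sketch of the retained parsing logic, with a hypothetical helper name:

```python
from __future__ import annotations


def parse_pandas_version(version: str | None) -> tuple[int, int, int]:
    # Mirrors the rewritten property: "1.4" -> (1, 4, 0); a missing or
    # non-string attribute maps to the sentinel (0, 0, 0).
    if isinstance(version, str):
        version_tup = tuple(int(x) for x in version.split("."))
        if len(version_tup) == 2:
            version_tup = version_tup + (0,)
        assert len(version_tup) == 3
        return version_tup
    return (0, 0, 0)


assert parse_pandas_version("0.20.1") == (0, 20, 1)
assert parse_pandas_version("1.4") == (1, 4, 0)
assert parse_pandas_version(None) == (0, 0, 0)
```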
Binary file removed pandas/tests/io/data/legacy_hdf/gh26443.h5
Binary file removed pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5
Binary file removed pandas/tests/io/data/legacy_hdf/legacy_table_fixed_py2.h5
Binary file removed pandas/tests/io/data/legacy_hdf/legacy_table_py2.h5
Binary file removed pandas/tests/io/data/legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5
Binary file removed pandas/tests/io/data/pickle/test_mi_py27.pkl
Binary file removed pandas/tests/io/data/pickle/test_py27.pkl
73 changes: 0 additions & 73 deletions pandas/tests/io/pytables/test_read.py
@@ -5,7 +5,6 @@
import numpy as np
import pytest

-from pandas._libs.tslibs import Timestamp
from pandas.compat import is_platform_windows

import pandas as pd
@@ -171,50 +170,6 @@ def test_pytables_native2_read(datapath):
assert isinstance(d1, DataFrame)

-
-def test_legacy_table_fixed_format_read_py2(datapath):
-    # GH 24510
-    # legacy table with fixed format written in Python 2
-    with ensure_clean_store(
-        datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r"
-    ) as store:
-        result = store.select("df")
-        expected = DataFrame(
-            [[1, 2, 3, "D"]],
-            columns=["A", "B", "C", "D"],
-            index=Index(["ABC"], name="INDEX_NAME"),
-        )
-        tm.assert_frame_equal(expected, result)
-
-
-def test_legacy_table_fixed_format_read_datetime_py2(datapath):
-    # GH 31750
-    # legacy table with fixed format and datetime64 column written in Python 2
-    expected = DataFrame(
-        [[Timestamp("2020-02-06T18:00")]],
-        columns=["A"],
-        index=Index(["date"]),
-        dtype="M8[ns]",
-    )
-    with ensure_clean_store(
-        datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"),
-        mode="r",
-    ) as store:
-        result = store.select("df")
-        tm.assert_frame_equal(expected, result)
-
-
-def test_legacy_table_read_py2(datapath):
-    # issue: 24925
-    # legacy table written in Python 2
-    with ensure_clean_store(
-        datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r"
-    ) as store:
-        result = store.select("table")
-
-        expected = DataFrame({"a": ["a", "b"], "b": [2, 3]})
-        tm.assert_frame_equal(expected, result)
-

def test_read_hdf_open_store(tmp_path, setup_path):
# GH10330
    # No check for non-string path_or_buf, and no test of open store
@@ -348,34 +303,6 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path):
tm.assert_series_equal(result, series)

-
-@pytest.mark.filterwarnings(r"ignore:Period with BDay freq is deprecated:FutureWarning")
-@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
-def test_read_py2_hdf_file_in_py3(datapath):
-    # GH 16781
-
-    # tests reading a PeriodIndex DataFrame written in Python2 in Python3
-
-    # the file was generated in Python 2.7 like so:
-    #
-    # df = DataFrame([1.,2,3], index=pd.PeriodIndex(
-    #     ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
-    # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p')
-
-    expected = DataFrame(
-        [1.0, 2, 3],
-        index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"),
-    )
-
-    with ensure_clean_store(
-        datapath(
-            "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5"
-        ),
-        mode="r",
-    ) as store:
-        result = store["p"]
-        tm.assert_frame_equal(result, expected)
-

def test_read_infer_string(tmp_path, setup_path):
# GH#54431
pytest.importorskip("pyarrow")
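The deleted tests depended on fixture files that can no longer be regenerated; the coverage that remains is round-trip based. A minimal sketch of an equivalent modern test, written in the style of the surrounding module but with assumed details:

```python
import pytest

import pandas as pd
import pandas._testing as tm


def test_hdf_table_roundtrip(tmp_path):
    # Write and read under the same (Python 3) interpreter, replacing the
    # removed fixture-file based legacy checks.
    pytest.importorskip("tables")
    path = tmp_path / "roundtrip.h5"
    expected = pd.DataFrame({"a": ["a", "b"], "b": [2, 3]})
    expected.to_hdf(path, key="table", format="table")
    result = pd.read_hdf(path, "table")
    tm.assert_frame_equal(result, expected)
```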