CLN: Python 2 pickle/hdf support #57387

Merged
merged 5 commits on Feb 14, 2024
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -88,6 +88,7 @@ Other API changes
^^^^^^^^^^^^^^^^^
- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
- :attr:`MultiIndex.codes`, :attr:`MultiIndex.levels`, and :attr:`MultiIndex.names` now return a ``tuple`` instead of a ``FrozenList`` (:issue:`53531`)
+- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
-

.. ---------------------------------------------------------------------------
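For users still holding Python 2 era files, a one-time migration remains possible on an environment that predates this change. A minimal sketch, assuming pandas < 3.0 is installed; the file names are illustrative:

```python
# Run once under pandas < 3.0, which still ships the Python 2 compat
# readers; the rewritten files then load cleanly on pandas 3.0+.
import pandas as pd

df = pd.read_pickle("written_by_py2.pkl")  # hypothetical legacy pickle
df.to_pickle("migrated.pkl")  # re-written with a current protocol

hdf = pd.read_hdf("written_by_py2.h5", "df")  # hypothetical legacy HDF5 file
hdf.to_hdf("migrated.h5", key="df", mode="w")
```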
25 changes: 10 additions & 15 deletions pandas/io/pickle.py
@@ -184,6 +184,7 @@ def read_pickle(
    3    3    8
    4    4    9
    """
+    # TypeError for Cython complaints about object.__new__ vs Tick.__new__
    excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
    with get_handle(
        filepath_or_buffer,
@@ -194,20 +195,14 @@
    ) as handles:
        # 1) try standard library Pickle
        # 2) try pickle_compat (older pandas version) to handle subclass changes
-        # 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError

        try:
-            # TypeError for Cython complaints about object.__new__ vs Tick.__new__
-            try:
-                with warnings.catch_warnings(record=True):
-                    # We want to silence any warnings about, e.g. moved modules.
-                    warnings.simplefilter("ignore", Warning)
-                    return pickle.load(handles.handle)
-            except excs_to_catch:
-                # e.g.
-                #  "No module named 'pandas.core.sparse.series'"
-                #  "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
-                return pc.load(handles.handle, encoding=None)
-        except UnicodeDecodeError:
-            # e.g. can occur for files written in py27; see GH#28645 and GH#31988
-            return pc.load(handles.handle, encoding="latin-1")
+            with warnings.catch_warnings(record=True):
+                # We want to silence any warnings about, e.g. moved modules.
+                warnings.simplefilter("ignore", Warning)
+                return pickle.load(handles.handle)
+        except excs_to_catch:
+            # e.g.
+            #  "No module named 'pandas.core.sparse.series'"
+            #  "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
+            return pc.load(handles.handle, encoding=None)
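With the latin-1 retry removed, a `UnicodeDecodeError` raised while unpickling a Python 2 file now propagates to the caller instead of triggering a second `pc.load` attempt. A sketch of what calling code would observe; the file name is hypothetical:

```python
import pandas as pd

try:
    df = pd.read_pickle("maybe_written_by_py2.pkl")
except UnicodeDecodeError:
    # Before this change, read_pickle retried internally with
    # pc.load(..., encoding="latin-1") (see GH#28645, GH#31988); now the
    # error surfaces here, and the file needs a one-time migration under
    # an older pandas release.
    raise
```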
75 changes: 29 additions & 46 deletions pandas/io/pytables.py
@@ -132,13 +132,6 @@
_default_encoding = "UTF-8"


-def _ensure_decoded(s):
-    """if we have bytes, decode them to unicode"""
-    if isinstance(s, np.bytes_):
-        s = s.decode("UTF-8")
-    return s
-
-
def _ensure_encoding(encoding: str | None) -> str:
    # set the encoding if we need
    if encoding is None:
@@ -1730,8 +1723,8 @@ def _create_storer(
        if value is not None and not isinstance(value, (Series, DataFrame)):
            raise TypeError("value must be None, Series, or DataFrame")

-        pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
-        tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
+        pt = getattr(group._v_attrs, "pandas_type", None)
+        tt = getattr(group._v_attrs, "table_type", None)

# infer the pt from the passed value
if pt is None:
@@ -1798,7 +1791,7 @@ def _create_storer(
            "worm": WORMTable,
        }
        try:
-            cls = _TABLE_MAP[tt]
+            cls = _TABLE_MAP[tt]  # type: ignore[index]
except KeyError as err:
raise TypeError(
f"cannot properly create the storer for: [_TABLE_MAP] [group->"
@@ -2145,13 +2138,13 @@ def convert(
            # preventing the original recarray from being freed
            values = values[self.cname].copy()

-        val_kind = _ensure_decoded(self.kind)
+        val_kind = self.kind
        values = _maybe_convert(values, val_kind, encoding, errors)
        kwargs = {}
-        kwargs["name"] = _ensure_decoded(self.index_name)
+        kwargs["name"] = self.index_name

        if self.freq is not None:
-            kwargs["freq"] = _ensure_decoded(self.freq)
+            kwargs["freq"] = self.freq

factory: type[Index | DatetimeIndex] = Index
if lib.is_np_dtype(values.dtype, "M") or isinstance(
@@ -2210,7 +2203,7 @@ def maybe_set_size(self, min_itemsize=None) -> None:
        min_itemsize can be an integer or a dict with this column's name
        with an integer size
        """
-        if _ensure_decoded(self.kind) == "string":
+        if self.kind == "string":
if isinstance(min_itemsize, dict):
min_itemsize = min_itemsize.get(self.name)

@@ -2231,7 +2224,7 @@ def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
    def validate_col(self, itemsize=None):
        """validate this column: return the compared against itemsize"""
        # validate this column for string truncation (or reset to the max size)
-        if _ensure_decoded(self.kind) == "string":
+        if self.kind == "string":
c = self.col
if c is not None:
if itemsize is None:
@@ -2561,14 +2554,14 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
        assert isinstance(converted, np.ndarray)  # for mypy

        # use the meta if needed
-        meta = _ensure_decoded(self.meta)
+        meta = self.meta
        metadata = self.metadata
        ordered = self.ordered
        tz = self.tz

        assert dtype_name is not None
        # convert to the correct dtype
-        dtype = _ensure_decoded(dtype_name)
+        dtype = dtype_name

# reverse converts
if dtype.startswith("datetime64"):
@@ -2618,7 +2611,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
        converted = converted.astype("O", copy=False)

        # convert nans / decode
-        if _ensure_decoded(kind) == "string":
+        if kind == "string":
converted = _unconvert_string_array(
converted, nan_rep=nan_rep, encoding=encoding, errors=errors
)
@@ -2706,18 +2699,19 @@ def is_old_version(self) -> bool:
    @property
    def version(self) -> tuple[int, int, int]:
        """compute and set our version"""
-        version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
-        try:
-            version = tuple(int(x) for x in version.split("."))
-            if len(version) == 2:
-                version = version + (0,)
-        except AttributeError:
-            version = (0, 0, 0)
-        return version
+        version = getattr(self.group._v_attrs, "pandas_version", None)
+        if isinstance(version, str):
+            version_tup = tuple(int(x) for x in version.split("."))
+            if len(version_tup) == 2:
+                version_tup = version_tup + (0,)
+            assert len(version_tup) == 3  # needed for mypy
+            return version_tup
+        else:
+            return (0, 0, 0)

@property
def pandas_type(self):
-        return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
+        return getattr(self.group._v_attrs, "pandas_type", None)

def __repr__(self) -> str:
"""return a pretty representation of myself"""
@@ -2854,9 +2848,7 @@ def _alias_to_class(self, alias):
        return self._reverse_index_map.get(alias, Index)

    def _get_index_factory(self, attrs):
-        index_class = self._alias_to_class(
-            _ensure_decoded(getattr(attrs, "index_class", ""))
-        )
+        index_class = self._alias_to_class(getattr(attrs, "index_class", ""))

factory: Callable

@@ -2892,12 +2884,7 @@ def f(values, freq=None, tz=None):
            factory = TimedeltaIndex

        if "tz" in attrs:
-            if isinstance(attrs["tz"], bytes):
-                # created by python2
-                kwargs["tz"] = attrs["tz"].decode("utf-8")
-            else:
-                # created by python3
-                kwargs["tz"] = attrs["tz"]
+            kwargs["tz"] = attrs["tz"]
assert index_class is DatetimeIndex # just checking

return factory, kwargs
@@ -2929,9 +2916,9 @@ def set_attrs(self) -> None:
    def get_attrs(self) -> None:
        """retrieve our attributes"""
        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
-        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
+        self.errors = getattr(self.attrs, "errors", "strict")
        for n in self.attributes:
-            setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
+            setattr(self, n, getattr(self.attrs, n, None))

def write(self, obj, **kwargs) -> None:
self.set_attrs()
@@ -2948,7 +2935,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
        if isinstance(node, tables.VLArray):
            ret = node[0][start:stop]
        else:
-            dtype = _ensure_decoded(getattr(attrs, "value_type", None))
+            dtype = getattr(attrs, "value_type", None)
shape = getattr(attrs, "shape", None)

if shape is not None:
@@ -2973,7 +2960,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
    def read_index(
        self, key: str, start: int | None = None, stop: int | None = None
    ) -> Index:
-        variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
+        variety = getattr(self.attrs, f"{key}_variety")

if variety == "multi":
return self.read_multi_index(key, start=start, stop=stop)
@@ -3063,12 +3050,11 @@ def read_index_node(
        # have written a sentinel. Here we replace it with the original.
        if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
            data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
-        kind = _ensure_decoded(node._v_attrs.kind)
+        kind = node._v_attrs.kind
        name = None

        if "name" in node._v_attrs:
            name = _ensure_str(node._v_attrs.name)
-            name = _ensure_decoded(name)

attrs = node._v_attrs
factory, kwargs = self._get_index_factory(attrs)
@@ -3584,7 +3570,7 @@ def get_attrs(self) -> None:
        self.info = getattr(self.attrs, "info", None) or {}
        self.nan_rep = getattr(self.attrs, "nan_rep", None)
        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
-        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
+        self.errors = getattr(self.attrs, "errors", "strict")
self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
self.index_axes = [a for a in self.indexables if a.is_an_indexable]
self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
@@ -4926,7 +4912,6 @@ def _set_tz(
        name = None
        values = values.ravel()

-        tz = _ensure_decoded(tz)
values = DatetimeIndex(values, name=name)
values = values.tz_localize("UTC").tz_convert(tz)
elif coerce:
@@ -5228,8 +5213,6 @@ def _dtype_to_kind(dtype_str: str) -> str:
    """
    Find the "kind" string describing the given dtype name.
    """
-    dtype_str = _ensure_decoded(dtype_str)
-
if dtype_str.startswith(("string", "bytes")):
kind = "string"
elif dtype_str.startswith("float"):
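The `version` property above illustrates the invariant behind all of these edits: attributes read back from files written under Python 3 are already `str`, so the `np.bytes_` decode step is dead code. A standalone sketch of the retained parsing logic, with a hypothetical helper name:

```python
from __future__ import annotations


def parse_pandas_version(version: str | None) -> tuple[int, int, int]:
    # Mirrors the rewritten property: "1.4" -> (1, 4, 0); a missing or
    # non-string attribute maps to the sentinel (0, 0, 0).
    if isinstance(version, str):
        version_tup = tuple(int(x) for x in version.split("."))
        if len(version_tup) == 2:
            version_tup = version_tup + (0,)
        assert len(version_tup) == 3
        return version_tup
    return (0, 0, 0)


assert parse_pandas_version("0.20.1") == (0, 20, 1)
assert parse_pandas_version("1.4") == (1, 4, 0)
assert parse_pandas_version(None) == (0, 0, 0)
```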
Binary file removed pandas/tests/io/data/legacy_hdf/gh26443.h5
Binary file removed pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5
Binary file removed pandas/tests/io/data/legacy_hdf/legacy_table_fixed_py2.h5
Binary file removed pandas/tests/io/data/legacy_hdf/legacy_table_py2.h5
Binary file removed pandas/tests/io/data/legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5
Binary file removed pandas/tests/io/data/pickle/test_mi_py27.pkl
Binary file removed pandas/tests/io/data/pickle/test_py27.pkl
73 changes: 0 additions & 73 deletions pandas/tests/io/pytables/test_read.py
@@ -5,7 +5,6 @@
import numpy as np
import pytest

-from pandas._libs.tslibs import Timestamp
from pandas.compat import is_platform_windows

import pandas as pd
@@ -171,50 +170,6 @@ def test_pytables_native2_read(datapath):
assert isinstance(d1, DataFrame)

-
-def test_legacy_table_fixed_format_read_py2(datapath):
-    # GH 24510
-    # legacy table with fixed format written in Python 2
-    with ensure_clean_store(
-        datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r"
-    ) as store:
-        result = store.select("df")
-        expected = DataFrame(
-            [[1, 2, 3, "D"]],
-            columns=["A", "B", "C", "D"],
-            index=Index(["ABC"], name="INDEX_NAME"),
-        )
-        tm.assert_frame_equal(expected, result)
-
-
-def test_legacy_table_fixed_format_read_datetime_py2(datapath):
-    # GH 31750
-    # legacy table with fixed format and datetime64 column written in Python 2
-    expected = DataFrame(
-        [[Timestamp("2020-02-06T18:00")]],
-        columns=["A"],
-        index=Index(["date"]),
-        dtype="M8[ns]",
-    )
-    with ensure_clean_store(
-        datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"),
-        mode="r",
-    ) as store:
-        result = store.select("df")
-        tm.assert_frame_equal(expected, result)
-
-
-def test_legacy_table_read_py2(datapath):
-    # issue: 24925
-    # legacy table written in Python 2
-    with ensure_clean_store(
-        datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r"
-    ) as store:
-        result = store.select("table")
-
-        expected = DataFrame({"a": ["a", "b"], "b": [2, 3]})
-        tm.assert_frame_equal(expected, result)
-

def test_read_hdf_open_store(tmp_path, setup_path):
# GH10330
    # No check for non-string path_or_buf, and no test of open store
@@ -348,34 +303,6 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path):
tm.assert_series_equal(result, series)

-
-@pytest.mark.filterwarnings(r"ignore:Period with BDay freq is deprecated:FutureWarning")
-@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
-def test_read_py2_hdf_file_in_py3(datapath):
-    # GH 16781
-
-    # tests reading a PeriodIndex DataFrame written in Python2 in Python3
-
-    # the file was generated in Python 2.7 like so:
-    #
-    # df = DataFrame([1.,2,3], index=pd.PeriodIndex(
-    #     ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
-    # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p')
-
-    expected = DataFrame(
-        [1.0, 2, 3],
-        index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"),
-    )
-
-    with ensure_clean_store(
-        datapath(
-            "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5"
-        ),
-        mode="r",
-    ) as store:
-        result = store["p"]
-        tm.assert_frame_equal(result, expected)
-

def test_read_infer_string(tmp_path, setup_path):
# GH#54431
pytest.importorskip("pyarrow")
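The deleted tests depended on fixture files that can no longer be regenerated; the coverage that remains is round-trip based. A minimal sketch of an equivalent modern test, written in the style of the surrounding module but with assumed details:

```python
import pytest

import pandas as pd
import pandas._testing as tm


def test_hdf_table_roundtrip(tmp_path):
    # Write and read under the same (Python 3) interpreter, replacing the
    # removed fixture-file based legacy checks.
    pytest.importorskip("tables")
    path = tmp_path / "roundtrip.h5"
    expected = pd.DataFrame({"a": ["a", "b"], "b": [2, 3]})
    expected.to_hdf(path, key="table", format="table")
    result = pd.read_hdf(path, "table")
    tm.assert_frame_equal(result, expected)
```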