diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 0758d58ed3912..82b4ab6af0818 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -88,6 +88,7 @@ Other API changes
 ^^^^^^^^^^^^^^^^^
 - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
 - :attr:`MultiIndex.codes`, :attr:`MultiIndex.levels`, and :attr:`MultiIndex.names` now returns a ``tuple`` instead of a ``FrozenList`` (:issue:`53531`)
+- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index d3e93ebeb8fbb..c565544075ecf 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -184,6 +184,7 @@ def read_pickle(
     3    3    8
     4    4    9
     """
+    # TypeError for Cython complaints about object.__new__ vs Tick.__new__
     excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
     with get_handle(
         filepath_or_buffer,
@@ -194,20 +195,14 @@ def read_pickle(
     ) as handles:
         # 1) try standard library Pickle
         # 2) try pickle_compat (older pandas version) to handle subclass changes
-        # 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError
         try:
-            # TypeError for Cython complaints about object.__new__ vs Tick.__new__
-            try:
-                with warnings.catch_warnings(record=True):
-                    # We want to silence any warnings about, e.g. moved modules.
-                    warnings.simplefilter("ignore", Warning)
-                    return pickle.load(handles.handle)
-            except excs_to_catch:
-                # e.g.
-                #  "No module named 'pandas.core.sparse.series'"
-                #  "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
-                handles.handle.seek(0)
-                return pc.load(handles.handle, encoding=None)
-        except UnicodeDecodeError:
-            # e.g. can occur for files written in py27; see GH#28645 and GH#31988
-            handles.handle.seek(0)
-            return pc.load(handles.handle, encoding="latin-1")
+            with warnings.catch_warnings(record=True):
+                # We want to silence any warnings about, e.g. moved modules.
+                warnings.simplefilter("ignore", Warning)
+                return pickle.load(handles.handle)
+        except excs_to_catch:
+            # e.g.
+            #  "No module named 'pandas.core.sparse.series'"
+            #  "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
+            handles.handle.seek(0)
+            return pc.load(handles.handle, encoding=None)
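
For review context, the retained fallback in read_pickle reduces to the shape below. This is a minimal sketch, assuming the module's existing `pc` alias for `pandas.compat.pickle_compat` and its `load` helper; the Python 2 specific latin-1 retry is exactly the branch the diff above removes.

```python
import pickle
import warnings

from pandas.compat import pickle_compat as pc


def load_with_fallback(handle):
    """Sketch of the two-step loading strategy kept by this PR."""
    excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
    try:
        # 1) the standard library loader handles any modern pickle
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("ignore", Warning)
            return pickle.load(handle)
    except excs_to_catch:
        # 2) pickle_compat maps renamed/moved pandas internals from older
        #    pandas versions; the former step 3 (a latin-1 retry for
        #    Python 2 pickles) is gone
        handle.seek(0)
        return pc.load(handle, encoding=None)
```
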
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -300,13 +300,6 @@
 _default_encoding = "UTF-8"
 
 
-def _ensure_decoded(s):
-    """if we have bytes, decode them to unicode"""
-    if isinstance(s, np.bytes_):
-        s = s.decode("UTF-8")
-    return s
-
-
 def _ensure_encoding(encoding: str | None) -> str:
     # set the encoding if we need
     if encoding is None:
@@ -1730,8 +1723,8 @@ def _create_storer(
         if value is not None and not isinstance(value, (Series, DataFrame)):
             raise TypeError("value must be None, Series, or DataFrame")
 
-        pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
-        tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
+        pt = getattr(group._v_attrs, "pandas_type", None)
+        tt = getattr(group._v_attrs, "table_type", None)
 
         # infer the pt from the passed value
         if pt is None:
@@ -1798,7 +1791,7 @@ def _create_storer(
             "worm": WORMTable,
         }
         try:
-            cls = _TABLE_MAP[tt]
+            cls = _TABLE_MAP[tt]  # type: ignore[index]
         except KeyError as err:
             raise TypeError(
                 f"cannot properly create the storer for: [_TABLE_MAP] [group->"
@@ -2145,13 +2138,13 @@ def convert(
             # preventing the original recarry from being free'ed
             values = values[self.cname].copy()
 
-        val_kind = _ensure_decoded(self.kind)
+        val_kind = self.kind
         values = _maybe_convert(values, val_kind, encoding, errors)
         kwargs = {}
-        kwargs["name"] = _ensure_decoded(self.index_name)
+        kwargs["name"] = self.index_name
 
         if self.freq is not None:
-            kwargs["freq"] = _ensure_decoded(self.freq)
+            kwargs["freq"] = self.freq
 
         factory: type[Index | DatetimeIndex] = Index
         if lib.is_np_dtype(values.dtype, "M") or isinstance(
@@ -2210,7 +2203,7 @@ def maybe_set_size(self, min_itemsize=None) -> None:
         min_itemsize can be an integer or a dict with this columns name
         with an integer size
         """
-        if _ensure_decoded(self.kind) == "string":
+        if self.kind == "string":
             if isinstance(min_itemsize, dict):
                 min_itemsize = min_itemsize.get(self.name)
 
@@ -2231,7 +2224,7 @@ def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
     def validate_col(self, itemsize=None):
         """validate this column: return the compared against itemsize"""
         # validate this column for string truncation (or reset to the max size)
-        if _ensure_decoded(self.kind) == "string":
+        if self.kind == "string":
             c = self.col
             if c is not None:
                 if itemsize is None:
@@ -2561,14 +2554,14 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
         assert isinstance(converted, np.ndarray)  # for mypy
 
         # use the meta if needed
-        meta = _ensure_decoded(self.meta)
+        meta = self.meta
         metadata = self.metadata
         ordered = self.ordered
         tz = self.tz
 
         assert dtype_name is not None
         # convert to the correct dtype
-        dtype = _ensure_decoded(dtype_name)
+        dtype = dtype_name
 
         # reverse converts
         if dtype.startswith("datetime64"):
@@ -2618,7 +2611,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
             converted = converted.astype("O", copy=False)
 
         # convert nans / decode
-        if _ensure_decoded(kind) == "string":
+        if kind == "string":
             converted = _unconvert_string_array(
                 converted, nan_rep=nan_rep, encoding=encoding, errors=errors
             )
@@ -2706,18 +2699,19 @@ def is_old_version(self) -> bool:
     @property
     def version(self) -> tuple[int, int, int]:
         """compute and set our version"""
-        version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
-        try:
-            version = tuple(int(x) for x in version.split("."))
-            if len(version) == 2:
-                version = version + (0,)
-        except AttributeError:
-            version = (0, 0, 0)
-        return version
+        version = getattr(self.group._v_attrs, "pandas_version", None)
+        if isinstance(version, str):
+            version_tup = tuple(int(x) for x in version.split("."))
+            if len(version_tup) == 2:
+                version_tup = version_tup + (0,)
+            assert len(version_tup) == 3  # needed for mypy
+            return version_tup
+        else:
+            return (0, 0, 0)
 
     @property
     def pandas_type(self):
-        return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
+        return getattr(self.group._v_attrs, "pandas_type", None)
 
     def __repr__(self) -> str:
         """return a pretty representation of myself"""
@@ -2854,9 +2848,7 @@ def _alias_to_class(self, alias):
         return self._reverse_index_map.get(alias, Index)
 
     def _get_index_factory(self, attrs):
-        index_class = self._alias_to_class(
-            _ensure_decoded(getattr(attrs, "index_class", ""))
-        )
+        index_class = self._alias_to_class(getattr(attrs, "index_class", ""))
 
         factory: Callable
 
@@ -2892,12 +2884,7 @@ def f(values, freq=None, tz=None):
             factory = TimedeltaIndex
 
         if "tz" in attrs:
-            if isinstance(attrs["tz"], bytes):
-                # created by python2
-                kwargs["tz"] = attrs["tz"].decode("utf-8")
-            else:
-                # created by python3
-                kwargs["tz"] = attrs["tz"]
+            kwargs["tz"] = attrs["tz"]
             assert index_class is DatetimeIndex  # just checking
 
         return factory, kwargs
@@ -2929,9 +2916,9 @@ def set_attrs(self) -> None:
     def get_attrs(self) -> None:
         """retrieve our attributes"""
         self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
-        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
+        self.errors = getattr(self.attrs, "errors", "strict")
         for n in self.attributes:
-            setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
+            setattr(self, n, getattr(self.attrs, n, None))
 
     def write(self, obj, **kwargs) -> None:
         self.set_attrs()
@@ -2948,7 +2935,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
         if isinstance(node, tables.VLArray):
             ret = node[0][start:stop]
         else:
-            dtype = _ensure_decoded(getattr(attrs, "value_type", None))
+            dtype = getattr(attrs, "value_type", None)
             shape = getattr(attrs, "shape", None)
 
             if shape is not None:
@@ -2973,7 +2960,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
     def read_index(
         self, key: str, start: int | None = None, stop: int | None = None
     ) -> Index:
-        variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
+        variety = getattr(self.attrs, f"{key}_variety")
 
         if variety == "multi":
             return self.read_multi_index(key, start=start, stop=stop)
@@ -3063,12 +3050,11 @@ def read_index_node(
         # have written a sentinel. Here we replace it with the original.
         if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
             data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
-        kind = _ensure_decoded(node._v_attrs.kind)
+        kind = node._v_attrs.kind
         name = None
 
         if "name" in node._v_attrs:
             name = _ensure_str(node._v_attrs.name)
-            name = _ensure_decoded(name)
 
         attrs = node._v_attrs
         factory, kwargs = self._get_index_factory(attrs)
@@ -3584,7 +3570,7 @@ def get_attrs(self) -> None:
         self.info = getattr(self.attrs, "info", None) or {}
         self.nan_rep = getattr(self.attrs, "nan_rep", None)
         self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
-        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
+        self.errors = getattr(self.attrs, "errors", "strict")
         self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
         self.index_axes = [a for a in self.indexables if a.is_an_indexable]
         self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
@@ -4926,7 +4912,6 @@ def _set_tz(
             name = None
             values = values.ravel()
 
-        tz = _ensure_decoded(tz)
         values = DatetimeIndex(values, name=name)
         values = values.tz_localize("UTC").tz_convert(tz)
     elif coerce:
@@ -5228,8 +5213,6 @@ def _dtype_to_kind(dtype_str: str) -> str:
     """
    Find the "kind" string describing the given dtype name.
     """
-    dtype_str = _ensure_decoded(dtype_str)
-
    if dtype_str.startswith(("string", "bytes")):
         kind = "string"
     elif dtype_str.startswith("float"):
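
The one behavioral rewrite in pytables.py worth calling out is the `version` property: the old code decoded bytes first and leaned on `except AttributeError` for missing attributes, while the new code branches explicitly on `isinstance(version, str)`, so anything non-str (a missing attribute, or bytes written by a Python 2 era pandas, which are no longer decoded) falls through to `(0, 0, 0)`. A standalone sketch of the new semantics, with a hypothetical helper name:

```python
def parse_pandas_version(version) -> tuple[int, int, int]:
    # Mirrors the rewritten ``version`` property: a str attribute parses
    # to a 3-tuple, padding "X.Y" to "X.Y.0"; anything else maps to the
    # "unknown / very old" sentinel (0, 0, 0).
    if isinstance(version, str):
        version_tup = tuple(int(x) for x in version.split("."))
        if len(version_tup) == 2:
            version_tup = version_tup + (0,)
        assert len(version_tup) == 3
        return version_tup
    return (0, 0, 0)


assert parse_pandas_version("0.20.3") == (0, 20, 3)
assert parse_pandas_version("1.4") == (1, 4, 0)
assert parse_pandas_version(None) == (0, 0, 0)
assert parse_pandas_version(b"0.15.2") == (0, 0, 0)  # py2-era bytes: no longer decoded
```
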
diff --git a/pandas/tests/io/data/legacy_hdf/datetimetz_object.h5 b/pandas/tests/io/data/legacy_hdf/datetimetz_object.h5
deleted file mode 100644
index 8cb4eda470398..0000000000000
Binary files a/pandas/tests/io/data/legacy_hdf/datetimetz_object.h5 and /dev/null differ
diff --git a/pandas/tests/io/data/legacy_hdf/gh26443.h5 b/pandas/tests/io/data/legacy_hdf/gh26443.h5
deleted file mode 100644
index 45aa64324530f..0000000000000
Binary files a/pandas/tests/io/data/legacy_hdf/gh26443.h5 and /dev/null differ
diff --git a/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5
deleted file mode 100644
index 18cfae15a3a78..0000000000000
Binary files a/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5 and /dev/null differ
diff --git a/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_py2.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_py2.h5
deleted file mode 100644
index 540251d9fae86..0000000000000
Binary files a/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_py2.h5 and /dev/null differ
diff --git a/pandas/tests/io/data/legacy_hdf/legacy_table_py2.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_py2.h5
deleted file mode 100644
index 3863d714a315b..0000000000000
Binary files a/pandas/tests/io/data/legacy_hdf/legacy_table_py2.h5 and /dev/null differ
diff --git a/pandas/tests/io/data/legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5 b/pandas/tests/io/data/legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5
deleted file mode 100644
index 6fb92d3c564bd..0000000000000
Binary files a/pandas/tests/io/data/legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5 and /dev/null differ
diff --git a/pandas/tests/io/data/legacy_msgpack/0.20.3/0.20.3_x86_64_darwin_3.5.2.msgpack b/pandas/tests/io/data/legacy_msgpack/0.20.3/0.20.3_x86_64_darwin_3.5.2.msgpack
deleted file mode 100644
index 7a546a82ae766..0000000000000
Binary files a/pandas/tests/io/data/legacy_msgpack/0.20.3/0.20.3_x86_64_darwin_3.5.2.msgpack and /dev/null differ
diff --git a/pandas/tests/io/data/pickle/test_mi_py27.pkl b/pandas/tests/io/data/pickle/test_mi_py27.pkl
deleted file mode 100644
index 89021dd828108..0000000000000
Binary files a/pandas/tests/io/data/pickle/test_mi_py27.pkl and /dev/null differ
diff --git a/pandas/tests/io/data/pickle/test_py27.pkl b/pandas/tests/io/data/pickle/test_py27.pkl
deleted file mode 100644
index 5308b864bc0c7..0000000000000
Binary files a/pandas/tests/io/data/pickle/test_py27.pkl and /dev/null differ
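
With these fixtures gone, pandas 3.x has no supported path for reading Python 2 era files. A user who still holds such files can rewrite them once under an environment with pandas < 3.0, which still carries the compatibility shims; a hedged sketch, with hypothetical file names:

```python
# One-time migration script; run with pandas < 3.0 installed.
# File names below are hypothetical placeholders.
import pandas as pd

df = pd.read_pickle("frame_py27.pkl")  # legacy pickle, decoded via the old latin-1 fallback
df.to_pickle("frame_modern.pkl")       # re-written as a Python 3 pickle

hdf = pd.read_hdf("table_py2.h5", "df")   # legacy HDF with bytes attributes
hdf.to_hdf("table_modern.h5", key="df")   # re-written with str attributes
```
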
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index c8563ee4af4a8..e33ddaf3b81f0 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -5,7 +5,6 @@
 import numpy as np
 import pytest
 
-from pandas._libs.tslibs import Timestamp
 from pandas.compat import is_platform_windows
 
 import pandas as pd
@@ -171,50 +170,6 @@ def test_pytables_native2_read(datapath):
     assert isinstance(d1, DataFrame)
 
 
-def test_legacy_table_fixed_format_read_py2(datapath):
-    # GH 24510
-    # legacy table with fixed format written in Python 2
-    with ensure_clean_store(
-        datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r"
-    ) as store:
-        result = store.select("df")
-        expected = DataFrame(
-            [[1, 2, 3, "D"]],
-            columns=["A", "B", "C", "D"],
-            index=Index(["ABC"], name="INDEX_NAME"),
-        )
-        tm.assert_frame_equal(expected, result)
-
-
-def test_legacy_table_fixed_format_read_datetime_py2(datapath):
-    # GH 31750
-    # legacy table with fixed format and datetime64 column written in Python 2
-    expected = DataFrame(
-        [[Timestamp("2020-02-06T18:00")]],
-        columns=["A"],
-        index=Index(["date"]),
-        dtype="M8[ns]",
-    )
-    with ensure_clean_store(
-        datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"),
-        mode="r",
-    ) as store:
-        result = store.select("df")
-        tm.assert_frame_equal(expected, result)
-
-
-def test_legacy_table_read_py2(datapath):
-    # issue: 24925
-    # legacy table written in Python 2
-    with ensure_clean_store(
-        datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r"
-    ) as store:
-        result = store.select("table")
-
-    expected = DataFrame({"a": ["a", "b"], "b": [2, 3]})
-    tm.assert_frame_equal(expected, result)
-
-
 def test_read_hdf_open_store(tmp_path, setup_path):
     # GH10330
     # No check for non-string path_or-buf, and no test of open store
@@ -348,34 +303,6 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path):
     tm.assert_series_equal(result, series)
 
 
-@pytest.mark.filterwarnings(r"ignore:Period with BDay freq is deprecated:FutureWarning")
-@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
-def test_read_py2_hdf_file_in_py3(datapath):
-    # GH 16781
-
-    # tests reading a PeriodIndex DataFrame written in Python2 in Python3
-
-    # the file was generated in Python 2.7 like so:
-    #
-    # df = DataFrame([1.,2,3], index=pd.PeriodIndex(
-    #     ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
-    # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p')
-
-    expected = DataFrame(
-        [1.0, 2, 3],
-        index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"),
-    )
-
-    with ensure_clean_store(
-        datapath(
-            "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5"
-        ),
-        mode="r",
-    ) as store:
-        result = store["p"]
-        tm.assert_frame_equal(result, expected)
-
-
 def test_read_infer_string(tmp_path, setup_path):
     # GH#54431
     pytest.importorskip("pyarrow")
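
The deleted tests all read pre-built Python 2 fixtures; the equivalent coverage under current writers is a plain round-trip. A hypothetical sketch in the style of the remaining tests in this module (the test name is an assumption; imports are spelled out for self-containment):

```python
import pandas as pd
import pandas._testing as tm
from pandas import DataFrame, Index


def test_table_roundtrip(tmp_path, setup_path):
    # Round-trip under the current pandas instead of reading a
    # pre-built Python 2 fixture.
    df = DataFrame(
        [[1, 2, 3, "D"]],
        columns=["A", "B", "C", "D"],
        index=Index(["ABC"], name="INDEX_NAME"),
    )
    path = tmp_path / setup_path
    df.to_hdf(path, key="df", format="table")
    result = pd.read_hdf(path, "df")
    tm.assert_frame_equal(result, df)
```
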
"2015-01-02", "2015-01-05"], freq="B"), - ) - - with ensure_clean_store( - datapath( - "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" - ), - mode="r", - ) as store: - result = store["p"] - tm.assert_frame_equal(result, expected) - - def test_read_infer_string(tmp_path, setup_path): # GH#54431 pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 8c61830ebe038..b455235669636 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -312,23 +312,6 @@ def test_store_timezone(setup_path): tm.assert_frame_equal(result, df) -def test_legacy_datetimetz_object(datapath): - # legacy from < 0.17.0 - # 8260 - expected = DataFrame( - { - "A": Timestamp("20130102", tz="US/Eastern").as_unit("ns"), - "B": Timestamp("20130603", tz="CET").as_unit("ns"), - }, - index=range(5), - ) - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" - ) as store: - result = store["df"] - tm.assert_frame_equal(result, expected) - - def test_dst_transitions(setup_path): # make sure we are not failing on transitions with ensure_clean_store(setup_path) as store: @@ -362,17 +345,3 @@ def test_read_with_where_tz_aware_index(tmp_path, setup_path): store.append(key, expected, format="table", append=True) result = pd.read_hdf(path, key, where="DATE > 20151130") tm.assert_frame_equal(result, expected) - - -def test_py2_created_with_datetimez(datapath): - # The test HDF5 file was created in Python 2, but could not be read in - # Python 3. - # - # GH26443 - index = DatetimeIndex(["2019-01-01T18:00"], dtype="M8[ns, America/New_York]") - expected = DataFrame({"data": 123}, index=index) - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" - ) as store: - result = store["key"] - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b43f430b8895b..886bff332a420 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -289,7 +289,7 @@ def test_read_expands_user_home_dir( ( pd.read_hdf, "tables", - ("io", "data", "legacy_hdf", "datetimetz_object.h5"), + ("io", "data", "legacy_hdf", "pytables_native2.h5"), ), (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")), (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 38a0888909663..ed8d4371e0f3a 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -399,31 +399,6 @@ def test_read(self, protocol, get_random_path): tm.assert_frame_equal(df, df2) -@pytest.mark.parametrize( - ["pickle_file", "excols"], - [ - ("test_py27.pkl", Index(["a", "b", "c"])), - ( - "test_mi_py27.pkl", - pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]), - ), - ], -) -def test_unicode_decode_error(datapath, pickle_file, excols): - # pickle file written with py27, should be readable without raising - # UnicodeDecodeError, see GH#28645 and GH#31988 - path = datapath("io", "data", "pickle", pickle_file) - df = pd.read_pickle(path) - - # just test the columns are correct since the values are random - tm.assert_index_equal(df.columns, excols) - - -# --------------------- -# tests for buffer I/O -# --------------------- - - def test_pickle_buffer_roundtrip(): with tm.ensure_clean() as path: df = DataFrame(