From e675f82476763108a57a81347036051095814a93 Mon Sep 17 00:00:00 2001 From: Pedro Reys Date: Mon, 17 Feb 2020 09:39:00 -0300 Subject: [PATCH 1/4] BUG: read_pickle fallback to `latin_1` upon a UnicodeDecodeError When reading a pickle with MultiIndex columns generated in py27, `pickle_compat.load()` with `encoding=None` would throw a UnicodeDecodeError. Now, `read_pickle` catches that exception and falls back to using `latin-1` explicitly. --- doc/source/whatsnew/v1.0.2.rst | 1 + pandas/io/pickle.py | 6 +++++- pandas/tests/io/data/pickle/test_mi_py27.pkl | Bin 0 -> 1395 bytes pandas/tests/io/test_pickle.py | 12 ++++++++++++ 4 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/io/data/pickle/test_mi_py27.pkl diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index c9031ac1ae9fe..dcdffcfb48ee7 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -38,6 +38,7 @@ Bug fixes - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`) - Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`). +- Fixed bug where :meth:`pandas.io.pickleread_pickle` raised a ``UnicodeDecodeError` when reading a py27 pickle with MultiIndex column (:issue:`31988`). diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index e51f24b551f31..86069d49d488d 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -183,7 +183,11 @@ def read_pickle( # e.g. # "No module named 'pandas.core.sparse.series'" # "Can't get attribute '__nat_unpickle' on PwD7TRsEv2h?Ktee@rxtj}jd-wJa zQspX|v;t~k1mZV8_@U9n1S9GfW459J8?Eu1HYO^XXdpEYX6 ziP3#UVrvyxC$Z{9@oMLn|V&Pu?$ zDXz0}rkyhr)C9?M*7-jiO=6^|I9^*H@MqGxslCR_HOb-z6JL`Q1PB3c=Yd~EeyMwgCEb}A=6M=wq{M$qYbwD z7k-*N?>3ybM|Pk1#a(;!LgKA6pSj!S-yR$2o^T%{Soc!b<JE34+1007t#D! 
zQ*UtYXSapDa_Z&QbMDUB*V)O>$rEL==} z>FkD;ofl}YP-_GLagA#d_WaAY6JINZ-D Ni#f!xY|YW4{{SjH`hoxe literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 78b630bb5ada1..7e9d76f712075 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -501,3 +501,15 @@ def test_read_pickle_with_subclass(): tm.assert_series_equal(result[0], expected[0]) assert isinstance(result[1], MyTz) + + +def test_read_py27_pickle_with_MultiIndex_column(datapath): + # pickle file with MultiIndex column written with py27 + # should be readable without raising UnicodeDecodeError + # see GH#31988 + path = datapath("io", "data", "pickle", "test_mi_py27.pkl") + df = pd.read_pickle(path) + + # just test the columns are correct since the values are random + expected = pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]) + tm.assert_index_equal(df.columns, expected) From bcf41c9f2441a3c3f9f12fb963bbb38ed5deff01 Mon Sep 17 00:00:00 2001 From: Pedro Reys Date: Tue, 18 Feb 2020 11:42:10 -0300 Subject: [PATCH 2/4] Cleanup exception handling Cleanup the code so that it only has a single catch for UnicodeDecodeError --- pandas/io/pickle.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 86069d49d488d..4e731b8ecca11 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -171,25 +171,22 @@ def read_pickle( # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes - - excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError) + # 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError try: - with warnings.catch_warnings(record=True): - # We want to silence any warnings about, e.g. moved modules. - warnings.simplefilter("ignore", Warning) - return pickle.load(f) - except excs_to_catch: - # e.g. 
- # "No module named 'pandas.core.sparse.series'" - # "Can't get attribute '__nat_unpickle' on Date: Wed, 19 Feb 2020 18:06:26 -0300 Subject: [PATCH 3/4] parametrize test --- pandas/tests/io/test_pickle.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 7e9d76f712075..584a545769c4c 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -382,14 +382,23 @@ def test_read(self, protocol, get_random_path): tm.assert_frame_equal(df, df2) -def test_unicode_decode_error(datapath): +@pytest.mark.parametrize( + ["pickle_file", "excols"], + [ + ("test_py27.pkl", pd.Index(["a", "b", "c"])), + ( + "test_mi_py27.pkl", + pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]), + ), + ], +) +def test_unicode_decode_error(datapath, pickle_file, excols): # pickle file written with py27, should be readable without raising - # UnicodeDecodeError, see GH#28645 - path = datapath("io", "data", "pickle", "test_py27.pkl") + # UnicodeDecodeError, see GH#28645 and GH#31988 + path = datapath("io", "data", "pickle", pickle_file) df = pd.read_pickle(path) # just test the columns are correct since the values are random - excols = pd.Index(["a", "b", "c"]) tm.assert_index_equal(df.columns, excols) @@ -501,15 +510,3 @@ def test_read_pickle_with_subclass(): tm.assert_series_equal(result[0], expected[0]) assert isinstance(result[1], MyTz) - - -def test_read_py27_pickle_with_MultiIndex_column(datapath): - # pickle file with MultiIndex column written with py27 - # should be readable without raising UnicodeDecodeError - # see GH#31988 - path = datapath("io", "data", "pickle", "test_mi_py27.pkl") - df = pd.read_pickle(path) - - # just test the columns are correct since the values are random - expected = pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]) - tm.assert_index_equal(df.columns, expected) From 
08c007da68e3ac75ea713997a15131aedf053f49 Mon Sep 17 00:00:00 2001 From: Pedro Reys Date: Thu, 20 Feb 2020 12:26:26 -0300 Subject: [PATCH 4/4] Update doc/source/whatsnew/v1.0.2.rst Co-Authored-By: Simon Hawkins --- doc/source/whatsnew/v1.0.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index dcdffcfb48ee7..57ed6adf667c8 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) +- Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`). - Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`) - @@ -38,7 +39,6 @@ Bug fixes - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`) - Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`). -- Fixed bug where :meth:`pandas.io.pickleread_pickle` raised a ``UnicodeDecodeError` when reading a py27 pickle with MultiIndex column (:issue:`31988`).