From b29fb0f7852e60d579133e5e944c4a6f766f84f2 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 1 May 2024 17:41:40 +0200 Subject: [PATCH 01/22] ArrowDtype type are taken into account in a column assignment --- pandas/core/frame.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3d2a6093464a9..3a945ab38f7be 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -39,6 +39,7 @@ import numpy as np from numpy import ma +import pyarrow as pa from pandas._config import get_option @@ -5024,7 +5025,12 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - arr = sanitize_array(value, self.index, copy=True, allow_2d=True) + + if isinstance(value.type, pa.DataType): + dtype = ArrowDtype(value.type) + else: + dtype = None + arr = sanitize_array(value, self.index, dtype, copy=True, allow_2d=True) if ( isinstance(value, Index) and value.dtype == "object" From bfbd7dc52be955245919d55a7419b65d2d54dfef Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 1 May 2024 17:43:44 +0200 Subject: [PATCH 02/22] Add test for pyarrow assignment in column test_assign_pyarrow_columns --- pandas/tests/frame/test_alter_axes.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index c68171ab254c7..8fcaa9a9aab26 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1,8 +1,13 @@ from datetime import datetime +import pyarrow as pa import pytz -from pandas import DataFrame +from pandas import ( + ArrowDtype, + DataFrame, + Series, +) import pandas._testing as tm @@ -28,3 +33,11 @@ def test_assign_columns(self, float_frame): df.columns = ["foo", "bar", "baz", "quux", 
"foo2"] tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False) tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False) + + def test_assign_pyarrow_columns(self): + df = DataFrame({"A": [1]}, dtype=ArrowDtype(pa.uint64())) + df["B"] = pa.array([1], type=pa.uint64()) + result = df.dtypes + expected = Series(Series({"A": "uint64[pyarrow]", "B": "uint64[pyarrow]"})) + + tm.assert_series_equal(result, expected) From 04b1edcc529883bea8c455a5328d708274cbbb0e Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Wed, 1 May 2024 21:39:26 +0200 Subject: [PATCH 03/22] force dtype cast in _sanitize_column only for pa.lib.Array --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3a945ab38f7be..16e67ad736769 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5026,7 +5026,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - if isinstance(value.type, pa.DataType): + if isinstance(value, pa.lib.Array): dtype = ArrowDtype(value.type) else: dtype = None From 5d2016d2030a48f8ae9b897f780372d9c45deb2e Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 2 May 2024 21:16:04 +0200 Subject: [PATCH 04/22] move test_assign_column_in_dataframe from test_alter_axes.py to test_arrow.py --- pandas/tests/extension/test_arrow.py | 8 ++++++++ pandas/tests/frame/test_alter_axes.py | 15 +-------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d31fe6085c3a..89c37013b1c88 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1125,6 +1125,14 @@ def test_comp_masked_numpy(self, masked_dtype, comparison_op): expected = pd.Series(exp, 
dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) + def test_assign_column_in_dataframe(self, data): + df = pd.DataFrame({"A": [1]}, dtype=ArrowDtype(pa.uint64())) + df["B"] = pa.array([1], type=pa.uint64()) + result = df.dtypes + expected = pd.Series({"A": "uint64[pyarrow]", "B": "uint64[pyarrow]"}) + + tm.assert_series_equal(result, expected) + class TestLogicalOps: """Various Series and DataFrame logical ops methods.""" diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 8fcaa9a9aab26..c68171ab254c7 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1,13 +1,8 @@ from datetime import datetime -import pyarrow as pa import pytz -from pandas import ( - ArrowDtype, - DataFrame, - Series, -) +from pandas import DataFrame import pandas._testing as tm @@ -33,11 +28,3 @@ def test_assign_columns(self, float_frame): df.columns = ["foo", "bar", "baz", "quux", "foo2"] tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False) tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False) - - def test_assign_pyarrow_columns(self): - df = DataFrame({"A": [1]}, dtype=ArrowDtype(pa.uint64())) - df["B"] = pa.array([1], type=pa.uint64()) - result = df.dtypes - expected = Series(Series({"A": "uint64[pyarrow]", "B": "uint64[pyarrow]"})) - - tm.assert_series_equal(result, expected) From 86efdfd4664cc7961cc1f67bbac4be6d3ea54b89 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 6 May 2024 22:57:11 +0200 Subject: [PATCH 05/22] test_assign_column_in_dataframe is configurable by the data fixture --- pandas/tests/extension/test_arrow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 89c37013b1c88..9a1a942cbd5c5 100644 --- a/pandas/tests/extension/test_arrow.py +++ 
b/pandas/tests/extension/test_arrow.py @@ -1126,10 +1126,10 @@ def test_comp_masked_numpy(self, masked_dtype, comparison_op): tm.assert_series_equal(result, expected) def test_assign_column_in_dataframe(self, data): - df = pd.DataFrame({"A": [1]}, dtype=ArrowDtype(pa.uint64())) - df["B"] = pa.array([1], type=pa.uint64()) + df = pd.DataFrame(data=data, columns=["A"], dtype=data.dtype) + df["B"] = pa.array(data, type=data.dtype.pyarrow_dtype) result = df.dtypes - expected = pd.Series({"A": "uint64[pyarrow]", "B": "uint64[pyarrow]"}) + expected = pd.Series({"A": data.dtype, "B": data.dtype}) tm.assert_series_equal(result, expected) From e7e3a8bb4909830162be28160c65bbac167e0cfd Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 6 May 2024 23:48:33 +0200 Subject: [PATCH 06/22] manage optional pyarrow import --- pandas/core/frame.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4c016230da506..2dadec37f6538 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -39,7 +39,6 @@ import numpy as np from numpy import ma -import pyarrow as pa from pandas._config import get_option @@ -50,10 +49,17 @@ ) from pandas._libs.hashtable import duplicated from pandas._libs.lib import is_range_indexer -from pandas.compat import PYPY +from pandas.compat import ( + PYPY, + pa_version_under10p1, +) from pandas.compat._constants import REF_COUNT from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv + +if not pa_version_under10p1: + import pyarrow as pa + from pandas.errors import ( ChainedAssignmentError, InvalidIndexError, @@ -3012,6 +3018,7 @@ def to_orc( index: bool | None = None, engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: + self.pyarrow = """pyarrow""" """ Write a DataFrame to the Optimized Row Columnar (ORC) format. @@ -3065,7 +3072,7 @@ def to_orc( `here `__. 
* Before using this function you should read the :ref:`user guide about ORC ` and :ref:`install optional dependencies `. - * This function requires `pyarrow `_ + * This function requires `%s `_ library. * For supported dtypes please refer to `supported ORC features in Arrow `__. @@ -3088,7 +3095,7 @@ def to_orc( >>> b.seek(0) # doctest: +SKIP 0 >>> content = b.read() # doctest: +SKIP - """ + """ % self.pyarrow from pandas.io.orc import to_orc return to_orc( @@ -5061,7 +5068,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - if isinstance(value, pa.lib.Array): + if not pa_version_under10p1 and isinstance(value, pa.lib.Array): dtype = ArrowDtype(value.type) else: dtype = None From da3f135c7b9e75b8eef60978b5abba0a916a99ea Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 7 May 2024 14:26:23 +0200 Subject: [PATCH 07/22] Integrate docstring correction --- pandas/core/frame.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2dadec37f6538..131557c1fb7de 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3018,7 +3018,6 @@ def to_orc( index: bool | None = None, engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: - self.pyarrow = """pyarrow""" """ Write a DataFrame to the Optimized Row Columnar (ORC) format. @@ -3072,7 +3071,7 @@ def to_orc( `here `__. * Before using this function you should read the :ref:`user guide about ORC ` and :ref:`install optional dependencies `. - * This function requires `%s `_ + * This function requires `pyarrow `_ library. * For supported dtypes please refer to `supported ORC features in Arrow `__. 
@@ -3095,7 +3094,7 @@ def to_orc( >>> b.seek(0) # doctest: +SKIP 0 >>> content = b.read() # doctest: +SKIP - """ % self.pyarrow + """ from pandas.io.orc import to_orc return to_orc( From 9c53b89aa0efc4b507aa7ce2f3e4817a909791e6 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 7 May 2024 14:29:39 +0200 Subject: [PATCH 08/22] add an xfail to test_assign_column_in_dataframe to manage version without pyarrow --- pandas/tests/extension/test_arrow.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9a1a942cbd5c5..cdf20ad06e8ba 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -39,6 +39,7 @@ PY312, is_ci_environment, is_platform_windows, + pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, @@ -1125,6 +1126,10 @@ def test_comp_masked_numpy(self, masked_dtype, comparison_op): expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + pa_version_under10p1, + reason="Assignment of pyarrow arrays yield unexpected dtypes", + ) def test_assign_column_in_dataframe(self, data): df = pd.DataFrame(data=data, columns=["A"], dtype=data.dtype) df["B"] = pa.array(data, type=data.dtype.pyarrow_dtype) From f28b026ebe75a0f52d611e876b0ca42bf666cf6d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Tue, 7 May 2024 15:11:36 +0200 Subject: [PATCH 09/22] correct pyarrow version check --- pandas/core/frame.py | 6 +++--- pandas/tests/extension/test_arrow.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 131557c1fb7de..7f04f8e5ec8a6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -51,13 +51,13 @@ from pandas._libs.lib import is_range_indexer from pandas.compat import ( PYPY, - 
pa_version_under10p1, + pa_version_under11p0, ) from pandas.compat._constants import REF_COUNT from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv -if not pa_version_under10p1: +if not pa_version_under11p0: import pyarrow as pa from pandas.errors import ( @@ -5067,7 +5067,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - if not pa_version_under10p1 and isinstance(value, pa.lib.Array): + if not pa_version_under11p0 and isinstance(value, pa.lib.Array): dtype = ArrowDtype(value.type) else: dtype = None diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index cdf20ad06e8ba..d963bc06d619c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -39,7 +39,6 @@ PY312, is_ci_environment, is_platform_windows, - pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, @@ -1127,7 +1126,7 @@ def test_comp_masked_numpy(self, masked_dtype, comparison_op): tm.assert_series_equal(result, expected) @pytest.mark.xfail( - pa_version_under10p1, + pa_version_under11p0, reason="Assignment of pyarrow arrays yield unexpected dtypes", ) def test_assign_column_in_dataframe(self, data): From 703a0c647b298a5a242b8cfbc2ffec90de1a4c10 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 17 May 2024 21:36:40 +0200 Subject: [PATCH 10/22] Correct wrong pyarrow check --- pandas/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7f04f8e5ec8a6..bbbb78b892d7c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -51,13 +51,13 @@ from pandas._libs.lib import is_range_indexer from pandas.compat import ( PYPY, - pa_version_under11p0, + pa_version_under10p1, ) from pandas.compat._constants import 
REF_COUNT from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv -if not pa_version_under11p0: +if not pa_version_under10p1: import pyarrow as pa from pandas.errors import ( @@ -5067,7 +5067,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - if not pa_version_under11p0 and isinstance(value, pa.lib.Array): + if not pa_version_under10p1 and isinstance(value, pa.lib._PandasConvertible): dtype = ArrowDtype(value.type) else: dtype = None From 0f924e1eb850bc297c5520db9961d42a5934a637 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 17 May 2024 21:37:01 +0200 Subject: [PATCH 11/22] Remove unnecessary xfail --- pandas/tests/extension/test_arrow.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d963bc06d619c..9a1a942cbd5c5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1125,10 +1125,6 @@ def test_comp_masked_numpy(self, masked_dtype, comparison_op): expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - pa_version_under11p0, - reason="Assignment of pyarrow arrays yield unexpected dtypes", - ) def test_assign_column_in_dataframe(self, data): df = pd.DataFrame(data=data, columns=["A"], dtype=data.dtype) df["B"] = pa.array(data, type=data.dtype.pyarrow_dtype) From 5aeac95fdd6a64381685e27e1af5aa8a7bc98459 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 21 Jun 2024 00:02:32 +0200 Subject: [PATCH 12/22] Move pyarrow check in sanitize_array --- pandas/core/construction.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 
360e1d5ddd3ff..d2cc6eca5b984 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -23,6 +23,7 @@ get_supported_dtype, is_supported_dtype, ) +from pandas.compat import pa_version_under10p1 from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( @@ -40,7 +41,10 @@ is_object_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import NumpyEADtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + NumpyEADtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -51,6 +55,9 @@ import pandas.core.common as com +if not pa_version_under10p1: + import pyarrow as pa + if TYPE_CHECKING: from collections.abc import Sequence @@ -554,6 +561,9 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype + elif not pa_version_under10p1 and isinstance(data, (pa.ChunkedArray, pa.Array)): + dtype = ArrowDtype(data.type) + infer_object = not isinstance(data, (ABCIndex, ABCSeries)) # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray From 2f07cc08a51a0865c75204f3ed70a8ace3beca1a Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 21 Jun 2024 00:03:15 +0200 Subject: [PATCH 13/22] Move pyarrow check in sanitize_array --- pandas/core/frame.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 55e3a5872f019..5efba860577bc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -48,17 +48,10 @@ ) from pandas._libs.hashtable import duplicated from pandas._libs.lib import is_range_indexer -from pandas.compat import ( - PYPY, - pa_version_under10p1, -) +from pandas.compat import PYPY from pandas.compat._constants import REF_COUNT from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv - -if not pa_version_under10p1: - import pyarrow as pa - from 
pandas.errors import ( ChainedAssignmentError, InvalidIndexError, @@ -5090,11 +5083,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - if not pa_version_under10p1 and isinstance(value, pa.lib._PandasConvertible): - dtype = ArrowDtype(value.type) - else: - dtype = None - return sanitize_array(value, self.index, dtype, copy=True, allow_2d=True), None + return sanitize_array(value, self.index, copy=True, allow_2d=True), None @property def _series(self): From a94023fd5b17c124e5b0e578e4b4cea6884c09b7 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 21 Jun 2024 00:52:25 +0200 Subject: [PATCH 14/22] Code clean up --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5efba860577bc..0aeda77233125 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5082,7 +5082,6 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - return sanitize_array(value, self.index, copy=True, allow_2d=True), None @property From e69265b207ef5093d00d49a2cceb77fb1df8703d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 21 Jun 2024 21:46:02 +0200 Subject: [PATCH 15/22] Check if dtype has been initialized before --- pandas/core/construction.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index d2cc6eca5b984..f26aab077c32a 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -561,7 +561,11 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype - elif not pa_version_under10p1 and isinstance(data, (pa.ChunkedArray, pa.Array)): + elif ( + not pa_version_under10p1 + and 
isinstance(data, (pa.ChunkedArray, pa.Array)) + and dtype is None + ): dtype = ArrowDtype(data.type) infer_object = not isinstance(data, (ABCIndex, ABCSeries)) From 12727f022d59a5c0ec29f55c8fbefae774237729 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Fri, 21 Jun 2024 23:02:42 +0200 Subject: [PATCH 16/22] Try to process arrow dtype before any dtype modification --- pandas/core/construction.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f26aab077c32a..2fb9c76693c13 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -554,6 +554,13 @@ def sanitize_array( np.ndarray or ExtensionArray """ original_dtype = dtype + if ( + not pa_version_under10p1 + and isinstance(data, (pa.ChunkedArray, pa.Array)) + and dtype is None + ): + dtype = ArrowDtype(data.type) + if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) @@ -561,13 +568,6 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype - elif ( - not pa_version_under10p1 - and isinstance(data, (pa.ChunkedArray, pa.Array)) - and dtype is None - ): - dtype = ArrowDtype(data.type) - infer_object = not isinstance(data, (ABCIndex, ABCSeries)) # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray From bd38f150067b1dc6b0cd643d5dd72a4ed819eb58 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 27 Jun 2024 22:29:57 +0200 Subject: [PATCH 17/22] Add entry in last new in section Conversion --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3d869bf31f372..62491a0897679 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -518,6 +518,7 @@ Conversion - Bug in :meth:`DataFrame.update` bool dtype being 
converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) +- Bug in :meth:`sanitize_array` was not taking into account pyarrow arrays. (:issue:`56994`) Strings ^^^^^^^ From 96871c11d1ee11652fec224db9681ca4cae3d1f5 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Thu, 27 Jun 2024 22:35:59 +0200 Subject: [PATCH 18/22] Replace pyarrow type check by existing lib function --- pandas/core/construction.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 2fb9c76693c13..0192f27b47c7e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -56,7 +56,7 @@ import pandas.core.common as com if not pa_version_under10p1: - import pyarrow as pa + pass if TYPE_CHECKING: from collections.abc import Sequence @@ -554,11 +554,7 @@ def sanitize_array( np.ndarray or ExtensionArray """ original_dtype = dtype - if ( - not pa_version_under10p1 - and isinstance(data, (pa.ChunkedArray, pa.Array)) - and dtype is None - ): + if not pa_version_under10p1 and lib.is_pyarrow_array(data) and dtype is None: dtype = ArrowDtype(data.type) if isinstance(data, ma.MaskedArray): From 9cc063e865e3ad828125b7eb90e28c1daf9b03b9 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 3 Nov 2024 19:55:59 +0000 Subject: [PATCH 19/22] Update description of issue 56994 --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 20e2eec2eccbc..63ca1b2228f48 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -644,11 +644,11 @@ Numeric Conversion 
^^^^^^^^^^ +- Assigning a PyArrow array in a ``pd.DataFrame`` column now produces a ``pd.Series`` with a ``pd.ArrowDtype`` (:issue:`56994`) - Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`) - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) -- Bug in :meth:`sanitize_array` was not taking into account pyarrow arrays. (:issue:`56994`) Strings ^^^^^^^ From bf98984bbdbd12dc30087d111e2a0357529f1900 Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 3 Nov 2024 20:00:15 +0000 Subject: [PATCH 20/22] Code clean up --- pandas/core/construction.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 83afda354b4e3..7e3499cc30b8c 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -55,9 +55,6 @@ import pandas.core.common as com -if not pa_version_under10p1: - pass - if TYPE_CHECKING: from collections.abc import Sequence From 64a366d0a25a109663d9887e7202dbc7bc41442d Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Sun, 3 Nov 2024 20:08:49 +0000 Subject: [PATCH 21/22] Test assignment from a pyarrow array and a series --- pandas/tests/extension/test_arrow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index bf4e235f894ee..badf2d1feec26 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1071,8 +1071,9 @@ def test_comp_masked_numpy(self, masked_dtype, comparison_op): def test_assign_column_in_dataframe(self, 
data): df = pd.DataFrame(data=data, columns=["A"], dtype=data.dtype) df["B"] = pa.array(data, type=data.dtype.pyarrow_dtype) + df["C"] = pd.Series(data) result = df.dtypes - expected = pd.Series({"A": data.dtype, "B": data.dtype}) + expected = pd.Series({"A": data.dtype, "B": data.dtype, "C": data.dtype}) tm.assert_series_equal(result, expected) From 01812efdcb2be93b8e453898de1d2a20c6b60d4f Mon Sep 17 00:00:00 2001 From: droussea2001 <19688507+droussea2001@users.noreply.github.com> Date: Mon, 4 Nov 2024 21:26:19 +0000 Subject: [PATCH 22/22] Add test for series instantiation with pyarrow type --- pandas/tests/extension/test_arrow.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index badf2d1feec26..95f28ae3c43b9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1074,9 +1074,12 @@ def test_assign_column_in_dataframe(self, data): df["C"] = pd.Series(data) result = df.dtypes expected = pd.Series({"A": data.dtype, "B": data.dtype, "C": data.dtype}) - tm.assert_series_equal(result, expected) + def test_create_series_dtype(self, data): + ser = pd.Series(data._pa_array) + assert ser.dtype == data.dtype + class TestLogicalOps: """Various Series and DataFrame logical ops methods."""