Skip to content

Commit 436f5eb

Browse files
authored
ENH: add __from_arrow__ support to DatetimeTZDtype (#52201)
* ENH: add `__from_arrow__` support to `DatetimeTZDtype` * handle empty pyarrow arrays * add test with iNaT * mypy * nits * add docs for NaT handling * sort by issue number * use arrow to_numpy * don't copy * use safe cast * don't localize * don't convert to ns in DatetimeArray constructor
1 parent 9eee107 commit 436f5eb

File tree

4 files changed

+124
-1
lines changed

4 files changed

+124
-1
lines changed

Diff for: doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ Other enhancements
8585
- Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`)
8686
- Extended the escape mode "latex-math" in formatter to preserve, without escaping, all characters between "\(" and "\)" (:issue:`51903`)
8787
- Adding ``engine_kwargs`` parameter to :meth:`DataFrame.read_excel` (:issue:`52214`)
88+
- Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype`. (:issue:`52201`)
8889
- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide <extending.pandas_priority>` (:issue:`48347`)
8990
- Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`)
9091
- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)

Diff for: pandas/core/arrays/datetimes.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -2344,7 +2344,9 @@ def _validate_dt64_dtype(dtype):
23442344
# a tz-aware Timestamp (with a tz specific to its datetime) will
23452345
# be incorrect(ish?) for the array as a whole
23462346
dtype = cast(DatetimeTZDtype, dtype)
2347-
dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))
2347+
dtype = DatetimeTZDtype(
2348+
unit=dtype.unit, tz=timezones.tz_standardize(dtype.tz)
2349+
)
23482350

23492351
return dtype
23502352

Diff for: pandas/core/dtypes/dtypes.py

+34
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,40 @@ def __eq__(self, other: Any) -> bool:
817817
and tz_compare(self.tz, other.tz)
818818
)
819819

820+
def __from_arrow__(
    self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> DatetimeArray:
    """
    Build a DatetimeArray of this dtype from a pyarrow Array/ChunkedArray.

    The input is cast (with ``safe=True``) to a pyarrow timestamp type
    created from this dtype's unit only (no ``tz`` is passed to
    ``pyarrow.timestamp``), converted to NumPy, and wrapped in a
    DatetimeArray carrying this dtype.

    Note: If the units in the pyarrow Array are the same as this
    DatetimeTZDtype, then values corresponding to the integer
    representation of ``NaT`` (e.g. one nanosecond before
    :attr:`pandas.Timestamp.min`) are converted to ``NaT``, regardless of
    the null indicator in the pyarrow array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
        The Arrow array to convert to DatetimeArray.

    Returns
    -------
    extension array : DatetimeArray
    """
    import pyarrow

    from pandas.core.arrays import DatetimeArray

    target_type = pyarrow.timestamp(unit=self._unit)
    casted = array.cast(target_type, safe=True)

    # Only a ``pyarrow.Array`` is converted with ``zero_copy_only=False``;
    # any other input (i.e. a ChunkedArray) uses the ``to_numpy`` defaults.
    if not isinstance(casted, pyarrow.Array):
        values = casted.to_numpy()
    else:
        values = casted.to_numpy(zero_copy_only=False)

    return DatetimeArray(values, dtype=self, copy=False)
853+
820854
def __setstate__(self, state) -> None:
821855
# for pickle compat. __get_state__ is defined in the
822856
# PandasExtensionDtype superclass and uses the public properties to

Diff for: pandas/tests/arrays/datetimes/test_constructors.py

+86
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
from pandas._libs import iNaT
5+
46
from pandas.core.dtypes.dtypes import DatetimeTZDtype
57

68
import pandas as pd
@@ -168,3 +170,87 @@ def test_2d(self, order):
168170
res = DatetimeArray._from_sequence(arr)
169171
expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape)
170172
tm.assert_datetime_array_equal(res, expected)
173+
174+
175+
# ----------------------------------------------------------------------------
176+
# Arrow interaction
177+
178+
179+
# Integer payloads for the Arrow round-trip tests: zero, an arbitrary value,
# a null, the integer NaT sentinel, and values at the edges of the int64 range.
EXTREME_VALUES = [
    0,
    123456789,
    None,
    iNaT,
    2**63 - 1,
    -(2**63) + 1,
]

# Multiples of 10**9, so dividing by 1000 (us -> ms) or by 10**9 (ns -> s)
# leaves no remainder.
FINE_TO_COARSE_SAFE = [
    123_000_000_000,
    None,
    -123_000_000_000,
]

# Small magnitudes, so scaling up by 1000 (ms -> us) or by 10**9 (s -> ns)
# stays well inside the int64 range.
COARSE_TO_FINE_SAFE = [
    123,
    None,
    -123,
]
182+
183+
184+
@pytest.mark.parametrize(
    ("pa_unit", "pd_unit", "pa_tz", "pd_tz", "data"),
    [
        # Same unit on the pyarrow and pandas side.
        ("s", "s", "UTC", "UTC", EXTREME_VALUES),
        ("ms", "ms", "UTC", "Europe/Berlin", EXTREME_VALUES),
        ("us", "us", "US/Eastern", "UTC", EXTREME_VALUES),
        ("ns", "ns", "US/Central", "Asia/Kolkata", EXTREME_VALUES),
        # pyarrow unit finer than the pandas unit.
        ("ns", "s", "UTC", "UTC", FINE_TO_COARSE_SAFE),
        ("us", "ms", "UTC", "Europe/Berlin", FINE_TO_COARSE_SAFE),
        # pyarrow unit coarser than the pandas unit.
        ("ms", "us", "US/Eastern", "UTC", COARSE_TO_FINE_SAFE),
        ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE),
    ],
)
def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_(
    pa_unit, pd_unit, pa_tz, pd_tz, data
):
    """
    ``DatetimeTZDtype.__from_arrow__`` across unit/timezone combinations.

    The expected array is built from the same integers via NumPy: interpreted
    in the pyarrow unit, converted to the pandas unit, then wrapped in a
    DatetimeArray with the target dtype. Both a plain pyarrow array and a
    single-chunk ChunkedArray are checked.
    """
    # NOTE(review): the function name looks like an accidental paste
    # duplication ("test_from_arrowtest_from_arrow..."); left unchanged here.
    pa = pytest.importorskip("pyarrow")

    arrow_array = pa.array(data, type=pa.timestamp(pa_unit, tz=pa_tz))
    target_dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz)

    source_values = np.array(data, dtype=f"datetime64[{pa_unit}]")
    expected = DatetimeArray(
        source_values.astype(f"datetime64[{pd_unit}]"),
        dtype=target_dtype,
    )

    for arrow_input in (arrow_array, pa.chunked_array([arrow_array])):
        result = target_dtype.__from_arrow__(arrow_input)
        tm.assert_extension_array_equal(result, expected)
215+
216+
217+
@pytest.mark.parametrize(
    ("unit", "tz"),
    [
        ("s", "UTC"),
        ("ms", "Europe/Berlin"),
        ("us", "US/Eastern"),
        ("ns", "Asia/Kolkata"),
        ("ns", "UTC"),
    ],
)
def test_from_arrow_from_empty(unit, tz):
    """
    ``__from_arrow__`` on an empty pyarrow array built without an explicit type.

    The result must equal an empty DatetimeArray of the requested unit,
    localized to the requested timezone, for both Array and ChunkedArray input.
    """
    pa = pytest.importorskip("pyarrow")

    data = []
    # No ``type=`` is given, so pyarrow picks the type of the empty array itself.
    empty_arrow = pa.array(data)
    target_dtype = DatetimeTZDtype(unit=unit, tz=tz)

    naive_expected = DatetimeArray(np.array(data, dtype=f"datetime64[{unit}]"))
    expected = naive_expected.tz_localize(tz=tz)

    for arrow_input in (empty_arrow, pa.chunked_array([empty_arrow])):
        result = target_dtype.__from_arrow__(arrow_input)
        tm.assert_extension_array_equal(result, expected)
241+
242+
243+
def test_from_arrow_from_integers():
    """
    ``__from_arrow__`` on a pyarrow array created from plain Python integers.

    No pyarrow type is passed when building the input; the values (including
    a null and ``iNaT``) must come back as the matching nanosecond UTC
    DatetimeArray for both Array and ChunkedArray input.
    """
    pa = pytest.importorskip("pyarrow")

    data = [0, 123456789, None, 2**63 - 1, iNaT, -123456789]
    int_arrow = pa.array(data)
    target_dtype = DatetimeTZDtype(unit="ns", tz="UTC")

    naive_expected = DatetimeArray(np.array(data, dtype="datetime64[ns]"))
    expected = naive_expected.tz_localize("UTC")

    for arrow_input in (int_arrow, pa.chunked_array([int_arrow])):
        result = target_dtype.__from_arrow__(arrow_input)
        tm.assert_extension_array_equal(result, expected)

0 commit comments

Comments
 (0)