Skip to content

Commit c96dbb7

Browse files
authored May 16, 2023
BUG/REF: ArrowExtensionArray non-nanosecond units (#53171)
* BUG/REF: ArrowExtensionArray non-nanosecond units
* mypy
* gh refs
* fixes
* xfail min versions
* docstrings
* fix test
* fix test
* update imports
* move imports
1 parent 489f15e commit c96dbb7

File tree

3 files changed

+249
-79
lines changed

3 files changed

+249
-79
lines changed
 

‎doc/source/whatsnew/v2.1.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -438,6 +438,7 @@ Sparse
438438

439439
ExtensionArray
440440
^^^^^^^^^^^^^^
441+
- Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`)
441442
- Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`)
442443
- Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`)
443444
- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`)

‎pandas/core/arrays/arrow/array.py

+172-77
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,10 @@
1818
import numpy as np
1919

2020
from pandas._libs import lib
21+
from pandas._libs.tslibs import (
22+
Timedelta,
23+
Timestamp,
24+
)
2125
from pandas.compat import (
2226
pa_version_under7p0,
2327
pa_version_under8p0,
@@ -244,39 +248,9 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
244248
"""
245249
Construct a new ExtensionArray from a sequence of scalars.
246250
"""
247-
pa_dtype = to_pyarrow_type(dtype)
248-
if (
249-
isinstance(scalars, np.ndarray)
250-
and isinstance(dtype, ArrowDtype)
251-
and (
252-
pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype)
253-
)
254-
):
255-
# See https://github.com/apache/arrow/issues/35289
256-
scalars = scalars.tolist()
257-
258-
if isinstance(scalars, cls):
259-
scalars = scalars._pa_array
260-
elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
261-
if copy and is_array_like(scalars):
262-
# pa array should not get updated when numpy array is updated
263-
scalars = scalars.copy()
264-
try:
265-
scalars = pa.array(scalars, type=pa_dtype, from_pandas=True)
266-
except pa.ArrowInvalid:
267-
# GH50430: let pyarrow infer type, then cast
268-
scalars = pa.array(scalars, from_pandas=True)
269-
if pa_dtype and scalars.type != pa_dtype:
270-
if pa.types.is_dictionary(pa_dtype):
271-
scalars = scalars.dictionary_encode()
272-
else:
273-
scalars = scalars.cast(pa_dtype)
274-
arr = cls(scalars)
275-
if pa.types.is_duration(scalars.type) and scalars.null_count > 0:
276-
# GH52843: upstream bug for duration types when originally
277-
# constructed with data containing numpy NaT.
278-
# https://github.com/apache/arrow/issues/35088
279-
arr = arr.fillna(arr.dtype.na_value)
251+
pa_type = to_pyarrow_type(dtype)
252+
pa_array = cls._box_pa_array(scalars, pa_type=pa_type, copy=copy)
253+
arr = cls(pa_array)
280254
return arr
281255

282256
@classmethod
@@ -352,6 +326,150 @@ def _from_sequence_of_strings(
352326
)
353327
return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
354328

329+
@classmethod
def _box_pa(
    cls, value, pa_type: pa.DataType | None = None
) -> pa.Array | pa.ChunkedArray | pa.Scalar:
    """
    Box value into a pyarrow Array, ChunkedArray or Scalar.

    Parameters
    ----------
    value : any
        A ``pa.Scalar`` or any non-list-like value is boxed with
        ``_box_pa_scalar``; every other list-like value is boxed with
        ``_box_pa_array``.
    pa_type : pa.DataType | None
        Forwarded unchanged to whichever helper performs the boxing.

    Returns
    -------
    pa.Array or pa.ChunkedArray or pa.Scalar
    """
    # Test for pa.Scalar before is_list_like: nested pyarrow scalars
    # (e.g. pa.ListScalar) can be iterable, and would otherwise be routed
    # to the array path and unpacked element by element.
    if isinstance(value, pa.Scalar) or not is_list_like(value):
        return cls._box_pa_scalar(value, pa_type)
    return cls._box_pa_array(value, pa_type)
348+
349+
@classmethod
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
    """
    Convert a single value into a pyarrow Scalar.

    Parameters
    ----------
    value : any
        An existing ``pa.Scalar`` is used as-is, a missing value becomes a
        null scalar, anything else is passed to ``pa.scalar``.
    pa_type : pa.DataType | None
        Requested type. When it is None and ``value`` is a Timedelta or
        Timestamp, the type is built from the value's own unit (and tz).

    Returns
    -------
    pa.Scalar
    """
    if isinstance(value, pa.Scalar):
        boxed = value
    elif isna(value):
        boxed = pa.scalar(None, type=pa_type)
    else:
        # GH 53171: pyarrow does not yet handle pandas non-nano correctly
        # see https://github.com/apache/arrow/issues/33321
        is_timedelta = isinstance(value, Timedelta)
        if is_timedelta or isinstance(value, Timestamp):
            if pa_type is None:
                if is_timedelta:
                    pa_type = pa.duration(value.unit)
                else:
                    pa_type = pa.timestamp(value.unit, tz=value.tz)
            elif value.unit != pa_type.unit:
                value = value.as_unit(pa_type.unit)
            # hand pyarrow the raw integer count in the target unit
            value = value._value

        boxed = pa.scalar(value, type=pa_type, from_pandas=True)

    # Applies to every branch above, including a pa.Scalar passed in directly.
    if pa_type is not None and boxed.type != pa_type:
        return boxed.cast(pa_type)
    return boxed
389+
390+
@classmethod
def _box_pa_array(
    cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
    """
    Convert an array-like into a pyarrow Array or ChunkedArray.

    Parameters
    ----------
    value : Sequence
        Instances of ``cls`` and pyarrow arrays are unwrapped / used as-is;
        masked arrays go through ``__arrow_array__``; anything else is
        handed to ``pa.array``.
    pa_type : pa.DataType | None
        Requested type; the result is cast (or dictionary-encoded) when its
        type differs.
    copy : bool, default False
        Copy masked or array-like input before conversion. Not applied to
        ``cls`` instances, pyarrow arrays, or ndarrays that are converted
        to a list for large binary/string types.

    Returns
    -------
    pa.Array or pa.ChunkedArray
    """
    if isinstance(value, cls):
        boxed = value._pa_array
    elif isinstance(value, (pa.Array, pa.ChunkedArray)):
        boxed = value
    elif isinstance(value, BaseMaskedArray):
        # GH 52625
        masked = value.copy() if copy else value
        boxed = masked.__arrow_array__()
    else:
        has_type = pa_type is not None
        wants_large = has_type and (
            pa.types.is_large_binary(pa_type) or pa.types.is_large_string(pa_type)
        )
        if isinstance(value, np.ndarray) and wants_large:
            # See https://github.com/apache/arrow/issues/35289
            value = value.tolist()
        elif copy and is_array_like(value):
            # pa array should not get updated when numpy array is updated
            value = value.copy()

        if has_type and pa.types.is_duration(pa_type):
            is_td_or_int_ndarray = (
                isinstance(value, np.ndarray) and value.dtype.kind in "mi"
            )
            if not is_td_or_int_ndarray:
                # GH 53171: pyarrow does not yet handle pandas non-nano correctly
                # see https://github.com/apache/arrow/issues/33321
                from pandas.core.tools.timedeltas import to_timedelta

                as_td = to_timedelta(value, unit=pa_type.unit)
                value = as_td.as_unit(pa_type.unit).to_numpy()

        try:
            boxed = pa.array(value, type=pa_type, from_pandas=True)
        except pa.ArrowInvalid:
            # GH50430: let pyarrow infer type, then cast
            boxed = pa.array(value, from_pandas=True)

        if not has_type and pa.types.is_duration(boxed.type):
            # GH 53171: pyarrow does not yet handle pandas non-nano correctly
            # see https://github.com/apache/arrow/issues/33321
            from pandas.core.tools.timedeltas import to_timedelta

            value = to_timedelta(value).to_numpy()
            boxed = pa.array(value, type=pa_type, from_pandas=True)

        if pa.types.is_duration(boxed.type) and boxed.null_count > 0:
            # GH52843: upstream bug for duration types when originally
            # constructed with data containing numpy NaT.
            # https://github.com/apache/arrow/issues/35088
            wrapped = cls(boxed)
            boxed = wrapped.fillna(wrapped.dtype.na_value)._pa_array

    # Reconcile with the requested type; applies to every branch above.
    if pa_type is not None and boxed.type != pa_type:
        if pa.types.is_dictionary(pa_type):
            boxed = boxed.dictionary_encode()
        else:
            boxed = boxed.cast(pa_type)

    return boxed
472+
355473
def __getitem__(self, item: PositionalIndexer):
356474
"""Select a subset of self.
357475
@@ -470,65 +588,50 @@ def __setstate__(self, state) -> None:
470588

471589
def _cmp_method(self, other, op):
472590
pc_func = ARROW_CMP_FUNCS[op.__name__]
473-
if isinstance(other, ArrowExtensionArray):
474-
result = pc_func(self._pa_array, other._pa_array)
475-
elif isinstance(other, (np.ndarray, list)):
476-
result = pc_func(self._pa_array, other)
477-
elif isinstance(other, BaseMaskedArray):
478-
# GH 52625
479-
result = pc_func(self._pa_array, other.__arrow_array__())
480-
elif is_scalar(other):
481-
try:
482-
result = pc_func(self._pa_array, pa.scalar(other))
483-
except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
591+
try:
592+
result = pc_func(self._pa_array, self._box_pa(other))
593+
except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
594+
if is_scalar(other):
484595
mask = isna(self) | isna(other)
485596
valid = ~mask
486597
result = np.zeros(len(self), dtype="bool")
487598
result[valid] = op(np.array(self)[valid], other)
488599
result = pa.array(result, type=pa.bool_())
489600
result = pc.if_else(valid, result, None)
490-
else:
491-
raise NotImplementedError(
492-
f"{op.__name__} not implemented for {type(other)}"
493-
)
601+
else:
602+
raise NotImplementedError(
603+
f"{op.__name__} not implemented for {type(other)}"
604+
)
494605
return ArrowExtensionArray(result)
495606

496607
def _evaluate_op_method(self, other, op, arrow_funcs):
497608
pa_type = self._pa_array.type
609+
other = self._box_pa(other)
610+
498611
if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
499612
operator.add,
500613
roperator.radd,
501614
]:
502615
sep = pa.scalar("", type=pa_type)
503-
if isinstance(other, type(self)):
504-
other = other._pa_array
505616
if op is operator.add:
506617
result = pc.binary_join_element_wise(self._pa_array, other, sep)
507618
else:
508619
result = pc.binary_join_element_wise(other, self._pa_array, sep)
509620
return type(self)(result)
510621

622+
if (
623+
isinstance(other, pa.Scalar)
624+
and pc.is_null(other).as_py()
625+
and op.__name__ in ARROW_LOGICAL_FUNCS
626+
):
627+
# pyarrow kleene ops require null to be typed
628+
other = other.cast(pa_type)
629+
511630
pc_func = arrow_funcs[op.__name__]
512631
if pc_func is NotImplemented:
513632
raise NotImplementedError(f"{op.__name__} not implemented.")
514-
if isinstance(other, ArrowExtensionArray):
515-
result = pc_func(self._pa_array, other._pa_array)
516-
elif isinstance(other, (np.ndarray, list)):
517-
result = pc_func(self._pa_array, pa.array(other, from_pandas=True))
518-
elif isinstance(other, BaseMaskedArray):
519-
# GH 52625
520-
result = pc_func(self._pa_array, other.__arrow_array__())
521-
elif is_scalar(other):
522-
if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS:
523-
# pyarrow kleene ops require null to be typed
524-
pa_scalar = pa.scalar(None, type=self._pa_array.type)
525-
else:
526-
pa_scalar = pa.scalar(other)
527-
result = pc_func(self._pa_array, pa_scalar)
528-
else:
529-
raise NotImplementedError(
530-
f"{op.__name__} not implemented for {type(other)}"
531-
)
633+
634+
result = pc_func(self._pa_array, other)
532635
return type(self)(result)
533636

534637
def _logical_method(self, other, op):
@@ -1610,16 +1713,8 @@ def _mode(self, dropna: bool = True) -> Self:
16101713

16111714
def _maybe_convert_setitem_value(self, value):
16121715
"""Maybe convert value to be pyarrow compatible."""
1613-
if value is None:
1614-
return value
1615-
if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
1616-
return value
1617-
if is_list_like(value):
1618-
pa_box = pa.array
1619-
else:
1620-
pa_box = pa.scalar
16211716
try:
1622-
value = pa_box(value, type=self._pa_array.type, from_pandas=True)
1717+
value = self._box_pa(value, self._pa_array.type)
16231718
except pa.ArrowTypeError as err:
16241719
msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
16251720
raise TypeError(msg) from err

‎pandas/tests/extension/test_arrow.py

+76-2
Original file line number | Diff line number | Diff line change
@@ -1720,8 +1720,9 @@ def test_setitem_null_slice(data):
17201720

17211721
result = orig.copy()
17221722
result[:] = data[0]
1723-
expected = ArrowExtensionArray(
1724-
pa.array([data[0]] * len(data), type=data._pa_array.type)
1723+
expected = ArrowExtensionArray._from_sequence(
1724+
[data[0]] * len(data),
1725+
dtype=data._pa_array.type,
17251726
)
17261727
tm.assert_extension_array_equal(result, expected)
17271728

@@ -2934,3 +2935,76 @@ def test_infer_dtype_pyarrow_dtype(data, request):
29342935
request.node.add_marker(mark)
29352936

29362937
assert res == lib.infer_dtype(list(data), skipna=True)
2938+
2939+
2940+
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_from_sequence_temporal(pa_type):
    """_from_sequence keeps the integer value of a pandas temporal scalar."""
    # GH 53171
    raw = 3
    unit = pa_type.unit
    if pa.types.is_duration(pa_type):
        scalar = pd.Timedelta(raw, unit=unit)
    else:
        scalar = pd.Timestamp(raw, unit=unit, tz=pa_type.tz)
    scalar = scalar.as_unit(unit)

    result = ArrowExtensionArray._from_sequence([scalar], dtype=pa_type)
    expected = ArrowExtensionArray(pa.array([raw], type=pa_type))
    tm.assert_extension_array_equal(result, expected)
2955+
2956+
2957+
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_setitem_temporal(pa_type):
    """Assigning a pandas temporal scalar through a full slice fills every slot."""
    # GH 53171
    unit = pa_type.unit
    if pa.types.is_duration(pa_type):
        fill = pd.Timedelta(1, unit=unit)
    else:
        fill = pd.Timestamp(1, unit=unit, tz=pa_type.tz)
    fill = fill.as_unit(unit)

    def build(ints):
        # wrap raw integers as an ArrowExtensionArray of the type under test
        return ArrowExtensionArray(pa.array(ints, type=pa_type))

    result = build([1, 2, 3]).copy()
    result[:] = fill
    expected = build([1, 1, 1])
    tm.assert_extension_array_equal(result, expected)
2974+
2975+
2976+
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_arithmetic_temporal(pa_type, request):
    """Subtracting a one-unit Timedelta shifts each element down by one."""
    # GH 53171
    if pa_version_under8p0 and pa.types.is_duration(pa_type):
        request.node.add_marker(
            pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason="Function 'subtract_checked' has no kernel matching input types",
            )
        )

    unit = pa_type.unit
    one = pd.Timedelta(1, unit=unit).as_unit(unit)
    arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))

    result = arr - one
    expected = ArrowExtensionArray(pa.array([0, 1, 2], type=pa_type))
    tm.assert_extension_array_equal(result, expected)
2993+
2994+
2995+
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_comparison_temporal(pa_type):
    """Comparing against a pandas temporal scalar uses the scalar's real value."""
    # GH 53171
    unit = pa_type.unit
    if pa.types.is_duration(pa_type):
        threshold = pd.Timedelta(1, unit=unit)
    else:
        threshold = pd.Timestamp(1, unit=unit, tz=pa_type.tz)
    threshold = threshold.as_unit(unit)

    arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))

    result = arr > threshold
    expected = ArrowExtensionArray(pa.array([False, True, True], type=pa.bool_()))
    tm.assert_extension_array_equal(result, expected)

0 commit comments

Comments
 (0)
Please sign in to comment.