|
18 | 18 | import numpy as np
|
19 | 19 |
|
20 | 20 | from pandas._libs import lib
|
| 21 | +from pandas._libs.tslibs import ( |
| 22 | + Timedelta, |
| 23 | + Timestamp, |
| 24 | +) |
21 | 25 | from pandas.compat import (
|
22 | 26 | pa_version_under7p0,
|
23 | 27 | pa_version_under8p0,
|
@@ -244,39 +248,9 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
|
244 | 248 | """
|
245 | 249 | Construct a new ExtensionArray from a sequence of scalars.
|
246 | 250 | """
|
247 |
| - pa_dtype = to_pyarrow_type(dtype) |
248 |
| - if ( |
249 |
| - isinstance(scalars, np.ndarray) |
250 |
| - and isinstance(dtype, ArrowDtype) |
251 |
| - and ( |
252 |
| - pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype) |
253 |
| - ) |
254 |
| - ): |
255 |
| - # See https://github.com/apache/arrow/issues/35289 |
256 |
| - scalars = scalars.tolist() |
257 |
| - |
258 |
| - if isinstance(scalars, cls): |
259 |
| - scalars = scalars._pa_array |
260 |
| - elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): |
261 |
| - if copy and is_array_like(scalars): |
262 |
| - # pa array should not get updated when numpy array is updated |
263 |
| - scalars = scalars.copy() |
264 |
| - try: |
265 |
| - scalars = pa.array(scalars, type=pa_dtype, from_pandas=True) |
266 |
| - except pa.ArrowInvalid: |
267 |
| - # GH50430: let pyarrow infer type, then cast |
268 |
| - scalars = pa.array(scalars, from_pandas=True) |
269 |
| - if pa_dtype and scalars.type != pa_dtype: |
270 |
| - if pa.types.is_dictionary(pa_dtype): |
271 |
| - scalars = scalars.dictionary_encode() |
272 |
| - else: |
273 |
| - scalars = scalars.cast(pa_dtype) |
274 |
| - arr = cls(scalars) |
275 |
| - if pa.types.is_duration(scalars.type) and scalars.null_count > 0: |
276 |
| - # GH52843: upstream bug for duration types when originally |
277 |
| - # constructed with data containing numpy NaT. |
278 |
| - # https://github.com/apache/arrow/issues/35088 |
279 |
| - arr = arr.fillna(arr.dtype.na_value) |
| 251 | + pa_type = to_pyarrow_type(dtype) |
| 252 | + pa_array = cls._box_pa_array(scalars, pa_type=pa_type, copy=copy) |
| 253 | + arr = cls(pa_array) |
280 | 254 | return arr
|
281 | 255 |
|
282 | 256 | @classmethod
|
@@ -352,6 +326,150 @@ def _from_sequence_of_strings(
|
352 | 326 | )
|
353 | 327 | return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
|
354 | 328 |
|
| 329 | + @classmethod |
| 330 | + def _box_pa( |
| 331 | + cls, value, pa_type: pa.DataType | None = None |
| 332 | + ) -> pa.Array | pa.ChunkedArray | pa.Scalar: |
| 333 | + """ |
| 334 | + Box value into a pyarrow Array, ChunkedArray or Scalar. |
| 335 | +
|
| 336 | + Parameters |
| 337 | + ---------- |
| 338 | + value : any |
| 339 | + pa_type : pa.DataType | None |
| 340 | +
|
| 341 | + Returns |
| 342 | + ------- |
| 343 | + pa.Array or pa.ChunkedArray or pa.Scalar |
| 344 | + """ |
| 345 | + if is_list_like(value): |
| 346 | + return cls._box_pa_array(value, pa_type) |
| 347 | + return cls._box_pa_scalar(value, pa_type) |
| 348 | + |
| 349 | + @classmethod |
| 350 | + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: |
| 351 | + """ |
| 352 | + Box value into a pyarrow Scalar. |
| 353 | +
|
| 354 | + Parameters |
| 355 | + ---------- |
| 356 | + value : any |
| 357 | + pa_type : pa.DataType | None |
| 358 | +
|
| 359 | + Returns |
| 360 | + ------- |
| 361 | + pa.Scalar |
| 362 | + """ |
| 363 | + if isinstance(value, pa.Scalar): |
| 364 | + pa_scalar = value |
| 365 | + elif isna(value): |
| 366 | + pa_scalar = pa.scalar(None, type=pa_type) |
| 367 | + else: |
| 368 | + # GH 53171: pyarrow does not yet handle pandas non-nano correctly |
| 369 | + # see https://github.com/apache/arrow/issues/33321 |
| 370 | + if isinstance(value, Timedelta): |
| 371 | + if pa_type is None: |
| 372 | + pa_type = pa.duration(value.unit) |
| 373 | + elif value.unit != pa_type.unit: |
| 374 | + value = value.as_unit(pa_type.unit) |
| 375 | + value = value._value |
| 376 | + elif isinstance(value, Timestamp): |
| 377 | + if pa_type is None: |
| 378 | + pa_type = pa.timestamp(value.unit, tz=value.tz) |
| 379 | + elif value.unit != pa_type.unit: |
| 380 | + value = value.as_unit(pa_type.unit) |
| 381 | + value = value._value |
| 382 | + |
| 383 | + pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) |
| 384 | + |
| 385 | + if pa_type is not None and pa_scalar.type != pa_type: |
| 386 | + pa_scalar = pa_scalar.cast(pa_type) |
| 387 | + |
| 388 | + return pa_scalar |
| 389 | + |
    @classmethod
    def _box_pa_array(
        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
    ) -> pa.Array | pa.ChunkedArray:
        """
        Box value into a pyarrow Array or ChunkedArray.

        Parameters
        ----------
        value : Sequence
        pa_type : pa.DataType | None
            Target pyarrow type; when None, pyarrow infers the type.
        copy : bool, default False
            Copy array-like input first so the resulting pyarrow array does
            not share memory with (and get updated by) the original.

        Returns
        -------
        pa.Array or pa.ChunkedArray
        """
        if isinstance(value, cls):
            # Already one of ours: reuse the underlying pyarrow data.
            pa_array = value._pa_array
        elif isinstance(value, (pa.Array, pa.ChunkedArray)):
            pa_array = value
        elif isinstance(value, BaseMaskedArray):
            # GH 52625
            if copy:
                value = value.copy()
            pa_array = value.__arrow_array__()
        else:
            if (
                isinstance(value, np.ndarray)
                and pa_type is not None
                and (
                    pa.types.is_large_binary(pa_type)
                    or pa.types.is_large_string(pa_type)
                )
            ):
                # See https://github.com/apache/arrow/issues/35289
                # (pyarrow mishandles ndarray input for large binary/string;
                # go through a plain Python list instead)
                value = value.tolist()
            elif copy and is_array_like(value):
                # pa array should not get updated when numpy array is updated
                value = value.copy()

            if (
                pa_type is not None
                and pa.types.is_duration(pa_type)
                and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi")
            ):
                # GH 53171: pyarrow does not yet handle pandas non-nano correctly
                # see https://github.com/apache/arrow/issues/33321
                # Normalize through to_timedelta at the requested unit before
                # handing off to pyarrow.
                from pandas.core.tools.timedeltas import to_timedelta

                value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit)
                value = value.to_numpy()

            try:
                pa_array = pa.array(value, type=pa_type, from_pandas=True)
            except pa.ArrowInvalid:
                # GH50430: let pyarrow infer type, then cast
                pa_array = pa.array(value, from_pandas=True)

            if pa_type is None and pa.types.is_duration(pa_array.type):
                # GH 53171: pyarrow does not yet handle pandas non-nano correctly
                # see https://github.com/apache/arrow/issues/33321
                # Re-box via to_timedelta so non-nano inputs round-trip.
                from pandas.core.tools.timedeltas import to_timedelta

                value = to_timedelta(value)
                value = value.to_numpy()
                pa_array = pa.array(value, type=pa_type, from_pandas=True)

            if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
                # GH52843: upstream bug for duration types when originally
                # constructed with data containing numpy NaT.
                # https://github.com/apache/arrow/issues/35088
                arr = cls(pa_array)
                arr = arr.fillna(arr.dtype.na_value)
                pa_array = arr._pa_array

        # Final coercion to the requested type (dictionary types need
        # dictionary_encode rather than a plain cast).
        if pa_type is not None and pa_array.type != pa_type:
            if pa.types.is_dictionary(pa_type):
                pa_array = pa_array.dictionary_encode()
            else:
                pa_array = pa_array.cast(pa_type)

        return pa_array
355 | 473 | def __getitem__(self, item: PositionalIndexer):
|
356 | 474 | """Select a subset of self.
|
357 | 475 |
|
@@ -470,65 +588,50 @@ def __setstate__(self, state) -> None:
|
470 | 588 |
|
471 | 589 | def _cmp_method(self, other, op):
|
472 | 590 | pc_func = ARROW_CMP_FUNCS[op.__name__]
|
473 |
| - if isinstance(other, ArrowExtensionArray): |
474 |
| - result = pc_func(self._pa_array, other._pa_array) |
475 |
| - elif isinstance(other, (np.ndarray, list)): |
476 |
| - result = pc_func(self._pa_array, other) |
477 |
| - elif isinstance(other, BaseMaskedArray): |
478 |
| - # GH 52625 |
479 |
| - result = pc_func(self._pa_array, other.__arrow_array__()) |
480 |
| - elif is_scalar(other): |
481 |
| - try: |
482 |
| - result = pc_func(self._pa_array, pa.scalar(other)) |
483 |
| - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): |
| 591 | + try: |
| 592 | + result = pc_func(self._pa_array, self._box_pa(other)) |
| 593 | + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): |
| 594 | + if is_scalar(other): |
484 | 595 | mask = isna(self) | isna(other)
|
485 | 596 | valid = ~mask
|
486 | 597 | result = np.zeros(len(self), dtype="bool")
|
487 | 598 | result[valid] = op(np.array(self)[valid], other)
|
488 | 599 | result = pa.array(result, type=pa.bool_())
|
489 | 600 | result = pc.if_else(valid, result, None)
|
490 |
| - else: |
491 |
| - raise NotImplementedError( |
492 |
| - f"{op.__name__} not implemented for {type(other)}" |
493 |
| - ) |
| 601 | + else: |
| 602 | + raise NotImplementedError( |
| 603 | + f"{op.__name__} not implemented for {type(other)}" |
| 604 | + ) |
494 | 605 | return ArrowExtensionArray(result)
|
495 | 606 |
|
496 | 607 | def _evaluate_op_method(self, other, op, arrow_funcs):
|
497 | 608 | pa_type = self._pa_array.type
|
| 609 | + other = self._box_pa(other) |
| 610 | + |
498 | 611 | if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
|
499 | 612 | operator.add,
|
500 | 613 | roperator.radd,
|
501 | 614 | ]:
|
502 | 615 | sep = pa.scalar("", type=pa_type)
|
503 |
| - if isinstance(other, type(self)): |
504 |
| - other = other._pa_array |
505 | 616 | if op is operator.add:
|
506 | 617 | result = pc.binary_join_element_wise(self._pa_array, other, sep)
|
507 | 618 | else:
|
508 | 619 | result = pc.binary_join_element_wise(other, self._pa_array, sep)
|
509 | 620 | return type(self)(result)
|
510 | 621 |
|
| 622 | + if ( |
| 623 | + isinstance(other, pa.Scalar) |
| 624 | + and pc.is_null(other).as_py() |
| 625 | + and op.__name__ in ARROW_LOGICAL_FUNCS |
| 626 | + ): |
| 627 | + # pyarrow kleene ops require null to be typed |
| 628 | + other = other.cast(pa_type) |
| 629 | + |
511 | 630 | pc_func = arrow_funcs[op.__name__]
|
512 | 631 | if pc_func is NotImplemented:
|
513 | 632 | raise NotImplementedError(f"{op.__name__} not implemented.")
|
514 |
| - if isinstance(other, ArrowExtensionArray): |
515 |
| - result = pc_func(self._pa_array, other._pa_array) |
516 |
| - elif isinstance(other, (np.ndarray, list)): |
517 |
| - result = pc_func(self._pa_array, pa.array(other, from_pandas=True)) |
518 |
| - elif isinstance(other, BaseMaskedArray): |
519 |
| - # GH 52625 |
520 |
| - result = pc_func(self._pa_array, other.__arrow_array__()) |
521 |
| - elif is_scalar(other): |
522 |
| - if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS: |
523 |
| - # pyarrow kleene ops require null to be typed |
524 |
| - pa_scalar = pa.scalar(None, type=self._pa_array.type) |
525 |
| - else: |
526 |
| - pa_scalar = pa.scalar(other) |
527 |
| - result = pc_func(self._pa_array, pa_scalar) |
528 |
| - else: |
529 |
| - raise NotImplementedError( |
530 |
| - f"{op.__name__} not implemented for {type(other)}" |
531 |
| - ) |
| 633 | + |
| 634 | + result = pc_func(self._pa_array, other) |
532 | 635 | return type(self)(result)
|
533 | 636 |
|
534 | 637 | def _logical_method(self, other, op):
|
@@ -1610,16 +1713,8 @@ def _mode(self, dropna: bool = True) -> Self:
|
1610 | 1713 |
|
1611 | 1714 | def _maybe_convert_setitem_value(self, value):
|
1612 | 1715 | """Maybe convert value to be pyarrow compatible."""
|
1613 |
| - if value is None: |
1614 |
| - return value |
1615 |
| - if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): |
1616 |
| - return value |
1617 |
| - if is_list_like(value): |
1618 |
| - pa_box = pa.array |
1619 |
| - else: |
1620 |
| - pa_box = pa.scalar |
1621 | 1716 | try:
|
1622 |
| - value = pa_box(value, type=self._pa_array.type, from_pandas=True) |
| 1717 | + value = self._box_pa(value, self._pa_array.type) |
1623 | 1718 | except pa.ArrowTypeError as err:
|
1624 | 1719 | msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
|
1625 | 1720 | raise TypeError(msg) from err
|
|
0 commit comments