Skip to content

[ArrayManager] Array version of fillna logic #41104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
11 changes: 11 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1850,6 +1850,17 @@ def find_common_type(types: list[DtypeObj]) -> DtypeObj:
return np.find_common_type(types, []) # type: ignore[arg-type]


def coerce_to_target_dtype(values, other):
"""
Coerce the values to a dtype compat for other. This will always
return values, possibly object dtype, and not raise.
"""
dtype, _ = infer_dtype_from(other, pandas_dtype=True)
new_dtype = find_common_type([values.dtype, dtype])

return astype_array_safe(values, new_dtype, copy=False)


def construct_2d_arraylike_from_scalar(
value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool
) -> np.ndarray:
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
new_block,
to_native_types,
)
from pandas.core.missing import fillna_array

if TYPE_CHECKING:
from pandas import Float64Index
Expand Down Expand Up @@ -383,8 +384,8 @@ def shift(self: T, periods: int, axis: int, fill_value) -> T:
)

def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
return self.apply_with_block(
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
return self.apply(
fillna_array, value=value, limit=limit, inplace=inplace, downcast=downcast
)

def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
Expand Down
40 changes: 17 additions & 23 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1410,6 +1410,19 @@ def interpolate(
new_values = values.fillna(value=fill_value, method=method, limit=limit)
return self.make_block_same_class(new_values)

def fillna(
self, value, limit=None, inplace: bool = False, downcast=None
) -> list[Block]:

res_values = missing.fillna_ea_array(
self.values.ravel(), value, limit=limit, inplace=inplace, downcast=downcast
)
res_values = ensure_block_shape(res_values, self.ndim, self.shape)

if res_values.dtype == object:
return [self.make_block(values=res_values)]
return [self.make_block_same_class(values=res_values)]


class ExtensionBlock(libinternals.Block, EABackedBlock):
"""
Expand Down Expand Up @@ -1609,12 +1622,6 @@ def getitem_block_index(self, slicer: slice) -> ExtensionBlock:
new_values = self.values[slicer]
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)

def fillna(
self, value, limit=None, inplace: bool = False, downcast=None
) -> list[Block]:
values = self.values.fillna(value=value, limit=limit)
return [self.make_block_same_class(values=values)]

def diff(self, n: int, axis: int = 1) -> list[Block]:
if axis == 0 and n != 0:
# n==0 case will be a no-op so let is fall through
Expand Down Expand Up @@ -1785,21 +1792,6 @@ def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Blo
new_values = values.shift(periods, fill_value=fill_value, axis=axis)
return [self.make_block_same_class(new_values)]

def fillna(
self, value, limit=None, inplace: bool = False, downcast=None
) -> list[Block]:

if not self._can_hold_element(value) and self.dtype.kind != "m":
# We support filling a DatetimeTZ with a `value` whose timezone
# is different by coercing to object.
# TODO: don't special-case td64
return self.coerce_to_target_dtype(value).fillna(
value, limit, inplace, downcast
)

new_values = self.values.fillna(value=value, limit=limit)
return [self.make_block_same_class(values=new_values)]


class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
"""Block for datetime64[ns], timedelta64[ns]."""
Expand Down Expand Up @@ -2059,7 +2051,9 @@ def extend_blocks(result, blocks=None) -> list[Block]:
return blocks


def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike:
def ensure_block_shape(
values: ArrayLike, ndim: int = 1, shape: tuple = (1, -1)
) -> ArrayLike:
"""
Reshape if possible to have values.ndim == ndim.
"""
Expand All @@ -2070,7 +2064,7 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike:
# block.shape is incorrect for "2D" ExtensionArrays
# We can't, and don't need to, reshape.
values = cast("np.ndarray | DatetimeArray | TimedeltaArray", values)
values = values.reshape(1, -1)
values = values.reshape(*shape)

return values

Expand Down
92 changes: 91 additions & 1 deletion pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,13 @@
)
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import infer_dtype_from
from pandas.core.dtypes.cast import (
can_hold_element,
coerce_to_target_dtype,
infer_dtype_from,
maybe_downcast_to_dtype,
soft_convert_objects,
)
from pandas.core.dtypes.common import (
is_array_like,
is_numeric_v_string_like,
Expand All @@ -39,6 +45,8 @@
na_value_for_dtype,
)

from pandas.core.construction import extract_array

if TYPE_CHECKING:
from pandas import Index

Expand Down Expand Up @@ -975,3 +983,85 @@ def _rolling_window(a: npt.NDArray[np.bool_], window: int) -> npt.NDArray[np.boo
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)


def _can_hold_element(values, element: Any) -> bool:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a version of this that makes the existing can_hold_element more accurate instead of implementing another function?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not fully sure what I was thinking in April / if that was already the case, but so right now the existing can_hold_element actually fully covers what I needed here.

It only doesn't do extract_array compared to Block._can_hold_element, so kept that here for now. But probably that could be moved to can_hold_element as well (will check later).

"""
Expanded version of core.dtypes.cast.can_hold_element
"""
element = extract_array(element, extract_numpy=True)
return can_hold_element(values, element)


def fillna_ea_array(values, value, limit=None, inplace: bool = False, downcast=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

values: ExtensionArray?

"""
Fillna logic for ExtensionArrays.

Dispatches to the EA.fillna method (in which case downcast is currently
ignored), except for datetime64 in which case fallback to object dtype
is currently allowed.
"""
if not _can_hold_element(values, value) and values.dtype.kind == "M":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think this would change behavior for e.g. Period or Interval?

(which would make us more consistent across dtypes, which i would like to see, just needs to be intentional)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then we don't have tests for it in any case, since the tests are passing.

Now, checking for period, it seems to raise for both of array and Series:

In [5]: arr = pd.period_range("2012", periods=3).array

In [6]: arr[0] = None

In [8]: arr.fillna("?")
...
TypeError: value should be a 'Period', 'NaT', or array of those. Got 'str' instead.

In [9]: pd.Series(arr).fillna("?")
...
TypeError: value should be a 'Period', 'NaT', or array of those. Got 'str' instead.

while indeed for DatetimeArray, we coerce to object for Series:

In [10]: arr = pd.date_range("2012", periods=3).array

In [11]: arr[0] = None

In [12]: arr.fillna("?")
..
TypeError: value should be a 'Timestamp', 'NaT', or array of those. Got 'str' instead.

In [13]: pd.Series(arr).fillna("?")
Out[13]: 
0                      ?
1    2012-01-02 00:00:00
2    2012-01-03 00:00:00
dtype: object

# We support filling a DatetimeTZ with a `value` whose timezone
# is different by coercing to object.
# TODO: don't special-case td64
values = values.astype(object)
return fillna_array(values, value, limit=limit, inplace=True, downcast=downcast)

return values.fillna(value, limit=limit)


def fillna_array(values, value, limit=None, inplace: bool = False, downcast=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could be used for Block.fillna too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't directly see an easy way to do that. The Block.fillna additionally has the "split" logic, which I personally don't want to put in here. And it's not directly straightforward to split a part of Block.fillna.

I could for example let this function return some sentinel like NotImplemented or None to signal to Block.fillna that the filling hasn't been done yet, and it should split the block and call fillna again. In which case most of the splitting logic stays in the internals. I could try something like this if you want, but it's also a bit of a hack.

Copy link
Member

@jbrockmendel jbrockmendel Nov 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Block.fillna additionally has the "split" logic, which I personally don't want to put in here

100% agree

I've been looking at simplifying the downcasting in a way that would make the splitting much easier to separate the splitting from non-splitting (not just for fillna) (xref #44241, #40988). It would be an API change so wouldn't be implement in time to simplify this (and your putmask PR), but would eventually allow for some cleanup.

"""
Fillna logic for np.ndarray/ExtensionArray.

This includes the logic for downcasting if needed.
"""
from pandas.core.array_algos.putmask import (
putmask_inplace,
validate_putmask,
)
from pandas.core.arrays import ExtensionArray

# inplace = validate_bool_kwarg(inplace, "inplace")

if isinstance(values, ExtensionArray):
return fillna_ea_array(
values, value, limit=limit, inplace=inplace, downcast=downcast
)

mask = isna(values)
mask, noop = validate_putmask(values, mask)

if limit is not None:
limit = algos.validate_limit(None, limit=limit)
mask[mask.cumsum(values.ndim - 1) > limit] = False

if values.dtype.kind in ["b", "i", "u"]:
# those dtypes can never hold NAs
if inplace:
return values
else:
return values.copy()

if _can_hold_element(values, value):
values = values if inplace else values.copy()
putmask_inplace(values, mask, value)

if values.dtype == np.dtype(object):
if downcast is None:
values = soft_convert_objects(values, datetime=True, numeric=False)

if downcast is None and values.dtype.kind not in ["f", "m", "M"]:
downcast = "infer"
if downcast:
values = maybe_downcast_to_dtype(values, downcast)
return values

if noop:
# we can't process the value, but nothing to do
return values if inplace else values.copy()
else:
values = coerce_to_target_dtype(values, value)
# bc we have already cast, inplace=True may avoid an extra copy
return fillna_array(values, value, limit=limit, inplace=True, downcast=None)
3 changes: 0 additions & 3 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import (
Categorical,
DataFrame,
Expand Down Expand Up @@ -281,7 +279,6 @@ def test_fillna_dtype_conversion_equiv_replace(self, val):
result = df.fillna(val)
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_invalid_test
def test_fillna_datetime_columns(self):
# GH#7095
df = DataFrame(
Expand Down