-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
[ArrayManager] Array version of fillna logic #41104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
dce2341
d0e00b3
a1b3b6a
c645017
3226ef5
643b6a5
e0de7b8
a6f74db
bdf9c5a
e67faa1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,7 +27,13 @@ | |
) | ||
from pandas.compat._optional import import_optional_dependency | ||
|
||
from pandas.core.dtypes.cast import infer_dtype_from | ||
from pandas.core.dtypes.cast import ( | ||
can_hold_element, | ||
coerce_to_target_dtype, | ||
infer_dtype_from, | ||
maybe_downcast_to_dtype, | ||
soft_convert_objects, | ||
) | ||
from pandas.core.dtypes.common import ( | ||
is_array_like, | ||
is_numeric_v_string_like, | ||
|
@@ -39,6 +45,8 @@ | |
na_value_for_dtype, | ||
) | ||
|
||
from pandas.core.construction import extract_array | ||
|
||
if TYPE_CHECKING: | ||
from pandas import Index | ||
|
||
|
@@ -975,3 +983,85 @@ def _rolling_window(a: npt.NDArray[np.bool_], window: int) -> npt.NDArray[np.boo | |
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) | ||
strides = a.strides + (a.strides[-1],) | ||
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) | ||
|
||
|
||
def _can_hold_element(values, element: Any) -> bool: | ||
""" | ||
Expanded version of core.dtypes.cast.can_hold_element | ||
""" | ||
element = extract_array(element, extract_numpy=True) | ||
return can_hold_element(values, element) | ||
|
||
|
||
def fillna_ea_array(values, value, limit=None, inplace: bool = False, downcast=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
""" | ||
Fillna logic for ExtensionArrays. | ||
|
||
Dispatches to the EA.fillna method (in which case downcast is currently | ||
ignored), except for datetime64 in which case fallback to object dtype | ||
is currently allowed. | ||
""" | ||
if not _can_hold_element(values, value) and values.dtype.kind == "M": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think this would change behavior for e.g. Period or Interval? (which would make us more consistent across dtypes, which i would like to see, just needs to be intentional) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then we don't have tests for it in any case, since the tests are passing. Now, checking for period, it seems to raise for both of array and Series:
while indeed for DatetimeArray, we coerce to object for Series:
|
||
# We support filling a DatetimeTZ with a `value` whose timezone | ||
# is different by coercing to object. | ||
# TODO: don't special-case td64 | ||
values = values.astype(object) | ||
return fillna_array(values, value, limit=limit, inplace=True, downcast=downcast) | ||
|
||
return values.fillna(value, limit=limit) | ||
|
||
|
||
def fillna_array(values, value, limit=None, inplace: bool = False, downcast=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could be used for Block.fillna too? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't directly see an easy way to do that. The Block.fillna additionally has the "split" logic, which I personally don't want to put in here. And it's not directly straightforward to split a part of Block.fillna. I could for example let this function return some sentinel like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
100% agree I've been looking at simplifying the downcasting in a way that would make the splitting much easier to separate the splitting from non-splitting (not just for fillna) (xref #44241, #40988). It would be an API change so wouldn't be implement in time to simplify this (and your putmask PR), but would eventually allow for some cleanup. |
||
""" | ||
Fillna logic for np.ndarray/ExtensionArray. | ||
|
||
This includes the logic for downcasting if needed. | ||
""" | ||
from pandas.core.array_algos.putmask import ( | ||
putmask_inplace, | ||
validate_putmask, | ||
) | ||
from pandas.core.arrays import ExtensionArray | ||
|
||
# inplace = validate_bool_kwarg(inplace, "inplace") | ||
|
||
if isinstance(values, ExtensionArray): | ||
return fillna_ea_array( | ||
values, value, limit=limit, inplace=inplace, downcast=downcast | ||
) | ||
|
||
mask = isna(values) | ||
mask, noop = validate_putmask(values, mask) | ||
|
||
if limit is not None: | ||
limit = algos.validate_limit(None, limit=limit) | ||
mask[mask.cumsum(values.ndim - 1) > limit] = False | ||
|
||
if values.dtype.kind in ["b", "i", "u"]: | ||
# those dtypes can never hold NAs | ||
if inplace: | ||
return values | ||
else: | ||
return values.copy() | ||
|
||
if _can_hold_element(values, value): | ||
values = values if inplace else values.copy() | ||
putmask_inplace(values, mask, value) | ||
|
||
if values.dtype == np.dtype(object): | ||
if downcast is None: | ||
values = soft_convert_objects(values, datetime=True, numeric=False) | ||
|
||
if downcast is None and values.dtype.kind not in ["f", "m", "M"]: | ||
downcast = "infer" | ||
if downcast: | ||
values = maybe_downcast_to_dtype(values, downcast) | ||
return values | ||
|
||
if noop: | ||
# we can't process the value, but nothing to do | ||
return values if inplace else values.copy() | ||
else: | ||
values = coerce_to_target_dtype(values, value) | ||
# bc we have already cast, inplace=True may avoid an extra copy | ||
return fillna_array(values, value, limit=limit, inplace=True, downcast=None) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is there a version of this that makes the existing can_hold_element more accurate instead of implementing another function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not fully sure what I was thinking in April / if that was already the case, but so right now the existing
can_hold_element
actually fully covers what I needed here.It only doesn't do
extract_array
compared toBlock._can_hold_element
, so kept that here for now. But probably that could be moved tocan_hold_element
as well (will check later).