-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
CLN: avoid values_from_object in Series #32426
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
2d8a274
ec7d005
cf6466b
29c785b
45a278f
fb6c6ff
200ac68
09c7354
da451a1
1017b08
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,7 @@ | |
|
||
from pandas._config import get_option | ||
|
||
from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib | ||
from pandas._libs import NaT, Period, Timedelta, Timestamp, iNaT, lib | ||
from pandas._typing import Dtype, Scalar | ||
from pandas.compat._optional import import_optional_dependency | ||
|
||
|
@@ -17,9 +17,7 @@ | |
is_any_int_dtype, | ||
is_bool_dtype, | ||
is_complex, | ||
is_datetime64_dtype, | ||
is_datetime64tz_dtype, | ||
is_datetime_or_timedelta_dtype, | ||
is_datetime64_any_dtype, | ||
is_float, | ||
is_float_dtype, | ||
is_integer, | ||
|
@@ -28,10 +26,14 @@ | |
is_object_dtype, | ||
is_scalar, | ||
is_timedelta64_dtype, | ||
needs_i8_conversion, | ||
pandas_dtype, | ||
) | ||
from pandas.core.dtypes.dtypes import PeriodDtype | ||
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna | ||
|
||
from pandas.core.construction import extract_array | ||
|
||
bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn") | ||
_BOTTLENECK_INSTALLED = bn is not None | ||
_USE_BOTTLENECK = False | ||
|
@@ -132,10 +134,8 @@ def f( | |
|
||
|
||
def _bn_ok_dtype(dtype: Dtype, name: str) -> bool: | ||
# Bottleneck chokes on datetime64 | ||
if not is_object_dtype(dtype) and not ( | ||
is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype) | ||
): | ||
# Bottleneck chokes on datetime64, PeriodDtype (or any EA) | ||
if not is_object_dtype(dtype) and not needs_i8_conversion(dtype): | ||
|
||
# GH 15507 | ||
# bottleneck does not properly upcast during the sum | ||
|
@@ -281,23 +281,20 @@ def _get_values( | |
# with scalar fill_value. This guarantee is important for the | ||
# maybe_upcast_putmask call below | ||
assert is_scalar(fill_value) | ||
values = extract_array(values, extract_numpy=True) | ||
|
||
mask = _maybe_get_mask(values, skipna, mask) | ||
|
||
if is_datetime64tz_dtype(values): | ||
# lib.values_from_object returns M8[ns] dtype instead of tz-aware, | ||
# so this case must be handled separately from the rest | ||
dtype = values.dtype | ||
values = getattr(values, "_values", values) | ||
else: | ||
values = lib.values_from_object(values) | ||
dtype = values.dtype | ||
dtype = values.dtype | ||
|
||
if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): | ||
if needs_i8_conversion(values): | ||
# changing timedelta64/datetime64 to int64 needs to happen after | ||
# finding `mask` above | ||
values = getattr(values, "asi8", values) | ||
values = values.view(np.int64) | ||
if isinstance(values, np.ndarray): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this case still here? When is this actually an ndarray (or conversely, when is this a DTI/TDI)? It is non-obvious how we get to this point. |
||
values = values.view(np.int64) | ||
else: | ||
# DatetimeArray or TimedeltaArray, use asi8 to get a view | ||
values = values.asi8 | ||
|
||
dtype_ok = _na_ok_dtype(dtype) | ||
|
||
|
@@ -311,7 +308,8 @@ def _get_values( | |
|
||
if skipna and copy: | ||
values = values.copy() | ||
if dtype_ok: | ||
assert mask is not None # for mypy | ||
if dtype_ok and mask.any(): | ||
np.putmask(values, mask, fill_value) | ||
|
||
# promote if needed | ||
|
@@ -330,12 +328,14 @@ def _get_values( | |
|
||
def _na_ok_dtype(dtype) -> bool: | ||
# TODO: what about datetime64tz? PeriodDtype? | ||
return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) | ||
if needs_i8_conversion(dtype): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return False | ||
return not issubclass(dtype.type, np.integer) | ||
|
||
|
||
def _wrap_results(result, dtype: Dtype, fill_value=None): | ||
""" wrap our results if needed """ | ||
if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): | ||
if is_datetime64_any_dtype(dtype): | ||
if fill_value is None: | ||
# GH#24293 | ||
fill_value = iNaT | ||
|
@@ -346,7 +346,8 @@ def _wrap_results(result, dtype: Dtype, fill_value=None): | |
result = np.nan | ||
result = Timestamp(result, tz=tz) | ||
else: | ||
result = result.view(dtype) | ||
# If we have float dtype, taking a view will give the wrong result | ||
result = result.astype(dtype) | ||
elif is_timedelta64_dtype(dtype): | ||
if not isinstance(result, np.ndarray): | ||
if result == fill_value: | ||
|
@@ -360,6 +361,14 @@ def _wrap_results(result, dtype: Dtype, fill_value=None): | |
else: | ||
result = result.astype("m8[ns]").view(dtype) | ||
|
||
elif isinstance(dtype, PeriodDtype): | ||
if is_float(result) and result.is_integer(): | ||
result = int(result) | ||
if is_integer(result): | ||
result = Period._from_ordinal(result, freq=dtype.freq) | ||
else: | ||
raise NotImplementedError(type(result), result) | ||
|
||
return result | ||
|
||
|
||
|
@@ -546,12 +555,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): | |
) | ||
dtype_sum = dtype_max | ||
dtype_count = np.float64 | ||
if ( | ||
is_integer_dtype(dtype) | ||
or is_timedelta64_dtype(dtype) | ||
or is_datetime64_dtype(dtype) | ||
or is_datetime64tz_dtype(dtype) | ||
): | ||
if is_integer_dtype(dtype) or needs_i8_conversion(dtype): | ||
dtype_sum = np.float64 | ||
elif is_float_dtype(dtype): | ||
dtype_sum = dtype | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1990,7 +1990,7 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): | |
nan | ||
""" | ||
skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs) | ||
i = nanops.nanargmin(com.values_from_object(self), skipna=skipna) | ||
i = nanops.nanargmin(self._values, skipna=skipna) | ||
if i == -1: | ||
return np.nan | ||
return self.index[i] | ||
|
@@ -2061,7 +2061,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): | |
nan | ||
""" | ||
skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) | ||
i = nanops.nanargmax(com.values_from_object(self), skipna=skipna) | ||
i = nanops.nanargmax(self._values, skipna=skipna) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nanargmax/nanargmin expect to get an ndarray. Due to this change, it is no longer guaranteed to be an ndarray. Reported this as #32749. So those lines should either be reverted, or another "convert to ndarray" function should be used (or nanargmax/nanargmin could be rewritten to support EAs, but personally I think it is much cleaner to keep those algos based on numpy arrays). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jbrockmendel can you respond to this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I'd be fine with either of these options. Probably prefer both actually: an EA-supporting public method and an ndarray-only private method for each of the relevant nanops funcs. |
||
if i == -1: | ||
return np.nan | ||
return self.index[i] | ||
|
@@ -2099,7 +2099,7 @@ def round(self, decimals=0, *args, **kwargs) -> "Series": | |
dtype: float64 | ||
""" | ||
nv.validate_round(args, kwargs) | ||
result = com.values_from_object(self).round(decimals) | ||
result = self._values.round(decimals) | ||
result = self._constructor(result, index=self.index).__finalize__(self) | ||
|
||
return result | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -875,11 +875,6 @@ def test_mean_datetimelike(self): | |
expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) | ||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.xfail( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess you should technically have a whatsnew note as this 'bug' is fixed (do in a follow-on) |
||
reason="casts to object-dtype and then tries to add timestamps", | ||
raises=TypeError, | ||
strict=True, | ||
) | ||
def test_mean_datetimelike_numeric_only_false(self): | ||
df = pd.DataFrame( | ||
{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you apply needs_i8_conversion instead here? (like you do below), this is a non-standard usage
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
trouble is we don't want to include td64
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok how about using is_datetime64_any_dtype then; this is a non-obvious pattern