DEPR: concat ignoring all-NA entries #58314

Merged · 3 commits · Apr 19, 2024

1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -206,6 +206,7 @@ Removal of prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
- :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
- :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`)
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
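For illustration, a minimal sketch of the behavior these two entries describe, mirroring the updated test_concat_none_with_timezone_timestamp further below; the expected dtype assumes pandas 3.0 with this change applied:

import pandas as pd

# df1 is an all-NA object column, df2 a tz-aware datetime column.
df1 = pd.DataFrame([{"A": None}])
df2 = pd.DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}])

result = pd.concat([df1, df2], ignore_index=True)
# Previously the all-NA entry was ignored and the result kept the datetime
# dtype; with the deprecation enforced, both dtypes participate in the
# common-dtype resolution and the result is object.
print(result["A"].dtype)  # expected: object
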
91 changes: 7 additions & 84 deletions pandas/core/internals/concat.py
@@ -4,7 +4,6 @@
TYPE_CHECKING,
cast,
)
import warnings

import numpy as np

@@ -16,27 +15,18 @@
)
from pandas._libs.missing import NA
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
)
from pandas.core.dtypes.common import (
is_1d_only_ea_dtype,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
ExtensionDtype,
SparseDtype,
)
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
isna_all,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import is_valid_na_for_dtype

from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.blocks import (
@@ -100,6 +90,7 @@ def concatenate_managers(
if first_dtype in [np.float64, np.float32]:
# TODO: support more dtypes here. This will be simpler once
# JoinUnit.is_na behavior is deprecated.
# (update 2024-04-13 that deprecation has been enforced)
if (
all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
and len(mgrs_indexers) > 1
@@ -351,41 +342,6 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool:

@cache_readonly
def is_na(self) -> bool:
blk = self.block
if blk.dtype.kind == "V":
return True

if not blk._can_hold_na:
return False

values = blk.values
if values.size == 0:
# GH#39122 this case will return False once deprecation is enforced
return True

if isinstance(values.dtype, SparseDtype):
return False

if values.ndim == 1:
# TODO(EA2D): no need for special case with 2D EAs
val = values[0]
if not is_scalar(val) or not isna(val):
# ideally isna_all would do this short-circuiting
return False
return isna_all(values)
else:
val = values[0][0]
if not is_scalar(val) or not isna(val):
# ideally isna_all would do this short-circuiting
return False
return all(isna_all(row) for row in values)

@cache_readonly
def is_na_after_size_and_isna_all_deprecation(self) -> bool:
"""
Will self.is_na be True after values.size == 0 deprecation and isna_all
deprecation are enforced?
"""
blk = self.block
if blk.dtype.kind == "V":
return True
@@ -421,7 +377,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
"""
Concatenate values from several join units along axis=1.
"""
empty_dtype, empty_dtype_future = _get_empty_dtype(join_units)
empty_dtype = _get_empty_dtype(join_units)

has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
@@ -446,18 +402,6 @@
else:
concat_values = concat_compat(to_concat, axis=1)

if empty_dtype != empty_dtype_future:
if empty_dtype == concat_values.dtype:
# GH#39122, GH#40893
warnings.warn(
"The behavior of DataFrame concatenation with empty or all-NA "
"entries is deprecated. In a future version, this will no longer "
"exclude empty or all-NA columns when determining the result dtypes. "
"To retain the old behavior, exclude the relevant entries before "
"the concat operation.",
FutureWarning,
stacklevel=find_stack_level(),
)
return concat_values


@@ -484,7 +428,7 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
raise NotImplementedError


def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]:
def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
"""
Return dtype and N/A values to use when concatenating specified units.

@@ -496,38 +440,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj
"""
if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]):
empty_dtype = join_units[0].block.dtype
return empty_dtype, empty_dtype
return empty_dtype

has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)

dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
if not len(dtypes):
dtypes = [
unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
]

dtype = find_common_type(dtypes)
if has_none_blocks:
dtype = ensure_dtype_can_hold_na(dtype)

dtype_future = dtype
if len(dtypes) != len(join_units):
dtypes_future = [
unit.block.dtype
for unit in join_units
if not unit.is_na_after_size_and_isna_all_deprecation
]
if not len(dtypes_future):
dtypes_future = [
unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
]

if len(dtypes) != len(dtypes_future):
dtype_future = find_common_type(dtypes_future)
if has_none_blocks:
dtype_future = ensure_dtype_can_hold_na(dtype_future)

return dtype, dtype_future
return dtype


def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
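The FutureWarning removed in this file advised excluding empty or all-NA entries before concatenating to retain the old result dtype. A minimal sketch of that workaround; the helper name and its fallback are illustrative, not part of pandas:

import pandas as pd

def concat_excluding_empty_or_all_na(frames, **kwargs):
    # Drop frames that are empty or entirely NA so they do not influence
    # the result dtype, as the removed warning message suggested.
    kept = [df for df in frames if not df.empty and not df.isna().all().all()]
    # Fall back to the original input if everything would be dropped.
    return pd.concat(kept if kept else list(frames), **kwargs)
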
8 changes: 1 addition & 7 deletions pandas/tests/reshape/concat/test_append.py
@@ -332,7 +332,7 @@ def test_append_empty_tz_frame_with_datetime64ns(self):

# pd.NaT gets inferred as tz-naive, so append result is tz-naive
result = df._append({"a": pd.NaT}, ignore_index=True)
expected = DataFrame({"a": [np.nan]}, dtype=object)
expected = DataFrame({"a": [pd.NaT]}, dtype=object)
tm.assert_frame_equal(result, expected)

# also test with typed value to append
@@ -359,12 +359,6 @@ def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
result = df._append(other, ignore_index=True)

expected = other.astype(object)
if isinstance(val, str) and dtype_str != "int64":
# TODO: expected used to be `other.astype(object)` which is a more
# reasonable result. This was changed when tightening
# assert_frame_equal's treatment of mismatched NAs to match the
# existing behavior.
expected = DataFrame({"a": [np.nan]}, dtype=object)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
49 changes: 30 additions & 19 deletions pandas/tests/reshape/concat/test_concat.py
@@ -789,21 +789,24 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)

msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
warn = None
needs_update = False
if df_dtype == "datetime64[ns]" or (
df_dtype == "float64" and empty_dtype != "float64"
):
warn = FutureWarning
with tm.assert_produces_warning(warn, match=msg):
result = concat([empty, df])
needs_update = True

result = concat([empty, df])
expected = df
if df_dtype == "int64":
# TODO what exact behaviour do we want for integer eventually?
if empty_dtype == "float64":
expected = df.astype("float64")
else:
expected = df.astype("object")

if needs_update:
# GH#40893 changed the expected here to retain dependence on empty
expected = expected.astype(object)
tm.assert_frame_equal(result, expected)


@@ -820,17 +823,19 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
else:
df_dtype = "float64"

msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
warn = None
needs_update = False
if empty_dtype != df_dtype and empty_dtype is not None:
warn = FutureWarning
needs_update = True
elif df_dtype == "datetime64[ns]":
warn = FutureWarning
needs_update = True

with tm.assert_produces_warning(warn, match=msg):
result = concat([empty, df], ignore_index=True)
result = concat([empty, df], ignore_index=True)

expected = DataFrame({"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=df_dtype)
if needs_update:
# GH#40893 changed the expected here to retain dependence on empty
expected = expected.astype(object)
expected.iloc[0] = np.nan
tm.assert_frame_equal(result, expected)


@@ -841,10 +846,16 @@ def test_concat_ignore_empty_from_reindex():

aligned = df2.reindex(columns=df1.columns)

msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([df1, aligned], ignore_index=True)
expected = df1 = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]})
result = concat([df1, aligned], ignore_index=True)

expected = DataFrame(
{
"a": [1, 2],
"b": pd.array([pd.Timestamp("2012-01-01"), np.nan], dtype=object),
},
dtype=object,
)
expected["a"] = expected["a"].astype("int64")
tm.assert_frame_equal(result, expected)


@@ -907,10 +918,10 @@ def test_concat_none_with_timezone_timestamp():
# GH#52093
df1 = DataFrame([{"A": None}])
df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}])
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([df1, df2], ignore_index=True)
expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
result = concat([df1, df2], ignore_index=True)
expected = DataFrame(
{"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]}, dtype=object
)
tm.assert_frame_equal(result, expected)


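As a companion to the updated tests above, a small sketch of how an empty frame's dtype now also participates in the result dtype (GH#39122); the expected dtypes follow the branches in test_concat_ignore_empty_object_float and assume pandas 3.0:

import pandas as pd

df_int = pd.DataFrame({"foo": [1, 2], "bar": [1, 2]})              # int64 columns
empty_obj = pd.DataFrame(columns=["foo", "bar"], dtype=object)     # zero rows, object
empty_flt = pd.DataFrame(columns=["foo", "bar"], dtype="float64")  # zero rows, float64

# The empty frame is no longer ignored when determining the output dtype.
print(pd.concat([empty_obj, df_int]).dtypes)  # expected: object for both columns
print(pd.concat([empty_flt, df_int]).dtypes)  # expected: float64 for both columns
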
18 changes: 5 additions & 13 deletions pandas/tests/reshape/concat/test_datetimes.py
@@ -226,15 +226,6 @@ def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item):
expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
if tz1 != tz2:
expected = expected.astype(object)
if item is pd.NaT:
# GH#18463
# TODO: setting nan here is to keep the test passing as we
# make assert_frame_equal stricter, but is nan really the
# ideal behavior here?
if tz1 is not None:
expected.iloc[-1, 0] = np.nan
else:
expected.iloc[:-1, 0] = np.nan

tm.assert_frame_equal(result, expected)

@@ -590,8 +581,9 @@ def test_concat_float_datetime64():
result = concat([df_time.iloc[:0], df_float])
tm.assert_frame_equal(result, expected)

expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([df_time, df_float.iloc[:0]])
expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype(
object
)

result = concat([df_time, df_float.iloc[:0]])
tm.assert_frame_equal(result, expected)
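
If the datetime64 result is still wanted after concatenating with an empty float frame, the object-dtype column can be converted back explicitly. A short sketch, reconstructing frames equivalent to the ones in test_concat_float_datetime64 so it is self-contained:

import pandas as pd

df_time = pd.DataFrame({"A": pd.to_datetime(["2000-01-01"])})
df_float = pd.DataFrame({"A": pd.Series([], dtype="float64")})

# Under the enforced behavior the empty float block participates in dtype
# resolution, so the concatenated column comes back as object...
result = pd.concat([df_time, df_float])
# ...and can be converted back if a datetime dtype is the intended result.
result["A"] = pd.to_datetime(result["A"])
print(result["A"].dtype)  # expected: a datetime64 dtype again (ns under current defaults)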
10 changes: 4 additions & 6 deletions pandas/tests/reshape/merge/test_merge.py
@@ -709,16 +709,14 @@ def test_join_append_timedeltas(self):
{"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]}
)
df = DataFrame(columns=list("dt"))
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
warn = FutureWarning
with tm.assert_produces_warning(warn, match=msg):
df = concat([df, d], ignore_index=True)
result = concat([df, d], ignore_index=True)
df = concat([df, d], ignore_index=True)
result = concat([df, d], ignore_index=True)
expected = DataFrame(
{
"d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
"t": [timedelta(0, 22500), timedelta(0, 22500)],
}
},
dtype=object,
)
tm.assert_frame_equal(result, expected)
