Skip to content

Commit 084c543

Browse files
authored
BUG/API: concat with empty DataFrames or all-NA columns (#43507)
1 parent 6161f67 commit 084c543

File tree

6 files changed

+62
-55
lines changed

6 files changed

+62
-55
lines changed

Diff for: doc/source/whatsnew/v1.4.0.rst

+37-2
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,44 @@ The ``dayfirst`` option of :func:`to_datetime` isn't strict, and this can lead t
150150
Now, a warning will be raised if a date string cannot be parsed in accordance with the given ``dayfirst`` value when
151151
the value is a delimited date string (e.g. ``31-12-2012``).
152152

153-
.. _whatsnew_140.notable_bug_fixes.notable_bug_fix2:
153+
.. _whatsnew_140.notable_bug_fixes.concat_with_empty_or_all_na:
154154

155-
notable_bug_fix2
155+
Ignoring dtypes in concat with empty or all-NA columns
156+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
157+
158+
When using :func:`concat` to concatenate two or more :class:`DataFrame` objects,
159+
if one of the DataFrames was empty or had all-NA values, its dtype was *sometimes*
160+
ignored when finding the concatenated dtype. These are now consistently *not* ignored (:issue:`43507`).
161+
162+
.. ipython:: python
163+
164+
df1 = pd.DataFrame({"bar": [pd.Timestamp("2013-01-01")]}, index=range(1))
165+
df2 = pd.DataFrame({"bar": np.nan}, index=range(1, 2))
166+
res = df1.append(df2)
167+
168+
Previously, the float-dtype in ``df2`` would be ignored so the result dtype would be ``datetime64[ns]``. As a result, the ``np.nan`` would be cast to ``NaT``.
169+
170+
*Previous behavior*:
171+
172+
.. code-block:: ipython
173+
174+
In [4]: res
175+
Out[4]:
176+
bar
177+
0 2013-01-01
178+
1 NaT
179+
180+
Now the float-dtype is respected. Since the common dtype for these DataFrames is object, the ``np.nan`` is retained.
181+
182+
*New behavior*:
183+
184+
.. ipython:: python
185+
186+
res
187+
188+
.. _whatsnew_140.notable_bug_fixes.notable_bug_fix3:
189+
190+
notable_bug_fix3
156191
^^^^^^^^^^^^^^^^
157192

158193
.. ---------------------------------------------------------------------------

Diff for: pandas/core/indexing.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -1921,6 +1921,7 @@ def _setitem_with_indexer_missing(self, indexer, value):
19211921
# no columns and scalar
19221922
raise ValueError("cannot set a frame with no defined columns")
19231923

1924+
has_dtype = hasattr(value, "dtype")
19241925
if isinstance(value, ABCSeries):
19251926
# append a Series
19261927
value = value.reindex(index=self.obj.columns, copy=True)
@@ -1938,7 +1939,18 @@ def _setitem_with_indexer_missing(self, indexer, value):
19381939

19391940
value = Series(value, index=self.obj.columns, name=indexer)
19401941

1941-
self.obj._mgr = self.obj.append(value)._mgr
1942+
if not len(self.obj):
1943+
# We will ignore the existing dtypes instead of using
1944+
# internals.concat logic
1945+
df = value.to_frame().T
1946+
df.index = [indexer]
1947+
if not has_dtype:
1948+
# i.e. if we already had a Series or ndarray, keep that
1949+
# dtype. But if we had a list or dict, then do inference
1950+
df = df.infer_objects()
1951+
self.obj._mgr = df._mgr
1952+
else:
1953+
self.obj._mgr = self.obj.append(value)._mgr
19421954
self.obj._maybe_update_cacher(clear=True)
19431955

19441956
def _ensure_iterable_column_indexer(self, column_indexer):

Diff for: pandas/core/internals/concat.py

+2-33
Original file line numberDiff line numberDiff line change
@@ -32,26 +32,20 @@
3232
is_1d_only_ea_obj,
3333
is_datetime64tz_dtype,
3434
is_dtype_equal,
35-
is_scalar,
3635
needs_i8_conversion,
3736
)
3837
from pandas.core.dtypes.concat import (
3938
cast_to_common_type,
4039
concat_compat,
4140
)
4241
from pandas.core.dtypes.dtypes import ExtensionDtype
43-
from pandas.core.dtypes.missing import (
44-
is_valid_na_for_dtype,
45-
isna,
46-
isna_all,
47-
)
42+
from pandas.core.dtypes.missing import is_valid_na_for_dtype
4843

4944
import pandas.core.algorithms as algos
5045
from pandas.core.arrays import (
5146
DatetimeArray,
5247
ExtensionArray,
5348
)
54-
from pandas.core.arrays.sparse import SparseDtype
5549
from pandas.core.construction import ensure_wrapped_if_datetimelike
5650
from pandas.core.internals.array_manager import (
5751
ArrayManager,
@@ -422,29 +416,7 @@ def is_na(self) -> bool:
422416
blk = self.block
423417
if blk.dtype.kind == "V":
424418
return True
425-
426-
if not blk._can_hold_na:
427-
return False
428-
429-
values = blk.values
430-
if values.size == 0:
431-
return True
432-
if isinstance(values.dtype, SparseDtype):
433-
return False
434-
435-
if values.ndim == 1:
436-
# TODO(EA2D): no need for special case with 2D EAs
437-
val = values[0]
438-
if not is_scalar(val) or not isna(val):
439-
# ideally isna_all would do this short-circuiting
440-
return False
441-
return isna_all(values)
442-
else:
443-
val = values[0][0]
444-
if not is_scalar(val) or not isna(val):
445-
# ideally isna_all would do this short-circuiting
446-
return False
447-
return all(isna_all(row) for row in values)
419+
return False
448420

449421
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
450422
values: ArrayLike
@@ -590,9 +562,6 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
590562
# different from missing.na_value_for_dtype
591563
return None
592564
elif dtype.kind in ["i", "u"]:
593-
if not has_none_blocks:
594-
# different from missing.na_value_for_dtype
595-
return None
596565
return np.nan
597566
elif dtype.kind == "O":
598567
return np.nan

Diff for: pandas/tests/frame/methods/test_append.py

+4-11
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def test_append_empty_dataframe(self):
140140
expected = df1.copy()
141141
tm.assert_frame_equal(result, expected)
142142

143-
def test_append_dtypes(self, using_array_manager):
143+
def test_append_dtypes(self):
144144

145145
# GH 5754
146146
# row appends of different dtypes (so need to do by-item)
@@ -164,10 +164,7 @@ def test_append_dtypes(self, using_array_manager):
164164
expected = DataFrame(
165165
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
166166
)
167-
if using_array_manager:
168-
# TODO(ArrayManager) decide on exact casting rules in concat
169-
# With ArrayManager, all-NaN float is not ignored
170-
expected = expected.astype(object)
167+
expected = expected.astype(object)
171168
tm.assert_frame_equal(result, expected)
172169

173170
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
@@ -176,9 +173,7 @@ def test_append_dtypes(self, using_array_manager):
176173
expected = DataFrame(
177174
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
178175
)
179-
if using_array_manager:
180-
# With ArrayManager, all-NaN float is not ignored
181-
expected = expected.astype(object)
176+
expected = expected.astype(object)
182177
tm.assert_frame_equal(result, expected)
183178

184179
df1 = DataFrame({"bar": np.nan}, index=range(1))
@@ -187,9 +182,7 @@ def test_append_dtypes(self, using_array_manager):
187182
expected = DataFrame(
188183
{"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}
189184
)
190-
if using_array_manager:
191-
# With ArrayManager, all-NaN float is not ignored
192-
expected = expected.astype(object)
185+
expected = expected.astype(object)
193186
tm.assert_frame_equal(result, expected)
194187

195188
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))

Diff for: pandas/tests/indexing/test_partial.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ def test_partial_setting_mixed_dtype(self):
168168
# columns will align
169169
df = DataFrame(columns=["A", "B"])
170170
df.loc[0] = Series(1, index=range(4))
171-
tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0]))
171+
expected = DataFrame(columns=["A", "B"], index=[0], dtype=np.float64)
172+
tm.assert_frame_equal(df, expected)
172173

173174
# columns will align
174175
# TODO: it isn't great that this behavior depends on consolidation
@@ -185,11 +186,10 @@ def test_partial_setting_mixed_dtype(self):
185186
with pytest.raises(ValueError, match=msg):
186187
df.loc[0] = [1, 2, 3]
187188

188-
# TODO: #15657, these are left as object and not coerced
189189
df = DataFrame(columns=["A", "B"])
190190
df.loc[3] = [6, 7]
191191

192-
exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype="object")
192+
exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype=np.int64)
193193
tm.assert_frame_equal(df, exp)
194194

195195
def test_series_partial_set(self):

Diff for: pandas/tests/reshape/merge/test_merge.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -695,7 +695,7 @@ def _constructor(self):
695695

696696
assert isinstance(result, NotADataFrame)
697697

698-
def test_join_append_timedeltas(self, using_array_manager):
698+
def test_join_append_timedeltas(self):
699699
# timedelta64 issues with join/merge
700700
# GH 5695
701701

@@ -707,11 +707,9 @@ def test_join_append_timedeltas(self, using_array_manager):
707707
{
708708
"d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
709709
"t": [timedelta(0, 22500), timedelta(0, 22500)],
710-
}
710+
},
711+
dtype=object,
711712
)
712-
if using_array_manager:
713-
# TODO(ArrayManager) decide on exact casting rules in concat
714-
expected = expected.astype(object)
715713
tm.assert_frame_equal(result, expected)
716714

717715
def test_join_append_timedeltas2(self):

0 commit comments

Comments
 (0)