BUG/API: concat with empty DataFrames or all-NA columns (#43507)

jbrockmendel · web-flow · commit 084c543bf9e7 · 2021-09-14T21:37:14.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -150,9 +150,44 @@ The ``dayfirst`` option of :func:`to_datetime` isn't strict, and this can lead t
 Now, a warning will be raised if a date string cannot be parsed accordance to the given ``dayfirst`` value when
 the value is a delimited date string (e.g. ``31-12-2012``).
 
-.. _whatsnew_140.notable_bug_fixes.notable_bug_fix2:
+.. _whatsnew_140.notable_bug_fixes.concat_with_empty_or_all_na:
 
-notable_bug_fix2
+Ignoring dtypes in concat with empty or all-NA columns
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When using :func:`concat` to concatenate two or more :class:`DataFrame` objects,
+if one of the DataFrames was empty or had all-NA values, its dtype was *sometimes*
+ignored when finding the concatenated dtype.  Such dtypes are now consistently *not* ignored (:issue:`43507`).
+
+.. ipython:: python
+
+    df1 = pd.DataFrame({"bar": [pd.Timestamp("2013-01-01")]}, index=range(1))
+    df2 = pd.DataFrame({"bar": np.nan}, index=range(1, 2))
+    res = df1.append(df2)
+
+Previously, the float dtype of ``df2`` would be ignored, so the result dtype would be ``datetime64[ns]``. As a result, the ``np.nan`` would be cast to ``NaT``.
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [4]: res
+    Out[4]:
+             bar
+    0 2013-01-01
+    1        NaT
+
+Now the float dtype is respected. Since the common dtype for these DataFrames is ``object``, the ``np.nan`` is retained.
+
+*New behavior*:
+
+.. ipython:: python
+
+    res
+
+.. _whatsnew_140.notable_bug_fixes.notable_bug_fix3:
+
+notable_bug_fix3
 ^^^^^^^^^^^^^^^^
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1921,6 +1921,7 @@ def _setitem_with_indexer_missing(self, indexer, value):
                 # no columns and scalar
                 raise ValueError("cannot set a frame with no defined columns")
 
+            has_dtype = hasattr(value, "dtype")
             if isinstance(value, ABCSeries):
                 # append a Series
                 value = value.reindex(index=self.obj.columns, copy=True)
@@ -1938,7 +1939,18 @@ def _setitem_with_indexer_missing(self, indexer, value):
 
                 value = Series(value, index=self.obj.columns, name=indexer)
 
-            self.obj._mgr = self.obj.append(value)._mgr
+            if not len(self.obj):
+                # We will ignore the existing dtypes instead of using
+                #  internals.concat logic
+                df = value.to_frame().T
+                df.index = [indexer]
+                if not has_dtype:
+                    # i.e. if we already had a Series or ndarray, keep that
+                    #  dtype.  But if we had a list or dict, then do inference
+                    df = df.infer_objects()
+                self.obj._mgr = df._mgr
+            else:
+                self.obj._mgr = self.obj.append(value)._mgr
             self.obj._maybe_update_cacher(clear=True)
 
     def _ensure_iterable_column_indexer(self, column_indexer):
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
@@ -32,26 +32,20 @@
     is_1d_only_ea_obj,
     is_datetime64tz_dtype,
     is_dtype_equal,
-    is_scalar,
     needs_i8_conversion,
 )
 from pandas.core.dtypes.concat import (
     cast_to_common_type,
     concat_compat,
 )
 from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.missing import (
-    is_valid_na_for_dtype,
-    isna,
-    isna_all,
-)
+from pandas.core.dtypes.missing import is_valid_na_for_dtype
 
 import pandas.core.algorithms as algos
 from pandas.core.arrays import (
     DatetimeArray,
     ExtensionArray,
 )
-from pandas.core.arrays.sparse import SparseDtype
 from pandas.core.construction import ensure_wrapped_if_datetimelike
 from pandas.core.internals.array_manager import (
     ArrayManager,
@@ -422,29 +416,7 @@ def is_na(self) -> bool:
         blk = self.block
         if blk.dtype.kind == "V":
             return True
-
-        if not blk._can_hold_na:
-            return False
-
-        values = blk.values
-        if values.size == 0:
-            return True
-        if isinstance(values.dtype, SparseDtype):
-            return False
-
-        if values.ndim == 1:
-            # TODO(EA2D): no need for special case with 2D EAs
-            val = values[0]
-            if not is_scalar(val) or not isna(val):
-                # ideally isna_all would do this short-circuiting
-                return False
-            return isna_all(values)
-        else:
-            val = values[0][0]
-            if not is_scalar(val) or not isna(val):
-                # ideally isna_all would do this short-circuiting
-                return False
-            return all(isna_all(row) for row in values)
+        return False
 
     def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
         values: ArrayLike
@@ -590,9 +562,6 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
         # different from missing.na_value_for_dtype
         return None
     elif dtype.kind in ["i", "u"]:
-        if not has_none_blocks:
-            # different from missing.na_value_for_dtype
-            return None
         return np.nan
     elif dtype.kind == "O":
         return np.nan
diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py
@@ -140,7 +140,7 @@ def test_append_empty_dataframe(self):
         expected = df1.copy()
         tm.assert_frame_equal(result, expected)
 
-    def test_append_dtypes(self, using_array_manager):
+    def test_append_dtypes(self):
 
         # GH 5754
         # row appends of different dtypes (so need to do by-item)
@@ -164,10 +164,7 @@ def test_append_dtypes(self, using_array_manager):
         expected = DataFrame(
             {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
         )
-        if using_array_manager:
-            # TODO(ArrayManager) decide on exact casting rules in concat
-            # With ArrayManager, all-NaN float is not ignored
-            expected = expected.astype(object)
+        expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
         df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
@@ -176,9 +173,7 @@ def test_append_dtypes(self, using_array_manager):
         expected = DataFrame(
             {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
         )
-        if using_array_manager:
-            # With ArrayManager, all-NaN float is not ignored
-            expected = expected.astype(object)
+        expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
         df1 = DataFrame({"bar": np.nan}, index=range(1))
@@ -187,9 +182,7 @@ def test_append_dtypes(self, using_array_manager):
         expected = DataFrame(
             {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}
         )
-        if using_array_manager:
-            # With ArrayManager, all-NaN float is not ignored
-            expected = expected.astype(object)
+        expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
         df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py
@@ -168,7 +168,8 @@ def test_partial_setting_mixed_dtype(self):
         # columns will align
         df = DataFrame(columns=["A", "B"])
         df.loc[0] = Series(1, index=range(4))
-        tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0]))
+        expected = DataFrame(columns=["A", "B"], index=[0], dtype=np.float64)
+        tm.assert_frame_equal(df, expected)
 
         # columns will align
         # TODO: it isn't great that this behavior depends on consolidation
@@ -185,11 +186,10 @@ def test_partial_setting_mixed_dtype(self):
         with pytest.raises(ValueError, match=msg):
             df.loc[0] = [1, 2, 3]
 
-        # TODO: #15657, these are left as object and not coerced
         df = DataFrame(columns=["A", "B"])
         df.loc[3] = [6, 7]
 
-        exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype="object")
+        exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype=np.int64)
         tm.assert_frame_equal(df, exp)
 
     def test_series_partial_set(self):
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -695,7 +695,7 @@ def _constructor(self):
 
         assert isinstance(result, NotADataFrame)
 
-    def test_join_append_timedeltas(self, using_array_manager):
+    def test_join_append_timedeltas(self):
         # timedelta64 issues with join/merge
         # GH 5695
 
@@ -707,11 +707,9 @@ def test_join_append_timedeltas(self, using_array_manager):
             {
                 "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
                 "t": [timedelta(0, 22500), timedelta(0, 22500)],
-            }
+            },
+            dtype=object,
         )
-        if using_array_manager:
-            # TODO(ArrayManager) decide on exact casting rules in concat
-            expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
     def test_join_append_timedeltas2(self):