PERF: Avoid Series constructor in DataFrame(dict(...), columns=) (#57205)

mroeschke · web-flow · commit 9a6c8f0ad02f · 2024-02-23T16:13:52.000-08:00
* Avoid Series constructor inference in dict_to_mgr

* test_constructors passes

* Use construct_1d_arraylike_from_scalar

* PERF: Avoid Series constructor in DataFrame(dict(...), columns=)

* Fix whitespace and comment

* typing

* Just ignore

* add bug fix and test

* don't overwrite dtype
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -167,6 +167,7 @@ Removal of prior version deprecations/changes
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
 - Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
 - Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)
@@ -290,6 +291,7 @@ Styler
 
 Other
 ^^^^^
+- Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
 - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
 - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -31,12 +31,14 @@
     is_list_like,
     is_named_tuple,
     is_object_dtype,
+    is_scalar,
 )
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCSeries,
 )
+from pandas.core.dtypes.missing import isna
 
 from pandas.core import (
     algorithms,
@@ -354,45 +356,48 @@ def dict_to_mgr(
 
     Used in DataFrame.__init__
     """
-    arrays: Sequence[Any] | Series
+    arrays: Sequence[Any]
 
     if columns is not None:
-        from pandas.core.series import Series
+        columns = ensure_index(columns)
+        arrays = [np.nan] * len(columns)
+        midxs = set()
+        data_keys = ensure_index(data.keys())  # type: ignore[arg-type]
+        data_values = list(data.values())
+
+        for i, column in enumerate(columns):
+            try:
+                idx = data_keys.get_loc(column)
+            except KeyError:
+                midxs.add(i)
+                continue
+            array = data_values[idx]
+            arrays[i] = array
+            if is_scalar(array) and isna(array):
+                midxs.add(i)
 
-        arrays = Series(data, index=columns, dtype=object)
-        missing = arrays.isna()
         if index is None:
             # GH10856
             # raise ValueError if only scalars in dict
-            index = _extract_index(arrays[~missing])
+            if midxs:
+                index = _extract_index(
+                    [array for i, array in enumerate(arrays) if i not in midxs]
+                )
+            else:
+                index = _extract_index(arrays)
         else:
             index = ensure_index(index)
 
         # no obvious "empty" int column
-        if missing.any() and not is_integer_dtype(dtype):
-            nan_dtype: DtypeObj
-
-            if dtype is not None:
-                # calling sanitize_array ensures we don't mix-and-match
-                #  NA dtypes
-                midxs = missing.values.nonzero()[0]
-                for i in midxs:
-                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
-                    arrays.iat[i] = arr
-            else:
-                # GH#1783
-                nan_dtype = np.dtype("object")
-                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
-                nmissing = missing.sum()
-                if copy:
-                    rhs = [val] * nmissing
-                else:
-                    # GH#45369
-                    rhs = [val.copy() for _ in range(nmissing)]
-                arrays.loc[missing] = rhs
-
-        arrays = list(arrays)
-        columns = ensure_index(columns)
+        if midxs and not is_integer_dtype(dtype):
+            # GH#1783
+            for i in midxs:
+                arr = construct_1d_arraylike_from_scalar(
+                    arrays[i],
+                    len(index),
+                    dtype if dtype is not None else np.dtype("object"),
+                )
+                arrays[i] = arr
 
     else:
         keys = list(data.keys())
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -3042,6 +3042,11 @@ def test_columns_indexes_raise_on_sets(self):
         with pytest.raises(ValueError, match="columns cannot be a set"):
             DataFrame(data, columns={"a", "b", "c"})
 
+    def test_from_dict_with_columns_na_scalar(self):
+        result = DataFrame({"a": pd.NaT}, columns=["a"], index=range(2))
+        expected = DataFrame({"a": Series([pd.NaT, pd.NaT])})
+        tm.assert_frame_equal(result, expected)
+
 
 def get1(obj):  # TODO: make a helper in tm?
     if isinstance(obj, Series):