Skip to content

Commit 9a6c8f0

Browse files
authored
PERF: Avoid Series constructor in DataFrame(dict(...), columns=) (#57205)
* Avoid Series constructor inference in dict_to_mgr
* test_constructors passes
* Use construct_1d_arraylike_from_scalar
* PERF: Avoid Series constructor in DataFrame(dict(...), columns=)
* Fix whitespace and comment
* typing
* Just ignore
* add bug fix and test
* don't overwrite dtype
1 parent 1d70500 commit 9a6c8f0

File tree

3 files changed

+41
-29
lines changed

3 files changed

+41
-29
lines changed

Diff for: doc/source/whatsnew/v3.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ Removal of prior version deprecations/changes
167167

168168
Performance improvements
169169
~~~~~~~~~~~~~~~~~~~~~~~~
170+
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
170171
- Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
171172
- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
172173
- Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)
@@ -290,6 +291,7 @@ Styler
290291

291292
Other
292293
^^^^^
294+
- Bug in :class:`DataFrame` where passing a ``dict`` containing a NA scalar together with ``columns`` would always fill the column with ``np.nan`` instead of the NA scalar that was passed (:issue:`57205`)
293295
- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
294296
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
295297
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)

Diff for: pandas/core/internals/construction.py

+34-29
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,14 @@
3131
is_list_like,
3232
is_named_tuple,
3333
is_object_dtype,
34+
is_scalar,
3435
)
3536
from pandas.core.dtypes.dtypes import ExtensionDtype
3637
from pandas.core.dtypes.generic import (
3738
ABCDataFrame,
3839
ABCSeries,
3940
)
41+
from pandas.core.dtypes.missing import isna
4042

4143
from pandas.core import (
4244
algorithms,
@@ -354,45 +356,48 @@ def dict_to_mgr(
354356
355357
Used in DataFrame.__init__
356358
"""
357-
arrays: Sequence[Any] | Series
359+
arrays: Sequence[Any]
358360

359361
if columns is not None:
360-
from pandas.core.series import Series
362+
columns = ensure_index(columns)
363+
arrays = [np.nan] * len(columns)
364+
midxs = set()
365+
data_keys = ensure_index(data.keys()) # type: ignore[arg-type]
366+
data_values = list(data.values())
367+
368+
for i, column in enumerate(columns):
369+
try:
370+
idx = data_keys.get_loc(column)
371+
except KeyError:
372+
midxs.add(i)
373+
continue
374+
array = data_values[idx]
375+
arrays[i] = array
376+
if is_scalar(array) and isna(array):
377+
midxs.add(i)
361378

362-
arrays = Series(data, index=columns, dtype=object)
363-
missing = arrays.isna()
364379
if index is None:
365380
# GH10856
366381
# raise ValueError if only scalars in dict
367-
index = _extract_index(arrays[~missing])
382+
if midxs:
383+
index = _extract_index(
384+
[array for i, array in enumerate(arrays) if i not in midxs]
385+
)
386+
else:
387+
index = _extract_index(arrays)
368388
else:
369389
index = ensure_index(index)
370390

371391
# no obvious "empty" int column
372-
if missing.any() and not is_integer_dtype(dtype):
373-
nan_dtype: DtypeObj
374-
375-
if dtype is not None:
376-
# calling sanitize_array ensures we don't mix-and-match
377-
# NA dtypes
378-
midxs = missing.values.nonzero()[0]
379-
for i in midxs:
380-
arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
381-
arrays.iat[i] = arr
382-
else:
383-
# GH#1783
384-
nan_dtype = np.dtype("object")
385-
val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
386-
nmissing = missing.sum()
387-
if copy:
388-
rhs = [val] * nmissing
389-
else:
390-
# GH#45369
391-
rhs = [val.copy() for _ in range(nmissing)]
392-
arrays.loc[missing] = rhs
393-
394-
arrays = list(arrays)
395-
columns = ensure_index(columns)
392+
if midxs and not is_integer_dtype(dtype):
393+
# GH#1783
394+
for i in midxs:
395+
arr = construct_1d_arraylike_from_scalar(
396+
arrays[i],
397+
len(index),
398+
dtype if dtype is not None else np.dtype("object"),
399+
)
400+
arrays[i] = arr
396401

397402
else:
398403
keys = list(data.keys())

Diff for: pandas/tests/frame/test_constructors.py

+5
Original file line numberDiff line numberDiff line change
@@ -3042,6 +3042,11 @@ def test_columns_indexes_raise_on_sets(self):
30423042
with pytest.raises(ValueError, match="columns cannot be a set"):
30433043
DataFrame(data, columns={"a", "b", "c"})
30443044

3045+
def test_from_dict_with_columns_na_scalar(self):
3046+
result = DataFrame({"a": pd.NaT}, columns=["a"], index=range(2))
3047+
expected = DataFrame({"a": Series([pd.NaT, pd.NaT])})
3048+
tm.assert_frame_equal(result, expected)
3049+
30453050

30463051
def get1(obj): # TODO: make a helper in tm?
30473052
if isinstance(obj, Series):

0 commit comments

Comments
 (0)