diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index a0b24342091ec..f3b005b704014 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,3 +1,5 @@ +import string +import sys import warnings import numpy as np @@ -67,6 +69,47 @@ def time_existing_series(self): pd.Categorical(self.series) +class AsType: + def setup(self): + N = 10 ** 5 + + random_pick = np.random.default_rng().choice + + categories = { + "str": list(string.ascii_letters), + "int": np.random.randint(2 ** 16, size=154), + "float": sys.maxsize * np.random.random((38,)), + "timestamp": [ + pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578) + ], + } + + self.df = pd.DataFrame( + {col: random_pick(cats, N) for col, cats in categories.items()} + ) + + for col in ("int", "float", "timestamp"): + self.df[col + "_as_str"] = self.df[col].astype(str) + + for col in self.df.columns: + self.df[col] = self.df[col].astype("category") + + def astype_str(self): + [self.df[col].astype("str") for col in "int float timestamp".split()] + + def astype_int(self): + [self.df[col].astype("int") for col in "int_as_str timestamp".split()] + + def astype_float(self): + [ + self.df[col].astype("float") + for col in "float_as_str int int_as_str timestamp".split() + ] + + def astype_datetime(self): + self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific")) + + class Concat: def setup(self): N = 10 ** 5 diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e00b177f2a2fc..669599bb5845a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -499,6 +499,7 @@ Performance improvements - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`) - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) +- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`) - Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`) - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e57ef17272a9d..62e508c491740 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -403,20 +403,42 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: If copy is set to False and dtype is categorical, the original object is returned. """ - if is_categorical_dtype(dtype): + if self.dtype is dtype: + result = self.copy() if copy else self + + elif is_categorical_dtype(dtype): dtype = cast(Union[str, CategoricalDtype], dtype) # GH 10696/18593/18630 dtype = self.dtype.update_dtype(dtype) - result = self.copy() if copy else self - if dtype == self.dtype: - return result - return result._set_dtype(dtype) - if is_extension_array_dtype(dtype): - return array(self, dtype=dtype, copy=copy) - if is_integer_dtype(dtype) and self.isna().any(): + self = self.copy() if copy else self + result = self._set_dtype(dtype) + + # TODO: consolidate with ndarray case? + elif is_extension_array_dtype(dtype): + result = array(self, dtype=dtype, copy=copy) + + elif is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") - return np.array(self, dtype=dtype, copy=copy) + + elif len(self.codes) == 0 or len(self.categories) == 0: + result = np.array(self, dtype=dtype, copy=copy) + + else: + # GH8628 (PERF): astype category codes instead of astyping array + try: + astyped_cats = self.categories.astype(dtype=dtype, copy=copy) + except ( + TypeError, # downstream error msg for CategoricalIndex is misleading + ValueError, + ): + msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" + raise ValueError(msg) + + astyped_cats = extract_array(astyped_cats, extract_numpy=True) + result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes)) + + return result @cache_readonly def itemsize(self) -> int: diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index d52a8ae935688..0b0f985697da9 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -6,7 +6,7 @@ from pandas._libs import index as libindex from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick from pandas._libs.tslibs.parsing import DateParseError, parse_time_string -from pandas._typing import DtypeObj, Label +from pandas._typing import DtypeObj from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index deafa22a6e8eb..12654388de904 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -127,7 +127,7 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = "could not convert string to float" + msg = r"Cannot cast object dtype to " with pytest.raises(ValueError, match=msg): cat.astype(float) @@ -138,7 +138,7 @@ def test_astype(self, ordered): tm.assert_numpy_array_equal(result, expected) result = cat.astype(int) - expected = np.array(cat, dtype=int) + expected = np.array(cat, dtype="int64") tm.assert_numpy_array_equal(result, expected) result = cat.astype(float) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 46c41efc09fdf..865ae565b6501 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -60,7 +60,7 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"could not convert string to float|invalid literal for float\(\)" + msg = r"Cannot cast object dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") @@ -68,7 +68,7 @@ def test_astype_categorical_to_other(self): exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) - exp2 = Series([1, 2, 3, 4]).astype(int) + exp2 = Series([1, 2, 3, 4]).astype("int64") tm.assert_series_equal(s2.astype("int"), exp2) # object don't sort correctly, so just compare that we have the same