pandas-dev · mroeschke · Oct 22, 2023 · Sep 12, 2023 · Sep 12, 2023 · Sep 14, 2023
diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -1,7 +1,13 @@
+# pyright: reportUnusedImport=false
+# Disabled since there's no way to write an ignore that works for both pyright
+# and ruff, and ruff should be sufficient
+# (This is needed because the import of the numba extensions is unused
+# but is necessary to register the extensions)
 from __future__ import annotations
 
 import abc
 from collections import defaultdict
+import functools
 from functools import partial
 import inspect
 from typing import (
@@ -29,6 +35,7 @@
     NDFrameT,
     npt,
 )
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import SpecificationError
 from pandas.util._decorators import cache_readonly
 from pandas.util._exceptions import find_stack_level
@@ -121,6 +128,8 @@ def __init__(
         result_type: str | None,
         *,
         by_row: Literal[False, "compat", "_compat"] = "compat",
+        engine: str = "python",
+        engine_kwargs: dict[str, bool] | None = None,
         args,
         kwargs,
     ) -> None:
@@ -133,6 +142,9 @@ def __init__(
         self.args = args or ()
         self.kwargs = kwargs or {}
 
+        self.engine = engine
+        self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs
+
         if result_type not in [None, "reduce", "broadcast", "expand"]:
             raise ValueError(
                 "invalid value for result_type, must be one "
@@ -601,6 +613,13 @@ def apply_list_or_dict_like(self) -> DataFrame | Series:
         result: Series, DataFrame, or None
             Result when self.func is a list-like or dict-like, None otherwise.
         """
+
+        if self.engine == "numba":
+            raise NotImplementedError(
+                "The 'numba' engine doesn't support list-like/"
+                "dict likes of callables yet."
+            )
+
         if self.axis == 1 and isinstance(self.obj, ABCDataFrame):
             return self.obj.T.apply(self.func, 0, args=self.args, **self.kwargs).T
 
@@ -768,10 +787,16 @@ def __init__(
     ) -> None:
         if by_row is not False and by_row != "compat":
             raise ValueError(f"by_row={by_row} not allowed")
-        self.engine = engine
-        self.engine_kwargs = engine_kwargs
         super().__init__(
-            obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs
+            obj,
+            func,
+            raw,
+            result_type,
+            by_row=by_row,
+            engine=engine,
+            engine_kwargs=engine_kwargs,
+            args=args,
+            kwargs=kwargs,
         )
 
     # ---------------------------------------------------------------
@@ -792,6 +817,18 @@ def result_columns(self) -> Index:
     def series_generator(self) -> Generator[Series, None, None]:
         pass
 
+    @staticmethod
+    @functools.cache
+    @abc.abstractmethod
+    def generate_numba_apply_func(
+        func, nogil=True, nopython=True, parallel=False
+    ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]:
+        pass
+
+    @abc.abstractmethod
+    def apply_with_numba(self):
+        pass
+
     @abc.abstractmethod
     def wrap_results_for_axis(
         self, results: ResType, res_index: Index
@@ -815,13 +852,12 @@ def values(self):
     def apply(self) -> DataFrame | Series:
         """compute the results"""
 
-        if self.engine == "numba" and not self.raw:
-            raise ValueError(
-                "The numba engine in DataFrame.apply can only be used when raw=True"
-            )
-
         # dispatch to handle list-like or dict-like
         if is_list_like(self.func):
+            if self.engine == "numba":
+                raise NotImplementedError(
+                    "the 'numba' engine doesn't support lists of callables yet"
+                )
             return self.apply_list_or_dict_like()
 
         # all empty
@@ -830,17 +866,31 @@ def apply(self) -> DataFrame | Series:
 
         # string dispatch
         if isinstance(self.func, str):
+            if self.engine == "numba":
+                raise NotImplementedError(
+                    "the 'numba' engine doesn't support using "
+                    "a string as the callable function"
+                )
             return self.apply_str()
 
         # ufunc
         elif isinstance(self.func, np.ufunc):
+            if self.engine == "numba":
+                raise NotImplementedError(
+                    "the 'numba' engine doesn't support "
+                    "using a numpy ufunc as the callable function"
+                )
             with np.errstate(all="ignore"):
                 results = self.obj._mgr.apply("apply", func=self.func)
             # _constructor will retain self.index and self.columns
             return self.obj._constructor_from_mgr(results, axes=results.axes)
 
         # broadcasting
         if self.result_type == "broadcast":
+            if self.engine == "numba":
+                raise NotImplementedError(
+                    "the 'numba' engine doesn't support result_type='broadcast'"
+                )
             return self.apply_broadcast(self.obj)
 
         # one axis empty
@@ -997,7 +1047,10 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame:
         return result
 
     def apply_standard(self):
-        results, res_index = self.apply_series_generator()
+        if self.engine == "python":
+            results, res_index = self.apply_series_generator()
+        else:
+            results, res_index = self.apply_series_numba()
 
         # wrap results
         return self.wrap_results(results, res_index)
@@ -1021,6 +1074,18 @@ def apply_series_generator(self) -> tuple[ResType, Index]:
 
         return results, res_index
 
+    def apply_series_numba(self):
+        if self.engine_kwargs.get("parallel", False):
+            raise NotImplementedError(
+                "Parallel apply is not supported when raw=False and engine='numba'"
+            )
+        if not self.obj.index.is_unique or not self.columns.is_unique:
+            raise NotImplementedError(
+                "The index/columns must be unique when raw=False and engine='numba'"
+            )
+        results = self.apply_with_numba()
+        return results, self.result_index
+
     def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series:
         from pandas import Series
 
@@ -1060,6 +1125,76 @@ class FrameRowApply(FrameApply):
     def series_generator(self) -> Generator[Series, None, None]:
         return (self.obj._ixs(i, axis=1) for i in range(len(self.columns)))
 
+    @staticmethod
+    @functools.cache
+    def generate_numba_apply_func(
+        func, nogil=True, nopython=True, parallel=False
+    ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]:
+        from pandas import Series
+
+        # Dummy import (SeriesType) just to make sure the extensions get loaded.
+        # This isn't an entrypoint since we don't want users
+        # using Series/DF in numba code outside of apply
+        from pandas.core._numba.extensions import SeriesType  # noqa: F401
+        from pandas.core._numba.extensions import maybe_cast_str
+
+        numba = import_optional_dependency("numba")
+
+        jitted_udf = numba.extending.register_jitable(func)
+
+        @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
+        def numba_func(values, col_names, df_index):
+            results = {}
+            for j in range(values.shape[1]):
+                # Create the series
+                ser = Series(
+                    values[:, j], index=df_index, name=maybe_cast_str(col_names[j])
+                )
+                results[j] = jitted_udf(ser)
+            return results
+
+        return numba_func
+
+    def apply_with_numba(self) -> dict[int, Any]:
+        nb_func = self.generate_numba_apply_func(
+            cast(Callable, self.func), **self.engine_kwargs
+        )
+        # Since numpy/numba doesn't support object arrays of strings well,
+        # we'll do a sketchy thing where if index._data is object
+        # we convert to string and directly set index._data to that,
+        # setting it back after we call the function
+        fixed_obj_colnames = False
+        orig_cols = self.columns.to_numpy()
+        if self.columns._data.dtype == object:
+            if not lib.is_string_array(orig_cols):
+                raise ValueError(
+                    "The numba engine only supports "
+                    "using string or numeric column names"
+                )
+            # Remember to set this back!!!
+            self.columns._data = orig_cols.astype("U")
+            fixed_obj_colnames = True
+
+        fixed_obj_index = False
+        orig_index = self.index.to_numpy()
+        if self.obj.index._data.dtype == object:
+            if not lib.is_string_array(orig_index):
+                raise ValueError(
+                    "The numba engine only supports "
+                    "using string or numeric index values"
+                )
+            # Remember to set this back!!!
+            self.obj.index._data = orig_index.astype("U")
+            fixed_obj_index = True
+        df_index = self.obj.index
+
+        res = dict(nb_func(self.values, self.columns, df_index))
+        if fixed_obj_colnames:
+            self.columns._data = orig_cols
+        if fixed_obj_index:
+            self.obj.index._data = orig_index
+        return res
+
     @property
     def result_index(self) -> Index:
         return self.columns
@@ -1143,6 +1278,84 @@ def series_generator(self) -> Generator[Series, None, None]:
                 object.__setattr__(ser, "_name", name)
                 yield ser
 
+    @staticmethod
+    @functools.cache
+    def generate_numba_apply_func(
+        func, nogil=True, nopython=True, parallel=False
+    ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]:
+        # Dummy import (SeriesType) just to make sure the extensions get loaded.
+        # This isn't an entrypoint since we don't want users
+        # using Series/DF in numba code outside of apply
+        from pandas import Series
+        from pandas.core._numba.extensions import SeriesType  # noqa: F401
+        from pandas.core._numba.extensions import maybe_cast_str
+
+        numba = import_optional_dependency("numba")
+
+        jitted_udf = numba.extending.register_jitable(func)
+
+        @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
+        def numba_func(values, col_names_index, index):
+            results = {}
+            for i in range(values.shape[0]):
+                # Create the series
+                # TODO: values corrupted without the copy
+                ser = Series(
+                    values[i].copy(),
+                    index=col_names_index,
+                    name=maybe_cast_str(index[i]),
+                )
+                results[i] = jitted_udf(ser)
+
+            return results
+
+        return numba_func
+
+    def apply_with_numba(self) -> dict[int, Any]:
+        nb_func = self.generate_numba_apply_func(
+            cast(Callable, self.func), **self.engine_kwargs
+        )
+
+        # Since numpy/numba doesn't support object arrays of strings well,
+        # we'll do a sketchy thing where if index._data is object
+        # we convert to string and directly set index._data to that,
+        # setting it back after we call the function
+        fixed_obj_colnames = False
+        orig_cols = self.columns.to_numpy()
+        if self.columns._data.dtype == object:
+            if not lib.is_string_array(orig_cols):
+                raise ValueError(
+                    "The numba engine only supports "
+                    "using string or numeric column names"
+                )
+            # Remember to set this back!!!
+            self.columns._data = orig_cols.astype("U")
+            fixed_obj_colnames = True
+
+        fixed_obj_index = False
+        orig_index = self.index.to_numpy()
+        if self.obj.index._data.dtype == object:
+            if not lib.is_string_array(orig_index):
+                raise ValueError(
+                    "The numba engine only supports "
+                    "using string or numeric index values"
+                )
+            # Remember to set this back!!!
+            self.obj.index._data = orig_index.astype("U")
+            fixed_obj_index = True
+
+        # Convert from numba dict to regular dict
+        # Our isinstance checks in the df constructor don't pass for numba's typed dict
+        res = dict(nb_func(self.values, self.columns, self.obj.index))
+
+        if fixed_obj_colnames:
+            self.columns._data = orig_cols
+
+        if fixed_obj_index:
+            self.obj.index._data = orig_index
+
+        return res
+
     @property
     def result_index(self) -> Index:
         return self.index

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -10051,6 +10051,9 @@ def apply(
             - nogil (release the GIL inside the JIT compiled function)
             - parallel (try to apply the function in parallel over the DataFrame)
 
+              Note: Due to limitations within numba/how pandas interfaces with numba,
+              you should only use this if raw=True
+
             Note: The numba compiler only supports a subset of
             valid Python/numpy operations.
 
@@ -10060,8 +10063,6 @@ def apply(
             <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
             in numba to learn what you can or cannot use in the passed function.
 
-            As of right now, the numba engine can only be used with raw=True.
-
             .. versionadded:: 2.2.0
 
         engine_kwargs : dict

diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py
@@ -16,3 +16,15 @@ def int_frame_const_col():
         columns=["A", "B", "C"],
     )
     return df
+
+
+@pytest.fixture(params=["python", "numba"])
+def engine(request):
+    if request.param == "numba":
+        pytest.importorskip("numba")
+    return request.param
+
+
+@pytest.fixture(params=[0, 1])
+def apply_axis(request):
+    return request.param