diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 3eb68bdd2a15c..9b7f752b19794 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -3,9 +3,24 @@ inputs: environment-file: description: Conda environment file to use. default: environment.yml + os: + description: The operating system to assume when creating Conda. + default: not specified runs: using: composite steps: + # Remove bodo from the Windows environment for now until it supports Windows. + - name: Remove bodo on Windows + if: ${{ inputs.os == 'windows-latest' }} + run: | + + sed '/bodo/d' "$ENVIRONMENT_FILE" > tmp.txt + cat tmp.txt > "$ENVIRONMENT_FILE" + rm tmp.txt + env: + ENVIRONMENT_FILE: ${{ inputs.environment-file }} + shell: bash -el {0} + - name: Install ${{ inputs.environment-file }} uses: mamba-org/setup-micromamba@v1 with: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 08c41a1eeb21f..8778559500e40 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -212,6 +212,7 @@ jobs: uses: ./.github/actions/setup-conda with: environment-file: ci/deps/${{ matrix.env_file }} + os: ${{ matrix.os }} - name: Build Pandas uses: ./.github/actions/build_pandas diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 74cab4e0970dc..5eb3153bf810a 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -1,6 +1,7 @@ name: pandas-dev channels: - conda-forge + - bodo.ai dependencies: - python=3.10 @@ -35,6 +36,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2025.1 # [not win] - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 092ca18d61259..d4d7d9979310c 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -2,6 +2,7 @@ name: pandas-dev 
channels: - conda-forge + - bodo.ai dependencies: - python=3.11 @@ -36,6 +37,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2025.1 # [not win] - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index b6f515dceaea9..4a5f4d0a5259b 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -1,6 +1,7 @@ name: pandas-dev channels: - conda-forge + - bodo.ai dependencies: - python=3.11 @@ -35,6 +36,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2025.1 # [not win] - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index bc66f8a5382c9..ed2d4aa698cab 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -1,6 +1,7 @@ name: pandas-dev-312 channels: - conda-forge + - bodo.ai dependencies: - python=3.12 @@ -35,6 +36,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2025.1 # [not win] - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 16292beec612b..4f5c8cf6b4032 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -11,8 +11,19 @@ COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.t PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=worksteal $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then - PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" + PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN and not bodo_udf_engine\"" +else + PYTEST_CMD="$PYTEST_CMD -m \"not bodo_udf_engine\"" fi echo "$PYTEST_CMD" sh -c "$PYTEST_CMD" + +# Bodo tests need to be run in a separate session to prevent extensions installed conflicting with numba. +if [[ "$PYTEST_WORKERS" == "0" ]]; then + # Run without setting PYTHONDEVMODE since it can cause segmentation faults during compilation. 
+ PYTEST_CMD_BODO_UDF_ENGINE="MESONPY_EDITABLE_VERBOSE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=worksteal $TEST_ARGS $COVERAGE $PYTEST_TARGET -m \"bodo_udf_engine\"" + echo "Running Bodo Tests..." + echo $PYTEST_CMD_BODO_UDF_ENGINE + sh -c "$PYTEST_CMD_BODO_UDF_ENGINE" +fi diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index bda959f380e8a..aff9f7b1b84f2 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -186,6 +186,7 @@ Dependency Minimum Version pip ext `numexpr `__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups `bottleneck `__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup. `numba `__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler. +`bodo `__ 2025.1 performance Alternative execution engine for operations that accept ``engine="bodo"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler and automatically parallelizes using MPI. 
===================================================== ================== ================== =================================================================================================================================================================================== Visualization diff --git a/doc/source/reference/testing.rst b/doc/source/reference/testing.rst index 1f164d1aa98b4..5057275a565e7 100644 --- a/doc/source/reference/testing.rst +++ b/doc/source/reference/testing.rst @@ -35,6 +35,7 @@ Exceptions and warnings errors.DtypeWarning errors.DuplicateLabelError errors.EmptyDataError + errors.ExecutionError errors.IncompatibilityWarning errors.IndexingError errors.InvalidColumnName diff --git a/environment.yml b/environment.yml index 69647a436e3ad..d6eea7fdb5db7 100644 --- a/environment.yml +++ b/environment.yml @@ -2,6 +2,7 @@ name: pandas-dev channels: - conda-forge + - bodo.ai dependencies: - python=3.10 - pip @@ -40,6 +41,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2025.1 - numexpr>=2.8.4 - openpyxl>=3.1.0 - odfpy>=1.4.1 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6b90389a62056..ea46cdfaa578b 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -57,6 +57,7 @@ "tzdata": "2022.7", "qtpy": "2.3.0", "pyqt5": "5.15.9", + "bodo": "2025.1", } # A mapping from import name to package name (on PyPI) for packages where diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f36fc82fb1a11..1212571b615e8 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -27,7 +27,10 @@ npt, ) from pandas.compat._optional import import_optional_dependency -from pandas.errors import SpecificationError +from pandas.errors import ( + ExecutionError, + SpecificationError, +) from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import is_nested_object @@ -598,9 +601,9 @@ def apply_list_or_dict_like(self) -> DataFrame | Series: Result when self.func is a 
list-like or dict-like, None otherwise. """ - if self.engine == "numba": + if self.engine in ("numba", "bodo"): raise NotImplementedError( - "The 'numba' engine doesn't support list-like/" + f"The '{self.engine}' engine doesn't support list-like/" "dict likes of callables yet." ) @@ -853,9 +856,9 @@ def apply(self) -> DataFrame | Series: # dispatch to handle list-like or dict-like if is_list_like(self.func): - if self.engine == "numba": + if self.engine in ("numba", "bodo"): raise NotImplementedError( - "the 'numba' engine doesn't support lists of callables yet" + f"the '{self.engine}' engine doesn't support lists of callables yet" ) return self.apply_list_or_dict_like() @@ -870,13 +873,16 @@ def apply(self) -> DataFrame | Series: "the 'numba' engine doesn't support using " "a string as the callable function" ) + elif self.engine == "bodo": + return self.apply_series_bodo() + return self.apply_str() # ufunc elif isinstance(self.func, np.ufunc): - if self.engine == "numba": + if self.engine in ("numba", "bodo"): raise NotImplementedError( - "the 'numba' engine doesn't support " + f"the '{self.engine}' engine doesn't support " "using a numpy ufunc as the callable function" ) with np.errstate(all="ignore"): @@ -886,9 +892,10 @@ def apply(self) -> DataFrame | Series: # broadcasting if self.result_type == "broadcast": - if self.engine == "numba": + if self.engine in ("numba", "bodo"): raise NotImplementedError( - "the 'numba' engine doesn't support result_type='broadcast'" + f"the '{self.engine}' engine doesn't support " + "result_type='broadcast'" ) return self.apply_broadcast(self.obj) @@ -1007,6 +1014,8 @@ def wrapper(*args, **kwargs): result = nb_looper(self.values, self.axis, *args) # If we made the result 2-D, squeeze it back to 1-D result = np.squeeze(result) + elif self.engine == "bodo": + raise NotImplementedError("the 'bodo' engine does not support raw=True.") else: result = np.apply_along_axis( wrap_function(self.func), @@ -1051,10 +1060,17 @@ def 
apply_broadcast(self, target: DataFrame) -> DataFrame: return result def apply_standard(self): - if self.engine == "python": + if self.engine == "numba": + results, res_index = self.apply_series_numba() + elif self.engine == "bodo": + return self.apply_series_bodo() + elif self.engine == "python": results, res_index = self.apply_series_generator() else: - results, res_index = self.apply_series_numba() + raise ValueError( + "invalid value for engine, must be one " + "of {'python', 'numba', 'bodo'}" + ) # wrap results return self.wrap_results(results, res_index) @@ -1089,6 +1105,36 @@ def apply_series_numba(self): results = self.apply_with_numba() return results, self.result_index + def apply_series_bodo(self) -> DataFrame | Series: + if self.result_type is not None: + raise NotImplementedError( + "the 'bodo' engine does not support result_type yet." + ) + + if self.axis != 1 and not isinstance(self.func, str): + raise NotImplementedError( + "the 'bodo' engine only supports axis=1 for user-defined functions." + ) + + if self.args or self.kwargs: + raise NotImplementedError( + "the 'bodo' engine does not support passing additional args/kwargs " + "to apply function yet." 
+ ) + + bodo = import_optional_dependency("bodo") + + @bodo.jit(**self.engine_kwargs) + def do_apply(obj, func, axis): + return obj.apply(func, axis) + + try: + result = do_apply(self.obj, self.func, self.axis) + except bodo.utils.typing.BodoError as e: + raise ExecutionError("Execution with engine='bodo' failed.") from e + + return result + def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: from pandas import Series diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4cf595d167c46..36980e7585012 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10254,7 +10254,7 @@ def apply( result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", - engine: Literal["python", "numba"] = "python", + engine: Literal["python", "numba", "bodo"] = "python", engine_kwargs: dict[str, bool] | None = None, **kwargs, ): @@ -10316,7 +10316,7 @@ def apply( .. versionadded:: 2.1.0 - engine : {'python', 'numba'}, default 'python' + engine : {'python', 'numba', 'bodo'}, default 'python' Choose between the python (default) engine or the numba engine in apply. The numba engine will attempt to JIT compile the passed function, @@ -10339,6 +10339,19 @@ def apply( `_ in numba to learn what you can or cannot use in the passed function. + The bodo engine will attempt to JIT compile the passed function, spawn + multiple workers and apply the function in parallel over the DataFrame, + which may result in a speedup for large DataFrames. + + Bodo supports a subset of valid Python, numpy, pandas and scikit-learn. + Please refer to the `bodo documentation + `_ to learn more about which + operations and APIs are supported inside JIT compiled functions. + + Code that does not have JIT support yet can still utilize Bodo's parallel + constructs by decorating the function with `@wrap_python + `_. + .. 
versionadded:: 2.2.0 engine_kwargs : dict diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 2b5bc450e41d6..ab40ff559fbcc 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -417,6 +417,21 @@ class NumbaUtilError(Exception): """ +class ExecutionError(Exception): + """ + Error raised from internal errors originating in engines. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": ["1", "2", "3"]}) + >>> df.apply(lambda x: x.A + x.B, engine="bodo", axis=1) + Traceback (most recent call last): + ... + pandas.errors.ExecutionError: Execution with engine='bodo' failed. + + """ + + class DuplicateLabelError(ValueError): """ Error raised when an operation would introduce duplicate labels. @@ -916,6 +931,7 @@ class InvalidComparison(Exception): "DtypeWarning", "DuplicateLabelError", "EmptyDataError", + "ExecutionError", "IncompatibilityWarning", "IndexingError", "IntCastingNaNError", diff --git a/pandas/tests/apply/test_bodo.py b/pandas/tests/apply/test_bodo.py new file mode 100644 index 0000000000000..6044dfe3b1ecf --- /dev/null +++ b/pandas/tests/apply/test_bodo.py @@ -0,0 +1,145 @@ +import numpy as np +import pytest + +from pandas.errors import ExecutionError + +import pandas as pd +import pandas._testing as tm + +pytestmark = [pytest.mark.single_cpu, pytest.mark.bodo_udf_engine] + + +@pytest.fixture(params=["bodo"]) +def engine(request): + """Test bodo engine by itself to avoid extensions conflicting with numba. + + Note: Using a fixture here to avoid importing at the start of the session. 
+ """ + if request.param == "bodo": + pytest.importorskip("bodo") + return request.param + + +def test_bodo_vs_python_indexing(engine): + frame = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, + ) + + def f(a): + return a["c"] + + result = frame.apply(f, engine="bodo", axis=1) + expected = frame.apply(f, engine="python", axis=1) + + tm.assert_series_equal(result, expected, check_series_type=False) + + +@pytest.mark.parametrize( + "reduction", + [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()], +) +def test_bodo_vs_python_reductions(reduction, engine): + df = pd.DataFrame(np.ones((4, 4), dtype=np.float64)) + result = df.apply(reduction, engine="bodo", axis=1) + expected = df.apply(reduction, engine="python", axis=1) + tm.assert_series_equal(result, expected, check_series_type=False) + + +def test_bodo_vs_python_df_output(engine): + df = pd.DataFrame({"A": np.arange(20), "B": ["hi", "there"] * 10}) + + def f(a): + return pd.Series([a["B"], a["A"]]) + + result = df.apply(f, engine="bodo", axis=1) + expected = df.apply(f, engine="python", axis=1) + + tm.assert_frame_equal(result, expected, check_frame_type=False, check_dtype=False) + + +def test_bodo_vs_python_args(engine): + msg = ( + "the 'bodo' engine does not support passing additional args/kwargs " + "to apply function yet." 
+ ) + + def f(x, y): + return x.A + y + + df = pd.DataFrame({"A": np.arange(20)}) + + with pytest.raises(NotImplementedError, match=msg): + df.apply(f, engine="bodo", axis=1, args=(2,)) + + with pytest.raises(NotImplementedError, match=msg): + df.apply(f, engine="bodo", axis=1, y=2) + + +@pytest.mark.parametrize("axis", [0, 1]) +def test_bodo_vs_python_str_apply(axis, engine): + df = pd.DataFrame({"A": np.arange(20)}) + + func = "mean" + axis = 1 + result = df.apply(func, axis, engine="bodo") + expected = df.apply(func, axis) + + tm.assert_series_equal(result, expected, check_series_type=False) + + +def test_bodo_unsupported_axis(engine): + """Tests that NotImplementedError is raised when applying a UDF column-wise""" + frame = pd.DataFrame( + {"a": [1, 2, 3]}, + ) + + def f(a): + return 1 + + with pytest.raises( + NotImplementedError, + match=r"the 'bodo' engine only supports axis=1 for user-defined functions", + ): + frame.apply(f, engine="bodo", axis=0) + + +def test_bodo_raw_unsupported(engine): + """Tests that error gets raised when using raw=True""" + frame = pd.DataFrame( + {"a": [1, 2, 3]}, + ) + + def f(a): + return 1 + + with pytest.raises( + NotImplementedError, match="the 'bodo' engine does not support raw=True." + ): + frame.apply(f, engine="bodo", raw=True, axis=1) + + +def test_bodo_result_type_unsupported(engine): + """Tests that error gets raised when passing any value to result_type""" + frame = pd.DataFrame( + {"a": [1, 2, 3]}, + ) + + def f(a): + return 1 + + with pytest.raises( + NotImplementedError, match="the 'bodo' engine does not support result_type yet." 
+ ): + frame.apply(f, engine="bodo", axis=1, result_type="reduce") + + +def test_bodo_engine_execution_error(engine): + frame = pd.DataFrame( + {"a": [1, 2, 3], "b": ["1", "2", "3"]}, + ) + + def f(x): + return x.a + x.b + + with pytest.raises(ExecutionError, match="Execution with engine='bodo' failed."): + frame.apply(f, engine="bodo", axis=1) diff --git a/pandas/tests/util/test_bodo.py b/pandas/tests/util/test_bodo.py new file mode 100644 index 0000000000000..613192e5e0424 --- /dev/null +++ b/pandas/tests/util/test_bodo.py @@ -0,0 +1,22 @@ +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame + +pytestmark = pytest.mark.bodo_udf_engine + + +def test_bodo_not_installed_df_apply(): + "Test that importing bodo when not installed results in ImportError." + bodo_installed = bool(td.import_optional_dependency("bodo", errors="ignore")) + if bodo_installed: + pytest.skip("bodo is installed.") + + df = DataFrame({"A": [1, 2, 3, 4, 5]}) + + def f(x): + return 1 + + with pytest.raises(ImportError, match="Missing optional"): + df.apply(f, engine="bodo", axis=1) diff --git a/pyproject.toml b/pyproject.toml index b7d53b0d8934a..732c23166eefe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] pyarrow = ['pyarrow>=10.0.1'] -performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] +performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4', 'bodo>=2025.1'] computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] aws = ['s3fs>=2022.11.0'] @@ -97,6 +97,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'lxml>=4.9.2', 'matplotlib>=3.6.3', 'numba>=0.56.4', + 'bodo>=2025.1', 'numexpr>=2.8.4', 'odfpy>=1.4.1', 'openpyxl>=3.1.0', @@ -484,6 +485,7 @@ markers = [ # these tests only fail in the wheel builder and don't fail in regular # ARM CI 
"fails_arm_wheels: Tests that fail in the ARM wheel build only", + "bodo_udf_engine: Tests for bodo engine to accelerate applications of User Defined Functions (UDFs)", ] [tool.mypy] diff --git a/requirements-dev.txt b/requirements-dev.txt index fb4d9cdb589ca..e9f53f4886f9c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -29,6 +29,7 @@ jinja2>=3.1.2 lxml>=4.9.2 matplotlib>=3.6.3 numba>=0.56.4 +bodo>=2025.1 numexpr>=2.8.4 openpyxl>=3.1.0 odfpy>=1.4.1