From 1e62d38cb7c05ede754c8eb50400cad11fc013c5 Mon Sep 17 00:00:00 2001 From: = <=> Date: Sun, 29 Dec 2024 20:22:28 -0500 Subject: [PATCH 01/16] add basic support for engine=bodo, df.apply --- ci/deps/actions-310-minimum_versions.yaml | 1 + doc/source/getting_started/install.rst | 1 + environment.yml | 2 + pandas/compat/_optional.py | 1 + pandas/core/apply.py | 47 ++++++++-- pandas/core/frame.py | 6 +- pandas/tests/apply/test_bodo.py | 105 ++++++++++++++++++++++ pandas/tests/util/test_bodo.py | 18 ++++ pyproject.toml | 3 +- requirements-dev.txt | 1 + 10 files changed, 173 insertions(+), 12 deletions(-) create mode 100644 pandas/tests/apply/test_bodo.py create mode 100644 pandas/tests/util/test_bodo.py diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index c7c72828db481..11ee062df0e9e 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -37,6 +37,7 @@ dependencies: - lxml=4.9.2 - matplotlib=3.6.3 - numba=0.56.4 + - bodo=2024.11 - numexpr=2.8.4 - odfpy=1.4.1 - qtpy=2.3.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index bda959f380e8a..2bfe877bf9b81 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -186,6 +186,7 @@ Dependency Minimum Version pip ext `numexpr `__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups `bottleneck `__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup. `numba `__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler. 
+`bodo `__ 2024.11 performance Alternative execution engine for operations that accept ``engine="bodo"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler and automatically parallelizes uing MPI. ===================================================== ================== ================== =================================================================================================================================================================================== Visualization diff --git a/environment.yml b/environment.yml index 69647a436e3ad..157a4e5397061 100644 --- a/environment.yml +++ b/environment.yml @@ -2,6 +2,7 @@ name: pandas-dev channels: - conda-forge + - bodo.ai dependencies: - python=3.10 - pip @@ -40,6 +41,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2024.11 - numexpr>=2.8.4 - openpyxl>=3.1.0 - odfpy>=1.4.1 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6b90389a62056..f4c9721e4f59e 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -57,6 +57,7 @@ "tzdata": "2022.7", "qtpy": "2.3.0", "pyqt5": "5.15.9", + "bodo": "2024.11", } # A mapping from import name to package name (on PyPI) for packages where diff --git a/pandas/core/apply.py b/pandas/core/apply.py index af513d49bcfe0..1d2ec6f561a26 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -598,9 +598,9 @@ def apply_list_or_dict_like(self) -> DataFrame | Series: Result when self.func is a list-like or dict-like, None otherwise. """ - if self.engine == "numba": + if self.engine in ("numba", "bodo"): raise NotImplementedError( - "The 'numba' engine doesn't support list-like/" + f"The '{self.engine}' engine doesn't support list-like/" "dict likes of callables yet." 
) @@ -853,9 +853,9 @@ def apply(self) -> DataFrame | Series: # dispatch to handle list-like or dict-like if is_list_like(self.func): - if self.engine == "numba": + if self.engine in ("numba", "bodo"): raise NotImplementedError( - "the 'numba' engine doesn't support lists of callables yet" + f"the '{self.engine}' engine doesn't support lists of callables yet" ) return self.apply_list_or_dict_like() @@ -870,13 +870,16 @@ def apply(self) -> DataFrame | Series: "the 'numba' engine doesn't support using " "a string as the callable function" ) + if self.engine == "bodo": + return self.apply_series_bodo() + return self.apply_str() # ufunc elif isinstance(self.func, np.ufunc): - if self.engine == "numba": + if self.engine in ("numba", "bodo"): raise NotImplementedError( - "the 'numba' engine doesn't support " + f"the '{self.engine}' engine doesn't support " "using a numpy ufunc as the callable function" ) with np.errstate(all="ignore"): @@ -886,9 +889,10 @@ def apply(self) -> DataFrame | Series: # broadcasting if self.result_type == "broadcast": - if self.engine == "numba": + if self.engine in ("numba", "bodo"): raise NotImplementedError( - "the 'numba' engine doesn't support result_type='broadcast'" + f"the '{self.engine}' engine doesn't support " + "result_type='broadcast'" ) return self.apply_broadcast(self.obj) @@ -1007,6 +1011,8 @@ def wrapper(*args, **kwargs): result = nb_looper(self.values, self.axis, *args) # If we made the result 2-D, squeeze it back to 1-D result = np.squeeze(result) + elif self.engine == "bodo": + raise NotImplementedError("the 'bodo' engine does not support raw=True.") else: result = np.apply_along_axis( wrap_function(self.func), @@ -1053,8 +1059,11 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame: def apply_standard(self): if self.engine == "python": results, res_index = self.apply_series_generator() - else: + elif self.engine == "numba": results, res_index = self.apply_series_numba() + else: + # bodo engine + return 
self.apply_series_bodo() # wrap results return self.wrap_results(results, res_index) @@ -1089,6 +1098,26 @@ def apply_series_numba(self): results = self.apply_with_numba() return results, self.result_index + def apply_series_bodo(self) -> DataFrame | Series: + bodo = import_optional_dependency("bodo") + + if self.result_type is not None: + raise NotImplementedError( + "the 'bodo' engine does not support result_type yet." + ) + + if self.axis != 1 and not isinstance(self.func, str): + raise NotImplementedError( + "the 'bodo' engine only supports axis=1 for user-defined functions." + ) + + @bodo.jit + def do_apply(obj, func, axis): + return obj.apply(func, axis) + + result = do_apply(self.obj, self.func, self.axis) + return result + def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: from pandas import Series diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 02878b36a379e..1ee445be714a5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10203,7 +10203,7 @@ def apply( result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", - engine: Literal["python", "numba"] = "python", + engine: Literal["python", "numba", "bodo"] = "python", engine_kwargs: dict[str, bool] | None = None, **kwargs, ): @@ -10265,7 +10265,7 @@ def apply( .. versionadded:: 2.1.0 - engine : {'python', 'numba'}, default 'python' + engine : {'python', 'numba', 'bodo'}, default 'python' Choose between the python (default) engine or the numba engine in apply. The numba engine will attempt to JIT compile the passed function, @@ -10288,6 +10288,8 @@ def apply( `_ in numba to learn what you can or cannot use in the passed function. + TODO: describe bodo + .. 
versionadded:: 2.2.0 engine_kwargs : dict diff --git a/pandas/tests/apply/test_bodo.py b/pandas/tests/apply/test_bodo.py new file mode 100644 index 0000000000000..f3bcefd5a5ec4 --- /dev/null +++ b/pandas/tests/apply/test_bodo.py @@ -0,0 +1,105 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + +pytestmark = [td.skip_if_no("bodo")] + + +def test_bodo_vs_python_indexing(): + frame = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, + ) + f = lambda x: x["c"] + result = frame.apply(f, engine="bodo", axis=1) + expected = frame.apply(f, engine="python", axis=1) + + tm.assert_series_equal(result, expected, check_series_type=False) + + +@pytest.mark.parametrize( + "reduction", + [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()], +) +def test_bodo_vs_python_reductions(reduction): + df = pd.DataFrame(np.ones((4, 4), dtype=np.float64)) + result = df.apply(reduction, engine="bodo", axis=1) + expected = df.apply(reduction, engine="python", axis=1) + tm.assert_series_equal(result, expected, check_series_type=False) + + +def test_bodo_vs_python_df_output(): + df = pd.DataFrame({"A": np.arange(20), "B": ["hi", "there"] * 10}) + + f = lambda a: pd.Series([a["B"], a["A"]]) + result = df.apply(f, engine="bodo", axis=1) + expected = df.apply(f, engine="python", axis=1) + + tm.assert_frame_equal(result, expected, check_frame_type=False, check_dtype=False) + + +@pytest.mark.skip(reason="TODO: pass args/kwargs to bodo jitted function") +def test_bodo_vs_python_args_kwargs(): + def f(x, y, z=3): + return x.A == y + z + + df = pd.DataFrame({"A": np.arange(20)}) + + result = df.apply(f, z=2, engine="bodo", axis=1, args=(2,)) + expected = df.apply(f, z=2, axis=1, args=(2,)) + tm.assert_series_equal(result, expected, check_series_type=False) + + +@pytest.mark.parametrize("axis", [0, 1]) +def test_bodo_vs_python_str_apply(axis): + df = pd.DataFrame({"A": 
np.arange(20)}) + + func = "mean" + axis = 1 + result = df.apply(func, axis) + expected = df.apply(func, axis) + + tm.assert_series_equal(result, expected, check_series_type=False) + + +def test_bodo_unsupported_axis(): + """Tests that a BodoError is raised when trying to apply UDF column-wise""" + frame = pd.DataFrame( + {"a": [1, 2, 3]}, + ) + f = lambda x: 1 + + with pytest.raises( + NotImplementedError, + match=r"the 'bodo' engine only supports axis=1 for user-defined functions", + ): + frame.apply(f, engine="bodo", axis=0) + + +def test_bodo_raw_unsupported(): + """Tests that error gets raised when using raw=True""" + frame = pd.DataFrame( + {"a": [1, 2, 3]}, + ) + f = lambda a: 1 + + with pytest.raises( + NotImplementedError, match="the 'bodo' engine does not support raw=True." + ): + frame.apply(f, engine="bodo", raw=True, axis=1) + + +def test_bodo_result_type_unsupported(): + """Tests that error gets raised when passing any value to result_type""" + frame = pd.DataFrame( + {"a": [1, 2, 3]}, + ) + f = lambda a: 1 + + with pytest.raises( + NotImplementedError, match="the 'bodo' engine does not support result_type yet." + ): + frame.apply(f, engine="bodo", axis=1, result_type="reduce") diff --git a/pandas/tests/util/test_bodo.py b/pandas/tests/util/test_bodo.py new file mode 100644 index 0000000000000..d2b302ea97496 --- /dev/null +++ b/pandas/tests/util/test_bodo.py @@ -0,0 +1,18 @@ +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame + + +@td.skip_if_installed("bodo") +def test_bodo_not_installed_df_apply(): + "Test that importing bodo when not installed results in ImportError." 
+ + df = DataFrame({"A": [1, 2, 3, 4, 5]}) + + def f(x): + return 1 + + with pytest.raises(ImportError, match="Missing optional"): + df.apply(f, engine="bodo") diff --git a/pyproject.toml b/pyproject.toml index 7ab9cd2c17669..e04e9927528ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] pyarrow = ['pyarrow>=10.0.1'] -performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] +performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4', 'bodo>=2024.11'] computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] aws = ['s3fs>=2022.11.0'] @@ -97,6 +97,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'lxml>=4.9.2', 'matplotlib>=3.6.3', 'numba>=0.56.4', + 'bodo>=2024.11', 'numexpr>=2.8.4', 'odfpy>=1.4.1', 'openpyxl>=3.1.0', diff --git a/requirements-dev.txt b/requirements-dev.txt index fb4d9cdb589ca..256f4b27363ad 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -29,6 +29,7 @@ jinja2>=3.1.2 lxml>=4.9.2 matplotlib>=3.6.3 numba>=0.56.4 +bodo>=2024.11 numexpr>=2.8.4 openpyxl>=3.1.0 odfpy>=1.4.1 From 27fbc0a1f5fcdb6d409700defb649b5c69d88b75 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 30 Dec 2024 11:56:10 -0500 Subject: [PATCH 02/16] fix test --- ci/deps/actions-310-minimum_versions.yaml | 1 + pandas/tests/apply/test_bodo.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 11ee062df0e9e..cc38f180f0dc5 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -3,6 +3,7 @@ name: pandas-dev channels: - conda-forge + - bodo.ai dependencies: - python=3.10 diff --git a/pandas/tests/apply/test_bodo.py b/pandas/tests/apply/test_bodo.py index f3bcefd5a5ec4..e8ba5644dc8e5 100644 --- 
a/pandas/tests/apply/test_bodo.py +++ b/pandas/tests/apply/test_bodo.py @@ -59,7 +59,7 @@ def test_bodo_vs_python_str_apply(axis): func = "mean" axis = 1 - result = df.apply(func, axis) + result = df.apply(func, axis, engine="bodo") expected = df.apply(func, axis) tm.assert_series_equal(result, expected, check_series_type=False) From 4c2e94af6e62da1533b78ff80b6f96bb1138241d Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 30 Dec 2024 13:44:18 -0500 Subject: [PATCH 03/16] adjust minimum version requirements --- ci/deps/actions-310-minimum_versions.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- environment.yml | 2 +- pandas/compat/_optional.py | 2 +- pyproject.toml | 4 ++-- requirements-dev.txt | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index cc38f180f0dc5..4c34d02c4f199 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -38,7 +38,7 @@ dependencies: - lxml=4.9.2 - matplotlib=3.6.3 - numba=0.56.4 - - bodo=2024.11 + - bodo=2024.12.3 - numexpr=2.8.4 - odfpy=1.4.1 - qtpy=2.3.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 2bfe877bf9b81..67009a1ff3c44 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -186,7 +186,7 @@ Dependency Minimum Version pip ext `numexpr `__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups `bottleneck `__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup. `numba `__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler. 
-`bodo `__ 2024.11 performance Alternative execution engine for operations that accept ``engine="bodo"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler and automatically parallelizes uing MPI. +`bodo `__ 2024.12.3 performance Alternative execution engine for operations that accept ``engine="bodo"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler and automatically parallelizes uing MPI. ===================================================== ================== ================== =================================================================================================================================================================================== Visualization diff --git a/environment.yml b/environment.yml index 157a4e5397061..0280f7b3197e0 100644 --- a/environment.yml +++ b/environment.yml @@ -41,7 +41,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 - - bodo>=2024.11 + - bodo>=2024.12.3 - numexpr>=2.8.4 - openpyxl>=3.1.0 - odfpy>=1.4.1 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index f4c9721e4f59e..d8aa3bae7007e 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -57,7 +57,7 @@ "tzdata": "2022.7", "qtpy": "2.3.0", "pyqt5": "5.15.9", - "bodo": "2024.11", + "bodo": "2024.12.3", } # A mapping from import name to package name (on PyPI) for packages where diff --git a/pyproject.toml b/pyproject.toml index e04e9927528ff..67b17417b71f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] pyarrow = ['pyarrow>=10.0.1'] -performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4', 'bodo>=2024.11'] +performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4', 'bodo>=2024.12.3'] computation = ['scipy>=1.10.0', 
'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] aws = ['s3fs>=2022.11.0'] @@ -97,7 +97,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'lxml>=4.9.2', 'matplotlib>=3.6.3', 'numba>=0.56.4', - 'bodo>=2024.11', + 'bodo>=2024.12.3', 'numexpr>=2.8.4', 'odfpy>=1.4.1', 'openpyxl>=3.1.0', diff --git a/requirements-dev.txt b/requirements-dev.txt index 256f4b27363ad..c5f151e3e97e5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -29,7 +29,7 @@ jinja2>=3.1.2 lxml>=4.9.2 matplotlib>=3.6.3 numba>=0.56.4 -bodo>=2024.11 +bodo>=2024.12.3 numexpr>=2.8.4 openpyxl>=3.1.0 odfpy>=1.4.1 From 4349d61ca327c392c0dadaaed085bb7178560701 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 30 Dec 2024 15:26:34 -0500 Subject: [PATCH 04/16] update ci envs --- ci/deps/actions-310-minimum_versions.yaml | 2 +- ci/deps/actions-310.yaml | 1 + ci/deps/actions-311.yaml | 1 + ci/deps/actions-312.yaml | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 4c34d02c4f199..708eba17fb5ec 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -1,6 +1,6 @@ # Minimum version of required + optional dependencies # Aligned with getting_started/install.rst and compat/_optional.py -name: pandas-dev +name: pandas-dev-minimum channels: - conda-forge - bodo.ai diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 74cab4e0970dc..90b20cc091308 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -35,6 +35,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2024.12.3 - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index b6f515dceaea9..9891cb7256089 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -35,6 +35,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2024.12.3 - numexpr>=2.8.4 - 
odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index bc66f8a5382c9..d07f34f99e9be 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -35,6 +35,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2024.12.3 - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 From 0872285ec641dde30f0d44d83f131e2d06f98e48 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 30 Dec 2024 15:29:29 -0500 Subject: [PATCH 05/16] add channel --- ci/deps/actions-310.yaml | 1 + ci/deps/actions-311.yaml | 1 + ci/deps/actions-312.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 90b20cc091308..08ede326d41a7 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -1,6 +1,7 @@ name: pandas-dev channels: - conda-forge + - bodo.ai dependencies: - python=3.10 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 9891cb7256089..f71f08bc2c78d 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -1,6 +1,7 @@ name: pandas-dev channels: - conda-forge + - bodo.ai dependencies: - python=3.11 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index d07f34f99e9be..27afc0bfabc3c 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -1,6 +1,7 @@ name: pandas-dev-312 channels: - conda-forge + - bodo.ai dependencies: - python=3.12 From cd94be95fcd5d85d963a69e21e9b58ffc255200c Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 30 Dec 2024 16:37:25 -0500 Subject: [PATCH 06/16] try skipping some tests --- ci/deps/actions-310-minimum_versions.yaml | 2 +- pandas/tests/apply/test_bodo.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 708eba17fb5ec..4c34d02c4f199 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -1,6 +1,6 @@ # 
Minimum version of required + optional dependencies # Aligned with getting_started/install.rst and compat/_optional.py -name: pandas-dev-minimum +name: pandas-dev channels: - conda-forge - bodo.ai diff --git a/pandas/tests/apply/test_bodo.py b/pandas/tests/apply/test_bodo.py index e8ba5644dc8e5..b15b0531b075c 100644 --- a/pandas/tests/apply/test_bodo.py +++ b/pandas/tests/apply/test_bodo.py @@ -6,7 +6,7 @@ import pandas as pd import pandas._testing as tm -pytestmark = [td.skip_if_no("bodo")] +pytestmark = [pytest.mark.skip, td.skip_if_no("bodo")] def test_bodo_vs_python_indexing(): From dcdd00ef677c8ba637d3b380c39cc4a371da3af7 Mon Sep 17 00:00:00 2001 From: = <=> Date: Fri, 17 Jan 2025 09:59:46 -0500 Subject: [PATCH 07/16] apply feedback --- pandas/core/apply.py | 20 ++++++++++++-------- pandas/tests/apply/test_bodo.py | 6 ++++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 1d2ec6f561a26..4f7c028bc4e66 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -870,7 +870,7 @@ def apply(self) -> DataFrame | Series: "the 'numba' engine doesn't support using " "a string as the callable function" ) - if self.engine == "bodo": + elif self.engine == "bodo": return self.apply_series_bodo() return self.apply_str() @@ -1057,13 +1057,17 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame: return result def apply_standard(self): - if self.engine == "python": - results, res_index = self.apply_series_generator() - elif self.engine == "numba": + if self.engine == "numba": results, res_index = self.apply_series_numba() - else: - # bodo engine + elif self.engine == "bodo": return self.apply_series_bodo() + elif self.engine == "python": + results, res_index = self.apply_series_generator() + else: + raise ValueError( + "invalid value for engine, must be one " + "of {'python', 'numba', 'bodo'}" + ) # wrap results return self.wrap_results(results, res_index) @@ -1099,8 +1103,6 @@ def 
apply_series_numba(self): return results, self.result_index def apply_series_bodo(self) -> DataFrame | Series: - bodo = import_optional_dependency("bodo") - if self.result_type is not None: raise NotImplementedError( "the 'bodo' engine does not support result_type yet." @@ -1111,6 +1113,8 @@ def apply_series_bodo(self) -> DataFrame | Series: "the 'bodo' engine only supports axis=1 for user-defined functions." ) + bodo = import_optional_dependency("bodo") + @bodo.jit def do_apply(obj, func, axis): return obj.apply(func, axis) diff --git a/pandas/tests/apply/test_bodo.py b/pandas/tests/apply/test_bodo.py index b15b0531b075c..2081acb878dbb 100644 --- a/pandas/tests/apply/test_bodo.py +++ b/pandas/tests/apply/test_bodo.py @@ -6,7 +6,7 @@ import pandas as pd import pandas._testing as tm -pytestmark = [pytest.mark.skip, td.skip_if_no("bodo")] +pytestmark = [pytest.mark.single_cpu, td.skip_if_no("bodo")] def test_bodo_vs_python_indexing(): @@ -97,7 +97,9 @@ def test_bodo_result_type_unsupported(): frame = pd.DataFrame( {"a": [1, 2, 3]}, ) - f = lambda a: 1 + + def f(a): + return 1 with pytest.raises( NotImplementedError, match="the 'bodo' engine does not support result_type yet." 
From d0778929dcd53fca755625dfc19769efe90dc728 Mon Sep 17 00:00:00 2001 From: = <=> Date: Fri, 24 Jan 2025 13:21:05 -0500 Subject: [PATCH 08/16] debug failing unit tests and apply style comments --- ci/deps/actions-310-minimum_versions.yaml | 3 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 + ci/deps/actions-311.yaml | 2 +- ci/deps/actions-312.yaml | 2 +- environment.yml | 2 +- pandas/compat/_optional.py | 2 +- pandas/conftest.py | 18 +++++++ pandas/core/apply.py | 8 ++- pandas/tests/apply/test_bodo.py | 59 +++++++++++++++------- pandas/tests/util/test_bodo.py | 4 +- pyproject.toml | 5 +- requirements-dev.txt | 2 +- 13 files changed, 80 insertions(+), 31 deletions(-) diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 4c34d02c4f199..61f7cf04d23a4 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -3,7 +3,6 @@ name: pandas-dev channels: - conda-forge - - bodo.ai dependencies: - python=3.10 @@ -38,7 +37,7 @@ dependencies: - lxml=4.9.2 - matplotlib=3.6.3 - numba=0.56.4 - - bodo=2024.12.3 + - bodo=2025.1 - numexpr=2.8.4 - odfpy=1.4.1 - qtpy=2.3.0 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 08ede326d41a7..5eb3153bf810a 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -36,7 +36,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 - - bodo>=2024.12.3 + - bodo>=2025.1 # [not win] - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 092ca18d61259..d4d7d9979310c 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -2,6 +2,7 @@ name: pandas-dev channels: - conda-forge + - bodo.ai dependencies: - python=3.11 @@ -36,6 +37,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 + - bodo>=2025.1 # [not win] - 
numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index f71f08bc2c78d..4a5f4d0a5259b 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -36,7 +36,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 - - bodo>=2024.12.3 + - bodo>=2025.1 # [not win] - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 27afc0bfabc3c..ed2d4aa698cab 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -36,7 +36,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 - - bodo>=2024.12.3 + - bodo>=2025.1 # [not win] - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/environment.yml b/environment.yml index 0280f7b3197e0..d6eea7fdb5db7 100644 --- a/environment.yml +++ b/environment.yml @@ -41,7 +41,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 - - bodo>=2024.12.3 + - bodo>=2025.1 - numexpr>=2.8.4 - openpyxl>=3.1.0 - odfpy>=1.4.1 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index d8aa3bae7007e..ea46cdfaa578b 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -57,7 +57,7 @@ "tzdata": "2022.7", "qtpy": "2.3.0", "pyqt5": "5.15.9", - "bodo": "2024.12.3", + "bodo": "2025.1", } # A mapping from import name to package name (on PyPI) for packages where diff --git a/pandas/conftest.py b/pandas/conftest.py index f9c10a7758bd2..fd2b5b37cf8da 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -127,6 +127,22 @@ def ignore_doctest_warning(item: pytest.Item, path: str, message: str) -> None: item.add_marker(pytest.mark.filterwarnings(f"ignore:{message}")) +def run_bodo_udf_engine_tests_last(items: list[pytest.Item]) -> None: + """Always run tests related to bodo UDF engine last to avoid installing + extensions that might change behavior of some tests. 
+ + Parameters + ---------- + item : list[pytest.Item] + The collection of pytest test items to modify in place. + """ + bodo_tests = [item for item in items if "bodo_udf_engine" in item.keywords] + non_bodo_tests = [item for item in items if "bodo_udf_engine" not in item.keywords] + + # Run bodo tests last to avoid conflicting names when installing extensions + items[:] = non_bodo_tests + bodo_tests + + def pytest_collection_modifyitems(items, config) -> None: is_doctest = config.getoption("--doctest-modules") or config.getoption( "--doctest-cython", default=False @@ -175,6 +191,8 @@ def pytest_collection_modifyitems(items, config) -> None: for path, message in ignored_doctest_warnings: ignore_doctest_warning(item, path, message) + run_bodo_udf_engine_tests_last(items) + hypothesis_health_checks = [ hypothesis.HealthCheck.too_slow, diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4f7c028bc4e66..a8df8e56fc292 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1113,9 +1113,15 @@ def apply_series_bodo(self) -> DataFrame | Series: "the 'bodo' engine only supports axis=1 for user-defined functions." ) + if self.args or self.kwargs: + raise NotImplementedError( + "the 'bodo' engine does not support passing additional args/kwargs " + "to apply function yet." 
+ ) + bodo = import_optional_dependency("bodo") - @bodo.jit + @bodo.jit(**self.engine_kwargs) def do_apply(obj, func, axis): return obj.apply(func, axis) diff --git a/pandas/tests/apply/test_bodo.py b/pandas/tests/apply/test_bodo.py index 2081acb878dbb..bcf143313a520 100644 --- a/pandas/tests/apply/test_bodo.py +++ b/pandas/tests/apply/test_bodo.py @@ -6,14 +6,23 @@ import pandas as pd import pandas._testing as tm -pytestmark = [pytest.mark.single_cpu, td.skip_if_no("bodo")] +pytestmark = [pytest.mark.single_cpu, pytest.mark.bodo_udf_engine] -def test_bodo_vs_python_indexing(): +@pytest.fixture +def skip_if_no_bodo(): + """Avoid using in test decorator which will cause bodo import immediately.""" + td.skip_if_no("bodo") + + +def test_bodo_vs_python_indexing(skip_if_no_bodo): frame = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, ) - f = lambda x: x["c"] + + def f(a): + return a["c"] + result = frame.apply(f, engine="bodo", axis=1) expected = frame.apply(f, engine="python", axis=1) @@ -24,37 +33,45 @@ def test_bodo_vs_python_indexing(): "reduction", [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()], ) -def test_bodo_vs_python_reductions(reduction): +def test_bodo_vs_python_reductions(reduction, skip_if_no_bodo): df = pd.DataFrame(np.ones((4, 4), dtype=np.float64)) result = df.apply(reduction, engine="bodo", axis=1) expected = df.apply(reduction, engine="python", axis=1) tm.assert_series_equal(result, expected, check_series_type=False) -def test_bodo_vs_python_df_output(): +def test_bodo_vs_python_df_output(skip_if_no_bodo): df = pd.DataFrame({"A": np.arange(20), "B": ["hi", "there"] * 10}) - f = lambda a: pd.Series([a["B"], a["A"]]) + def f(a): + return pd.Series([a["B"], a["A"]]) + result = df.apply(f, engine="bodo", axis=1) expected = df.apply(f, engine="python", axis=1) tm.assert_frame_equal(result, expected, check_frame_type=False, check_dtype=False) -@pytest.mark.skip(reason="TODO: pass args/kwargs to bodo 
jitted function") -def test_bodo_vs_python_args_kwargs(): - def f(x, y, z=3): - return x.A == y + z +def test_bodo_vs_python_args(skip_if_no_bodo): + msg = ( + "the 'bodo' engine does not support passing additional args/kwargs " + "to apply function yet." + ) + + def f(x, y): + return x.A + y df = pd.DataFrame({"A": np.arange(20)}) - result = df.apply(f, z=2, engine="bodo", axis=1, args=(2,)) - expected = df.apply(f, z=2, axis=1, args=(2,)) - tm.assert_series_equal(result, expected, check_series_type=False) + with pytest.raises(NotImplementedError, match=msg): + df.apply(f, engine="bodo", axis=1, args=(2,)) + + with pytest.raises(NotImplementedError, match=msg): + df.apply(f, engine="bodo", axis=1, y=2) @pytest.mark.parametrize("axis", [0, 1]) -def test_bodo_vs_python_str_apply(axis): +def test_bodo_vs_python_str_apply(axis, skip_if_no_bodo): df = pd.DataFrame({"A": np.arange(20)}) func = "mean" @@ -65,12 +82,14 @@ def test_bodo_vs_python_str_apply(axis): tm.assert_series_equal(result, expected, check_series_type=False) -def test_bodo_unsupported_axis(): +def test_bodo_unsupported_axis(skip_if_no_bodo): """Tests that a BodoError is raised when trying to apply UDF column-wise""" frame = pd.DataFrame( {"a": [1, 2, 3]}, ) - f = lambda x: 1 + + def f(a): + return 1 with pytest.raises( NotImplementedError, @@ -79,12 +98,14 @@ def test_bodo_unsupported_axis(): frame.apply(f, engine="bodo", axis=0) -def test_bodo_raw_unsupported(): +def test_bodo_raw_unsupported(skip_if_no_bodo): """Tests that error gets raised when using raw=True""" frame = pd.DataFrame( {"a": [1, 2, 3]}, ) - f = lambda a: 1 + + def f(a): + return 1 with pytest.raises( NotImplementedError, match="the 'bodo' engine does not support raw=True." 
@@ -92,7 +113,7 @@ def test_bodo_raw_unsupported(): frame.apply(f, engine="bodo", raw=True, axis=1) -def test_bodo_result_type_unsupported(): +def test_bodo_result_type_unsupported(skip_if_no_bodo): """Tests that error gets raised when passing any value to result_type""" frame = pd.DataFrame( {"a": [1, 2, 3]}, diff --git a/pandas/tests/util/test_bodo.py b/pandas/tests/util/test_bodo.py index d2b302ea97496..2d4d2a2c1a9b1 100644 --- a/pandas/tests/util/test_bodo.py +++ b/pandas/tests/util/test_bodo.py @@ -4,10 +4,12 @@ from pandas import DataFrame +pytestmark = pytest.mark.bodo_udf_engine + -@td.skip_if_installed("bodo") def test_bodo_not_installed_df_apply(): "Test that importing bodo when not installed results in ImportError." + td.skip_if_installed("bodo") df = DataFrame({"A": [1, 2, 3, 4, 5]}) diff --git a/pyproject.toml b/pyproject.toml index 67b17417b71f5..d0ec22e869584 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] pyarrow = ['pyarrow>=10.0.1'] -performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4', 'bodo>=2024.12.3'] +performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4', 'bodo>=2025.1'] computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] aws = ['s3fs>=2022.11.0'] @@ -97,7 +97,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'lxml>=4.9.2', 'matplotlib>=3.6.3', 'numba>=0.56.4', - 'bodo>=2024.12.3', + 'bodo>=2025.1', 'numexpr>=2.8.4', 'odfpy>=1.4.1', 'openpyxl>=3.1.0', @@ -489,6 +489,7 @@ markers = [ # these tests only fail in the wheel builder and don't fail in regular # ARM CI "fails_arm_wheels: Tests that fail in the ARM wheel build only", + "bodo_udf_engine: Tests for bodo engine to accelerate applications of User Defined Functions (UDFs)", ] [tool.mypy] diff --git a/requirements-dev.txt b/requirements-dev.txt index
c5f151e3e97e5..e9f53f4886f9c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -29,7 +29,7 @@ jinja2>=3.1.2 lxml>=4.9.2 matplotlib>=3.6.3 numba>=0.56.4 -bodo>=2024.12.3 +bodo>=2025.1 numexpr>=2.8.4 openpyxl>=3.1.0 odfpy>=1.4.1 From 5711ad4a069c9c5c9a5b6b8a21d87c1503059fa6 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 27 Jan 2025 10:14:58 -0500 Subject: [PATCH 09/16] Fix tests and add docstrings --- ci/deps/actions-310.yaml | 2 +- pandas/core/frame.py | 13 ++++++++++++- pandas/tests/apply/test_bodo.py | 31 +++++++++++++++++-------------- pandas/tests/util/test_bodo.py | 6 ++++-- 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 5eb3153bf810a..8493f44a85428 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -36,7 +36,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 - - bodo>=2025.1 # [not win] + # - bodo>=2025.1 # [not win] - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ba813982e69f8..60bc544364e1e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10316,7 +10316,18 @@ def apply( `_ in numba to learn what you can or cannot use in the passed function. - TODO: describe bodo + The bodo engine will attempt to JIT compile the passed function, spawn + multiple workers and apply the function in parallel over the Dataframe, + which may result in a speedup for large DataFrames. + + Bodo supports a subset of valid Python, numpy, pandas and sci-kit learn. + Please refer to the `bodo documentation + `_ to learn more about which + operations and APIs are supported inside JIT compiled functions. + + Code that does not have JIT support yet can still utilize Bodo's parallel + constructs by decorating the function with `@wrap_python + `_. .. 
versionadded:: 2.2.0 diff --git a/pandas/tests/apply/test_bodo.py b/pandas/tests/apply/test_bodo.py index bcf143313a520..d563da73a1f10 100644 --- a/pandas/tests/apply/test_bodo.py +++ b/pandas/tests/apply/test_bodo.py @@ -1,21 +1,24 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm pytestmark = [pytest.mark.single_cpu, pytest.mark.bodo_udf_engine] -@pytest.fixture -def skip_if_no_bodo(): - """Avoid using in test decorator which will cause bodo import immediately.""" - td.skip_if_no("bodo") +@pytest.fixture(params=["bodo"]) +def engine(request): + """Test bodo engine by itself to avoid extensions conflicting with numba. + + Note: Using a fixture here to avoid importing at the start of the session. + """ + if request.param == "bodo": + pytest.importorskip("bodo") + return request.param -def test_bodo_vs_python_indexing(skip_if_no_bodo): +def test_bodo_vs_python_indexing(engine): frame = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, ) @@ -33,14 +36,14 @@ def f(a): "reduction", [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()], ) -def test_bodo_vs_python_reductions(reduction, skip_if_no_bodo): +def test_bodo_vs_python_reductions(reduction, engine): df = pd.DataFrame(np.ones((4, 4), dtype=np.float64)) result = df.apply(reduction, engine="bodo", axis=1) expected = df.apply(reduction, engine="python", axis=1) tm.assert_series_equal(result, expected, check_series_type=False) -def test_bodo_vs_python_df_output(skip_if_no_bodo): +def test_bodo_vs_python_df_output(engine): df = pd.DataFrame({"A": np.arange(20), "B": ["hi", "there"] * 10}) def f(a): @@ -52,7 +55,7 @@ def f(a): tm.assert_frame_equal(result, expected, check_frame_type=False, check_dtype=False) -def test_bodo_vs_python_args(skip_if_no_bodo): +def test_bodo_vs_python_args(engine): msg = ( "the 'bodo' engine does not support passing additional args/kwargs " "to apply function yet." 
@@ -71,7 +74,7 @@ def f(x, y): @pytest.mark.parametrize("axis", [0, 1]) -def test_bodo_vs_python_str_apply(axis, skip_if_no_bodo): +def test_bodo_vs_python_str_apply(axis, engine): df = pd.DataFrame({"A": np.arange(20)}) func = "mean" @@ -82,7 +85,7 @@ def test_bodo_vs_python_str_apply(axis, skip_if_no_bodo): tm.assert_series_equal(result, expected, check_series_type=False) -def test_bodo_unsupported_axis(skip_if_no_bodo): +def test_bodo_unsupported_axis(engine): """Tests that a BodoError is raised when trying to apply UDF column-wise""" frame = pd.DataFrame( {"a": [1, 2, 3]}, @@ -98,7 +101,7 @@ def f(a): frame.apply(f, engine="bodo", axis=0) -def test_bodo_raw_unsupported(skip_if_no_bodo): +def test_bodo_raw_unsupported(engine): """Tests that error gets raised when using raw=True""" frame = pd.DataFrame( {"a": [1, 2, 3]}, @@ -113,7 +116,7 @@ def f(a): frame.apply(f, engine="bodo", raw=True, axis=1) -def test_bodo_result_type_unsupported(skip_if_no_bodo): +def test_bodo_result_type_unsupported(engine): """Tests that error gets raised when passing any value to result_type""" frame = pd.DataFrame( {"a": [1, 2, 3]}, diff --git a/pandas/tests/util/test_bodo.py b/pandas/tests/util/test_bodo.py index 2d4d2a2c1a9b1..613192e5e0424 100644 --- a/pandas/tests/util/test_bodo.py +++ b/pandas/tests/util/test_bodo.py @@ -9,7 +9,9 @@ def test_bodo_not_installed_df_apply(): "Test that importing bodo when not installed results in ImportError." 
- td.skip_if_installed("bodo") + bodo_installed = bool(td.import_optional_dependency("bodo", errors="ignore")) + if bodo_installed: + pytest.skip("bodo is installed.") df = DataFrame({"A": [1, 2, 3, 4, 5]}) @@ -17,4 +19,4 @@ def f(x): return 1 with pytest.raises(ImportError, match="Missing optional"): - df.apply(f, engine="bodo") + df.apply(f, engine="bodo", axis=1) From 829e8792879002d6ded192f811eb8c41c60155f2 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 27 Jan 2025 12:29:07 -0500 Subject: [PATCH 10/16] run bodo tests in separate pytest session --- ci/run_tests.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index d2c2f58427a23..1b255f2e5a30c 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -13,8 +13,19 @@ COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.t PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=worksteal $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then - PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" + PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN and not bodo_udf_engine\"" +else + PYTEST_CMD="$PYTEST_CMD -m \"not bodo_udf_engine\"" fi echo $PYTEST_CMD sh -c "$PYTEST_CMD" + +# Workaround for running bodo tests. Needs to be in a separate session to prevent +# conflicts with numba extensions and run without PYTHONDEVMODE=1 since it can cause segmentation faults during compilation. +if [[ "$PYTEST_WORKERS" == "0" ]]; then + PYTEST_CMD_BODO_UDF_ENGINE="MESONPY_EDITABLE_VERBOSE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=worksteal $TEST_ARGS $COVERAGE $PYTEST_TARGET -m \"bodo_udf_engine\"" + echo "Running Bodo Tests..." 
+ echo $PYTEST_CMD_BODO_UDF_ENGINE + sh -c "$PYTEST_CMD_BODO_UDF_ENGINE" +fi From 539c6ba74adf8af971033857b7d69f0705195b4c Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 27 Jan 2025 13:38:46 -0500 Subject: [PATCH 11/16] remove workaround in conftest.py --- ci/deps/actions-310.yaml | 3 ++- ci/run_tests.sh | 4 ++-- pandas/conftest.py | 18 ------------------ 3 files changed, 4 insertions(+), 21 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 8493f44a85428..b0b2dbaa8ca7e 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -36,7 +36,7 @@ dependencies: - lxml>=4.9.2 - matplotlib>=3.6.3 - numba>=0.56.4 - # - bodo>=2025.1 # [not win] + - bodo>=2025.1 # [not win] - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 @@ -60,6 +60,7 @@ dependencies: - zstandard>=0.19.0 - pip: + - bodo>=2025.1; platform_system != "Windows" - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 1b255f2e5a30c..b853795f45d32 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -21,9 +21,9 @@ fi echo $PYTEST_CMD sh -c "$PYTEST_CMD" -# Workaround for running bodo tests. Needs to be in a separate session to prevent -# conflicts with numba extensions and run without PYTHONDEVMODE=1 since it can cause segmentation faults during compilation. +# Bodo tests need to be run in a separate session to prevent extensions installed conflicting with numba. if [[ "$PYTEST_WORKERS" == "0" ]]; then + # Run without setting PYTHONDEVMODE since it can cause segmentation faults during compilation. PYTEST_CMD_BODO_UDF_ENGINE="MESONPY_EDITABLE_VERBOSE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=worksteal $TEST_ARGS $COVERAGE $PYTEST_TARGET -m \"bodo_udf_engine\"" echo "Running Bodo Tests..." 
echo $PYTEST_CMD_BODO_UDF_ENGINE diff --git a/pandas/conftest.py b/pandas/conftest.py index fd2b5b37cf8da..f9c10a7758bd2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -127,22 +127,6 @@ def ignore_doctest_warning(item: pytest.Item, path: str, message: str) -> None: item.add_marker(pytest.mark.filterwarnings(f"ignore:{message}")) -def run_bodo_udf_engine_tests_last(items: list[pytest.Item]) -> None: - """Always run tests related to bodo UDF engine last to avoid installing - extensions that might change behavior of some tests. - - Parameters - ---------- - item : list[pytest.Item] - The collection of pytest test items to modify in place. - """ - bodo_tests = [item for item in items if "bodo_udf_engine" in item.keywords] - non_bodo_tests = [item for item in items if "bodo_udf_engine" not in item.keywords] - - # Run bodo tests last to avoid conflicting names when installing extensions - items[:] = non_bodo_tests + bodo_tests - - def pytest_collection_modifyitems(items, config) -> None: is_doctest = config.getoption("--doctest-modules") or config.getoption( "--doctest-cython", default=False @@ -191,8 +175,6 @@ def pytest_collection_modifyitems(items, config) -> None: for path, message in ignored_doctest_warnings: ignore_doctest_warning(item, path, message) - run_bodo_udf_engine_tests_last(items) - hypothesis_health_checks = [ hypothesis.HealthCheck.too_slow, From cf213bdeb0dd8b60ebb58e800c4529f0bf0c4da3 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 27 Jan 2025 14:45:38 -0500 Subject: [PATCH 12/16] remove bodo from env file for window CI --- .github/actions/setup-conda/action.yml | 15 +++++++++++++++ .github/workflows/unit-tests.yml | 1 + ci/deps/actions-310.yaml | 1 - 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 3eb68bdd2a15c..9b7f752b19794 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -3,9 +3,24 @@ 
inputs: environment-file: description: Conda environment file to use. default: environment.yml + os: + description: The operating system to assume when creating Conda. + default: not specified runs: using: composite steps: + # Remove bodo from Window's environment for now until it supports Windows. + - name: Remove bodo on Windows + if: ${{ inputs.os == 'windows-latest' }} + run: | + + sed '/bodo/d' "$ENVIRONMENT_FILE" > tmp.txt + cat tmp.txt > "$ENVIRONMENT_FILE" + rm tmp.txt + env: + ENVIRONMENT_FILE: ${{ inputs.environment-file }} + shell: bash -el {0} + - name: Install ${{ inputs.environment-file }} uses: mamba-org/setup-micromamba@v1 with: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 842629ba331d6..08917b6735976 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -212,6 +212,7 @@ jobs: uses: ./.github/actions/setup-conda with: environment-file: ci/deps/${{ matrix.env_file }} + os: ${{ matrix.os }} - name: Build Pandas uses: ./.github/actions/build_pandas diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index b0b2dbaa8ca7e..5eb3153bf810a 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -60,7 +60,6 @@ dependencies: - zstandard>=0.19.0 - pip: - - bodo>=2025.1; platform_system != "Windows" - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 From 6f7bf12a944f0d900e5a6540f488d49e2bfc15a6 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 27 Jan 2025 16:32:05 -0500 Subject: [PATCH 13/16] remove bodo from min version CI --- ci/deps/actions-310-minimum_versions.yaml | 2 +- pandas/core/frame.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 61f7cf04d23a4..28d5160a18fe7 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -37,7 +37,7 @@ dependencies: - lxml=4.9.2 - 
matplotlib=3.6.3 - numba=0.56.4 - - bodo=2025.1 + # - bodo=2025.1 - numexpr=2.8.4 - odfpy=1.4.1 - qtpy=2.3.0 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 60bc544364e1e..c57d34ac76e99 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10320,7 +10320,7 @@ def apply( multiple workers and apply the function in parallel over the Dataframe, which may result in a speedup for large DataFrames. - Bodo supports a subset of valid Python, numpy, pandas and sci-kit learn. + Bodo supports a subset of valid Python, numpy, pandas and scikit-learn. Please refer to the `bodo documentation `_ to learn more about which operations and APIs are supported inside JIT compiled functions. From f1bdbd73038b4143717c973407cfd7ef6c821c46 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 27 Jan 2025 17:21:05 -0500 Subject: [PATCH 14/16] add ExecutionError and raise from BodoError --- ci/deps/actions-310-minimum_versions.yaml | 2 +- doc/source/reference/testing.rst | 1 + pandas/core/apply.py | 11 +++++++++-- pandas/errors/__init__.py | 16 ++++++++++++++++ pandas/tests/apply/test_bodo.py | 14 ++++++++++++++ 5 files changed, 41 insertions(+), 3 deletions(-) diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 28d5160a18fe7..61f7cf04d23a4 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -37,7 +37,7 @@ dependencies: - lxml=4.9.2 - matplotlib=3.6.3 - numba=0.56.4 - # - bodo=2025.1 + - bodo=2025.1 - numexpr=2.8.4 - odfpy=1.4.1 - qtpy=2.3.0 diff --git a/doc/source/reference/testing.rst b/doc/source/reference/testing.rst index 1f164d1aa98b4..5057275a565e7 100644 --- a/doc/source/reference/testing.rst +++ b/doc/source/reference/testing.rst @@ -35,6 +35,7 @@ Exceptions and warnings errors.DtypeWarning errors.DuplicateLabelError errors.EmptyDataError + errors.ExecutionError errors.IncompatibilityWarning errors.IndexingError errors.InvalidColumnName diff --git 
a/pandas/core/apply.py b/pandas/core/apply.py index a8df8e56fc292..7bc046a0a082a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -27,7 +27,10 @@ npt, ) from pandas.compat._optional import import_optional_dependency -from pandas.errors import SpecificationError +from pandas.errors import ( + ExecutionError, + SpecificationError, +) from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import is_nested_object @@ -1125,7 +1128,11 @@ def apply_series_bodo(self) -> DataFrame | Series: def do_apply(obj, func, axis): return obj.apply(func, axis) - result = do_apply(self.obj, self.func, self.axis) + try: + result = do_apply(self.obj, self.func, self.axis) + except bodo.utils.typing.BodoError as e: + raise ExecutionError("Execution with engine='bodo' failed.") from e + return result def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 2b5bc450e41d6..ab40ff559fbcc 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -417,6 +417,21 @@ class NumbaUtilError(Exception): """ +class ExecutionError(Exception): + """ + Error raised from internal errors originating in engines. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": ["1", "2", "3"]}) + >>> df.apply(lambda x: x.A + x.B, engine="bodo", axis=1) + Traceback (most recent call last): + ... + pandas.errors.ExecutionError: Execution with engine='bodo' failed. + + """ + + class DuplicateLabelError(ValueError): """ Error raised when an operation would introduce duplicate labels. 
@@ -916,6 +931,7 @@ class InvalidComparison(Exception): "DtypeWarning", "DuplicateLabelError", "EmptyDataError", + "ExecutionError", "IncompatibilityWarning", "IndexingError", "IntCastingNaNError", diff --git a/pandas/tests/apply/test_bodo.py b/pandas/tests/apply/test_bodo.py index d563da73a1f10..6044dfe3b1ecf 100644 --- a/pandas/tests/apply/test_bodo.py +++ b/pandas/tests/apply/test_bodo.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.errors import ExecutionError + import pandas as pd import pandas._testing as tm @@ -129,3 +131,15 @@ def f(a): NotImplementedError, match="the 'bodo' engine does not support result_type yet." ): frame.apply(f, engine="bodo", axis=1, result_type="reduce") + + +def test_bodo_engine_execution_error(engine): + frame = pd.DataFrame( + {"a": [1, 2, 3], "b": ["1", "2", "3"]}, + ) + + def f(x): + return x.a + x.b + + with pytest.raises(ExecutionError, match="Execution with engine='bodo' failed."): + frame.apply(f, engine="bodo", axis=1) From cfecfbf937bbdb3a4064cc75f588e703ff2adb88 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 27 Jan 2025 17:33:54 -0500 Subject: [PATCH 15/16] remove bodo from min version CI --- ci/deps/actions-310-minimum_versions.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 61f7cf04d23a4..c7c72828db481 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -37,7 +37,6 @@ dependencies: - lxml=4.9.2 - matplotlib=3.6.3 - numba=0.56.4 - - bodo=2025.1 - numexpr=2.8.4 - odfpy=1.4.1 - qtpy=2.3.0 From edf2f4889e13d75a3ab5abc51721a569dc548a13 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 27 Jan 2025 17:40:05 -0500 Subject: [PATCH 16/16] fix typo in min version string --- doc/source/getting_started/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 
67009a1ff3c44..aff9f7b1b84f2 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -186,7 +186,7 @@ Dependency Minimum Version pip ext `numexpr `__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups `bottleneck `__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup. `numba `__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler. -`bodo `__ 2024.12.3 performance Alternative execution engine for operations that accept ``engine="bodo"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler and automatically parallelizes uing MPI. +`bodo `__ 2025.1 performance Alternative execution engine for operations that accept ``engine="bodo"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler and automatically parallelizes using MPI. ===================================================== ================== ================== =================================================================================================================================================================================== Visualization