Merge branch 'feature/generalized_window_operations' into feature/rolling_apply_numba

Matt Roeschke · Matt Roeschke · commit 9b9ea7aa9955 · 2019-09-18T13:40:37.000-07:00
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
@@ -52,6 +52,7 @@
         "xlwt": [],
         "odfpy": [],
         "pytest": [],
+        "numba": [],
         // If using Windows with python 2.7 and want to build using the
         // mingw toolchain (rather than MSVC), uncomment the following line.
         // "libpython": [],
diff --git a/pandas/core/window/aggregators/kernels.py b/pandas/core/window/aggregators/kernels.py
@@ -1,6 +1,18 @@
-from functools import partial
+"""
+Implementation of the rolling aggregations using jitclasses.
+
+Some current difficulties as of numba 0.45.1:
+
+1) jitclasses don't support inheritance, i.e. a base jitclass cannot be subclassed.
+
+2) This implementation is not currently utilized because of
+   inherent performance penalties.
+See https://github.com/numba/numba/issues/4522
+"""
+
 from typing import Optional
 
+import numba
 import numpy as np
 
 from pandas._typing import Scalar
@@ -56,13 +68,15 @@ class AggKernel:
     make_aggregator
     """
 
+    def __init__(self):
+        pass
+
     def finalize(self):
         """Return the final value of the aggregation."""
         raise NotImplementedError
 
-    @classmethod
     def make_aggregator(
-        cls, values: np.ndarray, minimum_periods: int
+        self, values: np.ndarray, minimum_periods: int
     ) -> BaseAggregator:
         """Return an aggregator that performs the aggregation calculation"""
         raise NotImplementedError
@@ -80,14 +94,30 @@ def invert(self, value) -> None:
         raise NotImplementedError
 
 
+agg_type = numba.deferred_type()
+
+
+base_aggregator_spec = (
+    ("values", numba.float64[:]),
+    ("min_periods", numba.uint64),
+    ("agg", agg_type),
+    ("previous_start", numba.int64),
+    ("previous_end", numba.int64),
+)
+
+
+@numba.jitclass(base_aggregator_spec)
 class SubtractableAggregator(BaseAggregator):
     """
     Aggregator in which a current aggregated value
     is offset from a prior aggregated value.
     """
 
     def __init__(self, values: np.ndarray, min_periods: int, agg) -> None:
-        super().__init__(values, min_periods)
+        # Note: numba jitclasses don't support inheritance, so set the base fields here
+        # super().__init__(values, min_periods)
+        self.values = values
+        self.min_periods = min_periods
         self.agg = agg
         self.previous_start = -1
         self.previous_end = -1
@@ -108,7 +138,8 @@ def query(self, start: int, stop: int) -> Optional[Scalar]:
         self.previous_end = stop
         if self.agg.count >= self.min_periods:
             return self.agg.finalize()
-        return None
+        # Numba wanted this to be np.nan instead of None
+        return np.nan
 
 
 class Sum(UnaryAggKernel):
@@ -140,32 +171,40 @@ def combine(self, other) -> None:
         self.total += other.total
         self.count += other.count
 
-    @classmethod
-    def make_aggregator(cls, values: np.ndarray, min_periods: int) -> BaseAggregator:
-        aggregator = SubtractableAggregator(values, min_periods, cls())
+    def make_aggregator(self, values: np.ndarray, min_periods: int) -> BaseAggregator:
+        aggregator = SubtractableAggregator(values, min_periods, self)
         return aggregator
 
 
+sum_spec = (("count", numba.uint64), ("total", numba.float64))
+
+
+@numba.jitclass(sum_spec)
 class Mean(Sum):
     def finalize(self) -> Optional[float]:
         if not self.count:
             return None
         return self.total / self.count
 
 
-def rolling_aggregation(
+agg_type.define(Mean.class_type.instance_type)  # type: ignore
+
+
+aggregation_signature = (numba.float64[:], numba.int64[:], numba.int64[:], numba.int64)
+
+
+@numba.njit(aggregation_signature, nogil=True, parallel=True)
+def rolling_mean(
     values: np.ndarray,
     begin: np.ndarray,
     end: np.ndarray,
     minimum_periods: int,
-    kernel_class,
+    # kernel_class,  Don't think I can define this in the signature in nopython mode
 ) -> np.ndarray:
     """Perform a generic rolling aggregation"""
-    aggregator = kernel_class.make_aggregator(values, minimum_periods)
+    aggregator = Mean().make_aggregator(values, minimum_periods)
+    # aggregator = kernel_class().make_aggregator(values, minimum_periods)
     result = np.empty(len(begin))
     for i, (start, stop) in enumerate(zip(begin, end)):
         result[i] = aggregator.query(start, stop)
     return result
-
-
-rolling_mean = partial(rolling_aggregation, kernel_class=Mean)
diff --git a/pandas/core/window/aggregators/methods.py b/pandas/core/window/aggregators/methods.py
@@ -1,12 +1,39 @@
+"""
+Implementation of the rolling aggregations using njit methods.
+This implementation mimics what we currently do in cython except the
+calculation of window bounds is independent of the aggregation routine.
+"""
 from typing import Callable
 
 import numba
 import numpy as np
 
 
+@numba.njit(nogil=True)
 def rolling_mean(
     values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int
 ) -> np.ndarray:
+    """
+    Compute a rolling mean over values.
+
+    Parameters
+    ----------
+    values : ndarray[float64]
+        values to roll over
+
+    begin : ndarray[int64]
+        starting indexers
+
+    end : ndarray[int64]
+        ending indexers
+
+    minimum_periods : int
+        minimum number of observations in a window required to produce a value
+
+    Returns
+    -------
+    ndarray[float64]
+    """
     result = np.empty(len(begin))
     previous_start = -1
     previous_end = -1
diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py
@@ -1,11 +1,14 @@
 from typing import Optional, Sequence, Tuple, Union
 
+import numba
 import numpy as np
 
 from pandas.tseries.offsets import DateOffset
 
 BeginEnd = Tuple[np.ndarray, np.ndarray]
 
+baseindexer_spec = (("index", numba.optional(numba.int64[:])),)
+
 
 class BaseIndexer:
     """Base class for window bounds calculations"""
@@ -22,16 +25,11 @@ def __init__(
         index : ndarray[int64], default None
             pandas index to reference in the window bound calculation
 
-        offset: str or DateOffset, default None
-            Offset used to calcuate the window boundary
-
-        keys: np.ndarray, default None
-            Additional columns needed to calculate the window bounds
-
         """
         self.index = index
-        self.offset = offset
-        self.keys = keys
+        # TODO: How to effectively type these in Numba to run in nopython mode?
+        # self.offset = offset
+        # self.keys = keys
 
     def get_window_bounds(
         self,
@@ -74,6 +72,7 @@ def get_window_bounds(
         raise NotImplementedError
 
 
+@numba.jitclass(baseindexer_spec)
 class FixedWindowIndexer(BaseIndexer):
     """Calculate window boundaries that have a fixed window size"""
 
@@ -97,16 +96,16 @@ def get_window_bounds(
         (array([0, 0, 1, 1, 2]), array([1, 2, 3, 4, 5]))
         """
         start_s = np.zeros(window_size, dtype=np.int64)
-        start_e = np.arange(1, num_values - window_size + 1, dtype=np.int64)
-        start = np.concatenate([start_s, start_e])
+        start_e = np.arange(1, num_values - window_size + 1)
+        start = np.concatenate((start_s, start_e))
 
-        end = np.arange(1, num_values + 1, dtype=np.int64)
-        if window_size > num_values:
-            start = start[:num_values]
-            end = end[:num_values]
+        end = np.arange(1, num_values + 1)
+        start = start[:num_values]
+        end = end[:num_values]
         return start, end
 
 
+@numba.jitclass(baseindexer_spec)
 class VariableWindowIndexer(BaseIndexer):
     """
     Calculate window boundaries with variable closed boundaries and index dependent
diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.compat as compat
+
 from pandas import DataFrame, Index, Series, Timestamp, date_range, to_datetime
 import pandas.util.testing as tm
 
@@ -577,7 +579,27 @@ def test_all_apply(self, raw):
         expected = er.apply(lambda x: 1, raw=raw)
         tm.assert_frame_equal(result, expected)
 
-    def test_all2(self):
+    @pytest.mark.parametrize(
+        "func",
+        [
+            "sum",
+            pytest.param(
+                "mean",
+                marks=pytest.mark.skipif(
+                    compat.is_platform_32bit(), reason="Numba fails here for 32 bit"
+                ),
+            ),
+            "count",
+            "median",
+            "std",
+            "var",
+            "kurt",
+            "skew",
+            "min",
+            "max",
+        ],
+    )
+    def test_all2(self, func):
 
         # more sophisticated comparison of integer vs.
         # time-based windowing
@@ -589,36 +611,21 @@ def test_all2(self):
 
         r = dft.rolling(window="5H")
 
-        for f in [
-            "sum",
-            "mean",
-            "count",
-            "median",
-            "std",
-            "var",
-            "kurt",
-            "skew",
-            "min",
-            "max",
-        ]:
-
-            result = getattr(r, f)()
+        result = getattr(r, func)()
 
-            # we need to roll the days separately
-            # to compare with a time-based roll
-            # finally groupby-apply will return a multi-index
-            # so we need to drop the day
-            def agg_by_day(x):
-                x = x.between_time("09:00", "16:00")
-                return getattr(x.rolling(5, min_periods=1), f)()
+        # we need to roll the days separately
+        # to compare with a time-based roll
+        # finally groupby-apply will return a multi-index
+        # so we need to drop the day
+        def agg_by_day(x):
+            x = x.between_time("09:00", "16:00")
+            return getattr(x.rolling(5, min_periods=1), func)()
 
-            expected = (
-                df.groupby(df.index.day)
-                .apply(agg_by_day)
-                .reset_index(level=0, drop=True)
-            )
+        expected = (
+            df.groupby(df.index.day).apply(agg_by_day).reset_index(level=0, drop=True)
+        )
 
-            tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
     def test_groupby_monotonic(self):
 
@@ -671,6 +678,9 @@ def test_non_monotonic(self):
         result = df2.groupby("A").rolling("4s", on="B").C.mean()
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.skipif(
+        compat.is_platform_32bit(), reason="Numba fails here for 32 bit"
+    )
     def test_rolling_cov_offset(self):
         # GH16058
 
diff --git a/setup.cfg b/setup.cfg
@@ -116,7 +116,7 @@ known_dtypes = pandas.core.dtypes
 known_post_core = pandas.tseries,pandas.io,pandas.plotting
 sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER
 known_first_party = pandas
-known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml
+known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml,numba
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0
diff --git a/setup.py b/setup.py
@@ -39,6 +39,7 @@ def is_platform_mac():
         "python-dateutil >= 2.6.1",
         "pytz >= 2017.2",
         "numpy >= {numpy_ver}".format(numpy_ver=min_numpy_ver),
+        "numba >= 0.45.1"
     ],
     "setup_requires": ["numpy >= {numpy_ver}".format(numpy_ver=min_numpy_ver)],
     "zip_safe": False,