diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index fe5342c520196..5fa0c1ff9253f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -379,6 +379,8 @@ Performance Improvements - Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`) - Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) +- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) + .. _whatsnew_0230.docs: diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7b61cd22f45d1..9a7af71e74574 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -196,24 +196,6 @@ cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil: return a[k] -cpdef numeric median(numeric[:] arr): - """ - A faster median - """ - cdef Py_ssize_t n = arr.size - - if n == 0: - return np.NaN - - arr = arr.copy() - - if n % 2: - return kth_smallest(arr, n // 2) - else: - return (kth_smallest(arr, n // 2) + - kth_smallest(arr, n // 2 - 1)) / 2 - - # ---------------------------------------------------------------------- # Pairwise correlation/covariance diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9d9ac2ef2f5b1..9cc15fb6692d9 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -118,7 +118,7 @@ def group_last_object(ndarray[object, ndim=2] out, out[i, j] = resx[i, j] -cdef inline float64_t _median_linear(float64_t* a, int n) nogil: +cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result cdef float64_t* tmp diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 14d47398ac1df..a751fadaf48cf 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -740,7 +740,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = _median_linear(ptr, size) + out[j, i] = median_linear(ptr, size) ptr += size diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 63989304bb5f9..eda86f12d501d 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -6,7 +6,7 @@ import numpy as np from pandas import compat -from pandas._libs import tslib, algos, lib +from pandas._libs import tslib, lib from pandas.core.dtypes.common import ( _get_dtype, is_float, is_scalar, @@ -370,14 +370,13 @@ def nanmean(values, axis=None, skipna=True): @bottleneck_switch() def nanmedian(values, axis=None, skipna=True): - values, mask, dtype, dtype_max = _get_values(values, skipna) - def get_median(x): mask = notna(x) if not skipna and not mask.all(): return np.nan - return algos.median(com._values_from_object(x[mask])) + return np.nanmedian(x[mask]) + values, mask, dtype, dtype_max = _get_values(values, skipna) if not is_float_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -389,10 +388,15 @@ def get_median(x): # an array from a frame if values.ndim > 1: + # there's a non-empty array to apply over otherwise numpy raises if notempty: - return _wrap_results( - np.apply_along_axis(get_median, axis, values), dtype) + if not skipna: + return _wrap_results( + np.apply_along_axis(get_median, axis, values), dtype) + + # fastpath for the skipna case + return _wrap_results(np.nanmedian(values, axis), dtype) # must return the correct shape, but median is not defined for the # empty set so return nans of shape "everything but the passed axis"