-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Weighted quantile #6059
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Weighted quantile #6059
Changes from 36 commits
dcd1b24
875f766
217d2f0
278bed9
80ae229
77bb84e
58af567
15e3834
83e4210
2237399
b936e21
c94fa16
ab810d7
7bcf09e
8427637
3217962
7379d22
c8871d1
b26a5fc
42ebcc2
5aa22a4
abe253e
82147aa
784cedd
3ee62fd
4d6a4fd
9f93f55
8132320
4d7f5f5
c268ddd
db706aa
33ee96c
2ffd3d3
9806db8
15ee999
9559c87
73dde79
59714af
585b705
7443b82
42a6a49
2e0c16e
4186a24
1be1f92
5c251e0
9060f8e
c112d2c
d4ba8ee
fd0a54e
8bd83f9
343b47e
c298bd0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,33 @@ | ||
from typing import TYPE_CHECKING, Generic, Hashable, Iterable, Optional, Union, cast | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Generic, | ||
Hashable, | ||
Iterable, | ||
Literal, | ||
Optional, | ||
Sequence, | ||
Union, | ||
cast, | ||
) | ||
|
||
import numpy as np | ||
|
||
from . import duck_array_ops | ||
from .computation import dot | ||
from . import duck_array_ops, utils | ||
from .alignment import align | ||
from .computation import apply_ufunc, dot | ||
from .pycompat import is_duck_dask_array | ||
from .types import T_Xarray | ||
|
||
# Weighted quantile methods are a subset of the numpy supported quantile methods. | ||
QUANTILE_METHODS = Literal[ | ||
"linear", | ||
"interpolated_inverted_cdf", | ||
"hazen", | ||
"weibull", | ||
"median_unbiased", | ||
"normal_unbiased", | ||
] | ||
|
||
_WEIGHTED_REDUCE_DOCSTRING_TEMPLATE = """ | ||
Reduce this {cls}'s data by a weighted ``{fcn}`` along some dimension(s). | ||
|
||
|
@@ -54,6 +75,61 @@ | |
New {cls} object with the sum of the weights over the given dimension. | ||
""" | ||
|
||
_WEIGHTED_QUANTILE_DOCSTRING_TEMPLATE = """ | ||
Apply a a weighted ``quantile`` to this {cls}'s data along some dimension(s). | ||
dcherian marked this conversation as resolved.
Show resolved
Hide resolved
huard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
Weights are interpreted as *sampling weights* (or probability weights) and | ||
describe how a sample is scaled to the whole population [1]_. Although there are | ||
huard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
other possible interpretations for weights, *precision weights* describing the | ||
precision of observations, or *frequency weights* counting the number of identical | ||
observations, they are not implemented here. | ||
huard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
For compatibility with NumPy's non-weighted ``quantile`` (which is used by | ||
``DataArray.quantile`` and ``Dataset.quantile``), the only interpolation | ||
method supported by this weighted version corresponds to the default "linear" | ||
option of ``numpy.quantile``. This is "Type 7" option, described in Hyndman | ||
and Fan (1996) [2]_. The implementation is largely inspired from A. Akinshin's [3]_. | ||
huard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
Parameters | ||
---------- | ||
q : float or sequence of float | ||
Quantile to compute, which must be between 0 and 1 inclusive. | ||
dim : str or sequence of str, optional | ||
Dimension(s) over which to apply the weighted ``quantile``. | ||
skipna : bool, optional | ||
If True, skip missing values (as marked by NaN). By default, only | ||
skips missing values for float dtypes; other dtypes either do not | ||
have a sentinel missing value (int) or skipna=True has not been | ||
implemented (object, datetime64 or timedelta64). | ||
keep_attrs : bool, optional | ||
If True, the attributes (``attrs``) will be copied from the original | ||
object to the new one. If False (default), the new object will be | ||
returned without attributes. | ||
|
||
Returns | ||
------- | ||
quantiles : {cls} | ||
New {cls} object with weighted ``quantile`` applied to its data and | ||
the indicated dimension(s) removed. | ||
|
||
See Also | ||
-------- | ||
numpy.nanquantile, pandas.Series.quantile, Dataset.quantile | ||
DataArray.quantile | ||
cjauvin marked this conversation as resolved.
Show resolved
Hide resolved
huard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
Notes | ||
----- | ||
Returns NaN if the ``weights`` sum to 0.0 along the reduced | ||
dimension(s). | ||
|
||
References | ||
---------- | ||
.. [1] https://notstatschat.rbind.io/2020/08/04/weights-in-statistics/ | ||
.. [2] Hyndman, R. J. & Fan, Y. (1996). Sample Quantiles in Statistical Packages. | ||
The American Statistician, 50(4), 361–365. https://doi.org/10.2307/2684934 | ||
.. [3] https://aakinshin.net/posts/weighted-quantiles | ||
""" | ||
|
||
|
||
if TYPE_CHECKING: | ||
from .dataarray import DataArray | ||
|
@@ -239,6 +315,123 @@ def _weighted_std( | |
|
||
return cast("DataArray", np.sqrt(self._weighted_var(da, dim, skipna))) | ||
|
||
def _weighted_quantile( | ||
self, | ||
da: "DataArray", | ||
q: Union[float, Iterable[float]], | ||
dim: Union[Hashable, Iterable[Hashable]] = None, | ||
skipna: Optional[bool] = None, | ||
keep_attrs: Optional[bool] = None, | ||
) -> "DataArray": | ||
"""Apply a weighted ``quantile`` to a DataArray along some dimension(s).""" | ||
|
||
def _get_h(n: float, q: np.ndarray, method: QUANTILE_METHODS) -> np.ndarray: | ||
"""Return the interpolation parameter.""" | ||
# Note that options are not yet exposed in the public API. | ||
if method == "linear": | ||
h = (n - 1) * q + 1 | ||
elif method == "interpolated_inverted_cdf": | ||
h = n * q | ||
elif method == "hazen": | ||
h = n * q + 0.5 | ||
elif method == "weibull": | ||
h = (n + 1) * q | ||
elif method == "median_unbiased": | ||
h = (n + 1 / 3) * q + 1 / 3 | ||
elif method == "normal_unbiased": | ||
h = (n + 1 / 4) * q + 3 / 8 | ||
else: | ||
raise ValueError(f"Invalid method: {method}.") | ||
return h.clip(1, n) | ||
|
||
def _weighted_quantile_1d( | ||
data: np.ndarray, | ||
weights: np.ndarray, | ||
q: np.ndarray, | ||
skipna: bool, | ||
method: QUANTILE_METHODS = "linear", | ||
) -> np.ndarray: | ||
# This algorithm has been adapted from: | ||
# https://aakinshin.net/posts/weighted-quantiles/#reference-implementation | ||
is_nan = np.isnan(data) | ||
if skipna: | ||
# Remove nans from data and weights | ||
not_nan = ~is_nan | ||
data = data[not_nan] | ||
weights = weights[not_nan] | ||
elif is_nan.any(): | ||
# Return nan if data contains any nan | ||
return np.full(q.size, np.nan) | ||
|
||
# Filter out data (and weights) associated with zero weights, which also flattens them | ||
nonzero_weights = weights != 0 | ||
data = data[nonzero_weights] | ||
weights = weights[nonzero_weights] | ||
n = data.size | ||
|
||
if n == 0: | ||
# Possibly empty after nan or zero weight filtering above | ||
return np.full(q.size, np.nan) | ||
|
||
# Kish's effective sample size | ||
nw = weights.sum() ** 2 / (weights**2).sum() | ||
|
||
# Sort data and weights | ||
sorter = np.argsort(data) | ||
data = data[sorter] | ||
weights = weights[sorter] | ||
|
||
# Normalize and sum the weights | ||
weights = weights / weights.sum() | ||
weights_cum = np.append(0, weights.cumsum()) | ||
|
||
# Vectorize the computation for q | ||
q = np.atleast_2d(q).T | ||
h = _get_h(nw, q, method) | ||
u = np.maximum((h - 1) / nw, np.minimum(h / nw, weights_cum)) | ||
v = u * nw - h + 1 | ||
w = np.diff(v) | ||
return (data * w).sum(axis=1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would this section merit some comments on what's happening here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added some comments, but it's still abstract. |
||
|
||
if da.shape != self.weights.shape: | ||
raise ValueError("da and weights must have the same shape") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had to explicitly call align to make sure coordinates for da and weights match. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still disagree with this - import numpy as np
import xarray as xr
da = xr.DataArray(np.arange(6).reshape(3, 2))
w = xr.DataArray([1, 1, 1])
da.weighted(w).quantile(0.5) Check also: da = xr.DataArray(np.arange(6).reshape(3, 2), coords={"dim_0": [1, 2, 3]})
w = xr.DataArray([1, 1, 1, 100_000], coords={"dim_0": [0, 1, 2, 3]})
da.weighted(w).mean() or if you absolutely disagree with the second example do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree with the principle, but in practice I'm having trouble getting it to work. Your first example doesn't work out of the box. I get ValueError: operand to apply_ufunc has required core dimensions ['dim_0', 'dim_1'], but some of these dimensions are absent on an input variable: ['dim_1'] |
||
|
||
q = np.atleast_1d(np.asarray(q, dtype=np.float64)) | ||
|
||
if q.ndim > 1: | ||
raise ValueError("q must be a scalar or 1d") | ||
|
||
if np.any((q < 0) | (q > 1)): | ||
raise ValueError("q values must be between 0 and 1") | ||
|
||
if dim is None: | ||
dim = da.dims | ||
|
||
if utils.is_scalar(dim): | ||
dim = [dim] | ||
|
||
dim = cast(Sequence, dim) | ||
|
||
da, w = align(da, self.weights) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this before the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could have the same shapes but different coordinates. |
||
result = apply_ufunc( | ||
_weighted_quantile_1d, | ||
da, | ||
w, | ||
input_core_dims=[dim, dim], | ||
output_core_dims=[["quantile"]], | ||
output_dtypes=[np.float64], | ||
join="override", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why don't you use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, this seems to work. |
||
dask_gufunc_kwargs=dict(output_sizes={"quantile": len(q)}), | ||
dask="parallelized", | ||
vectorize=True, | ||
kwargs={"q": q, "skipna": skipna}, | ||
) | ||
|
||
result = result.transpose("quantile", ...) | ||
result = result.assign_coords(quantile=q).squeeze() | ||
|
||
return result | ||
|
||
def _implementation(self, func, dim, **kwargs): | ||
|
||
raise NotImplementedError("Use `Dataset.weighted` or `DataArray.weighted`") | ||
|
@@ -308,6 +501,19 @@ def std( | |
self._weighted_std, dim=dim, skipna=skipna, keep_attrs=keep_attrs | ||
) | ||
|
||
def quantile( | ||
self, | ||
q: Union[float, Iterable[float]], | ||
*, | ||
dim: Optional[Union[Hashable, Iterable[Hashable]]] = None, | ||
skipna: Optional[bool] = None, | ||
keep_attrs: Optional[bool] = None, | ||
) -> T_Xarray: | ||
|
||
return self._implementation( | ||
self._weighted_quantile, q=q, dim=dim, skipna=skipna, keep_attrs=keep_attrs | ||
) | ||
|
||
def __repr__(self): | ||
"""provide a nice str repr of our Weighted object""" | ||
|
||
|
@@ -358,6 +564,8 @@ def _inject_docstring(cls, cls_name): | |
cls=cls_name, fcn="std", on_zero="NaN" | ||
) | ||
|
||
cls.quantile.__doc__ = _WEIGHTED_QUANTILE_DOCSTRING_TEMPLATE.format(cls=cls_name) | ||
|
||
|
||
_inject_docstring(DataArrayWeighted, "DataArray") | ||
_inject_docstring(DatasetWeighted, "Dataset") |
Uh oh!
There was an error while loading. Please reload this page.