Skip to content

Commit 37fe544

Browse files
alimanfoo and max-sixty
authored
Adds Dataset.query() method, analogous to pandas DataFrame.query() (#4984)
* initial work on Dataset.query * dataset query: test backends, engines, parsers; add docstring * add error test * unfortunate typo * test three dims * refine tests * fix error message Co-authored-by: Maximilian Roos <[email protected]> * add requires decorators * revert change, should be func name * improve Dataset.query tests * add DataArray.query * add query to API docs * add query to whats new * fix black, mypy * refine test parameterisation and requirements Co-authored-by: Maximilian Roos <[email protected]>
1 parent 14b288b commit 37fe544

File tree

8 files changed

+346
-0
lines changed

8 files changed

+346
-0
lines changed

ci/requirements/environment.yml

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies:
2222
- nc-time-axis
2323
- netcdf4
2424
- numba
25+
- numexpr
2526
- numpy
2627
- pandas
2728
- pint

doc/api.rst

+2
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ Indexing
138138
Dataset.set_index
139139
Dataset.reset_index
140140
Dataset.reorder_levels
141+
Dataset.query
141142

142143
Missing value handling
143144
----------------------
@@ -321,6 +322,7 @@ Indexing
321322
DataArray.set_index
322323
DataArray.reset_index
323324
DataArray.reorder_levels
325+
DataArray.query
324326

325327
Missing value handling
326328
----------------------

doc/whats-new.rst

+4
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ v0.17.1 (unreleased)
2222

2323
New Features
2424
~~~~~~~~~~~~
25+
26+
- Add :py:meth:`Dataset.query` and :py:meth:`DataArray.query` which enable indexing
27+
of datasets and data arrays by evaluating query expressions against the values of the
28+
data variables (:pull:`4984`). By `Alistair Miles <https://github.com/alimanfoo>`_.
2529
- Allow passing ``combine_attrs`` to :py:meth:`Dataset.merge` (:pull:`4895`).
2630
By `Justus Magin <https://github.com/keewis>`_.
2731
- Support for `dask.graph_manipulation

xarray/core/dataarray.py

+64
Original file line numberDiff line numberDiff line change
@@ -4354,6 +4354,70 @@ def argmax(
43544354
else:
43554355
return self._replace_maybe_drop_dims(result)
43564356

4357+
def query(
    self,
    queries: Mapping[Hashable, Any] = None,
    parser: str = "pandas",
    engine: str = None,
    missing_dims: str = "raise",
    **queries_kwargs: Any,
) -> "DataArray":
    """Select along dimension(s) using string expressions as indexers.

    Each query is a string containing a Python expression; the result of
    evaluating it is used as the indexer for the corresponding dimension.
    The array is converted to a dataset, the selection is delegated to
    :py:meth:`Dataset.query`, and the variable named ``self.name`` is
    returned.

    Parameters
    ----------
    queries : dict, optional
        A dict with keys matching dimensions and values given by strings
        containing Python expressions to be evaluated against the values
        in the array. The expressions are evaluated using the pandas
        eval() function, and can contain any valid Python expression but
        cannot contain Python statements.
    parser : {"pandas", "python"}, default: "pandas"
        The parser used to construct the syntax tree from the expression.
        The default of "pandas" parses code slightly differently from
        standard Python; use "python" to retain strict Python semantics.
    engine : {"python", "numexpr", None}, default: None
        The engine used to evaluate the expression. Supported engines are:

        - None: tries to use numexpr, falls back to python
        - "numexpr": evaluates expressions using numexpr
        - "python": performs operations as if evaluated in top level python
    missing_dims : {"raise", "warn", "ignore"}, default: "raise"
        What to do if dimensions that should be selected from are not
        present in the DataArray:

        - "raise": raise an exception
        - "warn": raise a warning, and ignore the missing dimensions
        - "ignore": ignore the missing dimensions
    **queries_kwargs : {dim: query, ...}, optional
        The keyword arguments form of ``queries``.
        One of queries or queries_kwargs must be provided.

    Returns
    -------
    obj : DataArray
        A new DataArray with the same contents as this array, indexed by
        the results of the appropriate queries.

    See Also
    --------
    DataArray.isel
    Dataset.query
    pandas.eval
    """
    # Run the query on a dataset view of this array; every option is
    # forwarded unchanged so both entry points behave the same way.
    selected = self._to_dataset_whole(shallow_copy=True).query(
        queries=queries,
        parser=parser,
        engine=engine,
        missing_dims=missing_dims,
        **queries_kwargs,
    )

    # NOTE(review): the result is looked up by ``self.name``, so this
    # presumably requires a named array -- confirm against
    # ``_to_dataset_whole``.
    return selected[self.name]
4420+
43574421
# this needs to be at the end, or mypy will confuse with `str`
43584422
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
43594423
str = utils.UncachedAccessor(StringAccessor)

xarray/core/dataset.py

+73
Original file line numberDiff line numberDiff line change
@@ -7001,5 +7001,78 @@ def argmax(self, dim=None, **kwargs):
70017001
"Dataset.argmin() with a sequence or ... for dim"
70027002
)
70037003

7004+
def query(
    self,
    queries: Mapping[Hashable, Any] = None,
    parser: str = "pandas",
    engine: str = None,
    missing_dims: str = "raise",
    **queries_kwargs: Any,
) -> "Dataset":
    """Select along dimension(s) using string expressions as indexers.

    Each query is a string containing a Python expression. It is evaluated
    with :py:func:`pandas.eval`, with this dataset passed as the resolver
    for names in the expression, and the result is used as the indexer for
    the corresponding dimension in :py:meth:`Dataset.isel`.

    Parameters
    ----------
    queries : dict, optional
        A dict with keys matching dimensions and values given by strings
        containing Python expressions to be evaluated against the data
        variables in the dataset. The expressions can contain any valid
        Python expression but cannot contain Python statements.
    parser : {"pandas", "python"}, default: "pandas"
        The parser used to construct the syntax tree from the expression.
        The default of "pandas" parses code slightly differently from
        standard Python; use "python" to retain strict Python semantics.
    engine : {"python", "numexpr", None}, default: None
        The engine used to evaluate the expression. Supported engines are:

        - None: tries to use numexpr, falls back to python
        - "numexpr": evaluates expressions using numexpr
        - "python": performs operations as if evaluated in top level python
    missing_dims : {"raise", "warn", "ignore"}, default: "raise"
        What to do if dimensions that should be selected from are not
        present in the Dataset:

        - "raise": raise an exception
        - "warn": raise a warning, and ignore the missing dimensions
        - "ignore": ignore the missing dimensions
    **queries_kwargs : {dim: query, ...}, optional
        The keyword arguments form of ``queries``.
        One of queries or queries_kwargs must be provided.

    Returns
    -------
    obj : Dataset
        A new Dataset with the same contents as this dataset, except each
        array and dimension is indexed by the results of the appropriate
        queries.

    Raises
    ------
    ValueError
        If the value given for any dimension is not a string.

    See Also
    --------
    Dataset.isel
    pandas.eval
    """
    # Accept the queries either as one mapping or as keyword arguments.
    dim_exprs = either_dict_or_kwargs(queries, queries_kwargs, "query")

    # Reject non-string queries up front, before anything is evaluated.
    offender = next(
        (
            (dim, expr)
            for dim, expr in dim_exprs.items()
            if not isinstance(expr, str)
        ),
        None,
    )
    if offender is not None:
        dim, expr = offender
        raise ValueError(
            f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given"
        )

    # Evaluate each expression, resolving names against this dataset.
    indexers = {}
    for dim, expr in dim_exprs.items():
        indexers[dim] = pd.eval(
            expr, resolvers=[self], parser=parser, engine=engine
        )

    # Positional selection with the evaluated indexers.
    return self.isel(indexers, missing_dims=missing_dims)
7076+
70047077

70057078
ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)

xarray/tests/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ def LooseVersion(vstring):
8383
has_cartopy, requires_cartopy = _importorskip("cartopy")
8484
# Need Pint 0.15 for __dask_tokenize__ tests for Quantity wrapped Dask Arrays
8585
has_pint_0_15, requires_pint_0_15 = _importorskip("pint", minversion="0.15")
86+
has_numexpr, requires_numexpr = _importorskip("numexpr")
8687

8788
# some special cases
8889
has_scipy_or_netCDF4 = has_scipy or has_netCDF4

xarray/tests/test_dataarray.py

+70
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88
import pandas as pd
99
import pytest
10+
from pandas.core.computation.ops import UndefinedVariableError
1011
from pandas.tseries.frequencies import to_offset
1112

1213
import xarray as xr
@@ -39,6 +40,7 @@
3940
requires_dask,
4041
requires_iris,
4142
requires_numbagg,
43+
requires_numexpr,
4244
requires_scipy,
4345
requires_sparse,
4446
source_ndarray,
@@ -4620,6 +4622,74 @@ def test_pad_reflect(self, mode, reflect_type):
46204622
assert actual.shape == (7, 4, 9)
46214623
assert_identical(actual, expected)
46224624

4625+
@pytest.mark.parametrize("parser", ["pandas", "python"])
@pytest.mark.parametrize(
    "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])]
)
@pytest.mark.parametrize(
    "backend", ["numpy", pytest.param("dask", marks=[requires_dask])]
)
def test_query(self, backend, engine, parser):
    """Test querying a data array.

    Runs for every combination of parser, engine and array backend, and
    checks ``DataArray.query`` against the equivalent boolean-mask
    ``isel`` call for integer, float and string data, followed by the
    error cases.
    """

    # setup test data; the seed is fixed, so the order of the random draws
    # below determines the values of ``b`` and ``d``
    np.random.seed(42)
    a = np.arange(0, 10, 1)
    b = np.random.randint(0, 100, size=10)
    c = np.linspace(0, 1, 20)
    d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype(
        object
    )
    if backend == "numpy":
        # plain in-memory arrays; each one is named so that the query
        # expressions can refer to it
        aa = DataArray(data=a, dims=["x"], name="a")
        bb = DataArray(data=b, dims=["x"], name="b")
        cc = DataArray(data=c, dims=["y"], name="c")
        dd = DataArray(data=d, dims=["z"], name="d")

    elif backend == "dask":
        import dask.array as da

        # same data, wrapped as chunked dask arrays
        aa = DataArray(data=da.from_array(a, chunks=3), dims=["x"], name="a")
        bb = DataArray(data=da.from_array(b, chunks=3), dims=["x"], name="b")
        cc = DataArray(data=da.from_array(c, chunks=7), dims=["y"], name="c")
        dd = DataArray(data=da.from_array(d, chunks=12), dims=["z"], name="d")

    # query single dim, single variable, via keyword argument
    actual = aa.query(x="a > 5", engine=engine, parser=parser)
    expect = aa.isel(x=(a > 5))
    assert_identical(expect, actual)

    # query single dim, single variable, via dict
    actual = aa.query(dict(x="a > 5"), engine=engine, parser=parser)
    expect = aa.isel(dict(x=(a > 5)))
    assert_identical(expect, actual)

    # query single dim, single variable (random integer data)
    actual = bb.query(x="b > 50", engine=engine, parser=parser)
    expect = bb.isel(x=(b > 50))
    assert_identical(expect, actual)

    # query single dim, single variable (float data, different dimension)
    actual = cc.query(y="c < .5", engine=engine, parser=parser)
    expect = cc.isel(y=(c < 0.5))
    assert_identical(expect, actual)

    # query single dim, single string variable
    if parser == "pandas":
        # N.B., this query currently only works with the pandas parser
        # xref https://github.com/pandas-dev/pandas/issues/40436
        actual = dd.query(z='d == "bar"', engine=engine, parser=parser)
        expect = dd.isel(z=(d == "bar"))
        assert_identical(expect, actual)

    # test error handling
    with pytest.raises(ValueError):
        aa.query("a > 5")  # must be dict or kwargs
    with pytest.raises(ValueError):
        aa.query(x=(a > 5))  # must be query string
    with pytest.raises(UndefinedVariableError):
        aa.query(x="spam > 50")  # name not present
4692+
46234693

46244694
class TestReduce:
46254695
@pytest.fixture(autouse=True)

0 commit comments

Comments
 (0)