Skip to content

Commit a41e805

Browse files
committed
dataset query: test backends, engines, parsers; add docstring
1 parent 8b542f8 commit a41e805

File tree

3 files changed

+85
-22
lines changed

3 files changed

+85
-22
lines changed

ci/requirements/environment.yml

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies:
2222
- nc-time-axis
2323
- netcdf4
2424
- numba
25+
- numexpr
2526
- numpy
2627
- pandas
2728
- pint

xarray/core/dataset.py

+46-4
Original file line numberDiff line numberDiff line change
@@ -6988,7 +6988,52 @@ def query(
69886988
missing_dims: str = "raise",
69896989
**queries_kwargs: Any,
69906990
) -> "Dataset":
6991-
"""TODO docstring"""
6991+
"""Return a new dataset with each array indexed along the specified
6992+
dimension(s), where the indexers are given as strings containing
6993+
Python expressions to be evaluated against the data variables in the
6994+
dataset.
6995+
6996+
Parameters
6997+
----------
6998+
queries : dict, optional
6999+
A dict with keys matching dimensions and values given by strings
7000+
containing Python expressions to be evaluated against the data variables
7001+
in the dataset. The expressions will be evaluated using the pandas
7002+
eval() function, and can contain any valid Python expressions but cannot
7003+
contain any Python statements.
7004+
parser : {"pandas", "python"}, default: "pandas"
7005+
The parser to use to construct the syntax tree from the expression.
7006+
The default of 'pandas' parses code slightly differently from standard
7007+
Python. Alternatively, you can parse an expression using the 'python'
7008+
parser to retain strict Python semantics.
7009+
engine : {"python", "numexpr", None}, default: None
7010+
The engine used to evaluate the expression. Supported engines are:
7011+
- None: tries to use numexpr, falls back to python
7012+
- "numexpr": evaluates expressions using numexpr
7013+
- "python": performs operations as if you had eval'd in top-level Python
7014+
missing_dims : {"raise", "warn", "ignore"}, default: "raise"
7015+
What to do if dimensions that should be selected from are not present in the
7016+
Dataset:
7017+
- "raise": raise an exception
7018+
- "warn": raise a warning, and ignore the missing dimensions
7019+
- "ignore": ignore the missing dimensions
7020+
**queries_kwargs : {dim: query, ...}, optional
7021+
The keyword arguments form of ``queries``.
7022+
One of queries or queries_kwargs must be provided.
7023+
7024+
Returns
7025+
-------
7026+
obj : Dataset
7027+
A new Dataset with the same contents as this dataset, except each
7028+
array and dimension is indexed by the results of the appropriate
7029+
queries.
7030+
7031+
See Also
7032+
--------
7033+
Dataset.isel
7034+
pandas.eval
7035+
7036+
"""
69927037

69937038
# allow queries to be given either as a dict or as kwargs
69947039
queries = either_dict_or_kwargs(queries, queries_kwargs, "query")
@@ -6998,16 +7043,13 @@ def query(
69987043
if not isinstance(expr, str):
69997044
msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given"
70007045
raise ValueError(msg)
7001-
# TODO check missing dims here, or delegate to isel?
70027046

70037047
# evaluate the queries to create the indexers
70047048
indexers = {
70057049
dim: pd.eval(expr, resolvers=[self], parser=parser, engine=engine)
70067050
for dim, expr in queries.items()
70077051
}
70087052

7009-
# TODO any validation of indexers? Or just let isel try to handle it?
7010-
70117053
# apply the selection
70127054
return self.isel(indexers, missing_dims=missing_dims)
70137055

xarray/tests/test_dataset.py

+38-18
Original file line numberDiff line numberDiff line change
@@ -5807,62 +5807,82 @@ def test_astype_attrs(self):
58075807
assert not data.astype(float, keep_attrs=False).attrs
58085808
assert not data.astype(float, keep_attrs=False).var1.attrs
58095809

5810-
def test_query_single_dim(self):
5811-
"""Test querying a single dimension."""
5810+
@pytest.mark.parametrize("parser", ["pandas", "python"])
5811+
@pytest.mark.parametrize("engine", ["python", "numexpr", None])
5812+
@pytest.mark.parametrize("backend", ["numpy", "dask"])
5813+
def test_query(self, backend, engine, parser):
5814+
"""Test querying a dataset."""
58125815

58135816
# setup test data
58145817
np.random.seed(42)
58155818
a = np.arange(0, 10, 1)
58165819
b = np.random.randint(0, 100, size=10)
58175820
c = np.linspace(0, 1, 20)
58185821
d = np.arange(0, 200).reshape(10, 20)
5819-
ds = Dataset(
5820-
{"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)}
5821-
)
5822+
if backend == "numpy":
5823+
ds = Dataset(
5824+
{"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)}
5825+
)
5826+
elif backend == "dask":
5827+
ds = Dataset(
5828+
{
5829+
"a": ("x", da.from_array(a, chunks=3)),
5830+
"b": ("x", da.from_array(b, chunks=3)),
5831+
"c": ("y", da.from_array(c, chunks=7)),
5832+
"d": (("x", "y"), da.from_array(d, chunks=(3, 7))),
5833+
}
5834+
)
58225835

58235836
# query single dim, single variable
5824-
actual = ds.query(x="a > 5")
5837+
actual = ds.query(x="a > 5", engine=engine, parser=parser)
58255838
expect = ds.isel(x=(a > 5))
58265839
assert_identical(expect, actual)
58275840

58285841
# query single dim, single variable, via dict
5829-
actual = ds.query(dict(x="a > 5"))
5842+
actual = ds.query(dict(x="a > 5"), engine=engine, parser=parser)
58305843
expect = ds.isel(dict(x=(a > 5)))
58315844
assert_identical(expect, actual)
58325845

58335846
# query single dim, single variable
5834-
actual = ds.query(x="b > 50")
5847+
actual = ds.query(x="b > 50", engine=engine, parser=parser)
58355848
expect = ds.isel(x=(b > 50))
58365849
assert_identical(expect, actual)
58375850

58385851
# query single dim, single variable
5839-
actual = ds.query(y="c < .5")
5852+
actual = ds.query(y="c < .5", engine=engine, parser=parser)
58405853
expect = ds.isel(y=(c < 0.5))
58415854
assert_identical(expect, actual)
58425855

58435856
# query single dim, multiple variables
5844-
actual = ds.query(x="(a > 5) & (b > 50)")
5857+
actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser)
58455858
expect = ds.isel(x=((a > 5) & (b > 50)))
58465859
assert_identical(expect, actual)
58475860

58485861
# support pandas query parser
5849-
actual = ds.query(x="(a > 5) and (b > 50)")
5850-
expect = ds.isel(x=((a > 5) & (b > 50)))
5851-
assert_identical(expect, actual)
5862+
if parser == "pandas":
5863+
actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser)
5864+
expect = ds.isel(x=((a > 5) & (b > 50)))
5865+
assert_identical(expect, actual)
58525866

58535867
# query multiple dims via kwargs
5854-
actual = ds.query(x="a > 5", y="c < .5")
5868+
actual = ds.query(x="a > 5", y="c < .5", engine=engine, parser=parser)
58555869
expect = ds.isel(x=(a > 5), y=(c < 0.5))
58565870
assert_identical(expect, actual)
58575871

58585872
# query multiple dims via dict
5859-
actual = ds.query(dict(x="a > 5", y="c < .5"))
5873+
actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser)
58605874
expect = ds.isel(dict(x=(a > 5), y=(c < 0.5)))
58615875
assert_identical(expect, actual)
58625876

5863-
# TODO test error handling
5864-
5865-
# TODO test dask data variables
5877+
# test error handling
5878+
with pytest.raises(ValueError):
5879+
ds.query("a > 5") # must be dict
5880+
with pytest.raises(IndexError):
5881+
ds.query(y="a > 5") # wrong length dimension
5882+
with pytest.raises(IndexError):
5883+
ds.query(x="c < .5") # wrong length dimension
5884+
with pytest.raises(IndexError):
5885+
ds.query(x="d > 100") # wrong number of dimensions
58665886

58675887

58685888
# Py.test tests

0 commit comments

Comments
 (0)