dataset query: test backends, engines, parsers; add docstring

alimanfoo · alimanfoo · commit a41e80539310 · 2021-03-12T18:13:48.000Z
diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml
@@ -22,6 +22,7 @@ dependencies:
   - nc-time-axis
   - netcdf4
   - numba
+  - numexpr
   - numpy
   - pandas
   - pint
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -6988,7 +6988,52 @@ def query(
         missing_dims: str = "raise",
         **queries_kwargs: Any,
     ) -> "Dataset":
-        """TODO docstring"""
+        """Return a new dataset with each array indexed along the specified
+        dimension(s), where the indexers are given as strings containing
+        Python expressions to be evaluated against the data variables in the
+        dataset.
+
+        Parameters
+        ----------
+        queries : dict, optional
+            A dict with keys matching dimensions and values given by strings
+            containing Python expressions to be evaluated against the data variables
+            in the dataset. The expressions will be evaluated using the pandas
+            eval() function, and can contain any valid Python expressions but cannot
+            contain any Python statements.
+        parser : {"pandas", "python"}, default: "pandas"
+            The parser to use to construct the syntax tree from the expression.
+            The default of 'pandas' parses code slightly differently than standard
+            Python. Alternatively, you can parse an expression using the 'python'
+            parser to retain strict Python semantics.
+        engine : {"python", "numexpr", None}, default: None
+            The engine used to evaluate the expression. Supported engines are:
+            - None: tries to use numexpr, falls back to python
+            - "numexpr": evaluates expressions using numexpr
+            - "python": performs operations as if you had eval'd in top-level Python
+        missing_dims : {"raise", "warn", "ignore"}, default: "raise"
+            What to do if dimensions that should be selected from are not present in the
+            Dataset:
+            - "raise": raise an exception
+            - "warn": raise a warning, and ignore the missing dimensions
+            - "ignore": ignore the missing dimensions
+        **queries_kwargs : {dim: query, ...}, optional
+            The keyword arguments form of ``queries``.
+            One of queries or queries_kwargs must be provided.
+
+        Returns
+        -------
+        obj : Dataset
+            A new Dataset with the same contents as this dataset, except each
+            array and dimension is indexed by the results of the appropriate
+            queries.
+
+        See Also
+        --------
+        Dataset.isel
+        pandas.eval
+
+        """
 
         # allow queries to be given either as a dict or as kwargs
         queries = either_dict_or_kwargs(queries, queries_kwargs, "query")
@@ -6998,16 +7043,13 @@ def query(
             if not isinstance(expr, str):
                 msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given"
                 raise ValueError(msg)
-            # TODO check missing dims here, or delegate to isel?
 
         # evaluate the queries to create the indexers
         indexers = {
             dim: pd.eval(expr, resolvers=[self], parser=parser, engine=engine)
             for dim, expr in queries.items()
         }
 
-        # TODO any validation of indexers? Or just let isel try to handle it?
-
         # apply the selection
         return self.isel(indexers, missing_dims=missing_dims)
 
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -5807,62 +5807,82 @@ def test_astype_attrs(self):
         assert not data.astype(float, keep_attrs=False).attrs
         assert not data.astype(float, keep_attrs=False).var1.attrs
 
-    def test_query_single_dim(self):
-        """Test querying a single dimension."""
+    @pytest.mark.parametrize("parser", ["pandas", "python"])
+    @pytest.mark.parametrize("engine", ["python", "numexpr", None])
+    @pytest.mark.parametrize("backend", ["numpy", "dask"])
+    def test_query(self, backend, engine, parser):
+        """Test querying a dataset."""
 
         # setup test data
         np.random.seed(42)
         a = np.arange(0, 10, 1)
         b = np.random.randint(0, 100, size=10)
         c = np.linspace(0, 1, 20)
         d = np.arange(0, 200).reshape(10, 20)
-        ds = Dataset(
-            {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)}
-        )
+        if backend == "numpy":
+            ds = Dataset(
+                {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)}
+            )
+        elif backend == "dask":
+            ds = Dataset(
+                {
+                    "a": ("x", da.from_array(a, chunks=3)),
+                    "b": ("x", da.from_array(b, chunks=3)),
+                    "c": ("y", da.from_array(c, chunks=7)),
+                    "d": (("x", "y"), da.from_array(d, chunks=(3, 7))),
+                }
+            )
 
         # query single dim, single variable
-        actual = ds.query(x="a > 5")
+        actual = ds.query(x="a > 5", engine=engine, parser=parser)
         expect = ds.isel(x=(a > 5))
         assert_identical(expect, actual)
 
         # query single dim, single variable, via dict
-        actual = ds.query(dict(x="a > 5"))
+        actual = ds.query(dict(x="a > 5"), engine=engine, parser=parser)
         expect = ds.isel(dict(x=(a > 5)))
         assert_identical(expect, actual)
 
         # query single dim, single variable
-        actual = ds.query(x="b > 50")
+        actual = ds.query(x="b > 50", engine=engine, parser=parser)
         expect = ds.isel(x=(b > 50))
         assert_identical(expect, actual)
 
         # query single dim, single variable
-        actual = ds.query(y="c < .5")
+        actual = ds.query(y="c < .5", engine=engine, parser=parser)
         expect = ds.isel(y=(c < 0.5))
         assert_identical(expect, actual)
 
         # query single dim, multiple variables
-        actual = ds.query(x="(a > 5) & (b > 50)")
+        actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser)
         expect = ds.isel(x=((a > 5) & (b > 50)))
         assert_identical(expect, actual)
 
         # support pandas query parser
-        actual = ds.query(x="(a > 5) and (b > 50)")
-        expect = ds.isel(x=((a > 5) & (b > 50)))
-        assert_identical(expect, actual)
+        if parser == "pandas":
+            actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser)
+            expect = ds.isel(x=((a > 5) & (b > 50)))
+            assert_identical(expect, actual)
 
         # query multiple dims via kwargs
-        actual = ds.query(x="a > 5", y="c < .5")
+        actual = ds.query(x="a > 5", y="c < .5", engine=engine, parser=parser)
         expect = ds.isel(x=(a > 5), y=(c < 0.5))
         assert_identical(expect, actual)
 
         # query multiple dims via dict
-        actual = ds.query(dict(x="a > 5", y="c < .5"))
+        actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser)
         expect = ds.isel(dict(x=(a > 5), y=(c < 0.5)))
         assert_identical(expect, actual)
 
-        # TODO test error handling
-
-        # TODO test dask data variables
+        # test error handling
+        with pytest.raises(ValueError):
+            ds.query("a > 5")  # must be dict
+        with pytest.raises(IndexError):
+            ds.query(y="a > 5")  # wrong length dimension
+        with pytest.raises(IndexError):
+            ds.query(x="c < .5")  # wrong length dimension
+        with pytest.raises(IndexError):
+            ds.query(x="d > 100")  # wrong number of dimensions
 
 
 # Py.test tests

-Original file line number
+Diff line change
   - nc-time-axis
   - netcdf4
   - numba
 +  - numexpr
   - numpy
   - pandas
   - pint