forked from pydata/xarray

Commit e0f227f

Merge remote-tracking branch 'upstream/master' into dataset/quiver
* upstream/master:
  speed up the repr for big MultiIndex objects (pydata#4846)
  dim -> coord in DataArray.integrate (pydata#3993)
  WIP: backend interface, now it uses subclassing (pydata#4836)
  weighted: small improvements (pydata#4818)
  Update related-projects.rst (pydata#4844)
  iris update doc url (pydata#4845)
  Faster unstacking (pydata#4746)
  Allow swap_dims to take kwargs (pydata#4841)
  Move skip ci instructions to contributing guide (pydata#4829)
  fix issues in drop_sel and drop_isel (pydata#4828)
  Bugfix in list_engine (pydata#4811)
  Add drop_isel (pydata#4819)
  Fix RST.
  Remove the references to `_file_obj` outside low level code paths, change to `_close` (pydata#4809)
2 parents (e795672 + 39048f9), commit e0f227f

34 files changed: +1065 −601 lines

.github/PULL_REQUEST_TEMPLATE.md (−8)

@@ -5,11 +5,3 @@
 - [ ] Passes `pre-commit run --all-files`
 - [ ] User visible changes (including notable bug fixes) are documented in `whats-new.rst`
 - [ ] New functions/methods are listed in `api.rst`
-
-
-<sub>
-  <h3>
-    Overriding CI behaviors
-  </h3>
-  By default, the upstream dev CI is disabled on pull request and push events. You can override this behavior per commit by adding a <tt>[test-upstream]</tt> tag to the first line of the commit message. For documentation-only commits, you can skip the CI per commit by adding a <tt>[skip-ci]</tt> tag to the first line of the commit message
-</sub>

asv_bench/benchmarks/repr.py (+18)

@@ -0,0 +1,18 @@
+import pandas as pd
+
+import xarray as xr
+
+
+class ReprMultiIndex:
+    def setup(self, key):
+        index = pd.MultiIndex.from_product(
+            [range(10000), range(10000)], names=("level_0", "level_1")
+        )
+        series = pd.Series(range(100000000), index=index)
+        self.da = xr.DataArray(series)
+
+    def time_repr(self):
+        repr(self.da)
+
+    def time_repr_html(self):
+        self.da._repr_html_()

asv_bench/benchmarks/unstacking.py (+10 −5)

@@ -7,18 +7,23 @@

 class Unstacking:
     def setup(self):
-        data = np.random.RandomState(0).randn(1, 1000, 500)
-        self.ds = xr.DataArray(data).stack(flat_dim=["dim_1", "dim_2"])
+        data = np.random.RandomState(0).randn(500, 1000)
+        self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
+        self.da_missing = self.da_full[:-1]
+        self.df_missing = self.da_missing.to_pandas()

     def time_unstack_fast(self):
-        self.ds.unstack("flat_dim")
+        self.da_full.unstack("flat_dim")

     def time_unstack_slow(self):
-        self.ds[:, ::-1].unstack("flat_dim")
+        self.da_missing.unstack("flat_dim")
+
+    def time_unstack_pandas_slow(self):
+        self.df_missing.unstack()


 class UnstackingDask(Unstacking):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
-        self.ds = self.ds.chunk({"flat_dim": 50})
+        self.da_full = self.da_full.chunk({"flat_dim": 50})

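Aside: a minimal sketch of the operation these benchmarks exercise, with made-up array shapes; only the stack/unstack pattern mirrors the benchmark above.

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.random.randn(500, 1000), dims=("a", "b"))

    # stack(flat_dim=[...]) collapses all dimensions into one MultiIndexed dim.
    stacked = da.stack(flat_dim=[...])

    # Dropping entries leaves holes in the MultiIndex, so unstacking has to
    # reindex and fill with NaN; this is the slow path that pydata#4746 speeds up.
    sparse = stacked[:-1]
    unstacked = sparse.unstack("flat_dim")
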
doc/api.rst (+2)

@@ -126,6 +126,7 @@ Indexing
    Dataset.isel
    Dataset.sel
    Dataset.drop_sel
+   Dataset.drop_isel
    Dataset.head
    Dataset.tail
    Dataset.thin
@@ -308,6 +309,7 @@ Indexing
    DataArray.isel
    DataArray.sel
    DataArray.drop_sel
+   DataArray.drop_isel
    DataArray.head
    DataArray.tail
    DataArray.thin

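The newly listed drop_isel methods are the positional counterpart to drop_sel. A small illustrative sketch (the dataset and names are invented for the example):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"var": ("x", np.arange(5))}, coords={"x": list("abcde")})

    # drop_sel removes entries by coordinate label; drop_isel by integer position.
    by_label = ds.drop_sel(x=["a", "b"])
    by_position = ds.drop_isel(x=[0, 1])

    assert by_label.identical(by_position)
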
doc/conf.py (+1 −1)

@@ -411,7 +411,7 @@
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
     "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
-    "iris": ("https://scitools.org.uk/iris/docs/latest", None),
+    "iris": ("https://scitools-iris.readthedocs.io/en/latest", None),
     "numpy": ("https://numpy.org/doc/stable", None),
     "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
     "numba": ("https://numba.pydata.org/numba-doc/latest", None),

doc/contributing.rst (+1)

@@ -836,6 +836,7 @@ PR checklist

 - Write new tests if needed. See `"Test-driven development/code writing" <https://xarray.pydata.org/en/stable/contributing.html#test-driven-development-code-writing>`_.
 - Test the code using `Pytest <http://doc.pytest.org/en/latest/>`_. Running all tests (type ``pytest`` in the root directory) takes a while, so feel free to only run the tests you think are needed based on your PR (example: ``pytest xarray/tests/test_dataarray.py``). CI will catch any failing tests.
+- By default, the upstream dev CI is disabled on pull request and push events. You can override this behavior per commit by adding a <tt>[test-upstream]</tt> tag to the first line of the commit message. For documentation-only commits, you can skip the CI per commit by adding a "[skip-ci]" tag to the first line of the commit message.

 - **Properly format your code** and verify that it passes the formatting guidelines set by `Black <https://black.readthedocs.io/en/stable/>`_ and `Flake8 <http://flake8.pycqa.org/en/latest/>`_. See `"Code formatting" <https://xarray.pydata.org/en/stable/contributing.html#code-formatting>`_. You can use `pre-commit <https://pre-commit.com/>`_ to run these automatically on each commit.

doc/faq.rst (+1 −1)

@@ -166,7 +166,7 @@ different approaches to handling metadata: Iris strictly interprets
 `CF conventions`_. Iris particularly shines at mapping, thanks to its
 integration with Cartopy_.

-.. _Iris: http://scitools.org.uk/iris/
+.. _Iris: https://scitools-iris.readthedocs.io/en/stable/
 .. _Cartopy: http://scitools.org.uk/cartopy/docs/latest/

 `UV-CDAT`__ is another Python library that implements in-memory netCDF-like

doc/related-projects.rst (+1)

@@ -15,6 +15,7 @@ Geosciences
 - `aospy <https://aospy.readthedocs.io>`_: Automated analysis and management of gridded climate data.
 - `climpred <https://climpred.readthedocs.io>`_: Analysis of ensemble forecast models for climate prediction.
 - `geocube <https://corteva.github.io/geocube>`_: Tool to convert geopandas vector data into rasterized xarray data.
+- `GeoWombat <https://github.com/jgrss/geowombat>`_: Utilities for analysis of remotely sensed and gridded raster data at scale (easily tame Landsat, Sentinel, Quickbird, and PlanetScope).
 - `infinite-diff <https://github.com/spencerahill/infinite-diff>`_: xarray-based finite-differencing, focused on gridded climate/meterology data
 - `marc_analysis <https://github.com/darothen/marc_analysis>`_: Analysis package for CESM/MARC experiments and output.
 - `MetPy <https://unidata.github.io/MetPy/dev/index.html>`_: A collection of tools in Python for reading, visualizing, and performing calculations with weather data.

doc/whats-new.rst (+22 −3)

@@ -17,7 +17,7 @@ What's New

 .. _whats-new.0.16.3:

-v0.16.3 (unreleased)
+v0.17.0 (unreleased)
 --------------------

 Breaking changes
@@ -39,16 +39,32 @@ Breaking changes
   always be set such that ``int64`` values can be used. In the past, no units
   finer than "seconds" were chosen, which would sometimes mean that ``float64``
   values were required, which would lead to inaccurate I/O round-trips.
-- remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull: `4725`).
-  By `Aureliana Barghini <https://github.com/aurghs>`_
+- remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull:`4725`).
+  By `Aureliana Barghini <https://github.com/aurghs>`_.
+
+Deprecations
+~~~~~~~~~~~~
+
+- ``dim`` argument to :py:meth:`DataArray.integrate` is being deprecated in
+  favour of a ``coord`` argument, for consistency with :py:meth:`Dataset.integrate`.
+  For now using ``dim`` issues a ``FutureWarning``. By `Tom Nicholas <https://github.com/TomNicholas>`_.


 New Features
 ~~~~~~~~~~~~
+- Significantly higher ``unstack`` performance on numpy-backed arrays which
+  contain missing values; 8x faster in our benchmark, and 2x faster than pandas.
+  (:pull:`4746`);
+  By `Maximilian Roos <https://github.com/max-sixty>`_.
+
 - Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables.
   By `Deepak Cherian <https://github.com/dcherian>`_
 - Add :py:meth:`Dataset.plot.quiver` for quiver plots with :py:class:`Dataset` variables.
   By `Deepak Cherian <https://github.com/dcherian>`_
+  By `Deepak Cherian <https://github.com/dcherian>`_.
+- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims
+  in the form of kwargs as well as a dict, like most similar methods.
+  By `Maximilian Roos <https://github.com/max-sixty>`_.

 Bug fixes
 ~~~~~~~~~
@@ -82,6 +98,7 @@ Bug fixes
 - Expand user directory paths (e.g. ``~/``) in :py:func:`open_mfdataset` and
   :py:meth:`Dataset.to_zarr` (:issue:`4783`, :pull:`4795`).
   By `Julien Seguinot <https://github.com/juseg>`_.
+- Add :py:meth:`Dataset.drop_isel` and :py:meth:`DataArray.drop_isel` (:issue:`4658`, :pull:`4819`). By `Daniel Mesejo <https://github.com/mesejo>`_.

 Documentation
 ~~~~~~~~~~~~~
@@ -110,6 +127,8 @@ Internal Changes
   By `Maximilian Roos <https://github.com/max-sixty>`_.
 - Speed up attribute style access (e.g. ``ds.somevar`` instead of ``ds["somevar"]``) and tab completion
   in ipython (:issue:`4741`, :pull:`4742`). By `Richard Kleijn <https://github.com/rhkleijn>`_.
+- Added the ``set_close`` method to ``Dataset`` and ``DataArray`` for backends to specify how to voluntarily release
+  all resources. (:pull:`4809`), By `Alessandro Amici <https://github.com/alexamici>`_.

 .. _whats-new.0.16.2:

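A quick sketch of two of the user-facing changes noted above; the array and coordinates are invented for illustration.

    import numpy as np
    import xarray as xr

    da = xr.DataArray(
        np.arange(4.0),
        dims="x",
        coords={"x": np.linspace(0.0, 3.0, 4), "y": ("x", [10, 20, 30, 40])},
    )

    # swap_dims now accepts keyword arguments as well as a dict (pydata#4841).
    swapped = da.swap_dims(x="y")  # same as da.swap_dims({"x": "y"})

    # DataArray.integrate now takes ``coord``, matching Dataset.integrate;
    # passing ``dim`` still works but emits a FutureWarning (pydata#3993).
    integral = da.integrate(coord="x")
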
xarray/backends/api.py (+9 −16)

@@ -522,7 +522,7 @@ def maybe_decode_store(store, chunks):

         else:
             ds2 = ds
-            ds2._file_obj = ds._file_obj
+            ds2.set_close(ds._close)
         return ds2

     filename_or_obj = _normalize_path(filename_or_obj)
@@ -701,7 +701,7 @@ def open_dataarray(
     else:
        (data_array,) = dataset.data_vars.values()

-    data_array._file_obj = dataset._file_obj
+    data_array.set_close(dataset._close)

     # Reset names if they were changed during saving
     # to ensure that we can 'roundtrip' perfectly
@@ -715,17 +715,6 @@
     return data_array


-class _MultiFileCloser:
-    __slots__ = ("file_objs",)
-
-    def __init__(self, file_objs):
-        self.file_objs = file_objs
-
-    def close(self):
-        for f in self.file_objs:
-            f.close()
-
-
 def open_mfdataset(
     paths,
     chunks=None,
@@ -918,14 +907,14 @@ def open_mfdataset(
         getattr_ = getattr

     datasets = [open_(p, **open_kwargs) for p in paths]
-    file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
+    closers = [getattr_(ds, "_close") for ds in datasets]
     if preprocess is not None:
         datasets = [preprocess(ds) for ds in datasets]

     if parallel:
         # calling compute here will return the datasets/file_objs lists,
         # the underlying datasets will still be stored as dask arrays
-        datasets, file_objs = dask.compute(datasets, file_objs)
+        datasets, closers = dask.compute(datasets, closers)

     # Combine all datasets, closing them in case of a ValueError
     try:
@@ -963,7 +952,11 @@ def open_mfdataset(
             ds.close()
         raise

-    combined._file_obj = _MultiFileCloser(file_objs)
+    def multi_file_closer():
+        for closer in closers:
+            closer()
+
+    combined.set_close(multi_file_closer)

     # read global attributes from the attrs_file or from the first dataset
     if attrs_file is not None:

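The open_mfdataset change above replaces the dedicated _MultiFileCloser class with a plain closure handed to set_close. A standalone sketch of that pattern, independent of xarray internals (names here are illustrative):

    import io
    from typing import Callable, List

    def make_multi_closer(closers: List[Callable[[], None]]) -> Callable[[], None]:
        """Bundle several zero-argument close callbacks into a single callback."""

        def multi_closer() -> None:
            for close in closers:
                close()

        return multi_closer

    # Each opened resource contributes its own closer; the combined object keeps
    # just one, mirroring combined.set_close(multi_file_closer) above.
    buffers = [io.StringIO("a"), io.StringIO("b")]
    close_all = make_multi_closer([buf.close for buf in buffers])
    close_all()
    assert all(buf.closed for buf in buffers)
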
xarray/backends/apiv2.py (+1 −1)

@@ -90,7 +90,7 @@ def _dataset_from_backend_dataset(
         **extra_tokens,
     )

-    ds._file_obj = backend_ds._file_obj
+    ds.set_close(backend_ds._close)

     # Ensure source filename always stored in dataset object (GH issue #2550)
     if "source" not in ds.encoding:

xarray/backends/cfgrib_.py (+68 −56)

@@ -5,9 +5,22 @@
 from ..core import indexing
 from ..core.utils import Frozen, FrozenDict, close_on_error
 from ..core.variable import Variable
-from .common import AbstractDataStore, BackendArray, BackendEntrypoint
+from .common import (
+    BACKEND_ENTRYPOINTS,
+    AbstractDataStore,
+    BackendArray,
+    BackendEntrypoint,
+)
 from .locks import SerializableLock, ensure_lock
-from .store import open_backend_dataset_store
+from .store import StoreBackendEntrypoint
+
+try:
+    import cfgrib
+
+    has_cfgrib = True
+except ModuleNotFoundError:
+    has_cfgrib = False
+

 # FIXME: Add a dedicated lock, even if ecCodes is supposed to be thread-safe
 # in most circumstances. See:
@@ -38,7 +51,6 @@ class CfGribDataStore(AbstractDataStore):
     """

     def __init__(self, filename, lock=None, **backend_kwargs):
-        import cfgrib

         if lock is None:
             lock = ECCODES_LOCK
@@ -74,58 +86,58 @@ def get_encoding(self):
         return encoding


-def guess_can_open_cfgrib(store_spec):
-    try:
-        _, ext = os.path.splitext(store_spec)
-    except TypeError:
-        return False
-    return ext in {".grib", ".grib2", ".grb", ".grb2"}
-
-
-def open_backend_dataset_cfgrib(
-    filename_or_obj,
-    *,
-    mask_and_scale=True,
-    decode_times=None,
-    concat_characters=None,
-    decode_coords=None,
-    drop_variables=None,
-    use_cftime=None,
-    decode_timedelta=None,
-    lock=None,
-    indexpath="{path}.{short_hash}.idx",
-    filter_by_keys={},
-    read_keys=[],
-    encode_cf=("parameter", "time", "geography", "vertical"),
-    squeeze=True,
-    time_dims=("time", "step"),
-):
-
-    store = CfGribDataStore(
+class CfgribfBackendEntrypoint(BackendEntrypoint):
+    def guess_can_open(self, store_spec):
+        try:
+            _, ext = os.path.splitext(store_spec)
+        except TypeError:
+            return False
+        return ext in {".grib", ".grib2", ".grb", ".grb2"}
+
+    def open_dataset(
+        self,
         filename_or_obj,
-        indexpath=indexpath,
-        filter_by_keys=filter_by_keys,
-        read_keys=read_keys,
-        encode_cf=encode_cf,
-        squeeze=squeeze,
-        time_dims=time_dims,
-        lock=lock,
-    )
-
-    with close_on_error(store):
-        ds = open_backend_dataset_store(
-            store,
-            mask_and_scale=mask_and_scale,
-            decode_times=decode_times,
-            concat_characters=concat_characters,
-            decode_coords=decode_coords,
-            drop_variables=drop_variables,
-            use_cftime=use_cftime,
-            decode_timedelta=decode_timedelta,
+        *,
+        mask_and_scale=True,
+        decode_times=None,
+        concat_characters=None,
+        decode_coords=None,
+        drop_variables=None,
+        use_cftime=None,
+        decode_timedelta=None,
+        lock=None,
+        indexpath="{path}.{short_hash}.idx",
+        filter_by_keys={},
+        read_keys=[],
+        encode_cf=("parameter", "time", "geography", "vertical"),
+        squeeze=True,
+        time_dims=("time", "step"),
+    ):
+
+        store = CfGribDataStore(
+            filename_or_obj,
+            indexpath=indexpath,
+            filter_by_keys=filter_by_keys,
+            read_keys=read_keys,
+            encode_cf=encode_cf,
+            squeeze=squeeze,
+            time_dims=time_dims,
+            lock=lock,
         )
-        return ds
-
-
-cfgrib_backend = BackendEntrypoint(
-    open_dataset=open_backend_dataset_cfgrib, guess_can_open=guess_can_open_cfgrib
-)
+        store_entrypoint = StoreBackendEntrypoint()
+        with close_on_error(store):
+            ds = store_entrypoint.open_dataset(
+                store,
+                mask_and_scale=mask_and_scale,
+                decode_times=decode_times,
+                concat_characters=concat_characters,
+                decode_coords=decode_coords,
+                drop_variables=drop_variables,
+                use_cftime=use_cftime,
+                decode_timedelta=decode_timedelta,
+            )
+        return ds
+
+
+if has_cfgrib:
+    BACKEND_ENTRYPOINTS["cfgrib"] = CfgribfBackendEntrypoint

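The cfgrib changes show the new subclassing-based backend interface from pydata#4836: a backend subclasses BackendEntrypoint, implements guess_can_open and open_dataset, and registers itself in BACKEND_ENTRYPOINTS. A minimal sketch of a hypothetical third-party backend following this commit's internal interface (the ".myf" extension and the fabricated dataset are invented for illustration):

    import os

    import xarray as xr
    from xarray.backends.common import BACKEND_ENTRYPOINTS, BackendEntrypoint


    class MyFormatBackendEntrypoint(BackendEntrypoint):
        def guess_can_open(self, store_spec):
            # Claim files by extension, as CfgribfBackendEntrypoint does above.
            try:
                _, ext = os.path.splitext(store_spec)
            except TypeError:
                return False
            return ext == ".myf"

        def open_dataset(self, filename_or_obj, *, drop_variables=None):
            # A real backend would build variables from the file contents;
            # this placeholder just returns a tiny in-memory dataset.
            ds = xr.Dataset({"dummy": ("x", [0, 1, 2])})
            ds.set_close(lambda: None)  # tell xarray how to release resources
            return ds


    # Register under an engine name, as done for "cfgrib" above.
    BACKEND_ENTRYPOINTS["myformat"] = MyFormatBackendEntrypoint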