Skip to content

Commit 53c5199

Browse files
dcherian authored and crusaderky committed
__dask_tokenize__ (pydata#3446)
* Implement __dask_tokenize__
* Fix window test
* Code review
* Test change in IndexVariable
1 parent 8fbe1f8 commit 53c5199

File tree

6 files changed

+146
-5
lines changed

6 files changed

+146
-5
lines changed

doc/whats-new.rst

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,20 @@ v0.14.1 (unreleased)
2121
Breaking changes
2222
~~~~~~~~~~~~~~~~
2323

24-
- Minimum cftime version is now 1.0.3. By `Deepak Cherian <https://github.com/dcherian>`_.
24+
- Broken compatibility with cftime < 1.0.3.
25+
By `Deepak Cherian <https://github.com/dcherian>`_.
2526

2627
.. note::
2728

28-
cftime version 1.0.4 is broken (`cftime/126 <https://github.com/Unidata/cftime/issues/126>`_), use version 1.0.4.2 instead.
29+
cftime version 1.0.4 is broken
30+
(`cftime/126 <https://github.com/Unidata/cftime/issues/126>`_);
31+
please use version 1.0.4.2 instead.
2932

3033
- All leftover support for dates from non-standard calendars through netcdftime, the
3134
module included in versions of netCDF4 prior to 1.4 that eventually became the
3235
cftime package, has been removed in favor of relying solely on the standalone
33-
cftime package (:pull:`3450`). By `Spencer Clark
34-
<https://github.com/spencerkclark>`_.
36+
cftime package (:pull:`3450`).
37+
By `Spencer Clark <https://github.com/spencerkclark>`_.
3538

3639
New Features
3740
~~~~~~~~~~~~
@@ -52,6 +55,14 @@ New Features
5255
for now. Enable it with :py:meth:`xarray.set_options(display_style="html")`.
5356
(:pull:`3425`) by `Benoit Bovy <https://github.com/benbovy>`_ and
5457
`Julia Signell <https://github.com/jsignell>`_.
58+
- Implement `dask deterministic hashing
59+
<https://docs.dask.org/en/latest/custom-collections.html#deterministic-hashing>`_
60+
for xarray objects. Note that xarray objects with a dask.array backend already used
61+
deterministic hashing in previous releases; this change implements it when whole
62+
xarray objects are embedded in a dask graph, e.g. when :meth:`DataArray.map` is
63+
invoked. (:issue:`3378`, :pull:`3446`)
64+
By `Deepak Cherian <https://github.com/dcherian>`_ and
65+
`Guido Imperiale <https://github.com/crusaderky>`_.
5566

5667
Bug fixes
5768
~~~~~~~~~
@@ -96,6 +107,7 @@ Internal Changes
96107
- Use Python 3.6 idioms throughout the codebase. (:pull:`3419`)
97108
By `Maximilian Roos <https://github.com/max-sixty>`_
98109

110+
99111
.. _whats-new.0.14.0:
100112

101113
v0.14.0 (14 Oct 2019)

xarray/core/dataarray.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -752,6 +752,9 @@ def reset_coords(
752752
dataset[self.name] = self.variable
753753
return dataset
754754

755+
def __dask_tokenize__(self):
756+
return (type(self), self._variable, self._coords, self._name)
757+
755758
def __dask_graph__(self):
756759
return self._to_temp_dataset().__dask_graph__()
757760

xarray/core/dataset.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,9 @@ def load(self, **kwargs) -> "Dataset":
651651

652652
return self
653653

654+
def __dask_tokenize__(self):
655+
return (type(self), self._variables, self._coord_names, self._attrs)
656+
654657
def __dask_graph__(self):
655658
graphs = {k: v.__dask_graph__() for k, v in self.variables.items()}
656659
graphs = {k: v for k, v in graphs.items() if v is not None}

xarray/core/variable.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,11 @@ def compute(self, **kwargs):
390390
new = self.copy(deep=False)
391391
return new.load(**kwargs)
392392

393+
def __dask_tokenize__(self):
394+
# Use v.data, instead of v._data, in order to cope with the wrappers
395+
# around NetCDF and the like
396+
return type(self), self._dims, self.data, self._attrs
397+
393398
def __dask_graph__(self):
394399
if isinstance(self._data, dask_array_type):
395400
return self._data.__dask_graph__()
@@ -1963,6 +1968,10 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
19631968
if not isinstance(self._data, PandasIndexAdapter):
19641969
self._data = PandasIndexAdapter(self._data)
19651970

1971+
def __dask_tokenize__(self):
1972+
# Don't waste time converting pd.Index to np.ndarray
1973+
return (type(self), self._dims, self._data.array, self._attrs)
1974+
19661975
def load(self):
19671976
# data is already loaded into memory for IndexVariable
19681977
return self

xarray/tests/test_dask.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import operator
22
import pickle
3+
import sys
34
from contextlib import suppress
45
from distutils.version import LooseVersion
56
from textwrap import dedent
@@ -21,12 +22,16 @@
2122
assert_frame_equal,
2223
assert_identical,
2324
raises_regex,
25+
requires_scipy_or_netCDF4,
2426
)
27+
from .test_backends import create_tmp_file
2528

2629
dask = pytest.importorskip("dask")
2730
da = pytest.importorskip("dask.array")
2831
dd = pytest.importorskip("dask.dataframe")
2932

33+
ON_WINDOWS = sys.platform == "win32"
34+
3035

3136
class CountingScheduler:
3237
""" Simple dask scheduler counting the number of computes.
@@ -1135,3 +1140,92 @@ def test_make_meta(map_ds):
11351140
for variable in map_ds.data_vars:
11361141
assert variable in meta.data_vars
11371142
assert meta.data_vars[variable].shape == (0,) * meta.data_vars[variable].ndim
1143+
1144+
1145+
@pytest.mark.parametrize(
1146+
"obj", [make_da(), make_da().compute(), make_ds(), make_ds().compute()]
1147+
)
1148+
@pytest.mark.parametrize(
1149+
"transform",
1150+
[
1151+
lambda x: x.reset_coords(),
1152+
lambda x: x.reset_coords(drop=True),
1153+
lambda x: x.isel(x=1),
1154+
lambda x: x.attrs.update(new_attrs=1),
1155+
lambda x: x.assign_coords(cxy=1),
1156+
lambda x: x.rename({"x": "xnew"}),
1157+
lambda x: x.rename({"cxy": "cxynew"}),
1158+
],
1159+
)
1160+
def test_token_changes_on_transform(obj, transform):
1161+
with raise_if_dask_computes():
1162+
assert dask.base.tokenize(obj) != dask.base.tokenize(transform(obj))
1163+
1164+
1165+
@pytest.mark.parametrize(
1166+
"obj", [make_da(), make_da().compute(), make_ds(), make_ds().compute()]
1167+
)
1168+
def test_token_changes_when_data_changes(obj):
1169+
with raise_if_dask_computes():
1170+
t1 = dask.base.tokenize(obj)
1171+
1172+
# Change data_var
1173+
if isinstance(obj, DataArray):
1174+
obj *= 2
1175+
else:
1176+
obj["a"] *= 2
1177+
with raise_if_dask_computes():
1178+
t2 = dask.base.tokenize(obj)
1179+
assert t2 != t1
1180+
1181+
# Change non-index coord
1182+
obj.coords["ndcoord"] *= 2
1183+
with raise_if_dask_computes():
1184+
t3 = dask.base.tokenize(obj)
1185+
assert t3 != t2
1186+
1187+
# Change IndexVariable
1188+
obj.coords["x"] *= 2
1189+
with raise_if_dask_computes():
1190+
t4 = dask.base.tokenize(obj)
1191+
assert t4 != t3
1192+
1193+
1194+
@pytest.mark.parametrize("obj", [make_da().compute(), make_ds().compute()])
1195+
def test_token_changes_when_buffer_changes(obj):
1196+
with raise_if_dask_computes():
1197+
t1 = dask.base.tokenize(obj)
1198+
1199+
if isinstance(obj, DataArray):
1200+
obj[0, 0] = 123
1201+
else:
1202+
obj["a"][0, 0] = 123
1203+
with raise_if_dask_computes():
1204+
t2 = dask.base.tokenize(obj)
1205+
assert t2 != t1
1206+
1207+
obj.coords["ndcoord"][0] = 123
1208+
with raise_if_dask_computes():
1209+
t3 = dask.base.tokenize(obj)
1210+
assert t3 != t2
1211+
1212+
1213+
@pytest.mark.parametrize(
1214+
"transform",
1215+
[lambda x: x, lambda x: x.copy(deep=False), lambda x: x.copy(deep=True)],
1216+
)
1217+
@pytest.mark.parametrize("obj", [make_da(), make_ds(), make_ds().variables["a"]])
1218+
def test_token_identical(obj, transform):
1219+
with raise_if_dask_computes():
1220+
assert dask.base.tokenize(obj) == dask.base.tokenize(transform(obj))
1221+
assert dask.base.tokenize(obj.compute()) == dask.base.tokenize(
1222+
transform(obj.compute())
1223+
)
1224+
1225+
1226+
@requires_scipy_or_netCDF4
1227+
def test_normalize_token_with_backend(map_ds):
1228+
with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp_file:
1229+
map_ds.to_netcdf(tmp_file)
1230+
read = xr.open_dataset(tmp_file)
1231+
assert not dask.base.tokenize(map_ds) == dask.base.tokenize(read)

xarray/tests/test_sparse.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from xarray.core.npcompat import IS_NEP18_ACTIVE
1212
from xarray.core.pycompat import sparse_array_type
1313

14-
from . import assert_equal, assert_identical
14+
from . import assert_equal, assert_identical, requires_dask
1515

1616
param = pytest.param
1717
xfail = pytest.mark.xfail
@@ -849,3 +849,23 @@ def test_chunk():
849849
dsc = ds.chunk(2)
850850
assert dsc.chunks == {"dim_0": (2, 2)}
851851
assert_identical(dsc, ds)
852+
853+
854+
@requires_dask
855+
def test_dask_token():
856+
import dask
857+
858+
s = sparse.COO.from_numpy(np.array([0, 0, 1, 2]))
859+
a = DataArray(s)
860+
t1 = dask.base.tokenize(a)
861+
t2 = dask.base.tokenize(a)
862+
t3 = dask.base.tokenize(a + 1)
863+
assert t1 == t2
864+
assert t3 != t2
865+
assert isinstance(a.data, sparse.COO)
866+
867+
ac = a.chunk(2)
868+
t4 = dask.base.tokenize(ac)
869+
t5 = dask.base.tokenize(ac + 1)
870+
assert t4 != t5
871+
assert isinstance(ac.data._meta, sparse.COO)

0 commit comments

Comments (0)