
Commit ee7a8da

TomNicholas authored and dcherian committed
Add .chunksizes property (pydata#5900)
* added chunksizes property
* fix typing via Hashable->Any
* add chunksizes to API doc
* whatsnew
* grammar
* Update doc/whats-new.rst Co-authored-by: Deepak Cherian <[email protected]>
* Update doc/whats-new.rst Co-authored-by: Deepak Cherian <[email protected]>
* removed the word consistent
* test .chunksizes

Co-authored-by: Deepak Cherian <[email protected]>
1 parent c8b64a7 commit ee7a8da

File tree

7 files changed  +159 -21 lines changed


doc/api.rst  (+2)

@@ -65,6 +65,7 @@ Attributes
 Dataset.indexes
 Dataset.get_index
 Dataset.chunks
+Dataset.chunksizes
 Dataset.nbytes

 Dictionary interface
@@ -271,6 +272,7 @@ Attributes
 DataArray.encoding
 DataArray.indexes
 DataArray.get_index
+DataArray.chunksizes

 **ndarray attributes**:
 :py:attr:`~DataArray.ndim`

doc/whats-new.rst  (+4)

@@ -38,6 +38,10 @@ New Features
   `Nathan Lis <https://github.com/wxman22>`_.
 - Histogram plots are set with a title displaying the scalar coords if any, similarly to the other plots (:issue:`5791`, :pull:`5792`).
   By `Maxime Liquet <https://github.com/maximlt>`_.
+- Added a new :py:attr:`Dataset.chunksizes`, :py:attr:`DataArray.chunksizes`, and :py:attr:`Variable.chunksizes`
+  property, which will always return a mapping from dimension names to chunking pattern along that dimension,
+  regardless of whether the object is a Dataset, DataArray, or Variable. (:issue:`5846`, :pull:`5900`)
+  By `Tom Nicholas <https://github.com/TomNicholas>`_.

 Breaking changes
 ~~~~~~~~~~~~~~~~
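To make the whats-new entry above concrete, here is a minimal usage sketch (assuming dask is installed; the variable name and chunk sizes below are invented for illustration):

import numpy as np
import xarray as xr

# Illustrative data; "temp" and the chunk sizes are made up.
da = xr.DataArray(np.zeros((4, 6)), dims=("x", "y"), name="temp").chunk({"x": 2, "y": 3})

print(da.chunks)                   # positional: ((2, 2), (3, 3))
print(da.chunksizes)               # name-keyed: a mapping like {'x': (2, 2), 'y': (3, 3)}
print(da.to_dataset().chunksizes)  # the same kind of mapping on a Dataset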

xarray/core/common.py  (+17)

@@ -1813,6 +1813,23 @@ def ones_like(other, dtype: DTypeLike = None):
     return full_like(other, 1, dtype)


+def get_chunksizes(
+    variables: Iterable[Variable],
+) -> Mapping[Any, Tuple[int, ...]]:
+
+    chunks: Dict[Any, Tuple[int, ...]] = {}
+    for v in variables:
+        if hasattr(v.data, "chunks"):
+            for dim, c in v.chunksizes.items():
+                if dim in chunks and c != chunks[dim]:
+                    raise ValueError(
+                        f"Object has inconsistent chunks along dimension {dim}. "
+                        "This can be fixed by calling unify_chunks()."
+                    )
+                chunks[dim] = c
+    return Frozen(chunks)
+
+
 def is_np_datetime_like(dtype: DTypeLike) -> bool:
     """Check if a dtype is a subclass of the numpy datetime types"""
     return np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64)
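The helper above raises when variables disagree about chunking along a shared dimension. A rough sketch of that failure mode, with invented variable names and chunk sizes:

import numpy as np
import xarray as xr

# Two data variables chunked differently along the shared dimension "x".
ds = xr.Dataset({"a": (("x",), np.arange(6)), "b": (("x",), np.arange(6))})
ds["a"] = ds["a"].chunk({"x": 2})
ds["b"] = ds["b"].chunk({"x": 3})

try:
    ds.chunksizes  # a single per-dimension mapping would be ambiguous here
except ValueError as err:
    print(err)  # message points at unify_chunks(), as in the helper above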

xarray/core/dataarray.py  (+29 -3)

@@ -43,7 +43,7 @@
     reindex_like_indexers,
 )
 from .arithmetic import DataArrayArithmetic
-from .common import AbstractArray, DataWithCoords
+from .common import AbstractArray, DataWithCoords, get_chunksizes
 from .computation import unify_chunks
 from .coordinates import (
     DataArrayCoordinates,
@@ -1058,11 +1058,37 @@ def __deepcopy__(self, memo=None) -> "DataArray":

     @property
     def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
-        """Block dimensions for this array's data or None if it's not a dask
-        array.
+        """
+        Tuple of block lengths for this dataarray's data, in order of dimensions, or None if
+        the underlying data is not a dask array.
+
+        See Also
+        --------
+        DataArray.chunk
+        DataArray.chunksizes
+        xarray.unify_chunks
         """
         return self.variable.chunks

+    @property
+    def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this dataarray's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Differs from DataArray.chunks because it returns a mapping of dimensions to chunk shapes
+        instead of a tuple of chunk shapes.
+
+        See Also
+        --------
+        DataArray.chunk
+        DataArray.chunks
+        xarray.unify_chunks
+        """
+        all_variables = [self.variable] + [c.variable for c in self.coords.values()]
+        return get_chunksizes(all_variables)
+
     def chunk(
         self,
         chunks: Union[
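Because the DataArray property above collects the data variable together with its coordinate variables, the result is keyed by dimension name rather than by position. A hedged sketch (the array contents and the coordinate name are made up):

import numpy as np
import xarray as xr

# Invented example: a 2-D array with a non-dimension coordinate "lat".
da = xr.DataArray(
    np.zeros((4, 6)),
    dims=("x", "y"),
    coords={"lat": (("x", "y"), np.ones((4, 6)))},
).chunk({"x": 2, "y": 3})

print(da.chunks)            # positional, data only: ((2, 2), (3, 3))
print(dict(da.chunksizes))  # by name, data plus coords: {'x': (2, 2), 'y': (3, 3)}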

xarray/core/dataset.py  (+37 -14)

@@ -52,7 +52,7 @@
 )
 from .alignment import _broadcast_helper, _get_broadcast_dims_map_common_coords, align
 from .arithmetic import DatasetArithmetic
-from .common import DataWithCoords, _contains_datetime_like_objects
+from .common import DataWithCoords, _contains_datetime_like_objects, get_chunksizes
 from .computation import unify_chunks
 from .coordinates import (
     DatasetCoordinates,
@@ -2095,20 +2095,37 @@ def info(self, buf=None) -> None:

     @property
     def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]:
-        """Block dimensions for this dataset's data or None if it's not a dask
-        array.
         """
-        chunks: Dict[Hashable, Tuple[int, ...]] = {}
-        for v in self.variables.values():
-            if v.chunks is not None:
-                for dim, c in zip(v.dims, v.chunks):
-                    if dim in chunks and c != chunks[dim]:
-                        raise ValueError(
-                            f"Object has inconsistent chunks along dimension {dim}. "
-                            "This can be fixed by calling unify_chunks()."
-                        )
-                    chunks[dim] = c
-        return Frozen(chunks)
+        Mapping from dimension names to block lengths for this dataset's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Same as Dataset.chunksizes, but maintained for backwards compatibility.
+
+        See Also
+        --------
+        Dataset.chunk
+        Dataset.chunksizes
+        xarray.unify_chunks
+        """
+        return get_chunksizes(self.variables.values())
+
+    @property
+    def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this dataset's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Same as Dataset.chunks.
+
+        See Also
+        --------
+        Dataset.chunk
+        Dataset.chunks
+        xarray.unify_chunks
+        """
+        return get_chunksizes(self.variables.values())

     def chunk(
         self,
@@ -2147,6 +2164,12 @@ def chunk(
         Returns
         -------
         chunked : xarray.Dataset
+
+        See Also
+        --------
+        Dataset.chunks
+        Dataset.chunksizes
+        xarray.unify_chunks
         """
         if chunks is None:
             warnings.warn(
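As the docstrings above note, Dataset.chunks is kept for backwards compatibility and now shares the get_chunksizes code path with Dataset.chunksizes. A small sketch with invented data:

import numpy as np
import xarray as xr

# Only "time" is explicitly chunked; dask keeps "x" as one full-size chunk.
ds = xr.Dataset({"temp": (("time", "x"), np.zeros((10, 4)))}).chunk({"time": 5})

print(dict(ds.chunks))                         # e.g. {'time': (5, 5), 'x': (4,)}
print(dict(ds.chunks) == dict(ds.chunksizes))  # True: both properties go through get_chunksizes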

xarray/core/variable.py  (+33 -4)

@@ -45,6 +45,7 @@
     sparse_array_type,
 )
 from .utils import (
+    Frozen,
     NdimSizeLenMixin,
     OrderedSet,
     _default,
@@ -996,16 +997,44 @@ def __deepcopy__(self, memo=None):
     __hash__ = None  # type: ignore[assignment]

     @property
-    def chunks(self):
-        """Block dimensions for this array's data or None if it's not a dask
-        array.
+    def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
+        """
+        Tuple of block lengths for this dataarray's data, in order of dimensions, or None if
+        the underlying data is not a dask array.
+
+        See Also
+        --------
+        Variable.chunk
+        Variable.chunksizes
+        xarray.unify_chunks
         """
         return getattr(self._data, "chunks", None)

+    @property
+    def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this variable's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Differs from variable.chunks because it returns a mapping of dimensions to chunk shapes
+        instead of a tuple of chunk shapes.
+
+        See Also
+        --------
+        Variable.chunk
+        Variable.chunks
+        xarray.unify_chunks
+        """
+        if hasattr(self._data, "chunks"):
+            return Frozen({dim: c for dim, c in zip(self.dims, self.data.chunks)})
+        else:
+            return {}
+
     _array_counter = itertools.count()

     def chunk(self, chunks={}, name=None, lock=False):
-        """Coerce this array's data into a dask arrays with the given chunks.
+        """Coerce this array's data into a dask array with the given chunks.

         If this variable is a non-dask array, it will be converted to dask
         array. If it's a dask array, it will be rechunked to the given chunk
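At the Variable level, the new property pairs each dimension name with the dask chunk lengths and falls back to an empty mapping for non-dask data. A brief sketch with invented sizes:

import numpy as np
import xarray as xr

v = xr.Variable(("x", "y"), np.zeros((4, 6)))
print(v.chunksizes)              # {}  (numpy-backed, so nothing to report)

chunked = v.chunk({"x": 2, "y": 3})
print(chunked.chunks)            # ((2, 2), (3, 3))
print(dict(chunked.chunksizes))  # {'x': (2, 2), 'y': (3, 3)}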

xarray/tests/test_dask.py  (+37)

@@ -104,6 +104,11 @@ def test_chunk(self):
         assert rechunked.chunks == expected
         self.assertLazyAndIdentical(self.eager_var, rechunked)

+        expected_chunksizes = {
+            dim: chunks for dim, chunks in zip(self.lazy_var.dims, expected)
+        }
+        assert rechunked.chunksizes == expected_chunksizes
+
     def test_indexing(self):
         u = self.eager_var
         v = self.lazy_var
@@ -330,6 +335,38 @@ def setUp(self):
             self.data, coords={"x": range(4)}, dims=("x", "y"), name="foo"
         )

+    def test_chunk(self):
+        for chunks, expected in [
+            ({}, ((2, 2), (2, 2, 2))),
+            (3, ((3, 1), (3, 3))),
+            ({"x": 3, "y": 3}, ((3, 1), (3, 3))),
+            ({"x": 3}, ((3, 1), (2, 2, 2))),
+            ({"x": (3, 1)}, ((3, 1), (2, 2, 2))),
+        ]:
+            # Test DataArray
+            rechunked = self.lazy_array.chunk(chunks)
+            assert rechunked.chunks == expected
+            self.assertLazyAndIdentical(self.eager_array, rechunked)
+
+            expected_chunksizes = {
+                dim: chunks for dim, chunks in zip(self.lazy_array.dims, expected)
+            }
+            assert rechunked.chunksizes == expected_chunksizes
+
+            # Test Dataset
+            lazy_dataset = self.lazy_array.to_dataset()
+            eager_dataset = self.eager_array.to_dataset()
+            expected_chunksizes = {
+                dim: chunks for dim, chunks in zip(lazy_dataset.dims, expected)
+            }
+            rechunked = lazy_dataset.chunk(chunks)
+
+            # Dataset.chunks has a different return type to DataArray.chunks - see issue #5843
+            assert rechunked.chunks == expected_chunksizes
+            self.assertLazyAndIdentical(eager_dataset, rechunked)
+
+            assert rechunked.chunksizes == expected_chunksizes
+
     def test_rechunk(self):
         chunked = self.eager_array.chunk({"x": 2}).chunk({"y": 2})
         assert chunked.chunks == ((2,) * 2, (2,) * 3)
