Skip to content

Commit 7611ed9

Browse files
crusaderky authored and shoyer committed
Avoid computing dask variables on __repr__ and __getattr__ (#1532)
* stop repr(Dataset) from resolving dask variables. Also stop resolving non-index coords in Datasets
* stop DataArray.__getattr__('NOTEXIST') from computing dask variables
* PEP8 fixes
* PEP8 fixes
* Tests for __repr__, __getattr__ and __getstate__
* int is coerced to int64 or int32 on different systems
* What's New
* flake8 tweaks
* print summary of dask arrays
* More compact printing for dask arrays
* Cleared changelog and moved to breaking changes
* Deduplicate code
1 parent 31921fa commit 7611ed9

File tree

7 files changed

+154
-62
lines changed

7 files changed

+154
-62
lines changed

doc/whats-new.rst

+6-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ Breaking changes
2727
(:issue:`727`).
2828
By `Joe Hamman <https://github.com/jhamman>`_.
2929

30+
- ``repr`` and the Jupyter Notebook won't automatically compute dask variables.
31+
Datasets loaded with ``open_dataset`` won't automatically read coords from
32+
disk when calling ``repr`` (:issue:`1522`).
33+
By `Guido Imperiale <https://github.com/crusaderky>`_.
34+
3035
Backward Incompatible Changes
3136
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3237

@@ -153,7 +158,7 @@ Bug fixes
153158
``rtol`` arguments when called on ``DataArray`` objects.
154159
By `Stephan Hoyer <https://github.com/shoyer>`_.
155160

156-
- Xarray ``quantile`` methods now properly raise a ``TypeError`` when applied to
161+
- xarray ``quantile`` methods now properly raise a ``TypeError`` when applied to
157162
objects with data stored as ``dask`` arrays (:issue:`1529`).
158163
By `Joe Hamman <https://github.com/jhamman>`_.
159164

xarray/core/dataarray.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -447,8 +447,8 @@ def _level_coords(self):
447447
"""
448448
level_coords = OrderedDict()
449449
for cname, var in self._coords.items():
450-
if var.ndim == 1:
451-
level_names = var.to_index_variable().level_names
450+
if var.ndim == 1 and isinstance(var, IndexVariable):
451+
level_names = var.level_names
452452
if level_names is not None:
453453
dim, = var.dims
454454
level_coords.update({lname: dim for lname in level_names})

xarray/core/dataset.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -654,8 +654,8 @@ def _level_coords(self):
654654
level_coords = OrderedDict()
655655
for cname in self._coord_names:
656656
var = self.variables[cname]
657-
if var.ndim == 1:
658-
level_names = var.to_index_variable().level_names
657+
if var.ndim == 1 and isinstance(var, IndexVariable):
658+
level_names = var.level_names
659659
if level_names is not None:
660660
dim, = var.dims
661661
level_coords.update({lname: dim for lname in level_names})
@@ -1669,12 +1669,12 @@ def expand_dims(self, dim, axis=None):
16691669
for d in dim:
16701670
if d in self.dims:
16711671
raise ValueError(
1672-
'Dimension {dim} already exists.'.format(dim=d))
1672+
'Dimension {dim} already exists.'.format(dim=d))
16731673
if (d in self._variables and
16741674
not utils.is_scalar(self._variables[d])):
16751675
raise ValueError(
1676-
'{dim} already exists as coordinate or'
1677-
' variable name.'.format(dim=d))
1676+
'{dim} already exists as coordinate or'
1677+
' variable name.'.format(dim=d))
16781678

16791679
if len(dim) != len(set(dim)):
16801680
raise ValueError('dims should not contain duplicate values.')
@@ -1691,7 +1691,7 @@ def expand_dims(self, dim, axis=None):
16911691
raise IndexError(
16921692
'Axis {a} is out of bounds of the expanded'
16931693
' dimension size {dim}.'.format(
1694-
a=a, v=k, dim=result_ndim))
1694+
a=a, v=k, dim=result_ndim))
16951695

16961696
axis_pos = [a if a >= 0 else result_ndim + a
16971697
for a in axis]
@@ -3008,8 +3008,8 @@ def filter_by_attrs(self, **kwargs):
30083008
for var_name, variable in self.data_vars.items():
30093009
for attr_name, pattern in kwargs.items():
30103010
attr_value = variable.attrs.get(attr_name)
3011-
if ((callable(pattern) and pattern(attr_value))
3012-
or attr_value == pattern):
3011+
if ((callable(pattern) and pattern(attr_value)) or
3012+
attr_value == pattern):
30133013
selection.append(var_name)
30143014
return self[selection]
30153015

xarray/core/formatting.py

+28-22
Original file line numberDiff line numberDiff line change
@@ -196,8 +196,8 @@ def format_array_flat(items_ndarray, max_width):
196196
return pprint_str
197197

198198

199-
def _summarize_var_or_coord(name, var, col_width, show_values=True,
200-
marker=' ', max_width=None):
199+
def summarize_variable(name, var, col_width, show_values=True,
200+
marker=' ', max_width=None):
201201
if max_width is None:
202202
max_width = OPTIONS['display_width']
203203
first_col = pretty_print(u' %s %s ' % (marker, name), col_width)
@@ -208,6 +208,8 @@ def _summarize_var_or_coord(name, var, col_width, show_values=True,
208208
front_str = u'%s%s%s ' % (first_col, dims_str, var.dtype)
209209
if show_values:
210210
values_str = format_array_flat(var, max_width - len(front_str))
211+
elif isinstance(var.data, dask_array_type):
212+
values_str = short_dask_repr(var, show_dtype=False)
211213
else:
212214
values_str = u'...'
213215

@@ -222,38 +224,29 @@ def _summarize_coord_multiindex(coord, col_width, marker):
222224
def _summarize_coord_levels(coord, col_width, marker=u'-'):
223225
relevant_coord = coord[:30]
224226
return u'\n'.join(
225-
[_summarize_var_or_coord(lname,
226-
relevant_coord.get_level_variable(lname),
227-
col_width, marker=marker)
227+
[summarize_variable(lname,
228+
relevant_coord.get_level_variable(lname),
229+
col_width, marker=marker)
228230
for lname in coord.level_names])
229231

230232

231-
def _not_remote(var):
232-
"""Helper function to identify if array is positively identifiable as
233-
coming from a remote source.
234-
"""
235-
source = var.encoding.get('source')
236-
if source and source.startswith('http') and not var._in_memory:
237-
return False
238-
return True
239-
240-
241-
def summarize_var(name, var, col_width):
242-
show_values = _not_remote(var)
243-
return _summarize_var_or_coord(name, var, col_width, show_values)
233+
def summarize_datavar(name, var, col_width):
234+
show_values = var._in_memory
235+
return summarize_variable(name, var.variable, col_width, show_values)
244236

245237

246238
def summarize_coord(name, var, col_width):
247239
is_index = name in var.dims
248-
show_values = is_index or _not_remote(var)
240+
show_values = var._in_memory
249241
marker = u'*' if is_index else u' '
250242
if is_index:
251243
coord = var.variable.to_index_variable()
252244
if coord.level_names is not None:
253245
return u'\n'.join(
254246
[_summarize_coord_multiindex(coord, col_width, marker),
255247
_summarize_coord_levels(coord, col_width)])
256-
return _summarize_var_or_coord(name, var, col_width, show_values, marker)
248+
return summarize_variable(
249+
name, var.variable, col_width, show_values, marker)
257250

258251

259252
def summarize_attr(key, value, col_width=None):
@@ -307,7 +300,7 @@ def _mapping_repr(mapping, title, summarizer, col_width=None):
307300

308301

309302
data_vars_repr = functools.partial(_mapping_repr, title=u'Data variables',
310-
summarizer=summarize_var)
303+
summarizer=summarize_datavar)
311304

312305

313306
attrs_repr = functools.partial(_mapping_repr, title=u'Attributes',
@@ -370,6 +363,19 @@ def short_array_repr(array):
370363
return repr(array)
371364

372365

366+
def short_dask_repr(array, show_dtype=True):
367+
"""Similar to dask.array.DataArray.__repr__, but without
368+
redundant information that's already printed by the repr
369+
function of the xarray wrapper.
370+
"""
371+
chunksize = tuple(c[0] for c in array.chunks)
372+
if show_dtype:
373+
return 'dask.array<shape=%s, dtype=%s, chunksize=%s>' % (
374+
array.shape, array.dtype, chunksize)
375+
else:
376+
return 'dask.array<shape=%s, chunksize=%s>' % (array.shape, chunksize)
377+
378+
373379
def array_repr(arr):
374380
# used for DataArray, Variable and IndexVariable
375381
if hasattr(arr, 'name') and arr.name is not None:
@@ -381,7 +387,7 @@ def array_repr(arr):
381387
% (type(arr).__name__, name_str, dim_summary(arr))]
382388

383389
if isinstance(getattr(arr, 'variable', arr)._data, dask_array_type):
384-
summary.append(repr(arr.data))
390+
summary.append(short_dask_repr(arr))
385391
elif arr._in_memory or arr.size < 1e5:
386392
summary.append(short_array_repr(arr.values))
387393
else:

xarray/core/merge.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def merge_variables(
113113
list_of_variables_dicts, # type: List[Mapping[Any, Variable]]
114114
priority_vars=None, # type: Optional[Mapping[Any, Variable]]
115115
compat='minimal', # type: str
116-
):
116+
):
117117
# type: (...) -> OrderedDict[Any, Variable]
118118
"""Merge dicts of variables, while resolving conflicts appropriately.
119119
@@ -180,7 +180,7 @@ def expand_variable_dicts(list_of_variable_dicts):
180180
Parameters
181181
----------
182182
list_of_variable_dicts : list of dict or Dataset objects
183-
The each value for the mappings must be of the following types:
183+
Each value for the mappings must be of the following types:
184184
- an xarray.Variable
185185
- a tuple `(dims, data[, attrs[, encoding]])` that can be converted in
186186
an xarray.Variable

xarray/core/variable.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def nbytes(self):
284284

285285
@property
286286
def _in_memory(self):
287-
return (isinstance(self._data, (np.ndarray, PandasIndexAdapter)) or
287+
return (isinstance(self._data, (np.ndarray, np.number, PandasIndexAdapter)) or
288288
(isinstance(self._data, indexing.MemoryCachedArray) and
289289
isinstance(self._data.array, np.ndarray)))
290290

@@ -1210,6 +1210,7 @@ def func(self, other):
12101210
return self
12111211
return func
12121212

1213+
12131214
ops.inject_all_ops_and_reduce_methods(Variable)
12141215

12151216

@@ -1374,6 +1375,7 @@ def name(self):
13741375
def name(self, value):
13751376
raise AttributeError('cannot modify name of IndexVariable in-place')
13761377

1378+
13771379
# for backwards compatibility
13781380
Coordinate = utils.alias(IndexVariable, 'Coordinate')
13791381

0 commit comments

Comments
 (0)