Avoid computing dask variables on __repr__ and __getattr__ #1532

Merged (16 commits, Sep 21, 2017)
4 changes: 4 additions & 0 deletions doc/whats-new.rst
@@ -74,6 +74,10 @@ Bug fixes
``rtol`` arguments when called on ``DataArray`` objects.
By `Stephan Hoyer <https://github.com/shoyer>`_.

- Stop repr() and the Jupyter Notebook from automatically computing dask
variables (:issue:`1522`).
By `Guido Imperiale <https://github.com/crusaderky>`_.

.. _whats-new.0.9.6:

v0.9.6 (8 June 2017)
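For context, a minimal sketch of the new behaviour (assuming dask is installed; illustrative, not part of the diff):

import numpy as np
import xarray as xr

a = xr.DataArray(np.arange(6).reshape(2, 3), dims=('x', 'y')).chunk()
repr(a)  # prints dask.array<...> metadata; the values are not computed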
4 changes: 2 additions & 2 deletions xarray/core/dataarray.py
@@ -447,8 +447,8 @@ def _level_coords(self):
"""
level_coords = OrderedDict()
for cname, var in self._coords.items():
if var.ndim == 1:
level_names = var.to_index_variable().level_names
if var.ndim == 1 and isinstance(var, IndexVariable):
level_names = var.level_names
if level_names is not None:
dim, = var.dims
level_coords.update({lname: dim for lname in level_names})
16 changes: 8 additions & 8 deletions xarray/core/dataset.py
@@ -627,8 +627,8 @@ def _level_coords(self):
level_coords = OrderedDict()
for cname in self._coord_names:
var = self.variables[cname]
if var.ndim == 1:
level_names = var.to_index_variable().level_names
if var.ndim == 1 and isinstance(var, IndexVariable):
level_names = var.level_names
if level_names is not None:
dim, = var.dims
level_coords.update({lname: dim for lname in level_names})
@@ -1641,12 +1641,12 @@ def expand_dims(self, dim, axis=None):
for d in dim:
if d in self.dims:
raise ValueError(
'Dimension {dim} already exists.'.format(dim=d))
'Dimension {dim} already exists.'.format(dim=d))
if (d in self._variables and
not utils.is_scalar(self._variables[d])):
raise ValueError(
'{dim} already exists as coordinate or'
' variable name.'.format(dim=d))
'{dim} already exists as coordinate or'
' variable name.'.format(dim=d))

if len(dim) != len(set(dim)):
raise ValueError('dims should not contain duplicate values.')
@@ -1663,7 +1663,7 @@ def expand_dims(self, dim, axis=None):
raise IndexError(
'Axis {a} is out of bounds of the expanded'
' dimension size {dim}.'.format(
a=a, v=k, dim=result_ndim))
a=a, v=k, dim=result_ndim))

axis_pos = [a if a >= 0 else result_ndim + a
for a in axis]
@@ -2980,8 +2980,8 @@ def filter_by_attrs(self, **kwargs):
for var_name, variable in self.data_vars.items():
for attr_name, pattern in kwargs.items():
attr_value = variable.attrs.get(attr_name)
if ((callable(pattern) and pattern(attr_value))
or attr_value == pattern):
if ((callable(pattern) and pattern(attr_value)) or
attr_value == pattern):
selection.append(var_name)
return self[selection]

32 changes: 11 additions & 21 deletions xarray/core/formatting.py
@@ -196,8 +196,8 @@ def format_array_flat(items_ndarray, max_width):
return pprint_str


def _summarize_var_or_coord(name, var, col_width, show_values=True,
marker=' ', max_width=None):
def summarize_variable(name, var, col_width, show_values=True,
marker=' ', max_width=None):
if max_width is None:
max_width = OPTIONS['display_width']
first_col = pretty_print(u' %s %s ' % (marker, name), col_width)
@@ -222,38 +222,28 @@ def _summarize_coord_multiindex(coord, col_width, marker):
def _summarize_coord_levels(coord, col_width, marker=u'-'):
relevant_coord = coord[:30]
return u'\n'.join(
[_summarize_var_or_coord(lname,
relevant_coord.get_level_variable(lname),
col_width, marker=marker)
[summarize_variable(lname,
relevant_coord.get_level_variable(lname),
col_width, marker=marker)
for lname in coord.level_names])


def _not_remote(var):
"""Helper function to identify if array is positively identifiable as
coming from a remote source.
"""
source = var.encoding.get('source')
if source and source.startswith('http') and not var._in_memory:
return False
return True


def summarize_var(name, var, col_width):
show_values = _not_remote(var)
return _summarize_var_or_coord(name, var, col_width, show_values)
def summarize_datavar(name, var, col_width):
show_values = var._in_memory
Member (shoyer):

Our current heuristic uses the _not_remote() helper function, so it doesn't display arrays loaded over a network (via opendap), which can often be quite slow. But it does display a summary of values from netCDF files on disk, which I do think is generally helpful and for which I haven't noticed any performance issues.

Based on the current definition of _in_memory, we wouldn't display any of these arrays:

@property
def _in_memory(self):
    return (isinstance(self._data, (np.ndarray, PandasIndexAdapter)) or
            (isinstance(self._data, indexing.MemoryCachedArray) and
             isinstance(self._data.array, np.ndarray)))

So instead of using _in_memory, I would suggest something like _not_remote(var) and not isinstance(var._data, dask_array_type) as the condition for showing values.
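A minimal sketch of that condition (the _show_values helper name is hypothetical; dask_array_type is assumed to come from xarray.core.pycompat, as elsewhere in the codebase):

from xarray.core.pycompat import dask_array_type

def _show_values(var):
    # Preview values only when the data is positively local (not opendap)
    # and not backed by a lazy dask graph.
    return _not_remote(var) and not isinstance(var._data, dask_array_type)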

Contributor Author (crusaderky), Sep 2, 2017:

@shoyer loading a NetCDF variable from disk every time you do __repr__ is a terrible idea if that variable has been compressed without chunking. If the variable is a single block of 100MB of zlib-compressed data, you will have to read it and decompress it every time.
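For instance, a sketch of such a file (assuming netCDF4 is available; the path and sizes are hypothetical):

import numpy as np
from netCDF4 import Dataset

with Dataset('big.nc', 'w') as nc:  # hypothetical file
    nc.createDimension('x', 25000000)
    # zlib compression stored as a single ~100MB chunk: any read,
    # including one triggered by __repr__, decompresses the whole block.
    v = nc.createVariable('v', 'f4', ('x',), zlib=True,
                          chunksizes=(25000000,))
    v[:] = np.zeros(25000000, dtype='f4')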

Contributor Author (crusaderky):

@shoyer also, your netcdf array might be sitting on a network file system on the opposite side of a narrowband VPN.

Member (shoyer):

That's certainly possible, but in my experience very few people write 100MB chunks -- those are very large.

Let's summarize our options:

  1. Always show a preview of data from netCDF files with Dataset.__repr__
  2. Never show a preview for data if it isn't already in memory
  3. Show a preview depending on a global option (with default choice TBD).

Reasons to show data from disk in __repr__:

  • It's what we've always done.
  • "Most" of the time it's fast and convenient.
  • It provides a good experience for new users, who don't need to hunt for a separate preview() or load() command to see what's in a Dataset. You can simply print it at a console.

Reasons not to show data from disk in __repr__:

  • IO can be slow/expensive, especially if compression or networks are involved.
  • Heuristics to detect expensive IO are unreliable and somewhat distasteful.

Maybe we should solicit a few more opinions here before we change the default behavior?

Another possibility is to try loading data in a separate thread and timeout if it takes too long (say more than 0.5 seconds), but that might open up its own set of performance issues (it's not easy to kill a thread, short of terminating a process).
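For illustration, a rough sketch of that idea (hypothetical preview_with_timeout helper, using concurrent.futures):

import concurrent.futures

def preview_with_timeout(var, timeout=0.5):
    # Load the values in a worker thread and give up after `timeout`
    # seconds. The worker cannot be killed, so an expensive load keeps
    # running in the background, which is the issue mentioned above.
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = pool.submit(lambda: var.values)
    try:
        return future.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        return None  # caller falls back to a '...' placeholder
    finally:
        pool.shutdown(wait=False)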

Member:

I think my vote would be to only print a preview of data that is in memory. For my uses, I typically have fill values in the first 10-20 data points so the previous __repr__ didn't give me any information.

@pydata/xarray - anyone else have thoughts on this?

Member:

@shoyer - do we have results from your google poll on this issue yet?

Member (shoyer):

Sounds like I was wrong -- the consensus is pretty clear that we should go ahead with this

[Four screenshots of the poll results, Sep 20, 2017]

Member:

I'm not sure this sample size is going to give us statistically significant results, but I'm glad to see @delgadom and I are in agreement.

@crusaderky - are you up for implementing this?

Member:

I think the current implementation (in this PR) is actually already correct.

Contributor Author (crusaderky):

Yep - data is eagerly loaded from disk only for index coords on __init__ now.
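An illustrative sketch of that behaviour (hypothetical file path):

import xarray as xr

ds = xr.open_dataset('data.nc')  # hypothetical file
# Index coordinates are loaded eagerly on __init__, so repr(ds)
# previews them; data variables and non-index coordinates stay on
# disk and render as '...'.
repr(ds)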

return summarize_variable(name, var.variable, col_width, show_values)


def summarize_coord(name, var, col_width):
is_index = name in var.dims
show_values = is_index or _not_remote(var)
show_values = var._in_memory
marker = u'*' if is_index else u' '
if is_index:
coord = var.variable.to_index_variable()
if coord.level_names is not None:
return u'\n'.join(
[_summarize_coord_multiindex(coord, col_width, marker),
_summarize_coord_levels(coord, col_width)])
return _summarize_var_or_coord(name, var, col_width, show_values, marker)
return summarize_variable(name, var.variable, col_width, show_values, marker)


def summarize_attr(key, value, col_width=None):
@@ -307,7 +297,7 @@ def _mapping_repr(mapping, title, summarizer, col_width=None):


data_vars_repr = functools.partial(_mapping_repr, title=u'Data variables',
summarizer=summarize_var)
summarizer=summarize_datavar)


attrs_repr = functools.partial(_mapping_repr, title=u'Attributes',
4 changes: 2 additions & 2 deletions xarray/core/merge.py
@@ -113,7 +113,7 @@ def merge_variables(
list_of_variables_dicts, # type: List[Mapping[Any, Variable]]
priority_vars=None, # type: Optional[Mapping[Any, Variable]]
compat='minimal', # type: str
):
):
# type: (...) -> OrderedDict[Any, Variable]
"""Merge dicts of variables, while resolving conflicts appropriately.

@@ -180,7 +180,7 @@ def expand_variable_dicts(list_of_variable_dicts):
Parameters
----------
list_of_variable_dicts : list of dict or Dataset objects
The each value for the mappings must be of the following types:
Each value for the mappings must be of the following types:
- an xarray.Variable
- a tuple `(dims, data[, attrs[, encoding]])` that can be converted in
an xarray.Variable
4 changes: 3 additions & 1 deletion xarray/core/variable.py
@@ -284,7 +284,7 @@ def nbytes(self):

@property
def _in_memory(self):
return (isinstance(self._data, (np.ndarray, PandasIndexAdapter)) or
return (isinstance(self._data, (np.ndarray, np.number, PandasIndexAdapter)) or
(isinstance(self._data, indexing.MemoryCachedArray) and
isinstance(self._data.array, np.ndarray)))

@@ -1189,6 +1189,7 @@ def func(self, other):
return self
return func


ops.inject_all_ops_and_reduce_methods(Variable)


@@ -1353,6 +1354,7 @@ def name(self):
def name(self, value):
raise AttributeError('cannot modify name of IndexVariable in-place')


# for backwards compatibility
Coordinate = utils.alias(IndexVariable, 'Coordinate')

127 changes: 101 additions & 26 deletions xarray/tests/test_dask.py
@@ -1,7 +1,9 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pickle
from textwrap import dedent
import numpy as np
import pandas as pd

@@ -130,6 +132,25 @@ def test_binary_op(self):
self.assertLazyAndIdentical(u + u, v + v)
self.assertLazyAndIdentical(u[0] + u, v[0] + v)

def test_repr(self):
expected = dedent("""\
<xarray.Variable (x: 4, y: 6)>
dask.array<array, shape=(4, 6), dtype=float64, chunksize=(2, 2)>""")
self.assertEqual(expected, repr(self.lazy_var))

def test_pickle(self):
# Test that pickling/unpickling does not convert the dask
# backend to numpy
a1 = Variable(['x'], build_dask_array('x'))
a1.compute()
self.assertFalse(a1._in_memory)
self.assertEquals(kernel_call_count, 1)
a2 = pickle.loads(pickle.dumps(a1))
self.assertEquals(kernel_call_count, 1)
self.assertVariableIdentical(a1, a2)
self.assertFalse(a1._in_memory)
self.assertFalse(a2._in_memory)

def test_reduce(self):
u = self.eager_var
v = self.lazy_var
@@ -341,47 +362,98 @@ def test_dot(self):
lazy = self.lazy_array.dot(self.lazy_array[0])
self.assertLazyAndAllClose(eager, lazy)

def test_variable_pickle(self):
# Test that pickling/unpickling does not convert the dask
# backend to numpy
a1 = Variable(['x'], build_dask_array())
a1.compute()
self.assertFalse(a1._in_memory)
self.assertEquals(kernel_call_count, 1)
a2 = pickle.loads(pickle.dumps(a1))
self.assertEquals(kernel_call_count, 1)
self.assertVariableIdentical(a1, a2)
self.assertFalse(a1._in_memory)
self.assertFalse(a2._in_memory)
def test_dataarray_repr(self):
# Test that __repr__ does not convert the dask backend to numpy
# for either the data variable or the non-index coords
data = build_dask_array('data')
nonindex_coord = build_dask_array('coord')
a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
expected = dedent("""\
<xarray.DataArray 'data' (x: 1)>
dask.array<data, shape=(1,), dtype=int64, chunksize=(1,)>
Coordinates:
y (x) int64 ...
Dimensions without coordinates: x""")
self.assertEqual(expected, repr(a))
self.assertEquals(kernel_call_count, 0)

def test_dataset_repr(self):
# Test that __repr__ does not convert the dask backend to numpy
# for either the data variables or the non-index coords
data = build_dask_array('data')
nonindex_coord = build_dask_array('coord')
ds = Dataset(data_vars={'a': ('x', data)}, coords={'y': ('x', nonindex_coord)})
expected = dedent("""\
<xarray.Dataset>
Dimensions: (x: 1)
Coordinates:
y (x) int64 ...
Dimensions without coordinates: x
Data variables:
a (x) int64 ...""")
Member (shoyer):

Something to consider: could we show an abbreviated version of the dask array repr instead of ...?

e.g., if the dask repr is dask.array<add, shape=(10,), dtype=float64, chunksize=(5,)>, maybe dask.array<add, chunksize=(5,)> or dask.array<add, shape=(10,), chunksize=(5,)>?
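One possible sketch of such an abbreviation (hypothetical short_dask_repr helper, not part of this PR):

def short_dask_repr(array):
    # Condense e.g. dask.array<add-1f2e..., shape=(10,), dtype=float64,
    # chunksize=(5,)> into dask.array<add, shape=(10,), chunksize=(5,)>.
    name = array.name.rsplit('-', 1)[0]  # drop the token hash, keep the op
    chunksize = tuple(chunks[0] for chunks in array.chunks)
    return 'dask.array<%s, shape=%s, chunksize=%s>' % (
        name, array.shape, chunksize)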

Contributor Author (crusaderky):

@shoyer fixed. Now it's the same as in Variable and in the DataArray data var.

self.assertEqual(expected, repr(ds))
self.assertEquals(kernel_call_count, 0)

def test_dataarray_pickle(self):
# Test that pickling/unpickling does not convert the dask
# backend to numpy
a1 = DataArray(build_dask_array())
# Test that pickling/unpickling does not convert the dask backend
# to numpy for either the data variable or the non-index coords
data = build_dask_array('data')
nonindex_coord = build_dask_array('coord')
a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
a1.compute()
self.assertFalse(a1._in_memory)
self.assertEquals(kernel_call_count, 1)
self.assertFalse(a1.coords['y']._in_memory)
self.assertEquals(kernel_call_count, 2)
a2 = pickle.loads(pickle.dumps(a1))
self.assertEquals(kernel_call_count, 1)
self.assertEquals(kernel_call_count, 2)
self.assertDataArrayIdentical(a1, a2)
self.assertFalse(a1._in_memory)
self.assertFalse(a2._in_memory)
self.assertFalse(a1.coords['y']._in_memory)
self.assertFalse(a2.coords['y']._in_memory)

def test_dataset_pickle(self):
ds1 = Dataset({'a': DataArray(build_dask_array())})
# Test that pickling/unpickling does not convert the dask backend
# to numpy for either the data variables or the non-index coords
data = build_dask_array('data')
nonindex_coord = build_dask_array('coord')
ds1 = Dataset(data_vars={'a': ('x', data)}, coords={'y': ('x', nonindex_coord)})
ds1.compute()
self.assertFalse(ds1['a']._in_memory)
self.assertEquals(kernel_call_count, 1)
self.assertFalse(ds1['y']._in_memory)
self.assertEquals(kernel_call_count, 2)
ds2 = pickle.loads(pickle.dumps(ds1))
self.assertEquals(kernel_call_count, 1)
self.assertEquals(kernel_call_count, 2)
self.assertDatasetIdentical(ds1, ds2)
self.assertFalse(ds1['a']._in_memory)
self.assertFalse(ds2['a']._in_memory)
self.assertFalse(ds1['y']._in_memory)
self.assertFalse(ds2['y']._in_memory)

def test_dataarray_getattr(self):
# ipython/jupyter does a long list of getattr() calls when trying to
# represent an object. Make sure we're not accidentally computing dask variables.
data = build_dask_array('data')
nonindex_coord = build_dask_array('coord')
a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
with suppress(AttributeError):
getattr(a, 'NOTEXIST')
self.assertEquals(kernel_call_count, 0)

def test_dataset_getattr(self):
# Test that a failed getattr() (as triggered by ipython/jupyter
# autocomplete) does not compute dask variables
data = build_dask_array('data')
nonindex_coord = build_dask_array('coord')
ds = Dataset(data_vars={'a': ('x', data)}, coords={'y': ('x', nonindex_coord)})
with suppress(AttributeError):
getattr(ds, 'NOTEXIST')
self.assertEquals(kernel_call_count, 0)

def test_values(self):
# Test that invoking the values property does not convert the dask
# backend to numpy
a = DataArray([1,2]).chunk()
a = DataArray([1, 2]).chunk()
self.assertFalse(a._in_memory)
self.assertEquals(a.values.tolist(), [1, 2])
self.assertFalse(a._in_memory)
@@ -395,17 +467,20 @@ def test_from_dask_variable(self):


kernel_call_count = 0


def kernel():
"""Dask kernel to test pickling/unpickling.
"""Dask kernel to test pickling/unpickling and __repr__.
Must be global to make it pickleable.
"""
global kernel_call_count
kernel_call_count += 1
return np.ones(1)
return np.ones(1, dtype=np.int64)


def build_dask_array():
def build_dask_array(name):
global kernel_call_count
kernel_call_count = 0
return dask.array.Array(
dask={('foo', 0): (kernel, )}, name='foo',
chunks=((1,),), dtype=int)
dask={(name, 0): (kernel, )}, name=name,
chunks=((1,),), dtype=np.int64)
2 changes: 2 additions & 0 deletions xarray/tests/test_dataarray.py
@@ -2596,6 +2596,7 @@ def da(request):
[0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7],
dims='time')


def test_rolling_iter(da):

rolling_obj = da.rolling(time=7)
@@ -2698,6 +2699,7 @@ def test_rolling_pandas_compat(da, center, window, min_periods):
np.testing.assert_allclose(s_rolling.index,
da_rolling['index'])


@pytest.mark.parametrize('da', (1, 2), indirect=True)
@pytest.mark.parametrize('center', (True, False))
@pytest.mark.parametrize('min_periods', (None, 1, 2, 3))