-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Avoid computing dask variables on __repr__ and __getattr__ #1532
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
37d9135
885d5cd
df9b57b
426b12c
5d9062e
555d2ca
1633a4a
e626a9b
2717ae9
8c72da0
4e2d081
05e02be
9fc6751
963e26a
9bc1c74
7b1b265
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,9 @@ | ||
from __future__ import absolute_import | ||
from __future__ import division | ||
from __future__ import print_function | ||
|
||
import pickle | ||
from textwrap import dedent | ||
import numpy as np | ||
import pandas as pd | ||
|
||
|
@@ -130,6 +132,25 @@ def test_binary_op(self): | |
self.assertLazyAndIdentical(u + u, v + v) | ||
self.assertLazyAndIdentical(u[0] + u, v[0] + v) | ||
|
||
def test_repr(self):
    """repr() of a dask-backed Variable must show a lazy summary line
    rather than computing the underlying dask array.
    """
    actual = repr(self.lazy_var)
    expected = dedent("""\
    <xarray.Variable (x: 4, y: 6)>
    dask.array<array, shape=(4, 6), dtype=float64, chunksize=(2, 2)>""")
    self.assertEqual(expected, actual)
|
||
def test_pickle(self):
    """Pickling/unpickling must not convert the dask backend to numpy."""
    a1 = Variable(['x'], build_dask_array('x'))
    # .compute() returns a computed copy; a1 itself must stay lazy
    a1.compute()
    self.assertFalse(a1._in_memory)
    self.assertEqual(kernel_call_count, 1)
    a2 = pickle.loads(pickle.dumps(a1))
    # the pickle round-trip must not have triggered another compute
    self.assertEqual(kernel_call_count, 1)
    self.assertVariableIdentical(a1, a2)
    self.assertFalse(a1._in_memory)
    self.assertFalse(a2._in_memory)
|
||
def test_reduce(self): | ||
u = self.eager_var | ||
v = self.lazy_var | ||
|
@@ -341,47 +362,98 @@ def test_dot(self): | |
lazy = self.lazy_array.dot(self.lazy_array[0]) | ||
self.assertLazyAndAllClose(eager, lazy) | ||
|
||
def test_variable_pickle(self):
    # Test that pickling/unpickling does not convert the dask
    # backend to numpy
    # NOTE: build_dask_array now requires a name argument (see helper
    # at the bottom of this file) — pass one explicitly.
    a1 = Variable(['x'], build_dask_array('x'))
    a1.compute()  # returns a computed copy; a1 itself stays lazy
    self.assertFalse(a1._in_memory)
    self.assertEqual(kernel_call_count, 1)
    a2 = pickle.loads(pickle.dumps(a1))
    self.assertEqual(kernel_call_count, 1)
    self.assertVariableIdentical(a1, a2)
    self.assertFalse(a1._in_memory)
    self.assertFalse(a2._in_memory)
def test_dataarray_repr(self):
    # Test that __repr__ converts the dask backend to numpy
    # in neither the data variable nor the non-index coords
    data = build_dask_array('data')
    nonindex_coord = build_dask_array('coord')
    a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
    expected = dedent("""\
    <xarray.DataArray 'data' (x: 1)>
    dask.array<data, shape=(1,), dtype=int64, chunksize=(1,)>
    Coordinates:
    y (x) int64 ...
    Dimensions without coordinates: x""")
    self.assertEqual(expected, repr(a))
    # zero kernel invocations: nothing was computed just to build the repr
    self.assertEqual(kernel_call_count, 0)
|
||
def test_dataset_repr(self):
    # Test that __repr__ converts the dask backend to numpy
    # in neither the data variables nor the non-index coords
    # (index coords are loaded eagerly on __init__, so only
    # non-index coords are interesting here).
    data = build_dask_array('data')
    nonindex_coord = build_dask_array('coord')
    ds = Dataset(data_vars={'a': ('x', data)},
                 coords={'y': ('x', nonindex_coord)})
    expected = dedent("""\
    <xarray.Dataset>
    Dimensions: (x: 1)
    Coordinates:
    y (x) int64 ...
    Dimensions without coordinates: x
    Data variables:
    a (x) int64 ...""")
    self.assertEqual(expected, repr(ds))
    # zero kernel invocations: nothing was computed just to build the repr
    self.assertEqual(kernel_call_count, 0)
|
||
def test_dataarray_pickle(self):
    # Test that pickling/unpickling converts the dask backend
    # to numpy in neither the data variable nor the non-index coords
    data = build_dask_array('data')
    nonindex_coord = build_dask_array('coord')
    a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
    a1.compute()  # returns a computed copy; a1 itself stays lazy
    self.assertFalse(a1._in_memory)
    self.assertFalse(a1.coords['y']._in_memory)
    # two kernel calls: one for the data var, one for the coord
    self.assertEqual(kernel_call_count, 2)
    a2 = pickle.loads(pickle.dumps(a1))
    # the pickle round-trip must not have triggered more computes
    self.assertEqual(kernel_call_count, 2)
    self.assertDataArrayIdentical(a1, a2)
    self.assertFalse(a1._in_memory)
    self.assertFalse(a2._in_memory)
    self.assertFalse(a1.coords['y']._in_memory)
    self.assertFalse(a2.coords['y']._in_memory)
|
||
def test_dataset_pickle(self):
    # Test that pickling/unpickling converts the dask backend
    # to numpy in neither the data variables nor the non-index coords
    data = build_dask_array('data')
    nonindex_coord = build_dask_array('coord')
    ds1 = Dataset(data_vars={'a': ('x', data)},
                  coords={'y': ('x', nonindex_coord)})
    ds1.compute()  # returns a computed copy; ds1 itself stays lazy
    self.assertFalse(ds1['a']._in_memory)
    self.assertFalse(ds1['y']._in_memory)
    # two kernel calls: one for the data var, one for the coord
    self.assertEqual(kernel_call_count, 2)
    ds2 = pickle.loads(pickle.dumps(ds1))
    # the pickle round-trip must not have triggered more computes
    self.assertEqual(kernel_call_count, 2)
    self.assertDatasetIdentical(ds1, ds2)
    self.assertFalse(ds1['a']._in_memory)
    self.assertFalse(ds2['a']._in_memory)
    self.assertFalse(ds1['y']._in_memory)
    self.assertFalse(ds2['y']._in_memory)
|
||
def test_dataarray_getattr(self):
    # ipython/jupyter does a long list of getattr() calls when trying to
    # represent an object. Make sure we're not accidentally computing
    # dask variables along the way.
    data = build_dask_array('data')
    nonindex_coord = build_dask_array('coord')
    a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
    with suppress(AttributeError):
        getattr(a, 'NOTEXIST')
    self.assertEqual(kernel_call_count, 0)
|
||
def test_dataset_getattr(self):
    # Test that a failed getattr() (e.g. during ipython/jupyter attribute
    # probing) computes neither the data variables nor the non-index coords
    data = build_dask_array('data')
    nonindex_coord = build_dask_array('coord')
    ds = Dataset(data_vars={'a': ('x', data)},
                 coords={'y': ('x', nonindex_coord)})
    with suppress(AttributeError):
        getattr(ds, 'NOTEXIST')
    self.assertEqual(kernel_call_count, 0)
|
||
def test_values(self):
    # Test that invoking the values property does not convert the dask
    # backend to numpy on the DataArray itself (it computes a copy).
    a = DataArray([1, 2]).chunk()
    self.assertFalse(a._in_memory)
    self.assertEqual(a.values.tolist(), [1, 2])
    self.assertFalse(a._in_memory)
|
@@ -395,17 +467,20 @@ def test_from_dask_variable(self): | |
|
||
|
||
# Global counter incremented every time ``kernel`` runs, so tests can
# assert exactly how many times dask actually computed something.
kernel_call_count = 0


def kernel():
    """Dask kernel to test pickling/unpickling and __repr__.

    Must be global to make it pickleable.

    Returns a one-element int64 array so reprs have a stable dtype.
    """
    global kernel_call_count
    kernel_call_count += 1
    return np.ones(1, dtype=np.int64)
|
||
|
||
def build_dask_array(name='data'):
    """Build a one-chunk dask array backed by ``kernel`` and reset the
    global ``kernel_call_count``.

    Parameters
    ----------
    name : str, optional
        Graph key/name for the array. Tests that create several
        independent lazy arrays (data var + coord) must pass distinct
        names; defaults to ``'data'`` for backward compatibility with
        no-argument call sites.
    """
    global kernel_call_count
    kernel_call_count = 0
    return dask.array.Array(
        dask={(name, 0): (kernel, )}, name=name,
        chunks=((1,),), dtype=np.int64)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Our current heuristic uses the
_not_remote()
helper function, so it doesn't display arrays loaded over a network (via opendap), which can often be quite slow. But it does display a summary of values from netCDF files on disk, which I do think is generally helpful and for which I haven't noticed any performance issues.Based on the current definition of
_in_memory
, we wouldn't display any of these arrays:xarray/xarray/core/variable.py
Lines 285 to 289 in 4a15cfa
So instead of using `_in_memory`, I would suggest something like `_not_remote(var) and not isinstance(var._data, dask_array_type)` as the condition for showing values.

Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@shoyer loading a NetCDF variable from disk every time you do __repr__ is a terrible idea if that variable has been compressed without chunking. If the variable is a single block of 100MB of zlib-compressed data, you will have to read it and decompress it every time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@shoyer also, your netcdf array might be sitting on a network file system on the opposite side of a narrowband VPN.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's certainly possible, but in my experience very few people writes 100MB chunks -- those are very large.
Let's summarize our options:
Dataset.__repr__
Reasons to show data from disk in
__repr__
:preview()
or load()
command to see what's in a Dataset. You can simply print it at a console.Reasons not to show data from disk in
__repr__
:Maybe we should solicit a few more opinions here before we change the default behavior?
Another possibility is to try loading data in a separate thread and time out if it takes too long (say, more than 0.5 seconds), but that might open up its own set of performance issues (it's not easy to kill a thread, short of terminating a process).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think my vote would be to only print a preview of data that is in memory. For my uses, I typically have fill values in the first 10-20 data points so the previous
__repr__
didn't give me any information.@pydata/xarray - anyone else have thoughts on this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@shoyer - do we have results from your google poll on this issue yet?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds like I was wrong -- the consensus is pretty clear that we should go ahead with this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure this sample size is going to give us statistically significant results but I'm glad to see @delgadom and I are in agreement.
@crusaderky - are you up for implementing this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the current implementation (in this PR) is actually already correct.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep - data is eagerly loaded from disk only for index coords on __init__ now.