13
13
import xarray as xr
14
14
from xarray import Variable , DataArray , Dataset
15
15
import xarray .ufuncs as xu
16
- from xarray .core .pycompat import suppress
17
- from . import TestCase
16
+ from xarray .core .pycompat import suppress , OrderedDict
17
+ from . import TestCase , assert_frame_equal
18
18
19
19
from xarray .tests import mock
20
20
21
21
dask = pytest .importorskip ('dask' )
22
22
import dask .array as da
23
+ import dask .dataframe as dd
23
24
24
25
25
26
class DaskTestCase (TestCase ):
@@ -29,9 +30,9 @@ def assertLazyAnd(self, expected, actual, test):
29
30
if isinstance (actual , Dataset ):
30
31
for k , v in actual .variables .items ():
31
32
if k in actual .dims :
32
- self .assertIsInstance (var .data , np .ndarray )
33
+ self .assertIsInstance (v .data , np .ndarray )
33
34
else :
34
- self .assertIsInstance (var .data , da .Array )
35
+ self .assertIsInstance (v .data , da .Array )
35
36
elif isinstance (actual , DataArray ):
36
37
self .assertIsInstance (actual .data , da .Array )
37
38
for k , v in actual .coords .items ():
@@ -546,6 +547,100 @@ def test_from_dask_variable(self):
546
547
coords = {'x' : range (4 )}, name = 'foo' )
547
548
self .assertLazyAndIdentical (self .lazy_array , a )
548
549
550
+ def test_to_dask_dataframe (self ):
551
+ # Test conversion of Datasets to dask DataFrames
552
+ x = da .from_array (np .random .randn (10 ), chunks = 4 )
553
+ y = np .arange (10 , dtype = 'uint8' )
554
+ t = list ('abcdefghij' )
555
+
556
+ ds = Dataset (OrderedDict ([('a' , ('t' , x )),
557
+ ('b' , ('t' , y )),
558
+ ('t' , ('t' , t ))]))
559
+
560
+ expected_pd = pd .DataFrame ({'a' : x ,
561
+ 'b' : y },
562
+ index = pd .Index (t , name = 't' ))
563
+
564
+ # test if 1-D index is correctly set up
565
+ expected = dd .from_pandas (expected_pd , chunksize = 4 )
566
+ actual = ds .to_dask_dataframe (set_index = True )
567
+ # test if we have dask dataframes
568
+ self .assertIsInstance (actual , dd .DataFrame )
569
+
570
+ # use assert_frame_equal to check that the computed dataframes are equivalent
571
+ assert_frame_equal (expected .compute (), actual .compute ())
572
+
573
+ # test if no index is given
574
+ expected = dd .from_pandas (expected_pd .reset_index (drop = False ),
575
+ chunksize = 4 )
576
+
577
+ actual = ds .to_dask_dataframe (set_index = False )
578
+
579
+ self .assertIsInstance (actual , dd .DataFrame )
580
+ assert_frame_equal (expected .compute (), actual .compute ())
581
+
582
+ def test_to_dask_dataframe_2D (self ):
583
+ # Test if 2-D dataset is supplied
584
+ w = da .from_array (np .random .randn (2 , 3 ), chunks = (1 , 2 ))
585
+ ds = Dataset ({'w' : (('x' , 'y' ), w )})
586
+ ds ['x' ] = ('x' , np .array ([0 , 1 ], np .int64 ))
587
+ ds ['y' ] = ('y' , list ('abc' ))
588
+
589
+ # dask dataframes do not (yet) support multiindex,
590
+ # but when they do, this would be the expected index:
591
+ exp_index = pd .MultiIndex .from_arrays (
592
+ [[0 , 0 , 0 , 1 , 1 , 1 ], ['a' , 'b' , 'c' , 'a' , 'b' , 'c' ]],
593
+ names = ['x' , 'y' ])
594
+ expected = pd .DataFrame ({'w' : w .reshape (- 1 )},
595
+ index = exp_index )
596
+ # so for now, reset the index
597
+ expected = expected .reset_index (drop = False )
598
+
599
+ actual = ds .to_dask_dataframe (set_index = False )
600
+
601
+ self .assertIsInstance (actual , dd .DataFrame )
602
+ assert_frame_equal (expected , actual .compute ())
603
+
604
+ def test_to_dask_dataframe_coordinates (self ):
605
+ # Test if coordinate is also a dask array
606
+ x = da .from_array (np .random .randn (10 ), chunks = 4 )
607
+ t = da .from_array (np .arange (10 )* 2 , chunks = 4 )
608
+
609
+ ds = Dataset (OrderedDict ([('a' , ('t' , x )),
610
+ ('t' , ('t' , t ))]))
611
+
612
+ expected_pd = pd .DataFrame ({'a' : x },
613
+ index = pd .Index (t , name = 't' ))
614
+ expected = dd .from_pandas (expected_pd , chunksize = 4 )
615
+ actual = ds .to_dask_dataframe (set_index = True )
616
+ self .assertIsInstance (actual , dd .DataFrame )
617
+ assert_frame_equal (expected .compute (), actual .compute ())
618
+
619
+ def test_to_dask_dataframe_not_daskarray (self ):
620
+ # Test if DataArray is not a dask array
621
+ x = np .random .randn (10 )
622
+ y = np .arange (10 , dtype = 'uint8' )
623
+ t = list ('abcdefghij' )
624
+
625
+ ds = Dataset (OrderedDict ([('a' , ('t' , x )),
626
+ ('b' , ('t' , y )),
627
+ ('t' , ('t' , t ))]))
628
+
629
+ expected = pd .DataFrame ({'a' : x , 'b' : y },
630
+ index = pd .Index (t , name = 't' ))
631
+
632
+ actual = ds .to_dask_dataframe (set_index = True )
633
+ self .assertIsInstance (actual , dd .DataFrame )
634
+ assert_frame_equal (expected , actual .compute ())
635
+
636
+ def test_to_dask_dataframe_no_coordinate (self ):
637
+ # Test if Dataset has a dimension without coordinates
638
+ x = da .from_array (np .random .randn (10 ), chunks = 4 )
639
+ ds = Dataset ({'x' : ('dim_0' , x )})
640
+ expected = pd .DataFrame ({'x' : x .compute ()})
641
+ actual = ds .to_dask_dataframe (set_index = True )
642
+ assert_frame_equal (expected , actual .compute ())
643
+
549
644
550
645
@pytest .mark .parametrize ("method" , ['load' , 'compute' ])
551
646
def test_dask_kwargs_variable (method ):
0 commit comments