9
9
from sgkit import Fst , Tajimas_D , create_genotype_call_dataset , divergence , diversity
10
10
11
11
12
- def ts_to_dataset (ts , samples = None ):
12
+ def ts_to_dataset (ts , chunks = None , samples = None ):
13
13
"""
14
14
Convert the specified tskit tree sequence into an sgkit dataset.
15
15
Note this just generates haploids for now. With msprime 1.0, we'll be
@@ -26,22 +26,24 @@ def ts_to_dataset(ts, samples=None):
26
26
alleles = np .array (alleles ).astype ("S" )
27
27
genotypes = np .expand_dims (genotypes , axis = 2 )
28
28
29
- df = create_genotype_call_dataset (
29
+ ds = create_genotype_call_dataset (
30
30
variant_contig_names = ["1" ],
31
31
variant_contig = np .zeros (len (tables .sites ), dtype = int ),
32
32
variant_position = tables .sites .position .astype (int ),
33
33
variant_alleles = alleles ,
34
34
sample_id = np .array ([f"tsk_{ u } " for u in samples ]).astype ("U" ),
35
35
call_genotype = genotypes ,
36
36
)
37
- return df
37
+ if chunks is not None :
38
+ ds = ds .chunk (dict (zip (["variants" , "samples" ], chunks )))
39
+ return ds
38
40
39
41
40
42
@pytest .mark .parametrize ("size" , [2 , 3 , 10 , 100 ])
41
43
@pytest .mark .parametrize ("chunks" , [(- 1 , - 1 ), (10 , - 1 )])
42
44
def test_diversity (size , chunks ):
43
45
ts = msprime .simulate (size , length = 100 , mutation_rate = 0.05 , random_seed = 42 )
44
- ds = ts_to_dataset (ts ) # type: ignore[no-untyped-call]
46
+ ds = ts_to_dataset (ts , chunks ) # type: ignore[no-untyped-call]
45
47
ds = ds .chunk (dict (zip (["variants" , "samples" ], chunks )))
46
48
sample_cohorts = np .full_like (ts .samples (), 0 )
47
49
ds ["sample_cohort" ] = xr .DataArray (sample_cohorts , dims = "samples" )
@@ -56,10 +58,11 @@ def test_diversity(size, chunks):
56
58
"size, n_cohorts" ,
57
59
[(2 , 2 ), (3 , 2 ), (3 , 3 ), (10 , 2 ), (10 , 3 ), (10 , 4 ), (100 , 2 ), (100 , 3 ), (100 , 4 )],
58
60
)
59
- def test_divergence (size , n_cohorts ):
61
+ @pytest .mark .parametrize ("chunks" , [(- 1 , - 1 ), (10 , - 1 )])
62
+ def test_divergence (size , n_cohorts , chunks ):
60
63
ts = msprime .simulate (size , length = 100 , mutation_rate = 0.05 , random_seed = 42 )
61
64
subsets = np .array_split (ts .samples (), n_cohorts )
62
- ds = ts_to_dataset (ts ) # type: ignore[no-untyped-call]
65
+ ds = ts_to_dataset (ts , chunks ) # type: ignore[no-untyped-call]
63
66
sample_cohorts = np .concatenate (
64
67
[np .full_like (subset , i ) for i , subset in enumerate (subsets )]
65
68
)
@@ -84,12 +87,13 @@ def test_divergence(size, n_cohorts):
84
87
85
88
86
89
@pytest .mark .parametrize ("size" , [2 , 3 , 10 , 100 ])
87
- def test_Fst__Hudson (size ):
90
+ @pytest .mark .parametrize ("chunks" , [(- 1 , - 1 ), (10 , - 1 )])
91
+ def test_Fst__Hudson (size , chunks ):
88
92
# scikit-allel can only calculate Fst for pairs of cohorts (populations)
89
93
n_cohorts = 2
90
94
ts = msprime .simulate (size , length = 100 , mutation_rate = 0.05 , random_seed = 42 )
91
95
subsets = np .array_split (ts .samples (), n_cohorts )
92
- ds = ts_to_dataset (ts ) # type: ignore[no-untyped-call]
96
+ ds = ts_to_dataset (ts , chunks ) # type: ignore[no-untyped-call]
93
97
sample_cohorts = np .concatenate (
94
98
[np .full_like (subset , i ) for i , subset in enumerate (subsets )]
95
99
)
@@ -112,10 +116,11 @@ def test_Fst__Hudson(size):
112
116
"size, n_cohorts" ,
113
117
[(2 , 2 ), (3 , 2 ), (3 , 3 ), (10 , 2 ), (10 , 3 ), (10 , 4 ), (100 , 2 ), (100 , 3 ), (100 , 4 )],
114
118
)
115
- def test_Fst__Nei (size , n_cohorts ):
119
+ @pytest .mark .parametrize ("chunks" , [(- 1 , - 1 ), (10 , - 1 )])
120
+ def test_Fst__Nei (size , n_cohorts , chunks ):
116
121
ts = msprime .simulate (size , length = 100 , mutation_rate = 0.05 , random_seed = 42 )
117
122
subsets = np .array_split (ts .samples (), n_cohorts )
118
- ds = ts_to_dataset (ts ) # type: ignore[no-untyped-call]
123
+ ds = ts_to_dataset (ts , chunks ) # type: ignore[no-untyped-call]
119
124
sample_cohorts = np .concatenate (
120
125
[np .full_like (subset , i ) for i , subset in enumerate (subsets )]
121
126
)
@@ -142,9 +147,10 @@ def test_Fst__unknown_estimator():
142
147
143
148
144
149
@pytest .mark .parametrize ("size" , [2 , 3 , 10 , 100 ])
145
- def test_Tajimas_D (size ):
150
+ @pytest .mark .parametrize ("chunks" , [(- 1 , - 1 ), (10 , - 1 )])
151
+ def test_Tajimas_D (size , chunks ):
146
152
ts = msprime .simulate (size , length = 100 , mutation_rate = 0.05 , random_seed = 42 )
147
- ds = ts_to_dataset (ts ) # type: ignore[no-untyped-call]
153
+ ds = ts_to_dataset (ts , chunks ) # type: ignore[no-untyped-call]
148
154
sample_cohorts = np .full_like (ts .samples (), 0 )
149
155
ds ["sample_cohort" ] = xr .DataArray (sample_cohorts , dims = "samples" )
150
156
ds = Tajimas_D (ds )
0 commit comments