Skip to content

Commit acc76f6

Browse files
committed
Add create_genotype_dosage_dataset
1 parent 81b5d2a commit acc76f6

File tree

3 files changed

+91
-0
lines changed

3 files changed

+91
-0
lines changed

sgkit/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
DIM_SAMPLE,
55
DIM_VARIANT,
66
create_genotype_call_dataset,
7+
create_genotype_dosage_dataset,
78
)
89
from .stats.association import gwas_linear_regression
910

@@ -13,5 +14,6 @@
1314
"DIM_SAMPLE",
1415
"DIM_VARIANT",
1516
"create_genotype_call_dataset",
17+
"create_genotype_dosage_dataset",
1618
"gwas_linear_regression",
1719
]

sgkit/api.py

+57
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import Any, Dict, Hashable, List
22

3+
import numpy as np
34
import xarray as xr
45

56
from .utils import check_array_like
@@ -79,3 +80,59 @@ def create_genotype_call_dataset(
7980
data_vars["variant/id"] = ([DIM_VARIANT], variant_id)
8081
attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names}
8182
return xr.Dataset(data_vars=data_vars, attrs=attrs)
83+
84+
85+
def create_genotype_dosage_dataset(
86+
*,
87+
variant_contig_names: List[str],
88+
variant_contig: Any,
89+
variant_position: Any,
90+
variant_alleles: Any,
91+
sample_id: Any,
92+
call_dosage: Any,
93+
variant_id: Any = None,
94+
) -> xr.Dataset:
95+
"""Create a dataset of genotype calls.
96+
97+
Parameters
98+
----------
99+
variant_contig_names : list of str
100+
The contig names.
101+
variant_contig : array_like, int
102+
The (index of the) contig for each variant.
103+
variant_position : array_like, int
104+
The reference position of the variant.
105+
variant_alleles : array_like, S1
106+
The possible alleles for the variant.
107+
sample_id : array_like, str
108+
The unique identifier of the sample.
109+
call_dosage : array_like, float
110+
Dosages, encoded as floats, with NaN indicating a
111+
missing value.
112+
variant_id: array_like, str, optional
113+
The unique identifier of the variant.
114+
115+
Returns
116+
-------
117+
xr.Dataset
118+
The dataset of genotype calls.
119+
120+
"""
121+
check_array_like(variant_contig, kind="i", ndim=1)
122+
check_array_like(variant_position, kind="i", ndim=1)
123+
check_array_like(variant_alleles, kind="S", ndim=2)
124+
check_array_like(sample_id, kind="U", ndim=1)
125+
check_array_like(call_dosage, kind="f", ndim=2)
126+
data_vars: Dict[Hashable, Any] = {
127+
"variant/contig": ([DIM_VARIANT], variant_contig),
128+
"variant/position": ([DIM_VARIANT], variant_position),
129+
"variant/alleles": ([DIM_VARIANT, DIM_ALLELE], variant_alleles),
130+
"sample/id": ([DIM_SAMPLE], sample_id),
131+
"call/dosage": ([DIM_VARIANT, DIM_SAMPLE], call_dosage),
132+
"call/dosage_mask": ([DIM_VARIANT, DIM_SAMPLE], np.isnan(call_dosage),),
133+
}
134+
if variant_id is not None:
135+
check_array_like(variant_id, kind="U", ndim=1)
136+
data_vars["variant/id"] = ([DIM_VARIANT], variant_id)
137+
attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names}
138+
return xr.Dataset(data_vars=data_vars, attrs=attrs)

sgkit/tests/test_api.py

+32
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
DIM_SAMPLE,
88
DIM_VARIANT,
99
create_genotype_call_dataset,
10+
create_genotype_dosage_dataset,
1011
)
1112

1213

@@ -48,3 +49,34 @@ def test_create_genotype_call_dataset():
4849
assert_array_equal(ds["call/genotype"], call_genotype)
4950
assert_array_equal(ds["call/genotype_mask"], call_genotype < 0)
5051
assert_array_equal(ds["call/genotype_phased"], call_genotype_phased)
52+
53+
54+
def test_create_genotype_dosage_dataset():
55+
variant_contig_names = ["chr1"]
56+
variant_contig = np.array([0, 0], dtype="i1")
57+
variant_position = np.array([1000, 2000], dtype="i4")
58+
variant_alleles = np.array([["A", "C"], ["G", "A"]], dtype="S1")
59+
variant_id = np.array(["rs1", "rs2"], dtype=str)
60+
sample_id = np.array(["sample_1", "sample_2", "sample_3"], dtype=str)
61+
call_dosage = np.array([[0.8, 0.9, 1.0], [1.0, 1.1, 1.2]], dtype="f4")
62+
ds = create_genotype_dosage_dataset(
63+
variant_contig_names=variant_contig_names,
64+
variant_contig=variant_contig,
65+
variant_position=variant_position,
66+
variant_alleles=variant_alleles,
67+
sample_id=sample_id,
68+
call_dosage=call_dosage,
69+
variant_id=variant_id,
70+
)
71+
72+
assert DIM_VARIANT in ds.dims
73+
assert DIM_SAMPLE in ds.dims
74+
75+
assert ds.attrs["contigs"] == variant_contig_names
76+
assert_array_equal(ds["variant/contig"], variant_contig)
77+
assert_array_equal(ds["variant/position"], variant_position)
78+
assert_array_equal(ds["variant/alleles"], variant_alleles)
79+
assert_array_equal(ds["variant/id"], variant_id)
80+
assert_array_equal(ds["sample/id"], sample_id)
81+
assert_array_equal(ds["call/dosage"], call_dosage)
82+
assert_array_equal(ds["call/dosage_mask"], np.isnan(call_dosage))

0 commit comments

Comments
 (0)