|
1 | 1 | from typing import Any, Dict, Hashable, List
|
2 | 2 |
|
| 3 | +import numpy as np |
3 | 4 | import xarray as xr
|
4 | 5 |
|
5 | 6 | from .utils import check_array_like
|
@@ -79,3 +80,59 @@ def create_genotype_call_dataset(
|
79 | 80 | data_vars["variant/id"] = ([DIM_VARIANT], variant_id)
|
80 | 81 | attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names}
|
81 | 82 | return xr.Dataset(data_vars=data_vars, attrs=attrs)
|
| 83 | + |
| 84 | + |
def create_genotype_dosage_dataset(
    *,
    variant_contig_names: List[str],
    variant_contig: Any,
    variant_position: Any,
    variant_alleles: Any,
    sample_id: Any,
    call_dosage: Any,
    variant_id: Any = None,
) -> xr.Dataset:
    """Create a dataset of genotype dosages.

    Every array argument is passed through ``check_array_like`` with the
    dtype kind and number of dimensions listed below before the dataset
    is assembled.

    Parameters
    ----------
    variant_contig_names : list of str
        The contig names; stored as the ``contigs`` dataset attribute.
    variant_contig : array_like, int
        The (index of the) contig for each variant. One-dimensional.
    variant_position : array_like, int
        The reference position of each variant. One-dimensional.
    variant_alleles : array_like, S1
        The possible alleles for each variant. Two-dimensional
        (variants x alleles).
    sample_id : array_like, str
        The unique identifier of each sample. One-dimensional.
    call_dosage : array_like, float
        Dosages encoded as floats, with NaN marking a missing value.
        Two-dimensional (variants x samples).
    variant_id : array_like, str, optional
        The unique identifier of each variant. One-dimensional. Only
        checked and stored when it is not None.

    Returns
    -------
    xr.Dataset
        The dataset of genotype dosages. Holds ``variant/contig``,
        ``variant/position``, ``variant/alleles``, ``sample/id``,
        ``call/dosage``, the derived ``call/dosage_mask`` (the result of
        ``np.isnan(call_dosage)``) and, when supplied, ``variant/id``.

    """
    # Validate the mandatory inputs one by one, in a fixed order, so the
    # first offending argument is the one that gets reported.
    mandatory_checks = (
        (variant_contig, "i", 1),
        (variant_position, "i", 1),
        (variant_alleles, "S", 2),
        (sample_id, "U", 1),
        (call_dosage, "f", 2),
    )
    for values, dtype_kind, n_dims in mandatory_checks:
        check_array_like(values, kind=dtype_kind, ndim=n_dims)

    # Missing dosages are encoded as NaN; expose them as a boolean mask.
    dosage_is_missing = np.isnan(call_dosage)

    per_variant = [DIM_VARIANT]
    per_call = [DIM_VARIANT, DIM_SAMPLE]
    variables: Dict[Hashable, Any] = {
        "variant/contig": (per_variant, variant_contig),
        "variant/position": (per_variant, variant_position),
        "variant/alleles": ([DIM_VARIANT, DIM_ALLELE], variant_alleles),
        "sample/id": ([DIM_SAMPLE], sample_id),
        "call/dosage": (per_call, call_dosage),
        "call/dosage_mask": (per_call, dosage_is_missing),
    }

    # The variant identifier is optional: validate and attach only if given.
    if variant_id is not None:
        check_array_like(variant_id, kind="U", ndim=1)
        variables["variant/id"] = ([DIM_VARIANT], variant_id)

    return xr.Dataset(
        data_vars=variables,
        attrs={"contigs": variant_contig_names},
    )
0 commit comments