|
8 | 8 | from numpy.testing import assert_allclose, assert_array_equal
|
9 | 9 |
|
10 | 10 | from sgkit import load_dataset
|
11 |
| -from sgkit.io.vcf import partition_into_regions, vcf_to_zarr |
| 11 | +from sgkit.io.vcf import ( |
| 12 | + MaxAltAllelesExceededWarning, |
| 13 | + partition_into_regions, |
| 14 | + vcf_to_zarr, |
| 15 | +) |
12 | 16 |
|
13 | 17 | from .utils import path_for_test
|
14 | 18 |
|
@@ -98,30 +102,35 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
|
98 | 102 | path = path_for_test(shared_datadir, "sample.vcf.gz", is_path)
|
99 | 103 | output = tmp_path.joinpath("vcf.zarr").as_posix()
|
100 | 104 |
|
101 |
| - vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1) |
102 |
| - ds = xr.open_zarr(output) |
| 105 | + with pytest.warns(MaxAltAllelesExceededWarning): |
| 106 | + vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1) |
| 107 | + ds = xr.open_zarr(output) |
103 | 108 |
|
104 |
| - # extra alt alleles are silently dropped |
105 |
| - assert_array_equal( |
106 |
| - ds["variant_allele"], |
107 |
| - [ |
108 |
| - ["A", "C"], |
109 |
| - ["A", "G"], |
110 |
| - ["G", "A"], |
111 |
| - ["T", "A"], |
112 |
| - ["A", "G"], |
113 |
| - ["T", ""], |
114 |
| - ["G", "GA"], |
115 |
| - ["T", ""], |
116 |
| - ["AC", "A"], |
117 |
| - ], |
118 |
| - ) |
| 109 | + # extra alt alleles are dropped |
| 110 | + assert_array_equal( |
| 111 | + ds["variant_allele"], |
| 112 | + [ |
| 113 | + ["A", "C"], |
| 114 | + ["A", "G"], |
| 115 | + ["G", "A"], |
| 116 | + ["T", "A"], |
| 117 | + ["A", "G"], |
| 118 | + ["T", ""], |
| 119 | + ["G", "GA"], |
| 120 | + ["T", ""], |
| 121 | + ["AC", "A"], |
| 122 | + ], |
| 123 | + ) |
| 124 | + |
| 125 | + # the maximum number of alt alleles actually seen is stored as an attribute |
| 126 | + assert ds.attrs["max_alt_alleles_seen"] == 3 |
119 | 127 |
|
120 | 128 |
|
121 | 129 | @pytest.mark.parametrize(
|
122 | 130 | "is_path",
|
123 | 131 | [True, False],
|
124 | 132 | )
|
| 133 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
125 | 134 | def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, tmp_path):
|
126 | 135 | path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
|
127 | 136 | output = tmp_path.joinpath("vcf.zarr").as_posix()
|
@@ -159,6 +168,7 @@ def test_vcf_to_zarr__plain_vcf_with_no_index(shared_datadir, tmp_path):
|
159 | 168 | "is_path",
|
160 | 169 | [True, False],
|
161 | 170 | )
|
| 171 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
162 | 172 | def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path):
|
163 | 173 | path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
|
164 | 174 | output: MutableMapping[str, bytes] = {}
|
@@ -217,6 +227,7 @@ def test_vcf_to_zarr__compressor_and_filters(shared_datadir, is_path, tmp_path):
|
217 | 227 | "is_path",
|
218 | 228 | [True, False],
|
219 | 229 | )
|
| 230 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
220 | 231 | def test_vcf_to_zarr__parallel(shared_datadir, is_path, tmp_path):
|
221 | 232 | path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
|
222 | 233 | output = tmp_path.joinpath("vcf_concat.zarr").as_posix()
|
@@ -266,6 +277,7 @@ def test_vcf_to_zarr__empty_region(shared_datadir, is_path, tmp_path):
|
266 | 277 | "is_path",
|
267 | 278 | [False],
|
268 | 279 | )
|
| 280 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
269 | 281 | def test_vcf_to_zarr__parallel_temp_chunk_length(shared_datadir, is_path, tmp_path):
|
270 | 282 | path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
|
271 | 283 | output = tmp_path.joinpath("vcf_concat.zarr").as_posix()
|
@@ -354,6 +366,7 @@ def test_vcf_to_zarr__parallel_partitioned_by_size(shared_datadir, is_path, tmp_
|
354 | 366 | "is_path",
|
355 | 367 | [True, False],
|
356 | 368 | )
|
| 369 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
357 | 370 | def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path):
|
358 | 371 | paths = [
|
359 | 372 | path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
|
@@ -381,6 +394,7 @@ def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path):
|
381 | 394 | "is_path",
|
382 | 395 | [True, False],
|
383 | 396 | )
|
| 397 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
384 | 398 | def test_vcf_to_zarr__multiple_partitioned(shared_datadir, is_path, tmp_path):
|
385 | 399 | paths = [
|
386 | 400 | path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
|
@@ -410,6 +424,7 @@ def test_vcf_to_zarr__multiple_partitioned(shared_datadir, is_path, tmp_path):
|
410 | 424 | "is_path",
|
411 | 425 | [True, False],
|
412 | 426 | )
|
| 427 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
413 | 428 | def test_vcf_to_zarr__multiple_partitioned_by_size(shared_datadir, is_path, tmp_path):
|
414 | 429 | paths = [
|
415 | 430 | path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
|
@@ -456,6 +471,31 @@ def test_vcf_to_zarr__mutiple_partitioned_invalid_regions(
|
456 | 471 | vcf_to_zarr(paths, output, regions=regions, chunk_length=5_000)
|
457 | 472 |
|
458 | 473 |
|
| 474 | +@pytest.mark.parametrize( |
| 475 | + "is_path", |
| 476 | + [True, False], |
| 477 | +) |
| 478 | +def test_vcf_to_zarr__multiple_max_alt_alleles(shared_datadir, is_path, tmp_path): |
| 479 | + paths = [ |
| 480 | + path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), |
| 481 | + path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), |
| 482 | + ] |
| 483 | + output = tmp_path.joinpath("vcf_concat.zarr").as_posix() |
| 484 | + |
| 485 | + with pytest.warns(MaxAltAllelesExceededWarning): |
| 486 | + vcf_to_zarr( |
| 487 | + paths, |
| 488 | + output, |
| 489 | + target_part_size="40KB", |
| 490 | + chunk_length=5_000, |
| 491 | + max_alt_alleles=1, |
| 492 | + ) |
| 493 | + ds = xr.open_zarr(output) |
| 494 | + |
| 495 | + # the maximum number of alt alleles actually seen is stored as an attribute |
| 496 | + assert ds.attrs["max_alt_alleles_seen"] == 7 |
| 497 | + |
| 498 | + |
459 | 499 | @pytest.mark.parametrize(
|
460 | 500 | "ploidy,mixed_ploidy,truncate_calls,regions",
|
461 | 501 | [
|
@@ -647,6 +687,7 @@ def test_vcf_to_zarr__fields(shared_datadir, tmp_path):
|
647 | 687 | assert ds["call_DP"].attrs["comment"] == "Read Depth"
|
648 | 688 |
|
649 | 689 |
|
| 690 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
650 | 691 | def test_vcf_to_zarr__parallel_with_fields(shared_datadir, tmp_path):
|
651 | 692 | path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz")
|
652 | 693 | output = tmp_path.joinpath("vcf.zarr").as_posix()
|
@@ -703,6 +744,7 @@ def test_vcf_to_zarr__field_defs(shared_datadir, tmp_path):
|
703 | 744 | assert "comment" not in ds["variant_DP"].attrs
|
704 | 745 |
|
705 | 746 |
|
| 747 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
706 | 748 | def test_vcf_to_zarr__field_number_A(shared_datadir, tmp_path):
|
707 | 749 | path = path_for_test(shared_datadir, "sample.vcf.gz")
|
708 | 750 | output = tmp_path.joinpath("vcf.zarr").as_posix()
|
@@ -736,6 +778,7 @@ def test_vcf_to_zarr__field_number_A(shared_datadir, tmp_path):
|
736 | 778 | )
|
737 | 779 |
|
738 | 780 |
|
| 781 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
739 | 782 | def test_vcf_to_zarr__field_number_R(shared_datadir, tmp_path):
|
740 | 783 | path = path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz")
|
741 | 784 | output = tmp_path.joinpath("vcf.zarr").as_posix()
|
@@ -768,6 +811,7 @@ def test_vcf_to_zarr__field_number_R(shared_datadir, tmp_path):
|
768 | 811 | )
|
769 | 812 |
|
770 | 813 |
|
| 814 | +@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") |
771 | 815 | def test_vcf_to_zarr__field_number_G(shared_datadir, tmp_path):
|
772 | 816 | path = path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz")
|
773 | 817 | output = tmp_path.joinpath("vcf.zarr").as_posix()
|
|
0 commit comments