diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..10b4dd1
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,24 @@
+name: Check code standards and run tests
+
+on: [push, pull_request]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt -r requirements-dev.txt
+    - name: Run pre-commit
+      uses: pre-commit/action@v2.0.0
+    - name: Test with pytest and coverage
+      run: |
+        pytest -v --cov=sgkit_plink --cov-report term-missing
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..6e3fc84
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,27 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.5.0
+    hooks:
+      - id: check-merge-conflict
+      - id: debug-statements
+      - id: mixed-line-ending
+      - id: check-case-conflict
+      - id: check-yaml
+  - repo: https://github.com/asottile/seed-isort-config
+    rev: v2.2.0
+    hooks:
+      - id: seed-isort-config
+  - repo: https://github.com/timothycrosley/isort
+    rev: 4.3.21-2 # pick the isort version you'd like to use from https://github.com/timothycrosley/isort/releases
+    hooks:
+      - id: isort
+  - repo: https://github.com/python/black
+    rev: 19.10b0
+    hooks:
+      - id: black
+        language_version: python3
+  - repo: https://gitlab.com/pycqa/flake8
+    rev: 3.7.9
+    hooks:
+      - id: flake8
+        language_version: python3
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..7d7c36c
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,8 @@
+black
+flake8
+isort
+mypy
+pre-commit
+pytest
+pytest-cov
+pytest-datadir
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c2e0d4c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+xarray
+pysnptools>=0.4.19
+-e git://github.com/pystatgen/sgkit.git#egg=sgkit
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..9a875f7
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,31 @@
+[coverage:report]
+fail_under = 100
+
+[flake8]
+ignore =
+    # whitespace before ':' - doesn't work well with black
+    E203
+    # module level import not at top of file
+    E402
+    # line too long - let black worry about that
+    E501
+    # do not assign a lambda expression, use a def
+    E731
+    # line break before binary operator
+    W503
+
+[isort]
+default_section = THIRDPARTY
+known_first_party = sgkit
+known_third_party = dask,numpy,pysnptools,pytest,xarray
+multi_line_output = 3
+include_trailing_comma = True
+force_grid_wrap = 0
+use_parentheses = True
+line_length = 88
+
+[mypy-numpy.*]
+ignore_missing_imports = True
+
+[mypy-sgkit_plink.tests.*]
+disallow_untyped_defs = False
diff --git a/sgkit_plink/__init__.py b/sgkit_plink/__init__.py
new file mode 100644
index 0000000..32e558e
--- /dev/null
+++ b/sgkit_plink/__init__.py
@@ -0,0 +1,3 @@
+from .pysnptools import read_plink  # noqa: F401
+
+__all__ = ["read_plink"]
diff --git a/sgkit_plink/pysnptools.py b/sgkit_plink/pysnptools.py
new file mode 100644
index 0000000..08bd079
--- /dev/null
+++ b/sgkit_plink/pysnptools.py
@@ -0,0 +1,295 @@
+"""PLINK 1.9 reader implementation"""
+from pathlib import Path
+from typing import Union
+
+import dask.array as da
+import dask.dataframe as dd
+import numpy as np
+from dask.dataframe import DataFrame
+from pysnptools.snpreader import Bed
+from xarray import Dataset
+
+from sgkit import create_genotype_call_dataset
+from sgkit.api import DIM_SAMPLE
+
+PathType = Union[str, Path]
+
+FAM_FIELDS = [
+    ("family_id", str, "U"),
+    ("member_id", str, "U"),
+    ("paternal_id", str, "U"),
+    ("maternal_id", str, "U"),
+    ("sex", str, "int8"),
+    ("phenotype", str, "int8"),
+]
+FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
+FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])
+
+BIM_FIELDS = [
+    ("contig", str, "U"),
+    ("variant_id", str, "U"),
+    ("cm_pos", "float32", "float32"),
+    ("pos", "int32", "int32"),
+    ("a1", str, "U"),
+    ("a2", str, "U"),
+]
+BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
+BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
+
+
+class BedReader(object):
+    def __init__(self, path, shape, dtype=np.int8, count_A1=True):
+        # n variants (sid = SNP id), n samples (iid = Individual id)
+        n_sid, n_iid = shape
+        # Initialize Bed with empty arrays for axis data, otherwise it will
+        # load the bim/map/fam files entirely into memory (it does not do out-of-core for those)
+        self.bed = Bed(
+            str(path),
+            count_A1=count_A1,
+            # Array (n_sample, 2) w/ FID and IID
+            iid=np.empty((n_iid, 2), dtype="str"),
+            # SNP id array (n_variants)
+            sid=np.empty((n_sid,), dtype="str"),
+            # Contig and positions array (n_variants, 3)
+            pos=np.empty((n_sid, 3), dtype="int"),
+        )
+        self.shape = (n_sid, n_iid, 2)
+        self.dtype = dtype
+        self.ndim = 3
+
+    def __getitem__(self, idx):
+        if not isinstance(idx, tuple):
+            raise IndexError(  # pragma: no cover
+                f"Indexer must be tuple (received {type(idx)})"
+            )
+        if len(idx) != self.ndim:
+            raise IndexError(  # pragma: no cover
+                f"Indexer must be {self.ndim}-item tuple (received {len(idx)} slices)"
+            )
+        # Slice using reversal of first two slices --
+        # pysnptools uses sample x variant orientation
+        arr = self.bed[idx[1::-1]].read(dtype=np.float32, view_ok=False).val.T
+        # Convert missing calls (NaN) to -1
+        arr = np.nan_to_num(arr, nan=-1.0)
+        arr = arr.astype(self.dtype)
+        # Add a ploidy dimension, so allele counts of 0, 1, 2 correspond to 00, 10, 11
+        arr = np.stack(
+            [
+                np.where(arr < 0, -1, np.where(arr == 0, 0, 1)),
+                np.where(arr < 0, -1, np.where(arr == 2, 1, 0)),
+            ],
+            axis=-1,
+        )
+
+        # Apply final slice to 3D result
+        return arr[:, :, idx[-1]]
+
+    def close(self):
+        # This is not actually crucial since a Bed instance with no
+        # in-memory bim/map/fam data is essentially just a file pointer,
+        # but this will still be problematic if an array is created
+        # from the same PLINK dataset many times
+        self.bed._close_bed()  # pragma: no cover
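+
+
+# Illustration (not executed) of the call encoding produced by
+# BedReader.__getitem__: a 2 x 2 (variant x sample) block with raw PLINK
+# allele counts [[0, 1], [2, nan]] becomes
+# [[[0, 0], [1, 0]], [[1, 1], [-1, -1]]], i.e. counts 0/1/2 map to the
+# diploid calls 00/10/11 and missing values map to -1/-1.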
+
+
+def _to_dict(df, dtype=None):
+    return {
+        c: df[c].to_dask_array(lengths=True).astype(dtype[c] if dtype else df[c].dtype)
+        for c in df
+    }
+
+
+def read_fam(path: PathType, sep: str = " ") -> DataFrame:
+    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
+    names = [f[0] for f in FAM_FIELDS]
+    df = dd.read_csv(str(path) + ".fam", sep=sep, names=names, dtype=FAM_DF_DTYPE)
+
+    def coerce_code(v, codes):
+        # Set non-ints and unexpected codes to missing (-1)
+        v = dd.to_numeric(v, errors="coerce")
+        v = v.where(v.isin(codes), np.nan)
+        return v.fillna(-1).astype("int8")
+
+    df["paternal_id"] = df["paternal_id"].where(df["paternal_id"] != "0", None)
+    df["maternal_id"] = df["maternal_id"].where(df["maternal_id"] != "0", None)
+    df["sex"] = coerce_code(df["sex"], [1, 2])
+    df["phenotype"] = coerce_code(df["phenotype"], [1, 2])
+
+    return df
+
+
+def read_bim(path: PathType, sep: str = "\t") -> DataFrame:
+    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
+    names = [f[0] for f in BIM_FIELDS]
+    df = dd.read_csv(str(path) + ".bim", sep=sep, names=names, dtype=BIM_DF_DTYPE)
+    df["contig"] = df["contig"].where(df["contig"] != "0", None)
+    return df
+
+
+def encode_array(x):
+    """Encode array values as integers indexing unique values
+
+    The codes created for each unique element in the array correspond
+    to order of appearance, not the natural sort order for the array
+    dtype.
+
+    Examples
+    --------
+
+    >>> encode_array(['c', 'a', 'a', 'b'])
+    (array([0, 1, 1, 2]), array(['c', 'a', 'b'], dtype='<U1'))
+    """
+    # np.unique returns `names` in sorted order, so remap the codes in
+    # `inverse` to rank values by order of first appearance instead
+    names, index, inverse = np.unique(x, return_index=True, return_inverse=True)
+    sorter = np.argsort(index)
+    rank = np.empty_like(sorter)
+    rank[sorter] = np.arange(len(sorter))
+    return rank[inverse], names[sorter]
+
+
+def read_plink(
+    path: PathType,
+    chunks: Union[str, int, tuple] = "auto",
+    fam_sep: str = " ",
+    bim_sep: str = "\t",
+    bim_int_contig: bool = False,
+    count_a1: bool = True,
+    lock: bool = False,
+    persist: bool = True,
+) -> Dataset:
+    """Read PLINK dataset
+
+    Loads a single PLINK dataset as dask arrays within a Dataset
+    from bed, bim, and fam files.
+
+    Parameters
+    ----------
+    path : PathType
+        Path to PLINK file set.
+        This should not include a suffix, i.e. if the files are
+        at `data.{bed,fam,bim}` then only 'data' should be
+        provided (suffixes are added internally).
+    chunks : Union[str, int, tuple], optional
+        Chunk size for genotype (i.e. `.bed`) data, by default "auto"
+    fam_sep : str, optional
+        Delimiter for `.fam` file, by default " "
+    bim_sep : str, optional
+        Delimiter for `.bim` file, by default "\t"
+    bim_int_contig : bool, optional
+        Whether or not the contig/chromosome name in the `.bim`
+        file should be interpreted as an integer, by default False.
+        If False, then the `variant/contig` field in the resulting
+        dataset will contain the indexes of corresponding strings
+        encountered in the first `.bim` field.
+    count_a1 : bool, optional
+        Whether to count A1 alleles (True) or A2 alleles (False),
+        by default True. Typically A1 is the minor allele
+        and should be counted instead of A2. This is not enforced
+        by PLINK though and it is up to the data generating process
+        to ensure that A1 is in fact an alternate/minor/effect
+        allele. See https://www.cog-genomics.org/plink/1.9/formats
+        for more details.
+    lock : bool, optional
+        Whether or not to synchronize concurrent reads of `.bed`
+        file blocks, by default False. This is passed through to
+        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
+    persist : bool, optional
+        Whether or not to persist `.fam` and `.bim` information in
+        memory, by default True. This is an important performance
+        consideration as the plain text files for this data will
+        be read multiple times when False. This can lead to load
+        times that are upwards of 10x slower.
+
+    Returns
+    -------
+    Dataset
+        A dataset containing genotypes as 3 dimensional calls along with
+        all accompanying pedigree and variant information. The content
+        of this dataset matches that of `sgkit.create_genotype_call_dataset`
+        with all pedigree-specific fields defined as:
+        - sample/family_id: Family identifier commonly referred to as FID
+        - sample/id: Within-family identifier for sample
+        - sample/paternal_id: Within-family identifier for father of sample
+        - sample/maternal_id: Within-family identifier for mother of sample
+        - sample/sex: Sex code equal to 1 for male, 2 for female, and -1
+          for missing
+        - sample/phenotype: Phenotype code equal to 1 for control, 2 for case,
+          and -1 for missing
+
+    See https://www.cog-genomics.org/plink/1.9/formats#fam for more details.
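+
+    Examples
+    --------
+    A minimal sketch using the simulated fileset bundled with the test suite
+    (any common prefix of a `.bed`/`.bim`/`.fam` trio works the same way);
+    the resulting call array has dimensions (variants, samples, ploidy):
+
+    >>> ds = read_plink(  # doctest: +SKIP
+    ...     "sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss",
+    ...     bim_sep="\t",
+    ...     fam_sep="\t",
+    ... )
+    >>> ds["call/genotype"].shape  # doctest: +SKIP
+    (100, 10, 2)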
+    """
+
+    # Load axis data first to determine dimension sizes
+    df_fam = read_fam(path, sep=fam_sep)
+    df_bim = read_bim(path, sep=bim_sep)
+
+    if persist:
+        df_fam = df_fam.persist()
+        df_bim = df_bim.persist()
+
+    arr_fam = _to_dict(df_fam, dtype=FAM_ARRAY_DTYPE)
+    arr_bim = _to_dict(df_bim, dtype=BIM_ARRAY_DTYPE)
+
+    # Load genotyping data
+    call_genotype = da.from_array(
+        # Make sure to use asarray=False in order for masked arrays to propagate
+        BedReader(path, (len(df_bim), len(df_fam)), count_A1=count_a1),
+        chunks=chunks,
+        # Lock must be True with the multiprocessing dask scheduler
+        # to avoid pysnptools errors (the threading scheduler works without it)
+        lock=lock,
+        asarray=False,
+        name=f"pysnptools:read_plink:{path}",
+    )
+
+    # If contigs are already integers, use them as-is
+    if bim_int_contig:
+        variant_contig = arr_bim["contig"].astype("int16")
+        variant_contig_names = da.unique(variant_contig).astype(str)
+        variant_contig_names = list(variant_contig_names.compute())
+    # Otherwise create index for contig names based
+    # on order of appearance in underlying .bim file
+    else:
+        variant_contig, variant_contig_names = encode_array(arr_bim["contig"].compute())
+        variant_contig = variant_contig.astype("int16")
+        variant_contig_names = list(variant_contig_names)
+
+    variant_position = arr_bim["pos"]
+    a1 = arr_bim["a1"].astype("str")
+    a2 = arr_bim["a2"].astype("str")
+
+    # Note: column_stack not implemented in Dask, must use [v|h]stack
+    variant_alleles = da.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
+    # TODO: Why use bytes for this instead of string?
+    variant_alleles = variant_alleles.astype("S")
+    variant_id = arr_bim["variant_id"]
+
+    sample_id = arr_fam["member_id"]
+
+    ds = create_genotype_call_dataset(
+        variant_contig_names=variant_contig_names,
+        variant_contig=variant_contig,
+        variant_position=variant_position,
+        variant_alleles=variant_alleles,
+        sample_id=sample_id,
+        call_genotype=call_genotype,
+        variant_id=variant_id,
+    )
+
+    # Assign PLINK-specific pedigree fields
+    ds = ds.assign(
+        **{f"sample/{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam if f != "member_id"}
+    )
+    return ds
diff --git a/sgkit_plink/tests/__init__.py b/sgkit_plink/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bed b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bed
new file mode 100644
index 0000000..7cd4487
Binary files /dev/null and b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bed differ
diff --git a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim
new file mode 100644
index 0000000..01325a9
--- /dev/null
+++ b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim
@@ -0,0 +1,100 @@
+1	1:1:A:C	0.0	1	C	A
+1	1:2:A:C	0.0	2	C	A
+1	1:3:A:C	0.0	3	C	A
+1	1:4:A:C	0.0	4	C	A
+1	1:5:A:C	0.0	5	C	A
+1	1:6:A:C	0.0	6	C	A
+1	1:7:A:C	0.0	7	C	A
+1	1:8:A:C	0.0	8	C	A
+1	1:9:A:C	0.0	9	C	A
+1	1:10:A:C	0.0	10	C	A
+1	1:11:A:C	0.0	11	C	A
+1	1:12:A:C	0.0	12	C	A
+1	1:13:A:C	0.0	13	C	A
+1	1:14:A:C	0.0	14	C	A
+1	1:15:A:C	0.0	15	C	A
+1	1:16:A:C	0.0	16	C	A
+1	1:17:A:C	0.0	17	C	A
+1	1:18:A:C	0.0	18	C	A
+1	1:19:A:C	0.0	19	C	A
+1	1:20:A:C	0.0	20	C	A
+1	1:21:A:C	0.0	21	C	A
+1	1:22:A:C	0.0	22	C	A
+1	1:23:A:C	0.0	23	C	A
+1	1:24:A:C	0.0	24	C	A
+1	1:25:A:C	0.0	25	C	A
+1	1:26:A:C	0.0	26	C	A
+1	1:27:A:C	0.0	27	C	A
+1	1:28:A:C	0.0	28	C	A
+1	1:29:A:C	0.0	29	C	A
+1	1:30:A:C	0.0	30	C	A
+1	1:31:A:C	0.0	31	C	A
+1	1:32:A:C	0.0	32	C	A
+1	1:33:A:C	0.0	33	C	A
+1	1:34:A:C	0.0	34	C	A
+1	1:35:A:C	0.0	35	C	A
+1	1:36:A:C	0.0	36	C	A
+1	1:37:A:C	0.0	37	C	A
+1	1:38:A:C	0.0	38	C	A
+1	1:39:A:C	0.0	39	C	A
+1	1:40:A:C	0.0	40	C	A
+1	1:41:A:C	0.0	41	C	A
+1	1:42:A:C	0.0	42	C	A
+1	1:43:A:C	0.0	43	C	A
+1	1:44:A:C	0.0	44	C	A
+1	1:45:A:C	0.0	45	C	A
+1	1:46:A:C	0.0	46	C	A
+1	1:47:A:C	0.0	47	C	A
+1	1:48:A:C	0.0	48	C	A
+1	1:49:A:C	0.0	49	C	A
+1	1:50:A:C	0.0	50	C	A
+1	1:51:A:C	0.0	51	C	A
+1	1:52:A:C	0.0	52	C	A
+1	1:53:A:C	0.0	53	C	A
+1	1:54:A:C	0.0	54	C	A
+1	1:55:A:C	0.0	55	C	A
+1	1:56:A:C	0.0	56	C	A
+1	1:57:A:C	0.0	57	C	A
+1	1:58:A:C	0.0	58	C	A
+1	1:59:A:C	0.0	59	C	A
+1	1:60:A:C	0.0	60	C	A
+1	1:61:A:C	0.0	61	C	A
+1	1:62:A:C	0.0	62	C	A
+1	1:63:A:C	0.0	63	C	A
+1	1:64:A:C	0.0	64	C	A
+1	1:65:A:C	0.0	65	C	A
+1	1:66:A:C	0.0	66	C	A
+1	1:67:A:C	0.0	67	C	A
+1	1:68:A:C	0.0	68	C	A
+1	1:69:A:C	0.0	69	C	A
+1	1:70:A:C	0.0	70	C	A
+1	1:71:A:C	0.0	71	C	A
+1	1:72:A:C	0.0	72	C	A
+1	1:73:A:C	0.0	73	C	A
+1	1:74:A:C	0.0	74	C	A
+1	1:75:A:C	0.0	75	C	A
+1	1:76:A:C	0.0	76	C	A
+1	1:77:A:C	0.0	77	C	A
+1	1:78:A:C	0.0	78	C	A
+1	1:79:A:C	0.0	79	C	A
+1	1:80:A:C	0.0	80	C	A
+1	1:81:A:C	0.0	81	C	A
+1	1:82:A:C	0.0	82	C	A
+1	1:83:A:C	0.0	83	C	A
+1	1:84:A:C	0.0	84	C	A
+1	1:85:A:C	0.0	85	C	A
+1	1:86:A:C	0.0	86	C	A
+1	1:87:A:C	0.0	87	C	A
+1	1:88:A:C	0.0	88	C	A
+1	1:89:A:C	0.0	89	C	A
+1	1:90:A:C	0.0	90	C	A
+1	1:91:A:C	0.0	91	C	A
+1	1:92:A:C	0.0	92	C	A
+1	1:93:A:C	0.0	93	C	A
+1	1:94:A:C	0.0	94	C	A
+1	1:95:A:C	0.0	95	C	A
+1	1:96:A:C	0.0	96	C	A
+1	1:97:A:C	0.0	97	C	A
+1	1:98:A:C	0.0	98	C	A
+1	1:99:A:C	0.0	99	C	A
+1	1:100:A:C	0.0	100	C	A
diff --git a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam
new file mode 100644
index 0000000..e0af7f4
--- /dev/null
+++ b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam
@@ -0,0 +1,10 @@
+0	0	0	0	0	NA
+0	1	0	0	0	NA
+0	2	0	0	0	NA
+0	3	0	0	0	NA
+0	4	0	0	0	NA
+0	5	0	0	0	NA
+0	6	0	0	0	NA
+0	7	0	0	0	NA
+0	8	0	0	0	NA
+0	9	0	0	0	NA
diff --git a/sgkit_plink/tests/test_pysnptools.py b/sgkit_plink/tests/test_pysnptools.py
new file mode 100644
index 0000000..7609587
--- /dev/null
+++ b/sgkit_plink/tests/test_pysnptools.py
@@ -0,0 +1,112 @@
+import numpy as np
+import pytest
+from sgkit_plink import pysnptools
+
+example_dataset_1 = "plink_sim_10s_100v_10pmiss"
+
+
+@pytest.fixture(params=[dict()])
+def ds1(shared_datadir, request):
+    path = shared_datadir / example_dataset_1
+    return pysnptools.read_plink(path, bim_sep="\t", fam_sep="\t", **request.param)
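+
+
+# Note: the `ds1` fixture is parametrized indirectly, so individual tests
+# can opt into non-default reader arguments with, e.g.:
+#   @pytest.mark.parametrize("ds1", [dict(bim_int_contig=True)], indirect=True)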
+
+
+def test_read_slicing(ds1):
+    gt = ds1["call/genotype"]
+    shape = gt.shape
+    assert gt[:3].shape == (3,) + shape[1:]
+    assert gt[:, :3].shape == shape[:1] + (3,) + shape[2:]
+    assert gt[:3, :5].shape == (3, 5) + shape[2:]
+    assert gt[:3, :5, :1].shape == (3, 5, 1)
+
+
+def test_encode_array():
+    def check(x, values, names):
+        v, n = pysnptools.encode_array(x)
+        np.testing.assert_equal(v, values)
+        np.testing.assert_equal(n, names)
+
+    check([], [], [])
+    check(["a"], [0], ["a"])
+    check(["a", "b"], [0, 1], ["a", "b"])
+    check(["b", "a"], [0, 1], ["b", "a"])
+    check(["a", "b", "b"], [0, 1, 1], ["a", "b"])
+    check(["b", "b", "a"], [0, 0, 1], ["b", "a"])
+    check(["b", "b", "a", "a"], [0, 0, 1, 1], ["b", "a"])
+    check(["c", "a", "a", "b"], [0, 1, 1, 2], ["c", "a", "b"])
+    check(["b", "b", "c", "c", "c", "a", "a"], [0, 0, 1, 1, 1, 2, 2], ["b", "c", "a"])
+    check(["b", "c", "b", "c", "a"], [0, 1, 0, 1, 2], ["b", "c", "a"])
+    check([2, 2, 1, 3, 1, 5, 5, 1], [0, 0, 1, 2, 1, 3, 3, 1], [2.0, 1.0, 3.0, 5.0])
+    check(
+        [2.0, 2.0, 1.0, 3.0, 1.0, 5.0, 5.0, 1.0],
+        [0, 0, 1, 2, 1, 3, 3, 1],
+        [2.0, 1.0, 3.0, 5.0],
+    )
+
+
+@pytest.mark.parametrize("ds1", [dict(bim_int_contig=True)], indirect=True)
+def test_read_int_contig(ds1):
+    # Test contig parsed as int (the value is always "1" in the .bim for ds1)
+    assert np.all(ds1["variant/contig"].values == 1)
+    assert ds1.attrs["contigs"] == ["1"]
+
+
+@pytest.mark.parametrize("ds1", [dict(bim_int_contig=False)], indirect=True)
+def test_read_str_contig(ds1):
+    # Test contig indexed as string (the value is always "1" in the .bim for ds1)
+    assert np.all(ds1["variant/contig"].values == 0)
+    assert ds1.attrs["contigs"] == ["1"]
+
+
+def test_read_call_values(ds1):
+    # Validate a few randomly selected individual calls
+    # (spanning all possible states for a call)
+    idx = np.array(
+        [
+            [50, 7],
+            [81, 8],
+            [45, 2],
+            [36, 8],
+            [24, 2],
+            [92, 9],
+            [26, 2],
+            [81, 0],
+            [31, 8],
+            [4, 9],
+        ]
+    )
+    expected = np.array(
+        [
+            [1, 0],
+            [1, 0],
+            [1, 1],
+            [1, 1],
+            [-1, -1],
+            [0, 0],
+            [0, 0],
+            [1, 1],
+            [0, 0],
+            [0, 0],
+        ]
+    )
+    gt = ds1["call/genotype"].values
+    actual = gt[tuple(idx.T)]
+    np.testing.assert_equal(actual, expected)
+
+
+def test_read_stat_call_rate(ds1):
+    # Validate call rate for each sample
+    sample_call_rates = (
+        (ds1["call/genotype"] >= 0).max(dim="ploidy").mean(dim="variants").values
+    )
+    np.testing.assert_equal(
+        sample_call_rates, [0.95, 0.9, 0.91, 0.87, 0.86, 0.83, 0.86, 0.87, 0.92, 0.92]
+    )
+
+
+def test_read_stat_alt_alleles(ds1):
+    # Validate alt allele sum for each sample
+    n_alt_alleles = (
+        ds1["call/genotype"].clip(0, 2).sum(dim="ploidy").sum(dim="variants").values
+    )
+    np.testing.assert_equal(n_alt_alleles, [102, 95, 98, 94, 88, 91, 90, 98, 96, 103])