sgkit-dev · eric-czech · Jul 7, 2020 · Jul 1, 2020 · Jul 2, 2020 · Jul 2, 2020
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,24 @@
+name: Check code standards and run tests
+
+on: [push, pull_request]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt -r requirements-dev.txt
+    - name: Run pre-commit
+      uses: pre-commit/[email protected]
+    - name: Test with pytest and coverage
+      run: |
+        pytest -v --cov=sgkit_plink --cov-report term-missing
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,27 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.5.0
+    hooks:
+      - id: check-merge-conflict
+      - id: debug-statements
+      - id: mixed-line-ending
+      - id: check-case-conflict
+      - id: check-yaml
+  - repo: https://github.com/asottile/seed-isort-config
+    rev: v2.2.0
+    hooks:
+      - id: seed-isort-config
+  - repo: https://github.com/timothycrosley/isort
+    rev: 4.3.21-2  # pick the isort version you'd like to use from https://github.com/timothycrosley/isort/releases
+    hooks:
+      - id: isort
+  - repo: https://github.com/python/black
+    rev: 19.10b0
+    hooks:
+      - id: black
+        language_version: python3
+  - repo: https://gitlab.com/pycqa/flake8
+    rev: 3.7.9
+    hooks:
+      - id: flake8
+        language_version: python3
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,8 @@
+black
+flake8
+isort
+mypy
+pre-commit
+pytest
+pytest-cov
+pytest-datadir
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+xarray
+pysnptools>=0.4.19
+-e git://github.com/pystatgen/sgkit.git#egg=sgkit
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,30 @@
+[coverage:report]
+fail_under = 100
+
+[flake8]
+ignore =
+    # whitespace before ':' - doesn't work well with black
+    E203
+    E402
+    # line too long - let black worry about that
+    E501
+    # do not assign a lambda expression, use a def
+    E731
+    # line break before binary operator
+    W503
+
+[isort]
+default_section = THIRDPARTY
+known_first_party = sgkit
+known_third_party = dask,numpy,pysnptools,pytest,xarray
+multi_line_output = 3
+include_trailing_comma = True
+force_grid_wrap = 0
+use_parentheses = True
+line_length = 88
+
+[mypy-numpy.*]
+ignore_missing_imports = True
+
+[mypy-sgkit_plink.tests.*]
+disallow_untyped_defs = False
diff --git a/sgkit_plink/__init__.py b/sgkit_plink/__init__.py
@@ -0,0 +1,3 @@
+from .pysnptools import read_plink  # noqa: F401
+
+__all__ = ["read_plink"]
diff --git a/sgkit_plink/pysnptools.py b/sgkit_plink/pysnptools.py
@@ -0,0 +1,295 @@
+"""PLINK 1.9 reader implementation"""
+from pathlib import Path
+from typing import Union
+
+import dask.array as da
+import dask.dataframe as dd
+import numpy as np
+from dask.dataframe import DataFrame
+from pysnptools.snpreader import Bed
+from xarray import Dataset
+
+from sgkit import create_genotype_call_dataset
+from sgkit.api import DIM_SAMPLE
+
+PathType = Union[str, Path]
+
+FAM_FIELDS = [
+    ("family_id", str, "U"),
+    ("member_id", str, "U"),
+    ("paternal_id", str, "U"),
+    ("maternal_id", str, "U"),
+    ("sex", str, "int8"),
+    ("phenotype", str, "int8"),
+]
+FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
+FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])
+
+BIM_FIELDS = [
+    ("contig", str, "U"),
+    ("variant_id", str, "U"),
+    ("cm_pos", "float32", "float32"),
+    ("pos", "int32", "int32"),
+    ("a1", str, "U"),
+    ("a2", str, "U"),
+]
+BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
+BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
+
+
+class BedReader(object):
+    def __init__(self, path, shape, dtype=np.int8, count_A1=True):
+        # n variants (sid = SNP id), n samples (iid = Individual id)
+        n_sid, n_iid = shape
+        # Initialize Bed with empty arrays for axis data, otherwise it will
+        # load the bim/map/fam files entirely into memory (it does not do out-of-core for those)
+        self.bed = Bed(
+            str(path),
+            count_A1=count_A1,
+            # Array (n_sample, 2) w/ FID and IID
+            iid=np.empty((n_iid, 2), dtype="str"),
+            # SNP id array (n_variants)
+            sid=np.empty((n_sid,), dtype="str"),
+            # Contig and positions array (n_variants, 3)
+            pos=np.empty((n_sid, 3), dtype="int"),
+        )
+        self.shape = (n_sid, n_iid, 2)
+        self.dtype = dtype
+        self.ndim = 3
+
+    def __getitem__(self, idx):
+        if not isinstance(idx, tuple):
+            raise IndexError(  # pragma: no cover
+                f"Indexer must be tuple (received {type(idx)})"
+            )
+        if len(idx) != self.ndim:
+            raise IndexError(  # pragma: no cover
+                f"Indexer must be two-item tuple (received {len(idx)} slices)"
+            )
+        # Slice using reversal of first two slices --
+        # pysnptools uses sample x variant orientation
+        arr = self.bed[idx[1::-1]].read(dtype=np.float32, view_ok=False).val.T
+        # Convert missing calls as nan to -1
+        arr = np.nan_to_num(arr, nan=-1.0)
+        arr = arr.astype(self.dtype)
+        # Add a ploidy dimension, so allele counts of 0, 1, 2 correspond to 00, 10, 11
+        arr = np.stack(
+            [
+                np.where(arr < 0, -1, np.where(arr == 0, 0, 1)),
+                np.where(arr < 0, -1, np.where(arr == 2, 1, 0)),
+            ],
+            axis=-1,
+        )
+
+        # Apply final slice to 3D result
+        return arr[:, :, idx[-1]]
+
+    def close(self):
+        # This is not actually crucial since a Bed instance with no
+        # in-memory bim/map/fam data is essentially just a file pointer
+        # but this will still be problematic if the an array is created
+        # from the same PLINK dataset many times
+        self.bed._close_bed()  # pragma: no cover
+
+
+def _to_dict(df, dtype=None):
+    return {
+        c: df[c].to_dask_array(lengths=True).astype(dtype[c] if dtype else df[c].dtype)
+        for c in df
+    }
+
+
+def read_fam(path: PathType, sep: str = " ") -> DataFrame:
+    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
+    names = [f[0] for f in FAM_FIELDS]
+    df = dd.read_csv(str(path) + ".fam", sep=sep, names=names, dtype=FAM_DF_DTYPE)
+
+    def coerce_code(v, codes):
+        # Set non-ints and unexpected codes to missing (-1)
+        v = dd.to_numeric(v, errors="coerce")
+        v = v.where(v.isin(codes), np.nan)
+        return v.fillna(-1).astype("int8")
+
+    df["paternal_id"] = df["paternal_id"].where(df["paternal_id"] != "0", None)
+    df["maternal_id"] = df["maternal_id"].where(df["maternal_id"] != "0", None)
+    df["sex"] = coerce_code(df["sex"], [1, 2])
+    df["phenotype"] = coerce_code(df["phenotype"], [1, 2])
+
+    return df
+
+
+def read_bim(path: PathType, sep: str = "\t") -> DataFrame:
+    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
+    names = [f[0] for f in BIM_FIELDS]
+    df = dd.read_csv(str(path) + ".bim", sep=sep, names=names, dtype=BIM_DF_DTYPE)
+    df["contig"] = df["contig"].where(df["contig"] != "0", None)
+    return df
+
+
+def encode_array(x):
+    """Encode array values as integers indexing unique values
+
+    The codes created for each unique element in the array correspond
+    to order of appearance, not the natural sort order for the array
+    dtype.
+
+    Examples
+    --------
+
+    >>> encode_array(['c', 'a', 'a', 'b'])
+    (array([0, 1, 1, 2]), array(['c', 'a', 'b'], dtype='<U1'))
+
+    Parameters
+    ----------
+    x : (M,) array-like
+        Array of elements to encode of any type
+
+    Returns
+    -------
+    indexes : (M,) ndarray
+        Encoded values as integer indices
+    values : ndarray
+        Unique values in original array in order of appearance
+    """
+    # argsort not implemented in dask: https://github.com/dask/dask/issues/4368
+    names, index, inverse = np.unique(x, return_index=True, return_inverse=True)
+    index = np.argsort(index)
+    rank = np.empty_like(index)
+    rank[index] = np.arange(len(index))
+    return rank[inverse], names[index]
+
+
+def read_plink(
+    path: PathType,
+    chunks: Union[str, int, tuple] = "auto",
+    fam_sep: str = " ",
+    bim_sep: str = "\t",
+    bim_int_contig: bool = False,
+    count_a1: bool = True,
+    lock: bool = False,
+    persist: bool = True,
+) -> Dataset:
+    """Read PLINK dataset
+
+    Loads a single PLINK dataset as dask arrays within a Dataset
+    from bed, bim, and fam files.
+
+    Parameters
+    ----------
+    path : PathType
+        Path to PLINK file set.
+        This should not include a suffix, i.e. if the files are
+        at `data.{bed,fam,bim}` then only 'data' should be
+        provided (suffixes are added internally).
+    chunks : Union[str, int, tuple], optional
+        Chunk size for genotype (i.e. `.bed`) data, by default "auto"
+    fam_sep : str, optional
+        Delimiter for `.fam` file, by default " "
+    bim_sep : str, optional
+        Delimiter for `.bim` file, by default "\t"
+    bim_int_contig : bool, optional
+        Whether or not the contig/chromosome name in the `.bim`
+        file should be interpreted as an integer, by default False.
+        If False, then the `variant/contig` field in the resulting
+        dataset will contain the indexes of corresponding strings
+        encountered in the first `.bim` field.
+    count_a1 : bool, optional
+        Whether or not allele counts should be for A1 or A2,
+        by default True. Typically A1 is the minor allele
+        and should be counted instead of A2.  This is not enforced
+        by PLINK though and it is up to the data generating process
+        to ensure that A1 is in fact an alternate/minor/effect
+        allele. See https://www.cog-genomics.org/plink/1.9/formats
+        for more details.
+    lock : bool, optional
+        Whether or not to synchronize concurrent reads of `.bed`
+        file blocks, by default False. This is passed through to
+        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
+    persist : bool, optional
+        Whether or not to persist `.fam` and `.bim` information in
+        memory, by default True.  This is an important performance
+        consideration as the plain text files for this data will
+        be read multiple times when False. This can lead to load
+        times that are upwards of 10x slower.
+
+    Returns
+    -------
+    Dataset
+        A dataset containing genotypes as 3 dimensional calls along with
+        all accompanying pedigree and variant information. The content
+        of this dataset matches that of `sgkit.create_genotype_call_dataset`
+        with all pedigree-specific fields defined as:
+            - sample/family_id: Family identifier commonly referred to as FID
+            - sample/id: Within-family identifier for sample
+            - sample/paternal_id: Within-family identifier for father of sample
+            - sample/maternal_id: Within-family identifier for mother of sample
+            - sample/sex: Sex code equal to 1 for male, 2 for female, and -1
+                for missing
+            - sample/phenotype: Phenotype code equal to 1 for control, 2 for case,
+                and -1 for missing
+
+        See https://www.cog-genomics.org/plink/1.9/formats#fam for more details.
+    """
+
+    # Load axis data first to determine dimension sizes
+    df_fam = read_fam(path, sep=fam_sep)
+    df_bim = read_bim(path, sep=bim_sep)
+
+    if persist:
+        df_fam = df_fam.persist()
+        df_bim = df_bim.persist()
+
+    arr_fam = _to_dict(df_fam, dtype=FAM_ARRAY_DTYPE)
+    arr_bim = _to_dict(df_bim, dtype=BIM_ARRAY_DTYPE)
+
+    # Load genotyping data
+    call_genotype = da.from_array(
+        # Make sure to use asarray=False in order for masked arrays to propagate
+        BedReader(path, (len(df_bim), len(df_fam)), count_A1=count_a1),
+        chunks=chunks,
+        # Lock must be true with multiprocessing dask scheduler
+        # to not get pysnptools errors (it works w/ threading backend though)
+        lock=lock,
+        asarray=False,
+        name=f"pysnptools:read_plink:{path}",
+    )
+
+    # If contigs are already integers, use them as-is
+    if bim_int_contig:
+        variant_contig = arr_bim["contig"].astype("int16")
+        variant_contig_names = da.unique(variant_contig).astype(str)
+        variant_contig_names = list(variant_contig_names.compute())
+    # Otherwise create index for contig names based
+    # on order of appearance in underlying .bim file
+    else:
+        variant_contig, variant_contig_names = encode_array(arr_bim["contig"].compute())
+        variant_contig = variant_contig.astype("int16")
+        variant_contig_names = list(variant_contig_names)
+
+    variant_position = arr_bim["pos"]
+    a1 = arr_bim["a1"].astype("str")
+    a2 = arr_bim["a2"].astype("str")
+
+    # Note: column_stack not implemented in Dask, must use [v|h]stack
+    variant_alleles = da.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
+    # TODO: Why use bytes for this instead of string?
+    variant_alleles = variant_alleles.astype("S")
+    variant_id = arr_bim["variant_id"]
+
+    sample_id = arr_fam["member_id"]
+
+    ds = create_genotype_call_dataset(
+        variant_contig_names=variant_contig_names,
+        variant_contig=variant_contig,
+        variant_position=variant_position,
+        variant_alleles=variant_alleles,
+        sample_id=sample_id,
+        call_genotype=call_genotype,
+        variant_id=variant_id,
+    )
+
+    # Assign PLINK-specific pedigree fields
+    ds = ds.assign(
+        **{f"sample/{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam if f != "member_id"}
+    )
+    return ds
diff --git a/sgkit_plink/tests/__init__.py b/sgkit_plink/tests/__init__.py
diff --git a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bed b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bed
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .pysnptools import read_plink # noqa: F401

		__all__ = ["read_plink"]