Skip to content
This repository was archived by the owner on Oct 15, 2020. It is now read-only.

Pysnptools reader implementation #1

Merged
merged 15 commits into from
Jul 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: Check code standards and run tests

on: [push, pull_request]

jobs:
build:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: '3.8'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt -r requirements-dev.txt
- name: Run pre-commit
uses: pre-commit/[email protected]
- name: Test with pytest and coverage
run: |
pytest -v --cov=sgkit_plink --cov-report term-missing
27 changes: 27 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.5.0
hooks:
- id: check-merge-conflict
- id: debug-statements
- id: mixed-line-ending
- id: check-case-conflict
- id: check-yaml
- repo: https://github.com/asottile/seed-isort-config
rev: v2.2.0
hooks:
- id: seed-isort-config
- repo: https://github.com/timothycrosley/isort
rev: 4.3.21-2 # pick the isort version you'd like to use from https://github.com/timothycrosley/isort/releases
hooks:
- id: isort
- repo: https://github.com/python/black
rev: 19.10b0
hooks:
- id: black
language_version: python3
- repo: https://gitlab.com/pycqa/flake8
rev: 3.7.9
hooks:
- id: flake8
language_version: python3
8 changes: 8 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
black
flake8
isort
mypy
pre-commit
pytest
pytest-cov
pytest-datadir
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
numpy
xarray
pysnptools>=0.4.19
-e git://github.com/pystatgen/sgkit.git#egg=sgkit
30 changes: 30 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[coverage:report]
fail_under = 100

[flake8]
ignore =
# whitespace before ':' - doesn't work well with black
E203
E402
# line too long - let black worry about that
E501
# do not assign a lambda expression, use a def
E731
# line break before binary operator
W503

[isort]
default_section = THIRDPARTY
known_first_party = sgkit
known_third_party = dask,numpy,pysnptools,pytest,xarray
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
use_parentheses = True
line_length = 88

[mypy-numpy.*]
ignore_missing_imports = True

[mypy-sgkit_plink.tests.*]
disallow_untyped_defs = False
3 changes: 3 additions & 0 deletions sgkit_plink/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .pysnptools import read_plink # noqa: F401

__all__ = ["read_plink"]
295 changes: 295 additions & 0 deletions sgkit_plink/pysnptools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,295 @@
"""PLINK 1.9 reader implementation"""
from pathlib import Path
from typing import Union

import dask.array as da
import dask.dataframe as dd
import numpy as np
from dask.dataframe import DataFrame
from pysnptools.snpreader import Bed
from xarray import Dataset

from sgkit import create_genotype_call_dataset
from sgkit.api import DIM_SAMPLE

PathType = Union[str, Path]

FAM_FIELDS = [
("family_id", str, "U"),
("member_id", str, "U"),
("paternal_id", str, "U"),
("maternal_id", str, "U"),
("sex", str, "int8"),
("phenotype", str, "int8"),
]
FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])

BIM_FIELDS = [
("contig", str, "U"),
("variant_id", str, "U"),
("cm_pos", "float32", "float32"),
("pos", "int32", "int32"),
("a1", str, "U"),
("a2", str, "U"),
]
BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])


class BedReader(object):
def __init__(self, path, shape, dtype=np.int8, count_A1=True):
# n variants (sid = SNP id), n samples (iid = Individual id)
n_sid, n_iid = shape
# Initialize Bed with empty arrays for axis data, otherwise it will
# load the bim/map/fam files entirely into memory (it does not do out-of-core for those)
self.bed = Bed(
str(path),
count_A1=count_A1,
# Array (n_sample, 2) w/ FID and IID
iid=np.empty((n_iid, 2), dtype="str"),
# SNP id array (n_variants)
sid=np.empty((n_sid,), dtype="str"),
# Contig and positions array (n_variants, 3)
pos=np.empty((n_sid, 3), dtype="int"),
)
self.shape = (n_sid, n_iid, 2)
self.dtype = dtype
self.ndim = 3

def __getitem__(self, idx):
if not isinstance(idx, tuple):
raise IndexError( # pragma: no cover
f"Indexer must be tuple (received {type(idx)})"
)
if len(idx) != self.ndim:
raise IndexError( # pragma: no cover
f"Indexer must be two-item tuple (received {len(idx)} slices)"
)
# Slice using reversal of first two slices --
# pysnptools uses sample x variant orientation
arr = self.bed[idx[1::-1]].read(dtype=np.float32, view_ok=False).val.T
# Convert missing calls as nan to -1
arr = np.nan_to_num(arr, nan=-1.0)
arr = arr.astype(self.dtype)
# Add a ploidy dimension, so allele counts of 0, 1, 2 correspond to 00, 10, 11
arr = np.stack(
[
np.where(arr < 0, -1, np.where(arr == 0, 0, 1)),
np.where(arr < 0, -1, np.where(arr == 2, 1, 0)),
],
axis=-1,
)

# Apply final slice to 3D result
return arr[:, :, idx[-1]]

def close(self):
# This is not actually crucial since a Bed instance with no
# in-memory bim/map/fam data is essentially just a file pointer
# but this will still be problematic if the an array is created
# from the same PLINK dataset many times
self.bed._close_bed() # pragma: no cover


def _to_dict(df, dtype=None):
return {
c: df[c].to_dask_array(lengths=True).astype(dtype[c] if dtype else df[c].dtype)
for c in df
}


def read_fam(path: PathType, sep: str = " ") -> DataFrame:
# See: https://www.cog-genomics.org/plink/1.9/formats#fam
names = [f[0] for f in FAM_FIELDS]
df = dd.read_csv(str(path) + ".fam", sep=sep, names=names, dtype=FAM_DF_DTYPE)

def coerce_code(v, codes):
# Set non-ints and unexpected codes to missing (-1)
v = dd.to_numeric(v, errors="coerce")
v = v.where(v.isin(codes), np.nan)
return v.fillna(-1).astype("int8")

df["paternal_id"] = df["paternal_id"].where(df["paternal_id"] != "0", None)
df["maternal_id"] = df["maternal_id"].where(df["maternal_id"] != "0", None)
df["sex"] = coerce_code(df["sex"], [1, 2])
df["phenotype"] = coerce_code(df["phenotype"], [1, 2])

return df


def read_bim(path: PathType, sep: str = "\t") -> DataFrame:
# See: https://www.cog-genomics.org/plink/1.9/formats#bim
names = [f[0] for f in BIM_FIELDS]
df = dd.read_csv(str(path) + ".bim", sep=sep, names=names, dtype=BIM_DF_DTYPE)
df["contig"] = df["contig"].where(df["contig"] != "0", None)
return df


def encode_array(x):
"""Encode array values as integers indexing unique values

The codes created for each unique element in the array correspond
to order of appearance, not the natural sort order for the array
dtype.

Examples
--------

>>> encode_array(['c', 'a', 'a', 'b'])
(array([0, 1, 1, 2]), array(['c', 'a', 'b'], dtype='<U1'))

Parameters
----------
x : (M,) array-like
Array of elements to encode of any type

Returns
-------
indexes : (M,) ndarray
Encoded values as integer indices
values : ndarray
Unique values in original array in order of appearance
"""
# argsort not implemented in dask: https://github.com/dask/dask/issues/4368
names, index, inverse = np.unique(x, return_index=True, return_inverse=True)
index = np.argsort(index)
rank = np.empty_like(index)
rank[index] = np.arange(len(index))
return rank[inverse], names[index]


def read_plink(
path: PathType,
chunks: Union[str, int, tuple] = "auto",
fam_sep: str = " ",
bim_sep: str = "\t",
bim_int_contig: bool = False,
count_a1: bool = True,
lock: bool = False,
persist: bool = True,
) -> Dataset:
"""Read PLINK dataset

Loads a single PLINK dataset as dask arrays within a Dataset
from bed, bim, and fam files.

Parameters
----------
path : PathType
Path to PLINK file set.
This should not include a suffix, i.e. if the files are
at `data.{bed,fam,bim}` then only 'data' should be
provided (suffixes are added internally).
chunks : Union[str, int, tuple], optional
Chunk size for genotype (i.e. `.bed`) data, by default "auto"
fam_sep : str, optional
Delimiter for `.fam` file, by default " "
bim_sep : str, optional
Delimiter for `.bim` file, by default "\t"
bim_int_contig : bool, optional
Whether or not the contig/chromosome name in the `.bim`
file should be interpreted as an integer, by default False.
If False, then the `variant/contig` field in the resulting
dataset will contain the indexes of corresponding strings
encountered in the first `.bim` field.
count_a1 : bool, optional
Whether or not allele counts should be for A1 or A2,
by default True. Typically A1 is the minor allele
and should be counted instead of A2. This is not enforced
by PLINK though and it is up to the data generating process
to ensure that A1 is in fact an alternate/minor/effect
allele. See https://www.cog-genomics.org/plink/1.9/formats
for more details.
lock : bool, optional
Whether or not to synchronize concurrent reads of `.bed`
file blocks, by default False. This is passed through to
[dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
persist : bool, optional
Whether or not to persist `.fam` and `.bim` information in
memory, by default True. This is an important performance
consideration as the plain text files for this data will
be read multiple times when False. This can lead to load
times that are upwards of 10x slower.

Returns
-------
Dataset
A dataset containing genotypes as 3 dimensional calls along with
all accompanying pedigree and variant information. The content
of this dataset matches that of `sgkit.create_genotype_call_dataset`
with all pedigree-specific fields defined as:
- sample/family_id: Family identifier commonly referred to as FID
- sample/id: Within-family identifier for sample
- sample/paternal_id: Within-family identifier for father of sample
- sample/maternal_id: Within-family identifier for mother of sample
- sample/sex: Sex code equal to 1 for male, 2 for female, and -1
for missing
- sample/phenotype: Phenotype code equal to 1 for control, 2 for case,
and -1 for missing

See https://www.cog-genomics.org/plink/1.9/formats#fam for more details.
"""

# Load axis data first to determine dimension sizes
df_fam = read_fam(path, sep=fam_sep)
df_bim = read_bim(path, sep=bim_sep)

if persist:
df_fam = df_fam.persist()
df_bim = df_bim.persist()

arr_fam = _to_dict(df_fam, dtype=FAM_ARRAY_DTYPE)
arr_bim = _to_dict(df_bim, dtype=BIM_ARRAY_DTYPE)

# Load genotyping data
call_genotype = da.from_array(
# Make sure to use asarray=False in order for masked arrays to propagate
BedReader(path, (len(df_bim), len(df_fam)), count_A1=count_a1),
chunks=chunks,
# Lock must be true with multiprocessing dask scheduler
# to not get pysnptools errors (it works w/ threading backend though)
lock=lock,
asarray=False,
name=f"pysnptools:read_plink:{path}",
)

# If contigs are already integers, use them as-is
if bim_int_contig:
variant_contig = arr_bim["contig"].astype("int16")
variant_contig_names = da.unique(variant_contig).astype(str)
variant_contig_names = list(variant_contig_names.compute())
# Otherwise create index for contig names based
# on order of appearance in underlying .bim file
else:
variant_contig, variant_contig_names = encode_array(arr_bim["contig"].compute())
variant_contig = variant_contig.astype("int16")
variant_contig_names = list(variant_contig_names)

variant_position = arr_bim["pos"]
a1 = arr_bim["a1"].astype("str")
a2 = arr_bim["a2"].astype("str")

# Note: column_stack not implemented in Dask, must use [v|h]stack
variant_alleles = da.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
# TODO: Why use bytes for this instead of string?
variant_alleles = variant_alleles.astype("S")
variant_id = arr_bim["variant_id"]

sample_id = arr_fam["member_id"]

ds = create_genotype_call_dataset(
variant_contig_names=variant_contig_names,
variant_contig=variant_contig,
variant_position=variant_position,
variant_alleles=variant_alleles,
sample_id=sample_id,
call_genotype=call_genotype,
variant_id=variant_id,
)

# Assign PLINK-specific pedigree fields
ds = ds.assign(
**{f"sample/{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam if f != "member_id"}
)
return ds
Empty file added sgkit_plink/tests/__init__.py
Empty file.
Binary file not shown.
Loading