PCA #123
@@ -3,4 +3,5 @@ xarray
 dask[array]
 scipy
 numba
 zarr
+sklearn
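
Note on the new requirement: the canonical PyPI distribution name is `scikit-learn`; `sklearn` has historically been a passthrough package on PyPI, so the added line is likely safer written as:

+scikit-learn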
@@ -0,0 +1,169 @@
from typing import Optional, Tuple

import dask.array as da
import numpy as np
import sklearn.decomposition
from sklearn.utils.validation import check_random_state

from ..typing import ArrayLike


# https://github.com/dask/dask-ml/blob/b94c587abae3f5667eff131b0616ad8f91966e7f/dask_ml/_utils.py#L15
# Grabbing this from dask-ml to avoid declaring a dependency on dask-ml
def draw_seed(random_state, low, high=None):  # type: ignore
    return random_state.randint(low, high)


class GenotypePCA(sklearn.decomposition.PCA):  # type: ignore
    """

Review comment: A description here would be good, possibly with some appeal to the relationship with scikit-allel, given that there is a "Differences from scikit-allel" section.

    Parameters
    ----------
    copy : boolean, optional, default True
        ignored
    ploidy : int, optional, default 2
        The ploidy of the samples. Assumed to be 2 for diploid samples.
    n_components : int, optional, default 10
        The estimated number of components.

    Attributes
    ----------
    components_ : ndarray, shape (n_components, n_variants)
        Principal axes in feature space.
    explained_variance_ : ndarray, shape (n_components,)
        Amount of variance explained by each of the selected components.
    explained_variance_ratio_ : ndarray, shape (n_components,)
        Fraction of the total variance explained by each of the selected
        components.

    Differences from scikit-allel
    -----------------------------
    * The scalers have been separated out from the PCAs to conform with
      SKLearn Pipelines - https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
    * Uses Dask under the hood instead of numpy
    * svd_solver : 'randomized' uses ``dask.linalg.svd_compressed``,
      'full' uses ``dask.linalg.svd``, 'arpack' is not valid.
    * iterated_power : defaults to ``0``, the default for
      ``dask.linalg.svd_compressed``.

    Examples
    --------
    >>> from sgkit.stats.preprocessing import PattersonScaler
    >>> from sgkit.stats.decomposition import GenotypePCA
    >>> import dask.array as da

    >>> # Let's generate some random diploid genotype data
    >>> # with 30000 variants and 67 samples
    >>> n_variants = 30000
    >>> n_samples = 67
    >>> genotypes = da.random.choice(3, n_variants * n_samples)
    >>> genotypes = genotypes.reshape(n_variants, n_samples)

    >>> scaler = PattersonScaler()
    >>> scaled_genotypes = scaler.fit_transform(genotypes)
    >>> # If you want to deal with the scaled values directly:
    >>> # scaled_genotypes.compute()
    >>> # Or you can put the scaled_genotypes directly into the PCA
    >>> pca = GenotypePCA()
    >>> transformed = pca.fit_transform(scaled_genotypes)

    >>> # Use SKLearn Pipelines
    >>> # https://github.com/pystatgen/sgkit/issues/95#issuecomment-672879865
    >>> from sklearn.pipeline import Pipeline
    >>> est = Pipeline([
    ...     ('scaler', PattersonScaler()),
    ...     ('pca', GenotypePCA(n_components=2))
    ... ])
    >>> pcs = est.fit_transform(genotypes)
    >>> # `est` would also contain loadings + explained variance
    >>> # `scaler` would contain the MAF and binomial variance values
    >>> # needed for out-of-sample projection
    >>> # Out-of-sample projection
    >>> pcs_oos = est.transform(genotypes)
    """

    def __init__(
        self,
        n_components: int = 10,
        copy: bool = True,
        ploidy: int = 2,
        iterated_power: int = 0,
        random_state: Optional[int] = None,
        svd_solver: str = "full",
    ):
        self.n_components = n_components
        self.copy = copy
        self.ploidy = ploidy
        self.svd_solver = svd_solver
        self.iterated_power = iterated_power
        self.random_state = random_state

    def fit(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> "GenotypePCA":
        self._fit(gn)
        return self

    def _get_solver(self) -> str:
        solvers = {"full", "randomized"}
        solver = self.svd_solver

        if solver not in solvers:
            raise ValueError(
                "Invalid solver '{}'. Must be one of {}".format(solver, solvers)
            )
        return solver

    def fit_transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
        u, s, v = self._fit(gn)
        solver = self._get_solver()

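        # da.linalg.svd returns the full reduced SVD, so the full solver's
        # output is truncated to n_components here; svd_compressed already
        # returns only n_components singular vectors/values.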
        if solver in {"full"}:
            u = u[:, : self.n_components]
            u *= s[: self.n_components]
        else:
            u *= s

        return u

    def _fit(self, gn: ArrayLike) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
        x = gn.T
        n_samples, n_features = x.shape

        solver = self._get_solver()

        if solver in {"full"}:
            u, s, v = da.linalg.svd(x)
        else:
            # randomized
            random_state = check_random_state(self.random_state)
            seed = draw_seed(random_state, np.iinfo("int32").max)  # type: ignore
            n_power_iter = self.iterated_power
            u, s, v = da.linalg.svd_compressed(
                x, self.n_components, n_power_iter=n_power_iter, seed=seed
            )

        n_components = self.n_components
        if solver in {"full"}:

Review comment: Can this if statement not be collapsed with the if statement above?

            # calculate explained variance
            explained_variance_ = (s ** 2) / n_samples
            explained_variance_ratio_ = explained_variance_ / da.sum(
                explained_variance_
            )
            # store variables
            self.components_ = v[:n_components]
            self.explained_variance_ = explained_variance_[:n_components]
            self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
        else:
            # randomized
            # https://github.com/cggh/scikit-allel/blob/master/allel/stats/decomposition.py#L219
            self.explained_variance_ = exp_var = (s ** 2) / n_samples
            full_var = np.var(x, axis=0).sum()
            self.explained_variance_ratio_ = exp_var / full_var
            self.components_ = v
            # self.components_ = v[:n_components]

        return u, s, v
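
One way to address the review comment above: since both `if solver in {"full"}` checks guard the same condition, the SVD call and the variance bookkeeping could live in a single branch. A minimal sketch of the merged `_fit` body (an illustration, not part of this diff):

n_components = self.n_components
if solver == "full":
    u, s, v = da.linalg.svd(x)
    explained_variance_ = (s ** 2) / n_samples
    explained_variance_ratio_ = explained_variance_ / da.sum(explained_variance_)
    self.components_ = v[:n_components]
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
else:
    # randomized
    random_state = check_random_state(self.random_state)
    seed = draw_seed(random_state, np.iinfo("int32").max)
    u, s, v = da.linalg.svd_compressed(
        x, self.n_components, n_power_iter=self.iterated_power, seed=seed
    )
    self.explained_variance_ = exp_var = (s ** 2) / n_samples
    full_var = np.var(x, axis=0).sum()
    self.explained_variance_ratio_ = exp_var / full_var
    self.components_ = v
return u, s, v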

    def transform(self, gn: ArrayLike) -> ArrayLike:
        if not hasattr(self, "components_"):
            raise ValueError("model has not been fitted")

        x = gn.T
        # apply transformation
        x_transformed = da.dot(x, self.components_.T)
        return x_transformed
@@ -0,0 +1,179 @@
from typing import Optional

import dask.array as da
from sklearn.base import BaseEstimator, TransformerMixin

from ..typing import ArrayLike


class PattersonScaler(TransformerMixin, BaseEstimator):  # type: ignore
    """New Patterson Scaler with SKLearn API

Review comment: It would be better IMO if this was a description of what Patterson scaling means, and I don't think it makes sense to say that it's "new" in some way. Also nit: can you use "scikit-learn" or "sklearn" when referring to it? As far as I know "SKLearn" isn't a common casing.

    Parameters
    ----------
    copy : boolean, optional, default True
        ignored
    ploidy : int, optional, default 2
        The ploidy of the samples. Assumed to be 2 for diploid samples.

    Attributes
    ----------
    mean_ : ndarray or None, shape (n_variants, 1)
        The mean value for each feature in the training set.
    std_ : ndarray or None, shape (n_variants, 1)
        Scaling factor.

    Differences from scikit-allel
    -----------------------------
    * The scalers have been separated out from the PCAs to conform with
      SKLearn Pipelines - https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
    * Uses Dask under the hood instead of numpy

    Examples
    --------
    >>> from sgkit.stats.preprocessing import PattersonScaler
    >>> from sgkit.stats.decomposition import GenotypePCA
    >>> import dask.array as da

    >>> # Let's generate some random diploid genotype data
    >>> # with 30000 variants and 67 samples
    >>> n_variants = 30000
    >>> n_samples = 67
    >>> genotypes = da.random.choice(3, n_variants * n_samples)
    >>> genotypes = genotypes.reshape(n_variants, n_samples)
    >>> scaler = PattersonScaler()
    >>> scaled_genotypes = scaler.fit_transform(genotypes)
    """

    def __init__(self, copy: bool = True, ploidy: int = 2):
        self.mean_: ArrayLike = None
        self.std_: ArrayLike = None
        self.copy: bool = copy
        self.ploidy: int = ploidy

    def _reset(self) -> None:

Review comment: Scikit-learn has features for this sort of thing, namely `clone`. That would work for this since the stateful parameters are suffixed by `_`.

"""Reset internal data-dependent state of the scaler, if necessary. | ||
__init__ parameters are not touched. | ||
""" | ||
|
||
# Checking one attribute is enough, becase they are all set together | ||
# in fit | ||
if hasattr(self, "mean_"): | ||
del self.mean_ | ||
del self.std_ | ||
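
A minimal sketch of the `clone` approach from the review comment above (illustrative only, not part of this diff): `sklearn.base.clone` builds a fresh, unfitted estimator from the constructor parameters alone, dropping fitted state, so no manual reset is needed.

from sklearn.base import clone

scaler = PattersonScaler(ploidy=2).fit(genotypes)  # fitted: has mean_ / std_
fresh = clone(scaler)  # unfitted copy with the same copy/ploidy parameters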

    def fit(self, gn: ArrayLike) -> "PattersonScaler":
        """Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        gn : array-like, shape (n_variants, n_samples)
            Genotype calls
        """

        # Reset internal state before fitting
        self._reset()

Review comment: I don't think this is necessary if we go the clone-via-sklearn approach.

|
||
# find the mean | ||
self.mean_ = gn.mean(axis=1, keepdims=True) | ||
|
||
# find the scaling factor | ||
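        # (Patterson, Price & Reich 2006): mean / ploidy estimates the
        # alternate allele frequency p, and sqrt(p * (1 - p)) is the
        # binomial standard deviation of a single allele draw.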
        p = self.mean_ / self.ploidy
        self.std_ = da.sqrt(p * (1 - p))
        return self

    def transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
        # check inputs
        # TODO Add back in type and dim checks
        # copy = copy if copy is not None else self.copy
        # gn = asarray_ndim(gn, 2, copy=copy)

        # if not gn.dtype.kind == 'f':
        #     gn = gn.astype('f2')

        # center
        transformed = gn - self.mean_

        # scale
        transformed = transformed / self.std_

        return transformed

    def fit_transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
        # TODO Raise an Error if this is not a dask array

Review comment: Fwiw I've found a few functions in the Dask array API that will choke if not provided a Dask array explicitly, but most would dispatch to the underlying array type successfully (e.g. da.sqrt). If you find otherwise, then a […]

Reply: I think this comment is mostly resolved by using dask-ml as the base class for the preprocessors and PCA. They have type checking in their functions.

        # if not dask.is_dask_collection(gn):
        #     gn = da.from_array(gn, chunks=gn.shape)
        self.fit(gn)
        return self.transform(gn)
|
||
|
||
class CenterScaler(TransformerMixin, BaseEstimator): # type: ignore | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can drop this unless there's some reason the scikit-learn StandardScaler doesn't work using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right. It's only ported over for the sake of niceness and keeping names consistent. Should I remove? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, remove please. |
||
""" | ||

    Parameters
    ----------
    copy : boolean, optional, default True
        ignored

    Attributes
    ----------
    mean_ : ndarray or None, shape (n_variants, 1)
        The mean value for each feature in the training set.

    Differences from scikit-allel
    -----------------------------
    * The scalers have been separated out from the PCAs to conform with
      SKLearn Pipelines - https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
    * Uses Dask under the hood instead of numpy

    Examples
    --------
    >>> from sgkit.stats.preprocessing import CenterScaler
    >>> import dask.array as da

    >>> # Let's generate some random diploid genotype data
    >>> # with 30000 variants and 67 samples
    >>> n_variants = 30000
    >>> n_samples = 67
    >>> genotypes = da.random.choice(3, n_variants * n_samples)
    >>> genotypes = genotypes.reshape(n_variants, n_samples)
    >>> scaler = CenterScaler()
    >>> scaled_genotypes = scaler.fit_transform(genotypes)
    """

    def __init__(self, copy: bool = True):
        self.copy = copy
        self.mean_ = None

    def _reset(self) -> None:
        """Reset internal data-dependent state of the scaler, if necessary.

        __init__ parameters are not touched.
        """
        del self.mean_

    def fit(self, gn: ArrayLike) -> "CenterScaler":
        self._reset()
        # TODO add back in check input sanity checks
        # gn = asarray_ndim(gn, 2)

        # find mean
        self.mean_ = gn.mean(axis=1, keepdims=True)
        return self

    def transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
        # TODO sanity check inputs
        # gn = asarray_ndim(gn, 2, copy=copy)
        # if not gn.dtype.kind == 'f':
        #     gn = gn.astype('f2')

        # center
        transform = gn - self.mean_

        return transform

    def fit_transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
        self.fit(gn)
        return self.transform(gn, y=y)
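
For reference on the exchange above, a sketch of replacing CenterScaler with scikit-learn's StandardScaler. The truncated comment presumably refers to `with_std=False`, which disables the variance scaling (an assumption on my part); note that StandardScaler centers each column, while CenterScaler centers each variant (row), so the matrix is transposed around the call.

import numpy as np
from sklearn.preprocessing import StandardScaler

gn = np.random.randint(0, 3, size=(100, 10)).astype("f4")  # variants x samples
# center each variant without scaling; assumes with_std=False is what was meant
centered = StandardScaler(with_mean=True, with_std=False).fit_transform(gn.T).T
# equivalent to: gn - gn.mean(axis=1, keepdims=True)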

Review comment: Was the rationale for this to be able to use `super._get_solver()`? Does the dask-ml version override that logic or does it also use the scikit-learn function?