PCA #123

Closed
wants to merge 9 commits
Changes from 2 commits

2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -30,4 +30,4 @@ repos:
hooks:
- id: mypy
args: ["--strict", "--show-error-codes"]
additional_dependencies: ["numpy", "xarray", "dask[array]", "scipy", "zarr", "numba"]
additional_dependencies: ["numpy", "xarray", "dask[array]", "scipy", "zarr", "numba", "sklearn"]
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ xarray
dask[array]
scipy
numba
zarr
zarr
sklearn
5 changes: 4 additions & 1 deletion setup.cfg
@@ -31,6 +31,7 @@ install_requires =
scipy
zarr
numba
sklearn
setuptools >= 41.2 # For pkg_resources
setup_requires =
setuptools >= 41.2
@@ -59,13 +60,15 @@ ignore =
profile = black
default_section = THIRDPARTY
known_first_party = sgkit
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,sklearn,xarray,yaml,zarr
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
use_parentheses = True
line_length = 88

[mypy-sklearn.*]
ignore_missing_imports = True
[mypy-dask.*]
ignore_missing_imports = True
[mypy-numpy.*]
169 changes: 169 additions & 0 deletions sgkit/stats/decomposition.py
@@ -0,0 +1,169 @@
from typing import Optional, Tuple

import dask.array as da
import numpy as np
import sklearn.decomposition
from sklearn.utils.validation import check_random_state

from ..typing import ArrayLike


# https://github.com/dask/dask-ml/blob/b94c587abae3f5667eff131b0616ad8f91966e7f/dask_ml/_utils.py#L15
# Grabbing this from dask-ml to avoid declaring a dependency on dask-ml
def draw_seed(random_state, low, high=None): # type: ignore
return random_state.randint(low, high)
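
A quick sanity check of this helper, not part of the diff: `numpy.random.RandomState.randint(low)` with no `high` draws from `[0, low)`, so the call below returns a valid int32 seed.

```python
# Hypothetical usage of draw_seed, reusing the imports above.
rs = np.random.RandomState(0)
seed = draw_seed(rs, np.iinfo("int32").max)
assert 0 <= seed < np.iinfo("int32").max
```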


class GenotypePCA(sklearn.decomposition.PCA): # type: ignore
Collaborator: Was the rationale for this to be able to use super()._get_solver()? Does the dask-ml version override that logic or does it also use the scikit-learn function?

"""
Collaborator: A description here would be good, possibly with some appeal to the relationship with scikit-allel given that there is a "Differences from scikit-allel" section.


Parameters
----------
copy : boolean, optional, default True
ignored
ploidy : int, optional, default 2
The ploidy of the samples. Assumed to be 2 for diploid samples.
n_components : int, optional, default 10
Number of principal components to compute.

Attributes
----------
components_ : array, shape (n_components, n_variants)
Principal axes in genotype space (the loadings).
explained_variance_ : array, shape (n_components,)
Variance explained by each of the selected components.
explained_variance_ratio_ : array, shape (n_components,)
Fraction of the total variance explained by each of the selected components.

Differences from scikit-allel
----------
* The scalers have been separated out from the PCAs to conform with
SKLearn Pipelines - https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

* Uses Dask under the hood instead of numpy
* svd_solver : 'randomized' uses ``dask.linalg.svd_compressed``
'full' uses ``dask.linalg.svd``, 'arpack' is not valid.
* iterated_power : defaults to ``0``, the default for
``dask.linalg.svd_compressed``.

Examples
--------
>>> from sgkit.stats.preprocessing import PattersonScaler
>>> from sgkit.stats.decomposition import GenotypePCA
>>> import dask.array as da

>>> # Let's generate some random diploid genotype data
>>> # With 30000 variants and 67 samples
>>> n_variants = 30000
>>> n_samples = 67
>>> genotypes = da.random.choice(3, n_variants * n_samples)
>>> genotypes = genotypes.reshape(n_variants, n_samples)

>>> scaler = PattersonScaler()
>>> scaled_genotypes = scaler.fit_transform(genotypes)
>>> # If you want to deal with the scaled values directly
>>> # scaled_genotypes.compute()
>>> # Or you can put the scaled_genotypes directly into the PCA
>>> pca = GenotypePCA()
>>> transformed = pca.fit_transform(scaled_genotypes)

>>> # Use SKLearn Pipelines
>>> # https://github.com/pystatgen/sgkit/issues/95#issuecomment-672879865
>>> from sklearn.pipeline import Pipeline
>>> est = Pipeline([ \
('scaler', PattersonScaler()), \
('pca', GenotypePCA(n_components=2)) \
])
>>> pcs = est.fit_transform(genotypes)
>>> # `est` would also contain loadings + explained variance
>>> # `scaler` would contain the MAF and binomial variance values needed for out-of-sample projection
>>> # Out-of-sample projection
>>> pcs_oos = est.transform(genotypes)
"""

def __init__(
self,
n_components: int = 10,
copy: bool = True,
ploidy: int = 2,
iterated_power: int = 0,
random_state: Optional[int] = None,
svd_solver: str = "full",
):
self.n_components = n_components
self.copy = copy
self.ploidy = ploidy
self.svd_solver = svd_solver
self.iterated_power = iterated_power
self.random_state = random_state

def fit(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> "GenotypePCA":
self._fit(gn)
return self

def _get_solver(self) -> str:
solvers = {"full", "randomized"}
solver = self.svd_solver

if solver not in solvers:
raise ValueError(
"Invalid solver '{}'. Must be one of {}".format(solver, solvers)
)
return solver

def fit_transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
u, s, v = self._fit(gn)
solver = self._get_solver()

if solver in {"full"}:
u = u[:, : self.n_components]
u *= s[: self.n_components]
else:
u *= s

return u
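
For context, not part of the PR: with `x = U S Vᵀ`, the component scores are `x V = U S`, so slicing `u` and `s` to the first `n_components` (as above) matches projecting `x` onto the leading right singular vectors. A minimal numpy check of that identity:

```python
# Sketch only: U_k * S_k equals X @ V_k for any matrix X.
import numpy as np

X = np.random.default_rng(0).normal(size=(8, 5))
U, S, Vt = np.linalg.svd(X, full_matrices=False)
k = 2
assert np.allclose(U[:, :k] * S[:k], X @ Vt[:k].T)
```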

def _fit(self, gn: ArrayLike) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
x = gn.T
n_samples, n_features = x.shape

solver = self._get_solver()
if solver in {"full"}:
u, s, v = da.linalg.svd(x)
else:
# randomized
random_state = check_random_state(self.random_state)
seed = draw_seed(random_state, np.iinfo("int32").max) # type: ignore
n_power_iter = self.iterated_power
u, s, v = da.linalg.svd_compressed(
x, self.n_components, n_power_iter=n_power_iter, seed=seed
)

n_components = self.n_components
if solver in {"full"}:
Collaborator: Can this if statement not be collapsed with the if statement above?
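
One possible shape for that suggestion, shown only as a sketch (it reuses the names from this diff and is not a commit in this PR):

```python
# Hypothetical refactor: a single branch per solver inside _fit.
def _fit(self, gn: ArrayLike) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
    x = gn.T
    n_samples, n_features = x.shape
    n_components = self.n_components

    if self._get_solver() == "full":
        u, s, v = da.linalg.svd(x)
        explained_variance_ = (s ** 2) / n_samples
        explained_variance_ratio_ = explained_variance_ / da.sum(explained_variance_)
        self.components_ = v[:n_components]
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
    else:  # randomized
        random_state = check_random_state(self.random_state)
        seed = draw_seed(random_state, np.iinfo("int32").max)
        u, s, v = da.linalg.svd_compressed(
            x, n_components, n_power_iter=self.iterated_power, seed=seed
        )
        self.explained_variance_ = exp_var = (s ** 2) / n_samples
        self.explained_variance_ratio_ = exp_var / np.var(x, axis=0).sum()
        self.components_ = v

    return u, s, v
```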

# calculate explained variance
explained_variance_ = (s ** 2) / n_samples
explained_variance_ratio_ = explained_variance_ / da.sum(
explained_variance_
)
# store variables
self.components_ = v[:n_components]
self.explained_variance_ = explained_variance_[:n_components]
self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
else:
# randomized
# https://github.com/cggh/scikit-allel/blob/master/allel/stats/decomposition.py#L219
self.explained_variance_ = exp_var = (s ** 2) / n_samples
full_var = np.var(x, axis=0).sum()
self.explained_variance_ratio_ = exp_var / full_var
self.components_ = v
# self.components_ = v[:n_components]

return u, s, v

def transform(self, gn: ArrayLike) -> ArrayLike:
if not hasattr(self, "components_"):
raise ValueError("model has not been fitted")

x = gn.T
# apply transformation
x_transformed = da.dot(x, self.components_.T)
return x_transformed
179 changes: 179 additions & 0 deletions sgkit/stats/preprocessing.py
@@ -0,0 +1,179 @@
from typing import Optional

import dask.array as da
from sklearn.base import BaseEstimator, TransformerMixin

from ..typing import ArrayLike


class PattersonScaler(TransformerMixin, BaseEstimator): # type: ignore
"""New Patterson Scaler with SKLearn API
Collaborator: It would be better IMO if this was a description of what Patterson scaling means and I don't think it makes sense to say that it's "new" in some way.

Also nit: can you use "scikit-learn" or "sklearn" when referring to it? As far as I know "SKLearn" isn't a common casing.


Parameters
----------
copy : boolean, optional, default True
ignored
ploidy : int, optional, default 2
The ploidy of the samples. Assumed to be 2 for diploid samples

Attributes
----------
mean_ : ndarray or None, shape (n_variants, 1)
The mean value for each feature in the training set.
std_ : ndarray or None, shape (n_variants, 1)
scaling factor

Differences from scikit-allel
----------
* The scalers have been separated out from the PCAs to conform with
SKLearn Pipelines - https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

* Uses Dask under the hood instead of numpy

Examples
--------
>>> from sgkit.stats.preprocessing import PattersonScaler
>>> from sgkit.stats.decomposition import GenotypePCA
>>> import dask.array as da

>>> # Let's generate some random diploid genotype data
>>> # With 30000 variants and 67 samples
>>> n_variants = 30000
>>> n_samples = 67
>>> genotypes = da.random.choice(3, n_variants * n_samples)
>>> genotypes = genotypes.reshape(n_variants, n_samples)
>>> scaler = PattersonScaler()
>>> scaled_genotypes = scaler.fit_transform(genotypes)
"""

def __init__(self, copy: bool = True, ploidy: int = 2):
self.mean_: ArrayLike = None
self.std_: ArrayLike = None
self.copy: bool = copy
self.ploidy: int = ploidy

def _reset(self) -> None:
Collaborator: Scikit-learn has features for this sort of thing, namely clone. That would work for this since the stateful parameters are suffixed by _, so you can drop this function.
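
A minimal sketch of that clone-based alternative (`sklearn.base.clone` is real; `genotypes` is assumed to be the array from the docstring example above):

```python
# Hypothetical: clone() returns an unfitted copy with the same __init__ params,
# so an explicit _reset() before re-fitting is unnecessary.
from sklearn.base import clone

scaler = PattersonScaler(ploidy=2).fit(genotypes)  # fitted: mean_ / std_ populated
fresh = clone(scaler)                              # same params, no fitted state
```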

"""Reset internal data-dependent state of the scaler, if necessary.
__init__ parameters are not touched.
"""

# Checking one attribute is enough, because they are all set together
# in fit
if hasattr(self, "mean_"):
del self.mean_
del self.std_

def fit(self, gn: ArrayLike) -> "PattersonScaler":
"""Compute the mean and std to be used for later scaling.
Parameters
----------
gn : {array-like}, shape [n_samples, n_features]
Genotype calls
"""

# Reset internal state before fitting
self._reset()
Collaborator: I don't think this is necessary if we go the clone-via-sklearn approach.


# find the mean
self.mean_ = gn.mean(axis=1, keepdims=True)

# find the scaling factor
p = self.mean_ / self.ploidy
self.std_ = da.sqrt(p * (1 - p))
return self

def transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
# check inputs
# TODO Add pack in type and dim checks
# copy = copy if copy is not None else self.copy
# gn = asarray_ndim(gn, 2, copy=copy)

# if not gn.dtype.kind == 'f':
# gn = gn.astype('f2')

# center
transformed = gn - self.mean_

# scale
transformed = transformed / self.std_

return transformed
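
A tiny worked example of the centering and scaling implemented above (numbers chosen by hand, not taken from the PR):

```python
# One variant (row) across four diploid samples, scaled Patterson-style.
import numpy as np

g = np.array([[0.0, 1.0, 2.0, 1.0]])
mean = g.mean(axis=1, keepdims=True)  # 1.0, the mean genotype
p = mean / 2                          # 0.5, the allele frequency estimate
std = np.sqrt(p * (1 - p))            # 0.5, the binomial scale factor
scaled = (g - mean) / std             # [[-2., 0., 2., 0.]]
```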

def fit_transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
# TODO Raise an Error if this is not a dask array
Collaborator: Fwiw I've found a few functions in the Dask array API that will choke if not provided a Dask array explicitly, but most would dispatch to the underlying array type successfully (e.g. da.sqrt). If you find otherwise, then a da.asarray would be better than an error.

Author: I think this comment is mostly resolved by using dask-ml as the base class for the preprocessors and pca. They have type checking in their functions.
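
For reference, the coercion described above could look like the sketch below (`dask.is_dask_collection` and `da.asarray` are real; the helper name is made up):

```python
# Hypothetical helper: pass dask arrays through, wrap anything else lazily.
import dask
import dask.array as da

def _ensure_dask_array(gn):
    return gn if dask.is_dask_collection(gn) else da.asarray(gn)
```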

# if not dask.is_dask_collection(gn):
# gn = da.from_array(gn, chunks=gn.shape)
self.fit(gn)
return self.transform(gn)


class CenterScaler(TransformerMixin, BaseEstimator): # type: ignore
Collaborator (@eric-czech, Aug 21, 2020): I think we can drop this unless there's some reason the scikit-learn StandardScaler doesn't work when using with_std=False. The Dask-ML StandardScaler also has that option as a fallback. I don't see the ploidy arg being used anywhere, so did you see any other reason to port this over?

Author: You're right. It's only ported over for the sake of niceness and keeping names consistent. Should I remove?

Collaborator: Yep, remove please.
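
A sketch of the suggested replacement (scikit-learn's `StandardScaler` is real; note it centers each column, so the (variants, samples) layout used in this PR needs a transpose):

```python
# Hypothetical drop-in for CenterScaler via StandardScaler(with_std=False).
import numpy as np
from sklearn.preprocessing import StandardScaler

genotypes = np.array([[0.0, 1.0, 2.0, 1.0],   # variant 1 across 4 samples
                      [2.0, 2.0, 0.0, 0.0]])  # variant 2
centered = StandardScaler(with_mean=True, with_std=False).fit_transform(genotypes.T).T
```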

"""

Parameters
----------
copy : boolean, optional, default True
ignored
ploidy : int, optional, default 2
The ploidy of the samples. Assumed to be 2 for diploid samples

Attributes
----------
mean_ : ndarray or None, shape (n_variants, 1)
The mean value for each feature in the training set.

Differences from scikit-allel
----------
* The scalers have been separated out from the PCAs to conform with
SKLearn Pipelines - https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

* Uses Dask under the hood instead of numpy

Examples
--------
>>> from sgkit.stats.preprocessing import CenterScaler
>>> import dask.array as da

>>> # Let's generate some random diploid genotype data
>>> # With 30000 variants and 67 samples
>>> n_variants = 30000
>>> n_samples = 67
>>> genotypes = da.random.choice(3, n_variants * n_samples)
>>> genotypes = genotypes.reshape(n_variants, n_samples)
>>> scaler = CenterScaler()
>>> scaled_genotypes = scaler.fit_transform(genotypes)
"""

def __init__(self, copy: bool = True):
self.copy = copy
self.mean_ = None

def _reset(self) -> None:
"""Reset internal data-dependent state of the scaler, if necessary.
__init__ parameters are not touched.
"""
del self.mean_

def fit(self, gn: ArrayLike) -> "CenterScaler":
self._reset()
# TODO add back in check input sanity checks
# gn = asarray_ndim(gn, 2)

# find mean
self.mean_ = gn.mean(axis=1, keepdims=True)
return self

def transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
# TODO sanity check check inputs
# gn = asarray_ndim(gn, 2, copy=copy)
# if not gn.dtype.kind == 'f':
# gn = gn.astype('f2')

# center
transform = gn - self.mean_

return transform

def fit_transform(self, gn: ArrayLike, y: Optional[ArrayLike] = None) -> ArrayLike:
self.fit(gn)
return self.transform(gn, y=y)