Skip to content

[MRG+1] ENH: scikit-learn API transition #462

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 28, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion doc/api.rst
Original file line number Diff line number Diff line change
@@ -247,4 +247,3 @@ Imbalance-learn provides some fast-prototyping tools.
utils.check_neighbors_object
utils.check_ratio
utils.check_sampling_strategy
utils.hash_X_y
4 changes: 2 additions & 2 deletions doc/combine.rst
Original file line number Diff line number Diff line change
@@ -33,12 +33,12 @@ to their former samplers::
[(0, 64), (1, 262), (2, 4674)]
>>> from imblearn.combine import SMOTEENN
>>> smote_enn = SMOTEENN(random_state=0)
>>> X_resampled, y_resampled = smote_enn.fit_sample(X, y)
>>> X_resampled, y_resampled = smote_enn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4060), (1, 4381), (2, 3502)]
>>> from imblearn.combine import SMOTETomek
>>> smote_tomek = SMOTETomek(random_state=0)
>>> X_resampled, y_resampled = smote_tomek.fit_sample(X, y)
>>> X_resampled, y_resampled = smote_tomek.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4499), (1, 4566), (2, 4413)]

4 changes: 2 additions & 2 deletions doc/ensemble.rst
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@ under-sampling the original set::
[(0, 64), (1, 262), (2, 4674)]
>>> from imblearn.ensemble import EasyEnsemble
>>> ee = EasyEnsemble(random_state=0, n_subsets=10)
>>> X_resampled, y_resampled = ee.fit_sample(X, y)
>>> X_resampled, y_resampled = ee.fit_resample(X, y)
>>> print(X_resampled.shape)
(10, 192, 2)
>>> print(sorted(Counter(y_resampled[0]).items()))
@@ -55,7 +55,7 @@ parameter ``n_max_subset`` and an additional bootstrapping can be activated with
>>> bc = BalanceCascade(random_state=0,
... estimator=LogisticRegression(random_state=0),
... n_max_subset=4)
>>> X_resampled, y_resampled = bc.fit_sample(X, y)
>>> X_resampled, y_resampled = bc.fit_resample(X, y)
>>> print(X_resampled.shape)
(4, 192, 2)
>>> print(sorted(Counter(y_resampled[0]).items()))
8 changes: 2 additions & 6 deletions doc/introduction.rst
Original file line number Diff line number Diff line change
@@ -18,15 +18,11 @@ and adding a sampling functionality through the ``sample`` method:

estimator = obj.fit(data, targets)

:Sampler:
:Resampler:

To resample a data set, each sampler implements::

data_resampled, targets_resampled = obj.sample(data, targets)

Fitting and sampling can also be done in one step::

data_resampled, targets_resampled = obj.fit_sample(data, targets)
data_resampled, targets_resampled = obj.fit_resample(data, targets)

Imbalanced-learn samplers accept the same inputs as scikit-learn:

2 changes: 1 addition & 1 deletion doc/miscellaneous.rst
Original file line number Diff line number Diff line change
@@ -28,7 +28,7 @@ to retain the first 10 elements of the arrays ``X`` and ``y``::
>>> def func(X, y):
... return X[:10], y[:10]
>>> sampler = FunctionSampler(func=func)
>>> X_res, y_res = sampler.fit_sample(X, y)
>>> X_res, y_res = sampler.fit_resample(X, y)
>>> np.all(X_res == X[:10])
True
>>> np.all(y_res == y[:10])
10 changes: 5 additions & 5 deletions doc/over_sampling.rst
Original file line number Diff line number Diff line change
@@ -27,7 +27,7 @@ randomly sampling with replacement the current available samples. The
... class_sep=0.8, random_state=0)
>>> from imblearn.over_sampling import RandomOverSampler
>>> ros = RandomOverSampler(random_state=0)
>>> X_resampled, y_resampled = ros.fit_sample(X, y)
>>> X_resampled, y_resampled = ros.fit_resample(X, y)
>>> from collections import Counter
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4674), (1, 4674), (2, 4674)]
@@ -59,7 +59,7 @@ In addition, :class:`RandomOverSampler` allows to sample heterogeneous data
>>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
... dtype=np.object)
>>> y_hetero = np.array([0, 0, 1])
>>> X_resampled, y_resampled = ros.fit_sample(X_hetero, y_hetero)
>>> X_resampled, y_resampled = ros.fit_resample(X_hetero, y_hetero)
>>> print(X_resampled)
[['xxx' 1 1.0]
['yyy' 2 2.0]
@@ -82,11 +82,11 @@ to over-sample minority classes: (i) the Synthetic Minority Oversampling Techniq
can be used in the same manner::

>>> from imblearn.over_sampling import SMOTE, ADASYN
>>> X_resampled, y_resampled = SMOTE().fit_sample(X, y)
>>> X_resampled, y_resampled = SMOTE().fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4674), (1, 4674), (2, 4674)]
>>> clf_smote = LinearSVC().fit(X_resampled, y_resampled)
>>> X_resampled, y_resampled = ADASYN().fit_sample(X, y)
>>> X_resampled, y_resampled = ADASYN().fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4673), (1, 4662), (2, 4674)]
>>> clf_adasyn = LinearSVC().fit(X_resampled, y_resampled)
@@ -147,7 +147,7 @@ The :class:`BorderlineSMOTE` and :class:`SVMSMOTE` offer some variant of the SMO
algorithm::

>>> from imblearn.over_sampling import BorderlineSMOTE
>>> X_resampled, y_resampled = BorderlineSMOTE().fit_sample(X, y)
>>> X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4674), (1, 4674), (2, 4674)]

24 changes: 12 additions & 12 deletions doc/under_sampling.rst
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@ K-means method instead of the original samples::
[(0, 64), (1, 262), (2, 4674)]
>>> from imblearn.under_sampling import ClusterCentroids
>>> cc = ClusterCentroids(random_state=0)
>>> X_resampled, y_resampled = cc.fit_sample(X, y)
>>> X_resampled, y_resampled = cc.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 64), (2, 64)]

@@ -82,7 +82,7 @@ randomly selecting a subset of data for the targeted classes::

>>> from imblearn.under_sampling import RandomUnderSampler
>>> rus = RandomUnderSampler(random_state=0)
>>> X_resampled, y_resampled = rus.fit_sample(X, y)
>>> X_resampled, y_resampled = rus.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 64), (2, 64)]

@@ -99,7 +99,7 @@ by considering independently each targeted class::
>>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
(192, 2)
>>> rus = RandomUnderSampler(random_state=0, replacement=True)
>>> X_resampled, y_resampled = rus.fit_sample(X, y)
>>> X_resampled, y_resampled = rus.fit_resample(X, y)
>>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
(181, 2)

@@ -109,7 +109,7 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data
>>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
... dtype=np.object)
>>> y_hetero = np.array([0, 0, 1])
>>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y_hetero)
>>> X_resampled, y_resampled = rus.fit_resample(X_hetero, y_hetero)
>>> print(X_resampled)
[['xxx' 1 1.0]
['zzz' 3 3.0]]
@@ -126,7 +126,7 @@ be selected with the parameter ``version``::

>>> from imblearn.under_sampling import NearMiss
>>> nm1 = NearMiss(version=1)
>>> X_resampled_nm1, y_resampled = nm1.fit_sample(X, y)
>>> X_resampled_nm1, y_resampled = nm1.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 64), (2, 64)]

@@ -261,7 +261,7 @@ the sample inspected to keep it in the dataset::
[(0, 64), (1, 262), (2, 4674)]
>>> from imblearn.under_sampling import EditedNearestNeighbours
>>> enn = EditedNearestNeighbours()
>>> X_resampled, y_resampled = enn.fit_sample(X, y)
>>> X_resampled, y_resampled = enn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 213), (2, 4568)]

@@ -275,7 +275,7 @@ Generally, repeating the algorithm will delete more data::

>>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours
>>> renn = RepeatedEditedNearestNeighbours()
>>> X_resampled, y_resampled = renn.fit_sample(X, y)
>>> X_resampled, y_resampled = renn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 208), (2, 4551)]

@@ -285,7 +285,7 @@ internal nearest neighbors algorithm is increased at each iteration::

>>> from imblearn.under_sampling import AllKNN
>>> allknn = AllKNN()
>>> X_resampled, y_resampled = allknn.fit_sample(X, y)
>>> X_resampled, y_resampled = allknn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 220), (2, 4601)]

@@ -323,7 +323,7 @@ The :class:`CondensedNearestNeighbour` can be used in the following manner::

>>> from imblearn.under_sampling import CondensedNearestNeighbour
>>> cnn = CondensedNearestNeighbour(random_state=0)
>>> X_resampled, y_resampled = cnn.fit_sample(X, y)
>>> X_resampled, y_resampled = cnn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 24), (2, 115)]

@@ -338,7 +338,7 @@ used as::

>>> from imblearn.under_sampling import OneSidedSelection
>>> oss = OneSidedSelection(random_state=0)
>>> X_resampled, y_resampled = oss.fit_sample(X, y)
>>> X_resampled, y_resampled = oss.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 174), (2, 4403)]

@@ -352,7 +352,7 @@ neighbors classifier. The class can be used as::

>>> from imblearn.under_sampling import NeighbourhoodCleaningRule
>>> ncr = NeighbourhoodCleaningRule()
>>> X_resampled, y_resampled = ncr.fit_sample(X, y)
>>> X_resampled, y_resampled = ncr.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 234), (2, 4666)]

@@ -380,7 +380,7 @@ removed. The class can be used as::
>>> from imblearn.under_sampling import InstanceHardnessThreshold
>>> iht = InstanceHardnessThreshold(random_state=0,
... estimator=LogisticRegression())
>>> X_resampled, y_resampled = iht.fit_sample(X, y)
>>> X_resampled, y_resampled = iht.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 64), (2, 64)]

5 changes: 5 additions & 0 deletions doc/whats_new/v0.0.4.rst
Original file line number Diff line number Diff line change
@@ -18,6 +18,11 @@ API
- Enable to use a ``list`` for the cleaning methods to specify the class to
sample. :issue:`411` by :user:`Guillaume Lemaitre <glemaitre>`.

- Replace ``fit_sample`` with ``fit_resample``. An alias is still available for
backward compatibility. In addition, ``sample`` has been removed to avoid
resampling on a different set of data.
:issue:`462` by :user:`Guillaume Lemaitre <glemaitre>`.

New features
............

2 changes: 1 addition & 1 deletion examples/applications/plot_over_sampling_benchmark_lfw.py
Original file line number Diff line number Diff line change
@@ -39,7 +39,7 @@ def sample(self, X, y):
def fit(self, X, y):
return self

def fit_sample(self, X, y):
def fit_resample(self, X, y):
return self.sample(X, y)


2 changes: 1 addition & 1 deletion examples/applications/porto_seguro_keras_under_sampling.py
Original file line number Diff line number Diff line change
@@ -49,7 +49,7 @@
###############################################################################

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
2 changes: 1 addition & 1 deletion examples/combine/plot_comparison_combine.py
Original file line number Diff line number Diff line change
@@ -47,7 +47,7 @@ def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3,


def plot_resampling(X, y, sampling, ax):
X_res, y_res = sampling.fit_sample(X, y)
X_res, y_res = sampling.fit_resample(X, y)
ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k')
# make nice plotting
ax.spines['top'].set_visible(False)
2 changes: 1 addition & 1 deletion examples/combine/plot_smote_enn.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply SMOTE + ENN
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_sample(X, y)
X_resampled, y_resampled = sm.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
2 changes: 1 addition & 1 deletion examples/combine/plot_smote_tomek.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply SMOTE + Tomek links
sm = SMOTETomek()
X_resampled, y_resampled = sm.fit_sample(X, y)
X_resampled, y_resampled = sm.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
2 changes: 1 addition & 1 deletion examples/ensemble/plot_balance_cascade.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply Balance Cascade method
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_sample(X, y)
X_resampled, y_resampled = bc.fit_resample(X, y)
X_res_vis = []
for X_res in X_resampled:
X_res_vis.append(pca.transform(X_res))
2 changes: 1 addition & 1 deletion examples/ensemble/plot_easy_ensemble.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply Easy Ensemble
ee = EasyEnsemble(n_subsets=3)
X_resampled, y_resampled = ee.fit_sample(X, y)
X_resampled, y_resampled = ee.fit_resample(X, y)
X_res_vis = []
for X_res in X_resampled:
X_res_vis.append(pca.transform(X_res))
2 changes: 1 addition & 1 deletion examples/over-sampling/plot_adasyn.py
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@

# Apply the random over-sampling
ada = ADASYN()
X_resampled, y_resampled = ada.fit_sample(X, y)
X_resampled, y_resampled = ada.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
20 changes: 5 additions & 15 deletions examples/over-sampling/plot_comparison_over_sampling.py
Original file line number Diff line number Diff line change
@@ -23,8 +23,7 @@
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.base import SamplerMixin
from imblearn.utils import hash_X_y
from imblearn.base import BaseSampler

print(__doc__)

@@ -49,7 +48,7 @@ def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3,


def plot_resampling(X, y, sampling, ax):
X_res, y_res = sampling.fit_sample(X, y)
X_res, y_res = sampling.fit_resample(X, y)
ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k')
# make nice plotting
ax.spines['top'].set_visible(False)
@@ -131,20 +130,11 @@ def plot_decision_function(X, y, clf, ax):


# Make an identity sampler
class FakeSampler(SamplerMixin):
class FakeSampler(BaseSampler):

def fit(self, X, y):
self.ratio_ = 1
self.X_hash_ = hash_X_y(X, y)
return self
_sampling_type = 'bypass'

def sample(self, X, y):
return X,

def _sample(self, X, y):
pass

def fit_sample(self, X, y):
def _fit_resample(self, X, y):
return X, y


2 changes: 1 addition & 1 deletion examples/over-sampling/plot_random_over_sampling.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply the random over-sampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(X, y)
X_resampled, y_resampled = ros.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
2 changes: 1 addition & 1 deletion examples/over-sampling/plot_smote.py
Original file line number Diff line number Diff line change
@@ -57,7 +57,7 @@ def plot_resampling(ax, X, y, title):
y_resampled = []
X_res_vis = []
for method in sm:
X_res, y_res = method.fit_sample(X, y)
X_res, y_res = method.fit_resample(X, y)
X_resampled.append(X_res)
y_resampled.append(y_res)
X_res_vis.append(pca.transform(X_res))
2 changes: 1 addition & 1 deletion examples/plot_outlier_rejections.py
Original file line number Diff line number Diff line change
@@ -73,7 +73,7 @@ def outlier_rejection(X, y):


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_sample(X_train, y_train)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
Loading