Skip to content

[MRG+1] ENH: scikit-learn API transition #462

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 28, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion doc/api.rst
Original file line number Diff line number Diff line change
@@ -247,4 +247,3 @@ Imbalance-learn provides some fast-prototyping tools.
utils.check_neighbors_object
utils.check_ratio
utils.check_sampling_strategy
utils.hash_X_y
4 changes: 2 additions & 2 deletions doc/combine.rst
Original file line number Diff line number Diff line change
@@ -33,12 +33,12 @@ to their former samplers::
[(0, 64), (1, 262), (2, 4674)]
>>> from imblearn.combine import SMOTEENN
>>> smote_enn = SMOTEENN(random_state=0)
>>> X_resampled, y_resampled = smote_enn.fit_sample(X, y)
>>> X_resampled, y_resampled = smote_enn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4060), (1, 4381), (2, 3502)]
>>> from imblearn.combine import SMOTETomek
>>> smote_tomek = SMOTETomek(random_state=0)
>>> X_resampled, y_resampled = smote_tomek.fit_sample(X, y)
>>> X_resampled, y_resampled = smote_tomek.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4499), (1, 4566), (2, 4413)]

4 changes: 2 additions & 2 deletions doc/ensemble.rst
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@ under-sampling the original set::
[(0, 64), (1, 262), (2, 4674)]
>>> from imblearn.ensemble import EasyEnsemble
>>> ee = EasyEnsemble(random_state=0, n_subsets=10)
>>> X_resampled, y_resampled = ee.fit_sample(X, y)
>>> X_resampled, y_resampled = ee.fit_resample(X, y)
>>> print(X_resampled.shape)
(10, 192, 2)
>>> print(sorted(Counter(y_resampled[0]).items()))
@@ -55,7 +55,7 @@ parameter ``n_max_subset`` and an additional bootstrapping can be activated with
>>> bc = BalanceCascade(random_state=0,
... estimator=LogisticRegression(random_state=0),
... n_max_subset=4)
>>> X_resampled, y_resampled = bc.fit_sample(X, y)
>>> X_resampled, y_resampled = bc.fit_resample(X, y)
>>> print(X_resampled.shape)
(4, 192, 2)
>>> print(sorted(Counter(y_resampled[0]).items()))
8 changes: 2 additions & 6 deletions doc/introduction.rst
Original file line number Diff line number Diff line change
@@ -18,15 +18,11 @@ and adding a sampling functionality through the ``sample`` method:

estimator = obj.fit(data, targets)

:Sampler:
:Resampler:

To resample a data set, each sampler implements::

data_resampled, targets_resampled = obj.sample(data, targets)

Fitting and sampling can also be done in one step::

data_resampled, targets_resampled = obj.fit_sample(data, targets)
data_resampled, targets_resampled = obj.fit_resample(data, targets)

Imbalanced-learn samplers accept the same inputs as scikit-learn:

2 changes: 1 addition & 1 deletion doc/miscellaneous.rst
Original file line number Diff line number Diff line change
@@ -28,7 +28,7 @@ to retain the first 10 elements of the arrays ``X`` and ``y``::
>>> def func(X, y):
... return X[:10], y[:10]
>>> sampler = FunctionSampler(func=func)
>>> X_res, y_res = sampler.fit_sample(X, y)
>>> X_res, y_res = sampler.fit_resample(X, y)
>>> np.all(X_res == X[:10])
True
>>> np.all(y_res == y[:10])
10 changes: 5 additions & 5 deletions doc/over_sampling.rst
Original file line number Diff line number Diff line change
@@ -27,7 +27,7 @@ randomly sampling with replacement the current available samples. The
... class_sep=0.8, random_state=0)
>>> from imblearn.over_sampling import RandomOverSampler
>>> ros = RandomOverSampler(random_state=0)
>>> X_resampled, y_resampled = ros.fit_sample(X, y)
>>> X_resampled, y_resampled = ros.fit_resample(X, y)
>>> from collections import Counter
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4674), (1, 4674), (2, 4674)]
@@ -59,7 +59,7 @@ In addition, :class:`RandomOverSampler` allows to sample heterogeneous data
>>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
... dtype=np.object)
>>> y_hetero = np.array([0, 0, 1])
>>> X_resampled, y_resampled = ros.fit_sample(X_hetero, y_hetero)
>>> X_resampled, y_resampled = ros.fit_resample(X_hetero, y_hetero)
>>> print(X_resampled)
[['xxx' 1 1.0]
['yyy' 2 2.0]
@@ -82,11 +82,11 @@ to over-sample minority classes: (i) the Synthetic Minority Oversampling Techniq
can be used in the same manner::

>>> from imblearn.over_sampling import SMOTE, ADASYN
>>> X_resampled, y_resampled = SMOTE().fit_sample(X, y)
>>> X_resampled, y_resampled = SMOTE().fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4674), (1, 4674), (2, 4674)]
>>> clf_smote = LinearSVC().fit(X_resampled, y_resampled)
>>> X_resampled, y_resampled = ADASYN().fit_sample(X, y)
>>> X_resampled, y_resampled = ADASYN().fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4673), (1, 4662), (2, 4674)]
>>> clf_adasyn = LinearSVC().fit(X_resampled, y_resampled)
@@ -147,7 +147,7 @@ The :class:`BorderlineSMOTE` and :class:`SVMSMOTE` offer some variant of the SMO
algorithm::

>>> from imblearn.over_sampling import BorderlineSMOTE
>>> X_resampled, y_resampled = BorderlineSMOTE().fit_sample(X, y)
>>> X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4674), (1, 4674), (2, 4674)]

24 changes: 12 additions & 12 deletions doc/under_sampling.rst
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@ K-means method instead of the original samples::
[(0, 64), (1, 262), (2, 4674)]
>>> from imblearn.under_sampling import ClusterCentroids
>>> cc = ClusterCentroids(random_state=0)
>>> X_resampled, y_resampled = cc.fit_sample(X, y)
>>> X_resampled, y_resampled = cc.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 64), (2, 64)]

@@ -82,7 +82,7 @@ randomly selecting a subset of data for the targeted classes::

>>> from imblearn.under_sampling import RandomUnderSampler
>>> rus = RandomUnderSampler(random_state=0)
>>> X_resampled, y_resampled = rus.fit_sample(X, y)
>>> X_resampled, y_resampled = rus.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 64), (2, 64)]

@@ -99,7 +99,7 @@ by considering independently each targeted class::
>>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
(192, 2)
>>> rus = RandomUnderSampler(random_state=0, replacement=True)
>>> X_resampled, y_resampled = rus.fit_sample(X, y)
>>> X_resampled, y_resampled = rus.fit_resample(X, y)
>>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
(181, 2)

@@ -109,7 +109,7 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data
>>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
... dtype=np.object)
>>> y_hetero = np.array([0, 0, 1])
>>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y_hetero)
>>> X_resampled, y_resampled = rus.fit_resample(X_hetero, y_hetero)
>>> print(X_resampled)
[['xxx' 1 1.0]
['zzz' 3 3.0]]
@@ -126,7 +126,7 @@ be selected with the parameter ``version``::

>>> from imblearn.under_sampling import NearMiss
>>> nm1 = NearMiss(version=1)
>>> X_resampled_nm1, y_resampled = nm1.fit_sample(X, y)
>>> X_resampled_nm1, y_resampled = nm1.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 64), (2, 64)]

@@ -261,7 +261,7 @@ the sample inspected to keep it in the dataset::
[(0, 64), (1, 262), (2, 4674)]
>>> from imblearn.under_sampling import EditedNearestNeighbours
>>> enn = EditedNearestNeighbours()
>>> X_resampled, y_resampled = enn.fit_sample(X, y)
>>> X_resampled, y_resampled = enn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 213), (2, 4568)]

@@ -275,7 +275,7 @@ Generally, repeating the algorithm will delete more data::

>>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours
>>> renn = RepeatedEditedNearestNeighbours()
>>> X_resampled, y_resampled = renn.fit_sample(X, y)
>>> X_resampled, y_resampled = renn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 208), (2, 4551)]

@@ -285,7 +285,7 @@ internal nearest neighbors algorithm is increased at each iteration::

>>> from imblearn.under_sampling import AllKNN
>>> allknn = AllKNN()
>>> X_resampled, y_resampled = allknn.fit_sample(X, y)
>>> X_resampled, y_resampled = allknn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 220), (2, 4601)]

@@ -323,7 +323,7 @@ The :class:`CondensedNearestNeighbour` can be used in the following manner::

>>> from imblearn.under_sampling import CondensedNearestNeighbour
>>> cnn = CondensedNearestNeighbour(random_state=0)
>>> X_resampled, y_resampled = cnn.fit_sample(X, y)
>>> X_resampled, y_resampled = cnn.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 24), (2, 115)]

@@ -338,7 +338,7 @@ used as::

>>> from imblearn.under_sampling import OneSidedSelection
>>> oss = OneSidedSelection(random_state=0)
>>> X_resampled, y_resampled = oss.fit_sample(X, y)
>>> X_resampled, y_resampled = oss.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 174), (2, 4403)]

@@ -352,7 +352,7 @@ neighbors classifier. The class can be used as::

>>> from imblearn.under_sampling import NeighbourhoodCleaningRule
>>> ncr = NeighbourhoodCleaningRule()
>>> X_resampled, y_resampled = ncr.fit_sample(X, y)
>>> X_resampled, y_resampled = ncr.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 234), (2, 4666)]

@@ -380,7 +380,7 @@ removed. The class can be used as::
>>> from imblearn.under_sampling import InstanceHardnessThreshold
>>> iht = InstanceHardnessThreshold(random_state=0,
... estimator=LogisticRegression())
>>> X_resampled, y_resampled = iht.fit_sample(X, y)
>>> X_resampled, y_resampled = iht.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 64), (1, 64), (2, 64)]

5 changes: 5 additions & 0 deletions doc/whats_new/v0.0.4.rst
Original file line number Diff line number Diff line change
@@ -18,6 +18,11 @@ API
- Enable to use a ``list`` for the cleaning methods to specify the class to
sample. :issue:`411` by :user:`Guillaume Lemaitre <glemaitre>`.

- Replace ``fit_sample`` with ``fit_resample``. An alias is still available for
backward compatibility. In addition, ``sample`` has been removed to avoid
resampling on a different set of data.
:issue:`462` by :user:`Guillaume Lemaitre <glemaitre>`.

New features
............

2 changes: 1 addition & 1 deletion examples/applications/plot_over_sampling_benchmark_lfw.py
Original file line number Diff line number Diff line change
@@ -39,7 +39,7 @@ def sample(self, X, y):
def fit(self, X, y):
return self

def fit_sample(self, X, y):
def fit_resample(self, X, y):
return self.sample(X, y)


2 changes: 1 addition & 1 deletion examples/applications/porto_seguro_keras_under_sampling.py
Original file line number Diff line number Diff line change
@@ -49,7 +49,7 @@
###############################################################################

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
2 changes: 1 addition & 1 deletion examples/combine/plot_comparison_combine.py
Original file line number Diff line number Diff line change
@@ -47,7 +47,7 @@ def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3,


def plot_resampling(X, y, sampling, ax):
X_res, y_res = sampling.fit_sample(X, y)
X_res, y_res = sampling.fit_resample(X, y)
ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k')
# make nice plotting
ax.spines['top'].set_visible(False)
2 changes: 1 addition & 1 deletion examples/combine/plot_smote_enn.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply SMOTE + ENN
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_sample(X, y)
X_resampled, y_resampled = sm.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
2 changes: 1 addition & 1 deletion examples/combine/plot_smote_tomek.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply SMOTE + Tomek links
sm = SMOTETomek()
X_resampled, y_resampled = sm.fit_sample(X, y)
X_resampled, y_resampled = sm.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
2 changes: 1 addition & 1 deletion examples/ensemble/plot_balance_cascade.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply Balance Cascade method
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_sample(X, y)
X_resampled, y_resampled = bc.fit_resample(X, y)
X_res_vis = []
for X_res in X_resampled:
X_res_vis.append(pca.transform(X_res))
2 changes: 1 addition & 1 deletion examples/ensemble/plot_easy_ensemble.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply Easy Ensemble
ee = EasyEnsemble(n_subsets=3)
X_resampled, y_resampled = ee.fit_sample(X, y)
X_resampled, y_resampled = ee.fit_resample(X, y)
X_res_vis = []
for X_res in X_resampled:
X_res_vis.append(pca.transform(X_res))
2 changes: 1 addition & 1 deletion examples/over-sampling/plot_adasyn.py
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@

# Apply the random over-sampling
ada = ADASYN()
X_resampled, y_resampled = ada.fit_sample(X, y)
X_resampled, y_resampled = ada.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
20 changes: 5 additions & 15 deletions examples/over-sampling/plot_comparison_over_sampling.py
Original file line number Diff line number Diff line change
@@ -23,8 +23,7 @@
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.base import SamplerMixin
from imblearn.utils import hash_X_y
from imblearn.base import BaseSampler

print(__doc__)

@@ -49,7 +48,7 @@ def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3,


def plot_resampling(X, y, sampling, ax):
X_res, y_res = sampling.fit_sample(X, y)
X_res, y_res = sampling.fit_resample(X, y)
ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k')
# make nice plotting
ax.spines['top'].set_visible(False)
@@ -131,20 +130,11 @@ def plot_decision_function(X, y, clf, ax):


# Make an identity sampler
class FakeSampler(SamplerMixin):
class FakeSampler(BaseSampler):

def fit(self, X, y):
self.ratio_ = 1
self.X_hash_ = hash_X_y(X, y)
return self
_sampling_type = 'bypass'

def sample(self, X, y):
return X,

def _sample(self, X, y):
pass

def fit_sample(self, X, y):
def _fit_resample(self, X, y):
return X, y


2 changes: 1 addition & 1 deletion examples/over-sampling/plot_random_over_sampling.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@

# Apply the random over-sampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(X, y)
X_resampled, y_resampled = ros.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
2 changes: 1 addition & 1 deletion examples/over-sampling/plot_smote.py
Original file line number Diff line number Diff line change
@@ -57,7 +57,7 @@ def plot_resampling(ax, X, y, title):
y_resampled = []
X_res_vis = []
for method in sm:
X_res, y_res = method.fit_sample(X, y)
X_res, y_res = method.fit_resample(X, y)
X_resampled.append(X_res)
y_resampled.append(y_res)
X_res_vis.append(pca.transform(X_res))
2 changes: 1 addition & 1 deletion examples/plot_outlier_rejections.py
Original file line number Diff line number Diff line change
@@ -73,7 +73,7 @@ def outlier_rejection(X, y):


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_sample(X_train, y_train)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
Loading