diff --git a/doc/api.rst b/doc/api.rst
index ddf076143..f9566146f 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -205,4 +205,5 @@ Imbalance-learn provides some fast-prototyping tools.
    utils.estimator_checks.check_estimator
    utils.check_neighbors_object
    utils.check_ratio
+   utils.check_sampling_strategy
    utils.hash_X_y
diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst
index 47ed6c7c4..7cb4f1909 100644
--- a/doc/datasets/index.rst
+++ b/doc/datasets/index.rst
@@ -94,29 +94,31 @@ Imbalanced generator
 ====================

 :func:`make_imbalance` turns an original dataset into an imbalanced
-dataset. This behaviour is driven by the parameter ``ratio`` which behave
-similarly to other resampling algorithm. ``ratio`` can be given as a dictionary
-where the key corresponds to the class and the value is the the number of
-samples in the class::
+dataset. This behaviour is driven by the parameter ``sampling_strategy`` which
+behaves similarly to other resampling algorithms. ``sampling_strategy`` can be
+given as a dictionary where the key corresponds to the class and the value is
+the number of samples in the class::

   >>> from sklearn.datasets import load_iris
   >>> from imblearn.datasets import make_imbalance
   >>> iris = load_iris()
-  >>> ratio = {0: 20, 1: 30, 2: 40}
-  >>> X_imb, y_imb = make_imbalance(iris.data, iris.target, ratio=ratio)
+  >>> sampling_strategy = {0: 20, 1: 30, 2: 40}
+  >>> X_imb, y_imb = make_imbalance(iris.data, iris.target,
+  ...                               sampling_strategy=sampling_strategy)
   >>> sorted(Counter(y_imb).items())
   [(0, 20), (1, 30), (2, 40)]

 Note that all samples of a class are passed-through if the class is not
 mentioned in the dictionary::

-  >>> ratio = {0: 10}
-  >>> X_imb, y_imb = make_imbalance(iris.data, iris.target, ratio=ratio)
+  >>> sampling_strategy = {0: 10}
+  >>> X_imb, y_imb = make_imbalance(iris.data, iris.target,
+  ...                               sampling_strategy=sampling_strategy)
   >>> sorted(Counter(y_imb).items())
   [(0, 10), (1, 50), (2, 50)]

 Instead of a dictionary, a function can be defined and directly pass to
-``ratio``::
+``sampling_strategy``::

   >>> def ratio_multiplier(y):
   ...     multiplier = {0: 0.5, 1: 0.7, 2: 0.95}
@@ -125,9 +127,9 @@ Instead of a dictionary, a function can be defined and directly pass to
   ...     target_stats[key] = int(value * multiplier[key])
   ...     return target_stats
   >>> X_imb, y_imb = make_imbalance(iris.data, iris.target,
-  ...                               ratio=ratio_multiplier)
+  ...                               sampling_strategy=ratio_multiplier)
   >>> sorted(Counter(y_imb).items())
   [(0, 25), (1, 35), (2, 47)]

 See :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py` and
-:ref:`sphx_glr_auto_examples_plot_ratio_usage.py`.
+:ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`.
diff --git a/doc/developers_utils.rst b/doc/developers_utils.rst
index 04d6fbe55..577045656 100644
--- a/doc/developers_utils.rst
+++ b/doc/developers_utils.rst
@@ -26,9 +26,10 @@ which accepts arrays, matrices, or sparse matrices as arguments, the
 following should be used when applicable.

 - :func:`check_neighbors_object`: Check the objects is consistent to be a NN.
-- :func:`check_target_type`: Check the target types to be conform to the current samplers.
-- :func:`check_ratio`: Checks ratio for consistent type and return a dictionary
-  containing each targeted class with its corresponding number of pixel.
+- :func:`check_target_type`: Check the target types to be conform to the current samplers.
+- :func:`check_sampling_strategy`: Checks that the sampling target is
+  consistent with the type and returns a dictionary containing each targeted
+  class with its corresponding number of samples.

 Deprecation
diff --git a/doc/ensemble.rst b/doc/ensemble.rst
index bda7a74fb..86a7ccc2b 100644
--- a/doc/ensemble.rst
+++ b/doc/ensemble.rst
@@ -92,12 +92,13 @@ output of an :class:`EasyEnsemble` sampler with an ensemble of classifiers
 (i.e. ``BaggingClassifier``). Therefore, :class:`BalancedBaggingClassifier`
 takes the same parameters than the scikit-learn
 ``BaggingClassifier``. Additionally, there is two additional parameters,
-``ratio`` and ``replacement``, as in the :class:`EasyEnsemble` sampler::
+``sampling_strategy`` and ``replacement``, as in the :class:`EasyEnsemble`
+sampler::

   >>> from imblearn.ensemble import BalancedBaggingClassifier
   >>> bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
-  ...                                 ratio='auto',
+  ...                                 sampling_strategy='auto',
   ...                                 replacement=False,
   ...                                 random_state=0)
   >>> bbc.fit(X_train, y_train) # doctest: +ELLIPSIS
diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
index 318fb8b3c..f2412528e 100644
--- a/doc/under_sampling.rst
+++ b/doc/under_sampling.rst
@@ -103,7 +103,7 @@ by considering independently each targeted class::
   >>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
   (181, 2)

-See :ref:`sphx_glr_auto_examples_plot_ratio_usage.py`,
+See :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`,
 :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`,
 and
 :ref:`sphx_glr_auto_examples_under-sampling_plot_random_under_sampler.py`.

@@ -214,11 +214,11 @@ the samples of interest in green.
    :scale: 60
    :align: center

-The parameter ``ratio`` control which sample of the link will be removed. For
-instance, the default (i.e., ``ratio='auto'``) will remove the sample from the
-majority class. Both samples from the majority and minority class can be
-removed by setting ``ratio`` to ``'all'``. The figure illustrates this
-behaviour.
+The parameter ``sampling_strategy`` controls which sample of the link will be
+removed. For instance, the default (i.e., ``sampling_strategy='auto'``) will
+remove the sample from the majority class. Both samples from the majority and
+minority class can be removed by setting ``sampling_strategy`` to ``'all'``. The
+figure illustrates this behaviour.

 .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_illustration_tomek_links_002.png
    :target: ./auto_examples/under-sampling/plot_illustration_tomek_links.html
diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst
index 0e3211b31..41a34338d 100644
--- a/doc/whats_new/v0.0.4.rst
+++ b/doc/whats_new/v0.0.4.rst
@@ -6,6 +6,18 @@ Version 0.4 (under development)
 Changelog
 ---------

+API
+...
+
+- Replace the parameter ``ratio`` by ``sampling_strategy``. :issue:`411` by
+  :user:`Guillaume Lemaitre `.
+
+- Enable to use a ``float`` with binary classification for
+  ``sampling_strategy``. :issue:`411` by :user:`Guillaume Lemaitre `.
+
+- Enable to use a ``list`` for the cleaning methods to specify the class to
+  sample. :issue:`411` by :user:`Guillaume Lemaitre `.
+
 Enhancement
 ...........

@@ -34,3 +46,20 @@ Maintenance

 - Remove deprecated parameters in 0.2 - :issue:`331` by
   :user:`Guillaume Lemaitre `.
+
+Deprecation
+...........
+
+- Deprecate ``ratio`` in favor of ``sampling_strategy``. :issue:`411` by
+  :user:`Guillaume Lemaitre `.
+
+- Deprecate the use of a ``dict`` for cleaning methods. A ``list`` should be
+  used.
:issue:`411` by :user:`Guillaume Lemaitre `. + +- Deprecate ``random_state`` in :class:`imblearn.under_sampling.NearMiss`, + :class:`imblearn.under_sampling.EditedNearestNeighbors`, + :class:`imblearn.under_sampling.RepeatedEditedNearestNeighbors`, + :class:`imblearn.under_sampling.AllKNN`, + :class:`imblearn.under_sampling.NeighbourhoodCleaningRule`, + :class:`imblearn.under_sampling.InstanceHardnessThreshold`, + :class:`imblearn.under_sampling.CondensedNearestNeighbours`. diff --git a/examples/applications/plot_multi_class_under_sampling.py b/examples/applications/plot_multi_class_under_sampling.py index e1d196906..c45129079 100644 --- a/examples/applications/plot_multi_class_under_sampling.py +++ b/examples/applications/plot_multi_class_under_sampling.py @@ -29,8 +29,9 @@ # Create a folder to fetch the dataset iris = load_iris() -X, y = make_imbalance(iris.data, iris.target, ratio={0: 25, 1: 50, 2: 50}, - random_state=0) +X, y = make_imbalance(iris.data, iris.target, + sampling_strategy={0: 25, 1: 50, 2: 50}, + random_state=RANDOM_STATE) X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=RANDOM_STATE) @@ -39,7 +40,7 @@ print('Testing target statistics: {}'.format(Counter(y_test))) # Create a pipeline -pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE), +pipeline = make_pipeline(NearMiss(version=2), LinearSVC(random_state=RANDOM_STATE)) pipeline.fit(X_train, y_train) diff --git a/examples/datasets/plot_make_imbalance.py b/examples/datasets/plot_make_imbalance.py index e8da9630e..444d18943 100644 --- a/examples/datasets/plot_make_imbalance.py +++ b/examples/datasets/plot_make_imbalance.py @@ -55,12 +55,12 @@ def ratio_func(y, multiplier, minority_class): for i, multiplier in enumerate(multipliers, start=1): ax = axs[i] - X_, y_ = make_imbalance(X, y, ratio=ratio_func, + X_, y_ = make_imbalance(X, y, sampling_strategy=ratio_func, **{"multiplier": multiplier, "minority_class": 1}) ax.scatter(X_[y_ == 0, 0], X_[y_ == 0, 1], label="Class #0", alpha=0.5) ax.scatter(X_[y_ == 1, 0], X_[y_ == 1, 1], label="Class #1", alpha=0.5) - ax.set_title('ratio = {}'.format(multiplier)) + ax.set_title('sampling_strategy = {}'.format(multiplier)) plot_decoration(ax) plt.tight_layout() diff --git a/examples/plot_ratio_usage.py b/examples/plot_ratio_usage.py deleted file mode 100644 index 62c86d8f5..000000000 --- a/examples/plot_ratio_usage.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -============================================================ -Usage of the ``ratio`` parameter for the different algorithm -============================================================ - -This example shows how to use the ``ratio`` parameter in the different -examples. It illustrated the use of passing ``ratio`` as a ``str``, ``dict`` or -a callable. 
- -""" - -# Authors: Guillaume Lemaitre -# License: MIT - -from collections import Counter - -import matplotlib.pyplot as plt - -from sklearn.datasets import load_iris - -from imblearn.datasets import make_imbalance -from imblearn.under_sampling import RandomUnderSampler - -print(__doc__) - - -def plot_pie(y): - target_stats = Counter(y) - labels = list(target_stats.keys()) - sizes = list(target_stats.values()) - explode = tuple([0.1] * len(target_stats)) - - fig, ax = plt.subplots() - ax.pie(sizes, explode=explode, labels=labels, shadow=True, - autopct='%1.1f%%') - ax.axis('equal') - - -############################################################################### -# Creation of an imbalanced data set from a balanced data set -############################################################################### - -############################################################################### -# We will show how to use the parameter ``ratio`` when dealing with the -# ``make_imbalance`` function. For this function, this parameter accepts both -# dictionary and callable. When using a dictionary, each key will correspond to -# the class of interest and the corresponding value will be the number of -# samples desired in this class. - -iris = load_iris() - -print('Information of the original iris data set: \n {}'.format( - Counter(iris.target))) -plot_pie(iris.target) - -ratio = {0: 10, 1: 20, 2: 30} -X, y = make_imbalance(iris.data, iris.target, ratio=ratio) - -print('Information of the iris data set after making it' - ' imbalanced using a dict: \n ratio={} \n y: {}'.format(ratio, - Counter(y))) -plot_pie(y) - -############################################################################### -# You might required more flexibility and require your own heuristic to -# determine the number of samples by class and you can define your own callable -# as follow. In this case we will define a function which will use a float -# multiplier to define the number of samples per class. - - -def ratio_multiplier(y): - multiplier = {0: 0.5, 1: 0.7, 2: 0.95} - target_stats = Counter(y) - for key, value in target_stats.items(): - target_stats[key] = int(value * multiplier[key]) - return target_stats - - -X, y = make_imbalance(iris.data, iris.target, ratio=ratio_multiplier) - -print('Information of the iris data set after making it' - ' imbalanced using a callable: \n ratio={} \n y: {}'.format( - ratio_multiplier, Counter(y))) -plot_pie(y) - -############################################################################### -# Using ``ratio`` in resampling algorithm -############################################################################### - -############################################################################### -# In all sampling algorithms, ``ratio`` can be used as illustrated earlier. In -# addition, some predefined functions are available and can be executed using a -# ``str`` with the following choices: (i) ``'minority'``: resample the minority -# class; (ii) ``'majority'``: resample the majority class, (iii) ``'not -# minority'``: resample all classes apart of the minority class, (iv) -# ``'all'``: resample all classes, and (v) ``'auto'``: correspond to 'all' with -# for over-sampling methods and 'not minority' for under-sampling methods. The -# classes targeted will be over-sampled or under-sampled to achieve an equal -# number of sample with the majority or minority class. 
-
-ratio = 'auto'
-X_res, y_res = RandomUnderSampler(ratio=ratio, random_state=0).fit_sample(X, y)
-
-print('Information of the iris data set after balancing using "auto"'
-      ' mode:\n ratio={} \n y: {}'.format(ratio, Counter(y_res)))
-plot_pie(y_res)
-
-###############################################################################
-# However, you can use the dictionary or the callable options as previously
-# mentioned.
-
-ratio = {0: 25, 1: 30, 2: 35}
-X_res, y_res = RandomUnderSampler(ratio=ratio, random_state=0).fit_sample(X, y)
-
-print('Information of the iris data set after balancing using a dict'
-      ' mode:\n ratio={} \n y: {}'.format(ratio, Counter(y_res)))
-plot_pie(y_res)
-
-
-def ratio_multiplier(y):
-    multiplier = {1: 0.7, 2: 0.95}
-    target_stats = Counter(y)
-    for key, value in target_stats.items():
-        target_stats[key] = int(value * multiplier[key])
-    return target_stats
-
-
-X_res, y_res = RandomUnderSampler(ratio=ratio, random_state=0).fit_sample(X, y)
-
-print('Information of the iris data set after balancing using a callable'
-      ' mode:\n ratio={} \n y: {}'.format(ratio, Counter(y_res)))
-plot_pie(y_res)
-
-plt.show()
diff --git a/examples/plot_sampling_target_usage.py b/examples/plot_sampling_target_usage.py
new file mode 100644
index 000000000..f4339572d
--- /dev/null
+++ b/examples/plot_sampling_target_usage.py
@@ -0,0 +1,220 @@
+"""
+===========================================================================
+Usage of the ``sampling_strategy`` parameter for the different algorithms
+===========================================================================
+
+This example shows the different usage of the parameter ``sampling_strategy``
+for the different families of samplers (i.e. over-sampling, under-sampling, or
+cleaning methods).
+
+"""
+
+# Authors: Guillaume Lemaitre
+# License: MIT
+
+from collections import Counter
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import load_iris
+
+from imblearn.datasets import make_imbalance
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.under_sampling import TomekLinks
+
+print(__doc__)
+
+
+def plot_pie(y):
+    target_stats = Counter(y)
+    labels = list(target_stats.keys())
+    sizes = list(target_stats.values())
+    explode = tuple([0.1] * len(target_stats))
+
+    def make_autopct(values):
+        def my_autopct(pct):
+            total = sum(values)
+            val = int(round(pct * total / 100.0))
+            return '{p:.2f}% ({v:d})'.format(p=pct, v=val)
+        return my_autopct
+
+    fig, ax = plt.subplots()
+    ax.pie(sizes, explode=explode, labels=labels, shadow=True,
+           autopct=make_autopct(sizes))
+    ax.axis('equal')
+
+
+###############################################################################
+# First, we will create an imbalanced data set from the iris data set.
+
+iris = load_iris()
+
+print('Information of the original iris data set: \n {}'.format(
+    Counter(iris.target)))
+plot_pie(iris.target)
+
+sampling_strategy = {0: 10, 1: 20, 2: 47}
+X, y = make_imbalance(iris.data, iris.target, sampling_strategy=sampling_strategy)
+
+print('Information of the iris data set after making it'
+      ' imbalanced using a dict: \n sampling_strategy={} \n y: {}'
+      .format(sampling_strategy, Counter(y)))
+plot_pie(y)
+
+###############################################################################
+# Using ``sampling_strategy`` in resampling algorithms
+###############################################################################
+
+###############################################################################
+# ``sampling_strategy`` as a ``float``
+# ....................................
+#
+# ``sampling_strategy`` can be given a ``float``. For **under-sampling
+# methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by
+# :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where :math:`N_{rM}` and
+# :math:`N_{m}` are the number of samples in the majority class after
+# resampling and the number of samples in the minority class, respectively.
+
+# select only 2 classes since the ratio makes sense in this case
+binary_mask = np.bitwise_or(y == 0, y == 2)
+binary_y = y[binary_mask]
+binary_X = X[binary_mask]
+
+sampling_strategy = 0.8
+
+rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
+X_res, y_res = rus.fit_sample(binary_X, binary_y)
+print('Information of the iris data set after making it '
+      'balanced using a float and an under-sampling method: \n '
+      'sampling_strategy={} \n y: {}'
+      .format(sampling_strategy, Counter(y_res)))
+plot_pie(y_res)
+
+###############################################################################
+# For **over-sampling methods**, it corresponds to the ratio
+# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
+# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
+# minority class after resampling and the number of samples in the majority
+# class, respectively.
+
+ros = RandomOverSampler(sampling_strategy=sampling_strategy)
+X_res, y_res = ros.fit_sample(binary_X, binary_y)
+print('Information of the iris data set after making it '
+      'balanced using a float and an over-sampling method: \n '
+      'sampling_strategy={} \n y: {}'
+      .format(sampling_strategy, Counter(y_res)))
+plot_pie(y_res)
+
+###############################################################################
+# ``sampling_strategy`` as a ``str``
+# ..................................
+#
+# ``sampling_strategy`` can be given as a string which specifies the classes
+# targeted by the resampling. With under- and over-sampling, the number of
+# samples will be equalized.
+#
+# Note that we are using multiple classes from now on.
+ +sampling_strategy = 'not minority' + +rus = RandomUnderSampler(sampling_strategy=sampling_strategy) +X_res, y_res = rus.fit_sample(X, y) +print('Information of the iris data set after making it ' + 'balanced by under-sampling: \n sampling_strategy={} \n y: {}' + .format(sampling_strategy, Counter(y_res))) +plot_pie(y_res) + +sampling_strategy = 'not majority' + +ros = RandomOverSampler(sampling_strategy=sampling_strategy) +X_res, y_res = ros.fit_sample(X, y) +print('Information of the iris data set after making it ' + 'balanced by over-sampling: \n sampling_strategy={} \n y: {}' + .format(sampling_strategy, Counter(y_res))) +plot_pie(y_res) + +############################################################################### +# With **cleaning method**, the number of samples in each class will not be +# equalized even if targeted. + +sampling_strategy = 'not minority' +tl = TomekLinks(sampling_strategy) +X_res, y_res = tl.fit_sample(X, y) +print('Information of the iris data set after making it ' + 'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}' + .format(sampling_strategy, Counter(y_res))) +plot_pie(y_res) + +############################################################################### +# ``sampling_strategy`` as a ``dict`` +# .................................. +# +# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted +# classes. The values correspond to the desired number of samples for each +# targeted class. This is working for both **under- and over-sampling** +# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead. + + +sampling_strategy = {0: 10, 1: 15, 2: 20} + +rus = RandomUnderSampler(sampling_strategy=sampling_strategy) +X_res, y_res = rus.fit_sample(X, y) +print('Information of the iris data set after making it ' + 'balanced by under-sampling: \n sampling_strategy={} \n y: {}' + .format(sampling_strategy, Counter(y_res))) +plot_pie(y_res) + +sampling_strategy = {0: 25, 1: 35, 2: 47} + +ros = RandomOverSampler(sampling_strategy=sampling_strategy) +X_res, y_res = ros.fit_sample(X, y) +print('Information of the iris data set after making it ' + 'balanced by over-sampling: \n sampling_strategy={} \n y: {}' + .format(sampling_strategy, Counter(y_res))) +plot_pie(y_res) + +############################################################################### +# ``sampling_strategy`` as a ``list`` +# .................................. +# +# When ``sampling_strategy`` is a ``list``, the list contains the targeted +# classes. It is used only for **cleaning methods** and raise an error +# otherwise. + +sampling_strategy = [0, 1, 2] +tl = TomekLinks(sampling_strategy=sampling_strategy) +X_res, y_res = tl.fit_sample(X, y) +print('Information of the iris data set after making it ' + 'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}' + .format(sampling_strategy, Counter(y_res))) +plot_pie(y_res) + +############################################################################### +# ``sampling_strategy`` as a callable +# .................................. +# +# When callable, function taking ``y`` and returns a ``dict``. The keys +# correspond to the targeted classes. The values correspond to the desired +# number of samples for each class. 
+ + +def ratio_multiplier(y): + multiplier = {1: 0.7, 2: 0.95} + target_stats = Counter(y) + for key, value in target_stats.items(): + if key in multiplier: + target_stats[key] = int(value * multiplier[key]) + return target_stats + + +X_res, y_res = (RandomUnderSampler(sampling_strategy=ratio_multiplier) + .fit_sample(X, y)) + +print('Information of the iris data set after balancing using a callable' + ' mode:\n ratio={} \n y: {}'.format(ratio_multiplier, Counter(y_res))) +plot_pie(y_res) + +plt.show() diff --git a/examples/under-sampling/plot_comparison_under_sampling.py b/examples/under-sampling/plot_comparison_under_sampling.py index bc4c7b66d..3a2c427a3 100644 --- a/examples/under-sampling/plot_comparison_under_sampling.py +++ b/examples/under-sampling/plot_comparison_under_sampling.py @@ -155,9 +155,9 @@ def plot_decision_function(X, y, clf, ax): X, y = create_dataset(n_samples=5000, weights=(0.1, 0.2, 0.7), class_sep=0.8) ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6)) -for ax, sampler in zip(ax_arr, (NearMiss(version=1, random_state=0), - NearMiss(version=2, random_state=0), - NearMiss(version=3, random_state=0))): +for ax, sampler in zip(ax_arr, (NearMiss(version=1), + NearMiss(version=2), + NearMiss(version=3))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax[0]) @@ -182,9 +182,9 @@ def plot_decision_function(X, y, clf, ax): ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6)) for ax, sampler in zip(ax_arr, ( - EditedNearestNeighbours(random_state=0), - RepeatedEditedNearestNeighbours(random_state=0), - AllKNN(random_state=0, allow_minority=True))): + EditedNearestNeighbours(), + RepeatedEditedNearestNeighbours(), + AllKNN(allow_minority=True))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax[0]) @@ -212,7 +212,7 @@ def plot_decision_function(X, y, clf, ax): for ax, sampler in zip(ax_arr, ( CondensedNearestNeighbour(random_state=0), OneSidedSelection(random_state=0), - NeighbourhoodCleaningRule(random_state=0))): + NeighbourhoodCleaningRule())): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax[0]) diff --git a/examples/under-sampling/plot_illustration_tomek_links.py b/examples/under-sampling/plot_illustration_tomek_links.py index 4ad376378..3f3ff469b 100644 --- a/examples/under-sampling/plot_illustration_tomek_links.py +++ b/examples/under-sampling/plot_illustration_tomek_links.py @@ -64,8 +64,9 @@ def make_plot_despine(ax): ############################################################################### # We can run the ``TomekLinks`` sampling to remove the corresponding -# samples. If ``ratio='auto'`` only the sample from the majority class will be -# removed. If ``ratio='all'`` both samples will be removed. +# samples. If ``sampling_strategy='auto'`` only the sample from the majority +# class will be removed. If ``sampling_strategy='all'`` both samples will be +# removed. 
sampler = TomekLinks() @@ -76,8 +77,8 @@ def make_plot_despine(ax): 'Removing all samples') for ax, title, sampler in zip(ax_arr, title_arr, - [TomekLinks(ratio='auto', random_state=0), - TomekLinks(ratio='all', random_state=0)]): + [TomekLinks(sampling_strategy='auto'), + TomekLinks(sampling_strategy='all')]): X_res, y_res = sampler.fit_sample(np.vstack((X_minority, X_majority)), np.array([0] * X_minority.shape[0] + [1] * X_majority.shape[0])) diff --git a/examples/under-sampling/plot_instance_hardness_threshold.py b/examples/under-sampling/plot_instance_hardness_threshold.py index 95730d880..cdea26699 100644 --- a/examples/under-sampling/plot_instance_hardness_threshold.py +++ b/examples/under-sampling/plot_instance_hardness_threshold.py @@ -52,20 +52,21 @@ def plot_resampling(ax, X, y, title): f, axs = plt.subplots(2, 2) axs = [a for ax in axs for a in ax] -for ax, ratio in zip(axs, (0, - {1: 25, 0: 10}, - {1: 14, 0: 10}, - {1: 10, 0: 10})): - if ratio == 0: +for ax, sampling_strategy in zip(axs, (0, + {1: 25, 0: 10}, + {1: 14, 0: 10}, + {1: 10, 0: 10})): + if sampling_strategy == 0: c0, c1 = plot_resampling(ax, X_vis, y, 'Original set') else: - iht = InstanceHardnessThreshold(ratio=ratio, + iht = InstanceHardnessThreshold(sampling_strategy=sampling_strategy, estimator=LogisticRegression(), return_indices=True) X_res, y_res, idx_res = iht.fit_sample(X, y) X_res_vis = pca.transform(X_res) plot_resampling(ax, X_res_vis, y_res, - 'Instance Hardness Threshold ({})'.format(ratio)) + 'Instance Hardness Threshold ({})' + .format(sampling_strategy)) # plot samples which have been removed idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_res) diff --git a/imblearn/base.py b/imblearn/base.py index a44831c0b..dbfe08070 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -7,6 +7,7 @@ from __future__ import division import logging +import warnings from abc import ABCMeta, abstractmethod import numpy as np @@ -17,7 +18,8 @@ from sklearn.utils import check_X_y from sklearn.utils.validation import check_is_fitted -from .utils import check_ratio, check_target_type, hash_X_y +from .utils import check_sampling_strategy, check_target_type, hash_X_y +from .utils.deprecation import deprecate_parameter class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): @@ -61,7 +63,7 @@ def sample(self, X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - check_is_fitted(self, 'ratio_') + check_is_fitted(self, 'sampling_strategy_') self._check_X_y(X, y) output = self._sample(X, y) @@ -143,10 +145,26 @@ class BaseSampler(SamplerMixin): instead. """ - def __init__(self, ratio='auto'): + def __init__(self, sampling_strategy='auto', ratio=None): + self.sampling_strategy = sampling_strategy + # FIXME: remove in 0.6 self.ratio = ratio self.logger = logging.getLogger(self.__module__) + @property + def ratio_(self): + # FIXME: remove in 0.6 + warnings.warn("'ratio' and 'ratio_' are deprecated. Use " + "'sampling_strategy' and 'sampling_strategy_' instead.", + DeprecationWarning) + return self.sampling_strategy_ + + def _deprecate_ratio(self): + # both ratio and sampling_strategy should not be set + if self.ratio is not None: + deprecate_parameter(self, '0.4', 'ratio', 'sampling_strategy') + self.sampling_strategy = self.ratio + def fit(self, X, y): """Find the classes statistics before to perform sampling. @@ -164,11 +182,12 @@ def fit(self, X, y): Return self. 
""" + self._deprecate_ratio() y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) self.X_hash_, self.y_hash_ = hash_X_y(X, y) - # self.sampling_type is already checked in check_ratio - self.ratio_ = check_ratio(self.ratio, y, self._sampling_type) + self.sampling_strategy_ = check_sampling_strategy( + self.sampling_strategy, y, self._sampling_type) return self @@ -226,11 +245,12 @@ class FunctionSampler(SamplerMixin): >>> from collections import Counter >>> from imblearn.under_sampling import RandomUnderSampler - >>> def func(X, y, ratio, random_state): - ... return RandomUnderSampler(ratio=ratio, + >>> def func(X, y, sampling_strategy, random_state): + ... return RandomUnderSampler(sampling_strategy=sampling_strategy, ... random_state=random_state).fit_sample(X, y) >>> sampler = FunctionSampler(func=func, - ... kw_args={'ratio': 'auto', 'random_state': 0}) + ... kw_args={'sampling_strategy': 'auto', + ... 'random_state': 0}) >>> X_res, y_res = sampler.fit_sample(X, y) >>> print('Resampled dataset shape {}'.format( ... sorted(Counter(y_res).items()))) @@ -246,19 +266,31 @@ def __init__(self, func=None, accept_sparse=True, kw_args=None): def fit(self, X, y): y = check_target_type(y) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'] - if self.accept_sparse else False) + X, y = check_X_y( + X, + y, + accept_sparse=['csr', 'csc'] if self.accept_sparse else False) self.X_hash_, self.y_hash_ = hash_X_y(X, y) # when using a sampler, ratio_ is supposed to exist after fit - self.ratio_ = 'is_fitted' + self.sampling_strategy_ = 'is_fitted' return self + @property + def ratio_(self): + # FIXME: remove in 0.6 + warnings.warn("'ratio' and 'ratio_' are deprecated. Use " + "'sampling_strategy' and 'sampling_strategy_' instead.", + DeprecationWarning) + return self.sampling_strategy_ + def _sample(self, X, y, func=None, kw_args=None): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'] - if self.accept_sparse else False) - check_is_fitted(self, 'ratio_') + X, y = check_X_y( + X, + y, + accept_sparse=['csr', 'csc'] if self.accept_sparse else False) + check_is_fitted(self, 'sampling_strategy_') X_hash, y_hash = hash_X_y(X, y) if self.X_hash_ != X_hash or self.y_hash_ != y_hash: raise RuntimeError("X and y need to be same array earlier fitted.") diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 470919878..75c98b6e1 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -7,15 +7,22 @@ from __future__ import division import logging +import warnings from sklearn.utils import check_X_y from ..base import SamplerMixin from ..over_sampling import SMOTE +from ..over_sampling.base import BaseOverSampler from ..under_sampling import EditedNearestNeighbours from ..utils import check_target_type, hash_X_y +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseOverSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class SMOTEENN(SamplerMixin): """Class to perform over-sampling using SMOTE and cleaning using ENN. @@ -25,34 +32,20 @@ class SMOTEENN(SamplerMixin): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. 
- - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {sampling_strategy} + + {random_state} smote : object, optional (default=SMOTE()) The :class:`imblearn.over_sampling.SMOTE` object to use. If not given, a :class:`imblearn.over_sampling.SMOTE` object with default parameters will be given. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The method is presented in [1]_. @@ -83,25 +76,27 @@ class SMOTEENN(SamplerMixin): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> sme = SMOTEENN(random_state=42) >>> X_res, y_res = sme.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 900, 1: 881}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{0: 900, 1: 881}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', random_state=None, smote=None, - enn=None): + enn=None, + ratio=None): super(SMOTEENN, self).__init__() - self.ratio = ratio + self.sampling_strategy = sampling_strategy self.random_state = random_state self.smote = smote self.enn = enn + self.ratio = ratio self.logger = logging.getLogger(__name__) def _validate_estimator(self): @@ -115,7 +110,9 @@ def _validate_estimator(self): # Otherwise create a default SMOTE else: self.smote_ = SMOTE( - ratio=self.ratio, random_state=self.random_state) + sampling_strategy=self.sampling_strategy, + random_state=self.random_state, + ratio=self.ratio) if self.enn is not None: if isinstance(self.enn, EditedNearestNeighbours): @@ -125,7 +122,15 @@ def _validate_estimator(self): ' Got {} instead.'.format(type(self.enn))) # Otherwise create a default EditedNearestNeighbours else: - self.enn_ = EditedNearestNeighbours(ratio='all') + self.enn_ = EditedNearestNeighbours(sampling_strategy='all') + + @property + def ratio_(self): + # FIXME: remove in 0.6 + warnings.warn("'ratio' and 'ratio_' are deprecated. 
Use " + "'sampling_strategy' and 'sampling_strategy_' instead.", + DeprecationWarning) + return self.sampling_strategy_ def fit(self, X, y): """Find the classes statistics before to perform sampling. @@ -146,7 +151,7 @@ def fit(self, X, y): """ y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - self.ratio_ = self.ratio + self.sampling_strategy_ = self.sampling_strategy self.X_hash_, self.y_hash_ = hash_X_y(X, y) return self diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 0748e6ef7..782503762 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -8,15 +8,22 @@ from __future__ import division import logging +import warnings from sklearn.utils import check_X_y from ..base import SamplerMixin from ..over_sampling import SMOTE +from ..over_sampling.base import BaseOverSampler from ..under_sampling import TomekLinks from ..utils import check_target_type, hash_X_y +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseOverSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class SMOTETomek(SamplerMixin): """Class to perform over-sampling using SMOTE and cleaning using Tomek links. @@ -27,28 +34,9 @@ class SMOTETomek(SamplerMixin): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {sampling_strategy} + + {random_state} smote : object, optional (default=SMOTE()) The :class:`imblearn.over_sampling.SMOTE` object to use. If not given, @@ -60,6 +48,11 @@ class SMOTETomek(SamplerMixin): a :class:`imblearn.under_sampling.Tomek` object with default parameters will be given. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The methos is presented in [1]_. @@ -90,25 +83,27 @@ class SMOTETomek(SamplerMixin): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> smt = SMOTETomek(random_state=42) >>> X_res, y_res = smt.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 900, 1: 900}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{0: 900, 1: 900}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', random_state=None, smote=None, - tomek=None): + tomek=None, + ratio=None): super(SMOTETomek, self).__init__() - self.ratio = ratio + self.sampling_strategy = sampling_strategy self.random_state = random_state self.smote = smote self.tomek = tomek + self.ratio = ratio self.logger = logging.getLogger(__name__) def _validate_estimator(self): @@ -123,7 +118,9 @@ def _validate_estimator(self): # Otherwise create a default SMOTE else: self.smote_ = SMOTE( - ratio=self.ratio, random_state=self.random_state) + sampling_strategy=self.sampling_strategy, + random_state=self.random_state, + ratio=self.ratio) if self.tomek is not None: if isinstance(self.tomek, TomekLinks): @@ -133,7 +130,15 @@ def _validate_estimator(self): 'Got {} instead.'.format(type(self.tomek))) # Otherwise create a default TomekLinks else: - self.tomek_ = TomekLinks(ratio='all') + self.tomek_ = TomekLinks(sampling_strategy='all') + + @property + def ratio_(self): + # FIXME: remove in 0.6 + warnings.warn("'ratio' and 'ratio_' are deprecated. Use " + "'sampling_strategy' and 'sampling_strategy_' instead.", + DeprecationWarning) + return self.sampling_strategy_ def fit(self, X, y): """Find the classes statistics before to perform sampling. 
@@ -154,7 +159,7 @@ def fit(self, X, y): """ y = check_target_type(y) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - self.ratio_ = self.ratio + self.sampling_strategy_ = self.sampling_strategy self.X_hash_, self.y_hash_ = hash_X_y(X, y) return self diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py index fb5a5ae13..793a7a967 100644 --- a/imblearn/combine/tests/test_smote_enn.py +++ b/imblearn/combine/tests/test_smote_enn.py @@ -15,16 +15,19 @@ from imblearn.over_sampling import SMOTE RND_SEED = 0 -X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) +X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 +], [0.53366841, -0.30312976], [1.52091956, + -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, + 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], + [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 @@ -33,46 +36,40 @@ def test_sample_regular(): smote = SMOTEENN(random_state=RND_SEED) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], - [0.84976473, -0.15570176], - [0.61319159, -0.11571667], - [0.66052536, -0.28246518], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.08711622, 0.93259929]]) + X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ + 0.61319159, -0.11571667 + ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_pass_smote_enn(): - smote = SMOTEENN(smote=SMOTE(ratio='auto', random_state=RND_SEED), - enn=EditedNearestNeighbours(ratio='all', - random_state=RND_SEED), - random_state=RND_SEED) + smote = SMOTEENN( + smote=SMOTE(sampling_strategy='auto', random_state=RND_SEED), + enn=EditedNearestNeighbours( + sampling_strategy='all', random_state=RND_SEED), + random_state=RND_SEED) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], - [0.84976473, -0.15570176], - [0.61319159, -0.11571667], - [0.66052536, -0.28246518], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.08711622, 0.93259929]]) + X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ + 0.61319159, -0.11571667 + ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_half(): - ratio = {0: 10, 1: 12} - 
smote = SMOTEENN(ratio=ratio, random_state=RND_SEED) + sampling_strategy = {0: 10, 1: 12} + smote = SMOTEENN( + sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.08711622, 0.93259929]]) + X_gt = np.array([[1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) y_gt = np.array([0, 1, 1, 1]) assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -80,16 +77,14 @@ def test_sample_regular_half(): def test_validate_estimator_init(): smote = SMOTE(random_state=RND_SEED) - enn = EditedNearestNeighbours(random_state=RND_SEED, ratio='all') + enn = EditedNearestNeighbours( + random_state=RND_SEED, sampling_strategy='all') smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_sample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], - [0.84976473, -0.15570176], - [0.61319159, -0.11571667], - [0.66052536, -0.28246518], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.08711622, 0.93259929]]) + X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ + 0.61319159, -0.11571667 + ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -98,13 +93,10 @@ def test_validate_estimator_init(): def test_validate_estimator_default(): smt = SMOTEENN(random_state=RND_SEED) X_resampled, y_resampled = smt.fit_sample(X, Y) - X_gt = np.array([[1.52091956, -0.49283504], - [0.84976473, -0.15570176], - [0.61319159, -0.11571667], - [0.66052536, -0.28246518], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.08711622, 0.93259929]]) + X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ + 0.61319159, -0.11571667 + ], [0.66052536, -0.28246518], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/combine/tests/test_smote_tomek.py b/imblearn/combine/tests/test_smote_tomek.py index dc8d909c7..362653bd5 100644 --- a/imblearn/combine/tests/test_smote_tomek.py +++ b/imblearn/combine/tests/test_smote_tomek.py @@ -15,16 +15,19 @@ from imblearn.under_sampling import TomekLinks RND_SEED = 0 -X = np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141], - [1.34192108, -0.13367336], [0.62366841, -0.21312976], - [1.61091956, -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], [0.2184254, 0.24299982], - [0.61472253, -0.82309052], [0.19893132, -0.47761769], - [1.06514042, -0.0770537], [0.97407872, 0.44454207], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], - [-0.27410027, -0.54194484], [0.8381014, 0.44085498], - [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], - [-0.00288378, 0.84259929], [1.79580611, -0.02219234]]) +X = np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141], [ + 1.34192108, -0.13367336 +], [0.62366841, -0.21312976], [1.61091956, + -0.40283504], [-0.37162401, -2.19400981], + [0.74680821, + 1.63827342], [0.2184254, 0.24299982], [0.61472253, -0.82309052], + [0.19893132, -0.47761769], [1.06514042, -0.0770537], [ + 0.97407872, 0.44454207 + ], [1.40301027, -0.83648734], [-1.20515198, 
-1.02689695], [ + -0.27410027, -0.54194484 + ], [0.8381014, 0.44085498], [-0.23374509, 0.18370049], [ + -0.32635887, -0.29299653 + ], [-0.00288378, 0.84259929], [1.79580611, -0.02219234]]) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 @@ -32,44 +35,34 @@ def test_sample_regular(): smote = SMOTETomek(random_state=RND_SEED) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.68481731, 0.51935141], - [1.34192108, -0.13367336], - [0.62366841, -0.21312976], - [1.61091956, -0.40283504], - [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], - [0.61472253, -0.82309052], - [0.19893132, -0.47761769], - [1.40301027, -0.83648734], - [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], - [-0.00288378, 0.84259929], - [1.79580611, -0.02219234], - [0.38307743, -0.05670439], - [0.70319159, -0.02571667], - [0.75052536, -0.19246518]]) + X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ + 0.62366841, -0.21312976 + ], [1.61091956, -0.40283504], [-0.37162401, + -2.19400981], [0.74680821, 1.63827342], + [0.61472253, -0.82309052], [0.19893132, -0.47761769], + [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [ + -0.23374509, 0.18370049 + ], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [ + 0.38307743, -0.05670439 + ], [0.70319159, -0.02571667], [0.75052536, -0.19246518]]) y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_half(): - ratio = {0: 9, 1: 12} - smote = SMOTETomek(ratio=ratio, random_state=RND_SEED) + sampling_strategy = {0: 9, 1: 12} + smote = SMOTETomek( + sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.68481731, 0.51935141], - [0.62366841, -0.21312976], - [1.61091956, -0.40283504], - [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], - [0.61472253, -0.82309052], - [0.19893132, -0.47761769], - [1.40301027, -0.83648734], - [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], - [-0.00288378, 0.84259929], - [1.79580611, -0.02219234], - [0.45784496, -0.1053161]]) + X_gt = np.array([[0.68481731, 0.51935141], [0.62366841, -0.21312976], [ + 1.61091956, -0.40283504 + ], [-0.37162401, -2.19400981], [0.74680821, + 1.63827342], [0.61472253, -0.82309052], + [0.19893132, -0.47761769], [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], [-0.23374509, 0.18370049], [ + -0.00288378, 0.84259929 + ], [1.79580611, -0.02219234], [0.45784496, -0.1053161]]) y_gt = np.array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -77,25 +70,19 @@ def test_sample_regular_half(): def test_validate_estimator_init(): smote = SMOTE(random_state=RND_SEED) - tomek = TomekLinks(random_state=RND_SEED, ratio='all') + tomek = TomekLinks(random_state=RND_SEED, sampling_strategy='all') smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_sample(X, Y) - X_gt = np.array([[0.68481731, 0.51935141], - [1.34192108, -0.13367336], - [0.62366841, -0.21312976], - [1.61091956, -0.40283504], - [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], - [0.61472253, -0.82309052], - [0.19893132, -0.47761769], - [1.40301027, -0.83648734], - [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], - [-0.00288378, 0.84259929], - [1.79580611, -0.02219234], - [0.38307743, -0.05670439], - [0.70319159, -0.02571667], - 
[0.75052536, -0.19246518]]) + X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ + 0.62366841, -0.21312976 + ], [1.61091956, -0.40283504], [-0.37162401, + -2.19400981], [0.74680821, 1.63827342], + [0.61472253, -0.82309052], [0.19893132, -0.47761769], + [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [ + -0.23374509, 0.18370049 + ], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [ + 0.38307743, -0.05670439 + ], [0.70319159, -0.02571667], [0.75052536, -0.19246518]]) y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -104,22 +91,16 @@ def test_validate_estimator_init(): def test_validate_estimator_default(): smt = SMOTETomek(random_state=RND_SEED) X_resampled, y_resampled = smt.fit_sample(X, Y) - X_gt = np.array([[0.68481731, 0.51935141], - [1.34192108, -0.13367336], - [0.62366841, -0.21312976], - [1.61091956, -0.40283504], - [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], - [0.61472253, -0.82309052], - [0.19893132, -0.47761769], - [1.40301027, -0.83648734], - [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], - [-0.00288378, 0.84259929], - [1.79580611, -0.02219234], - [0.38307743, -0.05670439], - [0.70319159, -0.02571667], - [0.75052536, -0.19246518]]) + X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ + 0.62366841, -0.21312976 + ], [1.61091956, -0.40283504], [-0.37162401, + -2.19400981], [0.74680821, 1.63827342], + [0.61472253, -0.82309052], [0.19893132, -0.47761769], + [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [ + -0.23374509, 0.18370049 + ], [-0.00288378, 0.84259929], [1.79580611, -0.02219234], [ + 0.38307743, -0.05670439 + ], [0.70319159, -0.02571667], [0.75052536, -0.19246518]]) y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/datasets/__init__.py b/imblearn/datasets/__init__.py index 10f8bac10..355070127 100644 --- a/imblearn/datasets/__init__.py +++ b/imblearn/datasets/__init__.py @@ -7,5 +7,4 @@ from .zenodo import fetch_datasets -__all__ = ['make_imbalance', - 'fetch_datasets'] +__all__ = ['make_imbalance', 'fetch_datasets'] diff --git a/imblearn/datasets/imbalance.py b/imblearn/datasets/imbalance.py index 8f8d6805e..98eba953a 100644 --- a/imblearn/datasets/imbalance.py +++ b/imblearn/datasets/imbalance.py @@ -1,23 +1,28 @@ """Transform a dataset into an imbalanced dataset.""" - # Authors: Dayvid Oliveira # Guillaume Lemaitre # Christos Aridas # License: MIT import logging +import warnings from collections import Counter from sklearn.utils import check_X_y from ..under_sampling.prototype_selection import RandomUnderSampler -from ..utils import check_ratio +from ..utils import check_sampling_strategy LOGGER = logging.getLogger(__name__) -def make_imbalance(X, y, ratio, random_state=None, **kwargs): +def make_imbalance(X, + y, + sampling_strategy=None, + ratio=None, + random_state=None, + **kwargs): """Turns a dataset into an imbalanced dataset at specific ratio. A simple toy dataset to visualize clustering and classification @@ -33,15 +38,21 @@ def make_imbalance(X, y, ratio, random_state=None, **kwargs): y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. - ratio : str, dict, or callable, optional (default='auto') + sampling_strategy : dict, or callable, Ratio to use for resampling the data set. 
- - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. All samples will be - passed through if the class is not specified. - - If callable, function taking ``y`` and returns a ``dict``. The keys + - When ``dict``, the keys correspond to the targeted classes. The + values correspond to the desired number of samples for each targeted + class. + + - When callable, function taking ``y`` and returns a ``dict``. The keys correspond to the targeted classes. The values correspond to the - desired number of samples. + desired number of samples for each class. + + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -50,7 +61,8 @@ def make_imbalance(X, y, ratio, random_state=None, **kwargs): by np.random. kwargs : dict, optional - Dictionary of additional keyword arguments to pass to ``ratio``. + Dictionary of additional keyword arguments to pass to + ``sampling_strategy``. Returns ------- @@ -65,7 +77,7 @@ def make_imbalance(X, y, ratio, random_state=None, **kwargs): See :ref:`sphx_glr_auto_examples_applications_plot_multi_class_under_sampling.py`, :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py`, and - :ref:`sphx_glr_auto_examples_plot_ratio_usage.py`. + :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`. Examples -------- @@ -77,7 +89,8 @@ def make_imbalance(X, y, ratio, random_state=None, **kwargs): >>> X, y = data.data, data.target >>> print('Distribution before imbalancing: {}'.format(Counter(y))) Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50}) - >>> X_res, y_res = make_imbalance(X, y, ratio={0: 10, 1: 20, 2: 30}, + >>> X_res, y_res = make_imbalance(X, y, + ... sampling_strategy={0: 10, 1: 20, 2: 30}, ... random_state=42) >>> print('Distribution after imbalancing: {}'.format(Counter(y_res))) Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10}) @@ -86,17 +99,28 @@ def make_imbalance(X, y, ratio, random_state=None, **kwargs): X, y = check_X_y(X, y) target_stats = Counter(y) # restrict ratio to be a dict or a callable - if isinstance(ratio, dict) or callable(ratio): - ratio_ = check_ratio(ratio, y, 'under-sampling', **kwargs) + # FIXME remove ratio at 0.6 + if ratio is not None: + warnings.warn("'ratio' has been deprecated in 0.4 and will be " + "removed in 0.6. Use 'sampling_strategy' instead.") + sampling_strategy = ratio + elif sampling_strategy is None: + raise TypeError("make_imbalance() missing 1 required positional " + "argument: 'sampling_strategy'") + if isinstance(sampling_strategy, dict) or callable(sampling_strategy): + sampling_strategy_ = check_sampling_strategy( + sampling_strategy, y, 'under-sampling', **kwargs) else: - raise ValueError("'ratio' has to be a dictionary or a function" - " returning a dictionary. Got {} instead.".format( - type(ratio))) + raise ValueError("'sampling_strategy' has to be a dictionary or a " + "function returning a dictionary. Got {} instead." 
+                         .format(type(sampling_strategy)))
 
     LOGGER.info('The original target distribution in the dataset is: %s',
                 target_stats)
-    rus = RandomUnderSampler(ratio=ratio_, replacement=False,
-                             random_state=random_state)
+    rus = RandomUnderSampler(
+        sampling_strategy=sampling_strategy_,
+        replacement=False,
+        random_state=random_state)
     X_resampled, y_resampled = rus.fit_sample(X, y)
     LOGGER.info('Make the dataset imbalanced: %s', Counter(y_resampled))
 
diff --git a/imblearn/datasets/tests/test_imbalance.py b/imblearn/datasets/tests/test_imbalance.py
index 3a98f9ea8..3ba99cd19 100644
--- a/imblearn/datasets/tests/test_imbalance.py
+++ b/imblearn/datasets/tests/test_imbalance.py
@@ -3,7 +3,6 @@
 # Christos Aridas
 # License: MIT
 
-
 from __future__ import print_function
 
 from collections import Counter
@@ -20,29 +19,46 @@
 X, Y = data.data, data.target
 
 
+def test_make_imbalanced_backcompat():
+    # check an error is raised when we don't pass sampling_strategy or ratio
+    with raises(TypeError, match="missing 1 required positional argument"):
+        make_imbalance(X, Y)
+
+
 def test_make_imbalance_error():
-    # we are reusing part of utils.check_ratio, however this is not cover in
-    # the common tests so we will repeat it here
-    ratio = {0: -100, 1: 50, 2: 50}
+    # we are reusing part of utils.check_sampling_strategy; however, this is
+    # not covered in the common tests so we will repeat it here
+    sampling_strategy = {0: -100, 1: 50, 2: 50}
     with raises(ValueError, match="in a class cannot be negative"):
-        make_imbalance(X, Y, ratio)
-    ratio = {0: 10, 1: 70}
+        make_imbalance(X, Y, sampling_strategy)
+    sampling_strategy = {0: 10, 1: 70}
     with raises(ValueError, match="should be less or equal to the original"):
-        make_imbalance(X, Y, ratio)
+        make_imbalance(X, Y, sampling_strategy)
     y_ = np.zeros((X.shape[0], ))
-    ratio = {0: 10}
+    sampling_strategy = {0: 10}
     with raises(ValueError, match="needs to have more than 1 class."):
-        make_imbalance(X, y_, ratio)
-    ratio = 'random-string'
+        make_imbalance(X, y_, sampling_strategy)
+    sampling_strategy = 'random-string'
     with raises(ValueError, match="has to be a dictionary or a function"):
-        make_imbalance(X, Y, ratio)
+        make_imbalance(X, Y, sampling_strategy)
 
 
 def test_make_imbalance_dict():
-    ratio = {0: 10, 1: 20, 2: 30}
-    X_, y_ = make_imbalance(X, Y, ratio=ratio)
-    assert Counter(y_) == ratio
+    sampling_strategy = {0: 10, 1: 20, 2: 30}
+    X_, y_ = make_imbalance(X, Y, sampling_strategy=sampling_strategy)
+    assert Counter(y_) == sampling_strategy
+
+    sampling_strategy = {0: 10, 1: 20}
+    X_, y_ = make_imbalance(X, Y, sampling_strategy=sampling_strategy)
+    assert Counter(y_) == {0: 10, 1: 20, 2: 50}
+
+
+def test_make_imbalance_ratio():
+    # check that the deprecated 'ratio' parameter still works
+    sampling_strategy = {0: 10, 1: 20, 2: 30}
+    X_, y_ = make_imbalance(X, Y, ratio=sampling_strategy)
+    assert Counter(y_) == sampling_strategy
 
-    ratio = {0: 10, 1: 20}
-    X_, y_ = make_imbalance(X, Y, ratio=ratio)
+    sampling_strategy = {0: 10, 1: 20}
+    X_, y_ = make_imbalance(X, Y, ratio=sampling_strategy)
     assert Counter(y_) == {0: 10, 1: 20, 2: 50}
 
diff --git a/imblearn/datasets/tests/test_zenodo.py b/imblearn/datasets/tests/test_zenodo.py
index 0977ed1a2..7d95784ec 100644
--- a/imblearn/datasets/tests/test_zenodo.py
+++ b/imblearn/datasets/tests/test_zenodo.py
@@ -11,33 +11,35 @@
 
 from pytest import raises
 
-DATASET_SHAPE = {'ecoli': (336, 7),
-                 'optical_digits': (5620, 64),
-                 'satimage': (6435, 36),
-                 'pen_digits': (10992, 16),
-                 'abalone': (4177, 10),
-                 'sick_euthyroid': (3163, 42),
-                 'spectrometer': (531, 93),
-
'car_eval_34': (1728, 21), - 'isolet': (7797, 617), - 'us_crime': (1994, 100), - 'yeast_ml8': (2417, 103), - 'scene': (2407, 294), - 'libras_move': (360, 90), - 'thyroid_sick': (3772, 52), - 'coil_2000': (9822, 85), - 'arrhythmia': (452, 278), - 'solar_flare_m0': (1389, 32), - 'oil': (937, 49), - 'car_eval_4': (1728, 21), - 'wine_quality': (4898, 11), - 'letter_img': (20000, 16), - 'yeast_me2': (1484, 8), - 'webpage': (34780, 300), - 'ozone_level': (2536, 72), - 'mammography': (11183, 6), - 'protein_homo': (145751, 74), - 'abalone_19': (4177, 10)} +DATASET_SHAPE = { + 'ecoli': (336, 7), + 'optical_digits': (5620, 64), + 'satimage': (6435, 36), + 'pen_digits': (10992, 16), + 'abalone': (4177, 10), + 'sick_euthyroid': (3163, 42), + 'spectrometer': (531, 93), + 'car_eval_34': (1728, 21), + 'isolet': (7797, 617), + 'us_crime': (1994, 100), + 'yeast_ml8': (2417, 103), + 'scene': (2407, 294), + 'libras_move': (360, 90), + 'thyroid_sick': (3772, 52), + 'coil_2000': (9822, 85), + 'arrhythmia': (452, 278), + 'solar_flare_m0': (1389, 32), + 'oil': (937, 49), + 'car_eval_4': (1728, 21), + 'wine_quality': (4898, 11), + 'letter_img': (20000, 16), + 'yeast_me2': (1484, 8), + 'webpage': (34780, 300), + 'ozone_level': (2536, 72), + 'mammography': (11183, 6), + 'protein_homo': (145751, 74), + 'abalone_19': (4177, 10) +} def fetch(*args, **kwargs): @@ -59,19 +61,19 @@ def test_fetch(): assert X1.shape == X2.shape y1, y2 = datasets1[k].target, datasets2[k].target - assert (X1.shape[0],) == y1.shape - assert (X1.shape[0],) == y2.shape + assert (X1.shape[0], ) == y1.shape + assert (X1.shape[0], ) == y2.shape def test_fetch_filter(): try: - datasets1 = fetch(filter_data=tuple([1]), shuffle=True, - random_state=42) + datasets1 = fetch( + filter_data=tuple([1]), shuffle=True, random_state=42) except IOError: raise SkipTest("Zenodo dataset can not be loaded.") - datasets2 = fetch(filter_data=tuple(['ecoli']), shuffle=True, - random_state=37) + datasets2 = fetch( + filter_data=tuple(['ecoli']), shuffle=True, random_state=37) X1, X2 = datasets1['ecoli'].data, datasets2['ecoli'].data assert DATASET_SHAPE['ecoli'] == X1.shape @@ -80,8 +82,8 @@ def test_fetch_filter(): assert_allclose(X1.sum(), X2.sum()) y1, y2 = datasets1['ecoli'].target, datasets2['ecoli'].target - assert (X1.shape[0],) == y1.shape - assert (X1.shape[0],) == y2.shape + assert (X1.shape[0], ) == y1.shape + assert (X1.shape[0], ) == y2.shape def test_fetch_error(): diff --git a/imblearn/datasets/zenodo.py b/imblearn/datasets/zenodo.py index f7cef635b..ba439f2e8 100644 --- a/imblearn/datasets/zenodo.py +++ b/imblearn/datasets/zenodo.py @@ -67,33 +67,14 @@ PRE_FILENAME = 'x' POST_FILENAME = 'data.npz' -MAP_NAME_ID_KEYS = ['ecoli', - 'optical_digits', - 'satimage', - 'pen_digits', - 'abalone', - 'sick_euthyroid', - 'spectrometer', - 'car_eval_34', - 'isolet', - 'us_crime', - 'yeast_ml8', - 'scene', - 'libras_move', - 'thyroid_sick', - 'coil_2000', - 'arrhythmia', - 'solar_flare_m0', - 'oil', - 'car_eval_4', - 'wine_quality', - 'letter_img', - 'yeast_me2', - 'webpage', - 'ozone_level', - 'mammography', - 'protein_homo', - 'abalone_19'] +MAP_NAME_ID_KEYS = [ + 'ecoli', 'optical_digits', 'satimage', 'pen_digits', 'abalone', + 'sick_euthyroid', 'spectrometer', 'car_eval_34', 'isolet', 'us_crime', + 'yeast_ml8', 'scene', 'libras_move', 'thyroid_sick', 'coil_2000', + 'arrhythmia', 'solar_flare_m0', 'oil', 'car_eval_4', 'wine_quality', + 'letter_img', 'yeast_me2', 'webpage', 'ozone_level', 'mammography', + 'protein_homo', 'abalone_19' +] MAP_NAME_ID = 
OrderedDict() MAP_ID_NAME = OrderedDict() diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index 6668209ea..611d56755 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -14,9 +14,15 @@ from sklearn.model_selection import cross_val_predict from .base import BaseEnsembleSampler -from ..utils import check_ratio, check_target_type +from ..under_sampling.base import BaseUnderSampler +from ..utils import check_sampling_strategy, check_target_type +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class BalanceCascade(BaseEnsembleSampler): """Create an ensemble of balanced sets by iteratively under-sampling the imbalanced dataset using an estimator. @@ -28,32 +34,13 @@ class BalanceCascade(BaseEnsembleSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. + {sampling_strategy} return_indices : bool, optional (default=True) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} n_max_subset : int or None, optional (default=None) Maximum number of subsets to generate. By default, all data from @@ -67,6 +54,11 @@ class BalanceCascade(BaseEnsembleSampler): bootstrap : bool, optional (default=True) Whether to bootstrap the data before each iteration. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The method is described in [1]_. @@ -97,23 +89,25 @@ class BalanceCascade(BaseEnsembleSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> bc = BalanceCascade(random_state=42) >>> X_res, y_res = bc.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res[0]))) \ + >>> print('Resampled dataset shape %s' % Counter(y_res[0])) \ # doctest: +ELLIPSIS - Resampled dataset shape Counter({...}) + Resampled dataset shape Counter({{...}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, n_max_subset=None, - estimator=None): - super(BalanceCascade, self).__init__(ratio=ratio) + estimator=None, + ratio=None): + super(BalanceCascade, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.estimator = estimator @@ -138,7 +132,8 @@ def fit(self, X, y): """ super(BalanceCascade, self).fit(X, y) y = check_target_type(y) - self.ratio_ = check_ratio(self.ratio, y, 'under-sampling') + self.sampling_strategy_ = check_sampling_strategy( + self.sampling_strategy, y, 'under-sampling') return self def _validate_estimator(self): @@ -194,15 +189,15 @@ def _sample(self, X, y): n_subsets = 0 b_subset_search = True while b_subset_search: - target_stats = Counter(safe_indexing( - y, np.flatnonzero(samples_mask))) + target_stats = Counter( + safe_indexing(y, np.flatnonzero(samples_mask))) # store the index of the data to under-sample index_under_sample = np.empty((0, ), dtype=y.dtype) # value which will be picked at each round index_constant = np.empty((0, ), dtype=y.dtype) for target_class in target_stats.keys(): - if target_class in self.ratio_.keys(): - n_samples = self.ratio_[target_class] + if target_class in self.sampling_strategy_.keys(): + n_samples = self.sampling_strategy_[target_class] # extract the data of interest for this round from the # current class index_class = np.flatnonzero(y == target_class) @@ -218,14 +213,13 @@ def _sample(self, X, y): axis=0) else: index_constant = np.concatenate( - (index_constant, - np.flatnonzero(y == target_class)), + (index_constant, np.flatnonzero(y == target_class)), axis=0) # store the set created n_subsets += 1 - subset_indices = np.concatenate((index_under_sample, - index_constant), axis=0) + subset_indices = np.concatenate( + (index_under_sample, index_constant), axis=0) idx_under.append(subset_indices) # fit and predict using cross validation @@ -234,9 +228,8 @@ def _sample(self, X, y): pred = cross_val_predict(self.estimator_, X_subset, y_subset) # extract the prediction about the targeted classes only pred_target = pred[:index_under_sample.size] - index_classified = index_under_sample[ - pred_target == safe_indexing(y_subset, - range(index_under_sample.size))] + index_classified = index_under_sample[pred_target == safe_indexing( + y_subset, range(index_under_sample.size))] samples_mask[index_classified] = False # check the stopping criterion @@ -244,10 +237,11 @@ def _sample(self, X, y): if n_subsets == self.n_max_subset: b_subset_search = False # check that there is enough samples for another round - target_stats = Counter(safe_indexing( - y, np.flatnonzero(samples_mask))) - for target_class in self.ratio_.keys(): - if target_stats[target_class] < self.ratio_[target_class]: + target_stats = Counter( + safe_indexing(y, np.flatnonzero(samples_mask))) 
+ for target_class in self.sampling_strategy_.keys(): + if (target_stats[target_class] < + self.sampling_strategy_[target_class]): b_subset_search = False X_resampled, y_resampled = [], [] diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py index bb1d34ebc..5e24c5d56 100644 --- a/imblearn/ensemble/base.py +++ b/imblearn/ensemble/base.py @@ -4,6 +4,8 @@ # Authors: Guillaume Lemaitre # License: MIT +import warnings + import numpy as np from sklearn.preprocessing import label_binarize @@ -23,6 +25,14 @@ class BaseEnsembleSampler(BaseSampler): _sampling_type = 'ensemble' + @property + def ratio_(self): + warnings.warn( + "'ratio' and 'ratio_' are deprecated. " + "Use 'sampling_strategy' and 'sampling_strategy_' instead.", + DeprecationWarning) + return self.sampling_strategy_ + def sample(self, X, y): """Resample the dataset. @@ -49,7 +59,7 @@ def sample(self, X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - check_is_fitted(self, 'ratio_') + check_is_fitted(self, 'sampling_strategy_') self._check_X_y(X, y) output = self._sample(X, y) @@ -57,8 +67,8 @@ def sample(self, X, y): if binarize_y: y_resampled = output[1] classes = np.unique(y) - y_resampled_encoded = np.array([label_binarize(batch_y, classes) - for batch_y in y_resampled]) + y_resampled_encoded = np.array( + [label_binarize(batch_y, classes) for batch_y in y_resampled]) if len(output) == 2: return output[0], y_resampled_encoded else: diff --git a/imblearn/ensemble/classifier.py b/imblearn/ensemble/classifier.py index 9f154a770..62d06c36b 100644 --- a/imblearn/ensemble/classifier.py +++ b/imblearn/ensemble/classifier.py @@ -11,12 +11,17 @@ from sklearn.base import clone from sklearn.ensemble import BaggingClassifier from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble.bagging import _generate_bagging_indices from ..pipeline import Pipeline from ..under_sampling import RandomUnderSampler +from ..under_sampling.base import BaseUnderSampler +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class BalancedBaggingClassifier(BaggingClassifier): """A Bagging classifier with additional balancing. @@ -65,22 +70,7 @@ class BalancedBaggingClassifier(BaggingClassifier): .. versionadded:: 0.17 *warm_start* constructor parameter. - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. + {sampling_strategy} replacement : bool, optional (default=False) Whether or not to sample randomly with replacement or not. 
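A note on the docstring templating used throughout this patch: the samplers are now
decorated with ``@Substitution(sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
random_state=_random_state_docstring)`` and their docstrings carry ``{sampling_strategy}``
and ``{random_state}`` placeholders, presumably expanded with ``str.format``-style
substitution. This is also why literal braces in the doctest outputs are now doubled in the
source (``Counter({1: 900, 0: 100})`` is written ``Counter({{1: 900, 0: 100}})``). A minimal
sketch of that mechanism, assuming format-based substitution and using the hypothetical names
``substitution`` and ``DemoSampler`` (this is not imblearn's actual ``Substitution`` helper)::

    # Minimal sketch of format-based docstring substitution (hypothetical
    # names; the real helper lives in imblearn.utils as ``Substitution``).
    _sampling_strategy_docstring = (
        "sampling_strategy : dict or callable\n"
        "        Sampling information to resample the data set.")


    def substitution(**kwargs):
        """Fill ``{placeholder}`` fields in the decorated docstring."""
        def decorate(obj):
            obj.__doc__ = obj.__doc__.format(**kwargs)
            return obj
        return decorate


    @substitution(sampling_strategy=_sampling_strategy_docstring)
    class DemoSampler(object):
        """Demo sampler.

        Parameters
        ----------
        {sampling_strategy}

        Examples
        --------
        The resampled distribution is ``Counter({{1: 900, 0: 100}})``.
        """


    # The placeholder is expanded and the doubled braces collapse to single
    # ones, so the rendered docstring shows ``Counter({1: 900, 0: 100})``.
    print(DemoSampler.__doc__)

Centralising the ``sampling_strategy`` description this way keeps the parameter wording
consistent across every sampler touched by the ``ratio`` to ``sampling_strategy`` rename.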
@@ -89,17 +79,16 @@ class BalancedBaggingClassifier(BaggingClassifier): The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores. - random_state : int, RandomState instance or None, optional (default=None) - - If int, ``random_state`` is the seed used by the random number - generator; - - If ``RandomState`` instance, random_state is the random - number generator; - - If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} verbose : int, optional (default=0) Controls the verbosity of the building process. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Attributes ---------- base_estimator_ : estimator @@ -170,8 +159,8 @@ class BalancedBaggingClassifier(BaggingClassifier): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, ... random_state=0) >>> bbc = BalancedBaggingClassifier(random_state=42) @@ -183,6 +172,7 @@ class BalancedBaggingClassifier(BaggingClassifier): [ 2 225]] """ + def __init__(self, base_estimator=None, n_estimators=10, @@ -192,11 +182,12 @@ def __init__(self, bootstrap_features=False, oob_score=False, warm_start=False, - ratio='auto', + sampling_strategy='auto', replacement=False, n_jobs=1, random_state=None, - verbose=0): + verbose=0, + ratio=None): super(BaggingClassifier, self).__init__( base_estimator, @@ -210,6 +201,7 @@ def __init__(self, n_jobs=n_jobs, random_state=random_state, verbose=verbose) + self.sampling_strategy = sampling_strategy self.ratio = ratio self.replacement = replacement @@ -229,10 +221,10 @@ def _validate_estimator(self, default=DecisionTreeClassifier()): else: base_estimator = clone(default) - self.base_estimator_ = Pipeline( - [('sampler', RandomUnderSampler(ratio=self.ratio, - replacement=self.replacement)), - ('classifier', base_estimator)]) + self.base_estimator_ = Pipeline([('sampler', RandomUnderSampler( + sampling_strategy=self.sampling_strategy, + replacement=self.replacement, + ratio=self.ratio)), ('classifier', base_estimator)]) def fit(self, X, y): """Build a Bagging ensemble of estimators from the training diff --git a/imblearn/ensemble/easy_ensemble.py b/imblearn/ensemble/easy_ensemble.py index b1aa17391..6706ee5d8 100644 --- a/imblearn/ensemble/easy_ensemble.py +++ b/imblearn/ensemble/easy_ensemble.py @@ -10,10 +10,16 @@ from .base import BaseEnsembleSampler from ..under_sampling import RandomUnderSampler +from ..under_sampling.base import BaseUnderSampler +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring MAX_INT = np.iinfo(np.int32).max +@Substitution( + sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class EasyEnsemble(BaseEnsembleSampler): """Create an ensemble sets by iteratively applying random under-sampling. @@ -24,32 +30,13 @@ class EasyEnsemble(BaseEnsembleSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. 
- - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} replacement : bool, optional (default=False) Whether or not to sample randomly with replacement or not. @@ -57,6 +44,11 @@ class EasyEnsemble(BaseEnsembleSampler): n_subsets : int, optional (default=10) Number of subsets to generate. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The method is described in [1]_. @@ -86,22 +78,24 @@ class EasyEnsemble(BaseEnsembleSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> ee = EasyEnsemble(random_state=42) >>> X_res, y_res = ee.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res[0]))) - Resampled dataset shape Counter({0: 100, 1: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res[0])) + Resampled dataset shape Counter({{0: 100, 1: 100}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, replacement=False, - n_subsets=10): - super(EasyEnsemble, self).__init__(ratio=ratio) + n_subsets=10, + ratio=None): + super(EasyEnsemble, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.replacement = replacement @@ -142,7 +136,8 @@ def _sample(self, X, y): for _ in range(self.n_subsets): rus = RandomUnderSampler( - ratio=self.ratio_, return_indices=True, + sampling_strategy=self.sampling_strategy_, + return_indices=True, random_state=random_state.randint(MAX_INT), replacement=self.replacement) sel_x, sel_y, sel_idx = rus.fit_sample(X, y) diff --git a/imblearn/ensemble/tests/test_balance_cascade.py b/imblearn/ensemble/tests/test_balance_cascade.py index c68ab1e8e..a56d2a607 100644 --- a/imblearn/ensemble/tests/test_balance_cascade.py +++ b/imblearn/ensemble/tests/test_balance_cascade.py @@ -15,58 +15,51 @@ from imblearn.ensemble import BalanceCascade - RND_SEED = 0 -X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) +X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 +], [0.53366841, -0.30312976], [1.52091956, + -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, + 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], + [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) def test_fit_sample_auto(): - ratio = 'auto' - bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, - return_indices=True) + sampling_strategy = 'auto' + bc = BalanceCascade( + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + return_indices=True) X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - X_gt = np.array([[[1.15514042, 0.0129463], - [0.08711622, 0.93259929], - [0.70472253, -0.73309052], - [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], - [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], - [-1.11515198, -0.93689695], - [0.11622591, -0.0317206], - [1.25192108, -0.22367336], - [0.53366841, 
-0.30312976], - [1.52091956, -0.49283504], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], - [1.70580611, -0.11219234]], - [[0.28893132, -0.38761769], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [-0.14374509, 0.27370049], - [0.77481731, 0.60935141], - [-0.18410027, -0.45194484], - [1.15514042, 0.0129463], - [0.11622591, -0.0317206], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], - [1.70580611, -0.11219234]]]) + X_gt = np.array( + [[[1.15514042, 0.0129463], [0.08711622, 0.93259929], + [0.70472253, + -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [ + -0.18410027, -0.45194484 + ], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [ + 0.11622591, -0.0317206 + ], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [ + 1.52091956, -0.49283504 + ], [0.88407872, 0.35454207], [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], [1.70580611, -0.11219234]], + [[0.28893132, + -0.38761769], [0.83680821, 1.72827342], [0.3084254, 0.33299982], + [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [ + 0.77481731, 0.60935141 + ], [-0.18410027, -0.45194484], [1.15514042, 0.0129463], + [0.11622591, -0.0317206], [1.25192108, -0.22367336], [ + 0.53366841, -0.30312976 + ], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [ + 1.31301027, -0.92648734 + ], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) idx_gt = np.array( @@ -78,91 +71,82 @@ def test_fit_sample_auto(): def test_fit_sample_half(): - ratio = {0: 8, 1: 10} - bc = BalanceCascade(ratio=ratio, random_state=RND_SEED) + sampling_strategy = {0: 8, 1: 10} + bc = BalanceCascade( + sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = bc.fit_sample(X, Y) - X_gt = np.array([[[-0.41635887, -0.38299653], - [0.53366841, -0.30312976], - [1.25192108, -0.22367336], - [1.70580611, -0.11219234], - [1.52091956, -0.49283504], - [0.11622591, -0.0317206], - [1.31301027, -0.92648734], - [0.88407872, 0.35454207], - [0.3084254, 0.33299982], - [0.08711622, 0.93259929], - [-0.28162401, -2.10400981], - [-0.14374509, 0.27370049], - [0.9281014, 0.53085498], - [-0.18410027, -0.45194484], - [0.77481731, 0.60935141], - [1.15514042, 0.0129463], - [-1.11515198, -0.93689695], - [0.70472253, -0.73309052]]]) + X_gt = np.array([[[-0.41635887, -0.38299653], [0.53366841, -0.30312976], [ + 1.25192108, -0.22367336 + ], [1.70580611, -0.11219234], [1.52091956, -0.49283504], [ + 0.11622591, -0.0317206 + ], [1.31301027, -0.92648734], [0.88407872, 0.35454207], [ + 0.3084254, 0.33299982 + ], [0.08711622, 0.93259929], [-0.28162401, -2.10400981], [ + -0.14374509, 0.27370049 + ], [0.9281014, 0.53085498], [-0.18410027, -0.45194484], + [0.77481731, 0.60935141], [1.15514042, 0.0129463], + [-1.11515198, -0.93689695], [0.70472253, -0.73309052]]]) y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_fit_sample_auto_early_stop(): - ratio = 'auto' + sampling_strategy = 'auto' estimator = LinearSVC(random_state=RND_SEED) - bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, - return_indices=False, estimator=estimator, - n_max_subset=1) + bc = BalanceCascade( + sampling_strategy=sampling_strategy, + 
random_state=RND_SEED, + return_indices=False, + estimator=estimator, + n_max_subset=1) X_resampled, y_resampled = bc.fit_sample(X, Y) - X_gt = np.array([[[1.15514042, 0.0129463], - [0.08711622, 0.93259929], - [0.70472253, -0.73309052], - [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], - [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], - [-1.11515198, -0.93689695], - [0.11622591, -0.0317206], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], - [1.70580611, -0.11219234]]]) + X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [ + 0.70472253, -0.73309052 + ], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [ + -0.18410027, -0.45194484 + ], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [ + 0.11622591, -0.0317206 + ], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [ + 1.52091956, -0.49283504 + ], [0.88407872, 0.35454207], [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_give_classifier_obj(): - ratio = 'auto' + sampling_strategy = 'auto' estimator = RandomForestClassifier(random_state=RND_SEED) - bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, - return_indices=False, estimator=estimator) + bc = BalanceCascade( + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + return_indices=False, + estimator=estimator) X_resampled, y_resampled = bc.fit_sample(X, Y) - X_gt = np.array([[[1.15514042, 0.0129463], - [0.08711622, 0.93259929], - [0.70472253, -0.73309052], - [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], - [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], - [-1.11515198, -0.93689695], - [0.11622591, -0.0317206], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], - [1.70580611, -0.11219234]]]) + X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [ + 0.70472253, -0.73309052 + ], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [ + -0.18410027, -0.45194484 + ], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [ + 0.11622591, -0.0317206 + ], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [ + 1.52091956, -0.49283504 + ], [0.88407872, 0.35454207], [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_give_classifier_wrong_obj(): - ratio = 'auto' + sampling_strategy = 'auto' classifier = 2 - bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, - return_indices=True, estimator=classifier) + bc = BalanceCascade( + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + return_indices=True, + estimator=classifier) with raises(ValueError, match="Invalid parameter `estimator`"): bc.fit_sample(X, Y) diff --git a/imblearn/ensemble/tests/test_classifier.py b/imblearn/ensemble/tests/test_classifier.py index 49c514b3f..cb6c135ed 100644 --- a/imblearn/ensemble/tests/test_classifier.py +++ b/imblearn/ensemble/tests/test_classifier.py @@ -15,10 +15,8 @@ from sklearn.svm import SVC from sklearn.feature_selection import SelectKBest from sklearn.utils.testing import 
(assert_array_equal, - assert_array_almost_equal, - assert_raises, - assert_warns, - assert_warns_message) + assert_array_almost_equal, assert_raises, + assert_warns, assert_warns_message) from imblearn.datasets import make_imbalance from imblearn.ensemble import BalancedBaggingClassifier @@ -30,34 +28,45 @@ def test_balanced_bagging_classifier(): # Check classification for various parameter settings. - X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=0) - grid = ParameterGrid({"max_samples": [0.5, 1.0], - "max_features": [1, 2, 4], - "bootstrap": [True, False], - "bootstrap_features": [True, False]}) - - for base_estimator in [None, - DummyClassifier(), - Perceptron(), - DecisionTreeClassifier(), - KNeighborsClassifier(), - SVC()]: + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, + 1: 25, + 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + grid = ParameterGrid({ + "max_samples": [0.5, 1.0], + "max_features": [1, 2, 4], + "bootstrap": [True, False], + "bootstrap_features": [True, False] + }) + + for base_estimator in [ + None, + DummyClassifier(), + Perceptron(), + DecisionTreeClassifier(), + KNeighborsClassifier(), + SVC() + ]: for params in grid: BalancedBaggingClassifier( - base_estimator=base_estimator, - random_state=0, - **params).fit(X_train, y_train).predict(X_test) + base_estimator=base_estimator, random_state=0, **params).fit( + X_train, y_train).predict(X_test) def test_bootstrap_samples(): # Test that bootstrapping samples generate non-perfect base estimators. - X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, + 1: 25, + 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) base_estimator = DecisionTreeClassifier().fit(X_train, y_train) @@ -68,11 +77,11 @@ def test_bootstrap_samples(): max_samples=1.0, bootstrap=False, n_estimators=10, - ratio={}, + sampling_strategy={}, random_state=0).fit(X_train, y_train) - assert (ensemble.score(X_train, y_train) == - base_estimator.score(X_train, y_train)) + assert (ensemble.score(X_train, y_train) == base_estimator.score( + X_train, y_train)) # with bootstrap, trees are no longer perfect on the training set ensemble = BalancedBaggingClassifier( @@ -81,16 +90,20 @@ def test_bootstrap_samples(): bootstrap=True, random_state=0).fit(X_train, y_train) - assert (ensemble.score(X_train, y_train) < - base_estimator.score(X_train, y_train)) + assert (ensemble.score(X_train, y_train) < base_estimator.score( + X_train, y_train)) def test_bootstrap_features(): # Test that bootstrapping features may generate duplicate features. 
- X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, + 1: 25, + 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) ensemble = BalancedBaggingClassifier( base_estimator=DecisionTreeClassifier(), @@ -107,52 +120,63 @@ def test_bootstrap_features(): bootstrap_features=True, random_state=0).fit(X_train, y_train) - unique_features = [np.unique(features).shape[0] - for features in ensemble.estimators_features_] + unique_features = [ + np.unique(features).shape[0] + for features in ensemble.estimators_features_ + ] assert np.median(unique_features) < X.shape[1] def test_probability(): # Predict probabilities. - X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, + 1: 25, + 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) with np.errstate(divide="ignore", invalid="ignore"): # Normal case ensemble = BalancedBaggingClassifier( - base_estimator=DecisionTreeClassifier(), - random_state=0).fit(X_train, y_train) + base_estimator=DecisionTreeClassifier(), random_state=0).fit( + X_train, y_train) - assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test), - axis=1), - np.ones(len(X_test))) + assert_array_almost_equal( + np.sum(ensemble.predict_proba(X_test), axis=1), + np.ones(len(X_test))) - assert_array_almost_equal(ensemble.predict_proba(X_test), - np.exp(ensemble.predict_log_proba(X_test))) + assert_array_almost_equal( + ensemble.predict_proba(X_test), + np.exp(ensemble.predict_log_proba(X_test))) # Degenerate case, where some classes are missing ensemble = BalancedBaggingClassifier( - base_estimator=LogisticRegression(), - random_state=0, + base_estimator=LogisticRegression(), random_state=0, max_samples=5).fit(X_train, y_train) - assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test), - axis=1), - np.ones(len(X_test))) + assert_array_almost_equal( + np.sum(ensemble.predict_proba(X_test), axis=1), + np.ones(len(X_test))) - assert_array_almost_equal(ensemble.predict_proba(X_test), - np.exp(ensemble.predict_log_proba(X_test))) + assert_array_almost_equal( + ensemble.predict_proba(X_test), + np.exp(ensemble.predict_log_proba(X_test))) def test_oob_score_classification(): # Check that oob prediction is a good estimation of the generalization # error. - X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, + 1: 25, + 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for base_estimator in [DecisionTreeClassifier(), SVC()]: clf = BalancedBaggingClassifier( @@ -173,17 +197,19 @@ def test_oob_score_classification(): n_estimators=1, bootstrap=True, oob_score=True, - random_state=0).fit, - X_train, - y_train) + random_state=0).fit, X_train, y_train) def test_single_estimator(): # Check singleton ensembles. 
- X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, + 1: 25, + 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = BalancedBaggingClassifier( base_estimator=KNeighborsClassifier(), @@ -192,16 +218,20 @@ def test_single_estimator(): bootstrap_features=False, random_state=0).fit(X_train, y_train) - clf2 = make_pipeline(RandomUnderSampler( - random_state=clf1.estimators_[0].steps[0][1].random_state), - KNeighborsClassifier()).fit(X_train, y_train) + clf2 = make_pipeline( + RandomUnderSampler( + random_state=clf1.estimators_[0].steps[0][1].random_state), + KNeighborsClassifier()).fit(X_train, y_train) assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) def test_error(): # Test that it gives proper exception on deficient input. - X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}) + X, y = make_imbalance( + iris.data, iris.target, sampling_strategy={0: 20, + 1: 25, + 2: 50}) base = DecisionTreeClassifier() # Test n_estimators @@ -220,8 +250,8 @@ def test_error(): assert_raises(ValueError, BalancedBaggingClassifier(base, max_samples=1000).fit, X, y) assert_raises(ValueError, - BalancedBaggingClassifier(base, max_samples="foobar").fit, - X, y) + BalancedBaggingClassifier(base, max_samples="foobar").fit, X, + y) # Test max_features assert_raises(ValueError, @@ -237,8 +267,8 @@ def test_error(): X, y) # Test support of decision_function - assert not (hasattr(BalancedBaggingClassifier(base).fit(X, y), - 'decision_function')) + assert not (hasattr( + BalancedBaggingClassifier(base).fit(X, y), 'decision_function')) def test_gridsearch(): @@ -248,49 +278,53 @@ def test_gridsearch(): y[y == 2] = 1 # Grid search with scoring based on decision_function - parameters = {'n_estimators': (1, 2), - 'base_estimator__C': (1, 2)} + parameters = {'n_estimators': (1, 2), 'base_estimator__C': (1, 2)} - GridSearchCV(BalancedBaggingClassifier(SVC()), - parameters, - scoring="roc_auc").fit(X, y) + GridSearchCV( + BalancedBaggingClassifier(SVC()), parameters, scoring="roc_auc").fit( + X, y) def test_base_estimator(): # Check base_estimator and its default values. 
- X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, + 1: 25, + 2: 50}, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - ensemble = BalancedBaggingClassifier(None, - n_jobs=3, - random_state=0).fit(X_train, y_train) + ensemble = BalancedBaggingClassifier( + None, n_jobs=3, random_state=0).fit(X_train, y_train) assert isinstance(ensemble.base_estimator_.steps[-1][1], DecisionTreeClassifier) - ensemble = BalancedBaggingClassifier(DecisionTreeClassifier(), - n_jobs=3, - random_state=0).fit(X_train, y_train) + ensemble = BalancedBaggingClassifier( + DecisionTreeClassifier(), n_jobs=3, random_state=0).fit( + X_train, y_train) assert isinstance(ensemble.base_estimator_.steps[-1][1], DecisionTreeClassifier) - ensemble = BalancedBaggingClassifier(Perceptron(), - n_jobs=3, - random_state=0).fit(X_train, y_train) + ensemble = BalancedBaggingClassifier( + Perceptron(), n_jobs=3, random_state=0).fit(X_train, y_train) - assert isinstance(ensemble.base_estimator_.steps[-1][1], - Perceptron) + assert isinstance(ensemble.base_estimator_.steps[-1][1], Perceptron) def test_bagging_with_pipeline(): - X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50}, - random_state=0) + X, y = make_imbalance( + iris.data, + iris.target, + sampling_strategy={0: 20, + 1: 25, + 2: 50}, + random_state=0) estimator = BalancedBaggingClassifier( - make_pipeline(SelectKBest(k=1), - DecisionTreeClassifier()), + make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2) estimator.fit(X, y).predict(X) @@ -303,21 +337,21 @@ def test_warm_start(random_state=42): clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: - clf_ws = BalancedBaggingClassifier(n_estimators=n_estimators, - random_state=random_state, - warm_start=True) + clf_ws = BalancedBaggingClassifier( + n_estimators=n_estimators, + random_state=random_state, + warm_start=True) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert len(clf_ws) == n_estimators - clf_no_ws = BalancedBaggingClassifier(n_estimators=10, - random_state=random_state, - warm_start=False) + clf_no_ws = BalancedBaggingClassifier( + n_estimators=10, random_state=random_state, warm_start=False) clf_no_ws.fit(X, y) - assert (set([pipe.steps[-1][1].random_state for pipe in clf_ws]) == - set([pipe.steps[-1][1].random_state for pipe in clf_no_ws])) + assert (set([pipe.steps[-1][1].random_state for pipe in clf_ws]) == set( + [pipe.steps[-1][1].random_state for pipe in clf_no_ws])) def test_warm_start_smaller_n_estimators(): @@ -334,8 +368,8 @@ def test_warm_start_equal_n_estimators(): X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) - clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True, - random_state=83) + clf = BalancedBaggingClassifier( + n_estimators=5, warm_start=True, random_state=83) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) @@ -354,15 +388,15 @@ def test_warm_start_equivalence(): X, y = make_hastie_10_2(n_samples=20, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) - clf_ws = BalancedBaggingClassifier(n_estimators=5, warm_start=True, - random_state=3141) + clf_ws = BalancedBaggingClassifier( + n_estimators=5, warm_start=True, random_state=3141) 
clf_ws.fit(X_train, y_train) clf_ws.set_params(n_estimators=10) clf_ws.fit(X_train, y_train) y1 = clf_ws.predict(X_test) - clf = BalancedBaggingClassifier(n_estimators=10, warm_start=False, - random_state=3141) + clf = BalancedBaggingClassifier( + n_estimators=10, warm_start=False, random_state=3141) clf.fit(X_train, y_train) y2 = clf.predict(X_test) @@ -372,8 +406,8 @@ def test_warm_start_equivalence(): def test_warm_start_with_oob_score_fails(): # Check using oob_score and warm_start simultaneously fails X, y = make_hastie_10_2(n_samples=20, random_state=1) - clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True, - oob_score=True) + clf = BalancedBaggingClassifier( + n_estimators=5, warm_start=True, oob_score=True) assert_raises(ValueError, clf.fit, X, y) @@ -393,10 +427,12 @@ def test_oob_score_consistency(): # Make sure OOB scores are identical when random_state, estimator, and # training data are fixed and fitting is done twice X, y = make_hastie_10_2(n_samples=200, random_state=1) - bagging = BalancedBaggingClassifier(KNeighborsClassifier(), - max_samples=0.5, - max_features=0.5, oob_score=True, - random_state=1) + bagging = BalancedBaggingClassifier( + KNeighborsClassifier(), + max_samples=0.5, + max_features=0.5, + oob_score=True, + random_state=1) assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_ @@ -445,9 +481,11 @@ def test_max_samples_consistency(): # Make sure validated max_samples and original max_samples are identical # when valid integer max_samples supplied by user max_samples = 100 - X, y = make_hastie_10_2(n_samples=2*max_samples, random_state=1) - bagging = BalancedBaggingClassifier(KNeighborsClassifier(), - max_samples=max_samples, - max_features=0.5, random_state=1) + X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1) + bagging = BalancedBaggingClassifier( + KNeighborsClassifier(), + max_samples=max_samples, + max_features=0.5, + random_state=1) bagging.fit(X, y) assert bagging._max_samples == max_samples diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index 2de5b2ad5..05945133f 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -14,32 +14,36 @@ # Generate a global dataset to use RND_SEED = 0 -X = np.array([[0.5220963, 0.11349303], [0.59091459, 0.40692742], - [1.10915364, 0.05718352], [0.22039505, 0.26469445], - [1.35269503, 0.44812421], [0.85117925, 1.0185556], +X = np.array([[0.5220963, 0.11349303], [0.59091459, 0.40692742], [ + 1.10915364, 0.05718352 +], [0.22039505, 0.26469445], [1.35269503, 0.44812421], [0.85117925, 1.0185556], [-2.10724436, 0.70263997], [-0.23627356, 0.30254174], [-1.23195149, 0.15427291], [-0.58539673, 0.62515052]]) Y = np.array([1, 2, 2, 2, 1, 0, 1, 1, 1, 0]) def test_ee_init(): - # Define a ratio - ratio = 1. - ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED) + # Define a sampling_strategy + sampling_strategy = 1. 
+ ee = EasyEnsemble( + sampling_strategy=sampling_strategy, random_state=RND_SEED) - assert ee.ratio == ratio + assert ee.sampling_strategy == sampling_strategy assert ee.replacement is False assert ee.n_subsets == 10 assert ee.random_state == RND_SEED def test_fit_sample_auto(): - # Define the ratio parameter - ratio = 'auto' + # Define the sampling_strategy parameter + sampling_strategy = 'auto' # Create the sampling object ee = EasyEnsemble( - ratio=ratio, random_state=RND_SEED, return_indices=True, n_subsets=3) + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + return_indices=True, + n_subsets=3) # Get the different subset X_resampled, y_resampled, idx_under = ee.fit_sample(X, Y) @@ -63,39 +67,30 @@ def test_fit_sample_auto(): def test_fit_sample_half(): - # Define the ratio parameter - ratio = {0: 2, 1: 3, 2: 3} + # Define the sampling_strategy parameter + sampling_strategy = {0: 2, 1: 3, 2: 3} # Create the sampling object - ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED, n_subsets=3) + ee = EasyEnsemble( + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + n_subsets=3) # Get the different subset X_resampled, y_resampled = ee.fit_sample(X, Y) - X_gt = np.array([[[-0.58539673, 0.62515052], - [0.85117925, 1.0185556], - [1.35269503, 0.44812421], - [-1.23195149, 0.15427291], - [0.5220963, 0.11349303], - [1.10915364, 0.05718352], - [0.59091459, 0.40692742], - [0.22039505, 0.26469445]], - [[0.85117925, 1.0185556], - [-0.58539673, 0.62515052], - [1.35269503, 0.44812421], - [-2.10724436, 0.70263997], - [-1.23195149, 0.15427291], - [0.59091459, 0.40692742], - [0.22039505, 0.26469445], - [1.10915364, 0.05718352]], - [[0.85117925, 1.0185556], - [-0.58539673, 0.62515052], - [-1.23195149, 0.15427291], - [0.5220963, 0.11349303], - [1.35269503, 0.44812421], - [1.10915364, 0.05718352], - [0.59091459, 0.40692742], - [0.22039505, 0.26469445]]]) + X_gt = np.array([[[-0.58539673, 0.62515052], [0.85117925, 1.0185556], + [1.35269503, 0.44812421], [-1.23195149, 0.15427291], + [0.5220963, 0.11349303], [1.10915364, 0.05718352], + [0.59091459, 0.40692742], [0.22039505, 0.26469445]], + [[0.85117925, 1.0185556], [-0.58539673, 0.62515052], + [1.35269503, 0.44812421], [-2.10724436, 0.70263997], + [-1.23195149, 0.15427291], [0.59091459, 0.40692742], + [0.22039505, 0.26469445], [1.10915364, 0.05718352]], + [[0.85117925, 1.0185556], [-0.58539673, 0.62515052], + [-1.23195149, 0.15427291], [0.5220963, 0.11349303], + [1.35269503, 0.44812421], [1.10915364, 0.05718352], + [0.59091459, 0.40692742], [0.22039505, 0.26469445]]]) y_gt = np.array([[0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 1, 1, 1, 2, 2, 2]]) assert_array_equal(X_resampled, X_gt) @@ -103,11 +98,11 @@ def test_fit_sample_half(): def test_random_state_none(): - # Define the ratio parameter - ratio = 'auto' + # Define the sampling_strategy parameter + sampling_strategy = 'auto' # Create the sampling object - ee = EasyEnsemble(ratio=ratio, random_state=None) + ee = EasyEnsemble(sampling_strategy=sampling_strategy, random_state=None) # Get the different subset X_resampled, y_resampled = ee.fit_sample(X, Y) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index ff8994c28..68c6e762b 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -33,7 +33,6 @@ except ImportError: from sklearn.externals.funcsigs import signature - LOGGER = logging.getLogger(__name__) @@ -166,10 +165,11 @@ def sensitivity_specificity_support(y_true, raise ValueError("Target 
is %s but average='binary'. Please " "choose another average setting." % y_type) elif pos_label not in (None, 1): - warnings.warn("Note that pos_label (set to %r) is ignored when " - "average != 'binary' (got %r). You may use " - "labels=[pos_label] to specify a single positive class." - % (pos_label, average), UserWarning) + warnings.warn( + "Note that pos_label (set to %r) is ignored when " + "average != 'binary' (got %r). You may use " + "labels=[pos_label] to specify a single positive class." % + (pos_label, average), UserWarning) if labels is None: labels = present_labels @@ -177,8 +177,8 @@ def sensitivity_specificity_support(y_true, else: n_labels = len(labels) labels = np.hstack( - [labels, np.setdiff1d( - present_labels, labels, assume_unique=True)]) + [labels, + np.setdiff1d(present_labels, labels, assume_unique=True)]) # Calculate tp_sum, pred_sum, true_sum ### @@ -591,8 +591,8 @@ class is unrecognized by the classifier, G-mean resolves to zero. To warn_for=('specificity', 'specificity'), sample_weight=sample_weight) - LOGGER.debug('The sensitivity and specificity are : %s - %s' % - (sen, spe)) + LOGGER.debug('The sensitivity and specificity are : %s - %s' % (sen, + spe)) return np.sqrt(sen * spe) else: present_labels = unique_labels(y_true, y_pred) @@ -602,8 +602,10 @@ class is unrecognized by the classifier, G-mean resolves to zero. To n_labels = None else: n_labels = len(labels) - labels = np.hstack([labels, np.setdiff1d(present_labels, labels, - assume_unique=True)]) + labels = np.hstack([ + labels, + np.setdiff1d(present_labels, labels, assume_unique=True) + ]) le = LabelEncoder() le.fit(labels) @@ -621,14 +623,14 @@ class is unrecognized by the classifier, G-mean resolves to zero. To tp_bins_weights = None if len(tp_bins): - tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, - minlength=len(labels)) + tp_sum = np.bincount( + tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: # Pathological case true_sum = tp_sum = np.zeros(len(labels)) if len(y_true): - true_sum = np.bincount(y_true, weights=sample_weight, - minlength=len(labels)) + true_sum = np.bincount( + y_true, weights=sample_weight, minlength=len(labels)) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) @@ -722,8 +724,7 @@ def compute_score(*args, **kwargs): # specificity and specificity params_sens_spec = set(sens_spec_sig._parameters.keys()) # Make the intersection between the parameters - sel_params = params_sens_spec.intersection( - set(tags_scoring_func)) + sel_params = params_sens_spec.intersection(set(tags_scoring_func)) # Create a sub dictionary tags_scoring_func = dict((k, tags_scoring_func[k]) for k in sel_params) @@ -738,12 +739,10 @@ def compute_score(*args, **kwargs): scoring_func.__name__ == 'jaccard_similarity_score'): tags_scoring_func['average'] = 'binary' # Create the list of parameters through signature binding - tags_sens_spec = sens_spec_sig.bind( - **tags_scoring_func) + tags_sens_spec = sens_spec_sig.bind(**tags_scoring_func) # Call the sens/spec function sen, spe, _ = sensitivity_specificity_support( - *tags_sens_spec.args, - **tags_sens_spec.kwargs) + *tags_sens_spec.args, **tags_sens_spec.kwargs) # Compute the dominance dom = sen - spe return (1. 
+ alpha * dom) * _score @@ -894,13 +893,11 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.64\ # compute averages values = [last_line_heading] - for v in (np.average( - precision, weights=support), np.average( - recall, weights=support), np.average( - specificity, weights=support), np.average( - f1, weights=support), np.average( - geo_mean, weights=support), np.average( - iba, weights=support)): + for v in (np.average(precision, weights=support), np.average( + recall, weights=support), np.average(specificity, weights=support), + np.average(f1, weights=support), np.average( + geo_mean, weights=support), np.average(iba, + weights=support)): values += ["{0:0.{1}f}".format(v, digits)] values += ['{0}'.format(np.sum(support))] report += fmt % tuple(values) diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index c37f955f1..4dc583f8a 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -1,11 +1,9 @@ # coding: utf-8 - """Testing the metric for classification with imbalanced dataset""" # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT - from __future__ import division, print_function from functools import partial @@ -37,7 +35,6 @@ from imblearn.utils.testing import warns - RND_SEED = 42 R_TOL = 1e-2 @@ -163,12 +160,11 @@ def test_sensitivity_specificity_ignored_labels(): assert_allclose( np.mean([1., 0.33]), specificity_13(average='macro'), rtol=R_TOL) assert_allclose( - np.average( - [1., .33], weights=[2., 1.]), + np.average([1., .33], weights=[2., 1.]), specificity_13(average='weighted'), rtol=R_TOL) - assert_allclose(3. / (3. + 2.), specificity_13(average='micro'), - rtol=R_TOL) + assert_allclose( + 3. / (3. + 2.), specificity_13(average='micro'), rtol=R_TOL) # ensure the above were meaningful tests: for each in ['macro', 'weighted', 'micro']: @@ -191,8 +187,8 @@ def test_sensitivity_specificity_support_errors(): # Bad pos_label with raises(ValueError): - sensitivity_specificity_support(y_true, y_pred, pos_label=2, - average='binary') + sensitivity_specificity_support( + y_true, y_pred, pos_label=2, average='binary') # Bad average option with raises(ValueError): @@ -202,9 +198,8 @@ def test_sensitivity_specificity_support_errors(): def test_sensitivity_specificity_unused_pos_label(): # but average != 'binary'; even if data is binary with warns(UserWarning, "use labels=\[pos_label\] to specify a single"): - sensitivity_specificity_support([1, 2, 1], [1, 2, 2], - pos_label=2, - average='macro') + sensitivity_specificity_support( + [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') def test_geometric_mean_support_binary(): @@ -228,57 +223,77 @@ def test_geometric_mean_multiclass(): cor = 0.001 y_true = [0, 0, 0, 0] y_pred = [0, 0, 0, 0] - assert_allclose(geometric_mean_score(y_true, y_pred, correction=cor), - 1.0, rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, correction=cor), 1.0, rtol=R_TOL) y_true = [0, 0, 0, 0] y_pred = [1, 1, 1, 1] - assert_allclose(geometric_mean_score(y_true, y_pred, correction=cor), - cor, rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, correction=cor), cor, rtol=R_TOL) y_true = [0, 0, 1, 1] y_pred = [0, 1, 1, 0] - assert_allclose(geometric_mean_score(y_true, y_pred, correction=cor), - 0.5, rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, correction=cor), 0.5, rtol=R_TOL) y_true = [0, 1, 2, 0, 1, 2] y_pred = [0, 2, 1, 0, 0, 1] - assert_allclose(geometric_mean_score(y_true, y_pred, 
correction=cor), - (1*cor*cor)**(1.0/3.0), rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, correction=cor), + (1 * cor * cor) ** (1.0 / 3.0), + rtol=R_TOL) y_true = [0, 1, 2, 3, 4, 5] y_pred = [0, 1, 2, 3, 4, 5] - assert_allclose(geometric_mean_score(y_true, y_pred, correction=cor), - 1, rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, correction=cor), 1, rtol=R_TOL) y_true = [0, 1, 1, 1, 1, 0] y_pred = [0, 0, 1, 1, 1, 1] - assert_allclose(geometric_mean_score(y_true, y_pred, correction=cor), - (0.5*0.75)**0.5, rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, correction=cor), + (0.5 * 0.75) ** 0.5, + rtol=R_TOL) y_true = [0, 1, 2, 0, 1, 2] y_pred = [0, 2, 1, 0, 0, 1] - assert_allclose(geometric_mean_score(y_true, y_pred, average='macro'), - 0.47140452079103168, rtol=R_TOL) - assert_allclose(geometric_mean_score(y_true, y_pred, average='micro'), - 0.47140452079103168, rtol=R_TOL) - assert_allclose(geometric_mean_score(y_true, y_pred, - average='weighted'), - 0.47140452079103168, rtol=R_TOL) - assert_allclose(geometric_mean_score(y_true, y_pred, average=None), - [0.8660254, 0.0, 0.0], rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, average='macro'), + 0.47140452079103168, + rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, average='micro'), + 0.47140452079103168, + rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, average='weighted'), + 0.47140452079103168, + rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, average=None), + [0.8660254, 0.0, 0.0], + rtol=R_TOL) y_true = [0, 1, 2, 0, 1, 2] y_pred = [0, 1, 1, 0, 0, 1] - assert_allclose(geometric_mean_score(y_true, y_pred, labels=[0, 1]), - 0.70710678118654752, rtol=R_TOL) - assert_allclose(geometric_mean_score(y_true, y_pred, labels=[0, 1], - sample_weight=[1, 2, 1, 1, 2, 1]), - 0.70710678118654752, rtol=R_TOL) - assert_allclose(geometric_mean_score(y_true, y_pred, labels=[0, 1], - sample_weight=[1, 2, 1, 1, 2, 1], - average='weighted'), - 0.3333333333, rtol=R_TOL) + assert_allclose( + geometric_mean_score(y_true, y_pred, labels=[0, 1]), + 0.70710678118654752, + rtol=R_TOL) + assert_allclose( + geometric_mean_score( + y_true, y_pred, labels=[0, 1], sample_weight=[1, 2, 1, 1, 2, 1]), + 0.70710678118654752, + rtol=R_TOL) + assert_allclose( + geometric_mean_score( + y_true, + y_pred, + labels=[0, 1], + sample_weight=[1, 2, 1, 1, 2, 1], + average='weighted'), + 0.3333333333, + rtol=R_TOL) y_true, y_pred, _ = make_prediction(binary=False) @@ -424,23 +439,21 @@ def test_classification_report_imbalanced_multiclass_with_long_string_label(): def test_iba_sklearn_metrics(): y_true, y_pred, _ = make_prediction(binary=True) - acc = make_index_balanced_accuracy(alpha=0.5, squared=True)( - accuracy_score) + acc = make_index_balanced_accuracy(alpha=0.5, squared=True)(accuracy_score) score = acc(y_true, y_pred) assert score == approx(0.54756) - jss = make_index_balanced_accuracy(alpha=0.5, squared=True)( - jaccard_similarity_score) + jss = make_index_balanced_accuracy( + alpha=0.5, squared=True)(jaccard_similarity_score) score = jss(y_true, y_pred) assert score == approx(0.54756) - pre = make_index_balanced_accuracy(alpha=0.5, squared=True)( - precision_score) + pre = make_index_balanced_accuracy( + alpha=0.5, squared=True)(precision_score) score = pre(y_true, y_pred) assert score == approx(0.65025) - rec = make_index_balanced_accuracy(alpha=0.5, squared=True)( - recall_score) + rec = 
make_index_balanced_accuracy(alpha=0.5, squared=True)(recall_score) score = rec(y_true, y_pred) assert score == approx(0.41616000000000009) @@ -448,22 +461,21 @@ def test_iba_sklearn_metrics(): def test_iba_error_y_score_prob(): y_true, y_pred, _ = make_prediction(binary=True) - aps = make_index_balanced_accuracy(alpha=0.5, squared=True)( - average_precision_score) + aps = make_index_balanced_accuracy( + alpha=0.5, squared=True)(average_precision_score) with raises(AttributeError): aps(y_true, y_pred) - brier = make_index_balanced_accuracy(alpha=0.5, squared=True)( - brier_score_loss) + brier = make_index_balanced_accuracy( + alpha=0.5, squared=True)(brier_score_loss) with raises(AttributeError): brier(y_true, y_pred) - kappa = make_index_balanced_accuracy(alpha=0.5, squared=True)( - cohen_kappa_score) + kappa = make_index_balanced_accuracy( + alpha=0.5, squared=True)(cohen_kappa_score) with raises(AttributeError): kappa(y_true, y_pred) - ras = make_index_balanced_accuracy(alpha=0.5, squared=True)( - roc_auc_score) + ras = make_index_balanced_accuracy(alpha=0.5, squared=True)(roc_auc_score) with raises(AttributeError): ras(y_true, y_pred) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index 29ecea97d..d244bf6e0 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -32,102 +32,102 @@ def test_imblearn_classification_scorers(): # sensitivity scorer scorer = make_scorer(sensitivity_score, pos_label=None, average='macro') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(sensitivity_score, pos_label=None, average='weighted') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(sensitivity_score, pos_label=None, average='micro') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(sensitivity_score, pos_label=1) - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) # specificity scorer scorer = make_scorer(specificity_score, pos_label=None, average='macro') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(specificity_score, pos_label=None, average='weighted') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) 
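The test hunks around here only re-wrap long calls, but they exercise a pattern worth spelling out: any imblearn metric can be wrapped with ``make_scorer`` and handed to a grid search through ``scoring``. A minimal sketch of that pattern; the toy dataset and parameter grid are invented for illustration::

    # Hedged sketch of the scorer pattern exercised by these tests: wrap an
    # imblearn metric with sklearn's make_scorer and pass it to GridSearchCV.
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.svm import LinearSVC
    from sklearn.metrics import make_scorer

    from imblearn.metrics import geometric_mean_score

    X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                               n_samples=500, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Any keyword accepted by the metric can be frozen into the scorer.
    scorer = make_scorer(geometric_mean_score, pos_label=None, average='macro')
    grid = GridSearchCV(LinearSVC(random_state=0),
                        param_grid={'C': [1, 10]}, scoring=scorer)
    grid.fit(X_train, y_train)
    print(grid.best_score_)
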
grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(specificity_score, pos_label=None, average='micro') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(specificity_score, pos_label=1) - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.95, rtol=R_TOL) # geometric_mean scorer scorer = make_scorer(geometric_mean_score, pos_label=None, average='macro') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer( geometric_mean_score, pos_label=None, average='weighted') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(geometric_mean_score, pos_label=None, average='micro') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(geometric_mean_score, pos_label=1) - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) # make a iba metric before a scorer geo_mean_iba = make_index_balanced_accuracy()(geometric_mean_score) scorer = make_scorer(geo_mean_iba, pos_label=None, average='macro') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.85, rtol=R_TOL) scorer = make_scorer(geo_mean_iba, pos_label=None, average='weighted') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.85, rtol=R_TOL) scorer = make_scorer(geo_mean_iba, pos_label=None, average='micro') - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.85, rtol=R_TOL) scorer = make_scorer(geo_mean_iba, pos_label=1) - grid = GridSearchCV(LinearSVC(random_state=0), param_grid={'C': [1, 10]}, - scoring=scorer) + grid = 
GridSearchCV( + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.84, rtol=R_TOL) diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py index 4b94d047a..3d92ef0a5 100644 --- a/imblearn/over_sampling/__init__.py +++ b/imblearn/over_sampling/__init__.py @@ -7,6 +7,4 @@ from .random_over_sampler import RandomOverSampler from .smote import SMOTE -__all__ = ['ADASYN', - 'RandomOverSampler', - 'SMOTE'] +__all__ = ['ADASYN', 'RandomOverSampler', 'SMOTE'] diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index 8246d77ab..7f1798f73 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -13,8 +13,13 @@ from .base import BaseOverSampler from ..utils import check_neighbors_object +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseOverSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class ADASYN(BaseOverSampler): """Perform over-sampling using Adaptive Synthetic (ADASYN) sampling approach for imbalanced datasets. @@ -23,28 +28,9 @@ class ADASYN(BaseOverSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {sampling_strategy} + + {random_state} n_neighbors : int int or object, optional (default=5) If ``int``, number of nearest neighbours to used to construct synthetic @@ -55,6 +41,11 @@ class ADASYN(BaseOverSampler): n_jobs : int, optional (default=1) Number of threads to run the algorithm when it is possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The implementation is based on [1]_. @@ -88,29 +79,31 @@ class ADASYN(BaseOverSampler): ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, ... 
random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> ada = ADASYN(random_state=42) >>> X_res, y_res = ada.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 904, 1: 900}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{0: 904, 1: 900}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', random_state=None, n_neighbors=5, - n_jobs=1): - super(ADASYN, self).__init__(ratio=ratio) + n_jobs=1, + ratio=None): + super(ADASYN, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.n_neighbors = n_neighbors self.n_jobs = n_jobs def _validate_estimator(self): """Create the necessary objects for ADASYN""" - self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, - additional_neighbor=1) + self.nn_ = check_neighbors_object( + 'n_neighbors', self.n_neighbors, additional_neighbor=1) self.nn_.set_params(**{'n_jobs': self.n_jobs}) def _sample(self, X, y): @@ -141,7 +134,7 @@ def _sample(self, X, y): X_resampled = X.copy() y_resampled = y.copy() - for class_sample, n_samples in self.ratio_.items(): + for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) @@ -183,17 +176,16 @@ def _sample(self, X, y): steps = random_state.uniform(size=len(nn_zs)) if x_i.nnz: for step, nn_z in zip(steps, nn_zs): - sample = (x_i + - step * (X_class[x_i_nn[nn_z], :] - x_i)) - row_indices += ([n_samples_generated] * - len(sample.indices)) + sample = (x_i + step * + (X_class[x_i_nn[nn_z], :] - x_i)) + row_indices += ( + [n_samples_generated] * len(sample.indices)) col_indices += sample.indices.tolist() samples += sample.data.tolist() n_samples_generated += 1 - X_new = (sparse.csr_matrix((samples, - (row_indices, col_indices)), - [np.sum(n_samples_generate), - X.shape[1]])) + X_new = (sparse.csr_matrix( + (samples, (row_indices, col_indices)), + [np.sum(n_samples_generate), X.shape[1]])) y_new = np.array([class_sample] * np.sum(n_samples_generate)) else: x_class_gen = [] @@ -204,9 +196,10 @@ def _sample(self, X, y): nn_zs = random_state.randint( 1, high=self.nn_.n_neighbors, size=num_sample_i) steps = random_state.uniform(size=len(nn_zs)) - x_class_gen.append([x_i + - step * (X_class[x_i_nn[nn_z], :] - x_i) - for step, nn_z in zip(steps, nn_zs)]) + x_class_gen.append([ + x_i + step * (X_class[x_i_nn[nn_z], :] - x_i) + for step, nn_z in zip(steps, nn_zs) + ]) X_new = np.concatenate(x_class_gen) y_new = np.array([class_sample] * np.sum(n_samples_generate)) diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index 9c1f6d51b..457147fdd 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -16,3 +16,40 @@ class BaseOverSampler(BaseSampler): """ _sampling_type = 'over-sampling' + + _sampling_strategy_docstring = \ + """sampling_strategy : float, str, dict or callable, (default='auto') + Sampling information to resample the data set. 
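The ``Substitution`` decorator now applied to ``ADASYN`` (and to the other samplers further down) fills the ``{sampling_strategy}`` and ``{random_state}`` placeholders from shared docstrings such as ``_sampling_strategy_docstring`` begun just above; it is also why literal braces in the doctest outputs are doubled as ``{{``. A rough sketch of the mechanism, assuming it boils down to a plain ``str.format`` call; this is a stand-in, not imblearn's actual implementation::

    # Rough stand-in for the Substitution decorator -- not imblearn's actual
    # implementation, only the idea: the docstring is a str.format template,
    # so shared parameter descriptions are injected once and literal braces
    # in doctest output have to be written doubled.
    class Substitution(object):
        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def __call__(self, obj):
            obj.__doc__ = obj.__doc__.format(**self.kwargs)
            return obj

    _random_state_docstring = (
        "random_state : int, RandomState instance or None")

    @Substitution(random_state=_random_state_docstring)
    def dummy_sample(X, y, random_state=None):
        """Toy sampler.

        Parameters
        ----------
        {random_state}
        """
        return X, y

    print(dummy_sample.__doc__)
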
+ + - When ``float``, it corresponds to the ratio :math:`\\alpha_{os}` + defined by :math:`N_{rm} = \\alpha_{os} \\times N_{m}` where + :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the + minority class after resampling and the number of samples in the + majority class, respectively. + + .. warning:: + ``float`` is only available for **binary** classification. An + error is raised for multi-class classification. + + - When ``str``, specify the class targeted by the resampling. The + number of samples in the different classes will be equalized. + Possible choices are: + + ``'minority'``: resample only the minority class; + + ``'not minority'``: resample all classes but the minority class; + + ``'not majority'``: resample all classes but the majority class; + + ``'all'``: resample all classes; + + ``'auto'``: equivalent to ``'not majority'``. + + - When ``dict``, the keys correspond to the targeted classes. The + values correspond to the desired number of samples for each targeted + class. + + - When callable, function taking ``y`` and returns a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples for each class. + """.strip() diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index e870d8c21..09617704d 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -11,8 +11,13 @@ from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseOverSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class RandomOverSampler(BaseOverSampler): """Class to perform random over-sampling. @@ -23,28 +28,14 @@ class RandomOverSampler(BaseOverSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {sampling_strategy} + + {random_state} + + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. Notes ----- @@ -66,17 +57,19 @@ class RandomOverSampler(BaseOverSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... 
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> ros = RandomOverSampler(random_state=42) >>> X_res, y_res = ros.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 900, 1: 900}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{0: 900, 1: 900}}) """ - def __init__(self, ratio='auto', random_state=None): - super(RandomOverSampler, self).__init__(ratio=ratio) + def __init__(self, sampling_strategy='auto', random_state=None, + ratio=None): + super(RandomOverSampler, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state def _sample(self, X, y): @@ -105,7 +98,7 @@ def _sample(self, X, y): sample_indices = range(X.shape[0]) - for class_sample, num_samples in self.ratio_.items(): + for class_sample, num_samples in self.sampling_strategy_.items(): target_class_indices = np.flatnonzero(y == class_sample) indices = random_state.randint( low=0, high=target_stats[class_sample], size=num_samples) @@ -113,5 +106,5 @@ def _sample(self, X, y): sample_indices = np.append(sample_indices, target_class_indices[indices]) - return (safe_indexing(X, sample_indices), - safe_indexing(y, sample_indices)) + return (safe_indexing(X, sample_indices), safe_indexing( + y, sample_indices)) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 40e29dbe9..e120bbee9 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -17,10 +17,15 @@ from .base import BaseOverSampler from ..exceptions import raise_isinstance_error from ..utils import check_neighbors_object +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm') +@Substitution( + sampling_strategy=BaseOverSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class SMOTE(BaseOverSampler): """Class to perform over-sampling using SMOTE. @@ -32,28 +37,9 @@ class SMOTE(BaseOverSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. 
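In each sampler the enumerated ``ratio`` documentation removed above is replaced by the shared ``{sampling_strategy}`` template. As a hedged sketch of the forms it accepts for an over-sampler (the class counts and the 0.5 ratio below are arbitrary choices, not taken from the tests)::

    # Hedged sketch of the accepted sampling_strategy forms described by the
    # shared docstring; counts and the 0.5 ratio are arbitrary illustrations.
    from collections import Counter

    from sklearn.datasets import make_classification
    from imblearn.over_sampling import RandomOverSampler

    X, y = make_classification(n_classes=2, weights=[0.1, 0.9], flip_y=0,
                               n_samples=1000, random_state=10)

    # str: resample every class but the majority one ('auto' is equivalent).
    ros = RandomOverSampler(sampling_strategy='not majority', random_state=0)
    print(Counter(ros.fit_sample(X, y)[1]))

    # dict: desired number of samples per targeted class.
    ros = RandomOverSampler(sampling_strategy={0: 400, 1: 900}, random_state=0)
    print(Counter(ros.fit_sample(X, y)[1]))

    # float (binary problems only): minority count = 0.5 * majority count.
    ros = RandomOverSampler(sampling_strategy=0.5, random_state=0)
    print(Counter(ros.fit_sample(X, y)[1]))

    # callable: a function of y returning such a dict.
    ros = RandomOverSampler(sampling_strategy=lambda y: {0: 300, 1: 900},
                            random_state=0)
    print(Counter(ros.fit_sample(X, y)[1]))
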
- - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {sampling_strategy} + + {random_state} k_neighbors : int or object, optional (default=5) If ``int``, number of nearest neighbours to used to construct synthetic @@ -63,8 +49,8 @@ class SMOTE(BaseOverSampler): m_neighbors : int int or object, optional (default=10) If int, number of nearest neighbours to use to determine if a minority - sample is in danger. Used with ``kind={'borderline1', 'borderline2', - 'svm'}``. If object, an estimator that inherits + sample is in danger. Used with ``kind={{'borderline1', 'borderline2', + 'svm'}}``. If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. @@ -82,6 +68,11 @@ class SMOTE(BaseOverSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- See the original papers: [1]_, [2]_, [3]_ for more details. @@ -125,25 +116,27 @@ class SMOTE(BaseOverSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> sm = SMOTE(random_state=42) >>> X_res, y_res = sm.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 900, 1: 900}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{0: 900, 1: 900}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', random_state=None, k_neighbors=5, m_neighbors=10, out_step=0.5, kind='regular', svm_estimator=None, - n_jobs=1): - super(SMOTE, self).__init__(ratio=ratio) + n_jobs=1, + ratio=None): + super(SMOTE, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.kind = kind self.k_neighbors = k_neighbors @@ -184,9 +177,8 @@ def _in_danger_noise(self, samples, target_class, y, kind='danger'): if kind == 'danger': # Samples are in danger for m/2 <= m' < m - return np.bitwise_and( - n_maj >= (self.nn_m_.n_neighbors - 1) / 2, - n_maj < self.nn_m_.n_neighbors - 1) + return np.bitwise_and(n_maj >= (self.nn_m_.n_neighbors - 1) / 2, + n_maj < self.nn_m_.n_neighbors - 1) elif kind == 'noise': # Samples are noise for m = m' return n_maj == self.nn_m_.n_neighbors - 1 @@ -245,8 +237,8 @@ def _make_samples(self, row_indices, col_indices, samples = [], [], [] for i, (row, col, step) in enumerate(zip(rows, cols, steps)): if X[row].nnz: - sample = X[row] - step * (X[row] - - nn_data[nn_num[row, col]]) + sample = X[row] - step * ( + X[row] - nn_data[nn_num[row, col]]) row_indices += [i] * len(sample.indices) col_indices += sample.indices.tolist() samples += sample.data.tolist() @@ -272,15 +264,13 @@ def _validate_estimator(self): ' Choices are {}. 
Got {} instead.'.format( SMOTE_KIND, self.kind)) - self.nn_k_ = check_neighbors_object('k_neighbors', - self.k_neighbors, - additional_neighbor=1) + self.nn_k_ = check_neighbors_object( + 'k_neighbors', self.k_neighbors, additional_neighbor=1) self.nn_k_.set_params(**{'n_jobs': self.n_jobs}) if self.kind != 'regular': - self.nn_m_ = check_neighbors_object('m_neighbors', - self.m_neighbors, - additional_neighbor=1) + self.nn_m_ = check_neighbors_object( + 'm_neighbors', self.m_neighbors, additional_neighbor=1) self.nn_m_.set_params(**{'n_jobs': self.n_jobs}) if self.kind == 'svm': @@ -325,7 +315,7 @@ def _sample_regular(self, X, y): X_resampled = X.copy() y_resampled = y.copy() - for class_sample, n_samples in self.ratio_.items(): + for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) @@ -379,15 +369,15 @@ def _sample_borderline(self, X, y): X_resampled = X.copy() y_resampled = y.copy() - for class_sample, n_samples in self.ratio_.items(): + for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = safe_indexing(X, target_class_indices) self.nn_m_.fit(X) - danger_index = self._in_danger_noise(X_class, class_sample, y, - kind='danger') + danger_index = self._in_danger_noise( + X_class, class_sample, y, kind='danger') if not any(danger_index): continue @@ -399,10 +389,9 @@ def _sample_borderline(self, X, y): # divergence between borderline-1 and borderline-2 if self.kind == 'borderline1': # Create synthetic samples for borderline points. - X_new, y_new = self._make_samples(safe_indexing(X_class, - danger_index), - class_sample, X_class, - nns, n_samples) + X_new, y_new = self._make_samples( + safe_indexing(X_class, danger_index), class_sample, + X_class, nns, n_samples) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) else: @@ -415,26 +404,30 @@ def _sample_borderline(self, X, y): # only minority X_new_1, y_new_1 = self._make_samples( - safe_indexing(X_class, danger_index), class_sample, - X_class, nns, - int(fractions * (n_samples + 1)), step_size=1.) + safe_indexing(X_class, danger_index), + class_sample, + X_class, + nns, + int(fractions * (n_samples + 1)), + step_size=1.) # we use a one-vs-rest policy to handle the multiclass in which # new samples will be created considering not only the majority # class but all over classes. 
X_new_2, y_new_2 = self._make_samples( - safe_indexing(X_class, danger_index), class_sample, + safe_indexing(X_class, danger_index), + class_sample, safe_indexing(X, np.flatnonzero(y != class_sample)), - nns, int((1 - fractions) * n_samples), step_size=0.5) + nns, + int((1 - fractions) * n_samples), + step_size=0.5) if sparse.issparse(X_resampled): - X_resampled = sparse.vstack([X_resampled, - X_new_1, X_new_2]) + X_resampled = sparse.vstack( + [X_resampled, X_new_1, X_new_2]) else: - X_resampled = np.vstack((X_resampled, - X_new_1, X_new_2)) - y_resampled = np.hstack((y_resampled, - y_new_1, y_new_2)) + X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) + y_resampled = np.hstack((y_resampled, y_new_1, y_new_2)) return X_resampled, y_resampled @@ -472,39 +465,40 @@ def _sample_svm(self, X, y): X_resampled = X.copy() y_resampled = y.copy() - for class_sample, n_samples in self.ratio_.items(): + for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) - support_index = self.svm_estimator_.support_[ - y[self.svm_estimator_.support_] == class_sample] + support_index = self.svm_estimator_.support_[y[ + self.svm_estimator_.support_] == class_sample] support_vector = safe_indexing(X, support_index) self.nn_m_.fit(X) - noise_bool = self._in_danger_noise(support_vector, class_sample, y, - kind='noise') + noise_bool = self._in_danger_noise( + support_vector, class_sample, y, kind='noise') support_vector = safe_indexing( - support_vector, - np.flatnonzero(np.logical_not(noise_bool))) - danger_bool = self._in_danger_noise(support_vector, class_sample, - y, kind='danger') + support_vector, np.flatnonzero(np.logical_not(noise_bool))) + danger_bool = self._in_danger_noise( + support_vector, class_sample, y, kind='danger') safety_bool = np.logical_not(danger_bool) self.nn_k_.fit(X_class) fractions = random_state.beta(10, 10) if np.count_nonzero(danger_bool) > 0: - nns = self.nn_k_.kneighbors(safe_indexing( - support_vector, - np.flatnonzero(danger_bool)), - return_distance=False)[:, 1:] + nns = self.nn_k_.kneighbors( + safe_indexing(support_vector, np.flatnonzero(danger_bool)), + return_distance=False)[:, 1:] X_new_1, y_new_1 = self._make_samples( safe_indexing(support_vector, np.flatnonzero(danger_bool)), - class_sample, X_class, - nns, int(fractions * (n_samples + 1)), step_size=1.) + class_sample, + X_class, + nns, + int(fractions * (n_samples + 1)), + step_size=1.) 
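The surrounding hunks restructure how the ``'svm'`` SMOTE variant grows samples from support vectors flagged as in danger versus in safety. A brief, assumed usage sketch of that variant, mirroring the parameters exercised by the tests further down; the SVC settings and neighbour counts are arbitrary::

    # Hedged usage sketch of the SVM-SMOTE variant whose internals are
    # reformatted here; estimator settings and neighbour counts are arbitrary.
    from collections import Counter

    from sklearn.datasets import make_classification
    from sklearn.svm import SVC
    from imblearn.over_sampling import SMOTE

    X, y = make_classification(n_classes=2, weights=[0.1, 0.9], flip_y=0,
                               n_samples=1000, random_state=10)

    smote = SMOTE(kind='svm', svm_estimator=SVC(gamma='auto', random_state=0),
                  k_neighbors=5, m_neighbors=10, out_step=0.5, random_state=0)
    X_res, y_res = smote.fit_sample(X, y)
    print(Counter(y_res))
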
if np.count_nonzero(safety_bool) > 0: nns = self.nn_k_.kneighbors( @@ -513,23 +507,24 @@ def _sample_svm(self, X, y): X_new_2, y_new_2 = self._make_samples( safe_indexing(support_vector, np.flatnonzero(safety_bool)), - class_sample, X_class, - nns, int((1 - fractions) * n_samples), + class_sample, + X_class, + nns, + int((1 - fractions) * n_samples), step_size=-self.out_step) if (np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0): if sparse.issparse(X_resampled): - X_resampled = sparse.vstack([X_resampled, - X_new_1, X_new_2]) + X_resampled = sparse.vstack( + [X_resampled, X_new_1, X_new_2]) else: - X_resampled = np.vstack((X_resampled, - X_new_1, X_new_2)) - y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), - axis=0) + X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) + y_resampled = np.concatenate( + (y_resampled, y_new_1, y_new_2), axis=0) elif np.count_nonzero(danger_bool) == 0: if sparse.issparse(X_resampled): - X_resampled = sparse.vstack([X_resampled, X_new_2]) + X_resampled = sparse.vstack([X_resampled, X_new_2]) else: X_resampled = np.vstack((X_resampled, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_2), axis=0) diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index 7dcfc700e..8534e53ca 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -13,61 +13,54 @@ from imblearn.over_sampling import ADASYN - RND_SEED = 0 -X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) +X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 +], [0.53366841, -0.30312976], [1.52091956, + -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, + 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], + [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 def test_ada_init(): - ratio = 'auto' - ada = ADASYN(ratio=ratio, random_state=RND_SEED) + sampling_strategy = 'auto' + ada = ADASYN(sampling_strategy=sampling_strategy, random_state=RND_SEED) assert ada.random_state == RND_SEED def test_ada_fit(): ada = ADASYN(random_state=RND_SEED) ada.fit(X, Y) - assert ada.ratio_ == {0: 4, 1: 0} + assert ada.sampling_strategy_ == {0: 4} def test_ada_fit_sample(): ada = ADASYN(random_state=RND_SEED) X_resampled, y_resampled = ada.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 
0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234], - [0.94899098, -0.30508981], - [0.28204936, -0.13953426], - [1.58028868, -0.04089947], - [0.66117333, -0.28009063]]) + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], + [0.94899098, -0.30508981], [0.28204936, -0.13953426], + [1.58028868, -0.04089947], [0.66117333, -0.28009063]]) y_gt = np.array([ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 ]) @@ -75,9 +68,9 @@ def test_ada_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_ada_fit_ratio_error(): - ratio = {0: 9, 1: 12} - ada = ADASYN(ratio=ratio, random_state=RND_SEED) +def test_ada_fit_sampling_strategy_error(): + sampling_strategy = {0: 9, 1: 12} + ada = ADASYN(sampling_strategy=sampling_strategy, random_state=RND_SEED) with raises(ValueError, match="No samples will be generated."): ada.fit_sample(X, Y) @@ -86,30 +79,21 @@ def test_ada_fit_sample_nn_obj(): nn = NearestNeighbors(n_neighbors=6) ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) X_resampled, y_resampled = ada.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234], - [0.94899098, -0.30508981], - [0.28204936, -0.13953426], - [1.58028868, -0.04089947], - [0.66117333, -0.28009063]]) + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], + [0.94899098, -0.30508981], [0.28204936, -0.13953426], + [1.58028868, -0.04089947], [0.66117333, -0.28009063]]) y_gt = np.array([ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 ]) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 22d869465..13d0067c8 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ 
b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -13,56 +13,48 @@ from imblearn.over_sampling import RandomOverSampler RND_SEED = 0 -X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], - [0.20792588, 1.49407907], [0.47104475, 0.44386323], - [0.22950086, 0.33367433], [0.15490546, 0.3130677], +X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ + 0.20792588, 1.49407907 +], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982]]) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) def test_ros_init(): - ratio = 'auto' - ros = RandomOverSampler(ratio=ratio, random_state=RND_SEED) + sampling_strategy = 'auto' + ros = RandomOverSampler( + sampling_strategy=sampling_strategy, random_state=RND_SEED) assert ros.random_state == RND_SEED def test_ros_fit_sample(): ros = RandomOverSampler(random_state=RND_SEED) X_resampled, y_resampled = ros.fit_sample(X, Y) - X_gt = np.array([[0.04352327, -0.20515826], - [0.92923648, 0.76103773], - [0.20792588, 1.49407907], - [0.47104475, 0.44386323], - [0.22950086, 0.33367433], - [0.15490546, 0.3130677], - [0.09125309, -0.85409574], - [0.12372842, 0.6536186], - [0.13347175, 0.12167502], - [0.094035, -2.55298982], - [0.92923648, 0.76103773], - [0.47104475, 0.44386323], - [0.92923648, 0.76103773], - [0.47104475, 0.44386323]]) + X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ + 0.20792588, 1.49407907 + ], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [ + 0.15490546, 0.3130677 + ], [0.09125309, -0.85409574], [0.12372842, 0.6536186], + [0.13347175, 0.12167502], [0.094035, -2.55298982], + [0.92923648, 0.76103773], [0.47104475, 0.44386323], + [0.92923648, 0.76103773], [0.47104475, 0.44386323]]) y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_ros_fit_sample_half(): - ratio = {0: 3, 1: 7} - ros = RandomOverSampler(ratio=ratio, random_state=RND_SEED) + sampling_strategy = {0: 3, 1: 7} + ros = RandomOverSampler( + sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = ros.fit_sample(X, Y) - X_gt = np.array([[0.04352327, -0.20515826], - [0.92923648, 0.76103773], - [0.20792588, 1.49407907], - [0.47104475, 0.44386323], - [0.22950086, 0.33367433], - [0.15490546, 0.3130677], - [0.09125309, -0.85409574], - [0.12372842, 0.6536186], - [0.13347175, 0.12167502], - [0.094035, -2.55298982]]) + X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ + 0.20792588, 1.49407907 + ], [0.47104475, 0.44386323], [0.22950086, + 0.33367433], [0.15490546, 0.3130677], + [0.09125309, -0.85409574], [0.12372842, 0.6536186], + [0.13347175, 0.12167502], [0.094035, -2.55298982]]) y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 88c9713e1..5346c39fd 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -14,18 +14,20 @@ from imblearn.over_sampling import SMOTE - RND_SEED = 0 -X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, 
-0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) +X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 +], [0.53366841, -0.30312976], [1.52091956, + -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, + 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], + [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) R_TOL = 1e-4 @@ -41,16 +43,19 @@ def test_sample_regular(): kind = 'regular' smote = SMOTE(random_state=RND_SEED, kind=kind) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.29307743, -0.14670439], [0.84976473, -0.15570176], [0.61319159, -0.11571668], [0.66052536, -0.28246517]]) y_gt = np.array([ @@ -61,21 +66,24 @@ def test_sample_regular(): def test_sample_regular_half(): - ratio = {0: 9, 1: 12} + sampling_strategy = {0: 9, 1: 12} kind = 'regular' - smote = SMOTE(ratio=ratio, random_state=RND_SEED, kind=kind) + smote = SMOTE( + sampling_strategy=sampling_strategy, random_state=RND_SEED, kind=kind) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.36784496, -0.1953161]]) + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + 
], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], [0.08711622, 0.93259929], + [1.70580611, -0.11219234], [0.36784496, -0.1953161]]) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) @@ -86,16 +94,19 @@ def test_sample_borderline1(): kind = 'borderline1' smote = SMOTE(random_state=RND_SEED, kind=kind) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.3765279, -0.2009615], [0.55276636, -0.10550373], [0.45413452, -0.08883319], [1.21118683, -0.22817957]]) y_gt = np.array([ @@ -109,18 +120,20 @@ def test_sample_borderline2(): kind = 'borderline2' smote = SMOTE(random_state=RND_SEED, kind=kind) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.47436888, -0.2645749], [1.07844561, -0.19435291], - [0.33339622, 0.49870937]]) + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], [0.08711622, 0.93259929], + [1.70580611, -0.11219234], [0.47436888, -0.2645749], + [1.07844561, -0.19435291], [0.33339622, 0.49870937]]) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 
0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) @@ -131,18 +144,20 @@ def test_sample_svm(): kind = 'svm' smote = SMOTE(random_state=RND_SEED, kind=kind) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.47436888, -0.2645749], [1.07844561, -0.19435291], - [1.44015515, -1.30621303]]) + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], [0.08711622, 0.93259929], + [1.70580611, -0.11219234], [0.47436888, -0.2645749], + [1.07844561, -0.19435291], [1.44015515, -1.30621303]]) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) @@ -153,19 +168,22 @@ def test_fit_sample_nn_obj(): kind = 'borderline1' nn_m = NearestNeighbors(n_neighbors=11) nn_k = NearestNeighbors(n_neighbors=6) - smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k, - m_neighbors=nn_m) + smote = SMOTE( + random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.3765279, -0.2009615], [0.55276636, -0.10550373], [0.45413452, -0.08883319], [1.21118683, -0.22817957]]) y_gt = np.array([ @@ -180,16 +198,19 @@ def test_sample_regular_with_nn(): nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k) X_resampled, y_resampled = 
smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ + -0.41635887, -0.38299653 + ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.29307743, -0.14670439], [0.84976473, -0.15570176], [0.61319159, -0.11571668], [0.66052536, -0.28246517]]) y_gt = np.array([ @@ -227,18 +248,20 @@ def test_sample_regular_with_nn_svm(): smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.47436888, -0.2645749], [1.07844561, -0.19435291], - [1.44015515, -1.30621303]]) + X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ + 1.25192108, -0.22367336 + ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ + -0.28162401, -2.10400981 + ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ + 0.70472253, -0.73309052 + ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ + 0.88407872, 0.35454207 + ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ + -0.18410027, -0.45194484 + ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], [0.08711622, 0.93259929], + [1.70580611, -0.11219234], [0.47436888, -0.2645749], + [1.07844561, -0.19435291], [1.44015515, -1.30621303]]) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index ed716185f..d07d5ef6e 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -128,20 +128,16 @@ def _validate_steps(self): for t in transformers: if t is None: continue - if (not (hasattr(t, "fit") or - hasattr(t, "fit_transform") or - hasattr(t, "fit_sample")) or - not (hasattr(t, "transform") or - hasattr(t, "sample"))): + if (not (hasattr(t, "fit") or hasattr(t, "fit_transform") or + hasattr(t, "fit_sample")) or not + (hasattr(t, "transform") or hasattr(t, "sample"))): raise TypeError( "All intermediate steps 
of the chain should " "be estimators that implement fit and transform or sample " "(but not both) '%s' (type %s) doesn't)" % (t, type(t))) - if ((hasattr(t, "fit_sample") and - hasattr(t, "fit_transform")) or - (hasattr(t, "sample") and - hasattr(t, "transform"))): + if ((hasattr(t, "fit_sample") and hasattr(t, "fit_transform")) or + (hasattr(t, "sample") and hasattr(t, "transform"))): raise TypeError( "All intermediate steps of the chain should " "be estimators that implement fit and transform or sample." @@ -155,8 +151,8 @@ def _validate_steps(self): # We allow last estimator to be None as an identity transformation if estimator is not None and not hasattr(estimator, "fit"): raise TypeError("Last step of Pipeline should implement fit. " - "'%s' (type %s) doesn't" - % (estimator, type(estimator))) + "'%s' (type %s) doesn't" % (estimator, + type(estimator))) # Estimator interface @@ -201,8 +197,7 @@ def _fit(self, X, y=None, **fit_params): **fit_params_steps[name]) elif hasattr(cloned_transformer, "sample"): Xt, yt, fitted_transformer = fit_sample_one_cached( - cloned_transformer, Xt, yt, - **fit_params_steps[name]) + cloned_transformer, Xt, yt, **fit_params_steps[name]) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. @@ -587,8 +582,7 @@ def score(self, X, y=None, sample_weight=None): return self.steps[-1][-1].score(Xt, y, **score_params) -def _fit_transform_one(transformer, weight, X, y, - **fit_params): +def _fit_transform_one(transformer, weight, X, y, **fit_params): if hasattr(transformer, 'fit_transform'): res = transformer.fit_transform(X, y, **fit_params) else: diff --git a/imblearn/tests/test_base.py b/imblearn/tests/test_base.py index e8c485d6a..4450d4515 100644 --- a/imblearn/tests/test_base.py +++ b/imblearn/tests/test_base.py @@ -16,23 +16,23 @@ from imblearn.under_sampling import RandomUnderSampler iris = load_iris() -X, y = make_imbalance(iris.data, iris.target, ratio={0: 10, 1: 25}, - random_state=0) +X, y = make_imbalance( + iris.data, iris.target, sampling_strategy={0: 10, + 1: 25}, random_state=0) def test_function_sampler_reject_sparse(): X_sparse = sparse.csr_matrix(X) sampler = FunctionSampler(accept_sparse=False) - with pytest.raises(TypeError, message="A sparse matrix was passed, " - "but dense data is required"): + with pytest.raises( + TypeError, + match="A sparse matrix was passed, " + "but dense data is required"): sampler.fit(X_sparse, y) -@pytest.mark.parametrize( - "X, y", - [(X, y), - (sparse.csr_matrix(X), y), - (sparse.csc_matrix(X), y)]) +@pytest.mark.parametrize("X, y", [(X, y), (sparse.csr_matrix(X), y), + (sparse.csc_matrix(X), y)]) def test_function_sampler_identity(X, y): sampler = FunctionSampler() X_res, y_res = sampler.fit_sample(X, y) @@ -40,13 +40,9 @@ def test_function_sampler_identity(X, y): assert_array_equal(y_res, y) -@pytest.mark.parametrize( - "X, y", - [(X, y), - (sparse.csr_matrix(X), y), - (sparse.csc_matrix(X), y)]) +@pytest.mark.parametrize("X, y", [(X, y), (sparse.csr_matrix(X), y), + (sparse.csc_matrix(X), y)]) def test_function_sampler_func(X, y): - def func(X, y): return X[:10], y[:10] @@ -56,19 +52,17 @@ def func(X, y): assert_array_equal(y_res, y[:10]) -@pytest.mark.parametrize( - "X, y", - [(X, y), - (sparse.csr_matrix(X), y), - (sparse.csc_matrix(X), y)]) +@pytest.mark.parametrize("X, y", [(X, y), (sparse.csr_matrix(X), y), + (sparse.csc_matrix(X), y)]) def test_function_sampler_func_kwargs(X, y): - - def func(X, y, ratio, random_state): - 
rus = RandomUnderSampler(ratio=ratio, random_state=random_state) + def func(X, y, sampling_strategy, random_state): + rus = RandomUnderSampler( + sampling_strategy=sampling_strategy, random_state=random_state) return rus.fit_sample(X, y) - sampler = FunctionSampler(func=func, kw_args={'ratio': 'auto', - 'random_state': 0}) + sampler = FunctionSampler( + func=func, kw_args={'sampling_strategy': 'auto', + 'random_state': 0}) X_res, y_res = sampler.fit_sample(X, y) X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_sample(X, y) assert_allclose_dense_sparse(X_res, X_res_2) diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 7f468c6fb..173211383 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -21,8 +21,7 @@ def test_all_estimators(): assert len(estimators) > 0 for name, Estimator in estimators: # some can just not be sensibly default constructed - yield (_named_check(check_estimator, name), - Estimator) + yield (_named_check(check_estimator, name), Estimator) def test_non_meta_estimators(): diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index 046fee903..97b65c4ac 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -5,7 +5,6 @@ # Christos Aridas # License: MIT - from tempfile import mkdtemp import shutil import time @@ -32,15 +31,13 @@ from imblearn.under_sampling import (RandomUnderSampler, EditedNearestNeighbours as ENN) - JUNK_FOOD_DOCS = ( "the pizza pizza beer copyright", "the pizza burger beer copyright", "the the pizza beer beer copyright", "the burger beer beer copyright", "the coke burger coke copyright", - "the coke burger burger", -) + "the coke burger burger", ) R_TOL = 1e-4 @@ -55,7 +52,6 @@ def __init__(self, a=None, b=None): class NoTrans(NoFit): - def fit(self, X, y): return self @@ -81,7 +77,6 @@ def inverse_transform(self, X): class TransfFitParams(Transf): - def fit(self, X, y, **fit_params): self.fit_params = fit_params return self @@ -186,8 +181,8 @@ def test_pipeline_init(): # Smoke test with only an estimator clf = NoTrans() pipe = Pipeline([('svc', clf)]) - expected = dict(svc__a=None, svc__b=None, svc=clf, - **pipe.get_params(deep=False)) + expected = dict( + svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False)) assert pipe.get_params(deep=True) == expected # Check that params are set @@ -336,7 +331,7 @@ def test_pipeline_methods_preprocessing_svm(): # check shapes of various prediction functions predict = pipe.predict(X) - assert predict.shape == (n_samples,) + assert predict.shape == (n_samples, ) proba = pipe.predict_proba(X) assert proba.shape == (n_samples, n_classes) @@ -367,10 +362,8 @@ def test_fit_predict_on_pipeline(): separate_pred = km.fit_predict(scaled) # use a pipeline to do the transform and clustering in one step - pipe = Pipeline([ - ('scaler', scaler_for_pipeline), - ('Kmeans', km_for_pipeline) - ]) + pipe = Pipeline([('scaler', scaler_for_pipeline), ('Kmeans', + km_for_pipeline)]) pipeline_pred = pipe.fit_predict(iris.data) assert_array_almost_equal(pipeline_pred, separate_pred) @@ -391,10 +384,8 @@ def test_fit_predict_with_intermediate_fit_params(): # tests that Pipeline passes fit_params to intermediate steps # when fit_predict is invoked pipe = Pipeline([('transf', TransfFitParams()), ('clf', FitParamT())]) - pipe.fit_predict(X=None, - y=None, - transf__should_get_this=True, - clf__should_succeed=True) + pipe.fit_predict( + X=None, y=None, transf__should_get_this=True, clf__should_succeed=True) assert 
pipe.named_steps['transf'].fit_params['should_get_this'] assert pipe.named_steps['clf'].successful assert 'should_succeed' not in pipe.named_steps['transf'].fit_params @@ -485,13 +476,15 @@ def make(): assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) - expected_params = {'steps': pipeline.steps, - 'm2': mult2, - 'm3': None, - 'last': mult5, - 'memory': None, - 'm2__mult': 2, - 'last__mult': 5} + expected_params = { + 'steps': pipeline.steps, + 'm2': mult2, + 'm3': None, + 'last': mult5, + 'memory': None, + 'm2__mult': 2, + 'last__mult': 5 + } assert pipeline.get_params(deep=True) == expected_params pipeline.set_params(m2=None) @@ -501,8 +494,10 @@ def make(): assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: - other_methods = ['predict_proba', 'predict_log_proba', - 'decision_function', 'transform', 'score'] + other_methods = [ + 'predict_proba', 'predict_log_proba', 'decision_function', 'transform', + 'score' + ] for method in other_methods: getattr(pipeline, method)(X) @@ -599,8 +594,8 @@ def test_pipeline_wrong_memory(): y = iris.target # Define memory as an integer memory = 1 - cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())], - memory=memory) + cached_pipe = Pipeline( + [('transf', DummyTransf()), ('svc', SVC())], memory=memory) error_regex = ("'memory' should either be a string or a joblib.Memory" " instance, got 'memory=1' instead.") with raises(ValueError, match=error_regex): @@ -618,8 +613,8 @@ def test_pipeline_memory_transformer(): clf = SVC(probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) - cached_pipe = Pipeline([('transf', transf), ('svc', clf)], - memory=memory) + cached_pipe = Pipeline( + [('transf', transf), ('svc', clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) @@ -629,8 +624,8 @@ def test_pipeline_memory_transformer(): # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) - assert_array_equal(pipe.predict_log_proba(X), - cached_pipe.predict_log_proba(X)) + assert_array_equal( + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) @@ -641,8 +636,8 @@ def test_pipeline_memory_transformer(): # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) - assert_array_equal(pipe.predict_log_proba(X), - cached_pipe.predict_log_proba(X)) + assert_array_equal( + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) @@ -651,16 +646,16 @@ def test_pipeline_memory_transformer(): # Check that even changing the name step does not affect the cache hit clf_2 = SVC(probability=True, random_state=0) transf_2 = DummyTransf() - cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)], - memory=memory) + cached_pipe_2 = Pipeline( + [('transf_2', transf_2), ('svc', clf_2)], memory=memory) 
cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) - assert_array_equal(pipe.predict_proba(X), - cached_pipe_2.predict_proba(X)) - assert_array_equal(pipe.predict_log_proba(X), - cached_pipe_2.predict_log_proba(X)) + assert_array_equal( + pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) + assert_array_equal( + pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe_2.named_steps['transf_2'].means_) @@ -688,8 +683,8 @@ def test_pipeline_memory_sampler(): clf = SVC(probability=True, random_state=0) transf = DummySampler() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) - cached_pipe = Pipeline([('transf', transf), ('svc', clf)], - memory=memory) + cached_pipe = Pipeline( + [('transf', transf), ('svc', clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) @@ -699,8 +694,8 @@ def test_pipeline_memory_sampler(): # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) - assert_array_equal(pipe.predict_log_proba(X), - cached_pipe.predict_log_proba(X)) + assert_array_equal( + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) @@ -711,8 +706,8 @@ def test_pipeline_memory_sampler(): # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) - assert_array_equal(pipe.predict_log_proba(X), - cached_pipe.predict_log_proba(X)) + assert_array_equal( + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) @@ -721,16 +716,16 @@ def test_pipeline_memory_sampler(): # Check that even changing the name step does not affect the cache hit clf_2 = SVC(probability=True, random_state=0) transf_2 = DummySampler() - cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)], - memory=memory) + cached_pipe_2 = Pipeline( + [('transf_2', transf_2), ('svc', clf_2)], memory=memory) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) - assert_array_equal(pipe.predict_proba(X), - cached_pipe_2.predict_proba(X)) - assert_array_equal(pipe.predict_log_proba(X), - cached_pipe_2.predict_log_proba(X)) + assert_array_equal( + pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) + assert_array_equal( + pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe_2.named_steps['transf_2'].means_) @@ -819,8 +814,7 @@ def test_pipeline_sample(): assert_allclose(y_trans, y_trans3, rtol=R_TOL) pca = PCA() - pipeline = Pipeline([('pca', PCA()), - ('rus', rus)]) + pipeline = Pipeline([('pca', PCA()), ('rus', rus)]) X_trans, y_trans = pipeline.fit(X, y).sample(X, y) X_pca = pca.fit_transform(X) diff --git a/imblearn/under_sampling/__init__.py 
b/imblearn/under_sampling/__init__.py index f8bf577da..28df84eec 100644 --- a/imblearn/under_sampling/__init__.py +++ b/imblearn/under_sampling/__init__.py @@ -16,14 +16,9 @@ from .prototype_selection import AllKNN from .prototype_selection import InstanceHardnessThreshold -__all__ = ['ClusterCentroids', - 'RandomUnderSampler', - 'InstanceHardnessThreshold', - 'NearMiss', - 'TomekLinks', - 'EditedNearestNeighbours', - 'RepeatedEditedNearestNeighbours', - 'AllKNN', - 'OneSidedSelection', - 'CondensedNearestNeighbour', - 'NeighbourhoodCleaningRule'] +__all__ = [ + 'ClusterCentroids', 'RandomUnderSampler', 'InstanceHardnessThreshold', + 'NearMiss', 'TomekLinks', 'EditedNearestNeighbours', + 'RepeatedEditedNearestNeighbours', 'AllKNN', 'OneSidedSelection', + 'CondensedNearestNeighbour', 'NeighbourhoodCleaningRule' +] diff --git a/imblearn/under_sampling/base.py b/imblearn/under_sampling/base.py index cb476c19a..7b4ae65d1 100644 --- a/imblearn/under_sampling/base.py +++ b/imblearn/under_sampling/base.py @@ -15,6 +15,43 @@ class BaseUnderSampler(BaseSampler): """ _sampling_type = 'under-sampling' + _sampling_strategy_docstring = \ + """sampling_strategy : float, str, dict, callable, (default='auto') + Sampling information to sample the data set. + + - When ``float``, it corresponds to the ratio :math:`\\alpha_{us}` + defined by :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where + :math:`N_{rM}` and :math:`N_{m}` are the number of samples in the + majority class after resampling and the number of samples in the + minority class, respectively. + + .. warning:: + ``float`` is only available for **binary** classification. An + error is raised for multi-class classification. + + - When ``str``, specify the class targeted by the resampling. The + number of samples in the different classes will be equalized. + Possible choices are: + + ``'majority'``: resample only the majority class; + + ``'not minority'``: resample all classes but the minority class; + + ``'not majority'``: resample all classes but the majority class; + + ``'all'``: resample all classes; + + ``'auto'``: equivalent to ``'not minority'``. + + - When ``dict``, the keys correspond to the targeted classes. The + values correspond to the desired number of samples for each targeted + class. + + - When callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples for each class. + """.rstrip() + class BaseCleaningSampler(BaseSampler): """Base class for under-sampling algorithms. @@ -23,3 +60,29 @@ class BaseCleaningSampler(BaseSampler): instead. """ _sampling_type = 'clean-sampling' + + _sampling_strategy_docstring = \ + """sampling_strategy : str, list or callable + Sampling information to sample the data set. + + - When ``str``, specify the class targeted by the resampling. Note that + the number of samples will not be equal in each class. Possible choices + are: + + ``'majority'``: resample only the majority class; + + ``'not minority'``: resample all classes but the minority class; + + ``'not majority'``: resample all classes but the majority class; + + ``'all'``: resample all classes; + + ``'auto'``: equivalent to ``'not minority'``. + + - When ``list``, the list contains the classes targeted by the + resampling. + + - When callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples for each class.
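For illustration of the shared parameter block defined above, a minimal sketch of the four accepted forms of ``sampling_strategy`` applied to ``RandomUnderSampler`` (a sketch only, assuming the patched 0.4 API is installed; the data set and the chosen values are arbitrary)::

    from collections import Counter

    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    X, y = make_classification(n_samples=1000, weights=[0.1, 0.9],
                               random_state=0)

    # str: 'auto' is equivalent to 'not minority'.
    _, y_res = RandomUnderSampler(sampling_strategy='auto',
                                  random_state=0).fit_sample(X, y)
    print(Counter(y_res))

    # float: only valid for binary problems; it ties the majority count to
    # the minority count through the ratio alpha_us described above.
    _, y_res = RandomUnderSampler(sampling_strategy=0.5,
                                  random_state=0).fit_sample(X, y)
    print(Counter(y_res))

    # dict: explicit number of samples to keep per class.
    _, y_res = RandomUnderSampler(sampling_strategy={0: 50, 1: 200},
                                  random_state=0).fit_sample(X, y)
    print(Counter(y_res))

    # callable: a function taking y and returning such a dict.
    def halve(y):
        return {klass: n // 2 for klass, n in Counter(y).items()}

    _, y_res = RandomUnderSampler(sampling_strategy=halve,
                                  random_state=0).fit_sample(X, y)
    print(Counter(y_res))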
+ """.rstrip() diff --git a/imblearn/under_sampling/prototype_generation/__init__.py b/imblearn/under_sampling/prototype_generation/__init__.py index 8bbdfcd4a..2052cdb2d 100644 --- a/imblearn/under_sampling/prototype_generation/__init__.py +++ b/imblearn/under_sampling/prototype_generation/__init__.py @@ -5,6 +5,4 @@ from .cluster_centroids import ClusterCentroids -__all__ = [ - 'ClusterCentroids' -] +__all__ = ['ClusterCentroids'] diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 5557bba75..118a165a9 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -16,10 +16,15 @@ from sklearn.utils import safe_indexing from ..base import BaseUnderSampler +from ...utils import Substitution +from ...utils._docstring import _random_state_docstring VOTING_KIND = ('auto', 'hard', 'soft') +@Substitution( + sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class ClusterCentroids(BaseUnderSampler): """Perform under-sampling by generating centroids based on clustering methods. @@ -35,28 +40,9 @@ class ClusterCentroids(BaseUnderSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {sampling_strategy} + + {random_state} estimator : object, optional(default=KMeans()) Pass a :class:`sklearn.cluster.KMeans` estimator. @@ -76,6 +62,11 @@ class ClusterCentroids(BaseUnderSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- Supports mutli-class resampling by sampling each class independently. @@ -92,24 +83,25 @@ class ClusterCentroids(BaseUnderSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> cc = ClusterCentroids(random_state=42) >>> X_res, y_res = cc.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) + >>> print('Resampled dataset shape %s' % Counter(y_res)) ... # doctest: +ELLIPSIS - Resampled dataset shape Counter({...}) + Resampled dataset shape Counter({{...}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', random_state=None, estimator=None, voting='auto', - n_jobs=1): + n_jobs=1, + ratio=None): super(ClusterCentroids, self).__init__( - ratio=ratio) + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.estimator = estimator self.voting = voting @@ -130,8 +122,8 @@ def _generate_sample(self, X, y, centroids, target_class): if self.voting_ == 'hard': nearest_neighbors = NearestNeighbors(n_neighbors=1) nearest_neighbors.fit(X, y) - indices = nearest_neighbors.kneighbors(centroids, - return_distance=False) + indices = nearest_neighbors.kneighbors( + centroids, return_distance=False) X_new = safe_indexing(X, np.squeeze(indices)) else: if sparse.issparse(X): @@ -179,8 +171,8 @@ def _sample(self, X, y): X_resampled, y_resampled = [], [] for target_class in np.unique(y): - if target_class in self.ratio_.keys(): - n_samples = self.ratio_[target_class] + if target_class in self.sampling_strategy_.keys(): + n_samples = self.sampling_strategy_[target_class] self.estimator_.set_params(**{'n_clusters': n_samples}) self.estimator_.fit(X[y == target_class]) X_new, y_new = self._generate_sample( diff --git a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py index 4586021f9..4983ae06e 100644 --- a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py @@ -14,9 +14,9 @@ from imblearn.under_sampling import ClusterCentroids RND_SEED = 0 -X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], - [0.20792588, 1.49407907], [0.47104475, 0.44386323], - [0.22950086, 0.33367433], [0.15490546, 0.3130677], +X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ + 0.20792588, 1.49407907 +], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982]]) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) @@ -33,33 +33,28 @@ def test_fit_sample_check_voting(): def test_fit_sample_auto(): - ratio = 'auto' - cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) + sampling_strategy = 'auto' + cc = ClusterCentroids( + sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], - [0.47104475, 0.44386323], - [0.13347175, 0.12167502], - [0.06738818, -0.529627], - [0.17901516, 0.69860992], - [0.094035, -2.55298982]]) + X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], + [0.13347175, 0.12167502], [0.06738818, -0.529627], + [0.17901516, 0.69860992], [0.094035, -2.55298982]]) y_gt = np.array([0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, 
y_gt) def test_fit_sample_half(): - ratio = {0: 3, 1: 6} - cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) + sampling_strategy = {0: 3, 1: 6} + cc = ClusterCentroids( + sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], - [0.13347175, 0.12167502], - [0.47104475, 0.44386323], - [0.09125309, -0.85409574], - [0.19220316, 0.32337101], - [0.094035, -2.55298982], - [0.20792588, 1.49407907], - [0.04352327, -0.20515826], - [0.12372842, 0.6536186]]) + X_gt = np.array([[0.92923648, 0.76103773], [0.13347175, 0.12167502], [ + 0.47104475, 0.44386323 + ], [0.09125309, -0.85409574], [0.19220316, 0.32337101], + [0.094035, -2.55298982], [0.20792588, 1.49407907], + [0.04352327, -0.20515826], [0.12372842, 0.6536186]]) y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) print(X_resampled) assert_allclose(X_resampled, X_gt, rtol=R_TOL) @@ -79,38 +74,36 @@ def test_multiclass_fit_sample(): def test_fit_sample_object(): - ratio = 'auto' + sampling_strategy = 'auto' cluster = KMeans(random_state=RND_SEED) cc = ClusterCentroids( - ratio=ratio, random_state=RND_SEED, estimator=cluster) + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + estimator=cluster) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], - [0.47104475, 0.44386323], - [0.13347175, 0.12167502], - [0.06738818, -0.529627], - [0.17901516, 0.69860992], - [0.094035, -2.55298982]]) + X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], + [0.13347175, 0.12167502], [0.06738818, -0.529627], + [0.17901516, 0.69860992], [0.094035, -2.55298982]]) y_gt = np.array([0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_fit_hard_voting(): - ratio = 'auto' + sampling_strategy = 'auto' voting = 'hard' cluster = KMeans(random_state=RND_SEED) cc = ClusterCentroids( - ratio=ratio, random_state=RND_SEED, estimator=cluster, + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + estimator=cluster, voting=voting) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], - [0.47104475, 0.44386323], - [0.13347175, 0.12167502], - [0.09125309, -0.85409574], - [0.12372842, 0.6536186], - [0.094035, -2.55298982]]) + X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], + [0.13347175, 0.12167502], [0.09125309, -0.85409574], + [0.12372842, 0.6536186], [0.094035, -2.55298982]]) y_gt = np.array([0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -119,14 +112,19 @@ def test_fit_hard_voting(): def test_fit_sample_error(): - ratio = 'auto' + sampling_strategy = 'auto' cluster = 'rnd' cc = ClusterCentroids( - ratio=ratio, random_state=RND_SEED, estimator=cluster) + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + estimator=cluster) with raises(ValueError, match="has to be a KMeans clustering"): cc.fit_sample(X, Y) voting = 'unknown' - cc = ClusterCentroids(ratio=ratio, voting=voting, random_state=RND_SEED) + cc = ClusterCentroids( + sampling_strategy=sampling_strategy, + voting=voting, + random_state=RND_SEED) with raises(ValueError, match="needs to be one of"): cc.fit_sample(X, Y) diff --git a/imblearn/under_sampling/prototype_selection/__init__.py b/imblearn/under_sampling/prototype_selection/__init__.py index 5fab3d708..40e6a43df 100644 --- a/imblearn/under_sampling/prototype_selection/__init__.py +++ 
b/imblearn/under_sampling/prototype_selection/__init__.py @@ -14,13 +14,9 @@ from .edited_nearest_neighbours import AllKNN from .instance_hardness_threshold import InstanceHardnessThreshold -__all__ = ['RandomUnderSampler', - 'InstanceHardnessThreshold', - 'NearMiss', - 'TomekLinks', - 'EditedNearestNeighbours', - 'RepeatedEditedNearestNeighbours', - 'AllKNN', - 'OneSidedSelection', - 'CondensedNearestNeighbour', - 'NeighbourhoodCleaningRule'] +__all__ = [ + 'RandomUnderSampler', 'InstanceHardnessThreshold', 'NearMiss', + 'TomekLinks', 'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours', + 'AllKNN', 'OneSidedSelection', 'CondensedNearestNeighbour', + 'NeighbourhoodCleaningRule' +] diff --git a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py index 111dde5c2..631534346 100644 --- a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py @@ -17,8 +17,13 @@ from sklearn.utils import check_random_state, safe_indexing from ..base import BaseCleaningSampler +from ...utils import Substitution +from ...utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class CondensedNearestNeighbour(BaseCleaningSampler): """Class to perform under-sampling based on the condensed nearest neighbour method. @@ -27,37 +32,13 @@ class CondensedNearestNeighbour(BaseCleaningSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - .. warning:: - This algorithm is a cleaning under-sampling method. When providing a - ``dict``, only the targeted classes will be used; the number of - samples will be discarded. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} n_neighbors : int or object, optional (default=\ KNeighborsClassifier(n_neighbors=1)) @@ -72,6 +53,11 @@ class CondensedNearestNeighbour(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + ratio : str, dict, or callable + .. 
deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The method is based on [1]_. @@ -95,31 +81,31 @@ class CondensedNearestNeighbour(BaseCleaningSampler): Examples -------- - >>> from collections import Counter #doctest: +SKIP - >>> from sklearn.datasets import fetch_mldata #doctest: +SKIP + >>> from collections import Counter # doctest: +SKIP + >>> from sklearn.datasets import fetch_mldata # doctest: +SKIP >>> from imblearn.under_sampling import \ -CondensedNearestNeighbour #doctest: +SKIP - >>> pima = fetch_mldata('diabetes_scale') #doctest: +SKIP - >>> X, y = pima['data'], pima['target'] #doctest: +SKIP - >>> print('Original dataset shape {}'.format(Counter(y))) #doctest: +SKIP - Original dataset shape Counter({1: 500, -1: 268}) #doctest: +SKIP - >>> cnn = CondensedNearestNeighbour(random_state=42) #doctest: +SKIP +CondensedNearestNeighbour # doctest: +SKIP + >>> pima = fetch_mldata('diabetes_scale') # doctest: +SKIP + >>> X, y = pima['data'], pima['target'] # doctest: +SKIP + >>> print('Original dataset shape %s' % Counter(y)) # doctest: +SKIP + Original dataset shape Counter({{1: 500, -1: 268}}) # doctest: +SKIP + >>> cnn = CondensedNearestNeighbour(random_state=42) # doctest: +SKIP >>> X_res, y_res = cnn.fit_sample(X, y) #doctest: +SKIP - >>> print('Resampled dataset shape {}'.format( - ... Counter(y_res))) #doctest: +SKIP - Resampled dataset shape Counter({-1: 268, 1: 227}) #doctest: +SKIP + >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +SKIP + Resampled dataset shape Counter({{-1: 268, 1: 227}}) # doctest: +SKIP """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, n_neighbors=None, n_seeds_S=1, - n_jobs=1): + n_jobs=1, + ratio=None): super(CondensedNearestNeighbour, self).__init__( - ratio=ratio) + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -174,17 +160,18 @@ def _sample(self, X, y): idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): - if target_class in self.ratio_.keys(): + if target_class in self.sampling_strategy_.keys(): # Randomly get one sample from the majority class # Generate the index to select idx_maj = np.flatnonzero(y == target_class) idx_maj_sample = idx_maj[random_state.randint( - low=0, high=target_stats[target_class], - size=self.n_seeds_S)] + low=0, + high=target_stats[target_class], + size=self.n_seeds_S)] # Create the set C - One majority samples and all minority - C_indices = np.append(np.flatnonzero(y == class_minority), - idx_maj_sample) + C_indices = np.append( + np.flatnonzero(y == class_minority), idx_maj_sample) C_x = safe_indexing(X, C_indices) C_y = safe_indexing(y, C_indices) @@ -232,8 +219,7 @@ def _sample(self, X, y): np.append(idx_maj_sample, np.flatnonzero(pred_S_y == S_y))) - idx_under = np.concatenate((idx_under, idx_maj_sample), - axis=0) + idx_under = np.concatenate((idx_under, idx_maj_sample), axis=0) else: idx_under = np.concatenate( (idx_under, np.flatnonzero(y == target_class)), axis=0) diff --git a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py index d3d2cf819..062c46f89 100644 --- a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py @@ -1,7 +1,6 @@ """Class to perform 
under-sampling based on the edited nearest neighbour method.""" - # Authors: Guillaume Lemaitre # Dayvid Oliveira # Christos Aridas @@ -18,12 +17,16 @@ from ..base import BaseCleaningSampler from ...utils import check_neighbors_object +from ...utils import Substitution from ...utils.deprecation import deprecate_parameter - +from ...utils._docstring import _random_state_docstring SEL_KIND = ('all', 'mode') +@Substitution( + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class EditedNearestNeighbours(BaseCleaningSampler): """Class to perform under-sampling based on the edited nearest neighbour method. @@ -32,37 +35,13 @@ class EditedNearestNeighbours(BaseCleaningSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - .. warning:: - This algorithm is a cleaning under-sampling method. When providing a - ``dict``, only the targeted classes will be used; the number of - samples will be discarded. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} .. deprecated:: 0.4 ``random_state`` is deprecated in 0.4 and will be removed in 0.6. @@ -84,6 +63,11 @@ class EditedNearestNeighbours(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The method is based on [1]_. @@ -115,23 +99,25 @@ class EditedNearestNeighbours(BaseCleaningSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> enn = EditedNearestNeighbours() >>> X_res, y_res = enn.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({1: 887, 0: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{1: 887, 0: 100}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, n_neighbors=3, kind_sel='all', - n_jobs=1): - super(EditedNearestNeighbours, self).__init__(ratio=ratio) + n_jobs=1, + ratio=None): + super(EditedNearestNeighbours, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -145,8 +131,8 @@ def _validate_estimator(self): if self.random_state is not None: deprecate_parameter(self, '0.4', 'random_state') - self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, - additional_neighbor=1) + self.nn_ = check_neighbors_object( + 'n_neighbors', self.n_neighbors, additional_neighbor=1) self.nn_.set_params(**{'n_jobs': self.n_jobs}) if self.kind_sel not in SEL_KIND: @@ -184,7 +170,7 @@ def _sample(self, X, y): self.nn_.fit(X) for target_class in np.unique(y): - if target_class in self.ratio_.keys(): + if target_class in self.sampling_strategy_.keys(): target_class_indices = np.flatnonzero(y == target_class) X_class = safe_indexing(X, target_class_indices) y_class = safe_indexing(y, target_class_indices) @@ -202,8 +188,9 @@ def _sample(self, X, y): index_target_class = slice(None) idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + (idx_under, + np.flatnonzero(y == target_class)[index_target_class]), + axis=0) if self.return_indices: return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), @@ -212,6 +199,9 @@ def _sample(self, X, y): return safe_indexing(X, idx_under), safe_indexing(y, idx_under) +@Substitution( + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class RepeatedEditedNearestNeighbours(BaseCleaningSampler): """Class to perform under-sampling based on the repeated edited nearest neighbour method. @@ -220,37 +210,13 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - .. 
warning:: - This algorithm is a cleaning under-sampling method. When providing a - ``dict``, only the targeted classes will be used; the number of - samples will be discarded. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} .. deprecated:: 0.4 ``random_state`` is deprecated in 0.4 and will be removed in 0.6. @@ -276,6 +242,11 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of thread to open when it is possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The method is based on [1]_. A one-vs.-rest scheme is used when @@ -307,24 +278,26 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> renn = RepeatedEditedNearestNeighbours() >>> X_res, y_res = renn.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({1: 887, 0: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{1: 887, 0: 100}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, n_neighbors=3, max_iter=100, kind_sel='all', - n_jobs=1): - super(RepeatedEditedNearestNeighbours, self).__init__(ratio=ratio) + n_jobs=1, + ratio=None): + super(RepeatedEditedNearestNeighbours, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -343,14 +316,16 @@ def _validate_estimator(self): raise ValueError('max_iter must be greater than 1.' ' Got {} instead.'.format(type(self.max_iter))) - self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, - additional_neighbor=1) + self.nn_ = check_neighbors_object( + 'n_neighbors', self.n_neighbors, additional_neighbor=1) - self.enn_ = EditedNearestNeighbours(ratio=self.ratio, - return_indices=self.return_indices, - n_neighbors=self.nn_, - kind_sel=self.kind_sel, - n_jobs=self.n_jobs) + self.enn_ = EditedNearestNeighbours( + sampling_strategy=self.sampling_strategy, + return_indices=self.return_indices, + n_neighbors=self.nn_, + kind_sel=self.kind_sel, + n_jobs=self.n_jobs, + ratio=self.ratio) def _sample(self, X, y): """Resample the dataset. 
@@ -409,8 +384,8 @@ def _sample(self, X, y): val for val, key in zip(stats_enn.values(), stats_enn.keys()) if key != class_minority ]) - b_min_bec_maj = np.any(count_non_min < - target_stats[class_minority]) + b_min_bec_maj = np.any( + count_non_min < target_stats[class_minority]) # Case 3 b_remove_maj_class = (len(stats_enn) < len(target_stats)) @@ -436,6 +411,9 @@ def _sample(self, X, y): return X_resampled, y_resampled +@Substitution( + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class AllKNN(BaseCleaningSampler): """Class to perform under-sampling based on the AllKNN method. @@ -443,37 +421,13 @@ class AllKNN(BaseCleaningSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - .. warning:: - This algorithm is a cleaning under-sampling method. When providing a - ``dict``, only the targeted classes will be used; the number of - samples will be discarded. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} .. deprecated:: 0.4 ``random_state`` is deprecated in 0.4 and will be removed in 0.6. @@ -501,6 +455,11 @@ class without early stopping. n_jobs : int, optional (default=1) The number of thread to open when it is possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The method is based on [1]_. @@ -531,24 +490,26 @@ class without early stopping. >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> allknn = AllKNN() >>> X_res, y_res = allknn.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({1: 887, 0: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{1: 887, 0: 100}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, n_neighbors=3, kind_sel='all', allow_minority=False, - n_jobs=1): - super(AllKNN, self).__init__(ratio=ratio) + n_jobs=1, + ratio=None): + super(AllKNN, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -566,14 +527,16 @@ def _validate_estimator(self): if self.kind_sel not in SEL_KIND: raise NotImplementedError - self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, - additional_neighbor=1) + self.nn_ = check_neighbors_object( + 'n_neighbors', self.n_neighbors, additional_neighbor=1) - self.enn_ = EditedNearestNeighbours(ratio=self.ratio, - return_indices=self.return_indices, - n_neighbors=self.nn_, - kind_sel=self.kind_sel, - n_jobs=self.n_jobs) + self.enn_ = EditedNearestNeighbours( + sampling_strategy=self.sampling_strategy, + return_indices=self.return_indices, + n_neighbors=self.nn_, + kind_sel=self.kind_sel, + n_jobs=self.n_jobs, + ratio=self.ratio) def _sample(self, X, y): """Resample the dataset. @@ -627,8 +590,8 @@ def _sample(self, X, y): val for val, key in zip(stats_enn.values(), stats_enn.keys()) if key != class_minority ]) - b_min_bec_maj = np.any(count_non_min < - target_stats[class_minority]) + b_min_bec_maj = np.any( + count_non_min < target_stats[class_minority]) if self.allow_minority: # overwrite b_min_bec_maj b_min_bec_maj = False diff --git a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py index a92610f2c..981cedfc1 100644 --- a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py +++ b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py @@ -18,8 +18,13 @@ from sklearn.utils import safe_indexing from ..base import BaseCleaningSampler +from ...utils import Substitution +from ...utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class InstanceHardnessThreshold(BaseCleaningSampler): """Class to perform under-sampling based on the instance hardness threshold. @@ -37,37 +42,13 @@ class InstanceHardnessThreshold(BaseCleaningSampler): inherited from :class:`sklearn.base.ClassifierMixin` and having an attribute :func:`predict_proba`. - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. 
- - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - .. warning:: - This algorithm is a cleaning under-sampling method. When providing a - ``dict``, only the targeted classes will be used; the number of - samples will be discarded. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} cv : int, optional (default=5) Number of folds to be used when estimating samples' instance hardness. @@ -75,6 +56,11 @@ class InstanceHardnessThreshold(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The method is based on [1]_. @@ -100,23 +86,25 @@ class InstanceHardnessThreshold(BaseCleaningSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> iht = InstanceHardnessThreshold(random_state=42) >>> X_res, y_res = iht.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({1: 840, 0: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{1: 840, 0: 100}}) """ def __init__(self, estimator=None, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, cv=5, - n_jobs=1): - super(InstanceHardnessThreshold, self).__init__(ratio=ratio) + n_jobs=1, + ratio=None): + super(InstanceHardnessThreshold, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.estimator = estimator self.return_indices = return_indices @@ -161,8 +149,9 @@ def _sample(self, X, y): self._validate_estimator() target_stats = Counter(y) - skf = StratifiedKFold(n_splits=self.cv, shuffle=False, - random_state=self.random_state).split(X, y) + skf = StratifiedKFold( + n_splits=self.cv, shuffle=False, + random_state=self.random_state).split(X, y) probabilities = np.zeros(y.shape[0], dtype=float) for train_index, test_index in skf: @@ -183,8 +172,8 @@ def _sample(self, X, y): idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): - if target_class in self.ratio_.keys(): - n_samples = self.ratio_[target_class] + if target_class in self.sampling_strategy_.keys(): + n_samples = self.sampling_strategy_[target_class] threshold = np.percentile( probabilities[y == target_class], (1. - (n_samples / target_stats[target_class])) * 100.) @@ -194,8 +183,9 @@ def _sample(self, X, y): index_target_class = slice(None) idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + (idx_under, + np.flatnonzero(y == target_class)[index_target_class]), + axis=0) if self.return_indices: return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), diff --git a/imblearn/under_sampling/prototype_selection/nearmiss.py b/imblearn/under_sampling/prototype_selection/nearmiss.py index 05bc0ae3e..e0de46418 100644 --- a/imblearn/under_sampling/prototype_selection/nearmiss.py +++ b/imblearn/under_sampling/prototype_selection/nearmiss.py @@ -15,9 +15,14 @@ from ..base import BaseUnderSampler from ...utils import check_neighbors_object +from ...utils import Substitution from ...utils.deprecation import deprecate_parameter +from ...utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class NearMiss(BaseUnderSampler): """Class to perform under-sampling based on NearMiss methods. @@ -25,32 +30,13 @@ class NearMiss(BaseUnderSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. 
The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} .. deprecated:: 0.4 ``random_state`` is deprecated in 0.4 and will be removed in 0.6. @@ -76,6 +62,11 @@ class NearMiss(BaseUnderSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The methods are based on [1]_. @@ -103,24 +94,26 @@ class NearMiss(BaseUnderSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> nm = NearMiss() >>> X_res, y_res = nm.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 100, 1: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{0: 100, 1: 100}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, version=1, n_neighbors=3, n_neighbors_ver3=3, - n_jobs=1): - super(NearMiss, self).__init__(ratio=ratio) + n_jobs=1, + ratio=None): + super(NearMiss, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.version = version @@ -254,8 +247,8 @@ def _sample(self, X, y): self.nn_.fit(safe_indexing(X, minority_class_indices)) for target_class in np.unique(y): - if target_class in self.ratio_.keys(): - n_samples = self.ratio_[target_class] + if target_class in self.sampling_strategy_.keys(): + n_samples = self.sampling_strategy_[target_class] target_class_indices = np.flatnonzero(y == target_class) X_class = safe_indexing(X, target_class_indices) y_class = safe_indexing(y, target_class_indices) @@ -264,13 +257,21 @@ def _sample(self, X, y): dist_vec, idx_vec = self.nn_.kneighbors( X_class, n_neighbors=self.nn_.n_neighbors) index_target_class = self._selection_dist_based( - X, y, dist_vec, n_samples, target_class, + X, + y, + dist_vec, + n_samples, + target_class, sel_strategy='nearest') elif self.version == 2: dist_vec, idx_vec = self.nn_.kneighbors( X_class, n_neighbors=target_stats[class_minority]) index_target_class = self._selection_dist_based( - X, y, dist_vec, n_samples, target_class, + X, + y, + dist_vec, + n_samples, + target_class, 
sel_strategy='nearest') elif self.version == 3: self.nn_ver3_.fit(X_class) @@ -283,8 +284,12 @@ def _sample(self, X, y): dist_vec, idx_vec = self.nn_.kneighbors( X_class_selected, n_neighbors=self.nn_.n_neighbors) index_target_class = self._selection_dist_based( - X_class_selected, y_class_selected, dist_vec, - n_samples, target_class, sel_strategy='farthest') + X_class_selected, + y_class_selected, + dist_vec, + n_samples, + target_class, + sel_strategy='farthest') # idx_tmp is relative to the feature selected in the # previous step and we need to find the indirection index_target_class = idx_vec_farthest[index_target_class] @@ -292,8 +297,9 @@ def _sample(self, X, y): index_target_class = slice(None) idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + (idx_under, + np.flatnonzero(y == target_class)[index_target_class]), + axis=0) if self.return_indices: return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index 6b70b069c..706d70b70 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -16,12 +16,16 @@ from ..base import BaseCleaningSampler from .edited_nearest_neighbours import EditedNearestNeighbours from ...utils import check_neighbors_object +from ...utils import Substitution from ...utils.deprecation import deprecate_parameter - +from ...utils._docstring import _random_state_docstring SEL_KIND = ('all', 'mode') +@Substitution( + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class NeighbourhoodCleaningRule(BaseCleaningSampler): """Class performing under-sampling based on the neighbourhood cleaning rule. @@ -30,37 +34,13 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - .. warning:: - This algorithm is a cleaning under-sampling method. When providing a - ``dict``, only the targeted classes will be used; the number of - samples will be discarded. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. 
- random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} .. deprecated:: 0.4 ``random_state`` is deprecated in 0.4 and will be removed in 0.6. @@ -83,6 +63,11 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- See the original paper: [1]_. @@ -108,24 +93,26 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> ncr = NeighbourhoodCleaningRule() >>> X_res, y_res = ncr.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({1: 877, 0: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{1: 877, 0: 100}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, n_neighbors=3, kind_sel='all', threshold_cleaning=0.5, - n_jobs=1): - super(NeighbourhoodCleaningRule, self).__init__(ratio=ratio) + n_jobs=1, + ratio=None): + super(NeighbourhoodCleaningRule, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -140,17 +127,17 @@ def _validate_estimator(self): if self.random_state is not None: deprecate_parameter(self, '0.4', 'random_state') - self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, - additional_neighbor=1) + self.nn_ = check_neighbors_object( + 'n_neighbors', self.n_neighbors, additional_neighbor=1) self.nn_.set_params(**{'n_jobs': self.n_jobs}) if self.kind_sel not in SEL_KIND: raise NotImplementedError if self.threshold_cleaning > 1 or self.threshold_cleaning < 0: - raise ValueError("'threshold_cleaning' is a value between 0 and 1." - " Got {} instead.".format( - self.threshold_cleaning)) + raise ValueError( + "'threshold_cleaning' is a value between 0 and 1." + " Got {} instead.".format(self.threshold_cleaning)) def _sample(self, X, y): """Resample the dataset. 
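The hunks above migrate ``NeighbourhoodCleaningRule`` from ``ratio`` to
``sampling_strategy`` while keeping ``ratio`` as a deprecated keyword until
0.6. As a minimal usage sketch of the new signature, assuming only the classes
and parameters shown in this patch (the resampled counts are data-dependent
and therefore not asserted)::

    from sklearn.datasets import make_classification
    from imblearn.under_sampling import NeighbourhoodCleaningRule

    # a toy imbalanced problem, as in the doctest examples of this patch
    X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                               n_samples=1000, random_state=10)

    # preferred spelling after this patch; 'auto' keeps the former default
    ncr = NeighbourhoodCleaningRule(sampling_strategy='auto')
    X_res, y_res = ncr.fit_sample(X, y)

    # the old keyword is still accepted during the deprecation cycle
    # (it is forwarded to the base class) and is scheduled for removal in 0.6
    ncr_legacy = NeighbourhoodCleaningRule(ratio='auto')
    X_res, y_res = ncr_legacy.fit_sample(X, y)

Keeping ``ratio=None`` in the constructor and forwarding it to the base class
lets existing pipelines run unchanged through the deprecation period.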
@@ -178,10 +165,13 @@ def _sample(self, X, y): """ self._validate_estimator() - enn = EditedNearestNeighbours(ratio=self.ratio, return_indices=True, - n_neighbors=self.n_neighbors, - kind_sel='mode', - n_jobs=self.n_jobs) + enn = EditedNearestNeighbours( + sampling_strategy=self.sampling_strategy, + return_indices=True, + n_neighbors=self.n_neighbors, + kind_sel='mode', + n_jobs=self.n_jobs, + ratio=self.ratio) _, _, index_not_a1 = enn.fit_sample(X, y) index_a1 = np.ones(y.shape, dtype=bool) index_a1[index_not_a1] = False @@ -191,16 +181,16 @@ def _sample(self, X, y): target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) # compute which classes to consider for cleaning for the A2 group - classes_under_sample = [c for c, n_samples in target_stats.items() - if (c in self.ratio_.keys() and - (n_samples > X.shape[0] * - self.threshold_cleaning))] + classes_under_sample = [ + c for c, n_samples in target_stats.items() + if (c in self.sampling_strategy_.keys() and ( + n_samples > X.shape[0] * self.threshold_cleaning)) + ] self.nn_.fit(X) class_minority_indices = np.flatnonzero(y == class_minority) X_class = safe_indexing(X, class_minority_indices) y_class = safe_indexing(y, class_minority_indices) - nnhood_idx = self.nn_.kneighbors( - X_class, return_distance=False)[:, 1:] + nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] if self.kind_sel == 'mode': nnhood_label_majority, _ = mode(nnhood_label, axis=1) @@ -212,8 +202,8 @@ def _sample(self, X, y): raise NotImplementedError # compute a2 group index_a2 = np.ravel(nnhood_idx[~nnhood_bool]) - index_a2 = np.unique([index for index in index_a2 - if y[index] in classes_under_sample]) + index_a2 = np.unique( + [index for index in index_a2 if y[index] in classes_under_sample]) union_a1_a2 = np.union1d(index_a1, index_a2).astype(int) selected_samples = np.ones(y.shape, dtype=bool) @@ -221,9 +211,8 @@ def _sample(self, X, y): index_target_class = np.flatnonzero(selected_samples) if self.return_indices: - return (safe_indexing(X, index_target_class), - safe_indexing(y, index_target_class), - index_target_class) + return (safe_indexing(X, index_target_class), safe_indexing( + y, index_target_class), index_target_class) else: - return (safe_indexing(X, index_target_class), - safe_indexing(y, index_target_class)) + return (safe_indexing(X, index_target_class), safe_indexing( + y, index_target_class)) diff --git a/imblearn/under_sampling/prototype_selection/one_sided_selection.py b/imblearn/under_sampling/prototype_selection/one_sided_selection.py index 0bcc17df1..314304dbe 100644 --- a/imblearn/under_sampling/prototype_selection/one_sided_selection.py +++ b/imblearn/under_sampling/prototype_selection/one_sided_selection.py @@ -14,8 +14,13 @@ from ..base import BaseCleaningSampler from .tomek_links import TomekLinks +from ...utils import Substitution +from ...utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class OneSidedSelection(BaseCleaningSampler): """Class to perform under-sampling based on one-sided selection method. @@ -23,37 +28,13 @@ class OneSidedSelection(BaseCleaningSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. 
- - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - .. warning:: - This algorithm is a cleaning under-sampling method. When providing a - ``dict``, only the targeted classes will be used; the number of - samples will be discarded. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} n_neighbors : int or object, optional (default=\ KNeighborsClassifier(n_neighbors=1)) @@ -68,6 +49,11 @@ class OneSidedSelection(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- The method is based on [1]_. @@ -94,23 +80,25 @@ class OneSidedSelection(BaseCleaningSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> oss = OneSidedSelection(random_state=42) >>> X_res, y_res = oss.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({1: 495, 0: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{1: 495, 0: 100}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, n_neighbors=None, n_seeds_S=1, - n_jobs=1): - super(OneSidedSelection, self).__init__(ratio=ratio) + n_jobs=1, + ratio=None): + super(OneSidedSelection, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.n_neighbors = n_neighbors @@ -165,12 +153,13 @@ def _sample(self, X, y): idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): - if target_class in self.ratio_.keys(): + if target_class in self.sampling_strategy_.keys(): # select a sample from the current class idx_maj = np.flatnonzero(y == target_class) idx_maj_sample = idx_maj[random_state.randint( - low=0, high=target_stats[target_class], - size=self.n_seeds_S)] + low=0, + high=target_stats[target_class], + size=self.n_seeds_S)] minority_class_indices = np.flatnonzero(y == class_minority) C_indices = np.append(minority_class_indices, idx_maj_sample) @@ -200,9 +189,10 @@ def _sample(self, X, y): y_resampled = safe_indexing(y, idx_under) # apply Tomek cleaning - tl = TomekLinks(ratio=self.ratio_, return_indices=True) - X_cleaned, y_cleaned, idx_cleaned = tl.fit_sample(X_resampled, - y_resampled) + tl = TomekLinks( + sampling_strategy=self.sampling_strategy_, return_indices=True) + X_cleaned, y_cleaned, idx_cleaned = tl.fit_sample( + X_resampled, y_resampled) idx_under = safe_indexing(idx_under, idx_cleaned) if self.return_indices: diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index 2bce9a251..437727053 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -10,8 +10,13 @@ from sklearn.utils import check_random_state, safe_indexing from ..base import BaseUnderSampler +from ...utils import Substitution +from ...utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class RandomUnderSampler(BaseUnderSampler): """Class to perform random under-sampling. @@ -22,42 +27,28 @@ class RandomUnderSampler(BaseUnderSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. 
The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} replacement : boolean, optional (default=False) Whether the sample is with or without replacement. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- Supports mutli-class resampling by sampling each class independently. See - :ref:`sphx_glr_auto_examples_plot_ratio_usage.py` and + :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py` and :ref:`sphx_glr_auto_examples_under-sampling_plot_random_under_sampler.py` Examples @@ -70,21 +61,23 @@ class RandomUnderSampler(BaseUnderSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> rus = RandomUnderSampler(random_state=42) >>> X_res, y_res = rus.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 100, 1: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{0: 100, 1: 100}}) """ def __init__(self, - ratio='auto', + sampling_strategy='auto', return_indices=False, random_state=None, - replacement=False): - super(RandomUnderSampler, self).__init__(ratio=ratio) + replacement=False, + ratio=None): + super(RandomUnderSampler, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.replacement = replacement @@ -120,8 +113,8 @@ def _sample(self, X, y): idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): - if target_class in self.ratio_.keys(): - n_samples = self.ratio_[target_class] + if target_class in self.sampling_strategy_.keys(): + n_samples = self.sampling_strategy_[target_class] index_target_class = random_state.choice( range(np.count_nonzero(y == target_class)), size=n_samples, @@ -130,8 +123,9 @@ def _sample(self, X, y): index_target_class = slice(None) idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + (idx_under, + np.flatnonzero(y == target_class)[index_target_class]), + axis=0) if self.return_indices: return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), diff --git a/imblearn/under_sampling/prototype_selection/tests/test_allknn.py 
b/imblearn/under_sampling/prototype_selection/tests/test_allknn.py index e00c154ae..60c6275f4 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_allknn.py @@ -15,26 +15,30 @@ from imblearn.under_sampling import AllKNN from imblearn.utils.testing import warns - -X = np.array([[-0.12840393, 0.66446571], [1.32319756, -0.13181616], - [0.04296502, -0.37981873], [0.83631853, 0.18569783], - [1.02956816, 0.36061601], [1.12202806, 0.33811558], - [-0.53171468, -0.53735182], [1.3381556, 0.35956356], - [-0.35946678, 0.72510189], [1.32326943, 0.28393874], - [2.94290565, -0.13986434], [0.28294738, -1.00125525], - [0.34218094, -0.58781961], [-0.88864036, -0.33782387], - [-1.10146139, 0.91782682], [-0.7969716, -0.50493969], - [0.73489726, 0.43915195], [0.2096964, -0.61814058], - [-0.28479268, 0.70459548], [1.84864913, 0.14729596], - [1.59068979, -0.96622933], [0.73418199, -0.02222847], - [0.50307437, 0.498805], [0.84929742, 0.41042894], - [0.62649535, 0.46600596], [0.79270821, -0.41386668], - [1.16606871, -0.25641059], [1.57356906, 0.30390519], - [1.0304995, -0.16955962], [1.67314371, 0.19231498], - [0.98382284, 0.37184502], [0.48921682, -1.38504507], - [-0.46226554, -0.50481004], [-0.03918551, -0.68540745], - [0.24991051, -1.00864997], [0.80541964, -0.34465185], - [0.1732627, -1.61323172], [0.69804044, 0.44810796], +X = np.array([[-0.12840393, 0.66446571], [1.32319756, -0.13181616], [ + 0.04296502, -0.37981873 +], [0.83631853, 0.18569783], [1.02956816, 0.36061601], [ + 1.12202806, 0.33811558 +], [-0.53171468, -0.53735182], [1.3381556, 0.35956356], [ + -0.35946678, 0.72510189 +], [1.32326943, 0.28393874], [2.94290565, -0.13986434], [ + 0.28294738, -1.00125525 +], [0.34218094, -0.58781961], [-0.88864036, -0.33782387], [ + -1.10146139, 0.91782682 +], [-0.7969716, -0.50493969], [0.73489726, 0.43915195], [ + 0.2096964, -0.61814058 +], [-0.28479268, 0.70459548], [1.84864913, 0.14729596], [ + 1.59068979, -0.96622933 +], [0.73418199, -0.02222847], [0.50307437, 0.498805], [0.84929742, 0.41042894], + [0.62649535, 0.46600596], [0.79270821, -0.41386668], [ + 1.16606871, -0.25641059 + ], [1.57356906, 0.30390519], [1.0304995, -0.16955962], [ + 1.67314371, 0.19231498 + ], [0.98382284, 0.37184502], [0.48921682, -1.38504507], [ + -0.46226554, -0.50481004 + ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [ + 0.80541964, -0.34465185 + ], [0.1732627, -1.61323172], [0.69804044, 0.44810796], [-0.5506368, -0.42072426], [-0.34474418, 0.21969797]]) Y = np.array([ 1, 2, 2, 2, 1, 1, 0, 2, 1, 1, 1, 2, 2, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, @@ -47,20 +51,23 @@ def test_allknn_fit_sample(): allknn = AllKNN() X_resampled, y_resampled = allknn.fit_sample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], - [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], - [1.02956816, 0.36061601], [1.12202806, 0.33811558], - [-1.10146139, 0.91782682], [0.73489726, 0.43915195], - [0.50307437, 0.498805], [0.84929742, 0.41042894], - [0.62649535, 0.46600596], [0.98382284, 0.37184502], - [0.69804044, 0.44810796], [0.04296502, -0.37981873], - [0.28294738, -1.00125525], [0.34218094, -0.58781961], - [0.2096964, -0.61814058], [1.59068979, -0.96622933], - [0.73418199, -0.02222847], [0.79270821, -0.41386668], - [1.16606871, -0.25641059], [1.0304995, -0.16955962], - [0.48921682, -1.38504507], [-0.03918551, -0.68540745], - [0.24991051, -1.00864997], [0.80541964, -0.34465185], - [0.1732627, -1.61323172]]) + X_gt = 
np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ + -0.46226554, -0.50481004 + ], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [ + 1.12202806, 0.33811558 + ], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [ + 0.50307437, 0.498805 + ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [ + 0.98382284, 0.37184502 + ], [0.69804044, 0.44810796], [0.04296502, -0.37981873], [ + 0.28294738, -1.00125525 + ], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [ + 1.59068979, -0.96622933 + ], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [ + 1.16606871, -0.25641059 + ], [1.0304995, -0.16955962], [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], [0.24991051, -1.00864997], + [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) y_gt = np.array([ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 @@ -70,10 +77,17 @@ def test_allknn_fit_sample(): def test_all_knn_allow_minority(): - X, y = make_classification(n_samples=10000, n_features=2, n_informative=2, - n_redundant=0, n_repeated=0, n_classes=3, - n_clusters_per_class=1, weights=[0.2, 0.3, 0.5], - class_sep=0.4, random_state=0) + X, y = make_classification( + n_samples=10000, + n_features=2, + n_informative=2, + n_redundant=0, + n_repeated=0, + n_classes=3, + n_clusters_per_class=1, + weights=[0.2, 0.3, 0.5], + class_sep=0.4, + random_state=0) allknn = AllKNN(allow_minority=True) X_res_1, y_res_1 = allknn.fit_sample(X, y) @@ -86,20 +100,23 @@ def test_allknn_fit_sample_with_indices(): allknn = AllKNN(return_indices=True) X_resampled, y_resampled, idx_under = allknn.fit_sample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], - [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], - [1.02956816, 0.36061601], [1.12202806, 0.33811558], - [-1.10146139, 0.91782682], [0.73489726, 0.43915195], - [0.50307437, 0.498805], [0.84929742, 0.41042894], - [0.62649535, 0.46600596], [0.98382284, 0.37184502], - [0.69804044, 0.44810796], [0.04296502, -0.37981873], - [0.28294738, -1.00125525], [0.34218094, -0.58781961], - [0.2096964, -0.61814058], [1.59068979, -0.96622933], - [0.73418199, -0.02222847], [0.79270821, -0.41386668], - [1.16606871, -0.25641059], [1.0304995, -0.16955962], - [0.48921682, -1.38504507], [-0.03918551, -0.68540745], - [0.24991051, -1.00864997], [0.80541964, -0.34465185], - [0.1732627, -1.61323172]]) + X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ + -0.46226554, -0.50481004 + ], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [ + 1.12202806, 0.33811558 + ], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [ + 0.50307437, 0.498805 + ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [ + 0.98382284, 0.37184502 + ], [0.69804044, 0.44810796], [0.04296502, -0.37981873], [ + 0.28294738, -1.00125525 + ], [0.34218094, -0.58781961], [0.2096964, -0.61814058], [ + 1.59068979, -0.96622933 + ], [0.73418199, -0.02222847], [0.79270821, -0.41386668], [ + 1.16606871, -0.25641059 + ], [1.0304995, -0.16955962], [0.48921682, -1.38504507], + [-0.03918551, -0.68540745], [0.24991051, -1.00864997], + [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) y_gt = np.array([ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 @@ -117,22 +134,26 @@ def test_allknn_fit_sample_mode(): allknn = AllKNN(kind_sel='mode') X_resampled, y_resampled = allknn.fit_sample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], - [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], - 
[-0.12840393, 0.66446571], [1.02956816, 0.36061601], - [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [-1.10146139, 0.91782682], [0.73489726, 0.43915195], - [-0.28479268, 0.70459548], [0.50307437, 0.498805], - [0.84929742, 0.41042894], [0.62649535, 0.46600596], - [0.98382284, 0.37184502], [0.69804044, 0.44810796], - [1.32319756, -0.13181616], [0.04296502, -0.37981873], - [0.28294738, -1.00125525], [0.34218094, -0.58781961], - [0.2096964, -0.61814058], [1.59068979, -0.96622933], - [0.73418199, -0.02222847], [0.79270821, -0.41386668], - [1.16606871, -0.25641059], [1.0304995, -0.16955962], - [0.48921682, -1.38504507], [-0.03918551, -0.68540745], - [0.24991051, -1.00864997], [0.80541964, -0.34465185], - [0.1732627, -1.61323172]]) + X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ + -0.46226554, -0.50481004 + ], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [ + 1.02956816, 0.36061601 + ], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [ + -1.10146139, 0.91782682 + ], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [ + 0.50307437, 0.498805 + ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [ + 0.98382284, 0.37184502 + ], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [ + 0.04296502, -0.37981873 + ], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [ + 0.2096964, -0.61814058 + ], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [ + 0.79270821, -0.41386668 + ], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [ + 0.48921682, -1.38504507 + ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], + [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) y_gt = np.array([ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 @@ -146,22 +167,26 @@ def test_allknn_fit_sample_with_nn_object(): allknn = AllKNN(n_neighbors=nn, kind_sel='mode') X_resampled, y_resampled = allknn.fit_sample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], - [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], - [-0.12840393, 0.66446571], [1.02956816, 0.36061601], - [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [-1.10146139, 0.91782682], [0.73489726, 0.43915195], - [-0.28479268, 0.70459548], [0.50307437, 0.498805], - [0.84929742, 0.41042894], [0.62649535, 0.46600596], - [0.98382284, 0.37184502], [0.69804044, 0.44810796], - [1.32319756, -0.13181616], [0.04296502, -0.37981873], - [0.28294738, -1.00125525], [0.34218094, -0.58781961], - [0.2096964, -0.61814058], [1.59068979, -0.96622933], - [0.73418199, -0.02222847], [0.79270821, -0.41386668], - [1.16606871, -0.25641059], [1.0304995, -0.16955962], - [0.48921682, -1.38504507], [-0.03918551, -0.68540745], - [0.24991051, -1.00864997], [0.80541964, -0.34465185], - [0.1732627, -1.61323172]]) + X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ + -0.46226554, -0.50481004 + ], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [ + 1.02956816, 0.36061601 + ], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [ + -1.10146139, 0.91782682 + ], [0.73489726, 0.43915195], [-0.28479268, 0.70459548], [ + 0.50307437, 0.498805 + ], [0.84929742, 0.41042894], [0.62649535, 0.46600596], [ + 0.98382284, 0.37184502 + ], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [ + 0.04296502, -0.37981873 + ], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [ + 0.2096964, -0.61814058 + ], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [ + 0.79270821, -0.41386668 + ], [1.16606871, -0.25641059], [1.0304995, 
-0.16955962], [ + 0.48921682, -1.38504507 + ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], + [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) y_gt = np.array([ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 @@ -179,6 +204,6 @@ def test_alknn_not_good_object(): def test_deprecation_random_state(): allknn = AllKNN(random_state=0) - with warns(DeprecationWarning, - match="'random_state' is deprecated from 0.4"): + with warns( + DeprecationWarning, match="'random_state' is deprecated from 0.4"): allknn.fit_sample(X, Y) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/prototype_selection/tests/test_condensed_nearest_neighbour.py index 28917c38b..e45a51b24 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_condensed_nearest_neighbour.py @@ -14,16 +14,19 @@ from imblearn.under_sampling import CondensedNearestNeighbour RND_SEED = 0 -X = np.array([[2.59928271, 0.93323465], [0.25738379, 0.95564169], - [1.42772181, 0.526027], [1.92365863, 0.82718767], - [-0.10903849, -0.12085181], [-0.284881, -0.62730973], - [0.57062627, 1.19528323], [0.03394306, 0.03986753], - [0.78318102, 2.59153329], [0.35831463, 1.33483198], - [-0.14313184, -1.0412815], [0.01936241, 0.17799828], - [-1.25020462, -0.40402054], [-0.09816301, -0.74662486], - [-0.01252787, 0.34102657], [0.52726792, -0.38735648], - [0.2821046, -0.07862747], [0.05230552, 0.09043907], - [0.15198585, 0.12512646], [0.70524765, 0.39816382]]) +X = np.array([[2.59928271, 0.93323465], [0.25738379, 0.95564169], [ + 1.42772181, 0.526027 +], [1.92365863, 0.82718767], [-0.10903849, + -0.12085181], [-0.284881, -0.62730973], + [0.57062627, 1.19528323], [0.03394306, + 0.03986753], [0.78318102, 2.59153329], + [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [ + 0.01936241, 0.17799828 + ], [-1.25020462, -0.40402054], [-0.09816301, -0.74662486], [ + -0.01252787, 0.34102657 + ], [0.52726792, -0.38735648], [0.2821046, -0.07862747], [ + 0.05230552, 0.09043907 + ], [0.15198585, 0.12512646], [0.70524765, 0.39816382]]) Y = np.array([1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 1]) @@ -38,9 +41,10 @@ def test_cnn_fit_sample(): cnn = CondensedNearestNeighbour(random_state=RND_SEED) X_resampled, y_resampled = cnn.fit_sample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], - [0.05230552, 0.09043907], [-1.25020462, -0.40402054], - [0.70524765, 0.39816382], [0.35831463, 1.33483198], + X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ + 0.05230552, 0.09043907 + ], [-1.25020462, -0.40402054], [0.70524765, + 0.39816382], [0.35831463, 1.33483198], [-0.284881, -0.62730973], [0.03394306, 0.03986753], [-0.01252787, 0.34102657], [0.15198585, 0.12512646]]) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2]) @@ -52,9 +56,10 @@ def test_cnn_fit_sample_with_indices(): cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], - [0.05230552, 0.09043907], [-1.25020462, -0.40402054], - [0.70524765, 0.39816382], [0.35831463, 1.33483198], + X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ + 0.05230552, 0.09043907 + ], [-1.25020462, -0.40402054], [0.70524765, + 0.39816382], [0.35831463, 1.33483198], [-0.284881, -0.62730973], 
[0.03394306, 0.03986753], [-0.01252787, 0.34102657], [0.15198585, 0.12512646]]) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2]) @@ -69,9 +74,10 @@ def test_cnn_fit_sample_with_object(): cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn) X_resampled, y_resampled = cnn.fit_sample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], - [0.05230552, 0.09043907], [-1.25020462, -0.40402054], - [0.70524765, 0.39816382], [0.35831463, 1.33483198], + X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ + 0.05230552, 0.09043907 + ], [-1.25020462, -0.40402054], [0.70524765, + 0.39816382], [0.35831463, 1.33483198], [-0.284881, -0.62730973], [0.03394306, 0.03986753], [-0.01252787, 0.34102657], [0.15198585, 0.12512646]]) y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2]) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/tests/test_edited_nearest_neighbours.py index ec223285e..bc8c825b6 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_edited_nearest_neighbours.py @@ -15,16 +15,19 @@ from imblearn.under_sampling import EditedNearestNeighbours from imblearn.utils.testing import warns -X = np.array([[2.59928271, 0.93323465], [0.25738379, 0.95564169], - [1.42772181, 0.526027], [1.92365863, 0.82718767], - [-0.10903849, -0.12085181], [-0.284881, -0.62730973], - [0.57062627, 1.19528323], [0.03394306, 0.03986753], - [0.78318102, 2.59153329], [0.35831463, 1.33483198], - [-0.14313184, -1.0412815], [0.01936241, 0.17799828], - [-1.25020462, -0.40402054], [-0.09816301, -0.74662486], - [-0.01252787, 0.34102657], [0.52726792, -0.38735648], - [0.2821046, -0.07862747], [0.05230552, 0.09043907], - [0.15198585, 0.12512646], [0.70524765, 0.39816382]]) +X = np.array([[2.59928271, 0.93323465], [0.25738379, 0.95564169], [ + 1.42772181, 0.526027 +], [1.92365863, 0.82718767], [-0.10903849, + -0.12085181], [-0.284881, -0.62730973], + [0.57062627, 1.19528323], [0.03394306, + 0.03986753], [0.78318102, 2.59153329], + [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [ + 0.01936241, 0.17799828 + ], [-1.25020462, -0.40402054], [-0.09816301, -0.74662486], [ + -0.01252787, 0.34102657 + ], [0.52726792, -0.38735648], [0.2821046, -0.07862747], [ + 0.05230552, 0.09043907 + ], [0.15198585, 0.12512646], [0.70524765, 0.39816382]]) Y = np.array([1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 1]) @@ -40,10 +43,10 @@ def test_enn_fit_sample(): enn = EditedNearestNeighbours() X_resampled, y_resampled = enn.fit_sample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], - [2.59928271, 0.93323465], [1.92365863, 0.82718767], - [0.25738379, 0.95564169], [0.78318102, 2.59153329], - [0.52726792, -0.38735648]]) + X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ + 2.59928271, 0.93323465 + ], [1.92365863, 0.82718767], [0.25738379, 0.95564169], + [0.78318102, 2.59153329], [0.52726792, -0.38735648]]) y_gt = np.array([0, 0, 1, 1, 2, 2, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -53,10 +56,10 @@ def test_enn_fit_sample_with_indices(): enn = EditedNearestNeighbours(return_indices=True) X_resampled, y_resampled, idx_under = enn.fit_sample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], - [2.59928271, 0.93323465], [1.92365863, 0.82718767], - [0.25738379, 0.95564169], 
[0.78318102, 2.59153329], - [0.52726792, -0.38735648]]) + X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ + 2.59928271, 0.93323465 + ], [1.92365863, 0.82718767], [0.25738379, 0.95564169], + [0.78318102, 2.59153329], [0.52726792, -0.38735648]]) y_gt = np.array([0, 0, 1, 1, 2, 2, 2]) idx_gt = np.array([4, 11, 0, 3, 1, 8, 15]) assert_array_equal(X_resampled, X_gt) @@ -68,10 +71,11 @@ def test_enn_fit_sample_mode(): enn = EditedNearestNeighbours(kind_sel='mode') X_resampled, y_resampled = enn.fit_sample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], - [2.59928271, 0.93323465], [1.42772181, 0.526027], - [1.92365863, 0.82718767], [0.25738379, 0.95564169], - [-0.284881, -0.62730973], [0.57062627, 1.19528323], + X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ + 2.59928271, 0.93323465 + ], [1.42772181, 0.526027], [1.92365863, 0.82718767], [ + 0.25738379, 0.95564169 + ], [-0.284881, -0.62730973], [0.57062627, 1.19528323], [0.78318102, 2.59153329], [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [-0.09816301, -0.74662486], [0.52726792, -0.38735648], [0.2821046, -0.07862747]]) @@ -82,14 +86,14 @@ def test_enn_fit_sample_mode(): def test_enn_fit_sample_with_nn_object(): nn = NearestNeighbors(n_neighbors=4) - enn = EditedNearestNeighbours( - n_neighbors=nn, kind_sel='mode') + enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') X_resampled, y_resampled = enn.fit_sample(X, Y) - X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], - [2.59928271, 0.93323465], [1.42772181, 0.526027], - [1.92365863, 0.82718767], [0.25738379, 0.95564169], - [-0.284881, -0.62730973], [0.57062627, 1.19528323], + X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828], [ + 2.59928271, 0.93323465 + ], [1.42772181, 0.526027], [1.92365863, 0.82718767], [ + 0.25738379, 0.95564169 + ], [-0.284881, -0.62730973], [0.57062627, 1.19528323], [0.78318102, 2.59153329], [0.35831463, 1.33483198], [-0.14313184, -1.0412815], [-0.09816301, -0.74662486], [0.52726792, -0.38735648], [0.2821046, -0.07862747]]) @@ -100,14 +104,13 @@ def test_enn_fit_sample_with_nn_object(): def test_enn_not_good_object(): nn = 'rnd' - enn = EditedNearestNeighbours( - n_neighbors=nn, kind_sel='mode') + enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') with raises(ValueError, match="has to be one of"): enn.fit_sample(X, Y) def test_deprecation_random_state(): enn = EditedNearestNeighbours(random_state=0) - with warns(DeprecationWarning, - match="'random_state' is deprecated from 0.4"): + with warns( + DeprecationWarning, match="'random_state' is deprecated from 0.4"): enn.fit_sample(X, Y) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/prototype_selection/tests/test_instance_hardness_threshold.py index 11fb23941..a5fbf6931 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_instance_hardness_threshold.py @@ -13,26 +13,26 @@ from imblearn.under_sampling import InstanceHardnessThreshold - RND_SEED = 0 -X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.03852113, 0.40910479], [-0.43877303, 1.07366684], - [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], - [-0.30126957, -0.66268378], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [0.20246714, -0.34727125], - 
[1.06446472, -1.09279772], [0.30543283, -0.02589502], - [-0.00717161, 0.00318087]]) +X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], [ + -0.77740357, 0.74097941 +], [0.91542919, -0.65453327], [-0.03852113, 0.40910479], [ + -0.43877303, 1.07366684 +], [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], [ + -0.30126957, -0.66268378 +], [-0.65571327, 0.42412021], [-0.28305528, 0.30284991], + [0.20246714, -0.34727125], [1.06446472, -1.09279772], + [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]) ESTIMATOR = GradientBoostingClassifier(random_state=RND_SEED) def test_iht_init(): - ratio = 'auto' + sampling_strategy = 'auto' iht = InstanceHardnessThreshold( - ESTIMATOR, ratio=ratio, random_state=RND_SEED) + ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED) - assert iht.ratio == ratio + assert iht.sampling_strategy == sampling_strategy assert iht.random_state == RND_SEED @@ -40,18 +40,13 @@ def test_iht_fit_sample(): iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], - [0.91542919, -0.65453327], - [-0.65571327, 0.42412021], - [1.06446472, -1.09279772], - [0.30543283, -0.02589502], - [-0.00717161, 0.00318087], - [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], - [-0.43877303, 1.07366684], - [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], - [-0.28305528, 0.30284991]]) + X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ + -0.65571327, 0.42412021 + ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ + -0.00717161, 0.00318087 + ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], [-0.28305528, 0.30284991]]) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -62,18 +57,13 @@ def test_iht_fit_sample_with_indices(): ESTIMATOR, return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], - [0.91542919, -0.65453327], - [-0.65571327, 0.42412021], - [1.06446472, -1.09279772], - [0.30543283, -0.02589502], - [-0.00717161, 0.00318087], - [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], - [-0.43877303, 1.07366684], - [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], - [-0.28305528, 0.30284991]]) + X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ + -0.65571327, 0.42412021 + ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ + -0.00717161, 0.00318087 + ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], [-0.28305528, 0.30284991]]) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) idx_gt = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 7, 10]) assert_array_equal(X_resampled, X_gt) @@ -82,25 +72,19 @@ def test_iht_fit_sample_with_indices(): def test_iht_fit_sample_half(): - ratio = {0: 6, 1: 8} + sampling_strategy = {0: 6, 1: 8} iht = InstanceHardnessThreshold( - ESTIMATOR, ratio=ratio, random_state=RND_SEED) + ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], - [0.91542919, -0.65453327], - [-0.65571327, 0.42412021], - [1.06446472, -1.09279772], - [0.30543283, 
-0.02589502], - [-0.00717161, 0.00318087], - [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], - [-0.03852113, 0.40910479], - [-0.43877303, 1.07366684], - [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], - [-0.30126957, -0.66268378], - [-0.28305528, 0.30284991]]) + X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ + -0.65571327, 0.42412021 + ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ + -0.00717161, 0.00318087 + ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], + [-0.03852113, 0.40910479], [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], + [-0.30126957, -0.66268378], [-0.28305528, 0.30284991]]) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -111,18 +95,13 @@ def test_iht_fit_sample_class_obj(): iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], - [0.91542919, -0.65453327], - [-0.65571327, 0.42412021], - [1.06446472, -1.09279772], - [0.30543283, -0.02589502], - [-0.00717161, 0.00318087], - [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], - [-0.43877303, 1.07366684], - [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], - [-0.28305528, 0.30284991]]) + X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ + -0.65571327, 0.42412021 + ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ + -0.00717161, 0.00318087 + ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], [-0.28305528, 0.30284991]]) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/prototype_selection/tests/test_nearmiss.py index 1f2f29daf..b120e85bb 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_nearmiss.py @@ -14,22 +14,15 @@ from imblearn.under_sampling import NearMiss from imblearn.utils.testing import warns - -X = np.array([[1.17737838, -0.2002118], - [0.4960075, 0.86130762], - [-0.05903827, 0.10947647], - [0.91464286, 1.61369212], - [-0.54619583, 1.73009918], - [-0.60413357, 0.24628718], - [0.45713638, 1.31069295], - [-0.04032409, 3.01186964], - [0.03142011, 0.12323596], - [0.50701028, -0.17636928], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [0.99272351, -0.11631728], - [-1.95581933, 0.69609604], - [1.15157493, -1.2981518]]) +X = np.array([[1.17737838, -0.2002118], [0.4960075, 0.86130762], [ + -0.05903827, 0.10947647 +], [0.91464286, 1.61369212], [-0.54619583, 1.73009918], + [-0.60413357, 0.24628718], [0.45713638, 1.31069295], + [-0.04032409, 3.01186964], [0.03142011, 0.12323596], [ + 0.50701028, -0.17636928 + ], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], [ + 0.99272351, -0.11631728 + ], [-1.95581933, 0.69609604], [1.15157493, -1.2981518]]) Y = np.array([1, 2, 1, 0, 2, 1, 2, 2, 1, 2, 0, 0, 2, 1, 2]) VERSION_NEARMISS = (1, 2, 3) @@ -43,190 +36,163 @@ def test_nearmiss_wrong_version(): def test_nm_wrong_nn_obj(): - ratio = 'auto' + sampling_strategy = 'auto' nn = 'rnd' - nm = NearMiss(ratio=ratio, - version=VERSION_NEARMISS, - return_indices=True, - n_neighbors=nn) + nm = NearMiss( + 
sampling_strategy=sampling_strategy, + version=VERSION_NEARMISS, + return_indices=True, + n_neighbors=nn) with raises(ValueError, match="has to be one of"): nm.fit_sample(X, Y) nn3 = 'rnd' nn = NearestNeighbors(n_neighbors=3) - nm3 = NearMiss(ratio=ratio, - version=3, return_indices=True, - n_neighbors=nn, n_neighbors_ver3=nn3) + nm3 = NearMiss( + sampling_strategy=sampling_strategy, + version=3, + return_indices=True, + n_neighbors=nn, + n_neighbors_ver3=nn3) with raises(ValueError, match="has to be one of"): nm3.fit_sample(X, Y) def test_nm_fit_sample_auto(): - ratio = 'auto' - X_gt = [np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [-0.05903827, 0.10947647], - [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], - [0.50701028, -0.17636928], - [0.4960075, 0.86130762], - [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [-0.05903827, 0.10947647], - [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], - [0.50701028, -0.17636928], - [0.4960075, 0.86130762], - [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [1.17737838, -0.2002118], - [-0.60413357, 0.24628718], - [0.03142011, 0.12323596], - [1.15157493, -1.2981518], - [-0.54619583, 1.73009918], - [0.99272351, -0.11631728]])] - y_gt = [np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])] + sampling_strategy = 'auto' + X_gt = [ + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], [0.50701028, -0.17636928], + [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], [0.50701028, -0.17636928], + [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], + [0.03142011, 0.12323596], [1.15157493, -1.2981518], + [-0.54619583, 1.73009918], [0.99272351, -0.11631728]]) + ] + y_gt = [ + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ] for version_idx, version in enumerate(VERSION_NEARMISS): - nm = NearMiss(ratio=ratio, - version=version) + nm = NearMiss(sampling_strategy=sampling_strategy, version=version) X_resampled, y_resampled = nm.fit_sample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) def test_nm_fit_sample_auto_indices(): - ratio = 'auto' - X_gt = [np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [-0.05903827, 0.10947647], - [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], - [0.50701028, -0.17636928], - [0.4960075, 0.86130762], - [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [-0.05903827, 0.10947647], - [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], - [0.50701028, -0.17636928], - [0.4960075, 0.86130762], - [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [1.17737838, 
-0.2002118], - [-0.60413357, 0.24628718], - [0.03142011, 0.12323596], - [1.15157493, -1.2981518], - [-0.54619583, 1.73009918], - [0.99272351, -0.11631728]])] - y_gt = [np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])] - idx_gt = [np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]), - np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]), - np.array([3, 10, 11, 0, 5, 8, 14, 4, 12])] + sampling_strategy = 'auto' + X_gt = [ + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], [0.50701028, -0.17636928], + [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], [0.50701028, -0.17636928], + [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], + [0.03142011, 0.12323596], [1.15157493, -1.2981518], + [-0.54619583, 1.73009918], [0.99272351, -0.11631728]]) + ] + y_gt = [ + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ] + idx_gt = [ + np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]), + np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]), + np.array([3, 10, 11, 0, 5, 8, 14, 4, 12]) + ] for version_idx, version in enumerate(VERSION_NEARMISS): - nm = NearMiss(ratio=ratio, - version=version, return_indices=True) + nm = NearMiss( + sampling_strategy=sampling_strategy, + version=version, + return_indices=True) X_resampled, y_resampled, idx_under = nm.fit_sample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) assert_array_equal(idx_under, idx_gt[version_idx]) -def test_nm_fit_sample_float_ratio(): - ratio = {0: 3, 1: 4, 2: 4} - X_gt = [np.array([[-0.20497017, -0.26630228], - [-0.80809175, -1.09917302], - [0.91464286, 1.61369212], - [-0.05903827, 0.10947647], - [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], - [1.17737838, -0.2002118], - [0.50701028, -0.17636928], - [0.4960075, 0.86130762], - [0.45713638, 1.31069295], - [0.99272351, -0.11631728]]), - np.array([[-0.20497017, -0.26630228], - [-0.80809175, -1.09917302], - [0.91464286, 1.61369212], - [-0.05903827, 0.10947647], - [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], - [1.17737838, -0.2002118], - [0.50701028, -0.17636928], - [0.4960075, 0.86130762], - [0.45713638, 1.31069295], - [0.99272351, -0.11631728]]), - np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [1.17737838, -0.2002118], - [-0.60413357, 0.24628718], - [0.03142011, 0.12323596], - [-0.05903827, 0.10947647], - [1.15157493, -1.2981518], - [-0.54619583, 1.73009918], - [0.99272351, -0.11631728], - [0.45713638, 1.31069295]])] - y_gt = [np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])] +def test_nm_fit_sample_float_sampling_strategy(): + sampling_strategy = {0: 3, 1: 4, 2: 4} + X_gt = [ + np.array([[-0.20497017, -0.26630228], [-0.80809175, -1.09917302], [ + 0.91464286, 1.61369212 + ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], [1.17737838, -0.2002118], + [0.50701028, -0.17636928], [0.4960075, 
0.86130762], + [0.45713638, 1.31069295], [0.99272351, -0.11631728]]), + np.array([[-0.20497017, -0.26630228], [-0.80809175, -1.09917302], [ + 0.91464286, 1.61369212 + ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], [1.17737838, -0.2002118], + [0.50701028, -0.17636928], [0.4960075, 0.86130762], + [0.45713638, 1.31069295], [0.99272351, -0.11631728]]), + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], + [0.03142011, 0.12323596], [-0.05903827, 0.10947647], + [1.15157493, -1.2981518], [-0.54619583, 1.73009918], + [0.99272351, -0.11631728], [0.45713638, 1.31069295]]) + ] + y_gt = [ + np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), + np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), + np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]) + ] for version_idx, version in enumerate(VERSION_NEARMISS): - nm = NearMiss(ratio=ratio, - version=version) + nm = NearMiss(sampling_strategy=sampling_strategy, version=version) X_resampled, y_resampled = nm.fit_sample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, y_gt[version_idx]) def test_nm_fit_sample_nn_obj(): - ratio = 'auto' + sampling_strategy = 'auto' nn = NearestNeighbors(n_neighbors=3) - X_gt = [np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [-0.05903827, 0.10947647], - [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], - [0.50701028, -0.17636928], - [0.4960075, 0.86130762], - [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [-0.05903827, 0.10947647], - [0.03142011, 0.12323596], - [-0.60413357, 0.24628718], - [0.50701028, -0.17636928], - [0.4960075, 0.86130762], - [0.45713638, 1.31069295]]), - np.array([[0.91464286, 1.61369212], - [-0.80809175, -1.09917302], - [-0.20497017, -0.26630228], - [1.17737838, -0.2002118], - [-0.60413357, 0.24628718], - [0.03142011, 0.12323596], - [1.15157493, -1.2981518], - [-0.54619583, 1.73009918], - [0.99272351, -0.11631728]])] - y_gt = [np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])] + X_gt = [ + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], [0.50701028, -0.17636928], + [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [-0.05903827, 0.10947647], [0.03142011, 0.12323596], + [-0.60413357, 0.24628718], [0.50701028, -0.17636928], + [0.4960075, 0.86130762], [0.45713638, 1.31069295]]), + np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [ + -0.20497017, -0.26630228 + ], [1.17737838, -0.2002118], [-0.60413357, 0.24628718], + [0.03142011, 0.12323596], [1.15157493, -1.2981518], + [-0.54619583, 1.73009918], [0.99272351, -0.11631728]]) + ] + y_gt = [ + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + ] for version_idx, version in enumerate(VERSION_NEARMISS): - nm = NearMiss(ratio=ratio, - version=version, n_neighbors=nn) + nm = NearMiss( + sampling_strategy=sampling_strategy, + version=version, + n_neighbors=nn) X_resampled, y_resampled = nm.fit_sample(X, Y) assert_array_equal(X_resampled, X_gt[version_idx]) assert_array_equal(y_resampled, 
y_gt[version_idx]) @@ -234,6 +200,6 @@ def test_nm_fit_sample_nn_obj(): def test_deprecation_random_state(): nm = NearMiss(random_state=0) - with warns(DeprecationWarning, - match="'random_state' is deprecated from 0.4"): + with warns( + DeprecationWarning, match="'random_state' is deprecated from 0.4"): nm.fit_sample(X, Y) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/tests/test_neighbourhood_cleaning_rule.py index bdf5fe309..9d2c51920 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -12,26 +12,31 @@ from imblearn.under_sampling import NeighbourhoodCleaningRule from imblearn.utils.testing import warns -X = np.array([[1.57737838, 0.1997882], [0.8960075, 0.46130762], - [0.34096173, 0.50947647], [-0.91735824, 0.93110278], - [-0.14619583, 1.33009918], [-0.20413357, 0.64628718], - [0.85713638, 0.91069295], [0.35967591, 2.61186964], - [0.43142011, 0.52323596], [0.90701028, -0.57636928], - [-1.20809175, -1.49917302], [-0.60497017, -0.66630228], - [1.39272351, -0.51631728], [-1.55581933, 1.09609604], - [1.55157493, -1.6981518]]) +X = np.array([[1.57737838, 0.1997882], [0.8960075, 0.46130762], [ + 0.34096173, 0.50947647 +], [-0.91735824, 0.93110278], [-0.14619583, 1.33009918], + [-0.20413357, 0.64628718], [0.85713638, 0.91069295], + [0.35967591, 2.61186964], [0.43142011, 0.52323596], [ + 0.90701028, -0.57636928 + ], [-1.20809175, -1.49917302], [-0.60497017, -0.66630228], [ + 1.39272351, -0.51631728 + ], [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) Y = np.array([1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 0, 0, 2, 1, 2]) def test_ncr_error(): threshold_cleaning = -10 - with raises(ValueError, match=("'threshold_cleaning' is a value between" - " 0 and 1")): + with raises( + ValueError, + match=("'threshold_cleaning' is a value between" + " 0 and 1")): NeighbourhoodCleaningRule( threshold_cleaning=threshold_cleaning).fit_sample(X, Y) threshold_cleaning = 10 - with raises(ValueError, match=("'threshold_cleaning' is a value between" - " 0 and 1")): + with raises( + ValueError, + match=("'threshold_cleaning' is a value between" + " 0 and 1")): NeighbourhoodCleaningRule( threshold_cleaning=threshold_cleaning).fit_sample(X, Y) @@ -40,16 +45,12 @@ def test_ncr_fit_sample(): ncr = NeighbourhoodCleaningRule() X_resampled, y_resampled = ncr.fit_sample(X, Y) - X_gt = np.array([[0.34096173, 0.50947647], - [-0.91735824, 0.93110278], - [-0.20413357, 0.64628718], - [0.35967591, 2.61186964], - [0.90701028, -0.57636928], - [-1.20809175, -1.49917302], - [-0.60497017, -0.66630228], - [1.39272351, -0.51631728], - [-1.55581933, 1.09609604], - [1.55157493, -1.6981518]]) + X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ + -0.20413357, 0.64628718 + ], [0.35967591, 2.61186964], [0.90701028, + -0.57636928], [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -59,16 +60,12 @@ def test_ncr_fit_sample_mode(): ncr = NeighbourhoodCleaningRule(kind_sel='mode') X_resampled, y_resampled = ncr.fit_sample(X, Y) - X_gt = np.array([[0.34096173, 0.50947647], - [-0.91735824, 0.93110278], - [-0.20413357, 0.64628718], - [0.35967591, 2.61186964], - [0.90701028, 
-0.57636928], - [-1.20809175, -1.49917302], - [-0.60497017, -0.66630228], - [1.39272351, -0.51631728], - [-1.55581933, 1.09609604], - [1.55157493, -1.6981518]]) + X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ + -0.20413357, 0.64628718 + ], [0.35967591, 2.61186964], [0.90701028, + -0.57636928], [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -78,16 +75,12 @@ def test_ncr_fit_sample_with_indices(): ncr = NeighbourhoodCleaningRule(return_indices=True) X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y) - X_gt = np.array([[0.34096173, 0.50947647], - [-0.91735824, 0.93110278], - [-0.20413357, 0.64628718], - [0.35967591, 2.61186964], - [0.90701028, -0.57636928], - [-1.20809175, -1.49917302], - [-0.60497017, -0.66630228], - [1.39272351, -0.51631728], - [-1.55581933, 1.09609604], - [1.55157493, -1.6981518]]) + X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ + -0.20413357, 0.64628718 + ], [0.35967591, 2.61186964], [0.90701028, + -0.57636928], [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14]) assert_array_equal(X_resampled, X_gt) @@ -97,20 +90,15 @@ def test_ncr_fit_sample_with_indices(): def test_ncr_fit_sample_nn_obj(): nn = NearestNeighbors(n_neighbors=4) - ncr = NeighbourhoodCleaningRule( - return_indices=True, n_neighbors=nn) + ncr = NeighbourhoodCleaningRule(return_indices=True, n_neighbors=nn) X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y) - X_gt = np.array([[0.34096173, 0.50947647], - [-0.91735824, 0.93110278], - [-0.20413357, 0.64628718], - [0.35967591, 2.61186964], - [0.90701028, -0.57636928], - [-1.20809175, -1.49917302], - [-0.60497017, -0.66630228], - [1.39272351, -0.51631728], - [-1.55581933, 1.09609604], - [1.55157493, -1.6981518]]) + X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], [ + -0.20413357, 0.64628718 + ], [0.35967591, 2.61186964], [0.90701028, + -0.57636928], [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14]) assert_array_equal(X_resampled, X_gt) @@ -120,14 +108,13 @@ def test_ncr_fit_sample_nn_obj(): def test_ncr_wrong_nn_obj(): nn = 'rnd' - ncr = NeighbourhoodCleaningRule( - return_indices=True, n_neighbors=nn) + ncr = NeighbourhoodCleaningRule(return_indices=True, n_neighbors=nn) with raises(ValueError, match="has to be one of"): ncr.fit_sample(X, Y) def test_deprecation_random_state(): ncr = NeighbourhoodCleaningRule(random_state=0) - with warns(DeprecationWarning, - match="'random_state' is deprecated from 0.4"): + with warns( + DeprecationWarning, match="'random_state' is deprecated from 0.4"): ncr.fit_sample(X, Y) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_one_sided_selection.py b/imblearn/under_sampling/prototype_selection/tests/test_one_sided_selection.py index e5faf886a..cce6c386d 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_one_sided_selection.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_one_sided_selection.py @@ 
-15,14 +15,15 @@ from imblearn.under_sampling import OneSidedSelection RND_SEED = 0 -X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.03852113, 0.40910479], [-0.43877303, 1.07366684], - [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], - [-0.30126957, -0.66268378], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [0.20246714, -0.34727125], - [1.06446472, -1.09279772], [0.30543283, -0.02589502], - [-0.00717161, 0.00318087]]) +X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], [ + -0.77740357, 0.74097941 +], [0.91542919, -0.65453327], [-0.03852113, 0.40910479], [ + -0.43877303, 1.07366684 +], [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], [ + -0.30126957, -0.66268378 +], [-0.65571327, 0.42412021], [-0.28305528, 0.30284991], + [0.20246714, -0.34727125], [1.06446472, -1.09279772], + [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]) @@ -38,10 +39,11 @@ def test_oss_fit_sample(): oss = OneSidedSelection(random_state=RND_SEED) X_resampled, y_resampled = oss.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], - [-0.65571327, 0.42412021], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087], - [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], + X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ + -0.65571327, 0.42412021 + ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ + -0.00717161, 0.00318087 + ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], [-0.30126957, -0.66268378], [0.20246714, -0.34727125]]) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) @@ -53,10 +55,11 @@ def test_oss_fit_sample_with_indices(): oss = OneSidedSelection(return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = oss.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], - [-0.65571327, 0.42412021], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087], - [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], + X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ + -0.65571327, 0.42412021 + ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ + -0.00717161, 0.00318087 + ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], [-0.30126957, -0.66268378], [0.20246714, -0.34727125]]) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) @@ -71,10 +74,11 @@ def test_oss_with_object(): oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) X_resampled, y_resampled = oss.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], - [-0.65571327, 0.42412021], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087], - [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], + X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327], [ + -0.65571327, 0.42412021 + ], [1.06446472, -1.09279772], [0.30543283, -0.02589502], [ + -0.00717161, 0.00318087 + ], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], [-0.30126957, -0.66268378], [0.20246714, -0.34727125]]) y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py 
b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py index c0b6249be..962cd12fb 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py @@ -13,17 +13,16 @@ from imblearn.under_sampling import RandomUnderSampler RND_SEED = 0 -X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], - [0.20792588, 1.49407907], [0.47104475, 0.44386323], - [0.22950086, 0.33367433], [0.15490546, 0.3130677], +X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [ + 0.20792588, 1.49407907 +], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.13347175, 0.12167502], [0.094035, -2.55298982]]) Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) def test_rus_fit_sample(): - rus = RandomUnderSampler(random_state=RND_SEED, - replacement=True) + rus = RandomUnderSampler(random_state=RND_SEED, replacement=True) X_resampled, y_resampled = rus.fit_sample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], @@ -36,8 +35,8 @@ def test_rus_fit_sample(): def test_rus_fit_sample_with_indices(): - rus = RandomUnderSampler(return_indices=True, random_state=RND_SEED, - replacement=True) + rus = RandomUnderSampler( + return_indices=True, random_state=RND_SEED, replacement=True) X_resampled, y_resampled, idx_under = rus.fit_sample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], @@ -51,20 +50,18 @@ def test_rus_fit_sample_with_indices(): def test_rus_fit_sample_half(): - ratio = {0: 3, 1: 6} - rus = RandomUnderSampler(ratio=ratio, random_state=RND_SEED, - replacement=True) + sampling_strategy = {0: 3, 1: 6} + rus = RandomUnderSampler( + sampling_strategy=sampling_strategy, + random_state=RND_SEED, + replacement=True) X_resampled, y_resampled = rus.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], - [0.47104475, 0.44386323], - [0.92923648, 0.76103773], - [0.15490546, 0.3130677], - [0.15490546, 0.3130677], - [0.15490546, 0.3130677], - [0.20792588, 1.49407907], - [0.15490546, 0.3130677], - [0.12372842, 0.6536186]]) + X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [ + 0.92923648, 0.76103773 + ], [0.15490546, 0.3130677], [0.15490546, 0.3130677], + [0.15490546, 0.3130677], [0.20792588, 1.49407907], + [0.15490546, 0.3130677], [0.12372842, 0.6536186]]) y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) print(X_resampled) assert_array_equal(X_resampled, X_gt) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index 6310885b9..b50c8dbab 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -14,26 +14,30 @@ from imblearn.under_sampling import RepeatedEditedNearestNeighbours from imblearn.utils.testing import warns - -X = np.array([[-0.12840393, 0.66446571], [1.32319756, -0.13181616], - [0.04296502, -0.37981873], [0.83631853, 0.18569783], - [1.02956816, 0.36061601], [1.12202806, 0.33811558], - [-0.53171468, -0.53735182], [1.3381556, 0.35956356], - [-0.35946678, 0.72510189], [1.32326943, 0.28393874], - [2.94290565, -0.13986434], [0.28294738, -1.00125525], - [0.34218094, -0.58781961], [-0.88864036, -0.33782387], - [-1.10146139, 
0.91782682], [-0.7969716, -0.50493969], - [0.73489726, 0.43915195], [0.2096964, -0.61814058], - [-0.28479268, 0.70459548], [1.84864913, 0.14729596], - [1.59068979, -0.96622933], [0.73418199, -0.02222847], - [0.50307437, 0.498805], [0.84929742, 0.41042894], - [0.62649535, 0.46600596], [0.79270821, -0.41386668], - [1.16606871, -0.25641059], [1.57356906, 0.30390519], - [1.0304995, -0.16955962], [1.67314371, 0.19231498], - [0.98382284, 0.37184502], [0.48921682, -1.38504507], - [-0.46226554, -0.50481004], [-0.03918551, -0.68540745], - [0.24991051, -1.00864997], [0.80541964, -0.34465185], - [0.1732627, -1.61323172], [0.69804044, 0.44810796], +X = np.array([[-0.12840393, 0.66446571], [1.32319756, -0.13181616], [ + 0.04296502, -0.37981873 +], [0.83631853, 0.18569783], [1.02956816, 0.36061601], [ + 1.12202806, 0.33811558 +], [-0.53171468, -0.53735182], [1.3381556, 0.35956356], [ + -0.35946678, 0.72510189 +], [1.32326943, 0.28393874], [2.94290565, -0.13986434], [ + 0.28294738, -1.00125525 +], [0.34218094, -0.58781961], [-0.88864036, -0.33782387], [ + -1.10146139, 0.91782682 +], [-0.7969716, -0.50493969], [0.73489726, 0.43915195], [ + 0.2096964, -0.61814058 +], [-0.28479268, 0.70459548], [1.84864913, 0.14729596], [ + 1.59068979, -0.96622933 +], [0.73418199, -0.02222847], [0.50307437, 0.498805], [0.84929742, 0.41042894], + [0.62649535, 0.46600596], [0.79270821, -0.41386668], [ + 1.16606871, -0.25641059 + ], [1.57356906, 0.30390519], [1.0304995, -0.16955962], [ + 1.67314371, 0.19231498 + ], [0.98382284, 0.37184502], [0.48921682, -1.38504507], [ + -0.46226554, -0.50481004 + ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [ + 0.80541964, -0.34465185 + ], [0.1732627, -1.61323172], [0.69804044, 0.44810796], [-0.5506368, -0.42072426], [-0.34474418, 0.21969797]]) Y = np.array([ 1, 2, 2, 2, 1, 1, 0, 2, 1, 1, 1, 2, 2, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, @@ -60,16 +64,19 @@ def test_renn_fit_sample(): renn = RepeatedEditedNearestNeighbours() X_resampled, y_resampled = renn.fit_sample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], - [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], - [1.02956816, 0.36061601], [1.12202806, 0.33811558], - [0.73489726, 0.43915195], [0.50307437, 0.498805], - [0.84929742, 0.41042894], [0.62649535, 0.46600596], - [0.98382284, 0.37184502], [0.69804044, 0.44810796], - [0.04296502, -0.37981873], [0.28294738, -1.00125525], - [0.34218094, -0.58781961], [0.2096964, -0.61814058], - [1.59068979, -0.96622933], [0.73418199, -0.02222847], - [0.79270821, -0.41386668], [1.16606871, -0.25641059], + X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ + -0.46226554, -0.50481004 + ], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [ + 1.12202806, 0.33811558 + ], [0.73489726, 0.43915195], [0.50307437, 0.498805], [ + 0.84929742, 0.41042894 + ], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [ + 0.69804044, 0.44810796 + ], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [ + 0.34218094, -0.58781961 + ], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [ + 0.73418199, -0.02222847 + ], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) @@ -85,16 +92,19 @@ def test_renn_fit_sample_with_indices(): renn = RepeatedEditedNearestNeighbours(return_indices=True) X_resampled, y_resampled, idx_under = renn.fit_sample(X, Y) - X_gt = np.array([[-0.53171468, 
-0.53735182], [-0.88864036, -0.33782387], - [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], - [1.02956816, 0.36061601], [1.12202806, 0.33811558], - [0.73489726, 0.43915195], [0.50307437, 0.498805], - [0.84929742, 0.41042894], [0.62649535, 0.46600596], - [0.98382284, 0.37184502], [0.69804044, 0.44810796], - [0.04296502, -0.37981873], [0.28294738, -1.00125525], - [0.34218094, -0.58781961], [0.2096964, -0.61814058], - [1.59068979, -0.96622933], [0.73418199, -0.02222847], - [0.79270821, -0.41386668], [1.16606871, -0.25641059], + X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ + -0.46226554, -0.50481004 + ], [-0.34474418, 0.21969797], [1.02956816, 0.36061601], [ + 1.12202806, 0.33811558 + ], [0.73489726, 0.43915195], [0.50307437, 0.498805], [ + 0.84929742, 0.41042894 + ], [0.62649535, 0.46600596], [0.98382284, 0.37184502], [ + 0.69804044, 0.44810796 + ], [0.04296502, -0.37981873], [0.28294738, -1.00125525], [ + 0.34218094, -0.58781961 + ], [0.2096964, -0.61814058], [1.59068979, -0.96622933], [ + 0.73418199, -0.02222847 + ], [0.79270821, -0.41386668], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [0.48921682, -1.38504507], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) @@ -115,22 +125,27 @@ def test_renn_fit_sample_mode_object(): renn = RepeatedEditedNearestNeighbours(kind_sel='mode') X_resampled, y_resampled = renn.fit_sample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], - [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], - [-0.12840393, 0.66446571], [1.02956816, 0.36061601], - [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [2.94290565, -0.13986434], [-1.10146139, 0.91782682], - [0.73489726, 0.43915195], [-0.28479268, 0.70459548], - [1.84864913, 0.14729596], [0.50307437, 0.498805], - [0.84929742, 0.41042894], [0.62649535, 0.46600596], - [1.67314371, 0.19231498], [0.98382284, 0.37184502], - [0.69804044, 0.44810796], [1.32319756, -0.13181616], - [0.04296502, -0.37981873], [0.28294738, -1.00125525], - [0.34218094, -0.58781961], [0.2096964, -0.61814058], - [1.59068979, -0.96622933], [0.73418199, -0.02222847], - [0.79270821, -0.41386668], [1.16606871, -0.25641059], - [1.0304995, -0.16955962], [0.48921682, -1.38504507], - [-0.03918551, -0.68540745], [0.24991051, -1.00864997], + X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ + -0.46226554, -0.50481004 + ], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [ + 1.02956816, 0.36061601 + ], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [ + 2.94290565, -0.13986434 + ], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [ + -0.28479268, 0.70459548 + ], [1.84864913, 0.14729596], [0.50307437, 0.498805], [ + 0.84929742, 0.41042894 + ], [0.62649535, 0.46600596], [1.67314371, 0.19231498], [ + 0.98382284, 0.37184502 + ], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [ + 0.04296502, -0.37981873 + ], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [ + 0.2096964, -0.61814058 + ], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [ + 0.79270821, -0.41386668 + ], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [ + 0.48921682, -1.38504507 + ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) y_gt = np.array([ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, @@ -145,22 +160,27 @@ def test_renn_fit_sample_mode(): renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, 
kind_sel='mode') X_resampled, y_resampled = renn.fit_sample(X, Y) - X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], - [-0.46226554, -0.50481004], [-0.34474418, 0.21969797], - [-0.12840393, 0.66446571], [1.02956816, 0.36061601], - [1.12202806, 0.33811558], [-0.35946678, 0.72510189], - [2.94290565, -0.13986434], [-1.10146139, 0.91782682], - [0.73489726, 0.43915195], [-0.28479268, 0.70459548], - [1.84864913, 0.14729596], [0.50307437, 0.498805], - [0.84929742, 0.41042894], [0.62649535, 0.46600596], - [1.67314371, 0.19231498], [0.98382284, 0.37184502], - [0.69804044, 0.44810796], [1.32319756, -0.13181616], - [0.04296502, -0.37981873], [0.28294738, -1.00125525], - [0.34218094, -0.58781961], [0.2096964, -0.61814058], - [1.59068979, -0.96622933], [0.73418199, -0.02222847], - [0.79270821, -0.41386668], [1.16606871, -0.25641059], - [1.0304995, -0.16955962], [0.48921682, -1.38504507], - [-0.03918551, -0.68540745], [0.24991051, -1.00864997], + X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387], [ + -0.46226554, -0.50481004 + ], [-0.34474418, 0.21969797], [-0.12840393, 0.66446571], [ + 1.02956816, 0.36061601 + ], [1.12202806, 0.33811558], [-0.35946678, 0.72510189], [ + 2.94290565, -0.13986434 + ], [-1.10146139, 0.91782682], [0.73489726, 0.43915195], [ + -0.28479268, 0.70459548 + ], [1.84864913, 0.14729596], [0.50307437, 0.498805], [ + 0.84929742, 0.41042894 + ], [0.62649535, 0.46600596], [1.67314371, 0.19231498], [ + 0.98382284, 0.37184502 + ], [0.69804044, 0.44810796], [1.32319756, -0.13181616], [ + 0.04296502, -0.37981873 + ], [0.28294738, -1.00125525], [0.34218094, -0.58781961], [ + 0.2096964, -0.61814058 + ], [1.59068979, -0.96622933], [0.73418199, -0.02222847], [ + 0.79270821, -0.41386668 + ], [1.16606871, -0.25641059], [1.0304995, -0.16955962], [ + 0.48921682, -1.38504507 + ], [-0.03918551, -0.68540745], [0.24991051, -1.00864997], [0.80541964, -0.34465185], [0.1732627, -1.61323172]]) y_gt = np.array([ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, @@ -172,14 +192,13 @@ def test_renn_fit_sample_mode(): def test_renn_not_good_object(): nn = 'rnd' - renn = RepeatedEditedNearestNeighbours( - n_neighbors=nn, kind_sel='mode') + renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode') with raises(ValueError): renn.fit_sample(X, Y) def test_deprecation_random_state(): renn = RepeatedEditedNearestNeighbours(random_state=0) - with warns(DeprecationWarning, - match="'random_state' is deprecated from 0.4"): + with warns( + DeprecationWarning, match="'random_state' is deprecated from 0.4"): renn.fit_sample(X, Y) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_tomek_links.py b/imblearn/under_sampling/prototype_selection/tests/test_tomek_links.py index 80d542ca0..c2b9d84f2 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_tomek_links.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_tomek_links.py @@ -11,17 +11,19 @@ from imblearn.under_sampling import TomekLinks from imblearn.utils.testing import warns - -X = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], - [1.34192108, -0.13367336], [0.62366841, -0.21312976], - [1.61091956, -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], [0.2184254, 0.24299982], - [0.61472253, -0.82309052], [0.19893132, -0.47761769], - [1.06514042, -0.0770537], [0.97407872, 0.44454207], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], - [-0.27410027, -0.54194484], [0.8381014, 0.44085498], - [-0.23374509, 
0.18370049], [-0.32635887, -0.29299653], - [-0.00288378, 0.84259929], [1.79580611, -0.02219234]]) +X = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], [ + 1.34192108, -0.13367336 +], [0.62366841, -0.21312976], [1.61091956, + -0.40283504], [-0.37162401, -2.19400981], + [0.74680821, + 1.63827342], [0.2184254, 0.24299982], [0.61472253, -0.82309052], + [0.19893132, -0.47761769], [1.06514042, -0.0770537], [ + 0.97407872, 0.44454207 + ], [1.40301027, -0.83648734], [-1.20515198, -1.02689695], [ + -0.27410027, -0.54194484 + ], [0.8381014, 0.44085498], [-0.23374509, 0.18370049], [ + -0.32635887, -0.29299653 + ], [-0.00288378, 0.84259929], [1.79580611, -0.02219234]]) Y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) @@ -34,15 +36,16 @@ def test_tl_fit_sample(): tl = TomekLinks() X_resampled, y_resampled = tl.fit_sample(X, Y) - X_gt = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], - [1.34192108, -0.13367336], [0.62366841, -0.21312976], - [1.61091956, -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], [0.2184254, 0.24299982], - [0.61472253, -0.82309052], [0.19893132, -0.47761769], - [0.97407872, 0.44454207], [1.40301027, -0.83648734], - [-1.20515198, -1.02689695], [-0.23374509, 0.18370049], - [-0.32635887, -0.29299653], [-0.00288378, 0.84259929], - [1.79580611, -0.02219234]]) + X_gt = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], [ + 1.34192108, -0.13367336 + ], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [ + -0.37162401, -2.19400981 + ], [0.74680821, 1.63827342], [0.2184254, 0.24299982], [ + 0.61472253, -0.82309052 + ], [0.19893132, -0.47761769], [0.97407872, 0.44454207], + [1.40301027, -0.83648734], [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], + [-0.00288378, 0.84259929], [1.79580611, -0.02219234]]) y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -52,15 +55,16 @@ def test_tl_fit_sample_with_indices(): tl = TomekLinks(return_indices=True) X_resampled, y_resampled, idx_under = tl.fit_sample(X, Y) - X_gt = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], - [1.34192108, -0.13367336], [0.62366841, -0.21312976], - [1.61091956, -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], [0.2184254, 0.24299982], - [0.61472253, -0.82309052], [0.19893132, -0.47761769], - [0.97407872, 0.44454207], [1.40301027, -0.83648734], - [-1.20515198, -1.02689695], [-0.23374509, 0.18370049], - [-0.32635887, -0.29299653], [-0.00288378, 0.84259929], - [1.79580611, -0.02219234]]) + X_gt = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], [ + 1.34192108, -0.13367336 + ], [0.62366841, -0.21312976], [1.61091956, -0.40283504], [ + -0.37162401, -2.19400981 + ], [0.74680821, 1.63827342], [0.2184254, 0.24299982], [ + 0.61472253, -0.82309052 + ], [0.19893132, -0.47761769], [0.97407872, 0.44454207], + [1.40301027, -0.83648734], [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], + [-0.00288378, 0.84259929], [1.79580611, -0.02219234]]) y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]) idx_gt = np.array( [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 19]) @@ -71,6 +75,6 @@ def test_tl_fit_sample_with_indices(): def test_deprecation_random_state(): tl = TomekLinks(random_state=0) - with warns(DeprecationWarning, - match="'random_state' is deprecated from 0.4"): + with warns( + DeprecationWarning, 
match="'random_state' is deprecated from 0.4"): tl.fit_sample(X, Y) diff --git a/imblearn/under_sampling/prototype_selection/tomek_links.py b/imblearn/under_sampling/prototype_selection/tomek_links.py index d431a411a..f5f953f1d 100644 --- a/imblearn/under_sampling/prototype_selection/tomek_links.py +++ b/imblearn/under_sampling/prototype_selection/tomek_links.py @@ -12,9 +12,14 @@ from sklearn.utils import safe_indexing from ..base import BaseCleaningSampler +from ...utils import Substitution from ...utils.deprecation import deprecate_parameter +from ...utils._docstring import _random_state_docstring +@Substitution( + sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class TomekLinks(BaseCleaningSampler): """Class to perform under-sampling by removing Tomek's links. @@ -22,37 +27,13 @@ class TomekLinks(BaseCleaningSampler): Parameters ---------- - ratio : str, dict, or callable, optional (default='auto') - Ratio to use for resampling the data set. - - - If ``str``, has to be one of: (i) ``'minority'``: resample the - minority class; (ii) ``'majority'``: resample the majority class, - (iii) ``'not minority'``: resample all classes apart of the minority - class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: - correspond to ``'all'`` with for over-sampling methods and ``'not - minority'`` for under-sampling methods. The classes targeted will be - over-sampled or under-sampled to achieve an equal number of sample - with the majority or minority class. - - If ``dict``, the keys correspond to the targeted classes. The values - correspond to the desired number of samples. - - If callable, function taking ``y`` and returns a ``dict``. The keys - correspond to the targeted classes. The values correspond to the - desired number of samples. - - .. warning:: - This algorithm is a cleaning under-sampling method. When providing a - ``dict``, only the targeted classes will be used; the number of - samples will be discarded. + {sampling_strategy} return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. - random_state : int, RandomState instance or None, optional (default=None) - If int, ``random_state`` is the seed used by the random number - generator; If ``RandomState`` instance, random_state is the random - number generator; If ``None``, the random number generator is the - ``RandomState`` instance used by ``np.random``. + {random_state} .. deprecated:: 0.4 ``random_state`` is deprecated in 0.4 and will be removed in 0.6. @@ -60,6 +41,11 @@ class TomekLinks(BaseCleaningSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. + Notes ----- This method is based on [1]_. @@ -85,18 +71,23 @@ class TomekLinks(BaseCleaningSampler): >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) - >>> print('Original dataset shape {}'.format(Counter(y))) - Original dataset shape Counter({1: 900, 0: 100}) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 900, 0: 100}}) >>> tl = TomekLinks() >>> X_res, y_res = tl.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({1: 897, 0: 100}) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{1: 897, 0: 100}}) """ - def __init__(self, ratio='auto', return_indices=False, - random_state=None, n_jobs=1): - super(TomekLinks, self).__init__(ratio=ratio) + def __init__(self, + sampling_strategy='auto', + return_indices=False, + random_state=None, + n_jobs=1, + ratio=None): + super(TomekLinks, self).__init__( + sampling_strategy=sampling_strategy, ratio=ratio) self.random_state = random_state self.return_indices = return_indices self.n_jobs = n_jobs @@ -177,13 +168,11 @@ def _sample(self, X, y): nn.fit(X) nns = nn.kneighbors(X, return_distance=False)[:, 1] - links = self.is_tomek(y, nns, self.ratio_) + links = self.is_tomek(y, nns, self.sampling_strategy_) idx_under = np.flatnonzero(np.logical_not(links)) if self.return_indices: - return (safe_indexing(X, idx_under), - safe_indexing(y, idx_under), + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), idx_under) else: - return (safe_indexing(X, idx_under), - safe_indexing(y, idx_under)) + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under)) diff --git a/imblearn/utils/__init__.py b/imblearn/utils/__init__.py index a767c20aa..a3be9ffa0 100644 --- a/imblearn/utils/__init__.py +++ b/imblearn/utils/__init__.py @@ -2,13 +2,15 @@ The :mod:`imblearn.utils` module includes various utilities. """ +from ._docstring import Substitution + from .validation import check_neighbors_object from .validation import check_target_type from .validation import hash_X_y from .validation import check_ratio +from .validation import check_sampling_strategy - -__all__ = ['check_neighbors_object', - 'check_target_type', - 'hash_X_y', - 'check_ratio'] +__all__ = [ + 'Substitution', 'check_neighbors_object', 'check_target_type', 'hash_X_y', + 'check_sampling_strategy', 'check_ratio' +] diff --git a/imblearn/utils/_docstring.py b/imblearn/utils/_docstring.py new file mode 100644 index 000000000..56ae44106 --- /dev/null +++ b/imblearn/utils/_docstring.py @@ -0,0 +1,35 @@ +"""Utilities for docstring in imbalanced-learn.""" + +# Authors: Guillaume Lemaitre +# License: MIT + + +class Substitution(object): + """Decorate a function's or a class' docstring to perform string + substitution on it. + + This decorator should be robust even if obj.__doc__ is None + (for example, if -OO was passed to the interpreter) + """ + + def __init__(self, *args, **kwargs): + if (args and kwargs): + raise AssertionError("Only positional or keyword args are allowed") + + self.params = args or kwargs + + def __call__(self, obj): + obj.__doc__ = obj.__doc__.format(**self.params) + return obj + + +_random_state_docstring = \ + """random_state : int, RandomState instance or None, optional (default=None) + Control the randomization of the algorithm + - If int, ``random_state`` is the seed used by the random number + generator; + - If ``RandomState`` instance, random_state is the random number + generator; + - If ``None``, the random number generator is the ``RandomState`` + instance used by ``np.random``. 
+ """.rstrip() diff --git a/imblearn/utils/deprecation.py b/imblearn/utils/deprecation.py index 2637f135f..5b470652e 100644 --- a/imblearn/utils/deprecation.py +++ b/imblearn/utils/deprecation.py @@ -6,7 +6,9 @@ import warnings -def deprecate_parameter(sampler, version_deprecation, param_deprecated, +def deprecate_parameter(sampler, + version_deprecation, + param_deprecated, new_param=None): """Helper to deprecate a parameter by another one. @@ -36,22 +38,19 @@ def deprecate_parameter(sampler, version_deprecation, param_deprecated, version_removed = x + '.' + str(int(y) + 2) if new_param is None: if getattr(sampler, param_deprecated) is not None: - warnings.warn("In the estimator {}, the parameter '{}' is" - " deprecated from {} and will be removed in" - " {}.".format(sampler.__class__, - param_deprecated, - version_deprecation, - version_removed), - category=DeprecationWarning) + warnings.warn( + "In the estimator {}, the parameter '{}' is" + " deprecated from {} and will be removed in" + " {}.".format(sampler.__class__, param_deprecated, + version_deprecation, version_removed), + category=DeprecationWarning) else: if getattr(sampler, param_deprecated) is not None: - warnings.warn("In the estimator {}, the parameter '{}' is" - "deprecated from {} and will be removed in" - " {}. Use '{}' instead.".format( - sampler.__class__, - param_deprecated, - version_deprecation, - version_removed, - new_param), - category=DeprecationWarning) + warnings.warn( + "In the estimator {}, the parameter '{}' is" + "deprecated from {} and will be removed in" + " {}. Use '{}' instead.".format( + sampler.__class__, param_deprecated, version_deprecation, + version_removed, new_param), + category=DeprecationWarning) setattr(sampler, new_param, getattr(sampler, param_deprecated)) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index db4cc2bd9..3480f55ac 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -15,7 +15,6 @@ import numpy as np from scipy import sparse -from pytest import raises from sklearn.datasets import make_classification from sklearn.cluster import KMeans @@ -44,6 +43,7 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_fit yield check_samplers_fit_sample yield check_samplers_ratio_fit_sample + yield check_samplers_sampling_strategy_fit_sample yield check_samplers_sparse yield check_samplers_pandas yield check_samplers_multiclass_ova @@ -111,7 +111,7 @@ def check_samplers_no_fit_error(name, Sampler): sampler = Sampler() X = np.random.random((20, 2)) y = np.array([1] * 5 + [0] * 15) - with raises(NotFittedError, match="instance is not fitted yet."): + with pytest.raises(NotFittedError, match="instance is not fitted yet."): sampler.sample(X, y) @@ -122,7 +122,8 @@ def check_samplers_X_consistancy_sample(name, Sampler): sampler.fit(X, y) X_different = np.random.random((40, 2)) y_different = y = np.array([1] * 25 + [0] * 15) - with raises(RuntimeError, match="X and y need to be same array earlier"): + msg = "X and y need to be same array earlier" + with pytest.raises(RuntimeError, match=msg): sampler.sample(X_different, y_different) @@ -131,14 +132,19 @@ def check_samplers_fit(name, Sampler): X = np.random.random((30, 2)) y = np.array([1] * 20 + [0] * 10) sampler.fit(X, y) + # FIXME remove in 0.6 -> ratio is deprecated assert hasattr(sampler, 'ratio_') + assert hasattr(sampler, 'sampling_strategy_') def check_samplers_fit_sample(name, Sampler): sampler = Sampler() - X, y = make_classification(n_samples=1000, 
n_classes=3, - n_informative=4, weights=[0.2, 0.3, 0.5], - random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0) target_stats = Counter(y) X_res, y_res = sampler.fit_sample(X, y) if isinstance(sampler, BaseOverSampler): @@ -161,11 +167,15 @@ def check_samplers_fit_sample(name, Sampler): for value in Counter(y_ensemble).values()) +# FIXME remove in 0.6 -> ratio will be deprecated def check_samplers_ratio_fit_sample(name, Sampler): # in this test we will force all samplers to not change the class 1 - X, y = make_classification(n_samples=1000, n_classes=3, - n_informative=4, weights=[0.2, 0.3, 0.5], - random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0) sampler = Sampler() expected_stat = Counter(y)[1] if isinstance(sampler, BaseOverSampler): @@ -183,7 +193,7 @@ def check_samplers_ratio_fit_sample(name, Sampler): sampler.set_params(ratio=ratio) X_res, y_res = sampler.fit_sample(X, y) assert Counter(y_res)[1] == expected_stat - elif isinstance(sampler, BaseEnsembleSampler): + if isinstance(sampler, BaseEnsembleSampler): ratio = {2: 201, 0: 201} sampler.set_params(ratio=ratio) X_res, y_res = sampler.fit_sample(X, y) @@ -191,26 +201,64 @@ def check_samplers_ratio_fit_sample(name, Sampler): assert Counter(y_ensemble)[1] == expected_stat +def check_samplers_sampling_strategy_fit_sample(name, Sampler): + # in this test we will force all samplers to not change the class 1 + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0) + sampler = Sampler() + expected_stat = Counter(y)[1] + if isinstance(sampler, BaseOverSampler): + sampling_strategy = {2: 498, 0: 498} + sampler.set_params(sampling_strategy=sampling_strategy) + X_res, y_res = sampler.fit_sample(X, y) + assert Counter(y_res)[1] == expected_stat + elif isinstance(sampler, BaseUnderSampler): + sampling_strategy = {2: 201, 0: 201} + sampler.set_params(sampling_strategy=sampling_strategy) + X_res, y_res = sampler.fit_sample(X, y) + assert Counter(y_res)[1] == expected_stat + elif isinstance(sampler, BaseCleaningSampler): + sampling_strategy = {2: 201, 0: 201} + sampler.set_params(sampling_strategy=sampling_strategy) + X_res, y_res = sampler.fit_sample(X, y) + assert Counter(y_res)[1] == expected_stat + if isinstance(sampler, BaseEnsembleSampler): + sampling_strategy = {2: 201, 0: 201} + sampler.set_params(sampling_strategy=sampling_strategy) + X_res, y_res = sampler.fit_sample(X, y) + y_ensemble = y_res[0] + assert Counter(y_ensemble)[1] == expected_stat + + def check_samplers_sparse(name, Sampler): # check that sparse matrices can be passed through the sampler leading to # the same results than dense - X, y = make_classification(n_samples=1000, n_classes=3, - n_informative=4, weights=[0.2, 0.3, 0.5], - random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0) X_sparse = sparse.csr_matrix(X) if isinstance(Sampler(), SMOTE): - samplers = [Sampler(random_state=0, kind=kind) - for kind in ('regular', 'borderline1', - 'borderline2', 'svm')] + samplers = [ + Sampler(random_state=0, kind=kind) + for kind in ('regular', 'borderline1', 'borderline2', 'svm') + ] elif isinstance(Sampler(), NearMiss): - samplers = [Sampler(version=version) - for version in (1, 2, 3)] + samplers = [Sampler(version=version) for version in (1, 2, 
3)] elif isinstance(Sampler(), ClusterCentroids): # set KMeans to full since it support sparse and dense - samplers = [Sampler(random_state=0, - voting='soft', - estimator=KMeans(random_state=1, - algorithm='full'))] + samplers = [ + Sampler( + random_state=0, + voting='soft', + estimator=KMeans(random_state=1, algorithm='full')) + ] else: samplers = [Sampler()] @@ -219,12 +267,12 @@ def check_samplers_sparse(name, Sampler): X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) X_res, y_res = sampler.fit_sample(X, y) if not isinstance(sampler, BaseEnsembleSampler): - assert sparse.issparse(X_res_sparse) - assert_allclose(X_res_sparse.A, X_res) - assert_allclose(y_res_sparse, y_res) + assert sparse.issparse(X_res_sparse) + assert_allclose(X_res_sparse.A, X_res) + assert_allclose(y_res_sparse, y_res) else: - for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, - y_res_sparse, y_res): + for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, + y_res): assert sparse.issparse(x_sp) assert_allclose(x_sp.A, x) assert_allclose(y_sp, y) @@ -233,19 +281,22 @@ def check_samplers_sparse(name, Sampler): def check_samplers_pandas(name, Sampler): pd = pytest.importorskip("pandas") # Check that the samplers handle pandas dataframe and pandas series - X, y = make_classification(n_samples=1000, n_classes=3, - n_informative=4, weights=[0.2, 0.3, 0.5], - random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0) X_pd, y_pd = pd.DataFrame(X), pd.Series(y) sampler = Sampler() if isinstance(Sampler(), SMOTE): - samplers = [Sampler(random_state=0, kind=kind) - for kind in ('regular', 'borderline1', - 'borderline2', 'svm')] + samplers = [ + Sampler(random_state=0, kind=kind) + for kind in ('regular', 'borderline1', 'borderline2', 'svm') + ] elif isinstance(Sampler(), NearMiss): - samplers = [Sampler(version=version) - for version in (1, 2, 3)] + samplers = [Sampler(version=version) for version in (1, 2, 3)] else: samplers = [Sampler()] @@ -260,9 +311,12 @@ def check_samplers_pandas(name, Sampler): def check_samplers_multiclass_ova(name, Sampler): # Check that multiclass target lead to the same results than OVA encoding - X, y = make_classification(n_samples=1000, n_classes=3, - n_informative=4, weights=[0.2, 0.3, 0.5], - random_state=0) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0) y_ova = label_binarize(y, np.unique(y)) sampler = Sampler() set_random_state(sampler) diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py index 6d5a980d4..c39c521c3 100644 --- a/imblearn/utils/testing.py +++ b/imblearn/utils/testing.py @@ -17,7 +17,6 @@ from imblearn.base import SamplerMixin import imblearn - # meta-estimators need another estimator to be instantiated. META_ESTIMATORS = [] # estimators that there is no way to default-construct sensibly @@ -27,7 +26,8 @@ def all_estimators(include_meta_estimators=False, - include_other=False, type_filter=None, + include_other=False, + type_filter=None, include_dont_test=False): """Get a list of all estimators from imblearn. @@ -65,8 +65,9 @@ def all_estimators(include_meta_estimators=False, and ``class`` is the actual type of the class. 
""" + def is_abstract(c): - if not(hasattr(c, '__abstractmethods__')): + if not (hasattr(c, '__abstractmethods__')): return False if not len(c.__abstractmethods__): return False @@ -85,9 +86,10 @@ def is_abstract(c): all_classes = set(all_classes) - estimators = [c for c in all_classes - if (issubclass(c[1], BaseEstimator) and - c[0] != 'BaseEstimator')] + estimators = [ + c for c in all_classes + if (issubclass(c[1], BaseEstimator) and c[0] != 'BaseEstimator') + ] # get rid of abstract base classes estimators = [c for c in estimators if not is_abstract(c[1])] @@ -112,8 +114,8 @@ def is_abstract(c): for name, mixin in filters.items(): if name in type_filter: type_filter.remove(name) - filtered_estimators.extend([est for est in estimators - if issubclass(est[1], mixin)]) + filtered_estimators.extend( + [est for est in estimators if issubclass(est[1], mixin)]) estimators = filtered_estimators if type_filter: raise ValueError("Parameter type_filter must be 'sampler' or " diff --git a/imblearn/utils/tests/test_docstring.py b/imblearn/utils/tests/test_docstring.py new file mode 100644 index 000000000..0b7966fc3 --- /dev/null +++ b/imblearn/utils/tests/test_docstring.py @@ -0,0 +1,63 @@ +"""Test utilities for docstring.""" + +# Authors: Guillaume Lemaitre +# License: MIT + +import pytest + +from imblearn.utils import Substitution + +func_docstring = \ + """A function. + + Parameters + ---------- + xxx + + yyy + """.rstrip() + + +def func(param_1, param_2): + """A function. + + Parameters + ---------- + {param_1} + + {param_2} + """ + return param_1, param_2 + + +cls_docstring = \ + """A class. + + Parameters + ---------- + xxx + + yyy + """.rstrip() + + +class cls: + """A class. + + Parameters + ---------- + {param_1} + + {param_2} + """ + + def __init__(self, param_1, param_2): + self.param_1 = param_1 + self.param_2 = param_2 + + +@pytest.mark.parametrize("obj, obj_docstring", [(func, func_docstring), + (cls, cls_docstring)]) +def test_docstring_inject(obj, obj_docstring): + obj_injected_docstring = Substitution(param_1='xxx', param_2='yyy')(obj) + obj_injected_docstring.__doc__ == obj_docstring diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index bed62617d..84f58ed40 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -5,9 +5,8 @@ from collections import Counter -import numpy as np import pytest -from pytest import raises +import numpy as np from sklearn.neighbors.base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors @@ -18,9 +17,13 @@ from imblearn.utils.testing import warns from imblearn.utils import check_neighbors_object from imblearn.utils import check_ratio +from imblearn.utils import check_sampling_strategy from imblearn.utils import hash_X_y from imblearn.utils import check_target_type +multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25) +binary_target = np.array([1] * 25 + [0] * 100) + def test_check_neighbors_object(): name = 'n_neighbors' @@ -34,27 +37,23 @@ def test_check_neighbors_object(): estimator = NearestNeighbors(n_neighbors) assert estimator is check_neighbors_object(name, estimator) n_neighbors = 'rnd' - with raises(ValueError, match="has to be one of"): + with pytest.raises(ValueError, match="has to be one of"): check_neighbors_object(name, n_neighbors) -@pytest.mark.parametrize( - "target, output_target", - [(np.array([0, 1, 1]), np.array([0, 1, 1])), - (np.array([0, 1, 2]), np.array([0, 1, 2])), - (np.array([[0, 1], [1, 0]]), np.array([1, 
0]))] -) +@pytest.mark.parametrize("target, output_target", [(np.array( + [0, 1, 1]), np.array([0, 1, 1])), (np.array([0, 1, 2]), np.array( + [0, 1, 2])), (np.array([[0, 1], [1, 0]]), np.array([1, 0]))]) def test_check_target_type(target, output_target): converted_target = check_target_type(target.astype(int)) assert_array_equal(converted_target, output_target.astype(int)) -@pytest.mark.parametrize( - "target, output_target, is_ova", - [(np.array([0, 1, 1]), np.array([0, 1, 1]), False), - (np.array([0, 1, 2]), np.array([0, 1, 2]), False), - (np.array([[0, 1], [1, 0]]), np.array([1, 0]), True)] -) +@pytest.mark.parametrize("target, output_target, is_ova", + [(np.array([0, 1, 1]), np.array([0, 1, 1]), False), + (np.array([0, 1, 2]), np.array([0, 1, 2]), + False), (np.array([[0, 1], [1, 0]]), + np.array([1, 0]), True)]) def test_check_target_type_ova(target, output_target, is_ova): converted_target, binarize_target = check_target_type( target.astype(int), indicate_one_vs_all=True) @@ -68,140 +67,297 @@ def test_check_target_warning(): check_target_type(target) -def test_check_ratio_error(): - with raises(ValueError, match="'sampling_type' should be one of"): - check_ratio('auto', np.array([1, 2, 3]), 'rnd') - - error_regex = "The target 'y' needs to have more than 1 class." - with raises(ValueError, match=error_regex): - check_ratio('auto', np.ones((10, )), 'over-sampling') - - error_regex = "When 'ratio' is a string, it needs to be one of" - with raises(ValueError, match=error_regex): - check_ratio('rnd', np.array([1, 2, 3]), 'over-sampling') - - -def test_ratio_all_over_sampling(): - y = np.array([1] * 50 + [2] * 100 + [3] * 25) - for each in ('all', 'auto'): - assert check_ratio(each, y, 'over-sampling') == {1: 50, 2: 0, 3: 75} - - -def test_ratio_all_under_sampling(): - y = np.array([1] * 50 + [2] * 100 + [3] * 25) - ratio = check_ratio('all', y, 'under-sampling') - assert ratio == {1: 25, 2: 25, 3: 25} - - -def test_ratio_majority_over_sampling(): - error_regex = "'ratio'='majority' cannot be used with over-sampler." 
- with raises(ValueError, match=error_regex): - check_ratio('majority', np.array([1, 2, 3]), 'over-sampling') - - -def test_ratio_majority_under_sampling(): - y = np.array([1] * 50 + [2] * 100 + [3] * 25) - ratio = check_ratio('majority', y, 'under-sampling') - assert ratio == {2: 25} - +def test_check_sampling_strategy_warning(): + msg = 'dict for cleaning methods is deprecated' + with pytest.warns(DeprecationWarning, match=msg): + check_sampling_strategy({ + 1: 0, + 2: 0, + 3: 0 + }, multiclass_target, 'clean-sampling') -def test_ratio_not_minority_over_sampling(): - y = np.array([1] * 50 + [2] * 100 + [3] * 25) - ratio = check_ratio('not minority', y, 'over-sampling') - assert ratio == {1: 50, 2: 0} +def test_check_sampling_strategy_float_error(): + msg = "'clean-sampling' methods do let the user specify the sampling ratio" + with pytest.raises(ValueError, match=msg): + check_sampling_strategy(0.5, binary_target, 'clean-sampling') -def test_ratio_not_minority_under_sampling(): - y = np.array([1] * 50 + [2] * 100 + [3] * 25) - ratio = check_ratio('not minority', y, 'under-sampling') - assert ratio == {1: 25, 2: 25} - ratio = check_ratio('auto', y, 'under-sampling') - assert ratio == {1: 25, 2: 25} +def test_check_sampling_strategy_error(): + with pytest.raises(ValueError, match="'sampling_type' should be one of"): + check_sampling_strategy('auto', np.array([1, 2, 3]), 'rnd') -def test_ratio_minority_over_sampling(): + error_regex = "The target 'y' needs to have more than 1 class." + with pytest.raises(ValueError, match=error_regex): + check_sampling_strategy('auto', np.ones((10, )), 'over-sampling') + + error_regex = "When 'sampling_strategy' is a string, it needs to be one of" + with pytest.raises(ValueError, match=error_regex): + check_sampling_strategy('rnd', np.array([1, 2, 3]), 'over-sampling') + + +@pytest.mark.parametrize("sampling_strategy, sampling_type, err_msg", + [('majority', 'over-sampling', 'over-sampler'), + ('minority', 'under-sampling', 'under-sampler')]) +def test_check_sampling_strategy_error_wrong_string(sampling_strategy, + sampling_type, err_msg): + with pytest.raises( + ValueError, + match=("'{}' cannot be used with {}".format( + sampling_strategy, err_msg))): + check_sampling_strategy(sampling_strategy, + np.array([1, 2, 3]), sampling_type) + + +@pytest.mark.parametrize("sampling_strategy, sampling_method", [({ + 10: 10 +}, 'under-sampling'), ({ + 10: 10 +}, 'over-sampling'), ([10], 'clean-sampling')]) +def test_sampling_strategy_class_target_unknown(sampling_strategy, + sampling_method): y = np.array([1] * 50 + [2] * 100 + [3] * 25) - ratio = check_ratio('minority', y, 'over-sampling') - assert ratio == {3: 75} - - -def test_ratio_minority_under_sampling(): - error_regex = "'ratio'='minority' cannot be used with under-sampler." 
- with raises(ValueError, match=error_regex): - check_ratio('minority', np.array([1, 2, 3]), 'under-sampling') + with pytest.raises(ValueError, match="are not present in the data."): + check_sampling_strategy(sampling_strategy, y, sampling_method) -def test_ratio_dict_error(): +def test_sampling_strategy_dict_error(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) - ratio = {1: -100, 2: 50, 3: 25} - with raises(ValueError, match="in a class cannot be negative."): - check_ratio(ratio, y, 'under-sampling') - ratio = {10: 10} - with raises(ValueError, match="are not present in the data."): - check_ratio(ratio, y, 'over-sampling') - ratio = {1: 45, 2: 100, 3: 70} + sampling_strategy = {1: -100, 2: 50, 3: 25} + with pytest.raises(ValueError, match="in a class cannot be negative."): + check_sampling_strategy(sampling_strategy, y, 'under-sampling') + sampling_strategy = {1: 45, 2: 100, 3: 70} error_regex = ("With over-sampling methods, the number of samples in a" " class should be greater or equal to the original number" " of samples. Originally, there is 50 samples and 45" " samples are asked.") - with raises(ValueError, match=error_regex): - check_ratio(ratio, y, 'over-sampling') + with pytest.raises(ValueError, match=error_regex): + check_sampling_strategy(sampling_strategy, y, 'over-sampling') error_regex = ("With under-sampling methods, the number of samples in a" " class should be less or equal to the original number of" " samples. Originally, there is 25 samples and 70 samples" " are asked.") - with raises(ValueError, match=error_regex): - check_ratio(ratio, y, 'under-sampling') + with pytest.raises(ValueError, match=error_regex): + check_sampling_strategy(sampling_strategy, y, 'under-sampling') -def test_ratio_dict_over_sampling(): - y = np.array([1] * 50 + [2] * 100 + [3] * 25) - ratio = {1: 70, 2: 100, 3: 70} - ratio_ = check_ratio(ratio, y, 'over-sampling') - assert ratio_ == {1: 20, 2: 0, 3: 45} - ratio = {1: 70, 2: 140, 3: 70} - expected_msg = ("After over-sampling, the number of samples \(140\) in" - " class 2 will be larger than the number of samples in the" - " majority class \(class #2 -> 100\)") - with warns(UserWarning, expected_msg): - check_ratio(ratio, y, 'over-sampling') +@pytest.mark.parametrize("sampling_strategy", [-10, 10]) +def test_sampling_strategy_float_error_not_in_range(sampling_strategy): + y = np.array([1] * 50 + [2] * 100) + with pytest.raises(ValueError, match='it should be in the range'): + check_sampling_strategy(sampling_strategy, y, 'under-sampling') -def test_ratio_dict_under_sampling(): +def test_sampling_strategy_float_error_not_binary(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) - ratio = {1: 30, 2: 45, 3: 25} - ratio_ = check_ratio(ratio, y, 'under-sampling') - assert ratio_ == ratio + with pytest.raises(ValueError, match='the type of target is binary'): + sampling_strategy = 0.5 + check_sampling_strategy(sampling_strategy, y, 'under-sampling') -def test_ratio_callable(): +@pytest.mark.parametrize("sampling_method", + ['over-sampling', 'under-sampling']) +def test_sampling_strategy_list_error_not_clean_sampling(sampling_method): y = np.array([1] * 50 + [2] * 100 + [3] * 25) + with pytest.raises(ValueError, match='cannot be a list for samplers'): + sampling_strategy = [1, 2, 3] + check_sampling_strategy(sampling_strategy, y, sampling_method) + + +def _sampling_strategy_func(y): + # this function could create an equal number of samples + target_stats = Counter(y) + n_samples = max(target_stats.values()) + return {key: int(n_samples) for key in 
-
-    def ratio_func(y):
-        # this function could create an equal number of samples
-        target_stats = Counter(y)
-        n_samples = max(target_stats.values())
-        return {key: int(n_samples)
-                for key in target_stats.keys()}
-
-    ratio_ = check_ratio(ratio_func, y, 'over-sampling')
-    assert ratio_ == {1: 50, 2: 0, 3: 75}
+
+
+@pytest.mark.parametrize(
+    "sampling_strategy, sampling_type, expected_sampling_strategy, target",
+    [('auto', 'under-sampling', {1: 25, 2: 25}, multiclass_target),
+     ('auto', 'clean-sampling', {1: 25, 2: 25}, multiclass_target),
+     ('auto', 'over-sampling', {1: 50, 3: 75}, multiclass_target),
+     ('all', 'over-sampling', {1: 50, 2: 0, 3: 75}, multiclass_target),
+     ('all', 'under-sampling', {1: 25, 2: 25, 3: 25}, multiclass_target),
+     ('all', 'clean-sampling', {1: 25, 2: 25, 3: 25}, multiclass_target),
+     ('majority', 'under-sampling', {2: 25}, multiclass_target),
+     ('majority', 'clean-sampling', {2: 25}, multiclass_target),
+     ('minority', 'over-sampling', {3: 75}, multiclass_target),
+     ('not minority', 'over-sampling', {1: 50, 2: 0}, multiclass_target),
+     ('not minority', 'under-sampling', {1: 25, 2: 25}, multiclass_target),
+     ('not minority', 'clean-sampling', {1: 25, 2: 25}, multiclass_target),
+     ('not majority', 'over-sampling', {1: 50, 3: 75}, multiclass_target),
+     ('not majority', 'under-sampling', {1: 25, 3: 25}, multiclass_target),
+     ('not majority', 'clean-sampling', {1: 25, 3: 25}, multiclass_target),
+     ({1: 70, 2: 100, 3: 70}, 'over-sampling',
+      {1: 20, 2: 0, 3: 45}, multiclass_target),
+     ({1: 30, 2: 45, 3: 25}, 'under-sampling',
+      {1: 30, 2: 45, 3: 25}, multiclass_target),
+     ([1], 'clean-sampling', {1: 25}, multiclass_target),
+     (_sampling_strategy_func, 'over-sampling',
+      {1: 50, 2: 0, 3: 75}, multiclass_target),
+     (0.5, 'over-sampling', {1: 25}, binary_target),
+     (0.5, 'under-sampling', {0: 50}, binary_target)])
+def test_check_sampling_strategy(sampling_strategy, sampling_type,
+                                 expected_sampling_strategy, target):
+    sampling_strategy_ = check_sampling_strategy(sampling_strategy, target,
+                                                 sampling_type)
+    assert sampling_strategy_ == expected_sampling_strategy
+
+
+@pytest.mark.parametrize(
+    "ratio, sampling_type, expected_ratio, target",
+    [('auto', 'under-sampling', {1: 25, 2: 25}, multiclass_target),
+     ('auto', 'clean-sampling', {1: 25, 2: 25}, multiclass_target),
+     ('auto', 'over-sampling', {1: 50, 3: 75}, multiclass_target),
+     ('all', 'over-sampling', {1: 50, 2: 0, 3: 75}, multiclass_target),
+     ('all', 'under-sampling', {1: 25, 2: 25, 3: 25}, multiclass_target),
+     ('all', 'clean-sampling', {1: 25, 2: 25, 3: 25}, multiclass_target),
+     ('majority', 'under-sampling', {2: 25}, multiclass_target),
+     ('majority', 'clean-sampling', {2: 25}, multiclass_target),
+     ('minority', 'over-sampling', {3: 75}, multiclass_target),
+     ('not minority', 'over-sampling', {1: 50, 2: 0}, multiclass_target),
+     ('not minority', 'under-sampling', {1: 25, 2: 25}, multiclass_target),
+     ('not minority', 'clean-sampling', {1: 25, 2: 25}, multiclass_target),
+     ('not majority', 'over-sampling', {1: 50, 3: 75}, multiclass_target),
+     ('not majority', 'under-sampling', {1: 25, 3: 25}, multiclass_target),
+     ('not majority', 'clean-sampling', {1: 25, 3: 25}, multiclass_target),
+     ({1: 70, 2: 100, 3: 70}, 'over-sampling',
+      {1: 20, 2: 0, 3: 45}, multiclass_target),
+     ({1: 30, 2: 45, 3: 25}, 'under-sampling',
+      {1: 30, 2: 45, 3: 25}, multiclass_target),
+     ([1], 'clean-sampling', {1: 25}, multiclass_target),
+     (_sampling_strategy_func, 'over-sampling',
+      {1: 50, 2: 0, 3: 75}, multiclass_target),
+     (0.5, 'over-sampling', {1: 25}, binary_target),
+     (0.5, 'under-sampling', {0: 50}, binary_target)])
+def test_check_ratio(ratio, sampling_type, expected_ratio, target):
+    with pytest.warns(DeprecationWarning, match="check_ratio is deprecated"):
+        ratio_ = check_ratio(ratio, target, sampling_type)
+    assert ratio_ == expected_ratio
+
+
+def test_sampling_strategy_dict_over_sampling():
+    y = np.array([1] * 50 + [2] * 100 + [3] * 25)
+    sampling_strategy = {1: 70, 2: 140, 3: 70}
+    expected_msg = ("After over-sampling, the number of samples \(140\) in"
+                    " class 2 will be larger than the number of samples in the"
+                    " majority class \(class #2 -> 100\)")
+    with warns(UserWarning, expected_msg):
+        check_sampling_strategy(sampling_strategy, y, 'over-sampling')
 
 
-def test_ratio_callable_args():
+def test_sampling_strategy_callable_args():
     y = np.array([1] * 50 + [2] * 100 + [3] * 25)
     multiplier = {1: 1.5, 2: 1, 3: 3}
 
-    def ratio_func(y, multiplier):
+    def sampling_strategy_func(y, multiplier):
         """samples such that each class will be affected by the multiplier."""
         target_stats = Counter(y)
-        return {key: int(values * multiplier[key])
-                for key, values in target_stats.items()}
-
-    ratio_ = check_ratio(ratio_func, y, 'over-sampling',
-                         multiplier=multiplier)
-    assert ratio_ == {1: 25, 2: 0, 3: 50}
+        return {
+            key: int(values * multiplier[key])
+            for key, values in target_stats.items()
+        }
+
+    sampling_strategy_ = check_sampling_strategy(
+        sampling_strategy_func, y, 'over-sampling', multiplier=multiplier)
+    assert sampling_strategy_ == {1: 25, 2: 0, 3: 50}
 
 
 def test_hash_X_y():
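The callable form of ``sampling_strategy`` exercised by
``test_sampling_strategy_callable_args`` above can be summarised with a
minimal sketch; the expected result comes from that test, and the import
assumes ``check_sampling_strategy`` is exposed from ``imblearn.utils`` as
listed in ``api.rst``::

    from collections import Counter

    import numpy as np

    from imblearn.utils import check_sampling_strategy

    y = np.array([1] * 50 + [2] * 100 + [3] * 25)


    def sampling_strategy_func(y, multiplier):
        # ask for multiplier[klass] * count samples for each class
        target_stats = Counter(y)
        return {key: int(count * multiplier[key])
                for key, count in target_stats.items()}


    # The callable is evaluated on y and extra keyword arguments are
    # forwarded; for an over-sampler the requested counts are converted into
    # the number of new samples to generate per class: {1: 25, 2: 0, 3: 50}.
    print(check_sampling_strategy(sampling_strategy_func, y, 'over-sampling',
                                  multiplier={1: 1.5, 2: 1, 3: 3}))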
diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py
index 58488463a..7d4ad4495 100644
--- a/imblearn/utils/validation.py
+++ b/imblearn/utils/validation.py
@@ -6,7 +6,7 @@
 import warnings
 from collections import Counter
 
-from numbers import Integral
+from numbers import Integral, Real
 
 import numpy as np
 
@@ -14,6 +14,7 @@
 from sklearn.neighbors import NearestNeighbors
 from sklearn.externals import six, joblib
 from sklearn.utils.multiclass import type_of_target
+from sklearn.utils.deprecation import deprecated
 
 from ..exceptions import raise_isinstance_error
 
@@ -122,111 +123,148 @@ def hash_X_y(X, y, n_samples=10, n_features=5):
     return joblib.hash(X[row_idx, col_idx]), joblib.hash(y[row_idx])
 
 
-def _ratio_all(y, sampling_type):
-    """Returns ratio by targeting all classes."""
+def _sampling_strategy_all(y, sampling_type):
+    """Returns sampling target by targeting all classes."""
     target_stats = Counter(y)
     if sampling_type == 'over-sampling':
         n_sample_majority = max(target_stats.values())
-        ratio = {key: n_sample_majority - value
-                 for (key, value) in target_stats.items()}
+        sampling_strategy = {
+            key: n_sample_majority - value
+            for (key, value) in target_stats.items()
+        }
     elif (sampling_type == 'under-sampling' or
           sampling_type == 'clean-sampling'):
         n_sample_minority = min(target_stats.values())
-        ratio = {key: n_sample_minority for key in target_stats.keys()}
+        sampling_strategy = {
+            key: n_sample_minority
+            for key in target_stats.keys()
+        }
     else:
         raise NotImplementedError
 
-    return ratio
+    return sampling_strategy
 
 
-def _ratio_majority(y, sampling_type):
-    """Returns ratio by targeting the majority class only."""
+def _sampling_strategy_majority(y, sampling_type):
+    """Returns sampling target by targeting the majority class only."""
     if sampling_type == 'over-sampling':
-        raise ValueError("'ratio'='majority' cannot be used with"
+        raise ValueError("'sampling_strategy'='majority' cannot be used with"
                          " over-sampler.")
     elif (sampling_type == 'under-sampling' or
           sampling_type == 'clean-sampling'):
         target_stats = Counter(y)
         class_majority = max(target_stats, key=target_stats.get)
         n_sample_minority = min(target_stats.values())
-        ratio = {key: n_sample_minority
-                 for key in target_stats.keys()
-                 if key == class_majority}
+        sampling_strategy = {
+            key: n_sample_minority
+            for key in target_stats.keys() if key == class_majority
+        }
     else:
         raise NotImplementedError
 
-    return ratio
+    return sampling_strategy
 
 
-def _ratio_not_minority(y, sampling_type):
-    """Returns ratio by targeting all classes but not the minority."""
+def _sampling_strategy_not_majority(y, sampling_type):
+    """Returns sampling target by targeting all classes but not the
+    majority."""
+    target_stats = Counter(y)
+    if sampling_type == 'over-sampling':
+        n_sample_majority = max(target_stats.values())
+        class_majority = max(target_stats, key=target_stats.get)
+        sampling_strategy = {
+            key: n_sample_majority - value
+            for (key, value) in target_stats.items() if key != class_majority
+        }
+    elif (sampling_type == 'under-sampling' or
+          sampling_type == 'clean-sampling'):
+        n_sample_minority = min(target_stats.values())
+        class_majority = max(target_stats, key=target_stats.get)
+        sampling_strategy = {
+            key: n_sample_minority
+            for key in target_stats.keys() if key != class_majority
+        }
+    else:
+        raise NotImplementedError
+
+    return sampling_strategy
+
+
+def _sampling_strategy_not_minority(y, sampling_type):
+    """Returns sampling target by targeting all classes but not the
+    minority."""
     target_stats = Counter(y)
     if sampling_type == 'over-sampling':
         n_sample_majority = max(target_stats.values())
         class_minority = min(target_stats, key=target_stats.get)
-        ratio = {key: n_sample_majority - value
-                 for (key, value) in target_stats.items()
-                 if key != class_minority}
+        sampling_strategy = {
+            key: n_sample_majority - value
+            for (key, value) in target_stats.items() if key != class_minority
+        }
     elif (sampling_type == 'under-sampling' or
           sampling_type == 'clean-sampling'):
         n_sample_minority = min(target_stats.values())
         class_minority = min(target_stats, key=target_stats.get)
-        ratio = {key: n_sample_minority
-                 for key in target_stats.keys()
-                 if key != class_minority}
+        sampling_strategy = {
+            key: n_sample_minority
+            for key in target_stats.keys() if key != class_minority
+        }
    else:
         raise NotImplementedError
 
-    return ratio
+    return sampling_strategy
 
 
-def _ratio_minority(y, sampling_type):
-    """Returns ratio by targeting the minority class only."""
+def _sampling_strategy_minority(y, sampling_type):
+    """Returns sampling target by targeting the minority class only."""
     target_stats = Counter(y)
     if sampling_type == 'over-sampling':
         n_sample_majority = max(target_stats.values())
         class_minority = min(target_stats, key=target_stats.get)
-        ratio = {key: n_sample_majority - value
-                 for (key, value) in target_stats.items()
-                 if key == class_minority}
+        sampling_strategy = {
+            key: n_sample_majority - value
+            for (key, value) in target_stats.items() if key == class_minority
+        }
     elif (sampling_type == 'under-sampling' or
           sampling_type == 'clean-sampling'):
-        raise ValueError("'ratio'='minority' cannot be used with"
+        raise ValueError("'sampling_strategy'='minority' cannot be used with"
                          " under-sampler and clean-sampler.")
     else:
         raise NotImplementedError
 
-    return ratio
+    return sampling_strategy
 
 
-def _ratio_auto(y, sampling_type):
-    """Returns ratio auto for over-sampling and not-minority for
+def _sampling_strategy_auto(y, sampling_type):
+    """Returns 'not majority' targets for over-sampling and 'not minority' for
     under-sampling."""
     if sampling_type == 'over-sampling':
-        return _ratio_all(y, sampling_type)
+        return _sampling_strategy_not_majority(y, sampling_type)
     elif (sampling_type == 'under-sampling' or
           sampling_type == 'clean-sampling'):
-        return _ratio_not_minority(y, sampling_type)
+        return _sampling_strategy_not_minority(y, sampling_type)
 
 
-def _ratio_dict(ratio, y, sampling_type):
-    """Returns ratio by converting the dictionary depending of the sampling."""
+def _sampling_strategy_dict(sampling_strategy, y, sampling_type):
+    """Returns sampling target by converting the dictionary depending on the
+    sampling."""
     target_stats = Counter(y)
-    # check that all keys in ratio are also in y
-    set_diff_ratio_target = set(ratio.keys()) - set(target_stats.keys())
-    if len(set_diff_ratio_target) > 0:
+    # check that all keys in sampling_strategy are also in y
+    set_diff_sampling_strategy_target = (
+        set(sampling_strategy.keys()) - set(target_stats.keys()))
+    if len(set_diff_sampling_strategy_target) > 0:
         raise ValueError("The {} target class is/are not present in the"
-                         " data.".format(set_diff_ratio_target))
+                         " data.".format(set_diff_sampling_strategy_target))
     # check that there is no negative number
-    if any(n_samples < 0 for n_samples in ratio.values()):
+    if any(n_samples < 0 for n_samples in sampling_strategy.values()):
         raise ValueError("The number of samples in a class cannot be negative."
-                         "'ratio' contains some negative value: {}".format(
-                             ratio))
-    ratio_ = {}
+                         "'sampling_strategy' contains some negative value: {}"
+                         .format(sampling_strategy))
+    sampling_strategy_ = {}
     if sampling_type == 'over-sampling':
         n_samples_majority = max(target_stats.values())
         class_majority = max(target_stats, key=target_stats.get)
-        for class_sample, n_samples in ratio.items():
+        for class_sample, n_samples in sampling_strategy.items():
             if n_samples < target_stats[class_sample]:
                 raise ValueError("With over-sampling methods, the number"
                                  " of samples in a class should be greater"
@@ -241,9 +279,10 @@ def _ratio_dict(ratio, y, sampling_type):
                               " {})".format(n_samples, class_sample,
                                             class_majority,
                                             n_samples_majority))
-            ratio_[class_sample] = n_samples - target_stats[class_sample]
+            sampling_strategy_[class_sample] = (
+                n_samples - target_stats[class_sample])
     elif sampling_type == 'under-sampling':
-        for class_sample, n_samples in ratio.items():
+        for class_sample, n_samples in sampling_strategy.items():
             if n_samples > target_stats[class_sample]:
                 raise ValueError("With under-sampling methods, the number of"
                                  " samples in a class should be less or equal"
@@ -251,24 +290,219 @@ def _ratio_dict(ratio, y, sampling_type):
                                  " Originally, there is {} samples and {}"
                                  " samples are asked.".format(
                                      target_stats[class_sample], n_samples))
-            ratio_[class_sample] = n_samples
+            sampling_strategy_[class_sample] = n_samples
" + "Please give a list of the classes to be targeted by the" + " sampling.", DeprecationWarning) # clean-sampling can be more permissive since those samplers do not # use samples - for class_sample, n_samples in ratio.items(): - ratio_[class_sample] = n_samples + for class_sample, n_samples in sampling_strategy.items(): + sampling_strategy_[class_sample] = n_samples else: raise NotImplementedError - return ratio_ + return sampling_strategy_ + + +def _sampling_strategy_list(sampling_strategy, y, sampling_type): + """With cleaning methods, sampling_strategy can be a list to target the + class of interest.""" + if sampling_type != 'clean-sampling': + raise ValueError("'sampling_strategy' cannot be a list for samplers " + "which are not cleaning methods.") + + target_stats = Counter(y) + # check that all keys in sampling_strategy are also in y + set_diff_sampling_strategy_target = ( + set(sampling_strategy) - set(target_stats.keys())) + if len(set_diff_sampling_strategy_target) > 0: + raise ValueError("The {} target class is/are not present in the" + " data.".format(set_diff_sampling_strategy_target)) + + return { + class_sample: min(target_stats.values()) + for class_sample in sampling_strategy + } + + +def _sampling_strategy_float(sampling_strategy, y, sampling_type): + """Take a proportion of the majority (over-sampling) or minority + (under-sampling) class in binary classification.""" + type_y = type_of_target(y) + if type_y != 'binary': + raise ValueError( + '"sampling_strategy" can be a float only when the type ' + 'of target is binary. For multi-class, use a dict.') + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + sampling_strategy_ = { + key: int(n_sample_majority * sampling_strategy - value) + for (key, value) in target_stats.items() if key != class_majority + } + elif (sampling_type == 'under-sampling'): + n_sample_minority = min(target_stats.values()) + class_minority = min(target_stats, key=target_stats.get) + sampling_strategy_ = { + key: int(n_sample_minority / sampling_strategy) + for (key, value) in target_stats.items() if key != class_minority + } + else: + raise ValueError("'clean-sampling' methods do let the user " + "specify the sampling ratio.") + return sampling_strategy_ + + +def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): + """Sampling target validation for samplers. + + Checks that ``sampling_strategy`` is of consistent type and return a + dictionary containing each targeted class with its corresponding + number of sample. It is used in :class:`imblearn.base.BaseSampler`. + + Parameters + ---------- + sampling_strategy : float, str, dict, list or callable, + Sampling information to sample the data set. + + - When ``float``: + + For **under-sampling methods**, it corresponds to the ratio + :math:`\\alpha_{us}` defined by :math:`N_{rM} = \\alpha_{us} + \\times N_{m}` where :math:`N_{rM}` and :math:`N_{m}` are the + number of samples in the majority class after resampling and the + number of samples in the minority class, respectively; + + For **over-sampling methods**, it correspond to the ratio + :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} + \\times N_{m}` where :math:`N_{rm}` and :math:`N_{M}` are the + number of samples in the minority class after resampling and the + number of samples in the majority class, respectively. + + .. 
warning:: + ``float`` is only available for **binary** classification. An + error is raised for multi-class classification and with cleaning + samplers. + + - When ``str``, specify the class targeted by the resampling. For + **under- and over-sampling methods**, the number of samples in the + different classes will be equalized. For **cleaning methods**, the + number of samples will not be equal. Possible choices are: + + ``'minority'``: resample only the minority class; + + ``'majority'``: resample only the majority class; + ``'not minority'``: resample all classes but the minority class; + ``'not majority'``: resample all classes but the majority class; + + ``'all'``: resample all classes; + + ``'auto'``: for under-sampling methods, equivalent to ``'not + minority'`` and for over-sampling methods, equivalent to ``'not + majority'``. + + - When ``dict``, the keys correspond to the targeted classes. The + values correspond to the desired number of samples for each targeted + class. + + .. warning:: + ``dict`` is available for both **under- and over-sampling + methods**. An error is raised with **cleaning methods**. Use a + ``list`` instead. + + - When ``list``, the list contains the targeted classes. It used only + for **cleaning methods``. + + .. warning:: + ``list`` is available for **cleaning methods**. An error is raised + with **under- and over-sampling methods**. + + - When callable, function taking ``y`` and returns a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples for each class. + + y : ndarray, shape (n_samples,) + The target array. + + sampling_type : str, + The type of sampling. Can be either ``'over-sampling'``, + ``'under-sampling'``, or ``'clean-sampling'``. + + kwargs : dict, optional + Dictionary of additional keyword arguments to pass to + ``sampling_strategy`` when this is a callable. + + Returns + ------- + sampling_strategy_converted : dict, + The converted and validated sampling target. Returns a dictionary with + the key being the class target and the value being the desired + number of samples. + + """ + if sampling_type not in SAMPLING_KIND: + raise ValueError("'sampling_type' should be one of {}. Got '{}'" + " instead.".format(SAMPLING_KIND, sampling_type)) + + if np.unique(y).size <= 1: + raise ValueError("The target 'y' needs to have more than 1 class." + " Got {} class instead".format(np.unique(y).size)) + + if sampling_type == 'ensemble': + return sampling_strategy + + if isinstance(sampling_strategy, six.string_types): + if sampling_strategy not in SAMPLING_TARGET_KIND.keys(): + raise ValueError("When 'sampling_strategy' is a string, it needs" + " to be one of {}. Got '{}' instead.".format( + SAMPLING_TARGET_KIND, sampling_strategy)) + return SAMPLING_TARGET_KIND[sampling_strategy](y, sampling_type) + elif isinstance(sampling_strategy, dict): + return _sampling_strategy_dict(sampling_strategy, y, sampling_type) + elif isinstance(sampling_strategy, list): + return _sampling_strategy_list(sampling_strategy, y, sampling_type) + elif isinstance(sampling_strategy, Real): + if sampling_strategy <= 0 or sampling_strategy > 1: + raise ValueError( + "When 'sampling_strategy' is a float, it should be " + "in the range (0, 1]. Got {} instead." 
+        return _sampling_strategy_float(sampling_strategy, y, sampling_type)
+    elif callable(sampling_strategy):
+        sampling_strategy_ = sampling_strategy(y, **kwargs)
+        return _sampling_strategy_dict(sampling_strategy_, y, sampling_type)
+
+
+SAMPLING_TARGET_KIND = {
+    'minority': _sampling_strategy_minority,
+    'majority': _sampling_strategy_majority,
+    'not minority': _sampling_strategy_not_minority,
+    'not majority': _sampling_strategy_not_majority,
+    'all': _sampling_strategy_all,
+    'auto': _sampling_strategy_auto
+}
+
+
+@deprecated("imblearn.utils.check_ratio was deprecated in favor of "
+            "imblearn.utils.check_sampling_strategy in 0.4. It will be "
+            "removed in 0.6.")
 def check_ratio(ratio, y, sampling_type, **kwargs):
-    """Ratio validation for samplers.
+    """Sampling target validation for samplers.
 
     Checks ratio for consistent type and return a
     dictionary containing each targeted class with its corresponding number of
-    pixel.
+    samples.
+
+    .. deprecated:: 0.4
+       This function is deprecated in favor of
+       :func:`imblearn.utils.check_sampling_strategy`. It will be removed in
+       0.6.
 
     Parameters
     ----------
@@ -307,32 +541,4 @@ def check_ratio(ratio, y, sampling_type, **kwargs):
     number of samples.
 
     """
-    if sampling_type not in SAMPLING_KIND:
-        raise ValueError("'sampling_type' should be one of {}. Got '{}'"
-                         " instead.".format(SAMPLING_KIND, sampling_type))
-
-    if np.unique(y).size <= 1:
-        raise ValueError("The target 'y' needs to have more than 1 class."
-                         " Got {} class instead".format(np.unique(y).size))
-
-    if sampling_type == 'ensemble':
-        return ratio
-
-    if isinstance(ratio, six.string_types):
-        if ratio not in RATIO_KIND.keys():
-            raise ValueError("When 'ratio' is a string, it needs to be one of"
-                             " {}. Got '{}' instead.".format(RATIO_KIND,
-                                                             ratio))
-        return RATIO_KIND[ratio](y, sampling_type)
-    elif isinstance(ratio, dict):
-        return _ratio_dict(ratio, y, sampling_type)
-    elif callable(ratio):
-        ratio_ = ratio(y, **kwargs)
-        return _ratio_dict(ratio_, y, sampling_type)
-
-
-RATIO_KIND = {'minority': _ratio_minority,
-              'majority': _ratio_majority,
-              'not minority': _ratio_not_minority,
-              'all': _ratio_all,
-              'auto': _ratio_auto}
+    return check_sampling_strategy(ratio, y, sampling_type, **kwargs)
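A minimal usage sketch of the renamed validation helper: the expected
dictionaries come from the parametrised tests above, the binary target layout
(100 samples of class 0 versus 25 of class 1) is an assumption chosen to match
those expected values, and the imports assume both helpers remain importable
from ``imblearn.utils``::

    import numpy as np

    from imblearn.utils import check_ratio, check_sampling_strategy

    multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25)
    # assumed layout, consistent with the expected values used in the tests
    binary_target = np.array([0] * 100 + [1] * 25)

    # 'auto' with an under-sampler targets every class but the minority one.
    print(check_sampling_strategy('auto', multiclass_target, 'under-sampling'))
    # -> {1: 25, 2: 25}

    # A float is only valid for binary targets: 0.5 requests a minority to
    # majority ratio of 0.5 after resampling, hence 25 / 0.5 = 50 samples
    # are kept in the majority class.
    print(check_sampling_strategy(0.5, binary_target, 'under-sampling'))
    # -> {0: 50}

    # The deprecated entry point delegates to the new helper and now warns.
    print(check_ratio('auto', multiclass_target, 'under-sampling'))
    # -> {1: 25, 2: 25}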