Commit e00e7f8

FIX: enable bootstrapping in bagging (#360)

Parent: c70333b

4 files changed (+49 -66 lines)

Diff for: doc/ensemble.rst

+5 -5

@@ -82,9 +82,9 @@ classifier will favor the majority classes::
     BaggingClassifier(...)
     >>> y_pred = bc.predict(X_test)
     >>> confusion_matrix(y_test, y_pred)
-    array([[   0,    0,   12],
-           [   0,    0,   59],
-           [   0,    0, 1179]])
+    array([[   9,    1,    2],
+           [   0,   54,    5],
+           [   1,    6, 1172]])

 :class:`BalancedBaggingClassifier` allows to resample each subset of data
 before to train each estimator of the ensemble. In short, it combines the
@@ -105,8 +105,8 @@ takes the same parameters than the scikit-learn
     >>> y_pred = bbc.predict(X_test)
     >>> confusion_matrix(y_test, y_pred)
     array([[  12,    0,    0],
-           [   0,   55,    4],
-           [  68,   53, 1058]])
+           [   1,   54,    4],
+           [  49,   53, 1077]])

 See
 :ref:`sphx_glr_auto_examples_ensemble_plot_comparison_bagging_classifier.py`.
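For context, the two confusion matrices above compare a plain BaggingClassifier (bc), which favors the majority class, against a BalancedBaggingClassifier (bbc), which resamples each bootstrap subset before fitting. A minimal sketch of that comparison; the synthetic dataset below is an assumption for illustration, not the one used in the documentation:

    # Sketch only: make_classification stands in for the docs' dataset.
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix
    from sklearn.ensemble import BaggingClassifier
    from imblearn.ensemble import BalancedBaggingClassifier

    X, y = make_classification(n_samples=5000, n_classes=3, n_informative=4,
                               weights=[0.01, 0.05, 0.94], random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    bc = BaggingClassifier(random_state=0).fit(X_train, y_train)
    bbc = BalancedBaggingClassifier(random_state=0).fit(X_train, y_train)
    print(confusion_matrix(y_test, bc.predict(X_test)))   # majority dominates
    print(confusion_matrix(y_test, bbc.predict(X_test)))  # minority recalled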

Diff for: examples/ensemble/plot_comparison_bagging_classifier.py

+5 -7

@@ -24,12 +24,11 @@
 import matplotlib.pyplot as plt
 import numpy as np

-from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
 from sklearn.ensemble import BaggingClassifier
 from sklearn.metrics import confusion_matrix

-from imblearn.datasets import make_imbalance
+from imblearn.datasets import fetch_datasets
 from imblearn.ensemble import BalancedBaggingClassifier

 from imblearn.metrics import classification_report_imbalanced
@@ -70,9 +69,8 @@ def plot_confusion_matrix(cm, classes,
     plt.xlabel('Predicted label')


-iris = load_iris()
-X, y = make_imbalance(iris.data, iris.target, ratio={0: 25, 1: 40, 2: 50},
-                      random_state=0)
+ozone = fetch_datasets()['ozone_level']
+X, y = ozone.data, ozone.target
 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

 bagging = BaggingClassifier(random_state=0)
@@ -90,15 +88,15 @@ def plot_confusion_matrix(cm, classes,
 print(classification_report_imbalanced(y_test, y_pred_bagging))
 cm_bagging = confusion_matrix(y_test, y_pred_bagging)
 plt.figure()
-plot_confusion_matrix(cm_bagging, classes=iris.target_names,
+plot_confusion_matrix(cm_bagging, classes=np.unique(ozone.target),
                       title='Confusion matrix using BaggingClassifier')

 print('Classification results using a bagging classifier on balanced data')
 y_pred_balanced_bagging = balanced_bagging.predict(X_test)
 print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
 cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
 plt.figure()
-plot_confusion_matrix(cm_balanced_bagging, classes=iris.target_names,
+plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(ozone.target),
                       title='Confusion matrix using BalancedBaggingClassifier')

 plt.show()
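The example now loads the ozone_level benchmark through fetch_datasets, which fetches the imbalanced-learn benchmark collection and returns objects exposing data and target arrays. A quick sketch for inspecting the imbalance; the exact class counts are not part of this commit:

    from collections import Counter
    from imblearn.datasets import fetch_datasets

    ozone = fetch_datasets()['ozone_level']  # downloaded on first call
    print(ozone.data.shape)                  # feature matrix
    print(Counter(ozone.target))             # the positive class is rare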

Diff for: imblearn/ensemble/classifier.py

+0 -17

@@ -21,23 +21,6 @@
 old_generate = _generate_bagging_indices


-def _masked_bagging_indices(random_state, bootstrap_features,
-                            bootstrap_samples, n_features, n_samples,
-                            max_features, max_samples):
-    """Monkey-patch to always get a mask instead of indices"""
-    feature_indices, sample_indices = old_generate(random_state,
-                                                   bootstrap_features,
-                                                   bootstrap_samples,
-                                                   n_features, n_samples,
-                                                   max_features, max_samples)
-    sample_indices = indices_to_mask(sample_indices, n_samples)
-
-    return feature_indices, sample_indices
-
-
-sklearn.ensemble.bagging._generate_bagging_indices = _masked_bagging_indices
-
-
 class BalancedBaggingClassifier(BaggingClassifier):
     """A Bagging classifier with additional balancing.
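This deletion is the heart of the fix: the monkey-patch converted each set of bootstrap sample indices into a boolean mask, and a mask records only whether a sample was drawn, not how many times. Sampling with replacement (bootstrap=True) therefore silently behaved like sampling without replacement; removing the patch restores true bootstrapping. A small sketch of the information loss, with sklearn's indices_to_mask helper reimplemented here for illustration:

    import numpy as np

    def indices_to_mask(indices, mask_length):
        # simplified stand-in for the helper the removed patch called
        mask = np.zeros(mask_length, dtype=bool)
        mask[indices] = True
        return mask

    draws = np.array([1, 1, 3])            # bootstrap drew sample 1 twice
    mask = indices_to_mask(draws, 5)
    print(mask)                            # [False  True False  True False]
    print(draws.size, mask.sum())          # 3 draws collapse to 2 samples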
Diff for: imblearn/ensemble/tests/test_classifier.py

+39 -37

@@ -400,43 +400,45 @@ def test_oob_score_consistency():
     assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_


-def test_estimators_samples():
-    # Check that format of estimators_samples_ is correct and that results
-    # generated at fit time can be identically reproduced at a later time
-    # using data saved in object attributes.
-    X, y = make_hastie_10_2(n_samples=200, random_state=1)
-
-    # remap the y outside of the BalancedBaggingclassifier
-    # _, y = np.unique(y, return_inverse=True)
-    bagging = BalancedBaggingClassifier(LogisticRegression(), max_samples=0.5,
-                                        max_features=0.5, random_state=1,
-                                        bootstrap=False)
-    bagging.fit(X, y)
-
-    # Get relevant attributes
-    estimators_samples = bagging.estimators_samples_
-    estimators_features = bagging.estimators_features_
-    estimators = bagging.estimators_
-
-    # Test for correct formatting
-    assert len(estimators_samples) == len(estimators)
-    assert len(estimators_samples[0]) == len(X)
-    assert estimators_samples[0].dtype.kind == 'b'
-
-    # Re-fit single estimator to test for consistent sampling
-    estimator_index = 0
-    estimator_samples = estimators_samples[estimator_index]
-    estimator_features = estimators_features[estimator_index]
-    estimator = estimators[estimator_index]
-
-    X_train = (X[estimator_samples])[:, estimator_features]
-    y_train = y[estimator_samples]
-
-    orig_coefs = estimator.steps[-1][1].coef_
-    estimator.fit(X_train, y_train)
-    new_coefs = estimator.steps[-1][1].coef_
-
-    assert_array_almost_equal(orig_coefs, new_coefs)
+# FIXME: uncomment when #9723 is merged in scikit-learn
+# def test_estimators_samples():
+#     # Check that format of estimators_samples_ is correct and that results
+#     # generated at fit time can be identically reproduced at a later time
+#     # using data saved in object attributes.
+#     X, y = make_hastie_10_2(n_samples=200, random_state=1)
+
+#     # remap the y outside of the BalancedBaggingclassifier
+#     # _, y = np.unique(y, return_inverse=True)
+#     bagging = BalancedBaggingClassifier(LogisticRegression(),
+#                                         max_samples=0.5,
+#                                         max_features=0.5, random_state=1,
+#                                         bootstrap=False)
+#     bagging.fit(X, y)
+
+#     # Get relevant attributes
+#     estimators_samples = bagging.estimators_samples_
+#     estimators_features = bagging.estimators_features_
+#     estimators = bagging.estimators_
+
+#     # Test for correct formatting
+#     assert len(estimators_samples) == len(estimators)
+#     assert len(estimators_samples[0]) == len(X)
+#     assert estimators_samples[0].dtype.kind == 'b'
+
+#     # Re-fit single estimator to test for consistent sampling
+#     estimator_index = 0
+#     estimator_samples = estimators_samples[estimator_index]
+#     estimator_features = estimators_features[estimator_index]
+#     estimator = estimators[estimator_index]
+
+#     X_train = (X[estimator_samples])[:, estimator_features]
+#     y_train = y[estimator_samples]
+
+#     orig_coefs = estimator.steps[-1][1].coef_
+#     estimator.fit(X_train, y_train)
+#     new_coefs = estimator.steps[-1][1].coef_
+
+#     assert_array_almost_equal(orig_coefs, new_coefs)


 def test_max_samples_consistency():
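The disabled test assumed estimators_samples_ holds boolean masks (dtype.kind == 'b'); scikit-learn #9723 is expected to switch the attribute to integer index arrays, which is why the test waits on that merge. A hedged sketch of rebuilding one estimator's training set under either representation; the indices branch anticipates the post-#9723 behavior:

    import numpy as np

    def training_subset(X, y, samples, features):
        # handles both possible representations of estimators_samples_[i]
        if samples.dtype.kind == 'b':      # boolean mask (current behavior)
            rows = np.flatnonzero(samples)
        else:                              # integer indices (post-#9723)
            rows = samples
        return X[rows][:, features], y[rows]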
