-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
Copy pathplot_ratio_usage.py
134 lines (97 loc) · 4.88 KB
/
plot_ratio_usage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
============================================================
Usage of the ``ratio`` parameter for the different algorithm
============================================================
This example shows how to use the ``ratio`` parameter in the different
examples. It illustrates passing ``ratio`` as a ``str``, ``dict`` or
a callable.
"""
# Authors: Guillaume Lemaitre <[email protected]>
# License: MIT
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance
from imblearn.under_sampling import RandomUnderSampler
print(__doc__)
def plot_pie(y):
    """Draw a pie chart of the class distribution found in ``y``.

    Every distinct value of ``y`` becomes one wedge, sized by its number of
    occurrences and annotated with its percentage share. The chart is drawn
    on a newly created figure; nothing is returned.
    """
    class_counts = Counter(y)
    fig, ax = plt.subplots()
    ax.pie(
        list(class_counts.values()),
        # Offset every wedge from the centre by the same amount.
        explode=(0.1,) * len(class_counts),
        labels=list(class_counts.keys()),
        shadow=True,
        autopct='%1.1f%%',
    )
    # Equal axis scaling so the pie is rendered as a circle.
    ax.axis('equal')
###############################################################################
# Creation of an imbalanced data set from a balanced data set
###############################################################################
###############################################################################
# We will show how to use the parameter ``ratio`` when dealing with the
# ``make_imbalance`` function. For this function, this parameter accepts either
# a dictionary or a callable. When using a dictionary, each key corresponds to
# the class of interest and the corresponding value is the number of samples
# desired in this class.
# Load the iris data set and report its class distribution.
iris = load_iris()
print(
    'Information of the original iris data set: \n {}'
    .format(Counter(iris.target))
)
plot_pie(iris.target)

# Dictionary form: class label -> number of samples desired in that class.
ratio = {0: 10, 1: 20, 2: 30}
X, y = make_imbalance(iris.data, iris.target, ratio=ratio)
print(
    'Information of the iris data set after making it'
    ' imbalanced using a dict: \n ratio={} \n y: {}'
    .format(ratio, Counter(y))
)
plot_pie(y)
###############################################################################
# You might require more flexibility and want your own heuristic to determine
# the number of samples per class; in that case you can define your own
# callable as follows. Here we define a function which uses a float
# multiplier to define the number of samples per class.
def ratio_multiplier(y):
    """Return the per-class sample counts of ``y`` scaled by fixed factors.

    The occurrences of each class in ``y`` are counted and multiplied by a
    class-specific factor (0.5 for class 0, 0.7 for class 1, 0.95 for
    class 2); each product is truncated to an integer. The result is a
    ``Counter`` mapping class label to the scaled count. A class that has
    no factor raises ``KeyError``.
    """
    scale = {0: 0.5, 1: 0.7, 2: 0.95}
    return Counter({
        label: int(count * scale[label])
        for label, count in Counter(y).items()
    })
# Callable form: ``ratio_multiplier`` is handed over as the ``ratio`` argument.
X, y = make_imbalance(iris.data, iris.target, ratio=ratio_multiplier)
print(
    'Information of the iris data set after making it'
    ' imbalanced using a callable: \n ratio={} \n y: {}'
    .format(ratio_multiplier, Counter(y))
)
plot_pie(y)
###############################################################################
# Using ``ratio`` in resampling algorithm
###############################################################################
###############################################################################
# In all sampling algorithms, ``ratio`` can be used as illustrated earlier. In
# addition, some predefined functions are available and can be selected using a
# ``str`` with the following choices: (i) ``'minority'``: resample the minority
# class; (ii) ``'majority'``: resample the majority class; (iii) ``'not
# minority'``: resample all classes apart from the minority class; (iv)
# ``'all'``: resample all classes; and (v) ``'auto'``: corresponds to ``'all'``
# for over-sampling methods and ``'not minority'`` for under-sampling methods.
# The classes targeted will be over-sampled or under-sampled to achieve an
# equal number of samples with the majority or minority class.
# String form: use the predefined 'auto' strategy.
ratio = 'auto'
under_sampler = RandomUnderSampler(ratio=ratio, random_state=0)
X_res, y_res = under_sampler.fit_sample(X, y)
print(
    'Information of the iris data set after balancing using "auto"'
    ' mode:\n ratio={} \n y: {}'
    .format(ratio, Counter(y_res))
)
plot_pie(y_res)
###############################################################################
# However, you can use the dictionary or the callable options as previously
# mentioned.
# Dictionary form: class label -> number of samples passed as ``ratio``.
ratio = {0: 25, 1: 30, 2: 35}
under_sampler = RandomUnderSampler(ratio=ratio, random_state=0)
X_res, y_res = under_sampler.fit_sample(X, y)
print(
    'Information of the iris data set after balancing using a dict'
    ' mode:\n ratio={} \n y: {}'
    .format(ratio, Counter(y_res))
)
plot_pie(y_res)
def ratio_multiplier(y):
    """Return the per-class sample counts of ``y``, scaled where configured.

    The occurrences of each class in ``y`` are counted. Classes listed in
    the multiplier table (0.7 for class 1, 0.95 for class 2) have their
    count multiplied by the factor and truncated to an integer; every other
    class keeps its original count. The result is a ``Counter`` mapping
    class label to the resulting count.
    """
    multiplier = {1: 0.7, 2: 0.95}
    target_stats = Counter(y)
    for key, value in target_stats.items():
        # Classes without a multiplier (e.g. class 0) are left untouched
        # instead of raising ``KeyError``.
        if key in multiplier:
            target_stats[key] = int(value * multiplier[key])
    return target_stats


# Pass the callable itself (not the dict used in the previous section) so that
# this section really demonstrates the callable option it announces.
X_res, y_res = (RandomUnderSampler(ratio=ratio_multiplier, random_state=0)
                .fit_sample(X, y))
print('Information of the iris data set after balancing using a callable'
      ' mode:\n ratio={} \n y: {}'.format(ratio_multiplier, Counter(y_res)))
plot_pie(y_res)

plt.show()