"""Class to perform random over-sampling."""
# Authors: Guillaume Lemaitre <[email protected]>
# Christos Aridas
# License: MIT
from __future__ import division
from collections import Counter
import numpy as np
from sklearn.utils import check_X_y, check_random_state, safe_indexing
from .base import BaseOverSampler
from ..utils import check_target_type
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring
@Substitution(
    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
    random_state=_random_state_docstring)
class RandomOverSampler(BaseOverSampler):
    """Class to perform random over-sampling.

    Object to over-sample the minority class(es) by picking samples at random
    with replacement.

    Read more in the :ref:`User Guide <random_over_sampler>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    return_indices : bool, optional (default=False)
        Whether or not to return the indices of the samples randomly selected
        in the corresponding classes.

    ratio : str, dict, or callable
        .. deprecated:: 0.4
           Use the parameter ``sampling_strategy`` instead. It will be removed
           in 0.6.

    Notes
    -----
    Supports multi-class resampling by sampling each class independently.
    Supports heterogeneous data as object array containing string and numeric
    data.

    See
    :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`,
    :ref:`sphx_glr_auto_examples_over-sampling_plot_random_over_sampling.py`,
    and
    :ref:`sphx_glr_auto_examples_applications_plot_over_sampling_benchmark_lfw.py`.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.over_sampling import \
RandomOverSampler # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> ros = RandomOverSampler(random_state=42)
    >>> X_res, y_res = ros.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 900, 1: 900}})

    """
    def __init__(self, sampling_strategy='auto',
                 return_indices=False,
                 random_state=None,
                 ratio=None):
        super(RandomOverSampler, self).__init__(
            sampling_strategy=sampling_strategy, ratio=ratio)
        self.return_indices = return_indices
        self.random_state = random_state

    @staticmethod
    def _check_X_y(X, y):
        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
        return X, y, binarize_y
    def _fit_resample(self, X, y, sample_weight=None):
        random_state = check_random_state(self.random_state)
        target_stats = Counter(y)

        # Start from every original sample, then append the bootstrapped ones.
        sample_indices = range(X.shape[0])

        for class_sample, num_samples in self.sampling_strategy_.items():
            target_class_indices = np.flatnonzero(y == class_sample)
            # Draw ``num_samples`` positions with replacement within the class.
            indices = random_state.randint(
                low=0, high=target_stats[class_sample], size=num_samples)

            sample_indices = np.append(sample_indices,
                                       target_class_indices[indices])

        # Index X, y (and the optional sample_weight) with the selected rows.
        resampled_arrays = [safe_indexing(arr, sample_indices)
                            for arr in (X, y, sample_weight)
                            if arr is not None]

        if self.return_indices:
            return tuple(resampled_arrays + [sample_indices])
        return tuple(resampled_arrays)
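

# The private helper below is an illustrative sketch only and is not part of
# the estimator API: it shows the two behaviours stated in the Notes above,
# resampling a heterogeneous object array and returning the bootstrap indices
# via ``return_indices``. The data and the helper name are made up for the
# example, assuming the 0.4 ``fit_resample`` interface documented above.
def _usage_sketch():
    X = np.array([['carrot', 1.0], ['apple', 2.0],
                  ['apple', 3.0], ['apple', 4.0]], dtype=object)
    y = np.array([0, 1, 1, 1])

    ros = RandomOverSampler(return_indices=True, random_state=0)
    X_res, y_res, sample_indices = ros.fit_resample(X, y)

    # ``sample_indices`` maps each resampled row back to a row of the original
    # ``X``, so the mixed string/numeric columns pass through unchanged.
    return X_res, y_res, sample_indices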