Skip to content

Commit 45b538c

Browse files
authored
ENH accept non finite values in random samplers (#643)
1 parent 8039bd4 commit 45b538c

File tree

5 files changed

+59
-4
lines changed

5 files changed

+59
-4
lines changed

doc/whats_new/v0.6.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ Enhancement
5151
to check or not the input ``X`` and ``y``.
5252
:pr:`637` by :user:`Guillaume Lemaitre <glemaitre>`.
5353

54+
- :class:`imblearn.under_sampling.RandomUnderSampler` and
55+
:class:`imblearn.over_sampling.RandomOverSampler` can resample when non
56+
finite values are present in ``X``.
57+
:pr:`643` by :user:`Guillaume Lemaitre <glemaitre>`.
58+
5459
Deprecation
5560
...........
5661

imblearn/over_sampling/_random_over_sampler.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ def _check_X_y(X, y):
7979
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
8080
if not hasattr(X, "loc"):
8181
# Do not convert dataframe
82-
X = check_array(X, accept_sparse=["csr", "csc"], dtype=None)
82+
X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
83+
force_all_finite=False)
8384
y = check_array(
8485
y, accept_sparse=["csr", "csc"], dtype=None, ensure_2d=False
8586
)
@@ -108,4 +109,8 @@ def _fit_resample(self, X, y):
108109
)
109110

110111
def _more_tags(self):
111-
return {"X_types": ["2darray", "string"], "sample_indices": True}
112+
return {
113+
"X_types": ["2darray", "string"],
114+
"sample_indices": True,
115+
"allow_nan": True,
116+
}

imblearn/over_sampling/tests/test_random_over_sampler.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,23 @@ def test_random_over_sampling_heterogeneous_data():
125125
assert y_res.shape[0] == 4
126126
assert X_res.dtype == object
127127
assert X_res[-1, 0] in X_hetero[:, 0]
128+
129+
130+
def test_random_over_sampling_nan_inf():
131+
# check that we can oversample even with missing or infinite data
132+
# regression tests for #605
133+
rng = np.random.RandomState(42)
134+
n_not_finite = X.shape[0] // 3
135+
row_indices = rng.choice(np.arange(X.shape[0]), size=n_not_finite)
136+
col_indices = rng.randint(0, X.shape[1], size=n_not_finite)
137+
not_finite_values = rng.choice([np.nan, np.inf], size=n_not_finite)
138+
139+
X_ = X.copy()
140+
X_[row_indices, col_indices] = not_finite_values
141+
142+
ros = RandomOverSampler(random_state=0)
143+
X_res, y_res = ros.fit_resample(X_, Y)
144+
145+
assert y_res.shape == (14,)
146+
assert X_res.shape == (14, 2)
147+
assert np.any(~np.isfinite(X_res))

imblearn/under_sampling/_prototype_selection/_random_under_sampler.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ def _check_X_y(X, y):
8585
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
8686
if not hasattr(X, "loc"):
8787
# Do not convert dataframe
88-
X = check_array(X, accept_sparse=["csr", "csc"], dtype=None)
88+
X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
89+
force_all_finite=False)
8990
y = check_array(
9091
y, accept_sparse=["csr", "csc"], dtype=None, ensure_2d=False
9192
)
@@ -121,4 +122,8 @@ def _fit_resample(self, X, y):
121122
return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)
122123

123124
def _more_tags(self):
124-
return {"X_types": ["2darray", "string"], "sample_indices": True}
125+
return {
126+
"X_types": ["2darray", "string"],
127+
"sample_indices": True,
128+
"allow_nan": True,
129+
}

imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,23 @@ def test_random_under_sampling_heterogeneous_data():
110110
assert X_res.shape[0] == 2
111111
assert y_res.shape[0] == 2
112112
assert X_res.dtype == object
113+
114+
115+
def test_random_under_sampling_nan_inf():
116+
# check that we can undersample even with missing or infinite data
117+
# regression tests for #605
118+
rng = np.random.RandomState(42)
119+
n_not_finite = X.shape[0] // 3
120+
row_indices = rng.choice(np.arange(X.shape[0]), size=n_not_finite)
121+
col_indices = rng.randint(0, X.shape[1], size=n_not_finite)
122+
not_finite_values = rng.choice([np.nan, np.inf], size=n_not_finite)
123+
124+
X_ = X.copy()
125+
X_[row_indices, col_indices] = not_finite_values
126+
127+
rus = RandomUnderSampler(random_state=0)
128+
X_res, y_res = rus.fit_resample(X_, Y)
129+
130+
assert y_res.shape == (6,)
131+
assert X_res.shape == (6, 2)
132+
assert np.any(~np.isfinite(X_res))

0 commit comments

Comments
 (0)