Skip to content

Commit 224995f

Browse files
committed
Merge pull request #37 from dukebody/sparse-features-optional
Return a sparse array if any feature is sparse and `sparse=True` was passed to the `DataFrameMapper` constructor.
2 parents 70a224a + 1534015 commit 224995f

File tree

3 files changed

+87
-13
lines changed

3 files changed

+87
-13
lines changed

README.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ For these examples, we'll also use pandas, numpy, and sklearn::
4444
>>> import numpy as np
4545
>>> import sklearn.preprocessing, sklearn.decomposition, \
4646
... sklearn.linear_model, sklearn.pipeline, sklearn.metrics
47+
>>> from sklearn.feature_extraction.text import CountVectorizer
4748

4849
Load some Data
4950
**************
@@ -156,6 +157,20 @@ Only columns that are listed in the DataFrameMapper are kept. To keep a column b
156157
[ 1., 0., 0., 5.],
157158
[ 0., 0., 1., 4.]])
158159

160+
161+
Working with sparse features
162+
****************************
163+
164+
A ``DataFrameMapper`` returns a dense feature array by default. Setting ``sparse=True`` in the mapper makes it return a sparse array whenever any of the extracted features is sparse. Example:
165+
166+
>>> mapper4 = DataFrameMapper([
167+
... ('pet', CountVectorizer()),
168+
... ], sparse=True)
169+
>>> type(mapper4.fit_transform(data))
170+
<class 'scipy.sparse.csr.csr_matrix'>
171+
172+
The stacking of the sparse features is done without ever densifying them.
173+
159174
Cross-Validation
160175
----------------
161176

@@ -179,6 +194,7 @@ Changelog
179194
********************
180195

181196
* Raise ``KeyError`` when selecting nonexistent columns in the dataframe. Fixes #30.
197+
* Return a sparse feature array if any of the features is sparse and the ``sparse`` argument is ``True``. The argument defaults to ``False`` to avoid breaking existing code. Resolves #34.
182198

183199

184200
0.0.12 (2015-11-07)

sklearn_pandas/__init__.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import numpy as np
44
import pandas as pd
5+
from scipy import sparse
56
from sklearn.base import BaseEstimator, TransformerMixin
67
from sklearn import cross_validation
78
from sklearn import grid_search
@@ -55,11 +56,7 @@ def transform(self, X):
5556

5657

5758
def _handle_feature(fea):
58-
if hasattr(fea, 'toarray'):
59-
# sparse arrays should be converted to regular arrays
60-
# for hstack.
61-
fea = fea.toarray()
62-
59+
# convert 1-dimensional arrays to 2-dimensional column vectors
6360
if len(fea.shape) == 1:
6461
fea = np.array([fea]).T
6562

@@ -72,16 +69,19 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
7269
sklearn transformation.
7370
"""
7471

75-
def __init__(self, features):
72+
def __init__(self, features, sparse=False):
7673
"""
7774
Params:
7875
7976
features a list of pairs. The first element is the pandas column
8077
selector. This can be a string (for one column) or a list
8178
of strings. The second element is an object that supports
8279
sklearn's transform interface.
80+
sparse will return sparse matrix if set True and any of the
81+
extracted features is sparse. Defaults to False.
8382
"""
8483
self.features = features
84+
self.sparse = sparse
8585

8686
def _get_col_subset(self, X, cols):
8787
"""
@@ -156,4 +156,16 @@ def transform(self, X):
156156
# at this point we lose track of which features
157157
# were created from which input columns, so it's
158158
# assumed that that doesn't matter to the model.
159-
return np.hstack(extracted)
159+
160+
# If any of the extracted features is sparse, combine sparsely.
161+
# Otherwise, combine as normal arrays.
162+
if any(sparse.issparse(fea) for fea in extracted):
163+
stacked = sparse.hstack(extracted).tocsr()
164+
# return a sparse matrix only if the mapper was initialized
165+
# with sparse=True
166+
if not self.sparse:
167+
stacked = stacked.toarray()
168+
else:
169+
stacked = np.hstack(extracted)
170+
171+
return stacked

tests/test_dataframe_mapper.py

Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@
99

1010
from pandas import DataFrame
1111
import pandas as pd
12+
from scipy import sparse
1213
from sklearn.datasets import load_iris
1314
from sklearn.pipeline import Pipeline
1415
from sklearn.svm import SVC
1516
from sklearn.feature_extraction.text import CountVectorizer
1617
from sklearn.preprocessing import Imputer, StandardScaler
18+
from sklearn.base import BaseEstimator, TransformerMixin
1719
import numpy as np
1820

1921
from sklearn_pandas import (
@@ -23,6 +25,17 @@
2325
)
2426

2527

28+
class ToSparseTransformer(BaseEstimator, TransformerMixin):
29+
"""
30+
Transforms numpy matrix to sparse format.
31+
"""
32+
def fit(self, X):
33+
return self
34+
35+
def transform(self, X):
36+
return sparse.csr_matrix(X)
37+
38+
2639
@pytest.fixture
2740
def iris_dataframe():
2841
iris = load_iris()
@@ -42,6 +55,11 @@ def cars_dataframe():
4255
return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')
4356

4457

58+
@pytest.fixture
59+
def simple_dataframe():
60+
return pd.DataFrame({'a': [1, 2, 3]})
61+
62+
4563
def test_nonexistent_columns_explicit_fail(iris_dataframe):
4664
"""
4765
If a nonexistent column is selected, KeyError is raised.
@@ -92,32 +110,32 @@ def test_with_car_dataframe(cars_dataframe):
92110
assert scores.mean() > 0.30
93111

94112

95-
def test_cols_string_array():
113+
def test_cols_string_array(simple_dataframe):
96114
"""
97115
If an string specified as the columns, the transformer
98116
is called with a 1-d array as input.
99117
"""
100-
dataframe = pd.DataFrame({"a": [1, 2, 3]})
118+
df = simple_dataframe
101119
mock_transformer = Mock()
102120
mock_transformer.transform.return_value = np.array([1, 2, 3]) # do nothing
103121
mapper = DataFrameMapper([("a", mock_transformer)])
104122

105-
mapper.fit_transform(dataframe)
123+
mapper.fit_transform(df)
106124
args, kwargs = mock_transformer.fit.call_args
107125
assert args[0].shape == (3,)
108126

109127

110-
def test_cols_list_column_vector():
128+
def test_cols_list_column_vector(simple_dataframe):
111129
"""
112130
If a one-element list is specified as the columns, the transformer
113131
is called with a column vector as input.
114132
"""
115-
dataframe = pd.DataFrame({"a": [1, 2, 3]})
133+
df = simple_dataframe
116134
mock_transformer = Mock()
117135
mock_transformer.transform.return_value = np.array([1, 2, 3]) # do nothing
118136
mapper = DataFrameMapper([(["a"], mock_transformer)])
119137

120-
mapper.fit_transform(dataframe)
138+
mapper.fit_transform(df)
121139
args, kwargs = mock_transformer.fit.call_args
122140
assert args[0].shape == (3, 1)
123141

@@ -140,3 +158,31 @@ def test_list_transformers():
140158
# all features have mean 0 and std deviation 1 (standardized)
141159
assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all()
142160
assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()
161+
162+
163+
def test_sparse_features(simple_dataframe):
164+
"""
165+
If any of the extracted features is sparse and "sparse" argument
166+
is true, the hstacked result is also sparse.
167+
"""
168+
df = simple_dataframe
169+
mapper = DataFrameMapper([
170+
("a", ToSparseTransformer())
171+
], sparse=True)
172+
dmatrix = mapper.fit_transform(df)
173+
174+
assert type(dmatrix) == sparse.csr.csr_matrix
175+
176+
177+
def test_sparse_off(simple_dataframe):
178+
"""
179+
If the resulting features are sparse but the "sparse" argument
180+
of the mapper is False, return a non-sparse matrix.
181+
"""
182+
df = simple_dataframe
183+
mapper = DataFrameMapper([
184+
("a", ToSparseTransformer())
185+
], sparse=False)
186+
187+
dmatrix = mapper.fit_transform(df)
188+
assert type(dmatrix) != sparse.csr.csr_matrix

0 commit comments

Comments
 (0)