Skip to content

If any of the extracted features is sparse, make the hstacked result sparse as well #36

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions sklearn_pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import cross_validation
from sklearn import grid_search
Expand Down Expand Up @@ -55,11 +56,7 @@ def transform(self, X):


def _handle_feature(fea):
if hasattr(fea, 'toarray'):
# sparse arrays should be converted to regular arrays
# for hstack.
fea = fea.toarray()

# convert 1-dimensional arrays to 2-dimensional column vectors
if len(fea.shape) == 1:
fea = np.array([fea]).T

Expand Down Expand Up @@ -156,4 +153,11 @@ def transform(self, X):
# at this point we lose track of which features
# were created from which input columns, so it's
# assumed that that doesn't matter to the model.
return np.hstack(extracted)

# If any of the extracted features is sparse, combine to produce a
# sparse matrix. Otherwise, produce a dense one.
if any(sparse.issparse(fea) for fea in extracted):
stacked = sparse.hstack(extracted).tocsr()
else:
stacked = np.hstack(extracted)
return stacked
17 changes: 16 additions & 1 deletion tests/test_dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@

from pandas import DataFrame
import pandas as pd
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.preprocessing import Imputer, StandardScaler, LabelBinarizer
import numpy as np

from sklearn_pandas import (
Expand Down Expand Up @@ -125,3 +126,17 @@ def test_list_transformers():
# all features have mean 0 and std deviation 1 (standardized)
assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all()
assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()


def test_sparse_features(cars_dataframe):
    """
    If any of the extracted features is sparse, the hstacked result
    is also sparse.

    One column is mapped through a transformer marked below as producing
    a sparse feature and another through one marked as producing a dense
    feature; the combined output is then checked to be a CSR matrix.
    """
    mapper = DataFrameMapper([
        ("description", CountVectorizer()),  # sparse feature
        ("model", LabelBinarizer()),  # dense feature
    ])
    dmatrix = mapper.fit_transform(cars_dataframe)

    # Check against the public ``scipy.sparse.csr_matrix`` name with
    # isinstance: the ``scipy.sparse.csr`` submodule is a private
    # namespace in recent SciPy releases, and an exact ``type(...) ==``
    # comparison would reject legitimate subclasses.
    assert isinstance(dmatrix, sparse.csr_matrix)