Skip to content

Sparse features optional #37

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ For these examples, we'll also use pandas, numpy, and sklearn::
>>> import numpy as np
>>> import sklearn.preprocessing, sklearn.decomposition, \
... sklearn.linear_model, sklearn.pipeline, sklearn.metrics
>>> from sklearn.feature_extraction.text import CountVectorizer

Load some Data
**************
Expand Down Expand Up @@ -156,6 +157,20 @@ Only columns that are listed in the DataFrameMapper are kept. To keep a column b
[ 1., 0., 0., 5.],
[ 0., 0., 1., 4.]])


Working with sparse features
****************************

A ``DataFrameMapper`` returns a dense feature array by default. Setting ``sparse=True`` in the mapper makes it return a sparse array whenever any of the extracted features is sparse. Example:

>>> mapper4 = DataFrameMapper([
... ('pet', CountVectorizer()),
... ], sparse=True)
>>> type(mapper4.fit_transform(data))
<class 'scipy.sparse.csr.csr_matrix'>

The stacking of the sparse features is done without ever densifying them.

Cross-Validation
----------------

Expand All @@ -179,6 +194,7 @@ Changelog
********************

* Raise ``KeyError`` when selecting nonexistent columns in the dataframe. Fixes #30.
* Return sparse feature array if any of the features is sparse and ``sparse`` argument is ``True``. Defaults to ``False`` to avoid potential breaking of existing code. Resolves #34.


0.0.12 (2015-11-07)
Expand Down
26 changes: 19 additions & 7 deletions sklearn_pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import cross_validation
from sklearn import grid_search
Expand Down Expand Up @@ -55,11 +56,7 @@ def transform(self, X):


def _handle_feature(fea):
if hasattr(fea, 'toarray'):
# sparse arrays should be converted to regular arrays
# for hstack.
fea = fea.toarray()

# convert 1-dimensional arrays to 2-dimensional column vectors
if len(fea.shape) == 1:
fea = np.array([fea]).T

Expand All @@ -72,16 +69,19 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
sklearn transformation.
"""

def __init__(self, features):
def __init__(self, features, sparse=False):
"""
Params:

features a list of pairs. The first element is the pandas column
selector. This can be a string (for one column) or a list
of strings. The second element is an object that supports
sklearn's transform interface.
sparse will return sparse matrix if set True and any of the
extracted features is sparse. Defaults to False.
"""
self.features = features
self.sparse = sparse

def _get_col_subset(self, X, cols):
"""
Expand Down Expand Up @@ -156,4 +156,16 @@ def transform(self, X):
# at this point we lose track of which features
# were created from which input columns, so it's
# assumed that that doesn't matter to the model.
return np.hstack(extracted)

# If any of the extracted features is sparse, combine sparsely.
# Otherwise, combine as normal arrays.
if any(sparse.issparse(fea) for fea in extracted):
stacked = sparse.hstack(extracted).tocsr()
# return a sparse matrix only if the mapper was initialized
# with sparse=True
if not self.sparse:
stacked = stacked.toarray()
else:
stacked = np.hstack(extracted)

return stacked
58 changes: 52 additions & 6 deletions tests/test_dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@

from pandas import DataFrame
import pandas as pd
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

from sklearn_pandas import (
Expand All @@ -23,6 +25,17 @@
)


class ToSparseTransformer(BaseEstimator, TransformerMixin):
    """
    Test helper that converts its input into a scipy CSR sparse matrix.

    ``fit`` learns nothing; ``transform`` wraps the input with
    ``sparse.csr_matrix`` so tests can feed sparse features to a mapper.
    """

    def fit(self, X, y=None):
        """
        No-op fit that returns the transformer itself.

        ``y`` is accepted and ignored so the signature follows the
        scikit-learn ``fit(X, y=None)`` convention and keeps working when
        a caller passes a target alongside ``X``.
        """
        return self

    def transform(self, X):
        """Return ``X`` wrapped as a ``scipy.sparse.csr_matrix``."""
        return sparse.csr_matrix(X)


@pytest.fixture
def iris_dataframe():
iris = load_iris()
Expand All @@ -42,6 +55,11 @@ def cars_dataframe():
return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')


@pytest.fixture
def simple_dataframe():
    """Provide a single-column DataFrame whose column 'a' holds 1, 2, 3."""
    column_values = [1, 2, 3]
    return pd.DataFrame({'a': column_values})


def test_nonexistent_columns_explicit_fail(iris_dataframe):
"""
If a nonexistent column is selected, KeyError is raised.
Expand Down Expand Up @@ -92,32 +110,32 @@ def test_with_car_dataframe(cars_dataframe):
assert scores.mean() > 0.30


def test_cols_string_array():
def test_cols_string_array(simple_dataframe):
"""
If a string is specified as the columns, the transformer
is called with a 1-d array as input.
"""
dataframe = pd.DataFrame({"a": [1, 2, 3]})
df = simple_dataframe
mock_transformer = Mock()
mock_transformer.transform.return_value = np.array([1, 2, 3]) # do nothing
mapper = DataFrameMapper([("a", mock_transformer)])

mapper.fit_transform(dataframe)
mapper.fit_transform(df)
args, kwargs = mock_transformer.fit.call_args
assert args[0].shape == (3,)


def test_cols_list_column_vector():
def test_cols_list_column_vector(simple_dataframe):
"""
If a one-element list is specified as the columns, the transformer
is called with a column vector as input.
"""
dataframe = pd.DataFrame({"a": [1, 2, 3]})
df = simple_dataframe
mock_transformer = Mock()
mock_transformer.transform.return_value = np.array([1, 2, 3]) # do nothing
mapper = DataFrameMapper([(["a"], mock_transformer)])

mapper.fit_transform(dataframe)
mapper.fit_transform(df)
args, kwargs = mock_transformer.fit.call_args
assert args[0].shape == (3, 1)

Expand All @@ -140,3 +158,31 @@ def test_list_transformers():
# all features have mean 0 and std deviation 1 (standardized)
assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all()
assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()


def test_sparse_features(simple_dataframe):
    """
    If any of the extracted features is sparse and "sparse" argument
    is true, the hstacked result is also sparse.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([
        ("a", ToSparseTransformer())
    ], sparse=True)
    dmatrix = mapper.fit_transform(df)

    # Check against the public ``scipy.sparse.csr_matrix`` name with
    # isinstance; the ``scipy.sparse.csr`` submodule path is private.
    assert isinstance(dmatrix, sparse.csr_matrix)


def test_sparse_off(simple_dataframe):
    """
    If the resulting features are sparse but the "sparse" argument
    of the mapper is False, return a non-sparse matrix.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([
        ("a", ToSparseTransformer())
    ], sparse=False)

    dmatrix = mapper.fit_transform(df)

    # Rule out every sparse format (not only CSR) and confirm that the
    # mapper handed back a regular dense numpy array.
    assert not sparse.issparse(dmatrix)
    assert isinstance(dmatrix, np.ndarray)