Sparse output only if sparse=True. With docs.

dukebody · dukebody · commit 766f95571c7a · 2015-11-07T20:40:31.000+01:00
diff --git a/README.rst b/README.rst
@@ -44,6 +44,7 @@ For these examples, we'll also use pandas, numpy, and sklearn::
     >>> import numpy as np
     >>> import sklearn.preprocessing, sklearn.decomposition, \
     ...     sklearn.linear_model, sklearn.pipeline, sklearn.metrics
+    >>> from sklearn.feature_extraction.text import CountVectorizer
 
 Load some Data
 **************
@@ -156,6 +157,20 @@ Only columns that are listed in the DataFrameMapper are kept. To keep a column b
            [ 1.,  0.,  0.,  5.],
            [ 0.,  0.,  1.,  4.]])
 
+
+Working with sparse features
+****************************
+
+A ``DataFrameMapper`` returns a dense feature array by default. Setting ``sparse=True`` in the mapper makes it return a sparse matrix whenever any of the extracted features is sparse; if none of them is sparse, the output stays dense. Example::
+
+    >>> mapper4 = DataFrameMapper([
+    ...     ('pet', CountVectorizer()),
+    ... ], sparse=True)
+    >>> type(mapper4.fit_transform(data))
+    <class 'scipy.sparse.csr.csr_matrix'>
+
+The stacking of the sparse features is done without ever densifying them.
+
 Cross-Validation
 ----------------
 
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -69,16 +69,19 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
     sklearn transformation.
     """
 
-    def __init__(self, features):
+    def __init__(self, features, sparse=False):
         """
         Params:
 
         features    a list of pairs. The first element is the pandas column
                     selector. This can be a string (for one column) or a list
                     of strings. The second element is an object that supports
                     sklearn's transform interface.
+        sparse      if True, return a sparse matrix whenever any of the
+                    extracted features is sparse. Defaults to False.
         """
         self.features = features
+        self.sparse = sparse
 
     def _get_col_subset(self, X, cols):
         """
@@ -154,10 +157,15 @@ def transform(self, X):
         # were created from which input columns, so it's
         # assumed that that doesn't matter to the model.
 
-        # If any of the extracted features is sparse, combine to produce a
-        # sparse matrix. Otherwise, produce a dense one.
+        # If any of the extracted features is sparse, combine sparsely.
+        # Otherwise, combine as normal arrays.
         if any(sparse.issparse(fea) for fea in extracted):
             stacked = sparse.hstack(extracted).tocsr()
+            # return a sparse matrix only if the mapper was initialized
+            # with sparse=True
+            if not self.sparse:
+                stacked = stacked.toarray()
         else:
             stacked = np.hstack(extracted)
+
         return stacked
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -14,7 +14,8 @@
 from sklearn.pipeline import Pipeline
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.preprocessing import Imputer, StandardScaler, LabelBinarizer
+from sklearn.preprocessing import Imputer, StandardScaler
+from sklearn.base import BaseEstimator, TransformerMixin
 import numpy as np
 
 from sklearn_pandas import (
@@ -24,6 +25,17 @@
 )
 
 
+class ToSparseTransformer(BaseEstimator, TransformerMixin):
+    """
+    Transforms numpy matrix to sparse format.
+    """
+    def fit(self, X):
+        return self
+
+    def transform(self, X):
+        return sparse.csr_matrix(X)
+
+
 @pytest.fixture
 def iris_dataframe():
     iris = load_iris()
@@ -43,6 +55,11 @@ def cars_dataframe():
     return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')
 
 
+@pytest.fixture
+def simple_dataframe():
+    return pd.DataFrame({'a': [1, 2, 3]})
+
+
 def test_nonexistent_columns_explicit_fail(iris_dataframe):
     """
     If a nonexistent column is selected, KeyError is raised.
@@ -93,32 +110,32 @@ def test_with_car_dataframe(cars_dataframe):
     assert scores.mean() > 0.30
 
 
-def test_cols_string_array():
+def test_cols_string_array(simple_dataframe):
     """
     If an string specified as the columns, the transformer
     is called with a 1-d array as input.
     """
-    dataframe = pd.DataFrame({"a": [1, 2, 3]})
+    df = simple_dataframe
     mock_transformer = Mock()
     mock_transformer.transform.return_value = np.array([1, 2, 3])  # do nothing
     mapper = DataFrameMapper([("a", mock_transformer)])
 
-    mapper.fit_transform(dataframe)
+    mapper.fit_transform(df)
     args, kwargs = mock_transformer.fit.call_args
     assert args[0].shape == (3,)
 
 
-def test_cols_list_column_vector():
+def test_cols_list_column_vector(simple_dataframe):
     """
     If a one-element list is specified as the columns, the transformer
     is called with a column vector as input.
     """
-    dataframe = pd.DataFrame({"a": [1, 2, 3]})
+    df = simple_dataframe
     mock_transformer = Mock()
     mock_transformer.transform.return_value = np.array([1, 2, 3])  # do nothing
     mapper = DataFrameMapper([(["a"], mock_transformer)])
 
-    mapper.fit_transform(dataframe)
+    mapper.fit_transform(df)
     args, kwargs = mock_transformer.fit.call_args
     assert args[0].shape == (3, 1)
 
@@ -143,15 +160,29 @@ def test_list_transformers():
     assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()
 
 
-def test_sparse_features(cars_dataframe):
+def test_sparse_features(simple_dataframe):
     """
-    If any of the extracted features is sparse, the hstacked
-    is also sparse.
+    If any of the extracted features is sparse and the "sparse" argument
+    is True, the horizontally stacked result is also sparse.
     """
+    df = simple_dataframe
     mapper = DataFrameMapper([
-        ("description", CountVectorizer()),  # sparse feature
-        ("model", LabelBinarizer()),  # dense feature
-    ])
-    dmatrix = mapper.fit_transform(cars_dataframe)
+        ("a", ToSparseTransformer())
+    ], sparse=True)
+    dmatrix = mapper.fit_transform(df)
 
     assert type(dmatrix) == sparse.csr.csr_matrix
+
+
+def test_sparse_off(simple_dataframe):
+    """
+    If the resulting features are sparse but the "sparse" argument
+    of the mapper is False, return a non-sparse matrix.
+    """
+    df = simple_dataframe
+    mapper = DataFrameMapper([
+        ("a", ToSparseTransformer())
+    ], sparse=False)
+
+    dmatrix = mapper.fit_transform(df)
+    assert type(dmatrix) != sparse.csr.csr_matrix