ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame

kernc · kernc · commit a1a88ff7fdc7 · 2017-03-06T17:04:09.000+01:00
diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst
@@ -186,9 +186,32 @@ the correct dense result.
 Interaction with scipy.sparse
 -----------------------------
 
-Experimental api to transform between sparse pandas and scipy.sparse structures.
+.. versionadded:: 0.20.0
 
-A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
+Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices.
+
+.. ipython:: python
+
+   from scipy.sparse import csr_matrix
+
+   arr = np.random.random(size=(1000, 5))
+   arr[arr < .9] = 0
+
+   sp_arr = csr_matrix(arr)
+   sp_arr
+
+   sdf = pd.SparseDataFrame(sp_arr)
+   sdf
+
+All sparse formats are supported, but matrices that aren't in *COOrdinate* format will be converted to it, copying the data as needed. To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use the :meth:`SparseDataFrame.to_coo` method:
+
+.. ipython:: python
+
+   sdf.to_coo()
+
+.. versionadded:: 0.16.0
+
+Additionally, a :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
 
 The method requires a ``MultiIndex`` with two or more levels.
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -189,6 +189,30 @@ You must enable this by setting the ``display.html.table_schema`` option to True
 .. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/
 .. _nteract: http://nteract.io/
 
+.. _whatsnew_0200.enhancements.scipy_sparse:
+
+SciPy sparse matrix from/to SparseDataFrame
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances (:issue:`4343`). See the :ref:`documentation <sparse.scipysparse>` for more information.
+
+All sparse formats are supported, but matrices that aren't in *COOrdinate* format will be converted to it, copying the data as needed.
+
+.. ipython:: python
+
+   from scipy.sparse import csr_matrix
+   arr = np.random.random(size=(1000, 5))
+   arr[arr < .9] = 0
+   sp_arr = csr_matrix(arr)
+   sp_arr
+   sdf = pd.SparseDataFrame(sp_arr)
+   sdf
+
+To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use:
+
+.. ipython:: python
+
+   sdf.to_coo()
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements
diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py
@@ -769,14 +769,17 @@ def make_sparse(arr, kind='block', fill_value=None):
     if isnull(fill_value):
         mask = notnull(arr)
     else:
-        mask = arr != fill_value
+        mask = np.not_equal(arr, fill_value)
+        # In NumPy 1.12.0, not implemented at least for arr.dtype=str
+        if mask is NotImplemented:
+            mask = np.not_equal(arr.astype(object), fill_value)
 
     length = len(arr)
     if length != mask.size:
         # the arr is a SparseArray
         indices = mask.sp_index.indices
     else:
-        indices = np.arange(length, dtype=np.int32)[mask]
+        indices = mask.nonzero()[0].astype(np.int32)
 
     index = _make_index(length, indices, kind)
     sparsified_values = arr[mask]
diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py
@@ -11,8 +11,8 @@
 import numpy as np
 
 from pandas.types.missing import isnull, notnull
-from pandas.types.cast import _maybe_upcast
-from pandas.types.common import _ensure_platform_int
+from pandas.types.cast import _maybe_upcast, _find_common_type
+from pandas.types.common import _ensure_platform_int, is_scipy_sparse
 
 from pandas.core.common import _try_sort
 from pandas.compat.numpy import function as nv
@@ -25,6 +25,7 @@
                                    create_block_manager_from_arrays)
 import pandas.core.generic as generic
 from pandas.sparse.series import SparseSeries, SparseArray
+from pandas._sparse import BlockIndex, get_blocks
 from pandas.util.decorators import Appender
 import pandas.core.ops as ops
 
@@ -39,15 +40,16 @@ class SparseDataFrame(DataFrame):
 
     Parameters
     ----------
-    data : same types as can be passed to DataFrame
+    data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
     index : array-like, optional
     column : array-like, optional
     default_kind : {'block', 'integer'}, default 'block'
         Default sparse kind for converting Series to SparseSeries. Will not
         override SparseSeries passed into constructor
     default_fill_value : float
-        Default fill_value for converting Series to SparseSeries. Will not
-        override SparseSeries passed in
+        Default fill_value for converting Series to SparseSeries
+        (default: nan). Will not override SparseSeries passed in. If `data`
+        is a scipy sparse matrix, the default is 0.
     """
     _constructor_sliced = SparseSeries
     _subtyp = 'sparse_frame'
@@ -77,29 +79,26 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
             data = {columns[0]: data}
 
         if default_fill_value is None:
-            default_fill_value = np.nan
+            default_fill_value = 0 if is_scipy_sparse(data) else np.nan
         if default_kind is None:
             default_kind = 'block'
 
         self._default_kind = default_kind
         self._default_fill_value = default_fill_value
 
-        if isinstance(data, dict):
-            mgr = self._init_dict(data, index, columns)
-            if dtype is not None:
-                mgr = mgr.astype(dtype)
+        if is_scipy_sparse(data):
+            mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
+                                      fill_value=default_fill_value)
+        elif isinstance(data, dict):
+            mgr = self._init_dict(data, index, columns, dtype=dtype)
         elif isinstance(data, (np.ndarray, list)):
-            mgr = self._init_matrix(data, index, columns)
-            if dtype is not None:
-                mgr = mgr.astype(dtype)
+            mgr = self._init_matrix(data, index, columns, dtype=dtype)
         elif isinstance(data, SparseDataFrame):
             mgr = self._init_mgr(data._data,
                                  dict(index=index, columns=columns),
                                  dtype=dtype, copy=copy)
         elif isinstance(data, DataFrame):
-            mgr = self._init_dict(data, data.index, data.columns)
-            if dtype is not None:
-                mgr = mgr.astype(dtype)
+            mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
         elif isinstance(data, BlockManager):
             mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                  dtype=dtype, copy=copy)
@@ -174,7 +173,43 @@ def _init_dict(self, data, index, columns, dtype=None):
         return to_manager(sdict, columns, index)
 
     def _init_matrix(self, data, index, columns, dtype=None):
+        """ Init self from ndarray or list of lists """
         data = _prep_ndarray(data, copy=False)
+        index, columns = self._prep_index(data, index, columns)
+        data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
+        return self._init_dict(data, index, columns, dtype)
+
+    def _init_spmatrix(self, data, index, columns, dtype=None,
+                       fill_value=None):
+        """ Init self from scipy.sparse matrix """
+        index, columns = self._prep_index(data, index, columns)
+        data = data.tocoo(copy=False)
+        N = len(index)
+
+        # Construct a dict of SparseSeries
+        sdict = {}
+        values = Series(data.data, index=data.row, copy=False)
+        for col, rowvals in values.groupby(data.col):
+            # get_blocks needs sorted int32 rows; keep values aligned with them
+            rowvals = rowvals.sort_index()
+            rows = rowvals.index.values.astype(np.int32)
+            blocs, blens = get_blocks(rows)
+
+            sdict[columns[col]] = SparseSeries(
+                rowvals.values, index=index,
+                fill_value=fill_value,
+                sparse_index=BlockIndex(N, blocs, blens))
+
+        # Add any columns that were empty and thus not grouped on above
+        sdict.update({column: SparseSeries(index=index,
+                                           fill_value=fill_value,
+                                           sparse_index=BlockIndex(N, [], []))
+                      for column in columns
+                      if column not in sdict})
+
+        return self._init_dict(sdict, index, columns, dtype)
+
+    def _prep_index(self, data, index, columns):
         N, K = data.shape
         if index is None:
             index = _default_index(N)
@@ -187,9 +222,48 @@ def _init_matrix(self, data, index, columns, dtype=None):
         if len(index) != N:
             raise ValueError('Index length mismatch: %d vs. %d' %
                              (len(index), N))
+        return index, columns
 
-        data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
-        return self._init_dict(data, index, columns, dtype)
+    def to_coo(self):
+        """
+        Return the contents of the frame as a sparse SciPy COO matrix.
+
+        .. versionadded:: 0.20.0
+
+        Returns
+        -------
+        coo_matrix : scipy.sparse.spmatrix
+            If the caller is heterogeneous and contains booleans or objects,
+            the result will be of dtype=object. See Notes.
+
+        Notes
+        -----
+        The dtype will be the lowest-common-denominator type (implicit
+        upcasting); that is to say if the dtypes (even of numeric types)
+        are mixed, the one that accommodates all will be chosen.
+
+        e.g. If the dtypes are float16 and float32, dtype will be upcast to
+        float32. By numpy.find_common_type convention, mixing int64 and
+        uint64 will result in a float64 dtype.
+        """
+        try:
+            from scipy.sparse import coo_matrix
+        except ImportError:
+            raise ImportError('Scipy is not installed')
+
+        dtype = _find_common_type(self.dtypes)
+        cols, rows, datas = [], [], []
+        for col, name in enumerate(self):
+            s = self[name]
+            row = s.sp_index.to_int_index().indices
+            cols.append(np.repeat(col, len(row)))
+            rows.append(row)
+            datas.append(s.sp_values.astype(dtype, copy=False))
+
+        cols = np.concatenate(cols)
+        rows = np.concatenate(rows)
+        datas = np.concatenate(datas)
+        return coo_matrix((datas, (rows, cols)), shape=self.shape)
 
     def __array_wrap__(self, result):
         return self._constructor(
diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py
@@ -2,6 +2,8 @@
 
 import operator
 
+import pytest
+
 from numpy import nan
 import numpy as np
 import pandas as pd
@@ -1118,6 +1120,58 @@ def test_isnotnull(self):
         tm.assert_frame_equal(res.to_dense(), exp)
 
 
+@pytest.fixture(params=['bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil'])
+def spmatrix(request):
+    tm._skip_if_no_scipy()
+    from scipy import sparse
+    yield getattr(sparse, request.param + '_matrix')
+
+
+@pytest.mark.parametrize('index', [None, list('abc')])
+@pytest.mark.parametrize('columns', [None, list('def')])
+@pytest.mark.parametrize('fill_value', [None, 0, np.nan])
+def test_from_to_scipy(spmatrix, index, columns, fill_value):
+    # GH 4343
+    tm._skip_if_no_scipy()
+
+    # Make one sparse matrix and one ndarray for the two frames we'll try
+    # to match
+    spm = spmatrix([[0, 1, 0],
+                    [0, 0, 1],
+                    [1, 1, 1]]).astype(np.int16)
+    arr = spm.toarray()
+    assert spm.dtype == arr.dtype
+
+    # If fill_value is a float value, upcast the whole array to float
+    # Note the dtype of the resulting sdf.to_coo() will be the upcast one
+    if isinstance(fill_value, float):
+        arr = arr.astype(float)
+        arr[arr == 0] = np.nan
+    # If fill_value is None, set it to 0 for ndarray as value 0 is also the
+    # implied non-configurable default for scipy sparse matrices
+    arr_fill_value = 0 if fill_value is None else fill_value
+
+    sdf = pd.SparseDataFrame(spm, index=index, columns=columns,
+                             default_fill_value=fill_value)
+    res = pd.SparseDataFrame(arr, index=index, columns=columns,
+                             default_fill_value=arr_fill_value)
+
+    tm.assert_sp_frame_equal(sdf, res)
+    tm.assert_frame_equal(sdf.to_dense(), res.to_dense())
+    tm.assert_numpy_array_equal(res.values, arr)
+
+    # Assert spmatrices equal (operator!= not supported in scipy 0.8.0)
+    tm.assert_equal(dict(sdf.to_coo().todok()), dict(spm.todok()))
+
+    # Ensure dtype is preserved if possible
+    tm.assert_equal(sdf.to_coo().dtype, arr.dtype)
+
+    # However, adding an object column results in an upcast
+    sdf['names'] = np.arange(len(sdf)).astype(str)
+
+    tm.assert_equal(sdf.to_coo().dtype, np.object_)
+
+
 class TestSparseDataFrameArithmetic(tm.TestCase):
 
     def test_numeric_op_scalar(self):
diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py
@@ -30,11 +30,14 @@
                                  is_float,
                                  is_bool,
                                  is_scalar,
+                                 is_scipy_sparse,
                                  _ensure_int32,
                                  _ensure_categorical)
 from pandas.types.missing import isnull
 from pandas.util import testing as tm
 
+from pandas.tests.sparse.test_frame import spmatrix  # noqa: F401
+
 
 def test_is_sequence():
     is_seq = inference.is_sequence
@@ -946,6 +949,12 @@ def test_nan_to_nat_conversions():
         assert (s[8].value == np.datetime64('NaT').astype(np.int64))
 
 
+def test_is_scipy_sparse(spmatrix):  # noqa: F811
+    tm._skip_if_no_scipy()
+    assert is_scipy_sparse(spmatrix([[0, 1]]))
+    assert not is_scipy_sparse(np.array([1]))
+
+
 def test_ensure_int32():
     values = np.arange(10, dtype=np.int32)
     result = _ensure_int32(values)
diff --git a/pandas/types/common.py b/pandas/types/common.py
@@ -23,6 +23,9 @@
 _TD_DTYPE = np.dtype('m8[ns]')
 _INT64_DTYPE = np.dtype(np.int64)
 
+# Holds scipy.sparse.issparse; imported lazily on first use to reduce import time
+_is_scipy_sparse = None
+
 _ensure_float64 = algos.ensure_float64
 _ensure_float32 = algos.ensure_float32
 
@@ -59,6 +62,17 @@ def is_sparse(array):
     return isinstance(array, (ABCSparseArray, ABCSparseSeries))
 
 
+def is_scipy_sparse(array):
+    """ return if we are a scipy.sparse.spmatrix """
+    global _is_scipy_sparse
+    if _is_scipy_sparse is None:
+        try:
+            from scipy.sparse import issparse as _is_scipy_sparse
+        except ImportError:
+            return False
+    return _is_scipy_sparse(array)
+
+
 def is_categorical(array):
     """ return if we are a categorical possibility """
     return isinstance(array, ABCCategorical) or is_categorical_dtype(array)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -297,6 +297,11 @@ def _skip_if_no_scipy():
     except ImportError:
         import pytest
         pytest.skip('scipy.interpolate missing')
+    try:
+        import scipy.sparse  # noqa
+    except ImportError:
+        import pytest
+        pytest.skip('scipy.sparse missing')
 
 
 def _skip_if_scipy_0_17():