Skip to content

Commit a1a88ff

Browse files
committed
ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame
1 parent 0bf4532 commit a1a88ff

File tree

8 files changed

+228
-22
lines changed

8 files changed

+228
-22
lines changed

Diff for: doc/source/sparse.rst

+25-2
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,32 @@ the correct dense result.
186186
Interaction with scipy.sparse
187187
-----------------------------
188188

189-
Experimental api to transform between sparse pandas and scipy.sparse structures.
189+
.. versionadded:: 0.20.0
190190

191-
A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
191+
Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices.
192+
193+
.. ipython:: python
194+
195+
from scipy.sparse import csr_matrix
196+
197+
arr = np.random.random(size=(1000, 5))
198+
arr[arr < .9] = 0
199+
200+
sp_arr = csr_matrix(arr)
201+
sp_arr
202+
203+
sdf = pd.SparseDataFrame(sp_arr)
204+
sdf
205+
206+
All sparse formats are supported, but matrices that aren't in *COOrdinate* format will be converted to it, copying the data as needed. To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use the :meth:`SparseDataFrame.to_coo` method:
207+
208+
.. ipython:: python
209+
210+
sdf.to_coo()
211+
212+
.. versionadded:: 0.16.0
213+
214+
Additionally, a :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
192215

193216
The method requires a ``MultiIndex`` with two or more levels.
194217

Diff for: doc/source/whatsnew/v0.20.0.txt

+24
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,30 @@ You must enable this by setting the ``display.html.table_schema`` option to True
189189
.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/
190190
.. _nteract: http://nteract.io/
191191

192+
.. _whatsnew_0200.enhancements.scipy_sparse:
193+
194+
SciPy sparse matrix from/to SparseDataFrame
195+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
196+
Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances (:issue:`4343`). See the :ref:`documentation <sparse.scipysparse>` for more information.
197+
198+
All sparse formats are supported, but matrices that aren't in *COOrdinate* format will be converted to it, copying the data as needed.
199+
200+
.. ipython:: python
201+
202+
from scipy.sparse import csr_matrix
203+
arr = np.random.random(size=(1000, 5))
204+
arr[arr < .9] = 0
205+
sp_arr = csr_matrix(arr)
206+
sp_arr
207+
sdf = pd.SparseDataFrame(sp_arr)
208+
sdf
209+
210+
To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use :meth:`SparseDataFrame.to_coo`:
211+
212+
.. ipython:: python
213+
214+
sdf.to_coo()
215+
192216
.. _whatsnew_0200.enhancements.other:
193217

194218
Other enhancements

Diff for: pandas/sparse/array.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -769,14 +769,17 @@ def make_sparse(arr, kind='block', fill_value=None):
769769
if isnull(fill_value):
770770
mask = notnull(arr)
771771
else:
772-
mask = arr != fill_value
772+
mask = np.not_equal(arr, fill_value)
773+
# In NumPy 1.12.0, not implemented at least for arr.dtype=str
774+
if mask is NotImplemented:
775+
mask = np.not_equal(arr.astype(object), fill_value)
773776

774777
length = len(arr)
775778
if length != mask.size:
776779
# the arr is a SparseArray
777780
indices = mask.sp_index.indices
778781
else:
779-
indices = np.arange(length, dtype=np.int32)[mask]
782+
indices = mask.nonzero()[0].astype(np.int32)
780783

781784
index = _make_index(length, indices, kind)
782785
sparsified_values = arr[mask]

Diff for: pandas/sparse/frame.py

+92-18
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
import numpy as np
1212

1313
from pandas.types.missing import isnull, notnull
14-
from pandas.types.cast import _maybe_upcast
15-
from pandas.types.common import _ensure_platform_int
14+
from pandas.types.cast import _maybe_upcast, _find_common_type
15+
from pandas.types.common import _ensure_platform_int, is_scipy_sparse
1616

1717
from pandas.core.common import _try_sort
1818
from pandas.compat.numpy import function as nv
@@ -25,6 +25,7 @@
2525
create_block_manager_from_arrays)
2626
import pandas.core.generic as generic
2727
from pandas.sparse.series import SparseSeries, SparseArray
28+
from pandas._sparse import BlockIndex, get_blocks
2829
from pandas.util.decorators import Appender
2930
import pandas.core.ops as ops
3031

@@ -39,15 +40,16 @@ class SparseDataFrame(DataFrame):
3940
4041
Parameters
4142
----------
42-
data : same types as can be passed to DataFrame
43+
data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
4344
index : array-like, optional
4445
columns : array-like, optional
4546
default_kind : {'block', 'integer'}, default 'block'
4647
Default sparse kind for converting Series to SparseSeries. Will not
4748
override SparseSeries passed into constructor
4849
default_fill_value : float
49-
Default fill_value for converting Series to SparseSeries. Will not
50-
override SparseSeries passed in
50+
Default fill_value for converting Series to SparseSeries
51+
(default: nan). Will not override SparseSeries passed in. If `data`
52+
is a scipy sparse matrix, the default is 0.
5153
"""
5254
_constructor_sliced = SparseSeries
5355
_subtyp = 'sparse_frame'
@@ -77,29 +79,26 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
7779
data = {columns[0]: data}
7880

7981
if default_fill_value is None:
80-
default_fill_value = np.nan
82+
default_fill_value = 0 if is_scipy_sparse(data) else np.nan
8183
if default_kind is None:
8284
default_kind = 'block'
8385

8486
self._default_kind = default_kind
8587
self._default_fill_value = default_fill_value
8688

87-
if isinstance(data, dict):
88-
mgr = self._init_dict(data, index, columns)
89-
if dtype is not None:
90-
mgr = mgr.astype(dtype)
89+
if is_scipy_sparse(data):
90+
mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
91+
fill_value=default_fill_value)
92+
elif isinstance(data, dict):
93+
mgr = self._init_dict(data, index, columns, dtype=dtype)
9194
elif isinstance(data, (np.ndarray, list)):
92-
mgr = self._init_matrix(data, index, columns)
93-
if dtype is not None:
94-
mgr = mgr.astype(dtype)
95+
mgr = self._init_matrix(data, index, columns, dtype=dtype)
9596
elif isinstance(data, SparseDataFrame):
9697
mgr = self._init_mgr(data._data,
9798
dict(index=index, columns=columns),
9899
dtype=dtype, copy=copy)
99100
elif isinstance(data, DataFrame):
100-
mgr = self._init_dict(data, data.index, data.columns)
101-
if dtype is not None:
102-
mgr = mgr.astype(dtype)
101+
mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
103102
elif isinstance(data, BlockManager):
104103
mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
105104
dtype=dtype, copy=copy)
@@ -174,7 +173,43 @@ def _init_dict(self, data, index, columns, dtype=None):
174173
return to_manager(sdict, columns, index)
175174

176175
def _init_matrix(self, data, index, columns, dtype=None):
    """
    Init self from an ndarray or a list of lists.

    The input is normalised with ``_prep_ndarray``, the axis labels are
    resolved by ``_prep_index``, and every column slice is then handed to
    ``_init_dict`` keyed by its column label.
    """
    values = _prep_ndarray(data, copy=False)
    index, columns = self._prep_index(values, index, columns)
    # One entry per column label, each holding that column's slice
    by_label = {label: values[:, pos] for pos, label in enumerate(columns)}
    return self._init_dict(by_label, index, columns, dtype)
181+
182+
def _init_spmatrix(self, data, index, columns, dtype=None,
                   fill_value=None):
    """
    Init self from scipy.sparse matrix

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        Converted to COO format (``tocoo(copy=False)``) before its
        entries are read.
    index, columns : array-like or None
        Axis labels, resolved through ``_prep_index``.
    dtype : dtype, optional
        Forwarded to ``_init_dict``.
    fill_value : scalar, optional
        Fill value given to every SparseSeries built here.

    Returns
    -------
    The result of ``_init_dict`` called on the per-column SparseSeries.
    """
    index, columns = self._prep_index(data, index, columns)
    data = data.tocoo(copy=False)
    N = len(index)

    # Construct a dict of SparseSeries
    sdict = {}
    values = Series(data.data, index=data.row, copy=False)
    for col, rowvals in values.groupby(data.col):
        # COO entries may be stored in any order. Sort the values
        # *together with* their row labels: sorting only the row indices
        # would leave the values in storage order and pair them with the
        # wrong rows.
        rowvals = rowvals.sort_index()
        # get_blocks expects int32 row indices in sorted order
        rows = rowvals.index.values.astype(np.int32)
        blocs, blens = get_blocks(rows)

        sdict[columns[col]] = SparseSeries(
            rowvals.values, index=index,
            fill_value=fill_value,
            sparse_index=BlockIndex(N, blocs, blens))

    # Add any columns that were empty and thus not grouped on above
    sdict.update({column: SparseSeries(index=index,
                                       fill_value=fill_value,
                                       sparse_index=BlockIndex(N, [], []))
                  for column in columns
                  if column not in sdict})

    return self._init_dict(sdict, index, columns, dtype)
211+
212+
def _prep_index(self, data, index, columns):
178213
N, K = data.shape
179214
if index is None:
180215
index = _default_index(N)
@@ -187,9 +222,48 @@ def _init_matrix(self, data, index, columns, dtype=None):
187222
if len(index) != N:
188223
raise ValueError('Index length mismatch: %d vs. %d' %
189224
(len(index), N))
225+
return index, columns
190226

191-
data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
192-
return self._init_dict(data, index, columns, dtype)
227+
def to_coo(self):
    """
    Return the contents of the frame as a sparse SciPy COO matrix.

    .. versionadded:: 0.20.0

    Returns
    -------
    coo_matrix : scipy.sparse.spmatrix
        If the caller is heterogeneous and contains booleans or objects,
        the result will be of dtype=object. See Notes.

    Raises
    ------
    ImportError
        If ``scipy.sparse`` cannot be imported.

    Notes
    -----
    The dtype will be the lowest-common-denominator type (implicit
    upcasting); that is to say if the dtypes (even of numeric types)
    are mixed, the one that accommodates all will be chosen.

    e.g. If the dtypes are float16 and float32, dtype will be upcast to
    float32. By numpy.find_common_type convention, mixing int64 and
    uint64 will result in a float64 dtype.
    """
    try:
        from scipy.sparse import coo_matrix
    except ImportError:
        raise ImportError('Scipy is not installed')

    common_dtype = _find_common_type(self.dtypes)

    # Collect, column by column, the stored row positions, a matching run
    # of column positions, and the stored values cast to the common dtype.
    col_parts, row_parts, value_parts = [], [], []
    for position, label in enumerate(self):
        series = self[label]
        stored_rows = series.sp_index.to_int_index().indices
        col_parts.append(np.repeat(position, len(stored_rows)))
        row_parts.append(stored_rows)
        value_parts.append(
            series.sp_values.astype(common_dtype, copy=False))

    all_cols = np.concatenate(col_parts)
    all_rows = np.concatenate(row_parts)
    all_values = np.concatenate(value_parts)
    return coo_matrix((all_values, (all_rows, all_cols)), shape=self.shape)
193267

194268
def __array_wrap__(self, result):
195269
return self._constructor(

Diff for: pandas/tests/sparse/test_frame.py

+54
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import operator
44

5+
import pytest
6+
57
from numpy import nan
68
import numpy as np
79
import pandas as pd
@@ -1118,6 +1120,58 @@ def test_isnotnull(self):
11181120
tm.assert_frame_equal(res.to_dense(), exp)
11191121

11201122

1123+
@pytest.fixture(params=['bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil'])
def spmatrix(request):
    """
    Yield the scipy.sparse matrix class named by the current param
    (``<param>_matrix``); the test is skipped when scipy is missing.
    """
    tm._skip_if_no_scipy()
    import scipy.sparse
    class_name = '%s_matrix' % request.param
    yield getattr(scipy.sparse, class_name)
1128+
1129+
1130+
@pytest.mark.parametrize('index', [None, list('abc')])
@pytest.mark.parametrize('columns', [None, list('def')])
@pytest.mark.parametrize('fill_value', [None, 0, np.nan])
def test_from_to_scipy(spmatrix, index, columns, fill_value):
    """
    Round-trip check (GH 4343): a SparseDataFrame built from a scipy
    sparse matrix must match one built from the equivalent ndarray, and
    ``to_coo`` must give back the original entries and dtype.
    """
    tm._skip_if_no_scipy()

    # The same 3x3 int16 data, once sparse and once dense
    sparse_mat = spmatrix([[0, 1, 0],
                           [0, 0, 1],
                           [1, 1, 1]]).astype(np.int16)
    dense = sparse_mat.toarray()
    assert sparse_mat.dtype == dense.dtype

    # A float fill_value forces the dense side to float, with the zeros
    # replaced by NaN. The dtype of sdf.to_coo() is then the upcast one.
    if isinstance(fill_value, float):
        dense = dense.astype(float)
        dense[dense == 0] = np.nan

    # scipy sparse matrices imply a fill value of 0, so use 0 for the
    # ndarray-backed frame when no fill_value was requested
    if fill_value is None:
        dense_fill_value = 0
    else:
        dense_fill_value = fill_value

    axes = dict(index=index, columns=columns)
    sdf = pd.SparseDataFrame(sparse_mat, default_fill_value=fill_value,
                             **axes)
    expected = pd.SparseDataFrame(dense,
                                  default_fill_value=dense_fill_value,
                                  **axes)

    tm.assert_sp_frame_equal(sdf, expected)
    tm.assert_frame_equal(sdf.to_dense(), expected.to_dense())
    tm.assert_numpy_array_equal(expected.values, dense)

    # Assert spmatrices equal (operator!= not supported in scipy 0.8.0)
    round_tripped = dict(sdf.to_coo().todok())
    tm.assert_equal(round_tripped, dict(sparse_mat.todok()))

    # Ensure dtype is preserved if possible
    tm.assert_equal(sdf.to_coo().dtype, dense.dtype)

    # However, adding an object column results in an upcast
    sdf['names'] = np.arange(len(sdf)).astype(str)
    tm.assert_equal(sdf.to_coo().dtype, np.object_)
1173+
1174+
11211175
class TestSparseDataFrameArithmetic(tm.TestCase):
11221176

11231177
def test_numeric_op_scalar(self):

Diff for: pandas/tests/types/test_inference.py

+9
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,14 @@
3030
is_float,
3131
is_bool,
3232
is_scalar,
33+
is_scipy_sparse,
3334
_ensure_int32,
3435
_ensure_categorical)
3536
from pandas.types.missing import isnull
3637
from pandas.util import testing as tm
3738

39+
from pandas.tests.sparse.test_frame import spmatrix # noqa: F401
40+
3841

3942
def test_is_sequence():
4043
is_seq = inference.is_sequence
@@ -946,6 +949,12 @@ def test_nan_to_nat_conversions():
946949
assert (s[8].value == np.datetime64('NaT').astype(np.int64))
947950

948951

952+
def test_is_scipy_sparse(spmatrix):  # noqa: F811
    """Every scipy sparse format is detected; a plain ndarray is not."""
    tm._skip_if_no_scipy()
    sparse_input = spmatrix([[0, 1]])
    assert is_scipy_sparse(sparse_input)
    dense_input = np.array([1])
    assert not is_scipy_sparse(dense_input)
956+
957+
949958
def test_ensure_int32():
950959
values = np.arange(10, dtype=np.int32)
951960
result = _ensure_int32(values)

Diff for: pandas/types/common.py

+14
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323
_TD_DTYPE = np.dtype('m8[ns]')
2424
_INT64_DTYPE = np.dtype(np.int64)
2525

26+
# oh the troubles to reduce import time
27+
_is_scipy_sparse = None
28+
2629
_ensure_float64 = algos.ensure_float64
2730
_ensure_float32 = algos.ensure_float32
2831

@@ -59,6 +62,17 @@ def is_sparse(array):
5962
return isinstance(array, (ABCSparseArray, ABCSparseSeries))
6063

6164

65+
def is_scipy_sparse(array):
    """
    Return whether ``array`` is a scipy.sparse matrix.

    ``scipy.sparse.issparse`` is imported on first use and stored in the
    module-level ``_is_scipy_sparse``, so importing this module does not
    import scipy. Returns False when scipy cannot be imported.
    """
    global _is_scipy_sparse
    checker = _is_scipy_sparse
    if checker is None:
        try:
            from scipy.sparse import issparse as checker
        except ImportError:
            return False
        # Remember the resolved function for subsequent calls
        _is_scipy_sparse = checker
    return checker(array)
74+
75+
6276
def is_categorical(array):
6377
""" return if we are a categorical possibility """
6478
return isinstance(array, ABCCategorical) or is_categorical_dtype(array)

Diff for: pandas/util/testing.py

+5
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,11 @@ def _skip_if_no_scipy():
297297
except ImportError:
298298
import pytest
299299
pytest.skip('scipy.interpolate missing')
300+
try:
301+
import scipy.sparse # noqa
302+
except ImportError:
303+
import pytest
304+
pytest.skip('scipy.sparse missing')
300305

301306

302307
def _skip_if_scipy_0_17():

0 commit comments

Comments
 (0)