Skip to content

Commit 990e21e

Browse files
committed
ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame
1 parent ae0a92a commit 990e21e

File tree

10 files changed

+255
-20
lines changed

10 files changed

+255
-20
lines changed

Diff for: doc/source/api.rst

+7
Original file line numberDiff line numberDiff line change
@@ -1031,6 +1031,13 @@ Serialization / IO / Conversion
10311031
DataFrame.to_string
10321032
DataFrame.to_clipboard
10331033

1034+
Sparse methods
1035+
~~~~~~~~~~~~~~
1036+
.. autosummary::
1037+
:toctree: generated/
1038+
1039+
SparseDataFrame.to_coo
1040+
10341041
.. _api.panel:
10351042

10361043
Panel

Diff for: doc/source/sparse.rst

+25-2
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,32 @@ the correct dense result.
186186
Interaction with scipy.sparse
187187
-----------------------------
188188

189-
Experimental api to transform between sparse pandas and scipy.sparse structures.
189+
.. versionadded:: 0.20.0
190190

191-
A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
191+
Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices.
192+
193+
.. ipython:: python
194+
195+
from scipy.sparse import csr_matrix
196+
197+
arr = np.random.random(size=(1000, 5))
198+
arr[arr < .9] = 0
199+
200+
sp_arr = csr_matrix(arr)
201+
sp_arr
202+
203+
sdf = pd.SparseDataFrame(sp_arr)
204+
sdf
205+
206+
All sparse formats are supported, but matrices that aren't in :mod:`COOrdinate <scipy.sparse>` format will be converted to it, copying the data as needed. To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use the :meth:`SparseDataFrame.to_coo` method:
207+
208+
.. ipython:: python
209+
210+
sdf.to_coo()
211+
212+
.. versionadded:: 0.16.0
213+
214+
Additionally, a :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
192215

193216
The method requires a ``MultiIndex`` with two or more levels.
194217

Diff for: doc/source/whatsnew/v0.20.0.txt

+24
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,30 @@ You must enable this by setting the ``display.html.table_schema`` option to True
184184
.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/
185185
.. _nteract: http://nteract.io/
186186

187+
.. _whatsnew_0200.enhancements.scipy_sparse:
188+
189+
SciPy sparse matrix from/to SparseDataFrame
190+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
191+
Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances. See the :ref:`documentation <sparse.scipysparse>` for more information. (:issue:`4343`)
192+
193+
All sparse formats are supported, but matrices that aren't in :mod:`COOrdinate <scipy.sparse>` format will be converted to it, copying the data as needed.
194+
195+
.. ipython:: python
196+
197+
from scipy.sparse import csr_matrix
198+
arr = np.random.random(size=(1000, 5))
199+
arr[arr < .9] = 0
200+
sp_arr = csr_matrix(arr)
201+
sp_arr
202+
sdf = pd.SparseDataFrame(sp_arr)
203+
sdf
204+
205+
To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use:
206+
207+
.. ipython:: python
208+
209+
sdf.to_coo()
210+
187211
.. _whatsnew_0200.enhancements.other:
188212

189213
Other enhancements

Diff for: pandas/sparse/array.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
is_integer_dtype,
2121
is_bool_dtype,
2222
is_list_like,
23+
is_string_dtype,
2324
is_scalar, is_dtype_equal)
2425
from pandas.types.cast import (_possibly_convert_platform, _maybe_promote,
2526
_astype_nansafe, _find_common_type)
@@ -769,14 +770,20 @@ def make_sparse(arr, kind='block', fill_value=None):
769770
if isnull(fill_value):
770771
mask = notnull(arr)
771772
else:
773+
# For str arrays in NumPy 1.12.0, operator!= below isn't
774+
# element-wise but just returns False if fill_value is not str,
775+
# so cast to object comparison to be safe
776+
if is_string_dtype(arr):
777+
arr = arr.astype(object)
778+
772779
mask = arr != fill_value
773780

774781
length = len(arr)
775782
if length != mask.size:
776783
# the arr is a SparseArray
777784
indices = mask.sp_index.indices
778785
else:
779-
indices = np.arange(length, dtype=np.int32)[mask]
786+
indices = mask.nonzero()[0].astype(np.int32)
780787

781788
index = _make_index(length, indices, kind)
782789
sparsified_values = arr[mask]

Diff for: pandas/sparse/frame.py

+90-17
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
import numpy as np
1212

1313
from pandas.types.missing import isnull, notnull
14-
from pandas.types.cast import _maybe_upcast
15-
from pandas.types.common import _ensure_platform_int
14+
from pandas.types.cast import _maybe_upcast, _find_common_type
15+
from pandas.types.common import _ensure_platform_int, is_scipy_sparse
1616

1717
from pandas.core.common import _try_sort
1818
from pandas.compat.numpy import function as nv
@@ -25,6 +25,7 @@
2525
create_block_manager_from_arrays)
2626
import pandas.core.generic as generic
2727
from pandas.sparse.series import SparseSeries, SparseArray
28+
from pandas.sparse.libsparse import BlockIndex, get_blocks
2829
from pandas.util.decorators import Appender
2930
import pandas.core.ops as ops
3031

@@ -39,15 +40,15 @@ class SparseDataFrame(DataFrame):
3940
4041
Parameters
4142
----------
42-
data : same types as can be passed to DataFrame
43+
data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
4344
index : array-like, optional
4445
columns : array-like, optional
4546
default_kind : {'block', 'integer'}, default 'block'
4647
Default sparse kind for converting Series to SparseSeries. Will not
4748
override SparseSeries passed into constructor
4849
default_fill_value : float
49-
Default fill_value for converting Series to SparseSeries. Will not
50-
override SparseSeries passed in
50+
Default fill_value for converting Series to SparseSeries
51+
(default: nan). Will not override SparseSeries passed in.
5152
"""
5253
_constructor_sliced = SparseSeries
5354
_subtyp = 'sparse_frame'
@@ -84,22 +85,19 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
8485
self._default_kind = default_kind
8586
self._default_fill_value = default_fill_value
8687

87-
if isinstance(data, dict):
88-
mgr = self._init_dict(data, index, columns)
89-
if dtype is not None:
90-
mgr = mgr.astype(dtype)
88+
if is_scipy_sparse(data):
89+
mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
90+
fill_value=default_fill_value)
91+
elif isinstance(data, dict):
92+
mgr = self._init_dict(data, index, columns, dtype=dtype)
9193
elif isinstance(data, (np.ndarray, list)):
92-
mgr = self._init_matrix(data, index, columns)
93-
if dtype is not None:
94-
mgr = mgr.astype(dtype)
94+
mgr = self._init_matrix(data, index, columns, dtype=dtype)
9595
elif isinstance(data, SparseDataFrame):
9696
mgr = self._init_mgr(data._data,
9797
dict(index=index, columns=columns),
9898
dtype=dtype, copy=copy)
9999
elif isinstance(data, DataFrame):
100-
mgr = self._init_dict(data, data.index, data.columns)
101-
if dtype is not None:
102-
mgr = mgr.astype(dtype)
100+
mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
103101
elif isinstance(data, BlockManager):
104102
mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
105103
dtype=dtype, copy=copy)
@@ -174,7 +172,43 @@ def _init_dict(self, data, index, columns, dtype=None):
174172
return to_manager(sdict, columns, index)
175173

176174
def _init_matrix(self, data, index, columns, dtype=None):
    """
    Init self from a 2-D ndarray or a list of lists.

    The input is normalised with ``_prep_ndarray``, the axes are resolved
    with ``_prep_index``, and every column of the array is then stored
    under its column label before delegating to ``_init_dict``.
    """
    values = _prep_ndarray(data, copy=False)
    index, columns = self._prep_index(values, index, columns)

    # One entry per column label; a repeated label keeps the last column,
    # exactly as building a dict from (label, column) pairs would.
    by_label = {}
    for position, label in enumerate(columns):
        by_label[label] = values[:, position]

    return self._init_dict(by_label, index, columns, dtype)
180+
181+
def _init_spmatrix(self, data, index, columns, dtype=None,
                   fill_value=None):
    """
    Init self from a scipy.sparse matrix.

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        Source matrix; it is converted to COO format with ``tocoo()``.
    index : array-like or None
        Row labels, resolved through ``_prep_index``.
    columns : array-like or None
        Column labels, resolved through ``_prep_index``.
    dtype : optional
        Forwarded unchanged to ``_init_dict``.
    fill_value : optional
        ``fill_value`` given to every ``SparseSeries`` that is built.

    Returns
    -------
    The result of ``_init_dict`` applied to a dict holding one
    ``SparseSeries`` per column.
    """
    index, columns = self._prep_index(data, index, columns)
    data = data.tocoo()
    N = len(index)

    # Construct a dict of SparseSeries
    sdict = {}
    values = Series(data.data, index=data.row, copy=False)
    for col, rowvals in values.groupby(data.col):
        # get_blocks expects int32 row indices in sorted order, and COO
        # entries are not guaranteed to be ordered by row. Sort the Series
        # itself so each stored value stays paired with its row; sorting
        # only a copy of the row indices would leave the values in the
        # original order and assign them to the wrong rows.
        rowvals = rowvals.sort_index()
        rows = rowvals.index.values.astype(np.int32)
        blocs, blens = get_blocks(rows)

        sdict[columns[col]] = SparseSeries(
            rowvals.values, index=index,
            fill_value=fill_value,
            sparse_index=BlockIndex(N, blocs, blens))

    # Add any columns that were empty and thus not grouped on above
    sdict.update({column: SparseSeries(index=index,
                                       fill_value=fill_value,
                                       sparse_index=BlockIndex(N, [], []))
                  for column in columns
                  if column not in sdict})

    return self._init_dict(sdict, index, columns, dtype)
210+
211+
def _prep_index(self, data, index, columns):
178212
N, K = data.shape
179213
if index is None:
180214
index = _default_index(N)
@@ -187,9 +221,48 @@ def _init_matrix(self, data, index, columns, dtype=None):
187221
if len(index) != N:
188222
raise ValueError('Index length mismatch: %d vs. %d' %
189223
(len(index), N))
224+
return index, columns
190225

191-
data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
192-
return self._init_dict(data, index, columns, dtype)
226+
def to_coo(self):
    """
    Return the contents of the frame as a sparse SciPy COO matrix.

    .. versionadded:: 0.20.0

    Returns
    -------
    coo_matrix : scipy.sparse.spmatrix
        If the caller is heterogeneous and contains booleans or objects,
        the result will be of dtype=object. See Notes.

    Raises
    ------
    ImportError
        If SciPy cannot be imported.

    Notes
    -----
    The dtype will be the lowest-common-denominator type (implicit
    upcasting); that is to say if the dtypes (even of numeric types)
    are mixed, the one that accommodates all will be chosen.

    e.g. If the dtypes are float16 and float32, dtype will be upcast to
    float32. By numpy.find_common_type convention, mixing int64 and
    uint64 will result in a float64 dtype.

    A frame without columns is returned as an empty matrix of the
    frame's shape.
    """
    try:
        from scipy.sparse import coo_matrix
    except ImportError:
        raise ImportError('Scipy is not installed')

    if len(self.columns) == 0:
        # Nothing to stack: np.concatenate rejects an empty sequence of
        # arrays, so build the empty matrix directly from the shape.
        return coo_matrix(self.shape)

    dtype = _find_common_type(self.dtypes)
    cols, rows, datas = [], [], []
    for col, name in enumerate(self):
        s = self[name]
        # Positions of the explicitly stored values within this column
        row = s.sp_index.to_int_index().indices
        cols.append(np.repeat(col, len(row)))
        rows.append(row)
        datas.append(s.sp_values.astype(dtype, copy=False))

    cols = np.concatenate(cols)
    rows = np.concatenate(rows)
    datas = np.concatenate(datas)
    return coo_matrix((datas, (rows, cols)), shape=self.shape)
193266

194267
def __array_wrap__(self, result):
195268
return self._constructor(

Diff for: pandas/tests/sparse/common.py

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import pytest
2+
3+
import pandas.util.testing as tm
4+
5+
6+
@pytest.fixture(params=['bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil'])
def spmatrix(request):
    """
    Yield each scipy.sparse matrix class in turn, selected by format name.

    The fixture parameter is a format prefix such as ``'csr'``; the class
    looked up is ``scipy.sparse.<prefix>_matrix``.
    """
    tm._skip_if_no_scipy()
    # Imported lazily, after the scipy check above has run
    from scipy import sparse
    class_name = request.param + '_matrix'
    return getattr(sparse, class_name)

Diff for: pandas/tests/sparse/test_frame.py

+63
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,17 @@
22

33
import operator
44

5+
import pytest
6+
57
from numpy import nan
68
import numpy as np
79
import pandas as pd
810

911
from pandas import Series, DataFrame, bdate_range, Panel
12+
from pandas.types.common import (is_bool_dtype,
13+
is_float_dtype,
14+
is_object_dtype,
15+
is_float)
1016
from pandas.tseries.index import DatetimeIndex
1117
from pandas.tseries.offsets import BDay
1218
import pandas.util.testing as tm
@@ -18,6 +24,8 @@
1824
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray
1925
from pandas.tests.frame.test_misc_api import SharedWithSparse
2026

27+
from pandas.tests.sparse.common import spmatrix # noqa: F401
28+
2129

2230
class TestSparseDataFrame(tm.TestCase, SharedWithSparse):
2331

@@ -1118,6 +1126,61 @@ def test_isnotnull(self):
11181126
tm.assert_frame_equal(res.to_dense(), exp)
11191127

11201128

1129+
@pytest.mark.parametrize('index', [None, list('ab')])
@pytest.mark.parametrize('columns', [None, list('cd')])
@pytest.mark.parametrize('fill_value', [None, 0, np.nan])
@pytest.mark.parametrize('dtype', [object, bool, int, float, np.uint16])
def test_from_to_scipy(spmatrix,  # noqa: E811, F811
                       index, columns, fill_value, dtype):
    """Round-trip a scipy.sparse matrix through SparseDataFrame (GH 4343)."""
    tm._skip_if_no_scipy()

    # A single dense array is the source for both the sparse matrix under
    # test and the frame the result is compared against
    arr = np.eye(2, dtype=dtype)
    try:
        spm = spmatrix(arr)
    except (TypeError, AssertionError):
        # This matrix format cannot hold arr.dtype, so there is nothing
        # to test for the combination
        return
    if spm.dtype != arr.dtype:
        # The conversion silently changed the dtype: also unsupported
        return

    sdf = pd.SparseDataFrame(spm, index=index, columns=columns,
                             default_fill_value=fill_value)

    # Building the expected frame for every dtype/fill_value pair directly
    # is awkward, so go through object dtype with NaN marking the gaps
    dense_obj = arr.astype(object)
    dense_obj[arr == 0] = np.nan
    fill = np.nan if fill_value is None else fill_value
    expected = pd.SparseDataFrame(dense_obj, index=index, columns=columns)
    expected = expected.fillna(fill)

    # The frame matches, both in sparse and in dense form
    sdf_obj = sdf.astype(object)
    tm.assert_sp_frame_equal(sdf_obj, expected)
    tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())

    # Converting back yields the same stored entries
    round_tripped = dict(sdf.to_coo().todok())
    tm.assert_equal(round_tripped, dict(spm.todok()))

    # The dtype is preserved whenever that is possible
    float_like_fill = fill_value is None or is_float(fill_value)
    was_upcast = float_like_fill and not (is_object_dtype(dtype) or
                                          is_float_dtype(dtype))
    if is_bool_dtype(dtype):
        res_dtype = bool
    elif was_upcast:
        res_dtype = float
    else:
        res_dtype = dtype
    tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)})
    tm.assert_equal(sdf.to_coo().dtype, res_dtype)

    # Adding a str column, however, forces an upcast to object
    sdf['strings'] = np.arange(len(sdf)).astype(str)
    tm.assert_equal(sdf.to_coo().dtype, np.object_)
1182+
1183+
11211184
class TestSparseDataFrameArithmetic(tm.TestCase):
11221185

11231186
def test_numeric_op_scalar(self):

0 commit comments

Comments
 (0)