
SparseSeries accepts scipy.sparse.spmatrix in constructor #16617


Closed
wants to merge 9 commits into from
19 changes: 19 additions & 0 deletions doc/source/sparse.rst
@@ -213,9 +213,28 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you

sdf.to_coo()

.. _sparse.scipysparse_series:

SparseSeries
~~~~~~~~~~~~

.. versionadded:: 0.20.2

``SparseSeries`` and ``SparseArray`` can be constructed from ``scipy.sparse.spmatrix`` objects of shape ``(1, n)`` or ``(n, 1)``.
SciPy sparse matrices can also be assigned directly to a ``SparseDataFrame`` with an index.

.. ipython:: python

sa = pd.SparseSeries(sp_arr[:, 5])
sa

sdf['x'] = sa
sdf['y'] = sp_arr[:, 6]
sdf[['z', 'w']] = sp_arr[:, [7, 8]]
sdf.iloc[:, -5:]
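
As a rough, self-contained sketch of the constructor behaviour described above (it assumes SciPy is installed and this branch's changes; ``sp_mat`` and the column labels are illustrative, not part of the docs):

import numpy as np
import pandas as pd
from scipy import sparse

sp_mat = sparse.random(10, 4, density=0.3, format='csr')  # illustrative matrix

# 2d spmatrix -> SparseDataFrame (existing 0.20.0 behaviour)
sdf = pd.SparseDataFrame(sp_mat)

# 1d slices construct SparseSeries/SparseArray directly (this PR)
ss = pd.SparseSeries(sp_mat[:, 0])   # (n, 1) slice; fill_value defaults to 0
sa = pd.SparseArray(sp_mat[0, :])    # (1, n) slice works as well

# sparse matrices can be assigned as columns of an indexed SparseDataFrame
sdf['x'] = sp_mat[:, 1]
sdf[['y', 'z']] = sp_mat[:, [2, 3]]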

The interface below is deprecated.
Review comment (Contributor): say that this is deprecated in 0.21.0


.. versionadded:: 0.16.0

A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.20.2.txt
@@ -25,6 +25,9 @@ Enhancements
has been added to return the group order (:issue:`11642`); see
:ref:`here <groupby.ngroup>`.


- ``SparseSeries`` and ``SparseArray`` now support a 1d ``scipy.sparse.spmatrix`` in the constructor. Additionally, columns of ``scipy.sparse.spmatrix`` can be assigned to a ``SparseDataFrame``; see :ref:`here <sparse.scipysparse_series>`. (:issue:`15634`)
Review comment (Contributor): will be for 0.21.0


.. _whatsnew_0202.performance:

Performance Improvements
2 changes: 1 addition & 1 deletion pandas/core/indexing.py
@@ -554,7 +554,7 @@ def can_do_equal_len():
setter(item, v)

# we have an equal len ndarray/convertible to our labels
elif np.array(value).ndim == 2:
elif np.asanyarray(value).ndim == 2:

# note that this coerces the dtype if we are mixed
# GH 7551
16 changes: 13 additions & 3 deletions pandas/core/internals.py
@@ -1634,7 +1634,7 @@ class ComplexBlock(FloatOrComplexBlock):

def _can_hold_element(self, element):
if is_list_like(element):
element = np.array(element)
element = np.asanyarray(element)
return issubclass(element.dtype.type,
(np.floating, np.integer, np.complexfloating))
return (isinstance(element,
@@ -1658,7 +1658,7 @@ class IntBlock(NumericBlock):

def _can_hold_element(self, element):
if is_list_like(element):
element = np.array(element)
element = np.asanyarray(element)
tipo = element.dtype.type
return (issubclass(tipo, np.integer) and
not issubclass(tipo, (np.datetime64, np.timedelta64)))
@@ -1805,7 +1805,7 @@ class BoolBlock(NumericBlock):

def _can_hold_element(self, element):
if is_list_like(element):
element = np.array(element)
element = np.asanyarray(element)
return issubclass(element.dtype.type, np.integer)
return isinstance(element, (int, bool))

@@ -2571,6 +2571,16 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
return self.make_block_same_class(values=values,
placement=self.mgr_locs)

def _can_hold_element(self, element):
element = np.asanyarray(element)
return np.issubdtype(element.dtype, self.sp_values.dtype)

def _try_cast(self, element):
try:
return np.asarray(element, dtype=self.sp_values.dtype)
except ValueError:
return element

def __len__(self):
try:
return self.sp_index.length
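As a hedged illustration of the new ``SparseBlock._can_hold_element`` check above: an incoming element is coerced with ``np.asanyarray`` and its dtype is tested against the block's ``sp_values`` dtype via ``np.issubdtype``. A minimal sketch (``sp_values_dtype`` is a stand-in for the real block attribute):

import numpy as np

sp_values_dtype = np.dtype('float64')   # stand-in for self.sp_values.dtype

np.issubdtype(np.asanyarray([1.0, 2.0]).dtype, sp_values_dtype)   # True: float64 values fit the block
np.issubdtype(np.asanyarray([1, 2]).dtype, sp_values_dtype)       # False: int64 is not a sub-dtype of float64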
29 changes: 21 additions & 8 deletions pandas/core/sparse/array.py
@@ -23,7 +23,8 @@
is_bool_dtype,
is_list_like,
is_string_dtype,
is_scalar, is_dtype_equal)
is_scalar, is_dtype_equal,
is_scipy_sparse)
from pandas.core.dtypes.cast import (
maybe_convert_platform, maybe_promote,
astype_nansafe, find_common_type)
@@ -164,11 +165,13 @@ class SparseArray(PandasObject, np.ndarray):

Parameters
----------
data : {array-like (1-D), Series, SparseSeries, dict}
data : {array-like (1-D), Series, SparseSeries, dict, \
scipy.sparse.spmatrix}
kind : {'block', 'integer'}
fill_value : float
Code for missing value. Defaults depends on dtype.
0 for int dtype, False for bool dtype, and NaN for other dtypes
0 for int dtype or scipy sparse matrix, False for bool dtype, and NaN
for other dtypes
sparse_index : {BlockIndex, IntIndex}, optional
Only if you have one. Mainly used internally

@@ -197,17 +200,27 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer',
values.fill(data)
data = values

if isinstance(data, ABCSparseSeries):
data = data.values
is_sparse_array = isinstance(data, SparseArray)

if dtype is not None:
dtype = np.dtype(dtype)

if is_sparse_array:
if isinstance(data, ABCSparseSeries):
data = data.values

if isinstance(data, SparseArray):
sparse_index = data.sp_index
values = data.sp_values
fill_value = data.fill_value
elif is_scipy_sparse(data):
if not any(ax == 1 for ax in data.shape):
raise ValueError('Need 1D sparse matrix shaped '
'(n, 1) or (1, n)')
coo = data.tocoo()
values = coo.data
indices = coo.row if coo.shape[0] != 1 else coo.col
sparse_index = _make_index(max(coo.shape), indices, kind)
# SciPy Sparse matrices imply missing value = 0
if fill_value is None:
fill_value = 0
else:
# array-like
if sparse_index is None:
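A minimal sketch of the spmatrix branch added to ``SparseArray.__new__`` above, run outside pandas so the intermediate values are visible (``spm`` is an illustrative matrix; pandas then feeds ``indices`` and ``max(coo.shape)`` to its internal ``_make_index``):

import numpy as np
from scipy import sparse

spm = sparse.csr_matrix(np.array([[0., 0., 3., 0., 5.]])).T   # shape (5, 1)

if not any(ax == 1 for ax in spm.shape):
    raise ValueError('Need 1D sparse matrix shaped (n, 1) or (1, n)')
coo = spm.tocoo()
values = coo.data                                     # array([3., 5.])
indices = coo.row if coo.shape[0] != 1 else coo.col   # array([2, 4])
length = max(coo.shape)                               # 5
fill_value = 0                                        # scipy sparse implies missing value == 0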
9 changes: 9 additions & 0 deletions pandas/core/sparse/frame.py
@@ -433,6 +433,15 @@ def __getitem__(self, key):
else:
return self._get_item_cache(key)

def __setitem__(self, key, value):
if is_scipy_sparse(value):
if any(ax == 1 for ax in value.shape): # 1d spmatrix
value = SparseArray(value, fill_value=self._default_fill_value)
else:
# 2d; make it iterable
value = list(value.tocsc().T)
Review comment (Contributor): does this materialize?

super().__setitem__(key, value)
Review comment (Contributor): use the fully qualified call


@Appender(DataFrame.get_value.__doc__, indents=0)
def get_value(self, index, col, takeable=False):
if takeable is True:
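A small usage sketch of the ``__setitem__`` dispatch above (assuming SciPy and this branch; labels are illustrative). A 1d matrix is wrapped in a ``SparseArray`` with the frame's default fill value; a 2d matrix is split column-wise via ``tocsc().T``, and since iterating a sparse matrix yields ``(1, n)`` sparse rows, that split should not densify the data:

import numpy as np
import pandas as pd
from scipy import sparse

sdf = pd.SparseDataFrame(np.zeros((4, 2)), columns=['a', 'b'])

col = sparse.csr_matrix(np.arange(4.)).T        # (4, 1): taken through the SparseArray branch
sdf['c'] = col

block = sparse.eye(4, format='csc')[:, :2]      # (4, 2): split into per-column (1, n) matrices
sdf[['d', 'e']] = block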
14 changes: 12 additions & 2 deletions pandas/core/sparse/series.py
@@ -9,7 +9,7 @@
import warnings

from pandas.core.dtypes.missing import isnull, notnull
from pandas.core.dtypes.common import is_scalar
from pandas.core.dtypes.common import is_scalar, is_scipy_sparse
from pandas.core.common import _values_from_object, _maybe_match_name

from pandas.compat.numpy import function as nv
@@ -90,7 +90,7 @@ class SparseSeries(Series):

Parameters
----------
data : {array-like, Series, SparseSeries, dict}
data : {array-like, Series, SparseSeries, dict, scipy.sparse.spmatrix}
kind : {'block', 'integer'}
fill_value : float
Code for missing value. Defaults depends on dtype.
@@ -128,6 +128,10 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
if isinstance(data, Series) and name is None:
name = data.name

if is_scipy_sparse(data):
data = SparseArray(data, dtype=dtype, kind=kind,
fill_value=fill_value)

if isinstance(data, SparseArray):
if index is not None:
assert (len(index) == len(data))
@@ -722,6 +726,9 @@ def combine_first(self, other):

def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
"""
DEPRECATED; instead, make a SparseSeries with a two-level index,
unstack it, then use .to_coo() on the resulting SparseDataFrame.
Review comment (Contributor): use the deprecated sphinx directive (I think we are changing these all over)


Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex.

Use row_levels and column_levels to determine the row and column
@@ -779,6 +786,9 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
@classmethod
def from_coo(cls, A, dense_index=False):
"""
DEPRECATED; instead, pass 1d scipy.sparse matrices directly into the
SparseSeries constructor, and 2d matrices into the SparseDataFrame constructor.
Review comment (Contributor): same


Create a SparseSeries from a scipy.sparse.coo_matrix.

.. versionadded:: 0.16.0
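To make the deprecation notes above concrete, a hedged before/after sketch (assuming SciPy and this branch; the COO data is made up):

import numpy as np
import pandas as pd
from scipy import sparse

coo = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))

# Deprecated path: round-trip through a MultiIndexed SparseSeries.
ss_multi = pd.SparseSeries.from_coo(coo)                         # index is (row, col) pairs
coo_again = ss_multi.to_coo(row_levels=(0,), column_levels=(1,))

# Proposed replacement: 1d matrices go straight into SparseSeries,
# 2d matrices into SparseDataFrame.
ss = pd.SparseSeries(coo.tocsr()[0, :])                          # one row as a (1, n) matrix
sdf = pd.SparseDataFrame(coo)
coo_again = sdf.to_coo()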
18 changes: 18 additions & 0 deletions pandas/tests/sparse/test_array.py
@@ -105,6 +105,24 @@ def test_constructor_spindex_dtype(self):
assert arr.dtype == np.int64
assert arr.fill_value == 0

def test_constructor_spmatrix(self):
# GH-15634
tm.skip_if_no_package('scipy')
from scipy.sparse import csr_matrix

spm = csr_matrix(np.arange(5))

arr = SparseArray(spm)
assert arr.dtype == spm.dtype
assert arr.fill_value == 0

arr = SparseArray(spm, kind='block', dtype=float, fill_value=np.nan)
assert arr.dtype == float
assert np.isnan(arr.fill_value)

tm.assert_raises_regex(ValueError, '1D',
lambda: SparseArray(csr_matrix(np.eye(3))))

def test_sparseseries_roundtrip(self):
# GH 13999
for kind in ['integer', 'block']:
27 changes: 27 additions & 0 deletions pandas/tests/sparse/test_frame.py
@@ -540,6 +540,33 @@ def test_setitem_array(self):
self.frame['F'].reindex(index),
check_names=False)

def test_setitem_spmatrix(self):
# GH-15634
tm.skip_if_no_package('scipy')
from scipy.sparse import csr_matrix

sdf = self.frame.copy(False)

# 1d -- column
spm = csr_matrix(np.arange(len(sdf))).T
sdf['X'] = spm
assert (sdf[['X']].to_coo() != spm.tocoo()).nnz == 0

Review comment (Contributor): this comparison on the scipy side is fine, but also let's compare with assert_sparse_series/frame_equal

# 1d -- existing column
sdf['A'] = spm.T
assert (sdf[['A']].to_coo() != spm.tocoo()).nnz == 0

# 1d row -- changing series contents not yet supported
spm = csr_matrix(np.arange(sdf.shape[1])).astype(float)
idx = np.r_[[False, True], np.full(sdf.shape[0] - 2, False)]
tm.assert_raises_regex(TypeError, 'assignment',
lambda: sdf.__setitem__(idx, spm))

# 2d -- 2 columns
spm = csr_matrix(np.eye(len(sdf))[:, :2])
sdf[['X', 'A']] = spm
assert (sdf[['X', 'A']].to_coo() != spm.tocoo()).nnz == 0

def test_delitem(self):
A = self.frame['A']
C = self.frame['C']
18 changes: 18 additions & 0 deletions pandas/tests/sparse/test_series.py
@@ -142,6 +142,24 @@ def test_constructor_preserve_attr(self):
assert s.dtype == np.int64
assert s.fill_value == 0

def test_constructor_spmatrix(self):
# GH-15634
tm.skip_if_no_package('scipy')
from scipy.sparse import csr_matrix

spm = csr_matrix(np.eye(5)[:, 2])

arr = SparseSeries(spm)
assert arr.dtype == spm.dtype
assert arr.fill_value == 0

arr = SparseSeries(spm, kind='block', dtype=float, fill_value=np.nan)
assert arr.dtype == float
assert np.isnan(arr.fill_value)

tm.assert_raises_regex(ValueError, '1D',
lambda: SparseSeries(csr_matrix(np.eye(3))))

def test_series_density(self):
# GH2803
ts = Series(np.random.randn(10))