diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index b4884cf1c4141..d5cf39ce2925f 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -213,9 +213,28 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you sdf.to_coo() +.. _sparse.scipysparse_series: + SparseSeries ~~~~~~~~~~~~ +.. versionadded:: 0.21.0 + +``SparseSeries`` and ``SparseArray`` can be constructed from ``scipy.sparse.spmatrix`` objects of shape ``(1, n)`` or ``(n, 1)``. +SciPy sparse matrices can also be assigned directly to a ``SparseDataFrame`` with an index. + +.. ipython:: python + + sa = pd.SparseSeries(sp_arr[:, 5]) + sa + + sdf['x'] = sa + sdf['y'] = sp_arr[:, 6] + sdf[['z', 'w']] = sp_arr[:, [7, 8]] + sdf.iloc[:, -5:] + +The interface below is deprecated. + .. versionadded:: 0.16.0 A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``. diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 3dd8bb2ac2de5..c1a7d9d621a33 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -25,6 +25,10 @@ New features - Added `__fspath__` method to :class`:pandas.HDFStore`, :class:`pandas.ExcelFile`, and :class:`pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) +- ``SparseSeries`` and ``SparseArray`` now support 1d ``scipy.sparse.spmatrix`` in constructor. + Additionally, ``SparseDataFrame`` can be assigned columns of ``scipy.sparse.spmatrix``; + see :ref:`here <sparse.scipysparse_series>`. (:issue:`15634`) + .. 
_whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a01e3dc46dfe9..ecfdc61a5f908 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -554,7 +554,7 @@ def can_do_equal_len(): setter(item, v) # we have an equal len ndarray/convertible to our labels - elif np.array(value).ndim == 2: + elif np.asanyarray(value).ndim == 2: # note that this coerces the dtype if we are mixed # GH 7551 diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 15851a17274ca..ee065258f40cf 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -775,7 +775,7 @@ def _is_empty_indexer(indexer): return block except ValueError: raise - except TypeError: + except TypeError as e: # cast to the passed dtype if possible # otherwise raise the original error @@ -788,7 +788,7 @@ def _is_empty_indexer(indexer): except: pass - raise + raise e except Exception: pass @@ -1634,7 +1634,7 @@ class ComplexBlock(FloatOrComplexBlock): def _can_hold_element(self, element): if is_list_like(element): - element = np.array(element) + element = np.asanyarray(element) return issubclass(element.dtype.type, (np.floating, np.integer, np.complexfloating)) return (isinstance(element, @@ -1658,7 +1658,7 @@ class IntBlock(NumericBlock): def _can_hold_element(self, element): if is_list_like(element): - element = np.array(element) + element = np.asanyarray(element) tipo = element.dtype.type return (issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))) @@ -1805,7 +1805,7 @@ class BoolBlock(NumericBlock): def _can_hold_element(self, element): if is_list_like(element): - element = np.array(element) + element = np.asanyarray(element) return issubclass(element.dtype.type, np.integer) return isinstance(element, (int, bool)) @@ -2571,6 +2571,16 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, return self.make_block_same_class(values=values, 
placement=self.mgr_locs) + def _can_hold_element(self, element): + element = np.asanyarray(element) + return np.issubdtype(element.dtype, self.sp_values.dtype) + + def _try_cast(self, element): + try: + return np.asarray(element, dtype=self.sp_values.dtype) + except ValueError: + return element + def __len__(self): try: return self.sp_index.length diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 8ac9d3916573e..3cbf230fadeb1 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -23,7 +23,8 @@ is_bool_dtype, is_list_like, is_string_dtype, - is_scalar, is_dtype_equal) + is_scalar, is_dtype_equal, + is_scipy_sparse) from pandas.core.dtypes.cast import ( maybe_convert_platform, maybe_promote, astype_nansafe, find_common_type) @@ -164,11 +165,13 @@ class SparseArray(PandasObject, np.ndarray): Parameters ---------- - data : {array-like (1-D), Series, SparseSeries, dict} + data : {array-like (1-D), Series, SparseSeries, dict, \ + scipy.sparse.spmatrix} kind : {'block', 'integer'} fill_value : float Code for missing value. Defaults depends on dtype. - 0 for int dtype, False for bool dtype, and NaN for other dtypes + 0 for int dtype or scipy sparse matrix, False for bool dtype, and NaN + for other dtypes sparse_index : {BlockIndex, IntIndex}, optional Only if you have one. 
Mainly used internally @@ -197,17 +200,27 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', values.fill(data) data = values - if isinstance(data, ABCSparseSeries): - data = data.values - is_sparse_array = isinstance(data, SparseArray) - if dtype is not None: dtype = np.dtype(dtype) - if is_sparse_array: + if isinstance(data, ABCSparseSeries): + data = data.values + + if isinstance(data, SparseArray): sparse_index = data.sp_index values = data.sp_values fill_value = data.fill_value + elif is_scipy_sparse(data): + if not any(ax == 1 for ax in data.shape): + raise ValueError('Need 1D sparse matrix shaped ' + '(n, 1) or (1, n)') + coo = data.tocoo() + values = coo.data + indices = coo.row if coo.shape[0] != 1 else coo.col + sparse_index = _make_index(max(coo.shape), indices, kind) + # SciPy Sparse matrices imply missing value = 0 + if fill_value is None: + fill_value = 0 else: # array-like if sparse_index is None: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 461dd50c5da6e..7a39306a4ee5b 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -433,6 +433,16 @@ def __getitem__(self, key): else: return self._get_item_cache(key) + def __setitem__(self, key, value): + if is_scipy_sparse(value): + if any(ax == 1 for ax in value.shape): # 1d spmatrix + value = SparseArray(value, fill_value=self._default_fill_value, + kind=self._default_kind) + else: + # 2d; make it iterable + value = list(value.tocsc().T) + super(SparseDataFrame, self).__setitem__(key, value) + @Appender(DataFrame.get_value.__doc__, indents=0) def get_value(self, index, col, takeable=False): if takeable is True: diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 9dd061e26ba06..56fb10926b9eb 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -9,7 +9,7 @@ import warnings from pandas.core.dtypes.missing import isnull, notnull -from pandas.core.dtypes.common import is_scalar +from 
pandas.core.dtypes.common import is_scalar, is_scipy_sparse from pandas.core.common import _values_from_object, _maybe_match_name from pandas.compat.numpy import function as nv @@ -90,7 +90,7 @@ class SparseSeries(Series): Parameters ---------- - data : {array-like, Series, SparseSeries, dict} + data : {array-like, Series, SparseSeries, dict, scipy.sparse.spmatrix} kind : {'block', 'integer'} fill_value : float Code for missing value. Defaults depends on dtype. @@ -128,6 +128,10 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if isinstance(data, Series) and name is None: name = data.name + if is_scipy_sparse(data): + data = SparseArray(data, dtype=dtype, kind=kind, + fill_value=fill_value) + if isinstance(data, SparseArray): if index is not None: assert (len(index) == len(data)) @@ -722,6 +726,10 @@ def combine_first(self, other): def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): """ + .. deprecated:: 0.21.0 + Instead, make a SparseSeries with a two-level index, + unstack it, then use .to_coo() on the resulting SparseDataFrame. + Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. Use row_levels and column_levels to determine the row and column @@ -779,6 +787,10 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): @classmethod def from_coo(cls, A, dense_index=False): """ + .. deprecated:: 0.21.0 + Instead, pass 1d scipy.sparse matrices directly into SparseSeries + constructor, and 2d into SparseDataFrame constructor. + Create a SparseSeries from a scipy.sparse.coo_matrix. .. 
versionadded:: 0.16.0 diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index 4ce03f72dbba6..3b8d3b7b27e85 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -105,6 +105,24 @@ def test_constructor_spindex_dtype(self): assert arr.dtype == np.int64 assert arr.fill_value == 0 + def test_constructor_spmatrix(self): + # GH-15634 + tm.skip_if_no_package('scipy') + from scipy.sparse import csr_matrix + + spm = csr_matrix(np.arange(5)) + + arr = SparseArray(spm) + assert arr.dtype == spm.dtype + assert arr.fill_value == 0 + + arr = SparseArray(spm, kind='block', dtype=float, fill_value=np.nan) + assert arr.dtype == float + assert np.isnan(arr.fill_value) + + tm.assert_raises_regex(ValueError, '1D', + lambda: SparseArray(csr_matrix(np.eye(3)))) + def test_sparseseries_roundtrip(self): # GH 13999 for kind in ['integer', 'block']: diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index 654d12b782f37..fda26ded0d931 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -540,6 +540,37 @@ def test_setitem_array(self): self.frame['F'].reindex(index), check_names=False) + def test_setitem_spmatrix(self): + # GH-15634 + tm.skip_if_no_package('scipy') + from scipy.sparse import csr_matrix + + sdf = self.frame.copy(False) + + def _equal(spm1, spm2): + return np.all(spm1.toarray() == spm2.toarray()) + + # 1d -- column + spm = csr_matrix(np.arange(len(sdf))).T + sdf['X'] = spm + assert _equal(sdf[['X']].to_coo(), spm) + + # 1d -- existing column + sdf['A'] = spm.T + assert _equal(sdf[['X']].to_coo(), spm) + + # 1d row -- changing series contents not yet supported + spm = csr_matrix(np.arange(sdf.shape[1], dtype=float)) + idx = np.zeros(sdf.shape[0], dtype=bool) + idx[1] = True + tm.assert_raises_regex(TypeError, 'assignment', + lambda: sdf.__setitem__(idx, spm)) + + # 2d -- 2 columns + spm = csr_matrix(np.eye(len(sdf))[:, :2]) + sdf[['X', 'A']] = 
spm + assert _equal(sdf[['X', 'A']].to_coo(), spm) + def test_delitem(self): A = self.frame['A'] C = self.frame['C'] diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py index b524d6bfab418..e428627e9116f 100644 --- a/pandas/tests/sparse/test_series.py +++ b/pandas/tests/sparse/test_series.py @@ -142,6 +142,24 @@ def test_constructor_preserve_attr(self): assert s.dtype == np.int64 assert s.fill_value == 0 + def test_constructor_spmatrix(self): + # GH-15634 + tm.skip_if_no_package('scipy') + from scipy.sparse import csr_matrix + + spm = csr_matrix(np.eye(5)[:, 2]) + + arr = SparseSeries(spm) + assert arr.dtype == spm.dtype + assert arr.fill_value == 0 + + arr = SparseSeries(spm, kind='block', dtype=float, fill_value=np.nan) + assert arr.dtype == float + assert np.isnan(arr.fill_value) + + tm.assert_raises_regex(ValueError, '1D', + lambda: SparseSeries(csr_matrix(np.eye(3)))) + def test_series_density(self): # GH2803 ts = Series(np.random.randn(10))