Skip to content

BUG: Stack/unstack do not return subclassed objects (GH15563) #18929

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 12, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
@@ -449,6 +449,8 @@ Reshaping
- Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
- Bug in :func:`DataFrame.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
-

Numeric
^^^^^^^
6 changes: 2 additions & 4 deletions pandas/core/reshape/melt.py
Original file line number Diff line number Diff line change
@@ -80,8 +80,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None,
mdata[col] = np.asanyarray(frame.columns
._get_level_values(i)).repeat(N)

from pandas import DataFrame
return DataFrame(mdata, columns=mcolumns)
return frame._constructor(mdata, columns=mcolumns)


def lreshape(data, groups, dropna=True, label=None):
@@ -152,8 +151,7 @@ def lreshape(data, groups, dropna=True, label=None):
if not mask.all():
mdata = {k: v[mask] for k, v in compat.iteritems(mdata)}

from pandas import DataFrame
return DataFrame(mdata, columns=id_cols + pivot_cols)
return data._constructor(mdata, columns=id_cols + pivot_cols)


def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
47 changes: 35 additions & 12 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
@@ -37,8 +37,23 @@ class _Unstacker(object):
Parameters
----------
values : ndarray
Values of DataFrame to "Unstack"
index : object
Pandas ``Index``
level : int or str, default last level
Level to "unstack". Accepts a name for the level.
value_columns : Index, optional
Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame
fill_value : scalar, optional
Default value to fill in missing values if subgroups do not have the
same set of labels. By default, missing values will be replaced with
the default fill value for that data type, NaN for float, NaT for
datetimelike, etc. For integer types, by default data will be converted to
float and missing values will be set to NaN.
constructor : object
Pandas ``DataFrame`` or subclass used to create unstacked
response. If None, DataFrame or SparseDataFrame will be used.
Examples
--------
@@ -69,7 +84,7 @@ class _Unstacker(object):
"""

def __init__(self, values, index, level=-1, value_columns=None,
fill_value=None):
fill_value=None, constructor=None):

self.is_categorical = None
self.is_sparse = is_sparse(values)
@@ -86,6 +101,14 @@ def __init__(self, values, index, level=-1, value_columns=None,
self.value_columns = value_columns
self.fill_value = fill_value

if constructor is None:
if self.is_sparse:
self.constructor = SparseDataFrame
else:
self.constructor = DataFrame
else:
self.constructor = constructor

if value_columns is None and values.shape[1] != 1: # pragma: no cover
raise ValueError('must pass column labels for multi-column data')

@@ -173,8 +196,7 @@ def get_result(self):
ordered=ordered)
for i in range(values.shape[-1])]

klass = SparseDataFrame if self.is_sparse else DataFrame
return klass(values, index=index, columns=columns)
return self.constructor(values, index=index, columns=columns)

def get_new_values(self):
values = self.values
@@ -374,8 +396,9 @@ def pivot(self, index=None, columns=None, values=None):
index = self.index
else:
index = self[index]
indexed = Series(self[values].values,
index=MultiIndex.from_arrays([index, self[columns]]))
indexed = self._constructor_sliced(
self[values].values,
index=MultiIndex.from_arrays([index, self[columns]]))
return indexed.unstack(columns)


@@ -461,7 +484,8 @@ def unstack(obj, level, fill_value=None):
return obj.T.stack(dropna=False)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
fill_value=fill_value)
fill_value=fill_value,
constructor=obj._constructor_expanddim)
return unstacker.get_result()


@@ -470,12 +494,12 @@ def _unstack_frame(obj, level, fill_value=None):
unstacker = partial(_Unstacker, index=obj.index,
level=level, fill_value=fill_value)
blocks = obj._data.unstack(unstacker)
klass = type(obj)
return klass(blocks)
return obj._constructor(blocks)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
value_columns=obj.columns,
fill_value=fill_value)
fill_value=fill_value,
constructor=obj._constructor)
return unstacker.get_result()


@@ -528,8 +552,7 @@ def factorize(index):
new_values = new_values[mask]
new_index = new_index[mask]

klass = type(frame)._constructor_sliced
return klass(new_values, index=new_index)
return frame._constructor_sliced(new_values, index=new_index)


def stack_multiple(frame, level, dropna=True):
@@ -676,7 +699,7 @@ def _convert_level_number(level_num, columns):
new_index = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)

result = DataFrame(new_data, index=new_index, columns=new_columns)
result = frame._constructor(new_data, index=new_index, columns=new_columns)

# more efficient way to go about this? can do the whole masking biz but
# will only save a small amount of time...
269 changes: 268 additions & 1 deletion pandas/tests/frame/test_subclass.py
Original file line number Diff line number Diff line change
@@ -5,7 +5,7 @@
from warnings import catch_warnings
import numpy as np

from pandas import DataFrame, Series, MultiIndex, Panel
from pandas import DataFrame, Series, MultiIndex, Panel, Index
import pandas as pd
import pandas.util.testing as tm

@@ -247,3 +247,270 @@ def test_subclass_sparse_transpose(self):
[2, 5],
[3, 6]])
tm.assert_sp_frame_equal(ossdf.T, essdf)

def test_subclass_stack(self):
    """Compare ``stack()`` on a subclassed frame with a SubclassedSeries."""
    # GH 15564
    frame = tm.SubclassedDataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=['a', 'b', 'c'],
        columns=['X', 'Y', 'Z'])

    # Expected labels: each row label paired with every column label.
    expected_index = [list('aaabbbccc'), list('XYZXYZXYZ')]
    expected = tm.SubclassedSeries(
        [1, 2, 3, 4, 5, 6, 7, 8, 9], index=expected_index)

    tm.assert_series_equal(frame.stack(), expected)

def test_subclass_stack_multi(self):
    """Compare ``stack`` on a MultiIndex-column subclassed frame.

    ``stack()`` with no argument and ``stack('yyy')`` are checked against
    the same expected frame; ``stack('www')`` is checked against a second.
    """
    # GH 15564
    row_index = MultiIndex.from_tuples(
        list(zip('AABB', 'cdcd')), names=['aaa', 'ccc'])
    col_index = MultiIndex.from_tuples(
        list(zip('WWXX', 'yzyz')), names=['www', 'yyy'])
    frame = tm.SubclassedDataFrame(
        [[10, 11, 12, 13],
         [20, 21, 22, 23],
         [30, 31, 32, 33],
         [40, 41, 42, 43]],
        index=row_index,
        columns=col_index)

    # Expected result when the 'yyy' column level moves into the index.
    expected_yyy = tm.SubclassedDataFrame(
        [[10, 12],
         [11, 13],
         [20, 22],
         [21, 23],
         [30, 32],
         [31, 33],
         [40, 42],
         [41, 43]],
        index=MultiIndex.from_tuples(
            list(zip('AAAABBBB', 'ccddccdd', 'yzyzyzyz')),
            names=['aaa', 'ccc', 'yyy']),
        columns=Index(['W', 'X'], name='www'))

    for level_args in ((), ('yyy',)):
        tm.assert_frame_equal(frame.stack(*level_args), expected_yyy)

    # Expected result when the 'www' column level moves into the index.
    expected_www = tm.SubclassedDataFrame(
        [[10, 11],
         [12, 13],
         [20, 21],
         [22, 23],
         [30, 31],
         [32, 33],
         [40, 41],
         [42, 43]],
        index=MultiIndex.from_tuples(
            list(zip('AAAABBBB', 'ccddccdd', 'WXWXWXWX')),
            names=['aaa', 'ccc', 'www']),
        columns=Index(['y', 'z'], name='yyy'))

    tm.assert_frame_equal(frame.stack('www'), expected_www)

def test_subclass_stack_multi_mixed(self):
    """Compare ``stack`` on a subclassed frame built from ints and floats.

    Same layout as the all-integer case, but the 'X' columns hold float
    literals.
    """
    # GH 15564
    row_index = MultiIndex.from_tuples(
        list(zip('AABB', 'cdcd')), names=['aaa', 'ccc'])
    col_index = MultiIndex.from_tuples(
        list(zip('WWXX', 'yzyz')), names=['www', 'yyy'])
    frame = tm.SubclassedDataFrame(
        [[10, 11, 12.0, 13.0],
         [20, 21, 22.0, 23.0],
         [30, 31, 32.0, 33.0],
         [40, 41, 42.0, 43.0]],
        index=row_index,
        columns=col_index)

    # Stacking 'yyy': expected 'W' values are ints, 'X' values are floats.
    expected_yyy = tm.SubclassedDataFrame(
        [[10, 12.0],
         [11, 13.0],
         [20, 22.0],
         [21, 23.0],
         [30, 32.0],
         [31, 33.0],
         [40, 42.0],
         [41, 43.0]],
        index=MultiIndex.from_tuples(
            list(zip('AAAABBBB', 'ccddccdd', 'yzyzyzyz')),
            names=['aaa', 'ccc', 'yyy']),
        columns=Index(['W', 'X'], name='www'))

    for level_args in ((), ('yyy',)):
        tm.assert_frame_equal(frame.stack(*level_args), expected_yyy)

    # Stacking 'www': the expected frame is written with floats throughout.
    expected_www = tm.SubclassedDataFrame(
        [[10.0, 11.0],
         [12.0, 13.0],
         [20.0, 21.0],
         [22.0, 23.0],
         [30.0, 31.0],
         [32.0, 33.0],
         [40.0, 41.0],
         [42.0, 43.0]],
        index=MultiIndex.from_tuples(
            list(zip('AAAABBBB', 'ccddccdd', 'WXWXWXWX')),
            names=['aaa', 'ccc', 'www']),
        columns=Index(['y', 'z'], name='yyy'))

    tm.assert_frame_equal(frame.stack('www'), expected_www)

def test_subclass_unstack(self):
    """Compare ``unstack()`` on a flat-indexed subclassed frame."""
    # GH 15564
    frame = tm.SubclassedDataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=['a', 'b', 'c'],
        columns=['X', 'Y', 'Z'])

    # Expected labels: each column label paired with every row label.
    expected_index = [list('XXXYYYZZZ'), list('abcabcabc')]
    expected = tm.SubclassedSeries(
        [1, 4, 7, 2, 5, 8, 3, 6, 9], index=expected_index)

    tm.assert_series_equal(frame.unstack(), expected)

def test_subclass_unstack_multi(self):
    """Compare ``unstack`` on a MultiIndex-indexed subclassed frame.

    ``unstack()`` with no argument and ``unstack('ccc')`` are checked
    against the same expected frame; ``unstack('aaa')`` against a second.
    """
    # GH 15564
    row_index = MultiIndex.from_tuples(
        list(zip('AABB', 'cdcd')), names=['aaa', 'ccc'])
    col_index = MultiIndex.from_tuples(
        list(zip('WWXX', 'yzyz')), names=['www', 'yyy'])
    frame = tm.SubclassedDataFrame(
        [[10, 11, 12, 13],
         [20, 21, 22, 23],
         [30, 31, 32, 33],
         [40, 41, 42, 43]],
        index=row_index,
        columns=col_index)

    # Expected result when the 'ccc' index level moves into the columns.
    expected_ccc = tm.SubclassedDataFrame(
        [[10, 20, 11, 21, 12, 22, 13, 23],
         [30, 40, 31, 41, 32, 42, 33, 43]],
        index=Index(['A', 'B'], name='aaa'),
        columns=MultiIndex.from_tuples(
            list(zip('WWWWXXXX', 'yyzzyyzz', 'cdcdcdcd')),
            names=['www', 'yyy', 'ccc']))

    for level_args in ((), ('ccc',)):
        tm.assert_frame_equal(frame.unstack(*level_args), expected_ccc)

    # Expected result when the 'aaa' index level moves into the columns.
    expected_aaa = tm.SubclassedDataFrame(
        [[10, 30, 11, 31, 12, 32, 13, 33],
         [20, 40, 21, 41, 22, 42, 23, 43]],
        index=Index(['c', 'd'], name='ccc'),
        columns=MultiIndex.from_tuples(
            list(zip('WWWWXXXX', 'yyzzyyzz', 'ABABABAB')),
            names=['www', 'yyy', 'aaa']))

    tm.assert_frame_equal(frame.unstack('aaa'), expected_aaa)

def test_subclass_unstack_multi_mixed(self):
    """Compare ``unstack`` on a subclassed frame built from ints and floats.

    Same layout as the all-integer case, but the 'X' columns hold float
    literals.
    """
    # GH 15564
    row_index = MultiIndex.from_tuples(
        list(zip('AABB', 'cdcd')), names=['aaa', 'ccc'])
    col_index = MultiIndex.from_tuples(
        list(zip('WWXX', 'yzyz')), names=['www', 'yyy'])
    frame = tm.SubclassedDataFrame(
        [[10, 11, 12.0, 13.0],
         [20, 21, 22.0, 23.0],
         [30, 31, 32.0, 33.0],
         [40, 41, 42.0, 43.0]],
        index=row_index,
        columns=col_index)

    # Expected result when the 'ccc' index level moves into the columns.
    expected_ccc = tm.SubclassedDataFrame(
        [[10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0],
         [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0]],
        index=Index(['A', 'B'], name='aaa'),
        columns=MultiIndex.from_tuples(
            list(zip('WWWWXXXX', 'yyzzyyzz', 'cdcdcdcd')),
            names=['www', 'yyy', 'ccc']))

    for level_args in ((), ('ccc',)):
        tm.assert_frame_equal(frame.unstack(*level_args), expected_ccc)

    # Expected result when the 'aaa' index level moves into the columns.
    expected_aaa = tm.SubclassedDataFrame(
        [[10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0],
         [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0]],
        index=Index(['c', 'd'], name='ccc'),
        columns=MultiIndex.from_tuples(
            list(zip('WWWWXXXX', 'yyzzyyzz', 'ABABABAB')),
            names=['www', 'yyy', 'aaa']))

    tm.assert_frame_equal(frame.unstack('aaa'), expected_aaa)

def test_subclass_pivot(self):
    """Compare ``pivot`` on a subclassed frame with a SubclassedDataFrame."""
    # GH 15564
    long_frame = tm.SubclassedDataFrame({
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.]})

    result = long_frame.pivot(
        index='index', columns='columns', values='values')

    wide = tm.SubclassedDataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.}})
    # The expected axes are named after the source columns passed to pivot.
    wide.index.name = 'index'
    wide.columns.name = 'columns'

    tm.assert_frame_equal(result, wide)

def test_subclassed_melt(self):
    """Compare ``pd.melt`` on a subclassed frame with the expected frame."""
    # GH 15564
    wide = tm.SubclassedDataFrame({
        'first': ['John', 'Mary'],
        'last': ['Doe', 'Bo'],
        'height': [5.5, 6.0],
        'weight': [130, 150]})

    # One expected row per (person, measured variable) pair.
    expected_rows = [
        ['John', 'Doe', 'height', 5.5],
        ['Mary', 'Bo', 'height', 6.0],
        ['John', 'Doe', 'weight', 130],
        ['Mary', 'Bo', 'weight', 150],
    ]
    expected = tm.SubclassedDataFrame(
        expected_rows,
        columns=['first', 'last', 'variable', 'value'])

    result = pd.melt(wide, id_vars=['first', 'last'])

    tm.assert_frame_equal(result, expected)

def test_subclassed_wide_to_long(self):
    """Compare ``pd.wide_to_long`` on a subclassed frame with the expected
    long-format frame."""
    # GH 9762

    # Seed the global RNG so the 'X' column is reproducible.
    np.random.seed(123)
    noise = np.random.randn(3)
    wide = tm.SubclassedDataFrame({
        "A1970": {0: "a", 1: "b", 2: "c"},
        "A1980": {0: "d", 1: "e", 2: "f"},
        "B1970": {0: 2.5, 1: 1.2, 2: .7},
        "B1980": {0: 3.2, 1: 1.3, 2: .1},
        "X": dict(enumerate(noise))})
    wide["id"] = wide.index

    # 'X' is not a stub column, so its values appear once per year.
    expected = tm.SubclassedDataFrame({
        "X": noise.tolist() * 2,
        "A": ['a', 'b', 'c', 'd', 'e', 'f'],
        "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
        "year": [1970, 1970, 1970, 1980, 1980, 1980],
        "id": [0, 1, 2, 0, 1, 2]})
    expected = expected.set_index(['id', 'year'])
    expected = expected[["X", "A", "B"]]

    result = pd.wide_to_long(wide, ["A", "B"], i="id", j="year")

    tm.assert_frame_equal(result, expected)
15 changes: 11 additions & 4 deletions pandas/tests/series/test_subclass.py
Original file line number Diff line number Diff line change
@@ -13,24 +13,31 @@ def test_indexing_sliced(self):
res = s.loc[['a', 'b']]
exp = tm.SubclassedSeries([1, 2], index=list('ab'))
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)

res = s.iloc[[2, 3]]
exp = tm.SubclassedSeries([3, 4], index=list('cd'))
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)

res = s.loc[['a', 'b']]
exp = tm.SubclassedSeries([1, 2], index=list('ab'))
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)

def test_to_frame(self):
    """Check ``to_frame`` on a named subclassed Series.

    The result is compared with the expected frame and must be an instance
    of the frame subclass.
    """
    ser = tm.SubclassedSeries(
        [1, 2, 3, 4], index=list('abcd'), name='xxx')
    result = ser.to_frame()

    # The Series name becomes the single column label.
    expected = tm.SubclassedDataFrame(
        {'xxx': [1, 2, 3, 4]}, index=list('abcd'))

    tm.assert_frame_equal(result, expected)
    assert isinstance(result, tm.SubclassedDataFrame)

def test_subclass_unstack(self):
    """Compare ``unstack()`` on a two-level subclassed Series with the
    expected SubclassedDataFrame."""
    # GH 15564
    two_level_index = [list('aabb'), list('xyxy')]
    ser = tm.SubclassedSeries([1, 2, 3, 4], index=two_level_index)

    # Second index level ('x'/'y') supplies the expected column labels.
    expected = tm.SubclassedDataFrame(
        {'x': [1, 3], 'y': [2, 4]}, index=['a', 'b'])

    tm.assert_frame_equal(ser.unstack(), expected)


class TestSparseSeriesSubclassing(object):