-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: Maintain column order with groupby.nth #22811
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2944,17 +2944,20 @@ def intersection(self, other): | |
taken.name = None | ||
return taken | ||
|
||
def difference(self, other): | ||
def difference(self, other, sort=True): | ||
""" | ||
Return a new Index with elements from the index that are not in | ||
`other`. | ||
|
||
This is the set difference of two Index objects. | ||
It's sorted if sorting is possible. | ||
|
||
Parameters | ||
---------- | ||
other : Index or array-like | ||
sort : bool, default True | ||
Sort the resulting index if possible | ||
|
||
.. versionadded:: 0.24.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you make sure this is added to all subclasses as well (multi, interval)? I think they have their own implementations. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you do this (in this PR)? Ideally, also update the tests for `.difference` for all index types, parametrizing them where appropriate. |
||
|
||
Returns | ||
------- | ||
|
@@ -2963,10 +2966,12 @@ def difference(self, other): | |
Examples | ||
-------- | ||
|
||
>>> idx1 = pd.Index([1, 2, 3, 4]) | ||
>>> idx1 = pd.Index([2, 1, 3, 4]) | ||
>>> idx2 = pd.Index([3, 4, 5, 6]) | ||
>>> idx1.difference(idx2) | ||
Int64Index([1, 2], dtype='int64') | ||
>>> idx1.difference(idx2, sort=False) | ||
Int64Index([2, 1], dtype='int64') | ||
|
||
""" | ||
self._assert_can_do_setop(other) | ||
|
@@ -2985,10 +2990,11 @@ def difference(self, other): | |
label_diff = np.setdiff1d(np.arange(this.size), indexer, | ||
assume_unique=True) | ||
the_diff = this.values.take(label_diff) | ||
try: | ||
the_diff = sorting.safe_sort(the_diff) | ||
except TypeError: | ||
pass | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add some tests in the index tests to exercise this (probably just parametrize the new parameter in the existing tests)? |
||
if sort: | ||
try: | ||
the_diff = sorting.safe_sort(the_diff) | ||
except TypeError: | ||
pass | ||
|
||
return this._shallow_copy(the_diff, name=result_name, freq=None) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1037,7 +1037,7 @@ def overlaps(self, other): | |
return self._data.overlaps(other) | ||
|
||
def _setop(op_name): | ||
def func(self, other): | ||
def func(self, other, sort=True): | ||
other = self._as_like_interval_index(other) | ||
|
||
# GH 19016: ensure set op will not return a prohibited dtype | ||
|
@@ -1048,7 +1048,11 @@ def func(self, other): | |
'objects that have compatible dtypes') | ||
raise TypeError(msg.format(op=op_name)) | ||
|
||
result = getattr(self._multiindex, op_name)(other._multiindex) | ||
if op_name == 'difference': | ||
result = getattr(self._multiindex, op_name)(other._multiindex, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a bit awkward at the moment because |
||
sort) | ||
else: | ||
result = getattr(self._multiindex, op_name)(other._multiindex) | ||
result_name = get_op_result_name(self, other) | ||
|
||
# GH 19101: ensure empty results have correct dtype | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -2798,10 +2798,18 @@ def intersection(self, other): | |||||||||||||||||
return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, | ||||||||||||||||||
names=result_names) | ||||||||||||||||||
|
||||||||||||||||||
def difference(self, other): | ||||||||||||||||||
def difference(self, other, sort=True): | ||||||||||||||||||
""" | ||||||||||||||||||
Compute sorted set difference of two MultiIndex objects | ||||||||||||||||||
|
||||||||||||||||||
Parameters | ||||||||||||||||||
---------- | ||||||||||||||||||
other : MultiIndex | ||||||||||||||||||
sort : bool, default True | ||||||||||||||||||
Sort the resulting MultiIndex if possible | ||||||||||||||||||
|
||||||||||||||||||
.. versionadded:: 0.24.0 | ||||||||||||||||||
|
||||||||||||||||||
Returns | ||||||||||||||||||
------- | ||||||||||||||||||
diff : MultiIndex | ||||||||||||||||||
|
@@ -2817,8 +2825,16 @@ def difference(self, other): | |||||||||||||||||
labels=[[]] * self.nlevels, | ||||||||||||||||||
names=result_names, verify_integrity=False) | ||||||||||||||||||
|
||||||||||||||||||
difference = sorted(set(self._ndarray_values) - | ||||||||||||||||||
set(other._ndarray_values)) | ||||||||||||||||||
this = self._get_unique_index() | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The old way of doing this using pandas/pandas/core/indexes/base.py Lines 2950 to 2957 in 145c227
|
||||||||||||||||||
|
||||||||||||||||||
indexer = this.get_indexer(other) | ||||||||||||||||||
indexer = indexer.take((indexer != -1).nonzero()[0]) | ||||||||||||||||||
|
||||||||||||||||||
label_diff = np.setdiff1d(np.arange(this.size), indexer, | ||||||||||||||||||
assume_unique=True) | ||||||||||||||||||
difference = this.values.take(label_diff) | ||||||||||||||||||
if sort: | ||||||||||||||||||
difference = sorted(difference) | ||||||||||||||||||
|
||||||||||||||||||
if len(difference) == 0: | ||||||||||||||||||
return MultiIndex(levels=[[]] * self.nlevels, | ||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -209,47 +209,55 @@ def test_intersection_bug_1708(self): | |
assert len(result) == 0 | ||
|
||
@pytest.mark.parametrize("tz", tz) | ||
def test_difference(self, tz): | ||
rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) | ||
@pytest.mark.parametrize("sort", [True, False]) | ||
def test_difference(self, tz, sort): | ||
rng_dates = ['1/2/2000', '1/3/2000', '1/1/2000', '1/4/2000', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wanted to ensure that the |
||
'1/5/2000'] | ||
|
||
rng1 = pd.DatetimeIndex(rng_dates, tz=tz) | ||
other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) | ||
expected1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) | ||
expected1 = pd.DatetimeIndex(rng_dates, tz=tz) | ||
|
||
rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) | ||
rng2 = pd.DatetimeIndex(rng_dates, tz=tz) | ||
other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) | ||
expected2 = pd.date_range('1/1/2000', freq='D', periods=3, tz=tz) | ||
expected2 = pd.DatetimeIndex(rng_dates[:3], tz=tz) | ||
|
||
rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) | ||
rng3 = pd.DatetimeIndex(rng_dates, tz=tz) | ||
other3 = pd.DatetimeIndex([], tz=tz) | ||
expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) | ||
expected3 = pd.DatetimeIndex(rng_dates, tz=tz) | ||
|
||
for rng, other, expected in [(rng1, other1, expected1), | ||
(rng2, other2, expected2), | ||
(rng3, other3, expected3)]: | ||
result_diff = rng.difference(other) | ||
result_diff = rng.difference(other, sort) | ||
if sort: | ||
expected = expected.sort_values() | ||
tm.assert_index_equal(result_diff, expected) | ||
|
||
def test_difference_freq(self): | ||
@pytest.mark.parametrize("sort", [True, False]) | ||
def test_difference_freq(self, sort): | ||
# GH14323: difference of DatetimeIndex should not preserve frequency | ||
|
||
index = date_range("20160920", "20160925", freq="D") | ||
other = date_range("20160921", "20160924", freq="D") | ||
expected = DatetimeIndex(["20160920", "20160925"], freq=None) | ||
idx_diff = index.difference(other) | ||
idx_diff = index.difference(other, sort) | ||
tm.assert_index_equal(idx_diff, expected) | ||
tm.assert_attr_equal('freq', idx_diff, expected) | ||
|
||
other = date_range("20160922", "20160925", freq="D") | ||
idx_diff = index.difference(other) | ||
idx_diff = index.difference(other, sort) | ||
expected = DatetimeIndex(["20160920", "20160921"], freq=None) | ||
tm.assert_index_equal(idx_diff, expected) | ||
tm.assert_attr_equal('freq', idx_diff, expected) | ||
|
||
def test_datetimeindex_diff(self): | ||
@pytest.mark.parametrize("sort", [True, False]) | ||
def test_datetimeindex_diff(self, sort): | ||
dti1 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), | ||
periods=100) | ||
dti2 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), | ||
periods=98) | ||
assert len(dti1.difference(dti2)) == 2 | ||
assert len(dti1.difference(dti2, sort)) == 2 | ||
|
||
def test_datetimeindex_union_join_empty(self): | ||
dti = DatetimeIndex(start='1/1/2001', end='2/1/2001', freq='D') | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`Index.difference` tries to sort its result by default, which means that the order of the columns was sometimes changed from that of the original DataFrame. I added a new `sort` parameter to `Index.difference`, with a default of True, to control this.