Skip to content

Index union sort hack #20809

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,7 @@ Other API Changes
- ``KeyError`` now raises instead of ``ValueError`` in :meth:`~DataFrame.drop`, :meth:`~Panel.drop`, :meth:`~Series.drop`, :meth:`~Index.drop` when dropping a non-existent element in an axis with duplicates (:issue:`19186`)
- :func:`Series.to_csv` now accepts a ``compression`` argument that works in the same way as the ``compression`` argument in :func:`DataFrame.to_csv` (:issue:`18958`)
- Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`)
- Bug in the order of the result of ``Index.union()`` when indexes contain tuples (:issue:`17376`)
- :class:`DateOffset` objects render more simply, e.g. ``<DateOffset: days=1>`` instead of ``<DateOffset: kwds={'days': 1}>`` (:issue:`19403`)
- ``Categorical.fillna`` now validates its ``value`` and ``method`` keyword arguments. It now raises when both or none are specified, matching the behavior of :meth:`Series.fillna` (:issue:`19682`)
- ``pd.to_datetime('today')`` now returns a datetime, consistent with ``pd.Timestamp('today')``; previously ``pd.to_datetime('today')`` returned a ``.normalized()`` datetime (:issue:`19935`)
Expand Down
27 changes: 9 additions & 18 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2749,35 +2749,26 @@ def union(self, other):
value_set = set(lvals)
result.extend([x for x in rvals if x not in value_set])
else:
indexer = self.get_indexer(other)
indexer, = (indexer == -1).nonzero()

indexer = np.where(self.get_indexer(other) == -1)[0]
if len(indexer) > 0:
other_diff = algos.take_nd(rvals, indexer,
allow_fill=False)
result = _concat._concat_compat((lvals, other_diff))

# Workaround failure of #17378 on Windows
try:
lvals[0] < other_diff[0]
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning,
stacklevel=3)
else:
types = frozenset((self.inferred_type,
other.inferred_type))
if not types & _unsortable_types:
result.sort()

pass
else:
result = lvals

try:
result = np.sort(result)
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning,
stacklevel=3)
try:
result = sorting.safe_sort(result)
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning,
stacklevel=3)

# for subclasses
return self._wrap_union_result(other, result)
Expand Down
54 changes: 20 additions & 34 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,8 +821,7 @@ def test_union(self):
expected = Index(list('ab'), name='A')
tm.assert_index_equal(union, expected)

with tm.assert_produces_warning(RuntimeWarning):
firstCat = self.strIndex.union(self.dateIndex)
firstCat = self.strIndex.union(self.dateIndex)
secondCat = self.strIndex.union(self.strIndex)

if self.dateIndex.dtype == np.object_:
Expand Down Expand Up @@ -1518,28 +1517,25 @@ def test_drop_tuple(self, values, to_drop):
pytest.raises(KeyError, removed.drop, drop_me)

def test_tuple_union_bug(self):
import pandas
import numpy as np

aidx1 = np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')],
dtype=[('num', int), ('let', 'a1')])
aidx2 = np.array([(1, 'A'), (2, 'A'), (1, 'B'),
(2, 'B'), (1, 'C'), (2, 'C')],
dtype=[('num', int), ('let', 'a1')])

idx1 = pandas.Index(aidx1)
idx2 = pandas.Index(aidx2)
idx1 = Index(aidx1)
idx2 = Index(aidx2)

# intersection broken?
# intersection
int_idx = idx1.intersection(idx2)
expected = idx1 # pandas.Index(sorted(set(idx1) & set(idx2)))
# needs to be 1d like idx1 and idx2
expected = idx1[:4] # pandas.Index(sorted(set(idx1) & set(idx2)))
assert int_idx.ndim == 1
tm.assert_index_equal(int_idx, expected)

# union broken
# GH 17376 (union)
union_idx = idx1.union(idx2)
expected = idx2
expected = idx2.sort_values()
assert union_idx.ndim == 1
tm.assert_index_equal(union_idx, expected)

Expand Down Expand Up @@ -1728,13 +1724,19 @@ def test_outer_join_sort(self):
left_idx = Index(np.random.permutation(15))
right_idx = tm.makeDateIndex(10)

with tm.assert_produces_warning(RuntimeWarning):
if PY3:
with tm.assert_produces_warning(RuntimeWarning):
joined = left_idx.join(right_idx, how='outer')
else:
joined = left_idx.join(right_idx, how='outer')

# right_idx in this case because DatetimeIndex has join precedence over
# Int64Index
with tm.assert_produces_warning(RuntimeWarning):
expected = right_idx.astype(object).union(left_idx.astype(object))
if PY3:
with tm.assert_produces_warning(RuntimeWarning):
expected = right_idx.astype(object).union(left_idx)
else:
expected = right_idx.astype(object).union(left_idx)
tm.assert_index_equal(joined, expected)

def test_nan_first_take_datetime(self):
Expand Down Expand Up @@ -2133,10 +2135,7 @@ def test_copy_name(self):
s1 = Series(2, index=first)
s2 = Series(3, index=second[:-1])

warning_type = RuntimeWarning if PY3 else None
with tm.assert_produces_warning(warning_type):
# Python 3: Unorderable types
s3 = s1 * s2
s3 = s1 * s2

assert s3.index.name == 'mario'

Expand Down Expand Up @@ -2169,27 +2168,14 @@ def test_union_base(self):
first = idx[3:]
second = idx[:5]

if PY3:
with tm.assert_produces_warning(RuntimeWarning):
# unorderable types
result = first.union(second)
expected = Index(['b', 2, 'c', 0, 'a', 1])
tm.assert_index_equal(result, expected)
else:
result = first.union(second)
expected = Index(['b', 2, 'c', 0, 'a', 1])
tm.assert_index_equal(result, expected)
expected = Index([0, 1, 2, 'a', 'b', 'c'])
result = first.union(second)
tm.assert_index_equal(result, expected)

# GH 10149
cases = [klass(second.values)
for klass in [np.array, Series, list]]
for case in cases:
if PY3:
with tm.assert_produces_warning(RuntimeWarning):
# unorderable types
result = first.union(case)
assert tm.equalContents(result, idx)
else:
result = first.union(case)
assert tm.equalContents(result, idx)

Expand Down
20 changes: 4 additions & 16 deletions pandas/tests/series/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1472,24 +1472,12 @@ def test_operators_bitwise(self):
pytest.raises(TypeError, lambda: s_0123 & [0.1, 4, 3.14, 2])

# s_0123 will be all false now because of reindexing like s_tft
if compat.PY3:
# unable to sort incompatible object via .union.
exp = Series([False] * 7, index=['b', 'c', 'a', 0, 1, 2, 3])
with tm.assert_produces_warning(RuntimeWarning):
assert_series_equal(s_tft & s_0123, exp)
else:
exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
assert_series_equal(s_tft & s_0123, exp)
exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
assert_series_equal(s_tft & s_0123, exp)

# s_tft will be all false now because of reindexing like s_0123
if compat.PY3:
# unable to sort incompatible object via .union.
exp = Series([False] * 7, index=[0, 1, 2, 3, 'b', 'c', 'a'])
with tm.assert_produces_warning(RuntimeWarning):
assert_series_equal(s_0123 & s_tft, exp)
else:
exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
assert_series_equal(s_0123 & s_tft, exp)
exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
assert_series_equal(s_0123 & s_tft, exp)

assert_series_equal(s_0123 & False, Series([False] * 4))
assert_series_equal(s_0123 ^ False, Series([False, True, True, True]))
Expand Down