Skip to content

Commit a2dd4f0

Browse files
harisbalPingviinituutti
authored andcommitted
ENH: Allow for join between two multi-index dataframe instances (pandas-dev#20356)
1 parent f3b1073 commit a2dd4f0

File tree

5 files changed

+866
-554
lines changed

5 files changed

+866
-554
lines changed

Diff for: doc/source/whatsnew/v0.24.0.rst

+41
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,47 @@ array, but rather an ``ExtensionArray``:
183183
This is the same behavior as ``Series.values`` for categorical data. See
184184
:ref:`whatsnew_0240.api_breaking.interval_values` for more.
185185

186+
.. _whatsnew_0240.enhancements.join_with_two_multiindexes:
187+
188+
Joining with two multi-indexes
189+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
190+
191+
:func:`DataFrame.merge` and :func:`DataFrame.join` can now be used to join multi-indexed ``DataFrame`` instances on the overlapping index levels (:issue:`6360`)
192+
193+
See the :ref:`Merge, join, and concatenate
194+
<merging.Join_with_two_multi_indexes>` documentation section.
195+
196+
.. ipython:: python
197+
198+
index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
199+
('K1', 'X2')],
200+
names=['key', 'X'])
201+
202+
203+
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
204+
'B': ['B0', 'B1', 'B2']},
205+
index=index_left)
206+
207+
208+
index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
209+
('K2', 'Y2'), ('K2', 'Y3')],
210+
names=['key', 'Y'])
211+
212+
213+
right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
214+
'D': ['D0', 'D1', 'D2', 'D3']},
215+
index=index_right)
216+
217+
218+
left.join(right)
219+
220+
In earlier versions, the same result can be obtained with the following.
221+
222+
.. ipython:: python
223+
224+
pd.merge(left.reset_index(), right.reset_index(),
225+
on=['key'], how='inner').set_index(['key', 'X', 'Y'])
226+
186227
.. _whatsnew_0240.enhancements.rename_axis:
187228

188229
Renaming names in a MultiIndex

Diff for: pandas/core/indexes/base.py

+62-36
Original file line numberDiff line numberDiff line change
@@ -3168,8 +3168,8 @@ def get_value(self, series, key):
31683168
iloc = self.get_loc(key)
31693169
return s[iloc]
31703170
except KeyError:
3171-
if (len(self) > 0
3172-
and (self.holds_integer() or self.is_boolean())):
3171+
if (len(self) > 0 and
3172+
(self.holds_integer() or self.is_boolean())):
31733173
raise
31743174
elif is_integer(key):
31753175
return s[key]
@@ -3957,46 +3957,72 @@ def join(self, other, how='left', level=None, return_indexers=False,
39573957

39583958
def _join_multi(self, other, how, return_indexers=True):
39593959
from .multi import MultiIndex
3960+
from pandas.core.reshape.merge import _restore_dropped_levels_multijoin
3961+
3962+
# figure out join names
3963+
self_names = set(com._not_none(*self.names))
3964+
other_names = set(com._not_none(*other.names))
3965+
overlap = self_names & other_names
3966+
3967+
# need at least 1 in common
3968+
if not overlap:
3969+
raise ValueError("cannot join with no overlapping index names")
3970+
39603971
self_is_mi = isinstance(self, MultiIndex)
39613972
other_is_mi = isinstance(other, MultiIndex)
39623973

3963-
# figure out join names
3964-
self_names = com._not_none(*self.names)
3965-
other_names = com._not_none(*other.names)
3966-
overlap = list(set(self_names) & set(other_names))
3967-
3968-
# need at least 1 in common, but not more than 1
3969-
if not len(overlap):
3970-
raise ValueError("cannot join with no level specified and no "
3971-
"overlapping names")
3972-
if len(overlap) > 1:
3973-
raise NotImplementedError("merging with more than one level "
3974-
"overlap on a multi-index is not "
3975-
"implemented")
3976-
jl = overlap[0]
3974+
if self_is_mi and other_is_mi:
3975+
3976+
# Drop the non-matching levels from left and right respectively
3977+
ldrop_names = list(self_names - overlap)
3978+
rdrop_names = list(other_names - overlap)
3979+
3980+
self_jnlevels = self.droplevel(ldrop_names)
3981+
other_jnlevels = other.droplevel(rdrop_names)
3982+
3983+
# Join left and right
3984+
# Join on same leveled multi-index frames is supported
3985+
join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
3986+
return_indexers=True)
3987+
3988+
# Restore the dropped levels
3989+
# Returned index level order is
3990+
# common levels, ldrop_names, rdrop_names
3991+
dropped_names = ldrop_names + rdrop_names
3992+
3993+
levels, labels, names = (
3994+
_restore_dropped_levels_multijoin(self, other,
3995+
dropped_names,
3996+
join_idx,
3997+
lidx, ridx))
39773998

3999+
# Re-create the multi-index
4000+
multi_join_idx = MultiIndex(levels=levels, labels=labels,
4001+
names=names, verify_integrity=False)
4002+
4003+
multi_join_idx = multi_join_idx.remove_unused_levels()
4004+
4005+
return multi_join_idx, lidx, ridx
4006+
4007+
jl = list(overlap)[0]
4008+
4009+
# Case where only one index is multi
39784010
# make the indices into mi's that match
3979-
if not (self_is_mi and other_is_mi):
3980-
3981-
flip_order = False
3982-
if self_is_mi:
3983-
self, other = other, self
3984-
flip_order = True
3985-
# flip if join method is right or left
3986-
how = {'right': 'left', 'left': 'right'}.get(how, how)
3987-
3988-
level = other.names.index(jl)
3989-
result = self._join_level(other, level, how=how,
3990-
return_indexers=return_indexers)
3991-
3992-
if flip_order:
3993-
if isinstance(result, tuple):
3994-
return result[0], result[2], result[1]
3995-
return result
4011+
flip_order = False
4012+
if self_is_mi:
4013+
self, other = other, self
4014+
flip_order = True
4015+
# flip if join method is right or left
4016+
how = {'right': 'left', 'left': 'right'}.get(how, how)
4017+
4018+
level = other.names.index(jl)
4019+
result = self._join_level(other, level, how=how,
4020+
return_indexers=return_indexers)
39964021

3997-
# 2 multi-indexes
3998-
raise NotImplementedError("merging with both multi-indexes is not "
3999-
"implemented")
4022+
if flip_order:
4023+
if isinstance(result, tuple):
4024+
return result[0], result[2], result[1]
4025+
return result
40004026

40014027
def _join_non_unique(self, other, how='left', return_indexers=False):
40024028
from pandas.core.reshape.merge import _get_join_indexers

Diff for: pandas/core/reshape/merge.py

+89
Original file line numberDiff line numberDiff line change
@@ -1122,6 +1122,95 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
11221122
return join_func(lkey, rkey, count, **kwargs)
11231123

11241124

1125+
def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
                                      join_index, lindexer, rindexer):
    """
    *this is an internal non-public method*

    Build the levels, labels and names of the index produced by a
    multi-index to multi-index join.

    The join itself is performed on the overlapping levels only; this
    function appends the levels that were dropped beforehand, using
    ``lindexer`` / ``rindexer`` to line their labels up with the rows of
    the joined index.

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names, in the order they should be
        appended to the result
    join_index : Index or MultiIndex
        the index of the join between the common levels of left and right
    lindexer : intp array or None
        left indexer; ``None`` is treated as "every position of left"
    rindexer : intp array or None
        right indexer; ``None`` is treated as "every position of right"

    Returns
    -------
    levels : list of Index
        levels of the combined index (common levels first, then the
        restored ones)
    labels : list of intp array
        labels of the combined index, one entry per level
    names : str array
        names of the combined index, one entry per level
    """
    # When only one level overlaps, the join yields a flat Index rather
    # than a MultiIndex. Wrap it so the dropped levels can be appended.
    if not isinstance(join_index, MultiIndex):
        join_index = MultiIndex.from_arrays([join_index.values],
                                            names=[join_index.name])

    out_levels = join_index.levels
    out_labels = join_index.labels
    out_names = join_index.names

    # A missing indexer means the join kept every row of that side, in
    # order, so fall back to the identity positions.
    left_positions = range(left.size) if lindexer is None else lindexer
    right_positions = range(right.size) if rindexer is None else rindexer

    for level_name in dropped_level_names:
        # Each dropped level is taken from left if left has a level of
        # that name, otherwise from right.
        if level_name in left.names:
            source, positions = left, left_positions
        else:
            source, positions = right, right_positions

        level_pos = source.names.index(level_name)
        level_values = source.levels[level_pos]

        # Positions equal to -1 (no match on this side) become label -1,
        # i.e. a missing value in the restored level.
        realigned = algos.take_nd(source.labels[level_pos], positions,
                                  fill_value=-1)

        out_levels = out_levels + [level_values]
        out_labels = out_labels + [realigned]
        out_names = out_names + [level_name]

    return out_levels, out_labels, out_names
1212+
1213+
11251214
class _OrderedMerge(_MergeOperation):
11261215
_merge_type = 'ordered_merge'
11271216

0 commit comments

Comments
 (0)