Skip to content

BUG: string slicing on MultiIndex DatetimeIndex level #42476

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,8 @@ Indexing
^^^^^^^^
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, where the return type depended on whether the index was monotonic (:issue:`24892`)
- Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` when the object's Index has a length greater than one but only one unique value (:issue:`42365`)
- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
-

Missing
^^^^^^^
Expand Down
11 changes: 0 additions & 11 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
DtypeObj,
F,
Shape,
T,
npt,
)
from pandas.compat.numpy import function as nv
Expand Down Expand Up @@ -3719,16 +3718,6 @@ def _filter_indexer_tolerance(
# --------------------------------------------------------------------
# Indexer Conversion Methods

def _get_partial_string_timestamp_match_key(self, key: T) -> T:
"""
Translate any partial string timestamp matches in key, returning the
new key.

Only relevant for MultiIndex. This implementation performs no
translation and hands ``key`` back untouched.

Parameters
----------
key : T
    The indexer to (potentially) translate.

Returns
-------
T
    The same ``key`` object that was passed in.
"""
# GH#10331
# Nothing to translate here: return the key as-is.
return key

@final
def _validate_positional_slice(self, key: slice) -> None:
"""
Expand Down
74 changes: 41 additions & 33 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2581,35 +2581,6 @@ def _get_indexer_level_0(self, target) -> np.ndarray:
ci = Index(cat)
return ci.get_indexer_for(target)

def _get_partial_string_timestamp_match_key(self, key):
"""
Translate any partial string timestamp matches in key, returning the
new key.

Only relevant for MultiIndex.

The translation happens in two steps:

1. A bare string key is expanded to a tuple with one entry per level
   (the string followed by ``slice(None)`` for every remaining level),
   but only when the first level supports partial string indexing.
2. In a tuple key, every string component whose corresponding level
   supports partial string indexing is replaced by
   ``slice(component, component, None)``; all other components are kept.

Parameters
----------
key : object
    The indexer to translate. Only ``str`` and ``tuple`` keys are
    ever modified.

Returns
-------
object
    The translated key: a new tuple when ``key`` was a tuple (or a
    string that was expanded in step 1), otherwise ``key`` unchanged.
"""
# GH#10331
if isinstance(key, str) and self.levels[0]._supports_partial_string_indexing:
# Convert key '2016-01-01' to
# ('2016-01-01'[, slice(None, None, None)]+)
key = (key,) + (slice(None),) * (len(self.levels) - 1)

# NOTE: this is a separate `if`, not `elif`, so a string expanded above
# also goes through the per-component conversion below.
if isinstance(key, tuple):
# Convert (..., '2016-01-01', ...) in tuple to
# (..., slice('2016-01-01', '2016-01-01', None), ...)
new_key = []
# NOTE(review): self.levels[i] assumes the tuple has no more entries
# than there are levels -- confirm callers guarantee this.
for i, component in enumerate(key):
if (
isinstance(component, str)
and self.levels[i]._supports_partial_string_indexing
):
new_key.append(slice(component, component, None))
else:
new_key.append(component)
key = tuple(new_key)

return key

def get_slice_bound(
self, label: Hashable | Sequence[Hashable], side: str, kind: str | None = None
) -> int:
Expand Down Expand Up @@ -2858,7 +2829,12 @@ def _maybe_to_slice(loc):
)

if keylen == self.nlevels and self.is_unique:
return self._engine.get_loc(key)
try:
return self._engine.get_loc(key)
except TypeError:
# e.g. partial string slicing
loc, _ = self.get_loc_level(key, list(range(self.nlevels)))
return loc

# -- partial selection or non-unique index
# break the key into 2 parts based on the lexsort_depth of the index;
Expand Down Expand Up @@ -3008,6 +2984,10 @@ def maybe_mi_droplevels(indexer, levels):
return (self._engine.get_loc(key), None)
except KeyError as err:
raise KeyError(key) from err
except TypeError:
# e.g. partial string indexing
# test_partial_string_timestamp_multiindex
pass

# partial selection
indexer = self.get_loc(key)
Expand All @@ -3019,7 +2999,19 @@ def maybe_mi_droplevels(indexer, levels):

# TODO: in some cases we still need to drop some levels,
# e.g. test_multiindex_perf_warn
ilevels = []
# test_partial_string_timestamp_multiindex
ilevels = [
i
for i in range(len(key))
if (
not isinstance(key[i], str)
or not self.levels[i]._supports_partial_string_indexing
)
and key[i] != slice(None, None)
]
if len(ilevels) == self.nlevels:
# TODO: explain why no levels are dropped when every level would qualify
ilevels = []
return indexer, maybe_mi_droplevels(indexer, ilevels)

else:
Expand Down Expand Up @@ -3060,6 +3052,16 @@ def maybe_mi_droplevels(indexer, levels):
return indexer, maybe_mi_droplevels(indexer, ilevels)
else:
indexer = self._get_level_indexer(key, level=level)
if (
isinstance(key, str)
and self.levels[level]._supports_partial_string_indexing
):
# check to see if we did an exact lookup vs sliced
check = self.levels[level].get_loc(key)
if not is_integer(check):
# e.g. test_partial_string_timestamp_multiindex
return indexer, self[indexer]

return indexer, maybe_mi_droplevels(indexer, [level])

def _get_level_indexer(
Expand Down Expand Up @@ -3157,15 +3159,21 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):

if level > 0 or self._lexsort_depth == 0:
# Desired level is not sorted
if isinstance(idx, slice):
locs = (level_codes >= idx.start) & (level_codes < idx.stop)
return locs

locs = np.array(level_codes == idx, dtype=bool, copy=False)
if not locs.any():
# The label is present in self.levels[level] but unused:
raise KeyError(key)
return locs

if isinstance(idx, slice):
start = idx.start
end = idx.stop
# e.g. test_partial_string_timestamp_multiindex
start = level_codes.searchsorted(idx.start, side="left")
# NB: "left" here because idx.stop is exclusive (slice semantics)
end = level_codes.searchsorted(idx.stop, side="left")
else:
start = level_codes.searchsorted(idx, side="left")
end = level_codes.searchsorted(idx, side="right")
Expand Down
7 changes: 0 additions & 7 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1129,12 +1129,6 @@ def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
try:
# fast path for series or for tup devoid of slices
return self._get_label(tup, axis=axis)
except TypeError as err:
# slices are unhashable
# FIXME: this raises when we have a DatetimeIndex first level and a
# string for the first tup entry
# see test_partial_slicing_with_multiindex
raise IndexingError("No label returned") from err

except KeyError as ek:
# raise KeyError if number of indexers match
Expand All @@ -1149,7 +1143,6 @@ def _getitem_axis(self, key, axis: int):
key = list(key)

labels = self.obj._get_axis(axis)
key = labels._get_partial_string_timestamp_match_key(key)

if isinstance(key, slice):
self._validate_key(key, axis)
Expand Down
10 changes: 4 additions & 6 deletions pandas/tests/indexes/datetimes/test_partial_slicing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
date_range,
)
import pandas._testing as tm
from pandas.core.indexing import IndexingError


class TestSlicing:
Expand Down Expand Up @@ -337,11 +336,10 @@ def test_partial_slicing_with_multiindex(self):
result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")]
tm.assert_series_equal(result, expected)

# this is an IndexingError as we don't do partial string selection on
# multi-levels.
msg = "Too many indexers"
with pytest.raises(IndexingError, match=msg):
df_multi.loc[("2013-06-19", "ACCT1", "ABC")]
# partial string indexing on first level, scalar indexing on the other two
result = df_multi.loc[("2013-06-19", "ACCT1", "ABC")]
expected = df_multi.iloc[:1].droplevel([1, 2])
tm.assert_frame_equal(result, expected)

def test_partial_slicing_with_multiindex_series(self):
# GH 4294
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/indexes/multi/test_partial_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,9 @@ def test_partial_string_timestamp_multiindex(df):

# partial string match on date and hour, from middle
result = df.loc["2016-01-02 12"]
expected = df.iloc[9:12]
# hourly resolution, same as index.levels[0], so we are _not_ slicing on
# that level, so that level gets dropped
expected = df.iloc[9:12].droplevel(0)
tm.assert_frame_equal(result, expected)

# partial string match on secondary index
Expand All @@ -81,11 +83,14 @@ def test_partial_string_timestamp_multiindex(df):
tm.assert_frame_equal(result, expected)

# tuple selector with partial string match on date
# "2016-01-01" has daily resolution, so _is_ a slice on the first level.
result = df.loc[("2016-01-01", "a"), :]
expected = df.iloc[[0, 3]]
expected = df.iloc[[0, 3]].droplevel(1)
tm.assert_frame_equal(result, expected)

# Slicing date on first level should break (of course)
# Slicing date on first level should break (of course) because the
# DatetimeIndex is the second level on df_swap
with pytest.raises(KeyError, match="'2016-01-01'"):
df_swap.loc["2016-01-01"]

Expand Down
20 changes: 11 additions & 9 deletions pandas/tests/indexing/multiindex/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,15 +554,17 @@ def test_loc_period_string_indexing():
),
)
result = df.loc[("2013Q1", 1111), "OMS"]
expected = Series(
[np.nan],
dtype=object,
name="OMS",
index=MultiIndex.from_tuples(
[(pd.Period("2013Q1"), 1111)], names=["Period", "CVR"]
),
)
tm.assert_series_equal(result, expected)

alt = df.loc[(a[0], 1111), "OMS"]
assert np.isnan(alt)

# Because the resolution of the string matches, it is an exact lookup,
# not a slice
assert np.isnan(result)

# TODO: should .loc also resolve the un-nested key below to the same scalar?
# alt = df.loc["2013Q1", 1111, "OMS"]
# assert np.isnan(alt)


def test_loc_datetime_mask_slicing():
Expand Down