From 4be0c0b6719a6b21345b22bbf9d1564f4068ab21 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 15 Feb 2014 17:56:14 -0500 Subject: [PATCH] PERF: change Series indexing on multi-indexes to use a fast path like DataFrame uses --- doc/source/release.rst | 1 + pandas/core/common.py | 10 ++++++++++ pandas/core/frame.py | 2 +- pandas/core/generic.py | 11 ++++++++--- pandas/core/indexing.py | 10 ++++++++-- pandas/core/internals.py | 8 +++++++- pandas/core/series.py | 20 -------------------- vb_suite/indexing.py | 19 +++++++++++++++++++ vb_suite/join_merge.py | 2 +- 9 files changed, 55 insertions(+), 28 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 5be88f2c1fd6c..a0f6e8571fc34 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -81,6 +81,7 @@ Improvements to existing features - Allow multi-index slicers (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :issue:`5641`) - improve performance of slice indexing on Series with string keys (:issue:`6341`) - implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`) +- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`) .. 
_release.bug_fixes-0.14.0: diff --git a/pandas/core/common.py b/pandas/core/common.py index 9f993fc9e0f32..785c1f45db607 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1557,6 +1557,16 @@ def _maybe_box(indexer, values, obj, key): # return the value return values +def _maybe_box_datetimelike(value): + # turn a datetime like into a Timestamp/timedelta as needed + + if isinstance(value, np.datetime64): + value = tslib.Timestamp(value) + elif isinstance(value, np.timedelta64): + pass + + return value + _values_from_object = lib.values_from_object def _possibly_convert_objects(values, convert_dates=True, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index de8bac05f211f..9f9af187d21dd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1588,7 +1588,7 @@ def _ixs(self, i, axis=0, copy=False): result = self.reindex(i, takeable=True) copy=True else: - new_values, copy = self._data.fast_2d_xs(i, copy=copy) + new_values, copy = self._data.fast_xs(i, copy=copy) result = Series(new_values, index=self.columns, name=self.index[i], dtype=new_values.dtype) result._set_is_copy(self, copy=copy) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 03e16f243836a..fdec1d2955e90 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -21,8 +21,8 @@ from pandas import compat, _np_version_under1p7 from pandas.compat import map, zip, lrange, string_types, isidentifier from pandas.core.common import (isnull, notnull, is_list_like, - _values_from_object, _maybe_promote, ABCSeries, - SettingWithCopyError, SettingWithCopyWarning) + _values_from_object, _maybe_promote, _maybe_box_datetimelike, + ABCSeries, SettingWithCopyError, SettingWithCopyWarning) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution from pandas.core import config @@ -1304,7 +1304,12 @@ def xs(self, key, axis=0, level=None, copy=True, drop_level=True): if np.isscalar(loc): from pandas import Series - new_values, copy 
= self._data.fast_2d_xs(loc, copy=copy) + new_values, copy = self._data.fast_xs(loc, copy=copy) + + # may need to box a datelike-scalar + if not is_list_like(new_values): + return _maybe_box_datetimelike(new_values) + result = Series(new_values, index=self.columns, name=self.index[loc]) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ea7214d4cd020..830051ed41d44 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -70,9 +70,15 @@ def __getitem__(self, key): return self._getitem_axis(key, axis=0) def _get_label(self, label, axis=0): - # ueber-hack if self.ndim == 1: - return self.obj[label] + # for perf reasons we want to try _xs first + # as it's basically direct indexing + # but will fail when the index is not present + # see GH5567 + try: + return self.obj._xs(label, axis=axis, copy=False) + except: + return self.obj[label] elif (isinstance(label, tuple) and isinstance(label[axis], slice)): raise IndexingError('no slices here, handle elsewhere') diff --git a/pandas/core/internals.py b/pandas/core/internals.py index b83e7df746c5a..d09191ce53868 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2829,7 +2829,7 @@ def xs(self, key, axis=1, copy=True, takeable=False): return self.__class__(new_blocks, new_axes) - def fast_2d_xs(self, loc, copy=False): + def fast_xs(self, loc, copy=False): """ get a cross sectional for a given location in the items ; handle dups @@ -3757,6 +3757,12 @@ def _consolidate_check(self): def _consolidate_inplace(self): pass + def fast_xs(self, loc, copy=False): + """ + fast path for getting a cross-section + """ + result = self._block.values[loc] + return result, False def construction_error(tot_items, block_shape, axes, e=None): """ raise a helpful message about our construction """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 300da3dc6834d..692c7ac072edc 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -427,26 +427,6 @@ def 
_unpickle_series_compat(self, state): def axes(self): return [self.index] - def _maybe_box(self, values): - """ genericically box the values """ - - if isinstance(values, self.__class__): - return values - elif not hasattr(values, '__iter__'): - v = lib.infer_dtype([values]) - if v == 'datetime': - return lib.Timestamp(v) - return values - - v = lib.infer_dtype(values) - if v == 'datetime': - return lib.map_infer(values, lib.Timestamp) - - if isinstance(values, np.ndarray): - return self.__class__(values) - - return values - def _ixs(self, i, axis=0): """ Return the i-th value or values in the Series by location diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index 18838b06af756..cc96c86d3cb81 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -91,6 +91,25 @@ name='indexing_frame_get_value', start_date=datetime(2011, 11, 12)) +setup = common_setup + """ +mi = MultiIndex.from_tuples([(x,y) for x in range(1000) for y in range(1000)]) +s = Series(np.random.randn(1000000), index=mi) +""" + +series_xs_mi_ix = Benchmark("s.ix[999]", setup, + name='series_xs_mi_ix', + start_date=datetime(2013, 1, 1)) + +setup = common_setup + """ +mi = MultiIndex.from_tuples([(x,y) for x in range(1000) for y in range(1000)]) +s = Series(np.random.randn(1000000), index=mi) +df = DataFrame(s) +""" + +frame_xs_mi_ix = Benchmark("df.ix[999]", setup, + name='frame_xs_mi_ix', + start_date=datetime(2013, 1, 1)) + #---------------------------------------------------------------------- # Boolean DataFrame row selection diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py index aa883443f7d9d..b60009cd272bb 100644 --- a/vb_suite/join_merge.py +++ b/vb_suite/join_merge.py @@ -220,5 +220,5 @@ def sample(values, k): """ join_non_unique_equal = Benchmark('fracofday * temp[fracofday.index]', setup, - start_date=datetime(2013 1, 1)) + start_date=datetime(2013, 1, 1))