From a4057e5649e9d5488f1bc5d2ce8a22e342af6ae2 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 15:11:43 -0400 Subject: [PATCH 01/19] dev setup --- dev_attempts.py | 39 +++++++++++++++++++++++++++++++ pandas/core/frame.py | 55 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 dev_attempts.py diff --git a/dev_attempts.py b/dev_attempts.py new file mode 100644 index 0000000000000..b33a08cd82c63 --- /dev/null +++ b/dev_attempts.py @@ -0,0 +1,39 @@ +import pandas as pd +import numpy as np +import timeit +np.random.seed(43) +# also tested for n = 1000, 10_000, 100_000 +n=1_000_000 +cols = list('abcdef') +df = pd.DataFrame(np.random.randint(0, 10, size=(n,len(cols))), columns=cols) +df['col'] = np.random.choice(cols, n) +idx = df['col'].index.to_numpy() +cols = df['col'].to_numpy() + +def og_lookup(idx, cols): + return df.lookup(idx, cols,'og') + +# def melt_lookup(): +# melt = df.melt('col') +# melt = melt.loc[lambda x: x['col']==x['variable'], 'value'] +# melt = melt.reset_index(drop=True) +# return melt + +# def quan_lookup(idx,cols): +# return df.reindex(cols,axis=1).to_numpy()[np.arange(df.shape[0]), idx] + +# def quan_lookup2(idx,cols): +# return df.reindex(cols,axis=1).to_numpy()[np.arange(df.shape[0]), idx] + +# def marco_lookup(): +# return df.melt('col', ignore_index=False).query('col==variable')['value'].reindex(df.index).to_numpy() + + +timeit.timeit(lambda: og_lookup(idx,cols),number=10) +# timeit.timeit(lambda: melt_lookup(idx,cols),number=10) +# timeit.timeit(lambda: quan_lookup(idx,cols),number=10) +# timeit.timeit(lambda: quan_lookup2(idx,cols),number=10) +# timeit.timeit(lambda: marco_lookup(idx,cols),number=10) + +# idx, cols = pd.factorize(df['col']) +# df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8f65277f660f7..11e7be46e1ee9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -101,6 +101,7 @@ is_integer_dtype, is_iterator, is_list_like, + is_object_dtype, is_scalar, is_sequence, needs_i8_conversion, @@ -5135,6 +5136,60 @@ def _series(self): # ---------------------------------------------------------------------- # Reindexing and alignment + def lookup(self, row_labels, col_labels, dev_version) -> np.ndarray: + """ + Label-based "fancy indexing" function for DataFrame. + + + Given equal-length arrays of row and column labels, return an + array of the values corresponding to each (row, col) pair. + + + Parameters + ---------- + row_labels : sequence + The row labels to use for lookup. + col_labels : sequence + The column labels to use for lookup. + + + Returns + ------- + numpy.ndarray + The found values. + """ + n = len(row_labels) + if n != len(col_labels): + raise ValueError("Row labels must have same size as column labels") + if not (self.index.is_unique and self.columns.is_unique): + # GH#33041 + raise ValueError("DataFrame.lookup requires unique index and columns") + + + thresh = 1000 + if not self._is_mixed_type or n > thresh: + values = self.values + ridx = self.index.get_indexer(row_labels) + cidx = self.columns.get_indexer(col_labels) + if (ridx == -1).any(): + raise KeyError("One or more row labels was not found") + if (cidx == -1).any(): + raise KeyError("One or more column labels was not found") + flat_index = ridx * len(self.columns) + cidx + result = values.flat[flat_index] + else: + if dev_version=='og': + result = np.empty(n, dtype="O") + for i, (r, c) in enumerate(zip(row_labels, col_labels)): + result[i] = self._get_value(r, c) + + + if is_object_dtype(result): + result = lib.maybe_convert_objects(result) + + + return result + def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame: """ We are guaranteed non-Nones in the axes. From 0f5ad86223425eff63e73cd89c7c66155fe67a45 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 16:32:41 -0400 Subject: [PATCH 02/19] Update dev_attempts.py --- dev_attempts.py | 54 +++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/dev_attempts.py b/dev_attempts.py index b33a08cd82c63..833cfa5542271 100644 --- a/dev_attempts.py +++ b/dev_attempts.py @@ -2,38 +2,22 @@ import numpy as np import timeit np.random.seed(43) -# also tested for n = 1000, 10_000, 100_000 -n=1_000_000 -cols = list('abcdef') -df = pd.DataFrame(np.random.randint(0, 10, size=(n,len(cols))), columns=cols) -df['col'] = np.random.choice(cols, n) -idx = df['col'].index.to_numpy() -cols = df['col'].to_numpy() - -def og_lookup(idx, cols): - return df.lookup(idx, cols,'og') - -# def melt_lookup(): -# melt = df.melt('col') -# melt = melt.loc[lambda x: x['col']==x['variable'], 'value'] -# melt = melt.reset_index(drop=True) -# return melt - -# def quan_lookup(idx,cols): -# return df.reindex(cols,axis=1).to_numpy()[np.arange(df.shape[0]), idx] - -# def quan_lookup2(idx,cols): -# return df.reindex(cols,axis=1).to_numpy()[np.arange(df.shape[0]), idx] - -# def marco_lookup(): -# return df.melt('col', ignore_index=False).query('col==variable')['value'].reindex(df.index).to_numpy() - - -timeit.timeit(lambda: og_lookup(idx,cols),number=10) -# timeit.timeit(lambda: melt_lookup(idx,cols),number=10) -# timeit.timeit(lambda: quan_lookup(idx,cols),number=10) -# timeit.timeit(lambda: quan_lookup2(idx,cols),number=10) -# timeit.timeit(lambda: marco_lookup(idx,cols),number=10) - -# idx, cols = pd.factorize(df['col']) -# df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] +for n in [100,100_000]: + cols = list('abcdef') + df = pd.DataFrame(np.random.randint(0, 10, size=(n,len(cols))), columns=cols) + df['col'] = np.random.choice(cols, n) + idx = df['col'].index.to_numpy() + cols = df['col'].to_numpy() + timeit.timeit(lambda: df.lookup(idx, cols,'og'),number=10) + timeit.timeit(lambda: df.lookup(idx, cols,'a'),number=10) + timeit.timeit(lambda: df.lookup(idx, cols,'b'),number=10) + timeit.timeit(lambda: df.lookup(idx, cols,'c'),number=10) + df['a'] = df['a'].astype(str) + df['a'] = 'a' + print('mixed') + timeit.timeit(lambda: df.lookup(idx, cols,'og'),number=10) + timeit.timeit(lambda: df.lookup(idx, cols,'a'),number=10) + timeit.timeit(lambda: df.lookup(idx, cols,'b'),number=10) + timeit.timeit(lambda: df.lookup(idx, cols,'c'),number=10) + df.lookup(idx, cols,'b') + print('\n') \ No newline at end of file From 7e301815952e843ab53d32a1bc8fa458e863b243 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 16:33:04 -0400 Subject: [PATCH 03/19] removed mixed type and threshold --- pandas/core/frame.py | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 11e7be46e1ee9..e248a87b2a183 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5165,29 +5165,19 @@ def lookup(self, row_labels, col_labels, dev_version) -> np.ndarray: # GH#33041 raise ValueError("DataFrame.lookup requires unique index and columns") - - thresh = 1000 - if not self._is_mixed_type or n > thresh: - values = self.values - ridx = self.index.get_indexer(row_labels) - cidx = self.columns.get_indexer(col_labels) - if (ridx == -1).any(): - raise KeyError("One or more row labels was not found") - if (cidx == -1).any(): - raise KeyError("One or more column labels was not found") - flat_index = ridx * len(self.columns) + cidx - result = values.flat[flat_index] - else: - if dev_version=='og': - result = np.empty(n, dtype="O") - for i, (r, c) in enumerate(zip(row_labels, col_labels)): - result[i] = self._get_value(r, c) - + values = self.to_numpy() + ridx = self.index.get_indexer(row_labels) + cidx = self.columns.get_indexer(col_labels) + if (ridx == -1).any(): + raise KeyError("One or more row labels was not found") + if (cidx == -1).any(): + raise KeyError("One or more column labels was not found") + flat_index = ridx * len(self.columns) + cidx + result = values.flat[flat_index] if is_object_dtype(result): result = lib.maybe_convert_objects(result) - return result def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame: From 6fed58dc1cb83a9fdd89fc518d9eaf84fdc40c01 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 16:41:19 -0400 Subject: [PATCH 04/19] Delete dev_attempts.py --- dev_attempts.py | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 dev_attempts.py diff --git a/dev_attempts.py b/dev_attempts.py deleted file mode 100644 index 833cfa5542271..0000000000000 --- a/dev_attempts.py +++ /dev/null @@ -1,23 +0,0 @@ -import pandas as pd -import numpy as np -import timeit -np.random.seed(43) -for n in [100,100_000]: - cols = list('abcdef') - df = pd.DataFrame(np.random.randint(0, 10, size=(n,len(cols))), columns=cols) - df['col'] = np.random.choice(cols, n) - idx = df['col'].index.to_numpy() - cols = df['col'].to_numpy() - timeit.timeit(lambda: df.lookup(idx, cols,'og'),number=10) - timeit.timeit(lambda: df.lookup(idx, cols,'a'),number=10) - timeit.timeit(lambda: df.lookup(idx, cols,'b'),number=10) - timeit.timeit(lambda: df.lookup(idx, cols,'c'),number=10) - df['a'] = df['a'].astype(str) - df['a'] = 'a' - print('mixed') - timeit.timeit(lambda: df.lookup(idx, cols,'og'),number=10) - timeit.timeit(lambda: df.lookup(idx, cols,'a'),number=10) - timeit.timeit(lambda: df.lookup(idx, cols,'b'),number=10) - timeit.timeit(lambda: df.lookup(idx, cols,'c'),number=10) - df.lookup(idx, cols,'b') - print('\n') \ No newline at end of file From 8156c42b6b02b65ef8f65b3cad5f820533ca0af6 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 16:57:42 -0400 Subject: [PATCH 05/19] Update indexing.rst --- doc/source/user_guide/indexing.rst | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index ed5c7806b2e23..1a28589621acd 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1456,24 +1456,15 @@ default value. .. _indexing.lookup: -Looking up values by index/column labels +The :meth:`~pandas.DataFrame.lookup` method ---------------------------------------- + Sometimes you want to extract a set of values given a sequence of row labels + and column labels, and the ``lookup`` method allows for this and returns a + NumPy array. For instance: -Sometimes you want to extract a set of values given a sequence of row labels -and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing. -For instance: - -.. ipython:: python - - df = pd.DataFrame({'col': ["A", "A", "B", "B"], - 'A': [80, 23, np.nan, 22], - 'B': [80, 55, 76, 67]}) - df - idx, cols = pd.factorize(df['col']) - df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] - -Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method -which was deprecated in version 1.2.0 and removed in version 2.0.0. + .. ipython:: python + dflookup = pd.DataFrame(np.random.rand(20, 4), columns = ['A', 'B', 'C', 'D']) + dflookup.lookup(list(range(0, 10, 2)), ['B', 'C', 'A', 'B', 'D']) .. _indexing.class: From c17a020f494a4595f39b2f96e11a315a77a44b28 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 17:09:48 -0400 Subject: [PATCH 06/19] bringing tests back from 1.1.x --- pandas/tests/frame/indexing/test_indexing.py | 66 ++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 0c99b08cb30c4..89bb60f3cc114 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1414,6 +1414,72 @@ def test_loc_named_tuple_for_midx(self): ) tm.assert_frame_equal(result, expected) + def test_lookup_float(self, float_frame): + df = float_frame + rows = list(df.index) * len(df.columns) + cols = list(df.columns) * len(df.index) + result = df.lookup(rows, cols) + + expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)]) + tm.assert_numpy_array_equal(result, expected) + + def test_lookup_mixed(self, float_string_frame): + df = float_string_frame + rows = list(df.index) * len(df.columns) + cols = list(df.columns) * len(df.index) + result = df.lookup(rows, cols) + + expected = np.array( + [df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_ + ) + tm.assert_almost_equal(result, expected) + + def test_lookup_bool(self): + df = DataFrame( + { + "label": ["a", "b", "a", "c"], + "mask_a": [True, True, False, True], + "mask_b": [True, False, False, False], + "mask_c": [False, True, False, True], + } + ) + df["mask"] = df.lookup(df.index, "mask_" + df["label"]) + + exp_mask = np.array( + [df.loc[r, c] for r, c in zip(df.index, "mask_" + df["label"])] + ) + + tm.assert_series_equal(df["mask"], Series(exp_mask, name="mask")) + assert df["mask"].dtype == np.bool_ + + def test_lookup_raises(self, float_frame): + with pytest.raises(KeyError, match="'One or more row labels was not found'"): + float_frame.lookup(["xyz"], ["A"]) + + with pytest.raises(KeyError, match="'One or more column labels was not found'"): + float_frame.lookup([float_frame.index[0]], ["xyz"]) + + with pytest.raises(ValueError, match="same size"): + float_frame.lookup(["a", "b", "c"], ["a"]) + + def test_lookup_requires_unique_axes(self): + # GH#33041 raise with a helpful error message + df = DataFrame(np.random.Generator(6).reshape(3, 2), columns=["A", "A"]) + + rows = [0, 1] + cols = ["A", "A"] + + # homogeneous-dtype case + with pytest.raises(ValueError, match="requires unique index and columns"): + df.lookup(rows, cols) + with pytest.raises(ValueError, match="requires unique index and columns"): + df.T.lookup(cols, rows) + + # heterogeneous dtype + df["B"] = 0 + with pytest.raises(ValueError, match="requires unique index and columns"): + df.lookup(rows, cols) + @pytest.mark.parametrize("indexer", [["a"], "a"]) @pytest.mark.parametrize("col", [{}, {"b": 1}]) def test_set_2d_casting_date_to_int(self, col, indexer): From 4a0b85605d8e5e808caba5b805c5fe2333b3dd5d Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 17:30:57 -0400 Subject: [PATCH 07/19] extend underline --- doc/source/user_guide/indexing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 1a28589621acd..a3f7e03f692b4 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1457,7 +1457,7 @@ default value. .. _indexing.lookup: The :meth:`~pandas.DataFrame.lookup` method ----------------------------------------- +------------------------------------------- Sometimes you want to extract a set of values given a sequence of row labels and column labels, and the ``lookup`` method allows for this and returns a NumPy array. For instance: From e0b0b57ffcc5fe2031777d7b2054e9bb0c146246 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 17:34:20 -0400 Subject: [PATCH 08/19] spacing --- doc/source/user_guide/indexing.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index a3f7e03f692b4..be49f95fed26a 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1458,11 +1458,13 @@ default value. The :meth:`~pandas.DataFrame.lookup` method ------------------------------------------- - Sometimes you want to extract a set of values given a sequence of row labels - and column labels, and the ``lookup`` method allows for this and returns a - NumPy array. For instance: - .. ipython:: python +Sometimes you want to extract a set of values given a sequence of row labels +and column labels, and the ``lookup`` method allows for this and returns a +NumPy array. For instance: + +.. ipython:: python + dflookup = pd.DataFrame(np.random.rand(20, 4), columns = ['A', 'B', 'C', 'D']) dflookup.lookup(list(range(0, 10, 2)), ['B', 'C', 'A', 'B', 'D']) From 2a6dfaef0178feb176f3a0ca6a87f9126ec378b1 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 17:48:03 -0400 Subject: [PATCH 09/19] remove dev_version --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e248a87b2a183..e5820a8b533e8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5136,7 +5136,7 @@ def _series(self): # ---------------------------------------------------------------------- # Reindexing and alignment - def lookup(self, row_labels, col_labels, dev_version) -> np.ndarray: + def lookup(self, row_labels, col_labels) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. From 21280ed6df9a18f4bde74d7350af8c0d713a0756 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 26 Mar 2025 18:13:29 -0400 Subject: [PATCH 10/19] fixed test_lookup_requires_unique_axes np.random.Generator.random, not np.random.Generator --- pandas/tests/frame/indexing/test_indexing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 89bb60f3cc114..f79475b770302 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1464,7 +1464,9 @@ def test_lookup_raises(self, float_frame): def test_lookup_requires_unique_axes(self): # GH#33041 raise with a helpful error message - df = DataFrame(np.random.Generator(6).reshape(3, 2), columns=["A", "A"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 2)), columns=["A", "A"] + ) rows = [0, 1] cols = ["A", "A"] From 48f1cde41abe811bf7b88a993cbeea30b237e34a Mon Sep 17 00:00:00 2001 From: stevenae Date: Thu, 27 Mar 2025 12:19:02 -0400 Subject: [PATCH 11/19] Reduce columns to those in lookup --- pandas/core/frame.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e5820a8b533e8..7c864b36fea61 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5165,14 +5165,20 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: # GH#33041 raise ValueError("DataFrame.lookup requires unique index and columns") - values = self.to_numpy() ridx = self.index.get_indexer(row_labels) cidx = self.columns.get_indexer(col_labels) if (ridx == -1).any(): raise KeyError("One or more row labels was not found") if (cidx == -1).any(): raise KeyError("One or more column labels was not found") - flat_index = ridx * len(self.columns) + cidx + if len(set(col_labels)) < len(self.columns): + sub = self.take(np.unique(cidx), axis=1) + values = sub.to_numpy() + cidx = sub.columns.get_indexer(col_labels) + flat_index = ridx * len(sub.columns) + cidx + else: + values = self.to_numpy() + flat_index = ridx * len(self.columns) + cidx result = values.flat[flat_index] if is_object_dtype(result): From 9c060a8d2b0f585991bc0957fe2c30756b90aa28 Mon Sep 17 00:00:00 2001 From: stevenae Date: Thu, 27 Mar 2025 14:39:09 -0400 Subject: [PATCH 12/19] Update frame.py --- pandas/core/frame.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e5820a8b533e8..a59bdfbf1a76e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5165,14 +5165,22 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: # GH#33041 raise ValueError("DataFrame.lookup requires unique index and columns") - values = self.to_numpy() + # values = self.to_numpy() ridx = self.index.get_indexer(row_labels) cidx = self.columns.get_indexer(col_labels) if (ridx == -1).any(): raise KeyError("One or more row labels was not found") if (cidx == -1).any(): raise KeyError("One or more column labels was not found") - flat_index = ridx * len(self.columns) + cidx + + sub = self.take(np.unique(cidx), axis=1) + if sub._is_mixed_type: + sub = sub.take(np.unique(ridx), axis=0) + ridx = sub.index.get_indexer(row_labels) + values = sub.to_numpy() + cidx = sub.columns.get_indexer(col_labels) + flat_index = ridx * len(sub.columns) + cidx + result = values.flat[flat_index] if is_object_dtype(result): From 4e0c17ff5681be4c64de7fa76b338b36ddd5b3a8 Mon Sep 17 00:00:00 2001 From: stevenae Date: Fri, 28 Mar 2025 15:46:29 -0400 Subject: [PATCH 13/19] one line to separate sections --- pandas/core/frame.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fd8479906796b..a6f4990defa50 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5140,11 +5140,9 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. - Given equal-length arrays of row and column labels, return an array of the values corresponding to each (row, col) pair. - Parameters ---------- row_labels : sequence @@ -5152,7 +5150,6 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: col_labels : sequence The column labels to use for lookup. - Returns ------- numpy.ndarray From a5e379b958beeff26cb89f5a4b2e6365008f7c47 Mon Sep 17 00:00:00 2001 From: stevenae Date: Fri, 28 Mar 2025 15:49:19 -0400 Subject: [PATCH 14/19] Update v3.0.0.rst pandas.DataFrame.lookup --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index da2a9bdada469..cb25079cee917 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,6 +30,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) +- :meth:`pandas.DataFrame.lookup` returns with optimizations for looking up values by list of row/column pairs (:issue:`40140`) - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) From 47e0b1bcb66262aadfb0661ea066e3ba7372cda3 Mon Sep 17 00:00:00 2001 From: stevenae Date: Fri, 28 Mar 2025 17:05:39 -0400 Subject: [PATCH 15/19] Adding an example --- pandas/core/frame.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a6f4990defa50..56545368a3800 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5154,6 +5154,23 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: ------- numpy.ndarray The found values. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "Math_Sem1": [85, 92, 78, 88, 95], + ... "Math_Sem2": [88, 90, 82, 85, 93], + ... "Science_Sem1": [90, 85, 92, 79, 87], + ... "Science_Sem2": [92, 87, 90, 83, 89], + ... "English_Sem1": [95, 80, 85, 90, 82], + ... "English_Sem2": [93, 82, 87, 88, 80], + ... }, + ... index=["Alice", "Bob", "Charlie", "David", "Eve"], + ... ) + >>> student_top = df.rank(1).idxmax(1) # Column name for student's top score + >>> df.lookup(df.index, student_top) + array([95, 92, 92, 90, 95]) """ n = len(row_labels) if n != len(col_labels): From 0c04e97d6b49f9c0e0dc0db6019c3d501f1e178b Mon Sep 17 00:00:00 2001 From: stevenae Date: Fri, 28 Mar 2025 17:16:40 -0400 Subject: [PATCH 16/19] Update frame.py expanded example --- pandas/core/frame.py | 57 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 56545368a3800..1a4acf1e95efd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5157,7 +5157,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: Examples -------- - >>> df = pd.DataFrame( + >>> grades = pd.DataFrame( ... { ... "Math_Sem1": [85, 92, 78, 88, 95], ... "Math_Sem2": [88, 90, 82, 85, 93], @@ -5168,9 +5168,58 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: ... }, ... index=["Alice", "Bob", "Charlie", "David", "Eve"], ... ) - >>> student_top = df.rank(1).idxmax(1) # Column name for student's top score - >>> df.lookup(df.index, student_top) - array([95, 92, 92, 90, 95]) + >>> feedback = pd.DataFrame( + ... { + ... "Math_Sem1": [ + ... "Strong analytical skills", + ... "Excellent problem-solving", + ... "Needs more practice", + ... "Solid understanding", + ... "Exceptional reasoning", + ... ], + ... "Math_Sem2": [ + ... "Improved advanced techniques", + ... "Consistent high performance", + ... "Significant progress", + ... "Steady improvement", + ... "Consistently exceptional", + ... ], + ... "Science_Sem1": [ + ... "Excellent inquiry skills", + ... "Good theoretical concepts", + ... "Strong methodological interest", + ... "Needs focus", + ... "Outstanding curiosity", + ... ], + ... "Science_Sem2": [ + ... "Advanced scientific principles", + ... "Improved practical skills", + ... "Growing scientific reasoning", + ... "Better lab engagement", + ... "Continued excellence", + ... ], + ... "English_Sem1": [ + ... "Exceptional writing", + ... "Strong language use", + ... "Needs confident expression", + ... "Solid literary analysis", + ... "Creative insights", + ... ], + ... "English_Sem2": [ + ... "Refined writing techniques", + ... "Improved expression", + ... "More confident analysis", + ... "Developing writing style", + ... "Maintained high-level writing", + ... ], + ... }, + ... index=["Alice", "Bob", "Charlie", "David", "Eve"], + ... ) + >>> student_top = grades.rank(1).idxmax(1) # student's top score + >>> feedback.lookup(student_top.index, student_top) + array(['Exceptional writing', 'Excellent problem-solving', + 'Strong methodological interest', 'Solid literary analysis', + 'Exceptional reasoning'], dtype=object) """ n = len(row_labels) if n != len(col_labels): From 7d6dea53e63574fbc5a96530fdee1c53956939ca Mon Sep 17 00:00:00 2001 From: stevenae Date: Mon, 31 Mar 2025 13:03:09 -0400 Subject: [PATCH 17/19] shorter example --- pandas/core/frame.py | 44 ++++++-------------------------------------- 1 file changed, 6 insertions(+), 38 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1a4acf1e95efd..cda775258322a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5159,66 +5159,34 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: -------- >>> grades = pd.DataFrame( ... { - ... "Math_Sem1": [85, 92, 78, 88, 95], - ... "Math_Sem2": [88, 90, 82, 85, 93], - ... "Science_Sem1": [90, 85, 92, 79, 87], - ... "Science_Sem2": [92, 87, 90, 83, 89], - ... "English_Sem1": [95, 80, 85, 90, 82], - ... "English_Sem2": [93, 82, 87, 88, 80], + ... "Math": [85, 92, 78, 88, 95], + ... "Science": [90, 85, 92, 79, 87], ... }, ... index=["Alice", "Bob", "Charlie", "David", "Eve"], ... ) >>> feedback = pd.DataFrame( ... { - ... "Math_Sem1": [ + ... "Math": [ ... "Strong analytical skills", ... "Excellent problem-solving", ... "Needs more practice", ... "Solid understanding", ... "Exceptional reasoning", ... ], - ... "Math_Sem2": [ - ... "Improved advanced techniques", - ... "Consistent high performance", - ... "Significant progress", - ... "Steady improvement", - ... "Consistently exceptional", - ... ], - ... "Science_Sem1": [ + ... "Science": [ ... "Excellent inquiry skills", ... "Good theoretical concepts", ... "Strong methodological interest", ... "Needs focus", ... "Outstanding curiosity", ... ], - ... "Science_Sem2": [ - ... "Advanced scientific principles", - ... "Improved practical skills", - ... "Growing scientific reasoning", - ... "Better lab engagement", - ... "Continued excellence", - ... ], - ... "English_Sem1": [ - ... "Exceptional writing", - ... "Strong language use", - ... "Needs confident expression", - ... "Solid literary analysis", - ... "Creative insights", - ... ], - ... "English_Sem2": [ - ... "Refined writing techniques", - ... "Improved expression", - ... "More confident analysis", - ... "Developing writing style", - ... "Maintained high-level writing", - ... ], ... }, ... index=["Alice", "Bob", "Charlie", "David", "Eve"], ... ) >>> student_top = grades.rank(1).idxmax(1) # student's top score >>> feedback.lookup(student_top.index, student_top) - array(['Exceptional writing', 'Excellent problem-solving', - 'Strong methodological interest', 'Solid literary analysis', + array(['Excellent inquiry skills', 'Excellent problem-solving', + 'Strong methodological interest', 'Solid understanding', 'Exceptional reasoning'], dtype=object) """ n = len(row_labels) From 50183653f7f08092934a9949629ec02758deeb7e Mon Sep 17 00:00:00 2001 From: stevenae Date: Mon, 31 Mar 2025 15:20:41 -0400 Subject: [PATCH 18/19] Update frame.py potential compromise --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cda775258322a..38c6e2a7524ca 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5136,7 +5136,7 @@ def _series(self): # ---------------------------------------------------------------------- # Reindexing and alignment - def lookup(self, row_labels, col_labels) -> np.ndarray: + def lookup(self, row_labels, col_labels) -> ExtensionArray | np.ndarray: """ Label-based "fancy indexing" function for DataFrame. @@ -5207,7 +5207,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: if sub._is_mixed_type: sub = sub.take(np.unique(ridx), axis=0) ridx = sub.index.get_indexer(row_labels) - values = sub.to_numpy() + values = sub.values cidx = sub.columns.get_indexer(col_labels) flat_index = ridx * len(sub.columns) + cidx From 6aa621897ab9db000a3758cd46d52f4cb41ca80c Mon Sep 17 00:00:00 2001 From: stevenae Date: Mon, 31 Mar 2025 16:29:04 -0400 Subject: [PATCH 19/19] rewrite to preserve types --- pandas/core/frame.py | 15 +++++---------- pandas/tests/frame/indexing/test_indexing.py | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 38c6e2a7524ca..69106d70ccf1b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -101,7 +101,6 @@ is_integer_dtype, is_iterator, is_list_like, - is_object_dtype, is_scalar, is_sequence, needs_i8_conversion, @@ -5204,17 +5203,13 @@ def lookup(self, row_labels, col_labels) -> ExtensionArray | np.ndarray: raise KeyError("One or more column labels was not found") sub = self.take(np.unique(cidx), axis=1) - if sub._is_mixed_type: - sub = sub.take(np.unique(ridx), axis=0) - ridx = sub.index.get_indexer(row_labels) - values = sub.values + sub = sub.take(np.unique(ridx), axis=0) + ridx = sub.index.get_indexer(row_labels) + values = sub.melt()["value"] cidx = sub.columns.get_indexer(col_labels) - flat_index = ridx * len(sub.columns) + cidx + flat_index = ridx + cidx * len(sub) - result = values.flat[flat_index] - - if is_object_dtype(result): - result = lib.maybe_convert_objects(result) + result = values[flat_index] return result diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index f79475b770302..bb924acb2bf4e 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1420,8 +1420,8 @@ def test_lookup_float(self, float_frame): cols = list(df.columns) * len(df.index) result = df.lookup(rows, cols) - expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)]) - tm.assert_numpy_array_equal(result, expected) + expected = Series([df.loc[r, c] for r, c in zip(rows, cols)]) + tm.assert_series_equal(result, expected, check_index=False, check_names=False) def test_lookup_mixed(self, float_string_frame): df = float_string_frame @@ -1429,10 +1429,8 @@ def test_lookup_mixed(self, float_string_frame): cols = list(df.columns) * len(df.index) result = df.lookup(rows, cols) - expected = np.array( - [df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_ - ) - tm.assert_almost_equal(result, expected) + expected = Series([df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_) + tm.assert_series_equal(result, expected, check_index=False, check_names=False) def test_lookup_bool(self): df = DataFrame( @@ -1443,14 +1441,16 @@ def test_lookup_bool(self): "mask_c": [False, True, False, True], } ) - df["mask"] = df.lookup(df.index, "mask_" + df["label"]) + df_mask = df.lookup(df.index, "mask_" + df["label"]) - exp_mask = np.array( + exp_mask = Series( [df.loc[r, c] for r, c in zip(df.index, "mask_" + df["label"])] ) - tm.assert_series_equal(df["mask"], Series(exp_mask, name="mask")) - assert df["mask"].dtype == np.bool_ + tm.assert_series_equal( + df_mask, Series(exp_mask, name="mask"), check_index=False, check_names=False + ) + assert df_mask.dtype == np.bool_ def test_lookup_raises(self, float_frame): with pytest.raises(KeyError, match="'One or more row labels was not found'"):