diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index e6fafc8b1b14c..9cd79dc58d9d2 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -736,6 +736,7 @@ I/O
 - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
 - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
+- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
index 166c9d47294cd..77b7d9ad11a6c 100644
--- a/pandas/core/computation/pytables.py
+++ b/pandas/core/computation/pytables.py
@@ -239,7 +239,8 @@ def stringify(value):
             if conv_val not in metadata:
                 result = -1
             else:
-                result = metadata.searchsorted(conv_val, side="left")
+                # metadata may be unsorted: take the index of the first exact match
+                result = np.flatnonzero(metadata == conv_val)[0]
             return TermValue(result, result, "integer")
         elif kind == "integer":
             try:
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index bb2058c050f2a..5c32f2c8a4d8d 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -25,6 +25,9 @@
     timedelta_range,
 )
 import pandas._testing as tm
+from pandas.api.types import (
+    CategoricalDtype,
+)
 from pandas.conftest import has_pyarrow
 from pandas.tests.io.pytables.common import (
     _maybe_remove,
@@ -1106,3 +1109,23 @@ def test_store_bool_index(tmp_path, setup_path):
     df.to_hdf(path, key="a")
     result = read_hdf(path, "a")
     tm.assert_frame_equal(expected, result)
+
+
+@pytest.mark.parametrize("model", ["name", "longname", "verylongname"])
+def test_select_categorical_string_columns(tmp_path, model):
+    # GH#57608: select on a categorical string data column with unsorted categories
+
+    path = tmp_path / "test.h5"
+
+    models = CategoricalDtype(categories=["name", "longname", "verylongname"])
+    df = DataFrame(
+        {"modelId": ["name", "longname", "longname"], "value": [1, 2, 3]}
+    ).astype({"modelId": models, "value": int})
+
+    with HDFStore(path, "w") as store:
+        store.append("df", df, data_columns=["modelId"])
+
+    with HDFStore(path, "r") as store:
+        result = store.select("df", "modelId == model")
+        expected = df[df["modelId"] == model]
+        tm.assert_frame_equal(result, expected)