Skip to content

Commit b55191d

Browse files
WillAyd, jorisvandenbossche
authored and committed
String dtype: use 'str' string alias and representation for NaN-variant of the dtype (#59388)
1 parent d4b669e commit b55191d

File tree

79 files changed

+306
-192
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+306
-192
lines changed

Diff for: pandas/_testing/__init__.py

+5 -1
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414

1515
import numpy as np
1616

17+
from pandas._config import using_string_dtype
1718
from pandas._config.localization import (
1819
can_set_locale,
1920
get_locales,
@@ -110,7 +111,10 @@
110111
ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES]
111112

112113
COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
113-
STRING_DTYPES: list[Dtype] = [str, "str", "U"]
114+
if using_string_dtype():
115+
STRING_DTYPES: list[Dtype] = [str, "U"]
116+
else:
117+
STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef]
114118
COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES]
115119

116120
DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"]

Diff for: pandas/core/arrays/arrow/array.py

+4 -1
Original file line number | Diff line number | Diff line change
@@ -570,7 +570,10 @@ def __getitem__(self, item: PositionalIndexer):
570570
if isinstance(item, np.ndarray):
571571
if not len(item):
572572
# Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
573-
if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
573+
if (
574+
isinstance(self._dtype, StringDtype)
575+
and self._dtype.storage == "pyarrow"
576+
):
574577
# TODO(infer_string) should this be large_string?
575578
pa_dtype = pa.string()
576579
else:

Diff for: pandas/core/arrays/string_.py

+18 -6
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
from typing import (
55
TYPE_CHECKING,
66
Any,
7-
ClassVar,
87
Literal,
98
cast,
109
)
@@ -114,9 +113,12 @@ class StringDtype(StorageExtensionDtype):
114113
string[pyarrow]
115114
"""
116115

117-
# error: Cannot override instance variable (previously declared on
118-
# base class "StorageExtensionDtype") with class variable
119-
name: ClassVar[str] = "string" # type: ignore[misc]
116+
@property
117+
def name(self) -> str: # type: ignore[override]
118+
if self._na_value is libmissing.NA:
119+
return "string"
120+
else:
121+
return "str"
120122

121123
#: StringDtype().na_value uses pandas.NA except the implementation that
122124
# follows NumPy semantics, which uses nan.
@@ -133,7 +135,7 @@ def __init__(
133135
) -> None:
134136
# infer defaults
135137
if storage is None:
136-
if using_string_dtype() and na_value is not libmissing.NA:
138+
if na_value is not libmissing.NA:
137139
if HAS_PYARROW:
138140
storage = "pyarrow"
139141
else:
@@ -166,11 +168,19 @@ def __init__(
166168
self.storage = storage
167169
self._na_value = na_value
168170

171+
def __repr__(self) -> str:
172+
if self._na_value is libmissing.NA:
173+
return f"{self.name}[{self.storage}]"
174+
else:
175+
# TODO add more informative repr
176+
return self.name
177+
169178
def __eq__(self, other: object) -> bool:
170179
# we need to override the base class __eq__ because na_value (NA or NaN)
171180
# cannot be checked with normal `==`
172181
if isinstance(other, str):
173-
if other == self.name:
182+
# TODO should dtype == "string" work for the NaN variant?
183+
if other == "string" or other == self.name: # noqa: PLR1714
174184
return True
175185
try:
176186
other = self.construct_from_string(other)
@@ -227,6 +237,8 @@ def construct_from_string(cls, string) -> Self:
227237
)
228238
if string == "string":
229239
return cls()
240+
elif string == "str" and using_string_dtype():
241+
return cls(na_value=np.nan)
230242
elif string == "string[python]":
231243
return cls(storage="python")
232244
elif string == "string[pyarrow]":

Diff for: pandas/core/frame.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -4979,7 +4979,9 @@ def select_dtypes(self, include=None, exclude=None) -> Self:
49794979
-----
49804980
* To select all *numeric* types, use ``np.number`` or ``'number'``
49814981
* To select strings you must use the ``object`` dtype, but note that
4982-
this will return *all* object dtype columns
4982+
this will return *all* object dtype columns. With
4983+
``pd.options.future.infer_string`` enabled, using ``"str"`` will
4984+
work to select all string columns.
49834985
* See the `numpy dtype hierarchy
49844986
<https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
49854987
* To select datetimes, use ``np.datetime64``, ``'datetime'`` or

Diff for: pandas/core/interchange/utils.py

+6 -1
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,12 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
135135
if format_str is not None:
136136
return format_str
137137

138-
if lib.is_np_dtype(dtype, "M"):
138+
if isinstance(dtype, pd.StringDtype):
139+
# TODO(infer_string) this should be LARGE_STRING for pyarrow storage,
140+
# but current tests don't cover this distinction
141+
return ArrowCTypes.STRING
142+
143+
elif lib.is_np_dtype(dtype, "M"):
139144
# Selecting the first char of resolution string:
140145
# dtype.str -> '<M8[ns]' -> 'n'
141146
resolution = np.datetime_data(dtype)[0][0]

Diff for: pandas/tests/apply/test_numba.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@ def test_numba_unsupported_dtypes(apply_axis):
110110

111111
with pytest.raises(
112112
ValueError,
113-
match="Column b must have a numeric dtype. Found 'object|string' instead",
113+
match="Column b must have a numeric dtype. Found 'object|str' instead",
114114
):
115115
df.apply(f, engine="numba", axis=apply_axis)
116116

Diff for: pandas/tests/apply/test_series_apply.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -244,7 +244,7 @@ def test_apply_categorical(by_row, using_infer_string):
244244
result = ser.apply(lambda x: "A")
245245
exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
246246
tm.assert_series_equal(result, exp)
247-
assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]"
247+
assert result.dtype == object if not using_infer_string else "str"
248248

249249

250250
@pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]])

Diff for: pandas/tests/arrays/boolean/test_astype.py

+9 -3
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import pandas._testing as tm
66

77

8-
def test_astype():
8+
def test_astype(using_infer_string):
99
# with missing values
1010
arr = pd.array([True, False, None], dtype="boolean")
1111

@@ -20,8 +20,14 @@ def test_astype():
2020
tm.assert_numpy_array_equal(result, expected)
2121

2222
result = arr.astype("str")
23-
expected = np.array(["True", "False", "<NA>"], dtype=f"{tm.ENDIAN}U5")
24-
tm.assert_numpy_array_equal(result, expected)
23+
if using_infer_string:
24+
expected = pd.array(
25+
["True", "False", None], dtype=pd.StringDtype(na_value=np.nan)
26+
)
27+
tm.assert_extension_array_equal(result, expected)
28+
else:
29+
expected = np.array(["True", "False", "<NA>"], dtype=f"{tm.ENDIAN}U5")
30+
tm.assert_numpy_array_equal(result, expected)
2531

2632
# no missing values
2733
arr = pd.array([True, False, True], dtype="boolean")

Diff for: pandas/tests/arrays/categorical/test_astype.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ def test_astype(self, ordered):
8989
expected = np.array(cat)
9090
tm.assert_numpy_array_equal(result, expected)
9191

92-
msg = r"Cannot cast object|string dtype to float64"
92+
msg = r"Cannot cast object|str dtype to float64"
9393
with pytest.raises(ValueError, match=msg):
9494
cat.astype(float)
9595

Diff for: pandas/tests/arrays/categorical/test_repr.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ def test_print(self, using_infer_string):
2222
if using_infer_string:
2323
expected = [
2424
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
25-
"Categories (3, string): [a < b < c]",
25+
"Categories (3, str): [a < b < c]",
2626
]
2727
else:
2828
expected = [

Diff for: pandas/tests/arrays/floating/test_astype.py

+13 -4
Original file line number | Diff line number | Diff line change
@@ -63,12 +63,21 @@ def test_astype_to_integer_array():
6363
tm.assert_extension_array_equal(result, expected)
6464

6565

66-
def test_astype_str():
66+
def test_astype_str(using_infer_string):
6767
a = pd.array([0.1, 0.2, None], dtype="Float64")
68-
expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")
6968

70-
tm.assert_numpy_array_equal(a.astype(str), expected)
71-
tm.assert_numpy_array_equal(a.astype("str"), expected)
69+
if using_infer_string:
70+
expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan))
71+
tm.assert_extension_array_equal(a.astype("str"), expected)
72+
73+
# TODO(infer_string) this should also be a string array like above
74+
expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")
75+
tm.assert_numpy_array_equal(a.astype(str), expected)
76+
else:
77+
expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")
78+
79+
tm.assert_numpy_array_equal(a.astype(str), expected)
80+
tm.assert_numpy_array_equal(a.astype("str"), expected)
7281

7382

7483
def test_astype_copy():

Diff for: pandas/tests/arrays/integer/test_dtypes.py

+13 -4
Original file line number | Diff line number | Diff line change
@@ -278,12 +278,21 @@ def test_to_numpy_na_raises(dtype):
278278
a.to_numpy(dtype=dtype)
279279

280280

281-
def test_astype_str():
281+
def test_astype_str(using_infer_string):
282282
a = pd.array([1, 2, None], dtype="Int64")
283-
expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")
284283

285-
tm.assert_numpy_array_equal(a.astype(str), expected)
286-
tm.assert_numpy_array_equal(a.astype("str"), expected)
284+
if using_infer_string:
285+
expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan))
286+
tm.assert_extension_array_equal(a.astype("str"), expected)
287+
288+
# TODO(infer_string) this should also be a string array like above
289+
expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")
290+
tm.assert_numpy_array_equal(a.astype(str), expected)
291+
else:
292+
expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")
293+
294+
tm.assert_numpy_array_equal(a.astype(str), expected)
295+
tm.assert_numpy_array_equal(a.astype("str"), expected)
287296

288297

289298
def test_astype_boolean():

Diff for: pandas/tests/arrays/interval/test_interval_pyarrow.py

-3
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
import pandas as pd
75
import pandas._testing as tm
86
from pandas.core.arrays import IntervalArray
@@ -82,7 +80,6 @@ def test_arrow_array_missing():
8280
assert result.storage.equals(expected)
8381

8482

85-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
8683
@pytest.mark.filterwarnings(
8784
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
8885
)

Diff for: pandas/tests/arrays/period/test_arrow_compat.py

-4
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,5 @@
11
import pytest
22

3-
from pandas._config import using_string_dtype
4-
53
from pandas.compat.pyarrow import pa_version_under10p1
64

75
from pandas.core.dtypes.dtypes import PeriodDtype
@@ -79,7 +77,6 @@ def test_arrow_array_missing():
7977
assert result.storage.equals(expected)
8078

8179

82-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
8380
def test_arrow_table_roundtrip():
8481
from pandas.core.arrays.arrow.extension_types import ArrowPeriodType
8582

@@ -99,7 +96,6 @@ def test_arrow_table_roundtrip():
9996
tm.assert_frame_equal(result, expected)
10097

10198

102-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
10399
def test_arrow_load_from_zero_chunks():
104100
# GH-41040
105101

Diff for: pandas/tests/arrays/string_/test_string.py

+21 -14
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ def test_repr(dtype):
6565
assert repr(df) == expected
6666

6767
if dtype.na_value is np.nan:
68-
expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string"
68+
expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str"
6969
else:
7070
expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string"
7171
assert repr(df.A) == expected
@@ -75,10 +75,10 @@ def test_repr(dtype):
7575
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
7676
elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
7777
arr_name = "ArrowStringArrayNumpySemantics"
78-
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
78+
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
7979
elif dtype.storage == "python" and dtype.na_value is np.nan:
8080
arr_name = "StringArrayNumpySemantics"
81-
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
81+
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
8282
else:
8383
arr_name = "StringArray"
8484
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
@@ -502,7 +502,7 @@ def test_fillna_args(dtype):
502502
tm.assert_extension_array_equal(res, expected)
503503

504504
if dtype.storage == "pyarrow":
505-
msg = "Invalid value '1' for dtype string"
505+
msg = "Invalid value '1' for dtype str"
506506
else:
507507
msg = "Cannot set non-string value '1' into a StringArray."
508508
with pytest.raises(TypeError, match=msg):
@@ -524,7 +524,7 @@ def test_arrow_array(dtype):
524524
assert arr.equals(expected)
525525

526526

527-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
527+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
528528
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
529529
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
530530
# roundtrip possible from arrow 1.0.0
@@ -539,14 +539,17 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
539539
assert table.field("a").type == "large_string"
540540
with pd.option_context("string_storage", string_storage):
541541
result = table.to_pandas()
542-
assert isinstance(result["a"].dtype, pd.StringDtype)
543-
expected = df.astype(f"string[{string_storage}]")
544-
tm.assert_frame_equal(result, expected)
545-
# ensure the missing value is represented by NA and not np.nan or None
546-
assert result.loc[2, "a"] is result["a"].dtype.na_value
542+
if dtype.na_value is np.nan and not using_string_dtype():
543+
assert result["a"].dtype == "object"
544+
else:
545+
assert isinstance(result["a"].dtype, pd.StringDtype)
546+
expected = df.astype(f"string[{string_storage}]")
547+
tm.assert_frame_equal(result, expected)
548+
# ensure the missing value is represented by NA and not np.nan or None
549+
assert result.loc[2, "a"] is result["a"].dtype.na_value
547550

548551

549-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
552+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
550553
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
551554
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
552555
# GH-41040
@@ -563,9 +566,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
563566
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
564567
with pd.option_context("string_storage", string_storage):
565568
result = table.to_pandas()
566-
assert isinstance(result["a"].dtype, pd.StringDtype)
567-
expected = df.astype(f"string[{string_storage}]")
568-
tm.assert_frame_equal(result, expected)
569+
570+
if dtype.na_value is np.nan and not using_string_dtype():
571+
assert result["a"].dtype == "object"
572+
else:
573+
assert isinstance(result["a"].dtype, pd.StringDtype)
574+
expected = df.astype(f"string[{string_storage}]")
575+
tm.assert_frame_equal(result, expected)
569576

570577

571578
def test_value_counts_na(dtype):

Diff for: pandas/tests/arrays/string_/test_string_arrow.py

+4 -2
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import pytest
66

7+
from pandas.compat import HAS_PYARROW
78
import pandas.util._test_decorators as td
89

910
import pandas as pd
@@ -27,8 +28,9 @@ def test_eq_all_na():
2728

2829

2930
def test_config(string_storage, request, using_infer_string):
30-
if using_infer_string and string_storage == "python":
31-
# python string storage with na_value=NaN is not yet implemented
31+
if using_infer_string and string_storage == "python" and HAS_PYARROW:
32+
# string storage with na_value=NaN always uses pyarrow if available
33+
# -> does not yet honor the option
3234
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
3335

3436
with pd.option_context("string_storage", string_storage):

Diff for: pandas/tests/arrays/test_datetimelike.py

+5 -2
Original file line number | Diff line number | Diff line change
@@ -295,7 +295,9 @@ def test_searchsorted(self):
295295
assert result == 10
296296

297297
@pytest.mark.parametrize("box", [None, "index", "series"])
298-
def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
298+
def test_searchsorted_castable_strings(
299+
self, arr1d, box, string_storage, using_infer_string
300+
):
299301
arr = arr1d
300302
if box is None:
301303
pass
@@ -331,7 +333,8 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
331333
TypeError,
332334
match=re.escape(
333335
f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
334-
"or array of those. Got string array instead."
336+
"or array of those. Got "
337+
f"{'str' if using_infer_string else 'string'} array instead."
335338
),
336339
):
337340
arr.searchsorted([str(arr[1]), "baz"])

0 commit comments

Comments
 (0)