Skip to content

Commit 224cb55

Browse files
jorisvandenbossche authored and WillAyd committed
BUG (string dtype): convert dictionary input to materialized string array in ArrowStringArray constructor (#59479)
1 parent 124359f commit 224cb55

File tree

2 files changed

+15
-12
lines changed

2 files changed

+15
-12
lines changed

Diff for: pandas/core/arrays/string_arrow.py

+10 -6
Original file line number | Diff line number | Diff line change
@@ -125,18 +125,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
125125

126126
def __init__(self, values) -> None:
127127
_chk_pyarrow_available()
128-
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
129-
values.type
128+
if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
129+
pa.types.is_string(values.type)
130+
or (
131+
pa.types.is_dictionary(values.type)
132+
and (
133+
pa.types.is_string(values.type.value_type)
134+
or pa.types.is_large_string(values.type.value_type)
135+
)
136+
)
130137
):
131138
values = pc.cast(values, pa.large_string())
132139

133140
super().__init__(values)
134141
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
135142

136-
if not pa.types.is_large_string(self._pa_array.type) and not (
137-
pa.types.is_dictionary(self._pa_array.type)
138-
and pa.types.is_large_string(self._pa_array.type.value_type)
139-
):
143+
if not pa.types.is_large_string(self._pa_array.type):
140144
raise ValueError(
141145
"ArrowStringArray requires a PyArrow (chunked) array of "
142146
"large_string type"

Diff for: pandas/tests/arrays/string_/test_string_arrow.py

+5 -6
Original file line number | Diff line number | Diff line change
@@ -88,19 +88,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
8888
ArrowStringArray(arr)
8989

9090

91-
@pytest.mark.xfail(
92-
reason="dict conversion does not seem to be implemented for large string in arrow"
93-
)
91+
@pytest.mark.parametrize("string_type", ["string", "large_string"])
9492
@pytest.mark.parametrize("chunked", [True, False])
95-
def test_constructor_valid_string_type_value_dictionary(chunked):
93+
def test_constructor_valid_string_type_value_dictionary(string_type, chunked):
9694
pa = pytest.importorskip("pyarrow")
9795

98-
arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
96+
arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode()
9997
if chunked:
10098
arr = pa.chunked_array(arr)
10199

102100
arr = ArrowStringArray(arr)
103-
assert pa.types.is_string(arr._pa_array.type.value_type)
101+
# dictionary type get converted to dense large string array
102+
assert pa.types.is_large_string(arr._pa_array.type)
104103

105104

106105
def test_constructor_from_list():

0 commit comments

Comments
 (0)