String dtype: fix pyarrow-based IO + update tests #59478

Merged
16 changes: 10 additions & 6 deletions pandas/core/arrays/string_arrow.py
@@ -130,18 +130,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringArray):

     def __init__(self, values) -> None:
         _chk_pyarrow_available()
-        if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
-            values.type
+        if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
+            pa.types.is_string(values.type)
Comment (Member): Is there a reason why here we only test for string but in the dictionary-encoded case we allow both string and large_string?

Comment (Member, Author): This part of the diff went away now that #59479 is merged. But to answer your question: the reason is that we are checking the types here for which we are going to cast to large_string. If the input is already large_string, we don't need to cast (although the cast would be cheap, zero-copy anyway, I think); only if it is string or dictionary (of either string type) do we need to cast, to ensure the final array we store is always of arrow type large_string.

+            or (
+                pa.types.is_dictionary(values.type)
+                and (
+                    pa.types.is_string(values.type.value_type)
+                    or pa.types.is_large_string(values.type.value_type)
+                )
+            )
         ):
             values = pc.cast(values, pa.large_string())

         super().__init__(values)
         self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)

-        if not pa.types.is_large_string(self._pa_array.type) and not (
-            pa.types.is_dictionary(self._pa_array.type)
-            and pa.types.is_large_string(self._pa_array.type.value_type)
-        ):
+        if not pa.types.is_large_string(self._pa_array.type):
             raise ValueError(
                 "ArrowStringArray requires a PyArrow (chunked) array of "
                 "large_string type"
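A minimal sketch of the normalisation discussed above (assuming only that pyarrow is installed; it relies on pc.cast decoding dictionary-encoded input, as the constructor does): plain string and dictionary-encoded string inputs are cast so the stored array is always of arrow type large_string.

import pyarrow as pa
import pyarrow.compute as pc

# string and dictionary<string> inputs both end up as large_string;
# an input that is already large_string needs no cast
plain = pa.chunked_array([pa.array(["a", "b"], type=pa.string())])
encoded = pa.array(["a", "b"]).dictionary_encode()

for values in (plain, encoded):
    cast = pc.cast(values, pa.large_string())  # decodes dictionaries too
    assert pa.types.is_large_string(cast.type)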
2 changes: 2 additions & 0 deletions pandas/io/_util.py
@@ -27,6 +27,8 @@ def _arrow_dtype_mapping() -> dict:
         pa.string(): pd.StringDtype(),
         pa.float32(): pd.Float32Dtype(),
         pa.float64(): pd.Float64Dtype(),
+        pa.string(): pd.StringDtype(),
+        pa.large_string(): pd.StringDtype(),
Comment on lines +30 to +31 (Member, Author): This is to ensure that when dtype_backend="numpy_nullable" is passed, the user gets the NA-variant of the string dtype.

     }


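For reference, a minimal sketch of how such a mapping takes effect — a hypothetical, simplified version of what the pandas readers do internally, not the exact wiring: the dict is handed to pyarrow's to_pandas() as the types_mapper callable, so large_string columns resolve to the NA-variant StringDtype.

import pandas as pd
import pyarrow as pa

mapping = {pa.string(): pd.StringDtype(), pa.large_string(): pd.StringDtype()}
table = pa.table({"col": pa.array(["a", None], type=pa.large_string())})

# types_mapper is called with each arrow type; returning a pandas dtype
# (or None to fall back to the default) controls the conversion
df = table.to_pandas(types_mapper=mapping.get)
assert isinstance(df["col"].dtype, pd.StringDtype)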
51 changes: 20 additions & 31 deletions pandas/tests/io/test_feather.py
@@ -5,23 +5,15 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 import pandas as pd
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    StringArray,
-)

 from pandas.io.feather_format import read_feather, to_feather  # isort:skip

-pytestmark = [
-    pytest.mark.filterwarnings(
-        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
-    ),
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)


pa = pytest.importorskip("pyarrow")

@@ -154,8 +146,8 @@ def test_path_pathlib(self):
     def test_passthrough_keywords(self):
         df = pd.DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         ).reset_index()
         self.check_round_trip(df, write_kwargs={"version": 1})

@@ -169,7 +161,9 @@ def test_http_path(self, feather_file, httpserver):
         res = read_feather(httpserver.url)
         tm.assert_frame_equal(expected, res)

-    def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
+    def test_read_feather_dtype_backend(
+        self, string_storage, dtype_backend, using_infer_string
+    ):
         # GH#50765
         df = pd.DataFrame(
             {
@@ -184,25 +178,20 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
             }
         )

-        if string_storage == "python":
-            string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
-            string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
-
-        elif dtype_backend == "pyarrow":
-            from pandas.arrays import ArrowExtensionArray
-
-            string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
-
-        else:
-            string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
         with tm.ensure_clean() as path:
             to_feather(df, path)
             with pd.option_context("mode.string_storage", string_storage):
                 result = read_feather(path, dtype_backend=dtype_backend)

+        if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            if using_infer_string:
+                string_dtype = pd.ArrowDtype(pa.large_string())
+            else:
+                string_dtype = pd.ArrowDtype(pa.string())
+        else:
+            string_dtype = pd.StringDtype(string_storage)

         expected = pd.DataFrame(
             {
                 "a": pd.Series([1, np.nan, 3], dtype="Int64"),
@@ -211,8 +200,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
"d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": pd.Series([True, False, pd.NA], dtype="boolean"),
"f": pd.Series([True, False, True], dtype="boolean"),
"g": string_array,
"h": string_array_na,
"g": pd.Series(["a", "b", "c"], dtype=string_dtype),
"h": pd.Series(["a", "b", None], dtype=string_dtype),
}
)

6 changes: 3 additions & 3 deletions pandas/tests/io/test_fsspec.py
@@ -176,7 +176,7 @@ def test_excel_options(fsspectest):
assert fsspectest.test[0] == "read"


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
def test_to_parquet_new_file(cleared_fs, df1):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
pytest.importorskip("fastparquet")
@@ -205,7 +205,7 @@ def test_arrowparquet_options(fsspectest):
assert fsspectest.test[0] == "parquet_read"


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
def test_fastparquet_options(fsspectest):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
pytest.importorskip("fastparquet")
@@ -263,7 +263,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so):
     )


-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
 @pytest.mark.single_cpu
 def test_s3_parquet(s3_public_bucket, s3so, df1):
     pytest.importorskip("fastparquet")
2 changes: 1 addition & 1 deletion pandas/tests/io/test_gcs.py
@@ -208,7 +208,7 @@ def test_to_csv_compression_encoding_gcs(
     tm.assert_frame_equal(df, read_df)


-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
 def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
     """Regression test for writing to a not-yet-existent GCS Parquet file."""
     pytest.importorskip("fastparquet")
25 changes: 14 additions & 11 deletions pandas/tests/io/test_orc.py
@@ -9,8 +9,6 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import read_orc
 import pandas._testing as tm
@@ -20,20 +18,17 @@

import pyarrow as pa

-pytestmark = [
-    pytest.mark.filterwarnings(
-        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
-    ),
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)


 @pytest.fixture
 def dirpath(datapath):
     return datapath("io", "data", "orc")


-def test_orc_reader_empty(dirpath):
+def test_orc_reader_empty(dirpath, using_infer_string):
     columns = [
         "boolean1",
         "byte1",
@@ -54,11 +49,12 @@ def test_orc_reader_empty(dirpath, using_infer_string):
"float32",
"float64",
"object",
"object",
"str" if using_infer_string else "object",
]
expected = pd.DataFrame(index=pd.RangeIndex(0))
for colname, dtype in zip(columns, dtypes):
expected[colname] = pd.Series(dtype=dtype)
expected.columns = expected.columns.astype("str")

inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
got = read_orc(inputfile, columns=columns)
@@ -305,7 +301,7 @@ def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
         df.to_orc()


-def test_orc_dtype_backend_pyarrow():
+def test_orc_dtype_backend_pyarrow(using_infer_string):
     pytest.importorskip("pyarrow")
     df = pd.DataFrame(
         {
@@ -338,6 +334,13 @@ def test_orc_dtype_backend_pyarrow():
             for col in df.columns
         }
     )
+    if using_infer_string:
+        # ORC does not preserve distinction between string and large string
+        # -> the default large string comes back as string
+        string_dtype = pd.ArrowDtype(pa.string())
+        expected["string"] = expected["string"].astype(string_dtype)
+        expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype)
+        expected["string_with_none"] = expected["string_with_none"].astype(string_dtype)

    tm.assert_frame_equal(result, expected)
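A minimal sketch of the ORC limitation noted in the comment above (hedged: it assumes a pyarrow build with ORC support and that the writer accepts large_string input):

import pyarrow as pa
from pyarrow import orc

# ORC has a single string type, so a large_string column does not
# roundtrip: it comes back as plain string
table = pa.table({"s": pa.array(["x", "y"], type=pa.large_string())})

sink = pa.BufferOutputStream()
orc.write_table(table, sink)
roundtripped = orc.read_table(pa.BufferReader(sink.getvalue()))

assert roundtripped.schema.field("s").type == pa.string()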

62 changes: 41 additions & 21 deletions pandas/tests/io/test_parquet.py
@@ -51,7 +51,6 @@
     pytest.mark.filterwarnings(
         "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
     ),
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
 ]


@@ -60,10 +59,17 @@
     params=[
         pytest.param(
             "fastparquet",
-            marks=pytest.mark.skipif(
-                not _HAVE_FASTPARQUET,
-                reason="fastparquet is not installed",
-            ),
+            marks=[
+                pytest.mark.skipif(
+                    not _HAVE_FASTPARQUET,
+                    reason="fastparquet is not installed",
+                ),
+                pytest.mark.xfail(
+                    using_string_dtype(),
+                    reason="TODO(infer_string) fastparquet",
+                    strict=False,
+                ),
+            ],
         ),
         pytest.param(
             "pyarrow",
@@ -85,15 +91,22 @@ def pa():


 @pytest.fixture
-def fp():
+def fp(request):
     if not _HAVE_FASTPARQUET:
         pytest.skip("fastparquet is not installed")
+    if using_string_dtype():
+        request.applymarker(
+            pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False)
+        )
     return "fastparquet"


 @pytest.fixture
 def df_compat():
-    return pd.DataFrame({"A": [1, 2, 3], "B": "foo"})
+    # TODO(infer_string) should this give str columns?
+    return pd.DataFrame(
+        {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object)
+    )


@pytest.fixture
@@ -365,16 +378,6 @@ def check_external_error_on_write(self, df, engine, exc):
         with tm.external_error_raised(exc):
             to_parquet(df, path, engine, compression=None)

-    @pytest.mark.network
-    @pytest.mark.single_cpu
-    def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
-        if engine != "auto":
-            pytest.importorskip(engine)
-        with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
-            httpserver.serve_content(content=f.read())
-        df = read_parquet(httpserver.url)
-        tm.assert_frame_equal(df, df_compat)
-

 class TestBasic(Base):
     def test_error(self, engine):
@@ -672,6 +675,16 @@ def test_read_empty_array(self, pa, dtype):
             df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected
         )

+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
Comment (Member, Author): This test is just moved down from the Base class to the TestBasic class (all tests that are generic and run for both engines are in this second class), while also fixing it so that the engine is actually used in the read_parquet call.

if engine != "auto":
pytest.importorskip(engine)
with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
httpserver.serve_content(content=f.read())
df = read_parquet(httpserver.url, engine=engine)
tm.assert_frame_equal(df, df_compat)


 class TestParquetPyArrow(Base):
     @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip")
@@ -905,7 +918,7 @@ def test_write_with_schema(self, pa):
         out_df = df.astype(bool)
         check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df)

-    def test_additional_extension_arrays(self, pa):
+    def test_additional_extension_arrays(self, pa, using_infer_string):
         # test additional ExtensionArrays that are supported through the
         # __arrow_array__ protocol
         pytest.importorskip("pyarrow")
@@ -916,17 +929,24 @@ def test_additional_extension_arrays(self, pa):
"c": pd.Series(["a", None, "c"], dtype="string"),
}
)
check_round_trip(df, pa)
if using_infer_string:
check_round_trip(df, pa, expected=df.astype({"c": "str"}))
else:
check_round_trip(df, pa)
Comment on lines +933 to +936 (Member, Author): This is what I explained at length in the top post: although the original dataframe has "string" dtype (NA-variant), it currently comes back as "str" dtype (NaN-variant).

Comment (Member): Shouldn't that still be coming back as "string"? By default doesn't parquet just roundtrip the exact type?

Comment (Member, Author): See the long explanation in the top post of why this is currently not the case (Arrow<->Parquet roundtrips are exact for most types, but it's the pandas<->Arrow roundtrip where it goes wrong if pandas has several extension dtypes for the same arrow type).

Comment (Member): OK thanks - that explanation makes sense technically, but I don't think this regression should be shipped. It's also going to be repeated with ADBC drivers.

Maybe our type mapping function just needs to be extended with that data, whether it should be using string inference or if the string type is well known?

Comment (Member, Author):

> I don't think this regression should be shipped

Well, I don't think there is any way around it, unfortunately.

With future.infer_string enabled, we definitely want that someone who reads a generic parquet file (one that was not written from a pandas dataframe using the existing nullable string dtype) ends up with the future default string dtype. PyArrow currently only gives us one way to control this, which is via the types_mapper argument. But that then means it overrules the stored pandas metadata, and so a NA string dtype is not roundtripped.

As far as I can see, we can't have it both ways. Except if we would essentially take over the conversion of the pyarrow Table to a pandas DataFrame, parse the pandas metadata from the schema, etc. That is something I think we should actually consider doing at some point, i.e. move all/most of the pandas compat code from the pyarrow project into pandas, but that's a bigger project, not for the coming month.

Now, as I also mentioned, currently the way to fix this is to ensure pyarrow does that for us (so we don't have to specify types_mapper by default); then pyarrow will continue respecting the pandas metadata and roundtrip a string dtype. I will ensure this is done for the next pyarrow release (scheduled for October). Given that this future.infer_string option will likely be optional until that time, I think that is fine (although of course even when pandas 3.0 is out, that would mean people using not the latest but an older pyarrow version will still run into it).

Comment (Member, Author):

> Now, as I also mentioned, currently the way to fix this is to ensure pyarrow does that for us

Opened apache/arrow#43683 to track this.

Comment (Member):

> But that then means it overrules the stored pandas metadata, and so a NA string dtype is not roundtripped.

Do you know exactly the metadata that parquet stores? I was under the (possibly false) impression that we would be storing "str" or "string" in a parquet file generated by pandas. If that is so, can't that be used to disambiguate between the two types?

Ultimately my concern goes back to one of the major points of PDEP-14, in that this is going to break backwards compatibility for users that have been using our StringDtype for the past 5 years, assumedly to take them full circle after PDEP-16. Hoping to avoid that confusion.

Comment (Member, Author):

> I was under the (possibly false) impression that we would be storing "str" or "string" in a parquet file generated by pandas.

That's indeed what happens.

> If that is so, can't that be used to disambiguate between the two types?

Not if the user passes types_mapper (and the "user" here is pandas), because that overrides the mechanism of getting the resulting dtype from this stored metadata. Quoting myself from above:

> PyArrow has multiple mechanisms to decide which pandas data type to use in converting a pyarrow.Table to pandas with to_pandas(). The relevant ones here are that it 1) looks at the pandas metadata info that is stored in the arrow schema metadata (for a column using the NA-variant string dtype, this info stores that this column has a pandas "string" dtype, which is then used to correctly roundtrip in the arrow->pandas conversion), and 2) the user can specify a specific pandas dtype to use for a certain pyarrow type using the types_mapper keyword (which we for example use to implement the dtype_backend="pyarrow" support returning pd.ArrowDtype()).
> However, to support using the future default NaN-variant of the string dtype, we do pass a types_mapper mapping the pyarrow string type to the NaN string dtype, and this overrules the info in the metadata that would otherwise restore the NA string dtype.

The problem is us specifying a types_mapper by default to enable the default string dtype, and as I said, right now this can only be realistically solved in pyarrow by ensuring that pyarrow uses pandas.StringDtype by default (for pandas>=3 or when the option is enabled), so we don't have to specify types_mapper.

> Ultimately my concern goes back to one of the major points of PDEP-14, in that this is going to break backwards compatibility

I am very well aware of that, and I only realized this issue while working on this PR (when we were discussing this in the context of the PDEP, I also assumed that, because of the different string aliases "string" and "str", such roundtripping would keep working).

But as I said, I currently don't see any solution in the short term. I will ensure this is fixed in pyarrow in the next release, and maybe for pandas 3.0 or 3.1 we could take over a big part of the pandas compat code that currently lives in pyarrow, which would let us properly fix this for all older pyarrow versions as well and would make similar issues easier to address in the future. I do think that, as long as this is explicit opt-in (which it is for pandas 2.3), the solution in this PR is acceptable. And by the time pandas 3.0 is out and the change is enabled by default, we will ensure that there is no breaking change at least when using the latest pyarrow, and we still have time to also try to find a solution to keep it working for older pyarrow versions.

Comment (Member): Ah OK. I was missing the context that pyarrow was doing the type mapping, instead of us.

> I do think that, as long as this is explicit opt-in (which it is for pandas 2.3), the solution in this PR is acceptable.

OK sure, I'm convinced. I think for now too, users can at least force retention of the nullable type with dtype_backend="numpy_nullable", which they would be doing in other I/O methods anyway.
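A minimal sketch of the mechanism discussed in this thread — an illustration under stated assumptions (the na_value argument to StringDtype assumes pandas 2.3+), not the exact pandas internals:

import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"c": pd.array(["a", None, "c"], dtype="string")})
table = pa.Table.from_pandas(df)  # schema metadata records the "string" dtype

# without a types_mapper, to_pandas() honours the stored pandas
# metadata and the NA-variant string dtype roundtrips
assert table.to_pandas()["c"].dtype == "string"

# a types_mapper (which pandas passes to enable the future default
# string dtype) overrules that metadata, so the column comes back
# as the NaN-variant instead
mapper = {
    pa.string(): pd.StringDtype(na_value=np.nan),
    pa.large_string(): pd.StringDtype(na_value=np.nan),
}.get
restored = table.to_pandas(types_mapper=mapper)
assert restored["c"].dtype == "str"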


df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
check_round_trip(df, pa)

def test_pyarrow_backed_string_array(self, pa, string_storage):
def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string):
# test ArrowStringArray supported through the __arrow_array__ protocol
pytest.importorskip("pyarrow")
df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
with pd.option_context("string_storage", string_storage):
check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]"))
if using_infer_string:
expected = df.astype("str")
else:
expected = df.astype(f"string[{string_storage}]")
check_round_trip(df, pa, expected=expected)

def test_additional_extension_types(self, pa):
# test additional ExtensionArrays that are supported through the