-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
String dtype: fix pyarrow-based IO + update tests #59478
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
71da3b0
8095291
33cc8d6
6a326a9
d9cd128
1db710d
890b81c
3ef26fe
06bb14b
3f38a8a
ef2f6cb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,8 @@ def _arrow_dtype_mapping() -> dict: | |
pa.string(): pd.StringDtype(), | ||
pa.float32(): pd.Float32Dtype(), | ||
pa.float64(): pd.Float64Dtype(), | ||
pa.string(): pd.StringDtype(), | ||
pa.large_string(): pd.StringDtype(), | ||
Comment on lines
+30
to
+31
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is to ensure that when |
||
} | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,7 +51,6 @@ | |
pytest.mark.filterwarnings( | ||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning" | ||
), | ||
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), | ||
] | ||
|
||
|
||
|
@@ -60,10 +59,17 @@ | |
params=[ | ||
pytest.param( | ||
"fastparquet", | ||
marks=pytest.mark.skipif( | ||
not _HAVE_FASTPARQUET, | ||
reason="fastparquet is not installed", | ||
), | ||
marks=[ | ||
pytest.mark.skipif( | ||
not _HAVE_FASTPARQUET, | ||
reason="fastparquet is not installed", | ||
), | ||
pytest.mark.xfail( | ||
using_string_dtype(), | ||
reason="TODO(infer_string) fastparquet", | ||
strict=False, | ||
), | ||
], | ||
), | ||
pytest.param( | ||
"pyarrow", | ||
|
@@ -85,15 +91,22 @@ def pa(): | |
|
||
|
||
@pytest.fixture | ||
def fp(): | ||
def fp(request): | ||
if not _HAVE_FASTPARQUET: | ||
pytest.skip("fastparquet is not installed") | ||
if using_string_dtype(): | ||
request.applymarker( | ||
pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False) | ||
) | ||
return "fastparquet" | ||
|
||
|
||
@pytest.fixture | ||
def df_compat(): | ||
return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) | ||
# TODO(infer_string) should this give str columns? | ||
return pd.DataFrame( | ||
{"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object) | ||
) | ||
|
||
|
||
@pytest.fixture | ||
|
@@ -365,16 +378,6 @@ def check_external_error_on_write(self, df, engine, exc): | |
with tm.external_error_raised(exc): | ||
to_parquet(df, path, engine, compression=None) | ||
|
||
@pytest.mark.network | ||
@pytest.mark.single_cpu | ||
def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): | ||
if engine != "auto": | ||
pytest.importorskip(engine) | ||
with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: | ||
httpserver.serve_content(content=f.read()) | ||
df = read_parquet(httpserver.url) | ||
tm.assert_frame_equal(df, df_compat) | ||
|
||
|
||
class TestBasic(Base): | ||
def test_error(self, engine): | ||
|
@@ -672,6 +675,16 @@ def test_read_empty_array(self, pa, dtype): | |
df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected | ||
) | ||
|
||
@pytest.mark.network | ||
@pytest.mark.single_cpu | ||
def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test is just moved down from the |
||
if engine != "auto": | ||
pytest.importorskip(engine) | ||
with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: | ||
httpserver.serve_content(content=f.read()) | ||
df = read_parquet(httpserver.url, engine=engine) | ||
tm.assert_frame_equal(df, df_compat) | ||
|
||
|
||
class TestParquetPyArrow(Base): | ||
@pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") | ||
|
@@ -905,7 +918,7 @@ def test_write_with_schema(self, pa): | |
out_df = df.astype(bool) | ||
check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) | ||
|
||
def test_additional_extension_arrays(self, pa): | ||
def test_additional_extension_arrays(self, pa, using_infer_string): | ||
# test additional ExtensionArrays that are supported through the | ||
# __arrow_array__ protocol | ||
pytest.importorskip("pyarrow") | ||
|
@@ -916,17 +929,24 @@ def test_additional_extension_arrays(self, pa): | |
"c": pd.Series(["a", None, "c"], dtype="string"), | ||
} | ||
) | ||
check_round_trip(df, pa) | ||
if using_infer_string: | ||
check_round_trip(df, pa, expected=df.astype({"c": "str"})) | ||
else: | ||
check_round_trip(df, pa) | ||
Comment on lines
+933
to
+936
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is what I explained at length in the top post: although the original dataframe has "string" dtype (NA-variant), it currently comes back as "str" dtype (NaN-variant) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't that still be coming back as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See the long explanation in the top post of why this is currently not the case (Arrow<->Parquet roundtrips are exact for most types, but its the pandas<->Arrow roundtrip where it goes wrong if pandas has several extension dtypes for the same arrow type) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK thanks - that explanation makes sense technically, but I don't think this regression should be shipped. It's also going to be repeated with ADBC drivers. Maybe our type mapping function just needs to be extended with that data, whether it should be using string inferrence or if the string type is well known? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Well, I don't think there is any way around it, unfortunately .. With As far as I can see, we can't have it both ways. Now, as I also mentioned, currently the way to fix this is to ensure pyarrow does that for us (so we don't have to specify There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Opened apache/arrow#43683 to track this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Do you know exactly the metadata that parquet stores? I was under the (possibly false) impression that we would be storing "str" or "string" in a parquet file generated by pandas. If that is so, can't that be used to disambiguate between the two types? Ultimately my concern goes back to one of the major points of PDEP-14, in that this is going to break backwards compatibility for users that have been using our StringDtype for the past 5 years, assumedly to take them full circle after PDEP-16. Hoping to avoid that confusion There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
That's indeed what happens
Not if the user passes
The problem is us specifying a
I am very well aware of that, and I only realized this issue while working on this PR (when we were discussing this in context of the PDEP, I also assumed that because of using the different string aliases "string" and "str", that would ensure such roundtripping kept working). But as I said, I currently don't see any solution on the short term. I will ensure this is fixed in pyarrow in the next release, and maybe for pandas 3.0 or 3.1 we could take over a big part of the pandas compat code that currently lives in pyarrow, which ensures we can properly fix this for all older pyarrow versions as well and which would make such similar issues in the future easier to address. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah OK. I was missing the context that pyarrow was doing the type mapping, instead of us.
OK sure I'm convinced. I think for now too, users can at least force retention of the nullable type with |
||
|
||
df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) | ||
check_round_trip(df, pa) | ||
|
||
def test_pyarrow_backed_string_array(self, pa, string_storage): | ||
def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string): | ||
# test ArrowStringArray supported through the __arrow_array__ protocol | ||
pytest.importorskip("pyarrow") | ||
df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) | ||
with pd.option_context("string_storage", string_storage): | ||
check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) | ||
if using_infer_string: | ||
expected = df.astype("str") | ||
else: | ||
expected = df.astype(f"string[{string_storage}]") | ||
check_round_trip(df, pa, expected=expected) | ||
|
||
def test_additional_extension_types(self, pa): | ||
# test additional ExtensionArrays that are supported through the | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason why here we only test for string but in the dictionary-encoded case we allow both string and large_string?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This part of the diff went away now that #59479 is merged. But to answer your question: the reason is that we are checking the types here for which we are going to cast to `large_string`. If the input is already `large_string`, we don't need to cast (although the cast would be cheap zero copy anyway, I think); only if it is `string` or `dictionary` (of either string type) do we need to cast, to ensure the final array we store is always of arrow type `large_string`.