Skip to content

Commit 98c2068

Browse files
committed
BUG: interchange protocol with nullable datatypes a non-null validity (pandas-dev#57665)
* BUG: interchange protocol with nullable datatypes a non-null validity provides nonsense results
* whatsnew
* 🏷️ typing
* parametrise over more types
* move whatsnew

(cherry picked from commit 03717bc)
1 parent 63b9eba commit 98c2068

File tree

3 files changed

+57
-6
lines changed

3 files changed

+57
-6
lines changed

doc/source/whatsnew/v2.2.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ including other versions of pandas.
1313

1414
Fixed regressions
1515
~~~~~~~~~~~~~~~~~
16+
- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`)
1617
-
1718

1819
.. ---------------------------------------------------------------------------

pandas/core/interchange/column.py

+18
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,10 @@ def describe_categorical(self):
190190

191191
@property
192192
def describe_null(self):
193+
if isinstance(self._col.dtype, BaseMaskedDtype):
194+
column_null_dtype = ColumnNullType.USE_BYTEMASK
195+
null_value = 1
196+
return column_null_dtype, null_value
193197
kind = self.dtype[0]
194198
try:
195199
null, value = _NULL_DESCRIPTION[kind]
@@ -291,6 +295,14 @@ def _get_data_buffer(
291295
np_arr = self._col.dt.tz_convert(None).to_numpy()
292296
else:
293297
np_arr = self._col.to_numpy()
298+
299+
arr = self._col.array
300+
if isinstance(self._col.dtype, BaseMaskedDtype):
301+
np_arr = arr._data # type: ignore[attr-defined]
302+
elif isinstance(self._col.dtype, ArrowDtype):
303+
raise NotImplementedError("ArrowDtype not handled yet")
304+
else:
305+
np_arr = arr._ndarray # type: ignore[attr-defined]
294306
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
295307
dtype = self.dtype
296308
elif self.dtype[0] == DtypeKind.CATEGORICAL:
@@ -328,6 +340,12 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
328340
"""
329341
null, invalid = self.describe_null
330342

343+
if isinstance(self._col.dtype, BaseMaskedDtype):
344+
mask = self._col.array._mask # type: ignore[attr-defined]
345+
buffer = PandasBuffer(mask)
346+
dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
347+
return buffer, dtype
348+
331349
if self.dtype[0] == DtypeKind.STRING:
332350
# For now, use byte array as the mask.
333351
# TODO: maybe store as bit array to save space?..

pandas/tests/interchange/test_impl.py

+38-6
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
is_platform_windows,
1010
)
1111
from pandas.compat.numpy import np_version_lt1p23
12-
import pandas.util._test_decorators as td
1312

1413
import pandas as pd
1514
import pandas._testing as tm
@@ -404,17 +403,50 @@ def test_non_str_names_w_duplicates():
404403
pd.api.interchange.from_dataframe(dfi, allow_copy=False)
405404

406405

407-
@pytest.mark.parametrize(
408-
"dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))]
409-
)
410-
def test_nullable_integers(dtype: str) -> None:
406+
def test_nullable_integers() -> None:
407+
# https://github.com/pandas-dev/pandas/issues/55069
408+
df = pd.DataFrame({"a": [1]}, dtype="Int8")
409+
expected = pd.DataFrame({"a": [1]}, dtype="int8")
410+
result = pd.api.interchange.from_dataframe(df.__dataframe__())
411+
tm.assert_frame_equal(result, expected)
412+
413+
414+
@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664")
415+
def test_nullable_integers_pyarrow() -> None:
411416
# https://github.com/pandas-dev/pandas/issues/55069
412-
df = pd.DataFrame({"a": [1]}, dtype=dtype)
417+
df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]")
413418
expected = pd.DataFrame({"a": [1]}, dtype="int8")
414419
result = pd.api.interchange.from_dataframe(df.__dataframe__())
415420
tm.assert_frame_equal(result, expected)
416421

417422

423+
@pytest.mark.parametrize(
424+
("data", "dtype", "expected_dtype"),
425+
[
426+
([1, 2, None], "Int64", "int64"),
427+
(
428+
[1, 2, None],
429+
"UInt64",
430+
"uint64",
431+
),
432+
([1.0, 2.25, None], "Float32", "float32"),
433+
],
434+
)
435+
def test_pandas_nullable_w_missing_values(
436+
data: list, dtype: str, expected_dtype: str
437+
) -> None:
438+
# https://github.com/pandas-dev/pandas/issues/57643
439+
pytest.importorskip("pyarrow", "11.0.0")
440+
import pyarrow.interchange as pai
441+
442+
df = pd.DataFrame({"a": data}, dtype=dtype)
443+
result = pai.from_dataframe(df.__dataframe__())["a"]
444+
assert result.type == expected_dtype
445+
assert result[0].as_py() == data[0]
446+
assert result[1].as_py() == data[1]
447+
assert result[2].as_py() is None
448+
449+
418450
def test_empty_dataframe():
419451
# https://github.com/pandas-dev/pandas/issues/56700
420452
df = pd.DataFrame({"a": []}, dtype="int8")

0 commit comments

Comments
 (0)