BUG: interchange protocol with nullable datatypes and a non-null validity (pandas-dev#57665)

MarcoGorelli · MarcoGorelli · commit 98c206827406 · 2024-03-07T21:05:02.000Z
* BUG: interchange protocol with nullable datatypes and a non-null validity provides nonsense results * whatsnew * 🏷️ typing * parametrise over more types * move whatsnew (cherry picked from commit 03717bc)
diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst
@@ -13,6 +13,7 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py
@@ -190,6 +190,10 @@ def describe_categorical(self):
 
     @property
     def describe_null(self):
+        if isinstance(self._col.dtype, BaseMaskedDtype):
+            column_null_dtype = ColumnNullType.USE_BYTEMASK
+            null_value = 1
+            return column_null_dtype, null_value
         kind = self.dtype[0]
         try:
             null, value = _NULL_DESCRIPTION[kind]
@@ -291,6 +295,14 @@ def _get_data_buffer(
                 np_arr = self._col.dt.tz_convert(None).to_numpy()
             else:
                 np_arr = self._col.to_numpy()
+
+            arr = self._col.array
+            if isinstance(self._col.dtype, BaseMaskedDtype):
+                np_arr = arr._data  # type: ignore[attr-defined]
+            elif isinstance(self._col.dtype, ArrowDtype):
+                raise NotImplementedError("ArrowDtype not handled yet")
+            else:
+                np_arr = arr._ndarray  # type: ignore[attr-defined]
             buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == DtypeKind.CATEGORICAL:
@@ -328,6 +340,12 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
         """
         null, invalid = self.describe_null
 
+        if isinstance(self._col.dtype, BaseMaskedDtype):
+            mask = self._col.array._mask  # type: ignore[attr-defined]
+            buffer = PandasBuffer(mask)
+            dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
+            return buffer, dtype
+
         if self.dtype[0] == DtypeKind.STRING:
             # For now, use byte array as the mask.
             # TODO: maybe store as bit array to save space?..
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
@@ -9,7 +9,6 @@
     is_platform_windows,
 )
 from pandas.compat.numpy import np_version_lt1p23
-import pandas.util._test_decorators as td
 
 import pandas as pd
 import pandas._testing as tm
@@ -404,17 +403,50 @@ def test_non_str_names_w_duplicates():
         pd.api.interchange.from_dataframe(dfi, allow_copy=False)
 
 
-@pytest.mark.parametrize(
-    "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))]
-)
-def test_nullable_integers(dtype: str) -> None:
+def test_nullable_integers() -> None:
+    # https://github.com/pandas-dev/pandas/issues/55069
+    df = pd.DataFrame({"a": [1]}, dtype="Int8")
+    expected = pd.DataFrame({"a": [1]}, dtype="int8")
+    result = pd.api.interchange.from_dataframe(df.__dataframe__())
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664")
+def test_nullable_integers_pyarrow() -> None:
     # https://github.com/pandas-dev/pandas/issues/55069
-    df = pd.DataFrame({"a": [1]}, dtype=dtype)
+    df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]")
     expected = pd.DataFrame({"a": [1]}, dtype="int8")
     result = pd.api.interchange.from_dataframe(df.__dataframe__())
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    ("data", "dtype", "expected_dtype"),
+    [
+        ([1, 2, None], "Int64", "int64"),
+        (
+            [1, 2, None],
+            "UInt64",
+            "uint64",
+        ),
+        ([1.0, 2.25, None], "Float32", "float32"),
+    ],
+)
+def test_pandas_nullable_w_missing_values(
+    data: list, dtype: str, expected_dtype: str
+) -> None:
+    # https://github.com/pandas-dev/pandas/issues/57643
+    pytest.importorskip("pyarrow", "11.0.0")
+    import pyarrow.interchange as pai
+
+    df = pd.DataFrame({"a": data}, dtype=dtype)
+    result = pai.from_dataframe(df.__dataframe__())["a"]
+    assert result.type == expected_dtype
+    assert result[0].as_py() == data[0]
+    assert result[1].as_py() == data[1]
+    assert result[2].as_py() is None
+
+
 def test_empty_dataframe():
     # https://github.com/pandas-dev/pandas/issues/56700
     df = pd.DataFrame({"a": []}, dtype="int8")

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ including other versions of pandas.`
`13`	`13`
`14`	`14`	`Fixed regressions`
`15`	`15`	`~~~~~~~~~~~~~~~~~`
	`16`	+- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`)
`16`	`17`	`-`
`17`	`18`
`18`	`19`	`.. ---------------------------------------------------------------------------`