From 9876c64cd6bf1c89c6122f17bdba2acac1340ca7 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Thu, 21 Sep 2023 12:54:06 +0200
Subject: [PATCH 1/9] use buffer dtype in interchange from_dataframe

---
 pandas/core/interchange/from_dataframe.py | 60 ++++++++++++++---------
 1 file changed, 37 insertions(+), 23 deletions(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 214fbf9f36435..f5b1281c6523d 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -266,21 +266,29 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
 
     assert buffers["offsets"], "String buffers must contain offsets"
     # Retrieve the data buffer containing the UTF-8 code units
-    data_buff, protocol_data_dtype = buffers["data"]
-    # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
-    assert protocol_data_dtype[1] == 8
-    assert protocol_data_dtype[2] in (
-        ArrowCTypes.STRING,
-        ArrowCTypes.LARGE_STRING,
-    )  # format_str == utf-8
-    # Convert the buffers to NumPy arrays. In order to go from STRING to
-    # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
-    data_dtype = (
-        DtypeKind.UINT,
-        8,
-        ArrowCTypes.UINT8,
-        Endianness.NATIVE,
-    )
+    data_buff, data_dtype = buffers["data"]
+
+    if (data_dtype[1] == 8) and (
+        data_dtype[2]
+        in (
+            ArrowCTypes.STRING,
+            ArrowCTypes.LARGE_STRING,
+        )
+    ):  # format_str == utf-8
+        # temporary workaround to keep backwards compatibility due to
+        # https://github.com/pandas-dev/pandas/issues/54781
+
+        # We're going to reinterpret the buffer as uint8, so make sure we can do it
+        # safely
+
+        # Convert the buffers to NumPy arrays. In order to go from STRING to
+        # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
+        data_dtype = (
+            DtypeKind.UINT,
+            8,
+            ArrowCTypes.UINT8,
+            Endianness.NATIVE,
+        )
     # Specify zero offset as we don't want to chunk the string data
     data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
 
@@ -378,16 +386,22 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any
     buffers = col.get_buffers()
 
     _, _, format_str, _ = col.dtype
-    dbuf, dtype = buffers["data"]
-    # Consider dtype being `uint` to get number of units passed since the 01.01.1970
-    data = buffer_to_ndarray(
-        dbuf,
-        (
+    dbuf, data_dtype = buffers["data"]
+
+    if data_dtype[0] == DtypeKind.DATETIME:
+        # temporary workaround to keep backwards compatibility due to
+        # https://github.com/pandas-dev/pandas/issues/54781
+        # Consider dtype being `uint` to get number of units passed since the 01.01.1970
+        data_dtype = (
             DtypeKind.UINT,
-            dtype[1],
-            getattr(ArrowCTypes, f"UINT{dtype[1]}"),
+            data_dtype[1],
+            getattr(ArrowCTypes, f"UINT{data_dtype[1]}"),
             Endianness.NATIVE,
-        ),
+        )
+
+    data = buffer_to_ndarray(
+        dbuf,
+        data_dtype,
         offset=col.offset,
         length=col.size(),
     )

From 00ad9c7826080a9454d462a2377c21ac66d24c98 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sat, 7 Oct 2023 20:30:09 +0300
Subject: [PATCH 2/9] reinterpret datetime buffers as int, not uint; add test stub

---
 pandas/core/interchange/from_dataframe.py |  6 +++---
 pandas/tests/interchange/test_impl.py     | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index f5b1281c6523d..515342916fe33 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -391,11 +391,11 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any
     if data_dtype[0] == DtypeKind.DATETIME:
         # temporary workaround to keep backwards compatibility due to
         # https://github.com/pandas-dev/pandas/issues/54781
-        # Consider dtype being `uint` to get number of units passed since the 01.01.1970
+        # Consider dtype being `int` to get number of units passed since 1970-01-01
         data_dtype = (
-            DtypeKind.UINT,
+            DtypeKind.INT,
             data_dtype[1],
-            getattr(ArrowCTypes, f"UINT{data_dtype[1]}"),
+            getattr(ArrowCTypes, f"INT{data_dtype[1]}"),
             Endianness.NATIVE,
         )
 
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index 8a25a2c1889f3..63a5f7cb3da87 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -14,6 +14,7 @@
     DtypeKind,
 )
 from pandas.core.interchange.from_dataframe import from_dataframe
+from pandas.core.interchange.utils import ArrowCTypes
 
 
 @pytest.fixture
@@ -326,3 +327,17 @@ def test_interchange_from_non_pandas_tz_aware():
         dtype="datetime64[us, Asia/Kathmandu]",
     )
     tm.assert_frame_equal(expected, result)
+
+
+def test_interchange_from_corrected_buffer_dtypes() -> None:
+    df = pd.DataFrame({"ts": [datetime(2020, 1, 1), datetime(2020, 1, 2)]})
+    interchange = df.__dataframe__()
+    column = interchange.get_column_by_name("ts")
+    buffer_dtype = column.get_buffers()["data"][1]
+    buffer_dtype = (
+        DtypeKind.INT,
+        buffer_dtype[1],
+        ArrowCTypes.INT64,
+        buffer_dtype[3],
+    )
+    pd.api.interchange.from_dataframe(df)

From d54f950e60a65c853856c620cdb0f391447462f4 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sat, 7 Oct 2023 20:36:42 +0300
Subject: [PATCH 3/9] test: use string column and monkeypatch get_buffers

---
 pandas/tests/interchange/test_impl.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index 63a5f7cb3da87..90d8a5d6d94fd 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -329,15 +329,19 @@ def test_interchange_from_non_pandas_tz_aware():
     tm.assert_frame_equal(expected, result)
 
 
-def test_interchange_from_corrected_buffer_dtypes() -> None:
-    df = pd.DataFrame({"ts": [datetime(2020, 1, 1), datetime(2020, 1, 2)]})
+def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
+    df = pd.DataFrame({"a": ["foo", "bar"]})
     interchange = df.__dataframe__()
-    column = interchange.get_column_by_name("ts")
-    buffer_dtype = column.get_buffers()["data"][1]
+    column = interchange.get_column_by_name("a")
+    buffers = column.get_buffers()
+    buffers_data = buffers["data"]
+    buffer_dtype = buffers_data[1]
     buffer_dtype = (
         DtypeKind.INT,
         buffer_dtype[1],
         ArrowCTypes.INT64,
         buffer_dtype[3],
     )
+    buffers["data"] = (buffers_data[0], buffer_dtype)
+    monkeypatch.setattr(column, "get_buffers", lambda: buffers)
     pd.api.interchange.from_dataframe(df)

From adeceb90e967b54c5817cf273bf4902d6ffa0fce Mon Sep 17 00:00:00 2001
From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sat, 7 Oct 2023 20:54:03 +0300
Subject: [PATCH 4/9] add failing test

---
 pandas/tests/interchange/test_impl.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index 90d8a5d6d94fd..db163397ade67 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -330,18 +330,21 @@ def test_interchange_from_non_pandas_tz_aware():
 
 
 def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
-    df = pd.DataFrame({"a": ["foo", "bar"]})
+    # https://github.com/pandas-dev/pandas/issues/54781
+    df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()
     interchange = df.__dataframe__()
     column = interchange.get_column_by_name("a")
     buffers = column.get_buffers()
     buffers_data = buffers["data"]
     buffer_dtype = buffers_data[1]
     buffer_dtype = (
-        DtypeKind.INT,
-        buffer_dtype[1],
-        ArrowCTypes.INT64,
+        DtypeKind.UINT,
+        8,
+        ArrowCTypes.UINT8,
         buffer_dtype[3],
     )
     buffers["data"] = (buffers_data[0], buffer_dtype)
-    monkeypatch.setattr(column, "get_buffers", lambda: buffers)
+    column.get_buffers = lambda: buffers
+    interchange.get_column_by_name = lambda _: column
+    monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
     pd.api.interchange.from_dataframe(df)

From 3557b4aa26a99bfd55ce5065748fe17b5a998835 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 10 Oct 2023 12:31:48 +0300
Subject: [PATCH 5/9] derive buffer dtypes from column dtype, ignore buffer dtype

---
 pandas/core/interchange/from_dataframe.py | 61 ++++++++++-------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 515342916fe33..ae675a3009319 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -266,29 +266,24 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
 
     assert buffers["offsets"], "String buffers must contain offsets"
     # Retrieve the data buffer containing the UTF-8 code units
-    data_buff, data_dtype = buffers["data"]
-
-    if (data_dtype[1] == 8) and (
-        data_dtype[2]
-        in (
-            ArrowCTypes.STRING,
-            ArrowCTypes.LARGE_STRING,
-        )
-    ):  # format_str == utf-8
-        # temporary workaround to keep backwards compatibility due to
-        # https://github.com/pandas-dev/pandas/issues/54781
-
-        # We're going to reinterpret the buffer as uint8, so make sure we can do it
-        # safely
-
-        # Convert the buffers to NumPy arrays. In order to go from STRING to
-        # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
-        data_dtype = (
-            DtypeKind.UINT,
-            8,
-            ArrowCTypes.UINT8,
-            Endianness.NATIVE,
-        )
+    data_buff, _ = buffers["data"]
+
+    assert col.dtype[2] in (
+        ArrowCTypes.STRING,
+        ArrowCTypes.LARGE_STRING,
+    )  # format_str == utf-8
+
+    # We're going to reinterpret the buffer as uint8, so make sure we can do it
+    # safely
+
+    # Convert the buffers to NumPy arrays. In order to go from STRING to
+    # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
+    data_dtype = (
+        DtypeKind.UINT,
+        8,
+        ArrowCTypes.UINT8,
+        Endianness.NATIVE,
+    )
     # Specify zero offset as we don't want to chunk the string data
     data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
 
@@ -386,22 +381,18 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any
     buffers = col.get_buffers()
 
     _, _, format_str, _ = col.dtype
-    dbuf, data_dtype = buffers["data"]
+    dbuf, _ = buffers["data"]
 
-    if data_dtype[0] == DtypeKind.DATETIME:
-        # temporary workaround to keep backwards compatibility due to
-        # https://github.com/pandas-dev/pandas/issues/54781
-        # Consider dtype being `int` to get number of units passed since 1970-01-01
-        data_dtype = (
-            DtypeKind.INT,
-            data_dtype[1],
-            getattr(ArrowCTypes, f"INT{data_dtype[1]}"),
-            Endianness.NATIVE,
-        )
+    # Consider dtype being `int` to get number of units passed since 1970-01-01
 
     data = buffer_to_ndarray(
         dbuf,
-        data_dtype,
+        dtype=(
+            DtypeKind.INT,
+            col.dtype[1],
+            getattr(ArrowCTypes, f"INT{col.dtype[1]}"),
+            Endianness.NATIVE,
+        ),
         offset=col.offset,
         length=col.size(),
     )

From 0ef179ab39c8a8b1e02bb909b5b9f8f9f3a35848 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 10 Oct 2023 12:36:04 +0300
Subject: [PATCH 6/9] simplify

---
 pandas/core/interchange/from_dataframe.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index ae675a3009319..1a6fbf609f3da 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -267,15 +267,13 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
     assert buffers["offsets"], "String buffers must contain offsets"
     # Retrieve the data buffer containing the UTF-8 code units
     data_buff, _ = buffers["data"]
-
+    # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
+    assert col.dtype[1] == 8
     assert col.dtype[2] in (
         ArrowCTypes.STRING,
         ArrowCTypes.LARGE_STRING,
     )  # format_str == utf-8
 
-    # We're going to reinterpret the buffer as uint8, so make sure we can do it
-    # safely
-
     # Convert the buffers to NumPy arrays. In order to go from STRING to
     # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
     data_dtype = (
@@ -382,12 +380,11 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any
 
     _, _, format_str, _ = col.dtype
     dbuf, _ = buffers["data"]
-
-    # Consider dtype being `int` to get number of units passed since 1970-01-01
+    # Read the buffer as signed `int`: number of units elapsed since 1970-01-01
 
     data = buffer_to_ndarray(
         dbuf,
-        dtype=(
+        (
             DtypeKind.INT,
             col.dtype[1],
             getattr(ArrowCTypes, f"INT{col.dtype[1]}"),

From df996ac2107352beb3e1abd091af4a187bda0885 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 10 Oct 2023 12:37:41 +0300
Subject: [PATCH 7/9] remove unnecessary assertion

---
 pandas/core/interchange/from_dataframe.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 1a6fbf609f3da..8b19cb3ec8ef2 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -268,12 +268,10 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
     # Retrieve the data buffer containing the UTF-8 code units
     data_buff, _ = buffers["data"]
     # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
-    assert col.dtype[1] == 8
     assert col.dtype[2] in (
         ArrowCTypes.STRING,
         ArrowCTypes.LARGE_STRING,
     )  # format_str == utf-8
-
     # Convert the buffers to NumPy arrays. In order to go from STRING to
     # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
     data_dtype = (

From 0bea19bb37420ebac63dadc6ef2da6ed1873316b Mon Sep 17 00:00:00 2001
From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 10 Oct 2023 13:06:39 +0300
Subject: [PATCH 8/9] dont double-extract bit width

---
 pandas/core/interchange/from_dataframe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 8b19cb3ec8ef2..9504e4fd5cc3e 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -376,7 +376,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any
     """
     buffers = col.get_buffers()
 
-    _, _, format_str, _ = col.dtype
+    _, col_bit_width, format_str, _ = col.dtype
     dbuf, _ = buffers["data"]
     # Read the buffer as signed `int`: number of units elapsed since 1970-01-01
 
@@ -384,7 +384,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any
         dbuf,
         (
             DtypeKind.INT,
-            col.dtype[1],
+            col_bit_width,
             getattr(ArrowCTypes, f"INT{col.dtype[1]}"),
             Endianness.NATIVE,
         ),

From d04ac929242f318a594da79b1ffa574a6a6ae6b0 Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli <marcogorelli@protonmail.com>
Date: Tue, 10 Oct 2023 13:50:32 +0300
Subject: [PATCH 9/9] Update pandas/core/interchange/from_dataframe.py

Co-authored-by: Stijn de Gooijer <stijn@degooijer.io>
---
 pandas/core/interchange/from_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 9504e4fd5cc3e..d45ae37890ba7 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -385,7 +385,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any
         (
             DtypeKind.INT,
             col_bit_width,
-            getattr(ArrowCTypes, f"INT{col.dtype[1]}"),
+            getattr(ArrowCTypes, f"INT{col_bit_width}"),
             Endianness.NATIVE,
         ),
         offset=col.offset,