From 9876c64cd6bf1c89c6122f17bdba2acac1340ca7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 21 Sep 2023 12:54:06 +0200 Subject: [PATCH 1/9] use buffer dtype in interchange from_dataframe --- pandas/core/interchange/from_dataframe.py | 60 ++++++++++++++--------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 214fbf9f36435..f5b1281c6523d 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -266,21 +266,29 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: assert buffers["offsets"], "String buffers must contain offsets" # Retrieve the data buffer containing the UTF-8 code units - data_buff, protocol_data_dtype = buffers["data"] - # We're going to reinterpret the buffer as uint8, so make sure we can do it safely - assert protocol_data_dtype[1] == 8 - assert protocol_data_dtype[2] in ( - ArrowCTypes.STRING, - ArrowCTypes.LARGE_STRING, - ) # format_str == utf-8 - # Convert the buffers to NumPy arrays. In order to go from STRING to - # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) - data_dtype = ( - DtypeKind.UINT, - 8, - ArrowCTypes.UINT8, - Endianness.NATIVE, - ) + data_buff, data_dtype = buffers["data"] + + if (data_dtype[1] == 8) and ( + data_dtype[2] + in ( + ArrowCTypes.STRING, + ArrowCTypes.LARGE_STRING, + ) + ): # format_str == utf-8 + # temporary workaround to keep backwards compatibility due to + # https://github.com/pandas-dev/pandas/issues/54781 + + # We're going to reinterpret the buffer as uint8, so make sure we can do it + # safely + + # Convert the buffers to NumPy arrays. In order to go from STRING to + # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) + data_dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + Endianness.NATIVE, + ) # Specify zero offset as we don't want to chunk the string data data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize) @@ -378,16 +386,22 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any buffers = col.get_buffers() _, _, format_str, _ = col.dtype - dbuf, dtype = buffers["data"] - # Consider dtype being `uint` to get number of units passed since the 01.01.1970 - data = buffer_to_ndarray( - dbuf, - ( + dbuf, data_dtype = buffers["data"] + + if data_dtype[0] == DtypeKind.DATETIME: + # temporary workaround to keep backwards compatibility due to + # https://github.com/pandas-dev/pandas/issues/54781 + # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + data_dtype = ( DtypeKind.UINT, - dtype[1], - getattr(ArrowCTypes, f"UINT{dtype[1]}"), + data_dtype[1], + getattr(ArrowCTypes, f"UINT{data_dtype[1]}"), Endianness.NATIVE, - ), + ) + + data = buffer_to_ndarray( + dbuf, + data_dtype, offset=col.offset, length=col.size(), ) From 00ad9c7826080a9454d462a2377c21ac66d24c98 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 7 Oct 2023 20:30:09 +0300 Subject: [PATCH 2/9] wip --- pandas/core/interchange/from_dataframe.py | 6 +++--- pandas/tests/interchange/test_impl.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index f5b1281c6523d..515342916fe33 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -391,11 +391,11 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any if data_dtype[0] == DtypeKind.DATETIME: # temporary workaround to keep backwards compatibility due to # https://github.com/pandas-dev/pandas/issues/54781 - # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + # Consider dtype being `int` to get number of units passed since 1970-01-01 data_dtype = ( - DtypeKind.UINT, + DtypeKind.INT, data_dtype[1], - getattr(ArrowCTypes, f"UINT{data_dtype[1]}"), + getattr(ArrowCTypes, f"INT{data_dtype[1]}"), Endianness.NATIVE, ) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 8a25a2c1889f3..63a5f7cb3da87 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -14,6 +14,7 @@ DtypeKind, ) from pandas.core.interchange.from_dataframe import from_dataframe +from pandas.core.interchange.utils import ArrowCTypes @pytest.fixture @@ -326,3 +327,17 @@ def test_interchange_from_non_pandas_tz_aware(): dtype="datetime64[us, Asia/Kathmandu]", ) tm.assert_frame_equal(expected, result) + + +def test_interchange_from_corrected_buffer_dtypes() -> None: + df = pd.DataFrame({"ts": [datetime(2020, 1, 1), datetime(2020, 1, 2)]}) + interchange = df.__dataframe__() + column = interchange.get_column_by_name("ts") + buffer_dtype = column.get_buffers()["data"][1] + buffer_dtype = ( + DtypeKind.INT, + buffer_dtype[1], + ArrowCTypes.INT64, + buffer_dtype[3], + ) + pd.api.interchange.from_dataframe(df) From d54f950e60a65c853856c620cdb0f391447462f4 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 7 Oct 2023 20:36:42 +0300 Subject: [PATCH 3/9] wip --- pandas/tests/interchange/test_impl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 63a5f7cb3da87..90d8a5d6d94fd 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -329,15 +329,19 @@ def test_interchange_from_non_pandas_tz_aware(): tm.assert_frame_equal(expected, result) -def test_interchange_from_corrected_buffer_dtypes() -> None: - df = pd.DataFrame({"ts": [datetime(2020, 1, 1), datetime(2020, 1, 2)]}) +def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: + df = pd.DataFrame({"a": ["foo", "bar"]}) interchange = df.__dataframe__() - column = interchange.get_column_by_name("ts") - buffer_dtype = column.get_buffers()["data"][1] + column = interchange.get_column_by_name("a") + buffers = column.get_buffers() + buffers_data = buffers["data"] + buffer_dtype = buffers_data[1] buffer_dtype = ( DtypeKind.INT, buffer_dtype[1], ArrowCTypes.INT64, buffer_dtype[3], ) + buffers["data"] = (buffers_data[0], buffer_dtype) + monkeypatch.setattr(column, "get_buffers", lambda: buffers) pd.api.interchange.from_dataframe(df) From adeceb90e967b54c5817cf273bf4902d6ffa0fce Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 7 Oct 2023 20:54:03 +0300 Subject: [PATCH 4/9] add failing test --- pandas/tests/interchange/test_impl.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 90d8a5d6d94fd..db163397ade67 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -330,18 +330,21 @@ def test_interchange_from_non_pandas_tz_aware(): def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: - df = pd.DataFrame({"a": ["foo", "bar"]}) + # https://github.com/pandas-dev/pandas/issues/54781 + df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__() interchange = df.__dataframe__() column = interchange.get_column_by_name("a") buffers = column.get_buffers() buffers_data = buffers["data"] buffer_dtype = buffers_data[1] buffer_dtype = ( - DtypeKind.INT, - buffer_dtype[1], - ArrowCTypes.INT64, + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, buffer_dtype[3], ) buffers["data"] = (buffers_data[0], buffer_dtype) - monkeypatch.setattr(column, "get_buffers", lambda: buffers) + column.get_buffers = lambda: buffers + interchange.get_column_by_name = lambda _: column + monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) pd.api.interchange.from_dataframe(df) From 3557b4aa26a99bfd55ce5065748fe17b5a998835 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 10 Oct 2023 12:31:48 +0300 Subject: [PATCH 5/9] wip --- pandas/core/interchange/from_dataframe.py | 61 ++++++++++------------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 515342916fe33..ae675a3009319 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -266,29 +266,24 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: assert buffers["offsets"], "String buffers must contain offsets" # Retrieve the data buffer containing the UTF-8 code units - data_buff, data_dtype = buffers["data"] - - if (data_dtype[1] == 8) and ( - data_dtype[2] - in ( - ArrowCTypes.STRING, - ArrowCTypes.LARGE_STRING, - ) - ): # format_str == utf-8 - # temporary workaround to keep backwards compatibility due to - # https://github.com/pandas-dev/pandas/issues/54781 - - # We're going to reinterpret the buffer as uint8, so make sure we can do it - # safely - - # Convert the buffers to NumPy arrays. In order to go from STRING to - # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) - data_dtype = ( - DtypeKind.UINT, - 8, - ArrowCTypes.UINT8, - Endianness.NATIVE, - ) + data_buff, _ = buffers["data"] + + assert col.dtype[2] in ( + ArrowCTypes.STRING, + ArrowCTypes.LARGE_STRING, + ) # format_str == utf-8 + + # We're going to reinterpret the buffer as uint8, so make sure we can do it + # safely + + # Convert the buffers to NumPy arrays. In order to go from STRING to + # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) + data_dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + Endianness.NATIVE, + ) # Specify zero offset as we don't want to chunk the string data data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize) @@ -386,22 +381,18 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any buffers = col.get_buffers() _, _, format_str, _ = col.dtype - dbuf, data_dtype = buffers["data"] + dbuf, _ = buffers["data"] - if data_dtype[0] == DtypeKind.DATETIME: - # temporary workaround to keep backwards compatibility due to - # https://github.com/pandas-dev/pandas/issues/54781 - # Consider dtype being `int` to get number of units passed since 1970-01-01 - data_dtype = ( - DtypeKind.INT, - data_dtype[1], - getattr(ArrowCTypes, f"INT{data_dtype[1]}"), - Endianness.NATIVE, - ) + # Consider dtype being `int` to get number of units passed since 1970-01-01 data = buffer_to_ndarray( dbuf, - data_dtype, + dtype=( + DtypeKind.INT, + col.dtype[1], + getattr(ArrowCTypes, f"INT{col.dtype[1]}"), + Endianness.NATIVE, + ), offset=col.offset, length=col.size(), ) From 0ef179ab39c8a8b1e02bb909b5b9f8f9f3a35848 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 10 Oct 2023 12:36:04 +0300 Subject: [PATCH 6/9] simplify --- pandas/core/interchange/from_dataframe.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index ae675a3009319..1a6fbf609f3da 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -267,15 +267,13 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: assert buffers["offsets"], "String buffers must contain offsets" # Retrieve the data buffer containing the UTF-8 code units data_buff, _ = buffers["data"] - + # We're going to reinterpret the buffer as uint8, so make sure we can do it safely + assert col.dtype[1] == 8 assert col.dtype[2] in ( ArrowCTypes.STRING, ArrowCTypes.LARGE_STRING, ) # format_str == utf-8 - # We're going to reinterpret the buffer as uint8, so make sure we can do it - # safely - # Convert the buffers to NumPy arrays. In order to go from STRING to # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) data_dtype = ( @@ -382,12 +380,11 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any _, _, format_str, _ = col.dtype dbuf, _ = buffers["data"] - - # Consider dtype being `int` to get number of units passed since 1970-01-01 + # Consider dtype being `uint` to get number of units passed since the 01.01.1970 data = buffer_to_ndarray( dbuf, - dtype=( + ( DtypeKind.INT, col.dtype[1], getattr(ArrowCTypes, f"INT{col.dtype[1]}"), From df996ac2107352beb3e1abd091af4a187bda0885 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 10 Oct 2023 12:37:41 +0300 Subject: [PATCH 7/9] remove unnecessary assertion --- pandas/core/interchange/from_dataframe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 1a6fbf609f3da..8b19cb3ec8ef2 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -268,12 +268,10 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # Retrieve the data buffer containing the UTF-8 code units data_buff, _ = buffers["data"] # We're going to reinterpret the buffer as uint8, so make sure we can do it safely - assert col.dtype[1] == 8 assert col.dtype[2] in ( ArrowCTypes.STRING, ArrowCTypes.LARGE_STRING, ) # format_str == utf-8 - # Convert the buffers to NumPy arrays. In order to go from STRING to # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) data_dtype = ( From 0bea19bb37420ebac63dadc6ef2da6ed1873316b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 10 Oct 2023 13:06:39 +0300 Subject: [PATCH 8/9] dont double-extract bit width --- pandas/core/interchange/from_dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 8b19cb3ec8ef2..9504e4fd5cc3e 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -376,7 +376,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any """ buffers = col.get_buffers() - _, _, format_str, _ = col.dtype + _, col_bit_width, format_str, _ = col.dtype dbuf, _ = buffers["data"] # Consider dtype being `uint` to get number of units passed since the 01.01.1970 @@ -384,7 +384,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any dbuf, ( DtypeKind.INT, - col.dtype[1], + col_bit_width, getattr(ArrowCTypes, f"INT{col.dtype[1]}"), Endianness.NATIVE, ), From d04ac929242f318a594da79b1ffa574a6a6ae6b0 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 10 Oct 2023 13:50:32 +0300 Subject: [PATCH 9/9] Update pandas/core/interchange/from_dataframe.py Co-authored-by: Stijn de Gooijer --- pandas/core/interchange/from_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 9504e4fd5cc3e..d45ae37890ba7 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -385,7 +385,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any ( DtypeKind.INT, col_bit_width, - getattr(ArrowCTypes, f"INT{col.dtype[1]}"), + getattr(ArrowCTypes, f"INT{col_bit_width}"), Endianness.NATIVE, ), offset=col.offset,