Skip to content

Commit ace17a3

Browse files
fix: Iceberg schema evolution fails for map, array and struct types (#2755)
1 parent d574220 commit ace17a3

File tree

2 files changed

+62
-1
lines changed

2 files changed

+62
-1
lines changed

awswrangler/_data_types.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,7 @@ def _split_map(s: str) -> list[str]:
308308

309309
def athena2pyarrow(dtype: str) -> pa.DataType: # noqa: PLR0911,PLR0912
310310
"""Athena to PyArrow data types conversion."""
311+
dtype = dtype.strip()
311312
if dtype.startswith(("array", "struct", "map")):
312313
orig_dtype: str = dtype
313314
dtype = dtype.lower().replace(" ", "")
@@ -375,7 +376,7 @@ def athena2pandas(dtype: str, dtype_backend: str | None = None) -> str: # noqa:
375376
return "decimal" if dtype_backend != "pyarrow" else "double[pyarrow]"
376377
if dtype in ("binary", "varbinary"):
377378
return "bytes" if dtype_backend != "pyarrow" else "binary[pyarrow]"
378-
if dtype in ("array", "row", "map"):
379+
if any(dtype.startswith(t) for t in ["array", "row", "map", "struct"]):
379380
return "object"
380381
if dtype == "geometry":
381382
return "string"

tests/unit/test_athena_iceberg.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -924,3 +924,63 @@ def test_to_iceberg_uppercase_columns(
924924
)
925925

926926
assert_pandas_equals(df, df_output)
927+
928+
929+
def test_to_iceberg_fill_missing_columns_with_complex_types(
930+
path: str,
931+
path2: str,
932+
glue_database: str,
933+
glue_table: str,
934+
) -> None:
935+
df_with_col = pd.DataFrame(
936+
{
937+
"partition": [1, 1, 2, 2],
938+
"column2": ["A", "B", "C", "D"],
939+
"map_col": [{"s": "d"}, {"s": "h"}, {"i": "l"}, {}],
940+
"struct_col": [
941+
{"a": "val1", "b": {"c": "val21"}},
942+
{"a": "val1", "b": {"c": None}},
943+
{"a": "val1", "b": None},
944+
{},
945+
],
946+
}
947+
)
948+
df_missing_col = pd.DataFrame(
949+
{
950+
"partition": [2, 2],
951+
"column2": ["Z", "X"],
952+
}
953+
)
954+
955+
glue_dtypes = {
956+
"partition": "int",
957+
"column2": "string",
958+
"map_col": "map<string, string>",
959+
"struct_col": "struct<a: string, b: struct<c: string>>",
960+
}
961+
962+
wr.athena.to_iceberg(
963+
df=df_with_col,
964+
database=glue_database,
965+
table=glue_table,
966+
table_location=path,
967+
temp_path=path2,
968+
keep_files=False,
969+
dtype=glue_dtypes,
970+
mode="overwrite_partitions",
971+
partition_cols=["partition"],
972+
)
973+
974+
wr.athena.to_iceberg(
975+
df=df_missing_col,
976+
database=glue_database,
977+
table=glue_table,
978+
table_location=path,
979+
temp_path=path2,
980+
keep_files=False,
981+
dtype=glue_dtypes,
982+
mode="overwrite_partitions",
983+
partition_cols=["partition"],
984+
schema_evolution=True,
985+
fill_missing_columns_in_df=True,
986+
)

0 commit comments

Comments
 (0)