Skip to content

Commit 96840d8

Browse files
authored
Merge branch 'main' into upgrade-ray-2.30
2 parents 2190bac + a474296 commit 96840d8

File tree

5 files changed

+80
-10
lines changed

5 files changed

+80
-10
lines changed

awswrangler/athena/_executions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def start_query_execution(
104104
If cached results are valid, awswrangler ignores the `ctas_approach`, `s3_output`, `encryption`, `kms_key`,
105105
`keep_files` and `ctas_temp_table_name` params.
106106
If reading cached data fails for any reason, execution falls back to the usual query run path.
107-
athena_query_wait_polling_delay: float, default: 0.25 seconds
107+
athena_query_wait_polling_delay: float, default: 1.0 seconds
108108
Interval in seconds for how often the function will check if the Athena query has completed.
109109
data_source : str, optional
110110
Data Source / Catalog name. If None, 'AwsDataCatalog' will be used by default.
@@ -211,7 +211,7 @@ def wait_query(
211211
Athena query execution ID.
212212
boto3_session : boto3.Session(), optional
213213
Boto3 Session. The default boto3 session will be used if boto3_session receives None.
214-
athena_query_wait_polling_delay: float, default: 0.25 seconds
214+
athena_query_wait_polling_delay: float, default: 1.0 seconds
215215
Interval in seconds for how often the function will check if the Athena query has completed.
216216
217217
Returns

awswrangler/athena/_read.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -705,7 +705,7 @@ def get_query_results(
705705
Forwarded to `to_pandas` method converting from PyArrow tables to Pandas DataFrame.
706706
Valid values include "split_blocks", "self_destruct", "ignore_metadata".
707707
e.g. pyarrow_additional_kwargs={'split_blocks': True}.
708-
athena_query_wait_polling_delay: float, default: 0.25 seconds
708+
athena_query_wait_polling_delay: float, default: 1.0 seconds
709709
Interval in seconds for how often the function will check if the Athena query has completed.
710710
711711
Returns
@@ -960,7 +960,7 @@ def read_sql_query(
960960
If reading cached data fails for any reason, execution falls back to the usual query run path.
961961
data_source : str, optional
962962
Data Source / Catalog name. If None, 'AwsDataCatalog' will be used by default.
963-
athena_query_wait_polling_delay: float, default: 0.25 seconds
963+
athena_query_wait_polling_delay: float, default: 1.0 seconds
964964
Interval in seconds for how often the function will check if the Athena query has completed.
965965
params: Dict[str, Any] | List[str], optional
966966
Parameters that will be used for constructing the SQL query.
@@ -1426,7 +1426,7 @@ def unload(
14261426
14271427
- ``named``
14281428
- ``qmark``
1429-
athena_query_wait_polling_delay: float, default: 0.25 seconds
1429+
athena_query_wait_polling_delay: float, default: 1.0 seconds
14301430
Interval in seconds for how often the function will check if the Athena query has completed.
14311431
14321432
Returns

awswrangler/athena/_utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,7 @@ def repair_table(
506506
None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
507507
kms_key : str, optional
508508
For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
509-
athena_query_wait_polling_delay: float, default: 0.25 seconds
509+
athena_query_wait_polling_delay: float, default: 1.0 seconds
510510
Interval in seconds for how often the function will check if the Athena query has completed.
511511
boto3_session : boto3.Session(), optional
512512
Boto3 Session. The default boto3 session will be used if boto3_session receives None.
@@ -582,7 +582,7 @@ def describe_table(
582582
None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
583583
kms_key : str, optional
584584
For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
585-
athena_query_wait_polling_delay: float, default: 0.25 seconds
585+
athena_query_wait_polling_delay: float, default: 1.0 seconds
586586
Interval in seconds for how often the function will check if the Athena query has completed.
587587
s3_additional_kwargs : dict[str, Any], optional
588588
Forwarded to botocore requests.
@@ -700,7 +700,7 @@ def create_ctas_table(
700700
Recommended for memory restricted environments.
701701
wait : bool, default False
702702
Whether to wait for the query to finish and return a dictionary with the Query metadata.
703-
athena_query_wait_polling_delay: float, default: 0.25 seconds
703+
athena_query_wait_polling_delay: float, default: 1.0 seconds
704704
Interval in seconds for how often the function will check if the Athena query has completed.
705705
execution_params: List[str], optional [DEPRECATED]
706706
A list of values for the parameters that are used in the SQL query.
@@ -912,7 +912,7 @@ def show_create_table(
912912
None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
913913
kms_key : str, optional
914914
For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
915-
athena_query_wait_polling_delay: float, default: 0.25 seconds
915+
athena_query_wait_polling_delay: float, default: 1.0 seconds
916916
Interval in seconds for how often the function will check if the Athena query has completed.
917917
s3_additional_kwargs: dict[str, Any]
918918
Forwarded to botocore requests.

awswrangler/athena/_write_iceberg.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,9 @@ def to_iceberg(
515515
sql_statement = f"""
516516
MERGE INTO "{database}"."{table}" target
517517
USING "{database}"."{temp_table}" source
518-
ON {' AND '.join([f'target."{x}" = source."{x}"' for x in merge_cols])}
518+
ON {' AND '.join([
519+
f'(target."{x}" = source."{x}" OR (target."{x}" IS NULL AND source."{x}" IS NULL))'
520+
for x in merge_cols])}
519521
{match_condition}
520522
WHEN NOT MATCHED THEN
521523
INSERT ({', '.join([f'"{x}"' for x in df.columns])})

tests/unit/test_athena_iceberg.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,74 @@ def test_athena_to_iceberg_merge_into(path: str, path2: str, glue_database: str,
650650
assert_pandas_equals(df_expected, df_out)
651651

652652

653+
def test_athena_to_iceberg_merge_into_nulls(path: str, path2: str, glue_database: str, glue_table: str) -> None:
    """Check that a merge write treats NULL merge-column values as matching.

    The table is first seeded with rows that contain a missing value in
    ``col1`` or ``col2``. A second write with ``merge_cols=["col1", "col2"]``
    then supplies rows with the same keys (including the ones holding missing
    values) plus one new key. The resulting table is expected to hold the
    matched rows with ``action == "update"`` and the new key with
    ``action == "insert"``, with no duplicated keys.
    """
    column_dtypes = {"col1": "string", "col2": "float64", "action": "string"}

    def _typed_frame(col1, col2, action):
        # Build a frame and pin every column to the dtype the test compares on.
        raw = pd.DataFrame({"col1": col1, "col2": col2, "action": action})
        return raw.astype(column_dtypes)

    def _canonical(frame):
        # Row order coming back from Athena is not fixed, so sort on all columns.
        return frame.sort_values(frame.columns.to_list()).reset_index(drop=True)

    write_kwargs = {
        "database": glue_database,
        "table": glue_table,
        "table_location": path,
        "temp_path": path2,
        "keep_files": False,
    }

    # Initial load: plain write, no merge columns.
    df = _typed_frame(
        col1=["a", "a", "a", np.nan],
        col2=[0.0, 1.1, np.nan, 2.2],
        action=["insert", "insert", "insert", "insert"],
    )
    wr.athena.to_iceberg(df=df, **write_kwargs)

    # Perform MERGE INTO
    df2 = _typed_frame(
        col1=["a", "a", np.nan, "b"],
        col2=[1.1, np.nan, 2.2, 3.3],
        action=["update", "update", "update", "insert"],
    )
    wr.athena.to_iceberg(df=df2, merge_cols=["col1", "col2"], **write_kwargs)

    # Expected output
    df_expected = _typed_frame(
        col1=["a", "a", "a", np.nan, "b"],
        col2=[0.0, 1.1, np.nan, 2.2, 3.3],
        action=["insert", "update", "update", "update", "insert"],
    )

    df_out = wr.athena.read_sql_query(
        sql=f'SELECT * FROM "{glue_table}"',
        database=glue_database,
        ctas_approach=False,
        unload_approach=False,
    )

    assert_pandas_equals(_canonical(df_out), _canonical(df_expected))
719+
720+
653721
def test_athena_to_iceberg_merge_into_ignore(path: str, path2: str, glue_database: str, glue_table: str) -> None:
654722
df = pd.DataFrame({"title": ["Dune", "Fargo"], "year": ["1984", "1996"], "gross": [35_000_000, 60_000_000]})
655723
df["title"] = df["title"].astype("string")

0 commit comments

Comments
 (0)