Skip to content

Commit 746115d

Browse files
authored
Fix: Update dataframe.to_gbq to dedup column names. (#286)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:

- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
1 parent 252f3a2 commit 746115d

File tree

2 files changed

+52
-6
lines changed

2 files changed

+52
-6
lines changed

bigframes/dataframe.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2759,26 +2759,28 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame:
27592759
def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str:
27602760
"""Create query text representing this dataframe for I/O."""
27612761
array_value = self._block.expr
2762+
2763+
new_col_labels, new_idx_labels = utils.get_standardized_ids(
2764+
self._block.column_labels, self.index.names
2765+
)
2766+
27622767
columns = list(self._block.value_columns)
2763-
column_labels = list(self._block.column_labels)
2768+
column_labels = new_col_labels
27642769
# This code drops unnamed indexes to keep consistent with the behavior of
27652770
# most pandas write APIs. The exception is `pandas.to_csv`, which keeps
27662771
# unnamed indexes as `Unnamed: 0`.
27672772
# TODO(chelsealin): check if works for multiple indexes.
27682773
if index and self.index.name is not None:
27692774
columns.extend(self._block.index_columns)
2770-
column_labels.extend(self.index.names)
2775+
column_labels.extend(new_idx_labels)
27712776
else:
27722777
array_value = array_value.drop_columns(self._block.index_columns)
27732778

27742779
# Make columns in SQL reflect _labels_ not _ids_. Note: This may use
27752780
# the arbitrary unicode column labels feature in BigQuery, which is
27762781
# currently (June 2023) in preview.
2777-
# TODO(swast): Handle duplicate and NULL labels.
27782782
id_overrides = {
2779-
col_id: col_label
2780-
for col_id, col_label in zip(columns, column_labels)
2781-
if col_label and isinstance(col_label, str)
2783+
col_id: col_label for col_id, col_label in zip(columns, column_labels)
27822784
}
27832785

27842786
if ordering_id is not None:

tests/system/small/test_dataframe_io.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,50 @@ def test_to_gbq_if_exists(
273273
)
274274

275275

276+
def test_to_gbq_w_duplicate_column_names(
277+
scalars_df_index, scalars_pandas_df_index, dataset_id
278+
):
279+
"""Test the `to_gbq` API when dealing with duplicate column names."""
280+
destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"
281+
282+
# Renaming 'int64_too' to 'int64_col', which will result in 'int64_too'
283+
# becoming 'int64_col_1' after deduplication.
284+
scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"})
285+
scalars_df_index.to_gbq(destination_table, if_exists="replace")
286+
287+
bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
288+
289+
pd.testing.assert_series_equal(
290+
scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
291+
)
292+
pd.testing.assert_series_equal(
293+
scalars_pandas_df_index["int64_too"],
294+
bf_result["int64_col_1"],
295+
check_names=False,
296+
)
297+
298+
299+
def test_to_gbq_w_None_column_names(
300+
scalars_df_index, scalars_pandas_df_index, dataset_id
301+
):
302+
"""Test the `to_gbq` API with None as a column name."""
303+
destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"
304+
305+
scalars_df_index = scalars_df_index.rename(columns={"int64_too": None})
306+
scalars_df_index.to_gbq(destination_table, if_exists="replace")
307+
308+
bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
309+
310+
pd.testing.assert_series_equal(
311+
scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
312+
)
313+
pd.testing.assert_series_equal(
314+
scalars_pandas_df_index["int64_too"],
315+
bf_result["bigframes_unnamed_column"],
316+
check_names=False,
317+
)
318+
319+
276320
def test_to_gbq_w_invalid_destination_table(scalars_df_index):
277321
with pytest.raises(ValueError):
278322
scalars_df_index.to_gbq("table_id")

0 commit comments

Comments
 (0)