Fix: Update DataFrame.to_gbq to deduplicate column names. (#286)

Genesis929 · web-flow · commit 746115d5564c · 2023-12-28T20:42:15.000Z
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:

- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -2759,26 +2759,28 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame:
     def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str:
         """Create query text representing this dataframe for I/O."""
         array_value = self._block.expr
+
+        new_col_labels, new_idx_labels = utils.get_standardized_ids(
+            self._block.column_labels, self.index.names
+        )
+
         columns = list(self._block.value_columns)
-        column_labels = list(self._block.column_labels)
+        column_labels = new_col_labels
         # This code drops unnamed indexes to keep consistent with the behavior of
         # most pandas write APIs. The exception is `pandas.to_csv`, which keeps
         # unnamed indexes as `Unnamed: 0`.
         # TODO(chelsealin): check if works for multiple indexes.
         if index and self.index.name is not None:
             columns.extend(self._block.index_columns)
-            column_labels.extend(self.index.names)
+            column_labels.extend(new_idx_labels)
         else:
             array_value = array_value.drop_columns(self._block.index_columns)
 
         # Make columns in SQL reflect _labels_ not _ids_. Note: This may use
         # the arbitrary unicode column labels feature in BigQuery, which is
         # currently (June 2023) in preview.
-        # TODO(swast): Handle duplicate and NULL labels.
         id_overrides = {
-            col_id: col_label
-            for col_id, col_label in zip(columns, column_labels)
-            if col_label and isinstance(col_label, str)
+            col_id: col_label for col_id, col_label in zip(columns, column_labels)
         }
 
         if ordering_id is not None:
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
@@ -273,6 +273,50 @@ def test_to_gbq_if_exists(
     )
 
 
+def test_to_gbq_w_duplicate_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """Test the `to_gbq` API when dealing with duplicate column names."""
+    destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"
+
+    # Renaming 'int64_too' to 'int64_col', which will result in 'int64_too'
+    # becoming 'int64_col_1' after deduplication.
+    scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"})
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+    )
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_too"],
+        bf_result["int64_col_1"],
+        check_names=False,
+    )
+
+
+def test_to_gbq_w_None_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """Test the `to_gbq` API with None as a column name."""
+    destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"
+
+    scalars_df_index = scalars_df_index.rename(columns={"int64_too": None})
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+    )
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_too"],
+        bf_result["bigframes_unnamed_column"],
+        check_names=False,
+    )
+
+
 def test_to_gbq_w_invalid_destination_table(scalars_df_index):
     with pytest.raises(ValueError):
         scalars_df_index.to_gbq("table_id")