From cbab7356fb6bc100f4677a40db0b041a33b1cc23 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 26 Jul 2019 10:44:13 -0700 Subject: [PATCH 1/3] BUG: Use object dtype for STRING, ARRAY, and STRUCT columns when there are zero rows. If there are no rows, the default dtype is used (which is now float64, but must previously have been object). --- docs/source/changelog.rst | 6 ++++++ pandas_gbq/gbq.py | 12 +++++++++--- tests/system/test_gbq.py | 20 +++++++++----------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 8cdd5c8a..e6928909 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -10,6 +10,12 @@ Changelog with the pandas package which dropped Python 2 support at the end of 2019. (:issue:`268`) +Implementation changes +~~~~~~~~~~~~~~~~~~~~~~ + +- Use object dtype for ``STRING``, ``ARRAY``, and ``STRUCT`` columns when + there are zero rows. (issue TBD) + .. _changelog-0.10.0: 0.10.0 / 2019-04-05 diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 4ec7f804..4590e1da 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -672,20 +672,26 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): # If you update this mapping, also update the table at # `docs/source/reading.rst`. dtype_map = { + "DATE": "datetime64[ns]", + "DATETIME": "datetime64[ns]", "FLOAT": np.dtype(float), + "GEOMETRY": "object", + "RECORD": "object", + "STRING": "object", + "TIME": "datetime64[ns]", # pandas doesn't support timezone-aware dtype in DataFrame/Series # constructors. It's more idiomatic to localize after construction. # https://github.com/pandas-dev/pandas/issues/25843 "TIMESTAMP": "datetime64[ns]", - "TIME": "datetime64[ns]", - "DATE": "datetime64[ns]", - "DATETIME": "datetime64[ns]", } dtypes = {} for field in schema_fields: name = str(field["name"]) + # Array BigQuery type is represented as an object column containing + # list objects. 
if field["mode"].upper() == "REPEATED": + dtypes[name] = "object" continue dtype = dtype_map.get(field["type"].upper()) diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 33369557..3ebdcaa9 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -577,24 +577,22 @@ def test_download_dataset_larger_than_200k_rows(self, project_id): def test_zero_rows(self, project_id): # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 df = gbq.read_gbq( - "SELECT title, id, is_bot, " - "SEC_TO_TIMESTAMP(timestamp) ts " - "FROM [publicdata:samples.wikipedia] " - "WHERE timestamp=-9999999", + 'SELECT name, number, (mlc_class = "HU") is_hurricane, iso_time ' + "FROM `bigquery-public-data.noaa_hurricanes.hurricanes` " + 'WHERE iso_time = TIMESTAMP("1900-01-01 00:00:00") ', project_id=project_id, credentials=self.credentials, - dialect="legacy", ) empty_columns = { - "title": pandas.Series([], dtype=object), - "id": pandas.Series([], dtype=np.dtype(int)), - "is_bot": pandas.Series([], dtype=np.dtype(bool)), - "ts": pandas.Series([], dtype="datetime64[ns]"), + "name": pandas.Series([], dtype=object), + "number": pandas.Series([], dtype=np.dtype(int)), + "is_hurricane": pandas.Series([], dtype=np.dtype(bool)), + "iso_time": pandas.Series([], dtype="datetime64[ns]"), } expected_result = DataFrame( - empty_columns, columns=["title", "id", "is_bot", "ts"] + empty_columns, columns=["name", "number", "is_hurricane", "iso_time"] ) - expected_result["ts"] = expected_result["ts"].dt.tz_localize("UTC") + expected_result["iso_time"] = expected_result["iso_time"].dt.tz_localize("UTC") tm.assert_frame_equal(df, expected_result, check_index_type=False) def test_one_row_one_column(self, project_id): From b17111d12cbe968e0af6df8c712fa512514b3510 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 26 Jul 2019 10:51:54 -0700 Subject: [PATCH 2/3] Add PR number to changelog. 
--- docs/source/changelog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index e6928909..bdf4413b 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -14,7 +14,7 @@ Implementation changes ~~~~~~~~~~~~~~~~~~~~~~ - Use object dtype for ``STRING``, ``ARRAY``, and ``STRUCT`` columns when - there are zero rows. (issue TBD) + there are zero rows. (:issue:`285`) .. _changelog-0.10.0: From 87db0fcf028e72c96abeba99a8f7712d76d8a20c Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 26 Jul 2019 10:53:13 -0700 Subject: [PATCH 3/3] Blacken --- tests/system/test_gbq.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 3ebdcaa9..6f8ef406 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -590,9 +590,12 @@ def test_zero_rows(self, project_id): "iso_time": pandas.Series([], dtype="datetime64[ns]"), } expected_result = DataFrame( - empty_columns, columns=["name", "number", "is_hurricane", "iso_time"] + empty_columns, + columns=["name", "number", "is_hurricane", "iso_time"], ) - expected_result["iso_time"] = expected_result["iso_time"].dt.tz_localize("UTC") + expected_result["iso_time"] = expected_result[ + "iso_time" + ].dt.tz_localize("UTC") tm.assert_frame_equal(df, expected_result, check_index_type=False) def test_one_row_one_column(self, project_id):