From 0730f6ac7f261d0e0885e4c94b78df76f4ac974e Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Fri, 28 Sep 2018 15:33:23 -0400
Subject: [PATCH 01/14] parse all datetime types

---
 pandas_gbq/gbq.py        |  8 +++++++-
 tests/system/test_gbq.py | 19 ++++++++++++++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 79cd1aba..d9f208e8 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -577,7 +577,13 @@ def _parse_schema(schema_fields):
     # see:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
-    dtype_map = {"FLOAT": np.dtype(float), "TIMESTAMP": "M8[ns]"}
+    dtype_map = {
+        "FLOAT": np.dtype(float),
+        "TIMESTAMP": "datetime64[ns]",
+        "TIME": "datetime64[ns]",
+        "DATE": "datetime64[ns]",
+        "DATETIME": "datetime64[ns]",
+    }
 
     for field in schema_fields:
         name = str(field["name"])
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index ba85b4c2..ced52fd0 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 
 import sys
-from datetime import datetime
 import uuid
+from datetime import datetime
 
 import numpy as np
 import pandas.util.testing as tm
@@ -338,6 +338,23 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
             ),
         )
 
+    @pytest.mark.parametrize(
+        "date_type", ["DATE", "DATETIME", "TIMESTAMP", "TIME"]
+    )
+    def test_should_properly_handle_all_timestamp_types(
+        self, project_id, date_type
+    ):
+        query = 'SELECT {typ}("2004-09-15") AS valid_timestamp'.format(
+            date_type
+        )
+        df = gbq.read_gbq(
+            query,
+            project_id=project_id,
+            private_key=self.credentials,
+            dialect="legacy",
+        )
+        assert df["valid_timestamp"].dtype == "<M8[ns]"
+

From: Maximilian Roos
Date: Fri, 28 Sep 2018 15:42:07 -0400
Subject: [PATCH 02/14] typo

---
 tests/system/test_gbq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index ced52fd0..37de8c19 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -345,7 +345,7 @@ def test_should_properly_handle_all_timestamp_types(
         self, project_id, date_type
     ):
         query = 'SELECT {typ}("2004-09-15") AS valid_timestamp'.format(
-            date_type
+            typ=date_type
         )
         df = gbq.read_gbq(
             query,
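A note on PATCH 01: every BigQuery date/time flavour now maps to numpy's nanosecond datetime64, and the "<M8[ns]" string in the test is simply the little-endian spelling of that same dtype. A minimal standalone sketch of the mapping (the schema_fields list is made up for illustration; this is not the library code verbatim):

    import numpy as np

    # The dtype map from PATCH 01. "datetime64[ns]" and "<M8[ns]" name
    # the same numpy dtype, which is what the test's assert relies on.
    dtype_map = {
        "FLOAT": np.dtype(float),
        "TIMESTAMP": "datetime64[ns]",
        "TIME": "datetime64[ns]",
        "DATE": "datetime64[ns]",
        "DATETIME": "datetime64[ns]",
    }
    assert np.dtype("datetime64[ns]") == np.dtype("<M8[ns]")

    # Hypothetical fields in the shape a BigQuery schema takes.
    schema_fields = [
        {"name": "ts", "type": "TIMESTAMP", "mode": "NULLABLE"},
        {"name": "day", "type": "DATE", "mode": "NULLABLE"},
    ]
    for field in schema_fields:
        print(field["name"], dtype_map.get(field["type"].upper(), object))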
From a98f998aca09e771714cb8ae21b545f7bb3f862f Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Fri, 28 Sep 2018 15:50:12 -0400
Subject: [PATCH 03/14] new black version

---
 pandas_gbq/gbq.py         |  2 +-
 tests/system/test_gbq.py  | 12 ++++++------
 tests/unit/test_schema.py |  1 -
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index d9f208e8..b39489e0 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -283,7 +283,7 @@ def __init__(
 
         # BQ Queries costs $5 per TB. First 1 TB per month is free
         # see here for more: https://cloud.google.com/bigquery/pricing
-        self.query_price_for_TB = 5. / 2 ** 40  # USD/TB
+        self.query_price_for_TB = 5.0 / 2 ** 40  # USD/TB
 
     def _start_timer(self):
         self.start = time.time()
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index 37de8c19..cf299ce1 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -758,12 +758,12 @@ def test_query_response_bytes(self):
         assert self.gbq_connector.sizeof_fmt(1048576) == "1.0 MB"
         assert self.gbq_connector.sizeof_fmt(1048576000) == "1000.0 MB"
         assert self.gbq_connector.sizeof_fmt(1073741824) == "1.0 GB"
-        assert self.gbq_connector.sizeof_fmt(1.099512E12) == "1.0 TB"
-        assert self.gbq_connector.sizeof_fmt(1.125900E15) == "1.0 PB"
-        assert self.gbq_connector.sizeof_fmt(1.152922E18) == "1.0 EB"
-        assert self.gbq_connector.sizeof_fmt(1.180592E21) == "1.0 ZB"
-        assert self.gbq_connector.sizeof_fmt(1.208926E24) == "1.0 YB"
-        assert self.gbq_connector.sizeof_fmt(1.208926E28) == "10000.0 YB"
+        assert self.gbq_connector.sizeof_fmt(1.099512e12) == "1.0 TB"
+        assert self.gbq_connector.sizeof_fmt(1.125900e15) == "1.0 PB"
+        assert self.gbq_connector.sizeof_fmt(1.152922e18) == "1.0 EB"
+        assert self.gbq_connector.sizeof_fmt(1.180592e21) == "1.0 ZB"
+        assert self.gbq_connector.sizeof_fmt(1.208926e24) == "1.0 YB"
+        assert self.gbq_connector.sizeof_fmt(1.208926e28) == "10000.0 YB"
 
     def test_struct(self, project_id):
         query = """SELECT 1 int_field,
diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
index 66aca1dc..74f22f29 100644
--- a/tests/unit/test_schema.py
+++ b/tests/unit/test_schema.py
@@ -1,4 +1,3 @@
-
 import datetime
 
 import pandas

From 28430618aef552509106a6497962d162b35917a5 Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Fri, 28 Sep 2018 18:21:51 -0400
Subject: [PATCH 04/14] I think we're doing similar things twice

---
 pandas_gbq/gbq.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index b39489e0..83af008c 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -2,7 +2,6 @@
 import os
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
@@ -583,6 +582,8 @@ def _parse_schema(schema_fields):
         "TIME": "datetime64[ns]",
         "DATE": "datetime64[ns]",
         "DATETIME": "datetime64[ns]",
+        "BOOLEAN": bool,
+        "INTEGER": np.int64,
     }
 
     for field in schema_fields:
@@ -590,17 +591,19 @@ def _parse_schema(schema_fields):
         if field["mode"].upper() == "REPEATED":
             yield name, object
         else:
-            dtype = dtype_map.get(field["type"].upper(), object)
+            dtype = dtype_map.get(field["type"].upper())
             yield name, dtype
 
 
 def _parse_data(schema, rows):
 
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
+    column_dtypes = dict(_parse_schema(schema["fields"]))
 
     df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
 
     for column in df:
-        df[column] = df[column].astype(column_dtypes[column])
+        dtype = column_dtypes[column]
+        if dtype:
+            df[column] = df[column].astype(column_dtypes[column])
 
     return df
@@ -755,16 +758,16 @@ def read_gbq(
 
     # cast BOOLEAN and INTEGER columns from object to bool/int
     # if they dont have any nulls AND field mode is not repeated (i.e., array)
-    type_map = {"BOOLEAN": bool, "INTEGER": np.int64}
-    for field in schema["fields"]:
-        if (
-            field["type"].upper() in type_map
-            and final_df[field["name"]].notnull().all()
-            and field["mode"].lower() != "repeated"
-        ):
-            final_df[field["name"]] = final_df[field["name"]].astype(
-                type_map[field["type"].upper()]
-            )
+    # type_map = {"BOOLEAN": bool, "INTEGER": np.int64}
+    # for field in schema["fields"]:
+    #     if (
+    #         field["type"].upper() in type_map
+    #         and final_df[field["name"]].notnull().all()
+    #         and field["mode"].lower() != "repeated"
+    #     ):
+    #         final_df[field["name"]] = final_df[field["name"]].astype(
+    #             type_map[field["type"].upper()]
+    #         )
 
     connector.log_elapsed_seconds(
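PATCH 04 is the pivot of the series: dtype decisions move into _parse_schema, so _parse_data casts every column in a single pass and read_gbq's separate BOOLEAN/INTEGER fix-up becomes redundant. A rough standalone sketch of the consolidated flow (hypothetical schema and rows; the REPEATED-mode branch is omitted for brevity):

    import numpy as np
    from pandas import DataFrame

    def parse_schema(schema_fields):
        # One pass decides each column's target dtype; None means
        # "no safe cast known -- leave the column alone".
        dtype_map = {
            "FLOAT": np.dtype(float),
            "TIMESTAMP": "datetime64[ns]",
            "BOOLEAN": bool,
            "INTEGER": np.int64,
        }
        for field in schema_fields:
            yield str(field["name"]), dtype_map.get(field["type"].upper())

    # Stand-ins for a BigQuery response.
    schema = {"fields": [{"name": "n", "type": "INTEGER"},
                         {"name": "s", "type": "STRING"}]}
    rows = [[1, "a"], [2, "b"]]

    column_dtypes = dict(parse_schema(schema["fields"]))
    df = DataFrame(rows, columns=column_dtypes.keys())
    for column in df:
        dtype = column_dtypes[column]
        if dtype:  # STRING maps to None, so it stays object
            df[column] = df[column].astype(dtype)
    print(df.dtypes)  # n -> int64, s -> object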
From 541e96f63fb1a0fcfb20620c28b858d85481a90f Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Fri, 28 Sep 2018 18:22:06 -0400
Subject: [PATCH 05/14] better type tests

---
 tests/system/test_gbq.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index cf299ce1..42b66989 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -339,21 +339,30 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
 
     @pytest.mark.parametrize(
-        "date_type", ["DATE", "DATETIME", "TIMESTAMP", "TIME"]
+        "expression, type_",
+        [
+            ("current_date()", "<M8[ns]"),

From: Maximilian Roos
Date: Fri, 28 Sep 2018 18:31:51 -0400
Subject: [PATCH 06/14] check nulls before assigning type

---
 pandas_gbq/gbq.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 83af008c..e4fb878d 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -603,7 +603,9 @@ def _parse_data(schema, rows):
 
     for column in df:
         dtype = column_dtypes[column]
         if dtype:
-            df[column] = df[column].astype(column_dtypes[column])
+            df[column] = df[column].astype(
+                column_dtypes[column], errors="ignore"
+            )
 
     return df

From 9cbeb1f99993b904d01a6138b6e4ba2841c8cfb2 Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Fri, 28 Sep 2018 18:33:39 -0400
Subject: [PATCH 07/14] add env to gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index f0dd6fbd..9fb09906 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@
 .pytest_cache
 .testmon*
 .vscode/
+.env
 
 # Docs #
 ########

From dd276be316b7a20b59e0e69fc84438e33e9850e2 Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Fri, 28 Sep 2018 18:34:11 -0400
Subject: [PATCH 08/14] remove old code

---
 pandas_gbq/gbq.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index e4fb878d..5dc8bc86 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -758,19 +758,6 @@ def read_gbq(
                 "Column order does not match this DataFrame."
             )
 
-    # cast BOOLEAN and INTEGER columns from object to bool/int
-    # if they dont have any nulls AND field mode is not repeated (i.e., array)
-    # type_map = {"BOOLEAN": bool, "INTEGER": np.int64}
-    # for field in schema["fields"]:
-    #     if (
-    #         field["type"].upper() in type_map
-    #         and final_df[field["name"]].notnull().all()
-    #         and field["mode"].lower() != "repeated"
-    #     ):
-    #         final_df[field["name"]] = final_df[field["name"]].astype(
-    #             type_map[field["type"].upper()]
-    #         )
-
     connector.log_elapsed_seconds(
         "Total time taken",
         datetime.now().strftime("s.\nFinished at %Y-%m-%d %H:%M:%S."),
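Why PATCH 06 passes errors="ignore": casting an object column that still contains None to a strict dtype raises, and errors="ignore" makes astype hand the column back unchanged instead of failing the whole read. A small sketch of the behaviour (with the pandas of the time; newer pandas deprecates errors="ignore"):

    import pandas as pd

    s = pd.Series([1, None], dtype=object)

    # A strict cast fails: None has no int64 representation.
    try:
        s.astype("int64")
    except (TypeError, ValueError) as err:
        print("raised:", err)

    # With errors="ignore" the original column comes back untouched, so
    # a null-bearing INTEGER column simply stays object dtype.
    print(s.astype("int64", errors="ignore").dtype)  # object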
From a933d7dc9bcbd6995d0bc91cb6bc057653f7e68b Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Fri, 28 Sep 2018 19:21:52 -0400
Subject: [PATCH 09/14] handle float and int columns re nulls

---
 pandas_gbq/gbq.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 5dc8bc86..8e96a935 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -598,11 +598,16 @@ def _parse_data(schema, rows):
     column_dtypes = dict(_parse_schema(schema["fields"]))
-
     df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
+
     for column in df:
         dtype = column_dtypes[column]
-        if dtype:
+        null_safe = (
+            df[column].notnull().all()
+            or dtype == float
+            or dtype == "datetime64[ns]"
+        )
+        if dtype and null_safe:
             df[column] = df[column].astype(
                 column_dtypes[column], errors="ignore"
             )
 
     return df

From c4c8cac7227917303927473bf2b31b66e8e4ac9e Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Fri, 28 Sep 2018 19:22:39 -0400
Subject: [PATCH 10/14] nullable int columns as floats (separate issue)

---
 tests/system/test_gbq.py | 28 ++++------------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index 42b66989..96a0eec8 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -200,9 +200,7 @@ def test_should_properly_handle_nullable_integers(self, project_id):
             private_key=self.credentials,
             dialect="legacy",
         )
-        tm.assert_frame_equal(
-            df, DataFrame({"nullable_integer": [1, None]}).astype(object)
-        )
+        tm.assert_frame_equal(df, DataFrame({"nullable_integer": [1, None]}))
 
     def test_should_properly_handle_valid_longs(self, project_id):
         query = "SELECT 1 << 62 AS valid_long"
@@ -225,7 +223,7 @@ def test_should_properly_handle_nullable_longs(self, project_id):
             dialect="legacy",
         )
         tm.assert_frame_equal(
-            df, DataFrame({"nullable_long": [1 << 62, None]}).astype(object)
+            df, DataFrame({"nullable_long": [1 << 62, None]})
         )
 
     def test_should_properly_handle_null_integers(self, project_id):
@@ -344,6 +342,8 @@
             ("current_date()", "<M8[ns]"),

From: Maximilian Roos
Date: Fri, 28 Sep 2018 19:36:41 -0400
Subject: [PATCH 11/14] Chesterton's Fence

---
 pandas_gbq/gbq.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 8e96a935..01cf55c1 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -2,6 +2,7 @@
 import os
 import time
 import warnings
+from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
@@ -597,7 +598,7 @@ def _parse_schema(schema_fields):
 
 
 def _parse_data(schema, rows):
-    column_dtypes = dict(_parse_schema(schema["fields"]))
+    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
     df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
 
     for column in df:
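The null_safe test in PATCH 09 encodes a pandas fact: float and datetime64[ns] have native missing-value markers (NaN and NaT), so those casts can never lose a NULL, while int64 has none — which is why patches 09-10 let a nullable INTEGER column surface as float rather than object. A small illustration:

    import pandas as pd

    # None becomes NaN under float inference...
    print(pd.Series([1, None]).dtype)  # float64

    # ...and NaT under datetime conversion.
    print(pd.to_datetime(pd.Series(["2004-09-15", None])))

    # int64 has no missing-value slot, so an integer cast is only safe
    # when notnull().all() holds.
    print(pd.Series([1, 2]).astype("int64").dtype)  # int64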
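And the Chesterton's Fence of PATCH 11: column_dtypes.keys() supplies the DataFrame's column order, and on the Pythons supported at the time a plain dict only preserves insertion order as a CPython 3.6 implementation detail (a language guarantee only from 3.7), whereas OrderedDict guarantees it everywhere. A one-liner of the invariant being protected:

    from collections import OrderedDict

    # Schema order must survive: these keys become the DataFrame columns.
    column_dtypes = OrderedDict(
        [("ts", "datetime64[ns]"), ("n", "int64"), ("x", "float64")]
    )
    assert list(column_dtypes) == ["ts", "n", "x"]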
From f05f006b813cc478b374814dbdb7ff34365f1a94 Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Fri, 28 Sep 2018 23:21:27 -0400
Subject: [PATCH 12/14] try falling back to standard black check

---
 noxfile.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/noxfile.py b/noxfile.py
index 7a76559e..104a34c4 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -77,13 +77,7 @@ def test_latest_deps(session, python=latest_python):
 @nox.session
 def lint(session, python=latest_python):
     session.install("black")
-    session.run(
-        "black",
-        "--check",
-        "--exclude",
-        "(\.git|\.hg|\.mypy_cache|\.tox|\.nox|\.venv|_build|buck-out|build|dist)",
-        ".",
-    )
+    session.run("black", "--check", ".")
 
 
 @nox.session

From 418084abf52cbd59b8fa0e85d8eb4b9a938e404b Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Sat, 29 Sep 2018 00:30:41 -0400
Subject: [PATCH 13/14] exclude nox

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 90440f59..318a0442 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,4 +4,5 @@ exclude = '''
 versioneer.py
 | _version.py
 | docs
+| .nox
 '''
\ No newline at end of file

From d1ca352eca9135f686b9925892e48b2ab4b9fa41 Mon Sep 17 00:00:00 2001
From: Maximilian Roos
Date: Wed, 10 Oct 2018 13:00:50 -0400
Subject: [PATCH 14/14] changelog

---
 docs/source/changelog.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 28cecbca..68dc8d60 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -6,6 +6,10 @@ Changelog
 0.7.0 / [unreleased]
 --------------------
 
+- `int` columns which contain `NULL` are now cast to `float`, rather than
+  `object` type. (:issue:`174`)
+- `DATE`, `DATETIME` and `TIMESTAMP` columns are now parsed as pandas'
+  `timestamp` objects (:issue:`224`)
 - Add :class:`pandas_gbq.Context` to cache credentials in-memory, across
   calls to ``read_gbq`` and ``to_gbq``. (:issue:`198`, :issue:`208`)
 - Fast queries now do not log above ``DEBUG`` level. (:issue:`204`)
@@ -20,6 +24,8 @@ Internal changes
 ~~~~~~~~~~~~~~~~
 
 - Avoid listing datasets and tables in system tests. (:issue:`215`)
+- Improved performance from eliminating some duplicative parsing steps
+  (:issue:`224`)
 
 .. _changelog-0.6.1:
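To close, the user-visible upshot the changelog describes, sketched as a hypothetical call (placeholder project id; the dtypes follow the system tests above, this is not a verbatim transcript):

    import pandas_gbq

    df = pandas_gbq.read_gbq(
        "SELECT current_date() AS day",
        project_id="my-project",  # placeholder
        dialect="standard",
    )
    # day -> datetime64[ns]: DATE is now parsed, not left as object.
    # And per test_should_properly_handle_nullable_integers above, an
    # INTEGER column holding [1, NULL] now surfaces as float64 (NaN for
    # NULL) rather than object.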