diff --git a/.gitignore b/.gitignore
index f0dd6fbd..9fb09906 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@
 .pytest_cache
 .testmon*
 .vscode/
+.env

 # Docs #
 ########
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 28cecbca..68dc8d60 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -6,6 +6,10 @@ Changelog
 0.7.0 / [unreleased]
 --------------------

+- ``int`` columns which contain ``NULL`` are now cast to ``float``, rather
+  than ``object`` type. (:issue:`174`)
+- ``DATE``, ``DATETIME`` and ``TIMESTAMP`` columns are now parsed as pandas'
+  ``Timestamp`` objects. (:issue:`224`)
 - Add :class:`pandas_gbq.Context` to cache credentials in-memory, across
   calls to ``read_gbq`` and ``to_gbq``. (:issue:`198`, :issue:`208`)
 - Fast queries now do not log above ``DEBUG`` level. (:issue:`204`)
@@ -20,6 +24,8 @@ Internal changes
 ~~~~~~~~~~~~~~~~

 - Avoid listing datasets and tables in system tests. (:issue:`215`)
+- Improved performance by eliminating some duplicative parsing steps.
+  (:issue:`224`)

 .. _changelog-0.6.1:

diff --git a/noxfile.py b/noxfile.py
index 7a76559e..104a34c4 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -77,13 +77,7 @@ def test_latest_deps(session, python=latest_python):
 @nox.session
 def lint(session, python=latest_python):
     session.install("black")
-    session.run(
-        "black",
-        "--check",
-        "--exclude",
-        "(\.git|\.hg|\.mypy_cache|\.tox|\.nox|\.venv|_build|buck-out|build|dist)",
-        ".",
-    )
+    session.run("black", "--check", ".")


 @nox.session
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 79cd1aba..01cf55c1 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -283,7 +283,7 @@ def __init__(

         # BQ Queries costs $5 per TB. First 1 TB per month is free
         # see here for more: https://cloud.google.com/bigquery/pricing
-        self.query_price_for_TB = 5. / 2 ** 40  # USD/TB
+        self.query_price_for_TB = 5.0 / 2 ** 40  # USD/TB

     def _start_timer(self):
         self.start = time.time()
@@ -577,24 +577,41 @@ def _parse_schema(schema_fields):
     # see:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
-    dtype_map = {"FLOAT": np.dtype(float), "TIMESTAMP": "M8[ns]"}
+    dtype_map = {
+        "FLOAT": np.dtype(float),
+        "TIMESTAMP": "datetime64[ns]",
+        "TIME": "datetime64[ns]",
+        "DATE": "datetime64[ns]",
+        "DATETIME": "datetime64[ns]",
+        "BOOLEAN": bool,
+        "INTEGER": np.int64,
+    }

     for field in schema_fields:
         name = str(field["name"])
         if field["mode"].upper() == "REPEATED":
             yield name, object
         else:
-            dtype = dtype_map.get(field["type"].upper(), object)
+            dtype = dtype_map.get(field["type"].upper())
             yield name, dtype


 def _parse_data(schema, rows):
     column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-
     df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
+
     for column in df:
-        df[column] = df[column].astype(column_dtypes[column])
+        dtype = column_dtypes[column]
+        null_safe = (
+            df[column].notnull().all()
+            or dtype == float
+            or dtype == "datetime64[ns]"
+        )
+        if dtype and null_safe:
+            df[column] = df[column].astype(
+                column_dtypes[column], errors="ignore"
+            )
     return df


@@ -747,19 +764,6 @@ def read_gbq(
                 "Column order does not match this DataFrame."
             )

-    # cast BOOLEAN and INTEGER columns from object to bool/int
-    # if they dont have any nulls AND field mode is not repeated (i.e., array)
-    type_map = {"BOOLEAN": bool, "INTEGER": np.int64}
-    for field in schema["fields"]:
-        if (
-            field["type"].upper() in type_map
-            and final_df[field["name"]].notnull().all()
-            and field["mode"].lower() != "repeated"
-        ):
-            final_df[field["name"]] = final_df[field["name"]].astype(
-                type_map[field["type"].upper()]
-            )
-
     connector.log_elapsed_seconds(
         "Total time taken",
         datetime.now().strftime("s.\nFinished at %Y-%m-%d %H:%M:%S."),
diff --git a/pyproject.toml b/pyproject.toml
index 90440f59..318a0442 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,4 +4,5 @@ exclude = '''
 versioneer.py
 | _version.py
 | docs
+| .nox
 '''
\ No newline at end of file
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index ba85b4c2..96a0eec8 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-

 import sys
-from datetime import datetime
 import uuid
+from datetime import datetime

 import numpy as np
 import pandas.util.testing as tm
@@ -200,9 +200,7 @@ def test_should_properly_handle_nullable_integers(self, project_id):
             private_key=self.credentials,
             dialect="legacy",
         )
-        tm.assert_frame_equal(
-            df, DataFrame({"nullable_integer": [1, None]}).astype(object)
-        )
+        tm.assert_frame_equal(df, DataFrame({"nullable_integer": [1, None]}))

     def test_should_properly_handle_valid_longs(self, project_id):
         query = "SELECT 1 << 62 AS valid_long"
@@ -225,7 +223,7 @@ def test_should_properly_handle_nullable_longs(self, project_id):
             dialect="legacy",
         )
         tm.assert_frame_equal(
-            df, DataFrame({"nullable_long": [1 << 62, None]}).astype(object)
+            df, DataFrame({"nullable_long": [1 << 62, None]})
         )

     def test_should_properly_handle_null_integers(self, project_id):
@@ -338,35 +336,43 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
             ),
         )

-    def test_should_properly_handle_null_timestamp(self, project_id):
-        query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            private_key=self.credentials,
-            dialect="legacy",
-        )
-        tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))
+    @pytest.mark.parametrize(
+        "expression, type_",
+        [
+            ("current_date()", "
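
Note (not part of the patch): the null-safe casting that the new _parse_data
introduces can be illustrated standalone. The sketch below assumes only pandas
and numpy; the schema and rows are hypothetical stand-ins for a BigQuery
response, and DTYPE_MAP simply mirrors the new dtype_map in _parse_schema.

    from collections import OrderedDict

    import numpy as np
    import pandas as pd

    # Same mapping as the new dtype_map in _parse_schema above.
    DTYPE_MAP = {
        "FLOAT": np.dtype(float),
        "TIMESTAMP": "datetime64[ns]",
        "TIME": "datetime64[ns]",
        "DATE": "datetime64[ns]",
        "DATETIME": "datetime64[ns]",
        "BOOLEAN": bool,
        "INTEGER": np.int64,
    }

    # Hypothetical schema and rows, shaped like a BigQuery response.
    schema = [
        {"name": "nullable_int", "type": "INTEGER", "mode": "NULLABLE"},
        {"name": "created", "type": "DATE", "mode": "NULLABLE"},
    ]
    rows = [[1, "2018-08-15"], [None, None]]

    column_dtypes = OrderedDict(
        (field["name"], DTYPE_MAP.get(field["type"])) for field in schema
    )
    df = pd.DataFrame(rows, columns=list(column_dtypes))

    for column in df:
        dtype = column_dtypes[column]
        # An int64 column cannot hold NULLs, so it is cast only when it has
        # no missing values; float and datetime columns represent missing
        # values as NaN/NaT, so they are always safe to cast.
        null_safe = (
            df[column].notnull().all()
            or dtype == float
            or dtype == "datetime64[ns]"
        )
        if dtype and null_safe:
            df[column] = df[column].astype(dtype, errors="ignore")

    # nullable_int ends up float64 (the DataFrame constructor already
    # promoted [1, None] to float, per issue 174); created becomes
    # datetime64[ns] with NaT for the NULL row, per issue 224.
    print(df.dtypes)

This also shows why the old read_gbq post-processing block removed above is
redundant: complete BOOLEAN and INTEGER columns are already cast inside
_parse_data, so the result needs no second pass.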