Commit ebcbfbe
CLN: Use to_dataframe to download query results. (#247)
* CLN: Use `to_dataframe` to download query results. This allows us to remove logic for parsing the schema and align with google-cloud-bigquery. * Bumps the minimum google-cloud-bigquery version, because we need to use the new dtypes argument. * Cast to correct dtype in empty dataframes. * Improve the conda CI build to truly use dependencies from conda, not pip. Adds pydata-google-auth to conda deps.
1 parent f729a44 commit ebcbfbe

12 files changed: +178 −102 lines
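The heart of the change: instead of materializing every row and rebuilding a DataFrame by hand, pandas-gbq now lets the BigQuery client library do the conversion. A minimal sketch of the new download path, assuming application-default credentials and a hypothetical `my-project` project id:

```python
from google.cloud import bigquery

# Hypothetical project id; substitute your own.
client = bigquery.Client(project="my-project")
rows_iter = client.query("SELECT 17 AS answer").result()

# google-cloud-bigquery >= 1.9.0 accepts a dtypes mapping, which is what
# lets pandas-gbq pin null-safe dtypes up front rather than re-casting
# columns after the fact.
df = rows_iter.to_dataframe(dtypes={"answer": "float64"})
```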

benchmark/README.md (+16)

@@ -0,0 +1,16 @@
+# pandas-gbq benchmarks
+
+This directory contains a few scripts which are useful for performance
+testing the pandas-gbq library. Use cProfile to time the script and see
+details about where time is spent. To avoid timing how long BigQuery takes to
+execute a query, run the benchmark twice to ensure the results are cached.
+
+## `read_gbq`
+
+Read a small table (a few KB).
+
+    python -m cProfile --sort=cumtime read_gbq_small_results.py
+
+Read a large-ish table (100+ MB).
+
+    python -m cProfile --sort=cumtime read_gbq_large_results.py
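The profiler can also be driven from Python rather than the shell; a sketch under the same assumptions (credentials already configured, results cached by a prior run):

```python
import cProfile
import pstats

import pandas_gbq

profiler = cProfile.Profile()
profiler.enable()
df = pandas_gbq.read_gbq("SELECT 1 AS x", dialect="standard")
profiler.disable()

# Show the ten most expensive calls, sorted by cumulative time.
pstats.Stats(profiler).sort_stats("cumtime").print_stats(10)
```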

benchmark/read_gbq_large_results.py (+8)

@@ -0,0 +1,8 @@
+import pandas_gbq
+
+# Select 163 MB worth of data, to time how long it takes to download large
+# result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
+    dialect="standard",
+)

benchmark/read_gbq_small_results.py (+7)

@@ -0,0 +1,7 @@
+import pandas_gbq
+
+# Select a few KB worth of data, to time downloading small result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
+    dialect="standard",
+)

ci/requirements-2.7.pip (+1 −1)

@@ -2,5 +2,5 @@ mock
 pandas==0.17.1
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==0.32.0
+google-cloud-bigquery==1.9.0
 pydata-google-auth==0.1.2

ci/requirements-3.5.pip (+1 −1)

@@ -1,5 +1,5 @@
 pandas==0.19.0
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==0.32.0
+google-cloud-bigquery==1.9.0
 pydata-google-auth==0.1.2

ci/requirements-3.6-0.20.1.conda (+2 −3)

@@ -1,6 +1,5 @@
-google-auth
-google-auth-oauthlib
-google-cloud-bigquery==0.32.0
+pydata-google-auth
+google-cloud-bigquery==1.9.0
 pytest
 pytest-cov
 codecov

ci/run_conda.sh (+1 −1)

@@ -21,7 +21,7 @@ fi
 
 REQ="ci/requirements-${PYTHON}-${PANDAS}"
 conda install -q --file "$REQ.conda";
-python setup.py develop
+python setup.py develop --no-deps
 
 # Run the tests
 $DIR/run_tests.sh

docs/source/changelog.rst (+18)

@@ -1,6 +1,24 @@
 Changelog
 =========
 
+.. _changelog-0.10.0:
+
+0.10.0 / TBD
+------------
+
+Dependency updates
+~~~~~~~~~~~~~~~~~~
+
+- Update the minimum version of ``google-cloud-bigquery`` to 1.9.0.
+  (:issue:`247`)
+
+Internal changes
+~~~~~~~~~~~~~~~~
+
+- Use ``to_dataframe()`` from ``google-cloud-bigquery`` in the ``read_gbq()``
+  function. (:issue:`247`)
+
 .. _changelog-0.9.0:
 
 0.9.0 / 2019-01-11

pandas_gbq/gbq.py (+49 −33)

@@ -1,11 +1,9 @@
 import logging
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
-from pandas import DataFrame
 
 from pandas_gbq.exceptions import AccessDenied
 
@@ -37,7 +35,7 @@ def _check_google_client_version():
         raise ImportError("Could not import pkg_resources (setuptools).")
 
     # https://github.com/GoogleCloudPlatform/google-cloud-python/blob/master/bigquery/CHANGELOG.md
-    bigquery_minimum_version = pkg_resources.parse_version("0.32.0")
+    bigquery_minimum_version = pkg_resources.parse_version("1.9.0")
     BIGQUERY_INSTALLED_VERSION = pkg_resources.get_distribution(
         "google-cloud-bigquery"
     ).parsed_version
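The guard above can be reproduced interactively. A sketch of the same `pkg_resources` comparison (the real error message raised by pandas-gbq differs):

```python
import pkg_resources

minimum = pkg_resources.parse_version("1.9.0")
installed = pkg_resources.get_distribution(
    "google-cloud-bigquery"
).parsed_version

# Parsed versions compare semantically, so "1.10.0" > "1.9.0".
if installed < minimum:
    raise ImportError("google-cloud-bigquery >= 1.9.0 is required")
```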
@@ -482,15 +480,16 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        result_rows = list(rows_iter)
-        total_rows = rows_iter.total_rows
-        schema = {
-            "fields": [field.to_api_repr() for field in rows_iter.schema]
-        }
 
-        logger.debug("Got {} rows.\n".format(total_rows))
+        schema_fields = [field.to_api_repr() for field in rows_iter.schema]
+        nullsafe_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
+        df = rows_iter.to_dataframe(dtypes=nullsafe_dtypes)
+
+        if df.empty:
+            df = _cast_empty_df_dtypes(schema_fields, df)
 
-        return schema, result_rows
+        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
+        return df
 
     def load_data(
         self,
@@ -638,45 +637,62 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
         table.create(table_id, table_schema)
 
 
-def _parse_schema(schema_fields):
-    # see:
+def _bqschema_to_nullsafe_dtypes(schema_fields):
+    # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
+    # default dtype choice.
+    #
+    # See:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
     dtype_map = {
         "FLOAT": np.dtype(float),
+        # Even though TIMESTAMPs are timezone-aware in BigQuery, pandas doesn't
+        # support datetime64[ns, UTC] as dtype in DataFrame constructors. See:
+        # https://github.com/pandas-dev/pandas/issues/12513
         "TIMESTAMP": "datetime64[ns]",
         "TIME": "datetime64[ns]",
         "DATE": "datetime64[ns]",
         "DATETIME": "datetime64[ns]",
-        "BOOLEAN": bool,
-        "INTEGER": np.int64,
     }
 
+    dtypes = {}
     for field in schema_fields:
         name = str(field["name"])
         if field["mode"].upper() == "REPEATED":
-            yield name, object
-        else:
-            dtype = dtype_map.get(field["type"].upper())
-            yield name, dtype
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            dtypes[name] = dtype
 
+    return dtypes
 
-def _parse_data(schema, rows):
 
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
+def _cast_empty_df_dtypes(schema_fields, df):
+    """Cast any columns in an empty dataframe to correct type.
 
-    for column in df:
-        dtype = column_dtypes[column]
-        null_safe = (
-            df[column].notnull().all()
-            or dtype == float
-            or dtype == "datetime64[ns]"
+    In an empty dataframe, pandas cannot choose a dtype unless one is
+    explicitly provided. The _bqschema_to_nullsafe_dtypes() function only
+    provides dtypes when the dtype safely handles null values. This means
+    that empty int64 and boolean columns are incorrectly classified as
+    ``object``.
+    """
+    if not df.empty:
+        raise ValueError(
+            "DataFrame must be empty in order to cast non-nullsafe dtypes"
         )
-        if dtype and null_safe:
-            df[column] = df[column].astype(
-                column_dtypes[column], errors="ignore"
-            )
+
+    dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64}
+
+    for field in schema_fields:
+        column = str(field["name"])
+        if field["mode"].upper() == "REPEATED":
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            df[column] = df[column].astype(dtype)
+
     return df
 
@@ -825,8 +841,8 @@ def read_gbq(
         credentials=credentials,
         private_key=private_key,
     )
-    schema, rows = connector.run_query(query, configuration=configuration)
-    final_df = _parse_data(schema, rows)
+
+    final_df = connector.run_query(query, configuration=configuration)
 
     # Reindex the DataFrame on the provided column
     if index_col is not None:
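A usage sketch of the two new helpers (both are private, so the import below is for illustration only; the schema is invented):

```python
import pandas

# Private helpers from the diff above, imported purely to illustrate.
from pandas_gbq.gbq import _bqschema_to_nullsafe_dtypes, _cast_empty_df_dtypes

schema_fields = [
    {"name": "name", "type": "STRING", "mode": "NULLABLE"},
    {"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
    {"name": "ts", "type": "TIMESTAMP", "mode": "NULLABLE"},
]

# Only null-safe columns get a dtype: INTEGER is skipped because int64
# cannot hold NULL, and STRING already defaults to object.
print(_bqschema_to_nullsafe_dtypes(schema_fields))
# {'ts': 'datetime64[ns]'}

# On an empty result, the skipped columns are cast afterwards so they do
# not come back as object.
empty = pandas.DataFrame({f["name"]: [] for f in schema_fields})
print(_cast_empty_df_dtypes(schema_fields, empty).dtypes["number"])  # int64
```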

setup.py (+1 −1)

@@ -22,7 +22,7 @@ def readme():
     "pydata-google-auth",
     "google-auth",
     "google-auth-oauthlib",
-    "google-cloud-bigquery>=0.32.0",
+    "google-cloud-bigquery>=1.9.0",
 ]
 
 extras = {"tqdm": "tqdm>=4.23.0"}

tests/system/test_gbq.py (+58 −28)

@@ -6,11 +6,12 @@
 
 import google.oauth2.service_account
 import numpy as np
+import pandas
 import pandas.util.testing as tm
-import pytest
-import pytz
 from pandas import DataFrame, NaT, compat
 from pandas.compat import range, u
+import pytest
+import pytz
 
 from pandas_gbq import gbq
 
@@ -138,14 +139,6 @@ def test_should_be_able_to_get_a_bigquery_client(self, gbq_connector):
         bigquery_client = gbq_connector.get_client()
         assert bigquery_client is not None
 
-    def test_should_be_able_to_get_schema_from_query(self, gbq_connector):
-        schema, pages = gbq_connector.run_query("SELECT 1")
-        assert schema is not None
-
-    def test_should_be_able_to_get_results_from_query(self, gbq_connector):
-        schema, pages = gbq_connector.run_query("SELECT 1")
-        assert pages is not None
-
 
 def test_should_read(project, credentials):
     query = 'SELECT "PI" AS valid_string'
@@ -319,7 +312,8 @@ def test_should_properly_handle_timestamp_unix_epoch(self, project_id):
         tm.assert_frame_equal(
             df,
             DataFrame(
-                {"unix_epoch": [np.datetime64("1970-01-01T00:00:00.000000Z")]}
+                {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
+                dtype="datetime64[ns]",
             ),
         )
 
@@ -334,19 +328,46 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
         tm.assert_frame_equal(
             df,
             DataFrame(
-                {
-                    "valid_timestamp": [
-                        np.datetime64("2004-09-15T05:00:00.000000Z")
-                    ]
-                }
+                {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
+                dtype="datetime64[ns]",
+            ),
+        )
+
+    def test_should_properly_handle_datetime_unix_epoch(self, project_id):
+        query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch'
+        df = gbq.read_gbq(
+            query,
+            project_id=project_id,
+            credentials=self.credentials,
+            dialect="legacy",
+        )
+        tm.assert_frame_equal(
+            df,
+            DataFrame(
+                {"unix_epoch": ["1970-01-01T00:00:00"]}, dtype="datetime64[ns]"
+            ),
+        )
+
+    def test_should_properly_handle_arbitrary_datetime(self, project_id):
+        query = 'SELECT DATETIME("2004-09-15 05:00:00") AS valid_timestamp'
+        df = gbq.read_gbq(
+            query,
+            project_id=project_id,
+            credentials=self.credentials,
+            dialect="legacy",
+        )
+        tm.assert_frame_equal(
+            df,
+            DataFrame(
+                {"valid_timestamp": [np.datetime64("2004-09-15T05:00:00")]}
             ),
         )
 
     @pytest.mark.parametrize(
         "expression, type_",
         [
             ("current_date()", "<M8[ns]"),
-            ("current_timestamp()", "<M8[ns]"),
+            ("current_timestamp()", "datetime64[ns]"),
             ("current_datetime()", "<M8[ns]"),
             ("TRUE", bool),
             ("FALSE", bool),
@@ -378,7 +399,19 @@ def test_should_properly_handle_null_timestamp(self, project_id):
             credentials=self.credentials,
             dialect="legacy",
         )
-        tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))
+        tm.assert_frame_equal(
+            df, DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
+        )
+
+    def test_should_properly_handle_null_datetime(self, project_id):
+        query = "SELECT CAST(NULL AS DATETIME) AS null_datetime"
+        df = gbq.read_gbq(
+            query,
+            project_id=project_id,
+            credentials=self.credentials,
+            dialect="standard",
+        )
+        tm.assert_frame_equal(df, DataFrame({"null_datetime": [NaT]}))
 
     def test_should_properly_handle_null_boolean(self, project_id):
         query = "SELECT BOOLEAN(NULL) AS null_boolean"
@@ -549,17 +582,14 @@ def test_zero_rows(self, project_id):
             credentials=self.credentials,
             dialect="legacy",
         )
-        page_array = np.zeros(
-            (0,),
-            dtype=[
-                ("title", object),
-                ("id", np.dtype(int)),
-                ("is_bot", np.dtype(bool)),
-                ("ts", "M8[ns]"),
-            ],
-        )
+        empty_columns = {
+            "title": pandas.Series([], dtype=object),
+            "id": pandas.Series([], dtype=np.dtype(int)),
+            "is_bot": pandas.Series([], dtype=np.dtype(bool)),
+            "ts": pandas.Series([], dtype="datetime64[ns]"),
+        }
         expected_result = DataFrame(
-            page_array, columns=["title", "id", "is_bot", "ts"]
+            empty_columns, columns=["title", "id", "is_bot", "ts"]
         )
         tm.assert_frame_equal(df, expected_result, check_index_type=False)
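The rewritten expectation builds the empty frame from explicitly typed Series, since with zero rows pandas has nothing to infer dtypes from. A minimal standalone version:

```python
import numpy as np
import pandas

# With no rows, each column's dtype must be pinned explicitly.
empty = pandas.DataFrame(
    {
        "id": pandas.Series([], dtype=np.dtype(int)),
        "is_bot": pandas.Series([], dtype=np.dtype(bool)),
        "ts": pandas.Series([], dtype="datetime64[ns]"),
    }
)
print(empty.dtypes)  # id: int64, is_bot: bool, ts: datetime64[ns]
```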
