
Commit e177978
BUG: fix AttributeError with BQ Storage API to download empty results (#310)
* BUG: fix AttributeError with BQ Storage API to download empty results

  Refactors timestamp helpers to their own file to help reduce the size of the gbq module.

* blacken
* fix lint
* fix test_zero_rows
* update release date
1 parent 612f165 commit e177978

File tree

7 files changed: +275 -135 lines changed

docs/source/changelog.rst

+9
@@ -1,6 +1,15 @@
 Changelog
 =========
 
+.. _changelog-0.13.1:
+
+0.13.1 / 2020-02-13
+-------------------
+
+- Fix ``AttributeError`` with BQ Storage API to download empty results.
+  (:issue:`299`)
+
+
 .. _changelog-0.13.0:
 
 0.13.0 / 2019-12-12

pandas_gbq/gbq.py

+7 -23
@@ -8,12 +8,14 @@
 try:
     # The BigQuery Storage API client is an optional dependency. It is only
     # required when use_bqstorage_api=True.
-    from google.cloud import bigquery_storage
+    from google.cloud import bigquery_storage_v1beta1
 except ImportError:  # pragma: NO COVER
-    bigquery_storage = None
+    bigquery_storage_v1beta1 = None
 
 from pandas_gbq.exceptions import AccessDenied
 import pandas_gbq.schema
+import pandas_gbq.timestamp
+
 
 logger = logging.getLogger(__name__)
 

@@ -564,7 +566,7 @@ def _download_results(
             df = _cast_empty_df_dtypes(schema_fields, df)
 
         # Ensure any TIMESTAMP columns are tz-aware.
-        df = _localize_df(schema_fields, df)
+        df = pandas_gbq.timestamp.localize_df(df, schema_fields)
 
         logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
         return df
@@ -784,29 +786,11 @@ def _cast_empty_df_dtypes(schema_fields, df):
     return df
 
 
-def _localize_df(schema_fields, df):
-    """Localize any TIMESTAMP columns to tz-aware type.
-
-    In pandas versions before 0.24.0, DatetimeTZDtype cannot be used as the
-    dtype in Series/DataFrame construction, so localize those columns after
-    the DataFrame is constructed.
-    """
-    for field in schema_fields:
-        column = str(field["name"])
-        if field["mode"].upper() == "REPEATED":
-            continue
-
-        if field["type"].upper() == "TIMESTAMP" and df[column].dt.tz is None:
-            df[column] = df[column].dt.tz_localize("UTC")
-
-    return df
-
-
 def _make_bqstorage_client(use_bqstorage_api, credentials):
     if not use_bqstorage_api:
         return None
 
-    if bigquery_storage is None:
+    if bigquery_storage_v1beta1 is None:
        raise ImportError(
            "Install the google-cloud-bigquery-storage and fastavro/pyarrow "
            "packages to use the BigQuery Storage API."
@@ -818,7 +802,7 @@ def _make_bqstorage_client(use_bqstorage_api, credentials):
     client_info = google.api_core.gapic_v1.client_info.ClientInfo(
         user_agent="pandas-{}".format(pandas.__version__)
     )
-    return bigquery_storage.BigQueryStorageClient(
+    return bigquery_storage_v1beta1.BigQueryStorageClient(
         credentials=credentials, client_info=client_info
     )
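
For context, a minimal standalone sketch of the optional-dependency pattern these hunks pin to the v1beta1 endpoint. The helper name make_storage_client is hypothetical; the import, fallback, and error text mirror the diff above.

    try:
        # Optional extra: only needed when use_bqstorage_api=True.
        from google.cloud import bigquery_storage_v1beta1
    except ImportError:
        bigquery_storage_v1beta1 = None


    def make_storage_client(use_bqstorage_api, credentials):
        # Hypothetical stand-in for gbq._make_bqstorage_client.
        if not use_bqstorage_api:
            return None
        if bigquery_storage_v1beta1 is None:
            raise ImportError(
                "Install the google-cloud-bigquery-storage and fastavro/pyarrow "
                "packages to use the BigQuery Storage API."
            )
        return bigquery_storage_v1beta1.BigQueryStorageClient(
            credentials=credentials
        )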

pandas_gbq/timestamp.py

+40
@@ -0,0 +1,40 @@
+"""Helpers for working with TIMESTAMP data type.
+
+Private module.
+"""
+
+
+def localize_df(df, schema_fields):
+    """Localize any TIMESTAMP columns to tz-aware type.
+
+    In pandas versions before 0.24.0, DatetimeTZDtype cannot be used as the
+    dtype in Series/DataFrame construction, so localize those columns after
+    the DataFrame is constructed.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        DataFrame in which to localize TIMESTAMP columns.
+    schema_fields : sequence of dict
+        BigQuery schema in parsed JSON data format.
+
+
+    Returns
+    -------
+    pandas.DataFrame
+        DataFrame with localized TIMESTAMP columns.
+    """
+    if len(df.index) == 0:
+        # If there are no rows, there is nothing to do.
+        # Fix for https://github.com/pydata/pandas-gbq/issues/299
+        return df
+
+    for field in schema_fields:
+        column = str(field["name"])
+        if "mode" in field and field["mode"].upper() == "REPEATED":
+            continue
+
+        if field["type"].upper() == "TIMESTAMP" and df[column].dt.tz is None:
+            df[column] = df[column].dt.tz_localize("UTC")
+
+    return df
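
The zero-row early return is the substance of the fix. A minimal sketch, assuming only pandas and the new module, of the failure mode it guards against: an empty result column is inferred as object dtype, so touching the `.dt` accessor on it would raise AttributeError.

    import pandas as pd

    import pandas_gbq.timestamp

    # An empty download: pandas infers object dtype for the empty column,
    # so reaching df[column].dt.tz would raise AttributeError
    # ("Can only use .dt accessor with datetimelike values").
    df = pd.DataFrame({"iso_time": []})
    schema_fields = [{"name": "iso_time", "type": "TIMESTAMP", "mode": "NULLABLE"}]

    # With the early return, localizing an empty frame is now a no-op.
    result = pandas_gbq.timestamp.localize_df(df, schema_fields)
    assert result.empty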

tests/system/conftest.py

+78
@@ -0,0 +1,78 @@
1+
import google.oauth2.service_account
2+
import pytest
3+
4+
5+
@pytest.fixture(params=["env"])
6+
def project(request, project_id):
7+
if request.param == "env":
8+
return project_id
9+
elif request.param == "none":
10+
return None
11+
12+
13+
@pytest.fixture()
14+
def credentials(private_key_path):
15+
return google.oauth2.service_account.Credentials.from_service_account_file(
16+
private_key_path
17+
)
18+
19+
20+
@pytest.fixture()
21+
def gbq_connector(project, credentials):
22+
from pandas_gbq import gbq
23+
24+
return gbq.GbqConnector(project, credentials=credentials)
25+
26+
27+
@pytest.fixture()
28+
def random_dataset(bigquery_client, random_dataset_id):
29+
from google.cloud import bigquery
30+
31+
dataset_ref = bigquery_client.dataset(random_dataset_id)
32+
dataset = bigquery.Dataset(dataset_ref)
33+
bigquery_client.create_dataset(dataset)
34+
return dataset
35+
36+
37+
@pytest.fixture()
38+
def tokyo_dataset(bigquery_client, random_dataset_id):
39+
from google.cloud import bigquery
40+
41+
dataset_ref = bigquery_client.dataset(random_dataset_id)
42+
dataset = bigquery.Dataset(dataset_ref)
43+
dataset.location = "asia-northeast1"
44+
bigquery_client.create_dataset(dataset)
45+
return random_dataset_id
46+
47+
48+
@pytest.fixture()
49+
def tokyo_table(bigquery_client, tokyo_dataset):
50+
table_id = "tokyo_table"
51+
# Create a random table using DDL.
52+
# https://github.com/GoogleCloudPlatform/golang-samples/blob/2ab2c6b79a1ea3d71d8f91609b57a8fbde07ae5d/bigquery/snippets/snippet.go#L739
53+
bigquery_client.query(
54+
"""CREATE TABLE {}.{}
55+
AS SELECT
56+
2000 + CAST(18 * RAND() as INT64) as year,
57+
IF(RAND() > 0.5,"foo","bar") as token
58+
FROM UNNEST(GENERATE_ARRAY(0,5,1)) as r
59+
""".format(
60+
tokyo_dataset, table_id
61+
),
62+
location="asia-northeast1",
63+
).result()
64+
return table_id
65+
66+
67+
@pytest.fixture()
68+
def gbq_dataset(project, credentials):
69+
from pandas_gbq import gbq
70+
71+
return gbq._Dataset(project, credentials=credentials)
72+
73+
74+
@pytest.fixture()
75+
def gbq_table(project, credentials, random_dataset_id):
76+
from pandas_gbq import gbq
77+
78+
return gbq._Table(project, random_dataset_id, credentials=credentials)
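
Moving the fixtures into conftest.py means any module under tests/system can request them by argument name. A hedged sketch of a consuming test; this test is illustrative, not part of the commit, and assumes GbqConnector.run_query returns a DataFrame, as it does in this release line.

    # Hypothetical test module under tests/system/; pytest injects the
    # gbq_connector fixture from conftest.py by argument name.
    def test_connector_smoke(gbq_connector):
        df = gbq_connector.run_query("SELECT 1 AS x")
        assert df["x"][0] == 1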

tests/system/test_gbq.py

-112
@@ -1,10 +1,8 @@
 # -*- coding: utf-8 -*-
 
 import sys
-import uuid
 from datetime import datetime
 
-import google.oauth2.service_account
 import numpy as np
 import pandas
 import pandas.api.types
@@ -28,76 +26,6 @@ def test_imports():
     gbq._test_google_api_imports()
 
 
-@pytest.fixture(params=["env"])
-def project(request, project_id):
-    if request.param == "env":
-        return project_id
-    elif request.param == "none":
-        return None
-
-
-@pytest.fixture()
-def credentials(private_key_path):
-    return google.oauth2.service_account.Credentials.from_service_account_file(
-        private_key_path
-    )
-
-
-@pytest.fixture()
-def gbq_connector(project, credentials):
-    return gbq.GbqConnector(project, credentials=credentials)
-
-
-@pytest.fixture()
-def random_dataset(bigquery_client, random_dataset_id):
-    from google.cloud import bigquery
-
-    dataset_ref = bigquery_client.dataset(random_dataset_id)
-    dataset = bigquery.Dataset(dataset_ref)
-    bigquery_client.create_dataset(dataset)
-    return dataset
-
-
-@pytest.fixture()
-def tokyo_dataset(bigquery_client, random_dataset_id):
-    from google.cloud import bigquery
-
-    dataset_ref = bigquery_client.dataset(random_dataset_id)
-    dataset = bigquery.Dataset(dataset_ref)
-    dataset.location = "asia-northeast1"
-    bigquery_client.create_dataset(dataset)
-    return random_dataset_id
-
-
-@pytest.fixture()
-def tokyo_table(bigquery_client, tokyo_dataset):
-    table_id = "tokyo_table"
-    # Create a random table using DDL.
-    # https://github.com/GoogleCloudPlatform/golang-samples/blob/2ab2c6b79a1ea3d71d8f91609b57a8fbde07ae5d/bigquery/snippets/snippet.go#L739
-    bigquery_client.query(
-        """CREATE TABLE {}.{}
-        AS SELECT
-          2000 + CAST(18 * RAND() as INT64) as year,
-          IF(RAND() > 0.5,"foo","bar") as token
-        FROM UNNEST(GENERATE_ARRAY(0,5,1)) as r
-        """.format(
-            tokyo_dataset, table_id
-        ),
-        location="asia-northeast1",
-    ).result()
-    return table_id
-
-
-@pytest.fixture()
-def gbq_dataset(project, credentials):
-    return gbq._Dataset(project, credentials=credentials)
-
-
-@pytest.fixture()
-def gbq_table(project, credentials, random_dataset_id):
-    return gbq._Table(project, random_dataset_id, credentials=credentials)
-
-
 def make_mixed_dataframe_v2(test_size):
     # create df to test for all BQ datatypes except RECORD
     bools = np.random.randint(2, size=(1, test_size)).astype(bool)
@@ -600,9 +528,6 @@ def test_zero_rows(self, project_id):
             empty_columns,
             columns=["name", "number", "is_hurricane", "iso_time"],
         )
-        expected_result["iso_time"] = expected_result[
-            "iso_time"
-        ].dt.tz_localize("UTC")
         tm.assert_frame_equal(df, expected_result, check_index_type=False)
 
     def test_one_row_one_column(self, project_id):
@@ -917,43 +842,6 @@ def test_tokyo(self, tokyo_dataset, tokyo_table, project_id):
         assert df["max_year"][0] >= 2000
 
 
-@pytest.mark.slow(reason="Large query for BQ Storage API tests.")
-def test_read_gbq_w_bqstorage_api(credentials, random_dataset):
-    pytest.importorskip("google.cloud.bigquery_storage")
-    df = gbq.read_gbq(
-        """
-        SELECT
-            total_amount,
-            passenger_count,
-            trip_distance
-        FROM `bigquery-public-data.new_york_taxi_trips.tlc_green_trips_2014`
-        -- Select non-null rows for no-copy conversion from Arrow to pandas.
-        WHERE total_amount IS NOT NULL
-            AND passenger_count IS NOT NULL
-            AND trip_distance IS NOT NULL
-        LIMIT 10000000
-        """,
-        use_bqstorage_api=True,
-        credentials=credentials,
-        configuration={
-            "query": {
-                "destinationTable": {
-                    "projectId": random_dataset.project,
-                    "datasetId": random_dataset.dataset_id,
-                    "tableId": "".join(
-                        [
-                            "test_read_gbq_w_bqstorage_api_",
-                            str(uuid.uuid4()).replace("-", "_"),
-                        ]
-                    ),
-                },
-                "writeDisposition": "WRITE_TRUNCATE",
-            }
-        },
-    )
-    assert len(df) == 10000000
-
-
 class TestToGBQIntegration(object):
     @pytest.fixture(autouse=True, scope="function")
     def setup(self, project, credentials, random_dataset_id):
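
The test_zero_rows hunk is a direct consequence of the early return in localize_df: an empty download now keeps its tz-naive iso_time column, so the expected frame must not be localized to UTC. A minimal sketch of the new expectation; the dtypes are assumptions inferred from the column names in the hunk above.

    import pandas as pd

    # Illustrative reconstruction of the expected empty frame.
    expected_result = pd.DataFrame(
        {
            "name": pd.Series([], dtype=object),
            "number": pd.Series([], dtype=float),
            "is_hurricane": pd.Series([], dtype=bool),
            "iso_time": pd.Series([], dtype="datetime64[ns]"),
        },
        columns=["name", "number", "is_hurricane", "iso_time"],
    )

    # The removed .dt.tz_localize("UTC") step would now make the expectation
    # wrong: with the fix, empty results come back tz-naive.
    assert expected_result["iso_time"].dt.tz is None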
