Skip to content

Commit ac3ce3f

Browse files
authored
feat: use faster query_and_wait method from google-cloud-bigquery when available (#722)
* feat: use faster query_and_wait method from google-cloud-bigquery when available
  - fix unit tests
  - fix python 3.7 (×4)
  - fix wait_timeout units
  - boost test coverage
  - remove dead code
  - boost a little more coverage
* restore missing test
1 parent e1c384e commit ac3ce3f

File tree

7 files changed

+411
-62
lines changed

7 files changed

+411
-62
lines changed

pandas_gbq/features.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44

55
"""Module for checking dependency versions and supported features."""
66

7-
# https://github.com/googleapis/python-bigquery/blob/master/CHANGELOG.md
7+
# https://github.com/googleapis/python-bigquery/blob/main/CHANGELOG.md
88
BIGQUERY_MINIMUM_VERSION = "3.3.5"
9+
BIGQUERY_QUERY_AND_WAIT_VERSION = "3.14.0"
910
PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
1011
PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
1112
PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0"
@@ -45,6 +46,13 @@ def bigquery_try_import(self):
4546

4647
return google.cloud.bigquery
4748

49+
@property
def bigquery_has_query_and_wait(self):
    """Whether the installed google-cloud-bigquery meets the query_and_wait version.

    Returns:
        bool: True when ``self.bigquery_installed_version`` is greater than
        or equal to ``BIGQUERY_QUERY_AND_WAIT_VERSION``.
    """
    # Imported lazily, matching the other feature checks in this module.
    import packaging.version

    required = packaging.version.parse(BIGQUERY_QUERY_AND_WAIT_VERSION)
    installed = self.bigquery_installed_version
    return installed >= required
55+
4856
@property
4957
def pandas_installed_version(self):
5058
import pandas

pandas_gbq/gbq.py

+31-13
Original file line numberDiff line numberDiff line change
@@ -351,12 +351,17 @@ def process_http_error(ex):
351351
# See `BigQuery Troubleshooting Errors
352352
# <https://cloud.google.com/bigquery/troubleshooting-errors>`__
353353

354-
if "cancelled" in ex.message:
354+
message = (
355+
ex.message.casefold()
356+
if hasattr(ex, "message") and ex.message is not None
357+
else ""
358+
)
359+
if "cancelled" in message:
355360
raise QueryTimeout("Reason: {0}".format(ex))
356-
elif "Provided Schema does not match" in ex.message:
361+
elif "schema does not match" in message:
357362
error_message = ex.errors[0]["message"]
358363
raise InvalidSchema(f"Reason: {error_message}")
359-
elif "Already Exists: Table" in ex.message:
364+
elif "already exists: table" in message:
360365
error_message = ex.errors[0]["message"]
361366
raise TableCreationError(f"Reason: {error_message}")
362367
else:
@@ -410,16 +415,29 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
410415

411416
self._start_timer()
412417
job_config = bigquery.QueryJobConfig.from_api_repr(job_config_dict)
413-
rows_iter = pandas_gbq.query.query_and_wait(
414-
self,
415-
self.client,
416-
query,
417-
location=self.location,
418-
project_id=self.project_id,
419-
job_config=job_config,
420-
max_results=max_results,
421-
timeout_ms=timeout_ms,
422-
)
418+
419+
if FEATURES.bigquery_has_query_and_wait:
420+
rows_iter = pandas_gbq.query.query_and_wait_via_client_library(
421+
self,
422+
self.client,
423+
query,
424+
location=self.location,
425+
project_id=self.project_id,
426+
job_config=job_config,
427+
max_results=max_results,
428+
timeout_ms=timeout_ms,
429+
)
430+
else:
431+
rows_iter = pandas_gbq.query.query_and_wait(
432+
self,
433+
self.client,
434+
query,
435+
location=self.location,
436+
project_id=self.project_id,
437+
job_config=job_config,
438+
max_results=max_results,
439+
timeout_ms=timeout_ms,
440+
)
423441

424442
dtypes = kwargs.get("dtypes")
425443
return self._download_results(

pandas_gbq/query.py

+56-19
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
from __future__ import annotations
66

77
import concurrent.futures
8+
import functools
89
import logging
910
from typing import Optional
1011

12+
import google.auth.exceptions
1113
from google.cloud import bigquery
1214

1315
import pandas_gbq.exceptions
@@ -78,6 +80,26 @@ def _wait_for_query_job(
7880
connector.process_http_error(ex)
7981

8082

83+
def try_query(connector, query_fn):
    """Run ``query_fn`` and translate its failures into pandas-gbq errors.

    Args:
        connector: Object exposing ``private_key``, ``http_error`` and
            ``process_http_error``, all of which are used below.
        query_fn: Zero-argument callable that issues the query.

    Returns:
        Whatever ``query_fn`` returns.

    Raises:
        pandas_gbq.exceptions.QueryTimeout: If ``query_fn`` raises
            ``concurrent.futures.TimeoutError``.
        pandas_gbq.exceptions.AccessDenied: If ``query_fn`` raises
            ``google.auth.exceptions.RefreshError`` or ``ValueError``.
    """
    try:
        logger.debug("Requesting query... ")
        return query_fn()
    except concurrent.futures.TimeoutError as ex:
        # Chain explicitly so the original error is kept as __cause__.
        raise pandas_gbq.exceptions.QueryTimeout(f"Reason: {ex}") from ex
    except (google.auth.exceptions.RefreshError, ValueError) as ex:
        if connector.private_key:
            raise pandas_gbq.exceptions.AccessDenied(
                f"The service account credentials are not valid: {ex}"
            ) from ex
        raise pandas_gbq.exceptions.AccessDenied(
            "The credentials have been revoked or expired, "
            f"please re-run the application to re-authorize: {ex}"
        ) from ex
    except connector.http_error as ex:
        # NOTE(review): process_http_error presumably always raises; if it
        # ever returns, this function falls through and returns None.
        connector.process_http_error(ex)
101+
102+
81103
def query_and_wait(
82104
connector,
83105
client: bigquery.Client,
@@ -122,29 +144,17 @@ def query_and_wait(
122144
Result iterator from which we can download the results in the
123145
desired format (pandas.DataFrame).
124146
"""
125-
from google.auth.exceptions import RefreshError
126-
127-
try:
128-
logger.debug("Requesting query... ")
129-
query_reply = client.query(
147+
query_reply = try_query(
148+
connector,
149+
functools.partial(
150+
client.query,
130151
query,
131152
job_config=job_config,
132153
location=location,
133154
project=project_id,
134-
)
135-
logger.debug("Query running...")
136-
except (RefreshError, ValueError) as ex:
137-
if connector.private_key:
138-
raise pandas_gbq.exceptions.AccessDenied(
139-
f"The service account credentials are not valid: {ex}"
140-
)
141-
else:
142-
raise pandas_gbq.exceptions.AccessDenied(
143-
"The credentials have been revoked or expired, "
144-
f"please re-run the application to re-authorize: {ex}"
145-
)
146-
except connector.http_error as ex:
147-
connector.process_http_error(ex)
155+
),
156+
)
157+
logger.debug("Query running...")
148158

149159
job_id = query_reply.job_id
150160
logger.debug("Job ID: %s" % job_id)
@@ -173,3 +183,30 @@ def query_and_wait(
173183
return query_reply.result(max_results=max_results)
174184
except connector.http_error as ex:
175185
connector.process_http_error(ex)
186+
187+
188+
def query_and_wait_via_client_library(
    connector,
    client: bigquery.Client,
    query: str,
    *,
    job_config: bigquery.QueryJobConfig,
    location: Optional[str],
    project_id: Optional[str],
    max_results: Optional[int],
    timeout_ms: Optional[int],
):
    """Run ``query`` through ``client.query_and_wait`` and return its result.

    The call is wrapped in ``try_query`` with ``connector``, so errors are
    handled the same way as for the other query path in this module.

    Args:
        connector: Passed through to ``try_query``.
        client: BigQuery client whose ``query_and_wait`` method is called.
        query: Query text, forwarded positionally.
        job_config: Forwarded as ``job_config``.
        location: Forwarded as ``location``.
        project_id: Forwarded as ``project``.
        max_results: Forwarded as ``max_results``.
        timeout_ms: Divided by 1000 and forwarded as ``wait_timeout``; a
            falsy value (``None`` or ``0``) is forwarded as ``None``.

    Returns:
        The value returned by ``try_query``.
    """
    if timeout_ms:
        wait_timeout = timeout_ms / 1000.0
    else:
        wait_timeout = None

    run_query = functools.partial(
        client.query_and_wait,
        query,
        job_config=job_config,
        location=location,
        project=project_id,
        max_results=max_results,
        wait_timeout=wait_timeout,
    )
    result_rows = try_query(connector, run_query)
    logger.debug("Query done.\n")
    return result_rows

tests/unit/test_context.py

+41-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import google.cloud.bigquery
1010
import google.cloud.bigquery.table
11+
import packaging.version
1112
import pytest
1213

1314

@@ -55,8 +56,15 @@ def test_read_gbq_should_save_credentials(mock_get_credentials):
5556
mock_get_credentials.assert_not_called()
5657

5758

58-
def test_read_gbq_should_use_dialect(mock_bigquery_client):
59+
def test_read_gbq_should_use_dialect_with_query(monkeypatch, mock_bigquery_client):
5960
import pandas_gbq
61+
import pandas_gbq.features
62+
63+
monkeypatch.setattr(
64+
pandas_gbq.features.FEATURES,
65+
"_bigquery_installed_version",
66+
packaging.version.parse(pandas_gbq.features.BIGQUERY_MINIMUM_VERSION),
67+
)
6068

6169
assert pandas_gbq.context.dialect is None
6270
pandas_gbq.context.dialect = "legacy"
@@ -71,3 +79,35 @@ def test_read_gbq_should_use_dialect(mock_bigquery_client):
7179
_, kwargs = mock_bigquery_client.query.call_args
7280
assert not kwargs["job_config"].use_legacy_sql
7381
pandas_gbq.context.dialect = None # Reset the global state.
82+
83+
84+
def test_read_gbq_should_use_dialect_with_query_and_wait(
    monkeypatch, mock_bigquery_client
):
    """read_gbq should apply ``context.dialect`` on the query_and_wait path.

    The installed-version attribute on FEATURES is patched to
    BIGQUERY_QUERY_AND_WAIT_VERSION so that the query_and_wait code path is
    selected. The test is skipped when the mocked client has no
    ``query_and_wait`` attribute.
    """
    if not hasattr(mock_bigquery_client, "query_and_wait"):
        pytest.skip(
            f"google-cloud-bigquery {google.cloud.bigquery.__version__} does not have query_and_wait"
        )

    import pandas_gbq
    import pandas_gbq.features

    monkeypatch.setattr(
        pandas_gbq.features.FEATURES,
        "_bigquery_installed_version",
        packaging.version.parse(pandas_gbq.features.BIGQUERY_QUERY_AND_WAIT_VERSION),
    )

    assert pandas_gbq.context.dialect is None
    try:
        pandas_gbq.context.dialect = "legacy"
        pandas_gbq.read_gbq("SELECT 1")

        _, kwargs = mock_bigquery_client.query_and_wait.call_args
        assert kwargs["job_config"].use_legacy_sql

        pandas_gbq.context.dialect = "standard"
        pandas_gbq.read_gbq("SELECT 1")

        _, kwargs = mock_bigquery_client.query_and_wait.call_args
        assert not kwargs["job_config"].use_legacy_sql
    finally:
        # Reset the global state even when an assertion above fails, so a
        # failure here cannot leak a dialect into other tests.
        pandas_gbq.context.dialect = None

tests/unit/test_features.py

+17
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,23 @@ def fresh_bigquery_version(monkeypatch):
1313
monkeypatch.setattr(FEATURES, "_pandas_installed_version", None)
1414

1515

16+
@pytest.mark.parametrize(
    ["bigquery_version", "expected"],
    [
        ("1.99.100", False),
        ("2.99.999", False),
        ("3.13.11", False),
        ("3.14.0", True),
        ("4.999.999", True),
    ],
)
def test_bigquery_has_query_and_wait(monkeypatch, bigquery_version, expected):
    """The feature flag should flip to True at version 3.14.0 and above."""
    from google.cloud import bigquery as bigquery_module

    # Pretend the given version of google-cloud-bigquery is installed.
    monkeypatch.setattr(bigquery_module, "__version__", bigquery_version)

    has_query_and_wait = FEATURES.bigquery_has_query_and_wait
    assert has_query_and_wait == expected
31+
32+
1633
@pytest.mark.parametrize(
1734
["pandas_version", "expected"],
1835
[

0 commit comments

Comments
 (0)