Commit 015a73e

fix: add minimum timeout to getQueryResults API requests (#444)
* fix: add minimum timeout to getQueryResults API requests

  Since successful responses can still take a long time to download, have a
  minimum timeout which should accommodate 99.9%+ of responses. I figure it's
  more important that *any* timeout is set if desired than it is that the
  specific timeout is used. This is especially true in cases where a short
  timeout is requested for the purposes of a progress bar. Making forward
  progress is more important than the progress bar update frequency.

* docs: document minimum timeout value

* test: remove redundant query timeout test

* test: change assertion for done method

* chore: remove unused import
1 parent 0023d19 commit 015a73e
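
The core of the change is to floor any caller-supplied connection timeout before the getQueryResults request is issued. A minimal sketch of that pattern, reusing the constant introduced in google/cloud/bigquery/client.py below (the helper name and the example values are illustrative, not part of the commit):

# Floor for the HTTP transport timeout on getQueryResults requests; the value
# mirrors the constant added in google/cloud/bigquery/client.py below.
_MIN_GET_QUERY_RESULTS_TIMEOUT = 120


def clamp_timeout(timeout):
    # Illustrative helper, not part of the library: ``None`` still means
    # "no timeout", so callers that never asked for a deadline are untouched.
    if timeout is None:
        return None
    return max(timeout, _MIN_GET_QUERY_RESULTS_TIMEOUT)


assert clamp_timeout(None) is None  # no timeout requested, none imposed
assert clamp_timeout(1) == 120      # too-short timeouts are raised to the floor
assert clamp_timeout(300) == 300    # longer timeouts pass through unchanged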

File tree

  google/cloud/bigquery/client.py
  tests/system.py
  tests/unit/job/test_query.py
  tests/unit/test_client.py

4 files changed: +64 -26 lines changed

google/cloud/bigquery/client.py

+20-2
@@ -93,6 +93,14 @@
 )
 _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS = "jobReference,totalRows,pageToken,rows"
 
+# In microbenchmarks, it's been shown that even in ideal conditions (query
+# finished, local data), requests to getQueryResults can take 10+ seconds.
+# In less-than-ideal situations, the response can take even longer, as it must
+# be able to download a full 100+ MB row in that time. Don't let the
+# connection timeout before data can be downloaded.
+# https://github.com/googleapis/python-bigquery/issues/438
+_MIN_GET_QUERY_RESULTS_TIMEOUT = 120
+
 
 class Project(object):
     """Wrapper for resource describing a BigQuery project.

@@ -1570,7 +1578,9 @@ def _get_query_results(
             location (Optional[str]): Location of the query job.
             timeout (Optional[float]):
                 The number of seconds to wait for the underlying HTTP transport
-                before using ``retry``.
+                before using ``retry``. If set, this connection timeout may be
+                increased to a minimum value. This prevents retries on what
+                would otherwise be a successful response.
 
         Returns:
             google.cloud.bigquery.query._QueryResults:

@@ -1579,6 +1589,9 @@ def _get_query_results(
 
         extra_params = {"maxResults": 0}
 
+        if timeout is not None:
+            timeout = max(timeout, _MIN_GET_QUERY_RESULTS_TIMEOUT)
+
         if project is None:
             project = self.project
 

@@ -3293,7 +3306,9 @@ def _list_rows_from_query_results(
                 How to retry the RPC.
             timeout (Optional[float]):
                 The number of seconds to wait for the underlying HTTP transport
-                before using ``retry``.
+                before using ``retry``. If set, this connection timeout may be
+                increased to a minimum value. This prevents retries on what
+                would otherwise be a successful response.
                 If multiple requests are made under the hood, ``timeout``
                 applies to each individual request.
         Returns:

@@ -3306,6 +3321,9 @@ def _list_rows_from_query_results(
             "location": location,
         }
 
+        if timeout is not None:
+            timeout = max(timeout, _MIN_GET_QUERY_RESULTS_TIMEOUT)
+
         if start_index is not None:
             params["startIndex"] = start_index
 
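
To make the effect concrete: only the local HTTP transport deadline is floored; the ``timeoutMs`` query parameter, which tells the API server how long to wait for the job to finish, is passed through exactly as the caller specified it. A small sketch of that separation with assumed example values (a 500 ms server-side wait and a 1 second requested transport timeout, the same combination the unit tests below assert):

_MIN_GET_QUERY_RESULTS_TIMEOUT = 120  # constant from the diff above

requested_timeout = 1                               # caller wants fast turnaround
extra_params = {"maxResults": 0, "timeoutMs": 500}  # server-side wait, untouched

# Only the transport deadline is clamped; the request parameters are unchanged.
transport_timeout = max(requested_timeout, _MIN_GET_QUERY_RESULTS_TIMEOUT)

assert extra_params["timeoutMs"] == 500  # the API still answers as soon as it can
assert transport_timeout == 120          # but the connection has time to download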

tests/system.py

+11-22
@@ -27,7 +27,6 @@
 import uuid
 import re
 
-import requests
 import psutil
 import pytest
 import pytz

@@ -1798,15 +1797,25 @@ def test_query_w_wrong_config(self):
             Config.CLIENT.query(good_query, job_config=bad_config).result()
 
     def test_query_w_timeout(self):
+        job_config = bigquery.QueryJobConfig()
+        job_config.use_query_cache = False
+
         query_job = Config.CLIENT.query(
             "SELECT * FROM `bigquery-public-data.github_repos.commits`;",
             job_id_prefix="test_query_w_timeout_",
+            location="US",
+            job_config=job_config,
         )
 
         with self.assertRaises(concurrent.futures.TimeoutError):
-            # 1 second is much too short for this query.
             query_job.result(timeout=1)
 
+        # Even though the query takes >1 second, the call to getQueryResults
+        # should succeed.
+        self.assertFalse(query_job.done(timeout=1))
+
+        Config.CLIENT.cancel_job(query_job.job_id, location=query_job.location)
+
     def test_query_w_page_size(self):
         page_size = 45
         query_job = Config.CLIENT.query(

@@ -2408,26 +2417,6 @@ def test_query_iter(self):
         row_tuples = [r.values() for r in query_job]
         self.assertEqual(row_tuples, [(1,)])
 
-    def test_querying_data_w_timeout(self):
-        job_config = bigquery.QueryJobConfig()
-        job_config.use_query_cache = False
-
-        query_job = Config.CLIENT.query(
-            """
-            SELECT COUNT(*)
-            FROM UNNEST(GENERATE_ARRAY(1,1000000)), UNNEST(GENERATE_ARRAY(1, 10000))
-            """,
-            location="US",
-            job_config=job_config,
-        )
-
-        # Specify a very tight deadline to demonstrate that the timeout
-        # actually has effect.
-        with self.assertRaises(requests.exceptions.Timeout):
-            query_job.done(timeout=0.1)
-
-        Config.CLIENT.cancel_job(query_job.job_id, location=query_job.location)
-
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     def test_query_results_to_dataframe(self):
         QUERY = """
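
The reworked system test pins down the behaviour the commit message describes: result(timeout=1) still raises concurrent.futures.TimeoutError while the job is running, but done(timeout=1) no longer fails at the transport level. A sketch of the progress-bar style polling loop this enables (the query text, the one-second interval, and the print call are illustrative; running it needs real credentials):

import concurrent.futures

from google.cloud import bigquery

client = bigquery.Client()
query_job = client.query(
    "SELECT * FROM `bigquery-public-data.github_repos.commits` LIMIT 1000"
)

while True:
    try:
        rows = query_job.result(timeout=1)  # wake up roughly once per second
        break
    except concurrent.futures.TimeoutError:
        print("still running...")  # stand-in for a progress bar update

print("fetched", rows.total_rows, "rows")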

tests/unit/job/test_query.py

+6
@@ -1046,6 +1046,8 @@ def test_result_invokes_begins(self):
         self.assertEqual(reload_request[1]["method"], "GET")
 
     def test_result_w_timeout(self):
+        import google.cloud.bigquery.client
+
         begun_resource = self._make_resource()
         query_resource = {
             "jobComplete": True,

@@ -1072,6 +1074,10 @@ def test_result_w_timeout(self):
             "/projects/{}/queries/{}".format(self.PROJECT, self.JOB_ID),
         )
         self.assertEqual(query_request[1]["query_params"]["timeoutMs"], 900)
+        self.assertEqual(
+            query_request[1]["timeout"],
+            google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT,
+        )
         self.assertEqual(reload_request[1]["method"], "GET")
 
     def test_result_w_page_size(self):

tests/unit/test_client.py

+27-2
@@ -311,7 +311,7 @@ def test__get_query_results_miss_w_explicit_project_and_timeout(self):
                 project="other-project",
                 location=self.LOCATION,
                 timeout_ms=500,
-                timeout=42,
+                timeout=420,
             )
 
         final_attributes.assert_called_once_with({"path": path}, client, None)

@@ -320,7 +320,32 @@ def test__get_query_results_miss_w_explicit_project_and_timeout(self):
             method="GET",
             path=path,
             query_params={"maxResults": 0, "timeoutMs": 500, "location": self.LOCATION},
-            timeout=42,
+            timeout=420,
+        )
+
+    def test__get_query_results_miss_w_short_timeout(self):
+        import google.cloud.bigquery.client
+        from google.cloud.exceptions import NotFound
+
+        creds = _make_credentials()
+        client = self._make_one(self.PROJECT, creds)
+        conn = client._connection = make_connection()
+        path = "/projects/other-project/queries/nothere"
+        with self.assertRaises(NotFound):
+            client._get_query_results(
+                "nothere",
+                None,
+                project="other-project",
+                location=self.LOCATION,
+                timeout_ms=500,
+                timeout=1,
+            )
+
+        conn.api_request.assert_called_once_with(
+            method="GET",
+            path=path,
+            query_params={"maxResults": 0, "timeoutMs": 500, "location": self.LOCATION},
+            timeout=google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT,
         )
 
     def test__get_query_results_miss_w_client_location(self):
