Commit cb16cfd

chore: remove workarounds for BQ Storage issue with small result sets (#133)
* chore: remove workarounds for BQ Storage issue with small result sets
* Fix two typos in docstrings
1 parent fce76b3 commit cb16cfd

8 files changed: +54 additions, -325 deletions

google/cloud/bigquery/dbapi/connection.py

Lines changed: 6 additions & 18 deletions
@@ -34,16 +34,10 @@ class Connection(object):
        ):
            A client that uses the faster BigQuery Storage API to fetch rows from
            BigQuery. If not passed, it is created using the same credentials
-            as ``client``.
+            as ``client`` (provided that BigQuery Storage dependencies are installed).

-            When fetching query results, ``bqstorage_client`` is used first, with
-            a fallback on ``client``, if necessary.
-
-            .. note::
-                There is a known issue with the BigQuery Storage API with small
-                anonymous result sets, which results in such fallback.
-
-                https://github.com/googleapis/python-bigquery-storage/issues/2
+            If both clients are available, ``bqstorage_client`` is used for
+            fetching query results.
    """

    def __init__(self, client=None, bqstorage_client=None):
@@ -110,16 +104,10 @@ def connect(client=None, bqstorage_client=None):
        ):
            A client that uses the faster BigQuery Storage API to fetch rows from
            BigQuery. If not passed, it is created using the same credentials
-            as ``client``.
-
-            When fetching query results, ``bqstorage_client`` is used first, with
-            a fallback on ``client``, if necessary.
-
-            .. note::
-                There is a known issue with the BigQuery Storage API with small
-                anonymous result sets, which results in such fallback.
+            as ``client`` (provided that BigQuery Storage dependencies are installed).

-                https://github.com/googleapis/python-bigquery-storage/issues/2
+            If both clients are available, ``bqstorage_client`` is used for
+            fetching query results.

    Returns:
        google.cloud.bigquery.dbapi.Connection: A new DB-API connection to BigQuery.
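
The updated docstring is easier to see in use. Below is a minimal sketch (not part of this commit) of opening a DB-API connection with an explicit BigQuery Storage read client, mirroring the pattern in tests/system.py; the variable names and the use of the private ``client._credentials`` attribute are illustrative only.

    # Illustrative sketch: pair a regular client with a BigQuery Storage client.
    from google.cloud import bigquery
    from google.cloud import bigquery_storage_v1
    from google.cloud.bigquery import dbapi

    client = bigquery.Client()
    bqstorage_client = bigquery_storage_v1.BigQueryReadClient(
        credentials=client._credentials  # mirrors the system tests; illustrative
    )

    # If both clients are available, query results are fetched with bqstorage_client.
    connection = dbapi.connect(client, bqstorage_client)
    cursor = connection.cursor()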

google/cloud/bigquery/dbapi/cursor.py

Lines changed: 3 additions & 20 deletions
@@ -224,26 +224,9 @@ def _try_fetch(self, size=None):
        bqstorage_client = self.connection._bqstorage_client

        if bqstorage_client is not None:
-            try:
-                rows_iterable = self._bqstorage_fetch(bqstorage_client)
-                self._query_data = _helpers.to_bq_table_rows(rows_iterable)
-                return
-            except google.api_core.exceptions.GoogleAPICallError as exc:
-                # NOTE: Forbidden is a subclass of GoogleAPICallError
-                if isinstance(exc, google.api_core.exceptions.Forbidden):
-                    # Don't hide errors such as insufficient permissions to create
-                    # a read session, or the API is not enabled. Both of those are
-                    # clearly problems if the developer has explicitly asked for
-                    # BigQuery Storage API support.
-                    raise
-
-                # There is an issue with reading from small anonymous
-                # query results tables. If such an error occurs, we silence
-                # it in order to try again with the tabledata.list API.
-                _LOGGER.debug(
-                    "Error fetching data with BigQuery Storage API, "
-                    "falling back to tabledata.list API."
-                )
+            rows_iterable = self._bqstorage_fetch(bqstorage_client)
+            self._query_data = _helpers.to_bq_table_rows(rows_iterable)
+            return

        rows_iter = client.list_rows(
            self._query_job.destination,
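
Since ``_try_fetch`` no longer catches ``GoogleAPICallError`` and falls back to ``tabledata.list``, read errors from the BigQuery Storage API now propagate to the caller. A hedged sketch (not from this commit) of handling that at the application level; the query and variable names are illustrative.

    # Illustrative sketch: BQ Storage read errors now surface from the cursor.
    import google.api_core.exceptions
    from google.cloud import bigquery, bigquery_storage_v1
    from google.cloud.bigquery import dbapi

    client = bigquery.Client()
    bqstorage_client = bigquery_storage_v1.BigQueryReadClient()
    cursor = dbapi.connect(client, bqstorage_client).cursor()
    cursor.execute("SELECT 1")

    try:
        rows = cursor.fetchall()
    except google.api_core.exceptions.GoogleAPICallError:
        # Errors are no longer logged and silently retried with tabledata.list;
        # handle or re-raise them as appropriate for the application.
        raise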

google/cloud/bigquery/job.py

Lines changed: 0 additions & 3 deletions
@@ -3335,9 +3335,6 @@ def to_dataframe(
                Reading from a specific partition or snapshot is not
                currently supported by this method.

-                **Caution**: There is a known issue reading small anonymous
-                query result tables with the BQ Storage API. Write your query
-                results to a destination table to work around this issue.
            dtypes (Map[str, Union[str, pandas.Series.dtype]]):
                Optional. A dictionary of column names pandas ``dtype``s. The
                provided ``dtype`` is used when constructing the series for
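
For context, the ``dtypes`` argument in the surrounding docstring is used roughly as follows; a minimal sketch (not part of this commit), with a public sample table and dtype chosen only for illustration.

    # Illustrative sketch: override the pandas dtype for a named column.
    from google.cloud import bigquery

    client = bigquery.Client()
    query_job = client.query(
        "SELECT name, number "
        "FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 100"
    )

    # ``dtypes`` maps column names to the pandas dtype used for that column.
    df = query_job.to_dataframe(dtypes={"number": "int32"})
    print(df.dtypes)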

google/cloud/bigquery/table.py

Lines changed: 8 additions & 36 deletions
@@ -1431,30 +1431,10 @@ def _to_page_iterable(
        self, bqstorage_download, tabledata_list_download, bqstorage_client=None
    ):
        if bqstorage_client is not None:
-            try:
-                # Iterate over the stream so that read errors are raised (and
-                # the method can then fallback to tabledata.list).
-                for item in bqstorage_download():
-                    yield item
-                return
-            except google.api_core.exceptions.Forbidden:
-                # Don't hide errors such as insufficient permissions to create
-                # a read session, or the API is not enabled. Both of those are
-                # clearly problems if the developer has explicitly asked for
-                # BigQuery Storage API support.
-                raise
-            except google.api_core.exceptions.GoogleAPICallError:
-                # There is a known issue with reading from small anonymous
-                # query results tables, so some errors are expected. Rather
-                # than throw those errors, try reading the DataFrame again, but
-                # with the tabledata.list API.
-                pass
-
-            _LOGGER.debug(
-                "Started reading table '{}.{}.{}' with tabledata.list.".format(
-                    self._table.project, self._table.dataset_id, self._table.table_id
-                )
-            )
+            for item in bqstorage_download():
+                yield item
+            return
+
        for item in tabledata_list_download():
            yield item

@@ -1599,14 +1579,10 @@ def to_dataframe_iterable(self, bqstorage_client=None, dtypes=None):
                This method requires the ``pyarrow`` and
                ``google-cloud-bigquery-storage`` libraries.

-                This method only exposes a subset of the capabilities of the
-                BigQuery Storage API. For full access to all features
+                This method only exposes a subset of the capabilities of the
+                BigQuery Storage API. For full access to all features
                (projections, filters, snapshots) use the Storage API directly.

-                **Caution**: There is a known issue reading small anonymous
-                query result tables with the BQ Storage API. When a problem
-                is encountered reading a table, the tabledata.list method
-                from the BigQuery API is used, instead.
            dtypes (Map[str, Union[str, pandas.Series.dtype]]):
                Optional. A dictionary of column names pandas ``dtype``s. The
                provided ``dtype`` is used when constructing the series for
@@ -1668,14 +1644,10 @@ def to_dataframe(
                This method requires the ``pyarrow`` and
                ``google-cloud-bigquery-storage`` libraries.

-                This method only exposes a subset of the capabilities of the
-                BigQuery Storage API. For full access to all features
+                This method only exposes a subset of the capabilities of the
+                BigQuery Storage API. For full access to all features
                (projections, filters, snapshots) use the Storage API directly.

-                **Caution**: There is a known issue reading small anonymous
-                query result tables with the BQ Storage API. When a problem
-                is encountered reading a table, the tabledata.list method
-                from the BigQuery API is used, instead.
            dtypes (Map[str, Union[str, pandas.Series.dtype]]):
                Optional. A dictionary of column names pandas ``dtype``s. The
                provided ``dtype`` is used when constructing the series for
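
The ``to_dataframe_iterable`` docstring touched above can be exercised roughly as follows; an illustrative sketch (not part of this commit) that assumes ``pyarrow`` and ``google-cloud-bigquery-storage`` are installed and default credentials are available.

    # Illustrative sketch: stream query results as a series of pandas DataFrames.
    from google.cloud import bigquery
    from google.cloud import bigquery_storage_v1

    client = bigquery.Client()
    bqstorage_client = bigquery_storage_v1.BigQueryReadClient()

    rows = client.query(
        "SELECT id, `by`, time_ts "
        "FROM `bigquery-public-data.hacker_news.comments` LIMIT 1000"
    ).result()

    # Each iteration yields one DataFrame per result chunk, not one big frame.
    for frame in rows.to_dataframe_iterable(bqstorage_client=bqstorage_client):
        print(len(frame))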

tests/system.py

Lines changed: 32 additions & 130 deletions
@@ -1690,50 +1690,6 @@ def test_dbapi_fetchall(self):
        row_tuples = [r.values() for r in rows]
        self.assertEqual(row_tuples, [(1, 2), (3, 4), (5, 6)])

-    @unittest.skipIf(
-        bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`"
-    )
-    def test_dbapi_fetch_w_bqstorage_client_small_result_set(self):
-        bqstorage_client = bigquery_storage_v1.BigQueryReadClient(
-            credentials=Config.CLIENT._credentials
-        )
-        cursor = dbapi.connect(Config.CLIENT, bqstorage_client).cursor()
-
-        # Reading small result sets causes an issue with BQ storage client,
-        # and the DB API should transparently fall back to the default client.
-        cursor.execute(
-            """
-            SELECT id, `by`, time_ts
-            FROM `bigquery-public-data.hacker_news.comments`
-            ORDER BY `id` ASC
-            LIMIT 10
-            """
-        )
-
-        result_rows = [cursor.fetchone(), cursor.fetchone(), cursor.fetchone()]
-
-        field_name = operator.itemgetter(0)
-        fetched_data = [sorted(row.items(), key=field_name) for row in result_rows]
-
-        expected_data = [
-            [
-                ("by", "sama"),
-                ("id", 15),
-                ("time_ts", datetime.datetime(2006, 10, 9, 19, 51, 1, tzinfo=UTC)),
-            ],
-            [
-                ("by", "pg"),
-                ("id", 17),
-                ("time_ts", datetime.datetime(2006, 10, 9, 19, 52, 45, tzinfo=UTC)),
-            ],
-            [
-                ("by", "pg"),
-                ("id", 22),
-                ("time_ts", datetime.datetime(2006, 10, 10, 2, 18, 22, tzinfo=UTC)),
-            ],
-        ]
-        self.assertEqual(fetched_data, expected_data)
-
    @unittest.skipIf(
        bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`"
    )
@@ -1744,10 +1700,6 @@ def test_dbapi_fetch_w_bqstorage_client_large_result_set(self):
        )
        cursor = dbapi.connect(Config.CLIENT, bqstorage_client).cursor()

-        # Pick a large enough LIMIT value to assure that the fallback to the
-        # default client is not needed due to the result set being too small
-        # (a known issue that causes problems when reading such result sets with
-        # BQ storage client).
        cursor.execute(
            """
            SELECT id, `by`, time_ts
@@ -1794,10 +1746,6 @@ def test_dbapi_fetch_w_bqstorage_client_v1beta1_large_result_set(self):
        )
        cursor = dbapi.connect(Config.CLIENT, bqstorage_client).cursor()

-        # Pick a large enouhg LIMIT value to assure that the fallback to the
-        # default client is not needed due to the result set being too small
-        # (a known issue that causes problems when reding such result sets with
-        # BQ storage client).
        cursor.execute(
            """
            SELECT id, `by`, time_ts
@@ -1845,10 +1793,6 @@ def test_dbapi_connection_does_not_leak_sockets(self):
        connection = dbapi.connect()
        cursor = connection.cursor()

-        # Pick a large enough LIMIT value to assure that the fallback to the
-        # default client is not needed due to the result set being too small
-        # (a known issue that causes problems when reding such result sets with
-        # BQ storage client).
        cursor.execute(
            """
            SELECT id, `by`, time_ts
@@ -2272,9 +2216,6 @@ def test_query_results_to_dataframe(self):
        bigquery_storage_v1 is None, "Requires `google-cloud-bigquery-storage`"
    )
    def test_query_results_to_dataframe_w_bqstorage(self):
-        dest_dataset = self.temp_dataset(_make_dataset_id("bqstorage_to_dataframe_"))
-        dest_ref = dest_dataset.table("query_results")
-
        query = """
            SELECT id, author, time_ts, dead
            FROM `bigquery-public-data.hacker_news.comments`
@@ -2285,50 +2226,29 @@ def test_query_results_to_dataframe_w_bqstorage(self):
            credentials=Config.CLIENT._credentials
        )

-        job_configs = (
-            # There is a known issue reading small anonymous query result
-            # tables with the BQ Storage API. Writing to a destination
-            # table works around this issue.
-            bigquery.QueryJobConfig(
-                destination=dest_ref, write_disposition="WRITE_TRUNCATE"
-            ),
-            # Check that the client is able to work around the issue with
-            # reading small anonymous query result tables by falling back to
-            # the tabledata.list API.
-            None,
-        )
-
-        for job_config in job_configs:
-            df = (
-                Config.CLIENT.query(query, job_config=job_config)
-                .result()
-                .to_dataframe(bqstorage_client)
-            )
+        df = Config.CLIENT.query(query).result().to_dataframe(bqstorage_client)

-            self.assertIsInstance(df, pandas.DataFrame)
-            self.assertEqual(len(df), 10)  # verify the number of rows
-            column_names = ["id", "author", "time_ts", "dead"]
-            self.assertEqual(list(df), column_names)
-            exp_datatypes = {
-                "id": int,
-                "author": six.text_type,
-                "time_ts": pandas.Timestamp,
-                "dead": bool,
-            }
-            for index, row in df.iterrows():
-                for col in column_names:
-                    # all the schema fields are nullable, so None is acceptable
-                    if not row[col] is None:
-                        self.assertIsInstance(row[col], exp_datatypes[col])
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 10)  # verify the number of rows
+        column_names = ["id", "author", "time_ts", "dead"]
+        self.assertEqual(list(df), column_names)
+        exp_datatypes = {
+            "id": int,
+            "author": six.text_type,
+            "time_ts": pandas.Timestamp,
+            "dead": bool,
+        }
+        for index, row in df.iterrows():
+            for col in column_names:
+                # all the schema fields are nullable, so None is acceptable
+                if not row[col] is None:
+                    self.assertIsInstance(row[col], exp_datatypes[col])

    @unittest.skipIf(pandas is None, "Requires `pandas`")
    @unittest.skipIf(
        bigquery_storage_v1beta1 is None, "Requires `google-cloud-bigquery-storage`"
    )
    def test_query_results_to_dataframe_w_bqstorage_v1beta1(self):
-        dest_dataset = self.temp_dataset(_make_dataset_id("bqstorage_to_dataframe_"))
-        dest_ref = dest_dataset.table("query_results")
-
        query = """
            SELECT id, author, time_ts, dead
            FROM `bigquery-public-data.hacker_news.comments`
@@ -2339,41 +2259,23 @@ def test_query_results_to_dataframe_w_bqstorage_v1beta1(self):
            credentials=Config.CLIENT._credentials
        )

-        job_configs = (
-            # There is a known issue reading small anonymous query result
-            # tables with the BQ Storage API. Writing to a destination
-            # table works around this issue.
-            bigquery.QueryJobConfig(
-                destination=dest_ref, write_disposition="WRITE_TRUNCATE"
-            ),
-            # Check that the client is able to work around the issue with
-            # reading small anonymous query result tables by falling back to
-            # the tabledata.list API.
-            None,
-        )
-
-        for job_config in job_configs:
-            df = (
-                Config.CLIENT.query(query, job_config=job_config)
-                .result()
-                .to_dataframe(bqstorage_client)
-            )
+        df = Config.CLIENT.query(query).result().to_dataframe(bqstorage_client)

-            self.assertIsInstance(df, pandas.DataFrame)
-            self.assertEqual(len(df), 10)  # verify the number of rows
-            column_names = ["id", "author", "time_ts", "dead"]
-            self.assertEqual(list(df), column_names)
-            exp_datatypes = {
-                "id": int,
-                "author": six.text_type,
-                "time_ts": pandas.Timestamp,
-                "dead": bool,
-            }
-            for index, row in df.iterrows():
-                for col in column_names:
-                    # all the schema fields are nullable, so None is acceptable
-                    if not row[col] is None:
-                        self.assertIsInstance(row[col], exp_datatypes[col])
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 10)  # verify the number of rows
+        column_names = ["id", "author", "time_ts", "dead"]
+        self.assertEqual(list(df), column_names)
+        exp_datatypes = {
+            "id": int,
+            "author": six.text_type,
+            "time_ts": pandas.Timestamp,
+            "dead": bool,
+        }
+        for index, row in df.iterrows():
+            for col in column_names:
+                # all the schema fields are nullable, so None is acceptable
+                if not row[col] is None:
+                    self.assertIsInstance(row[col], exp_datatypes[col])

    @unittest.skipIf(pandas is None, "Requires `pandas`")
    def test_insert_rows_from_dataframe(self):
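
With the destination-table workaround gone, the simplified tests read small anonymous query results straight through the BigQuery Storage client. A minimal sketch (not part of this commit) of the same pattern outside the test suite; the query and variable names are illustrative, and ``client._credentials`` mirrors the tests.

    # Illustrative sketch: no destination table or WRITE_TRUNCATE job config is
    # needed; the anonymous (cached) result table is read with the BQ Storage client.
    from google.cloud import bigquery, bigquery_storage_v1

    client = bigquery.Client()
    bqstorage_client = bigquery_storage_v1.BigQueryReadClient(
        credentials=client._credentials
    )

    query = """
        SELECT id, author, time_ts, dead
        FROM `bigquery-public-data.hacker_news.comments`
        LIMIT 10
    """
    df = client.query(query).result().to_dataframe(bqstorage_client)
    print(df.dtypes)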
