
Commit f937edf

feat: read_gbq suggests using BigQuery DataFrames with large results (#769)
* feat: `read_gbq` suggests using BigQuery DataFrames with large results
* update docs
* guard against non-int bytes
* tweak message
* remove unnecessary also
* remove dead code
* remove directory that doesn't exist
* comment about GiB vs GB
1 parent 12a8db7 commit f937edf

8 files changed (+153, -51 lines)


docs/index.rst (+6)

@@ -23,6 +23,12 @@ Note: The canonical version of this documentation can always be found on the
 `BigQuery sandbox <https://cloud.google.com/bigquery/docs/sandbox>`__ to
 try the service for free.
 
+Also, consider using `BigQuery DataFrames
+<https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction>`__
+to process large results with pandas compatible APIs with transparent SQL
+pushdown to BigQuery engine. This provides an opportunity to save on costs
+and improve performance.
+
 While BigQuery uses standard SQL syntax, it has some important differences
 from traditional databases both in functionality, API limitations (size and
 quantity of queries or uploads), and how Google charges for use of the
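As background for the recommendation above, here is a minimal sketch of what the BigQuery DataFrames alternative looks like. It assumes the separately installed `bigframes` package; the project, dataset, table, and column names are placeholders:

    # Hedged sketch of the BigQuery DataFrames usage the docs now suggest.
    # Assumes `pip install bigframes`; all names below are placeholders.
    import bigframes.pandas as bpd

    # Evaluation is deferred: the filter and aggregation run inside
    # BigQuery instead of materializing the table in local memory.
    df = bpd.read_gbq("my-project.my_dataset.large_table")
    result = df[df["value"] > 0].groupby("category")["value"].mean()
    print(result.to_pandas())  # only the small aggregate is downloaded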

noxfile.py (+9)

@@ -375,6 +375,15 @@ def cover(session):
     session.install("coverage", "pytest-cov")
     session.run("coverage", "report", "--show-missing", "--fail-under=96")
 
+    # Make sure there is no dead code in our test directories.
+    session.run(
+        "coverage",
+        "report",
+        "--show-missing",
+        "--include=tests/unit/*",
+        "--fail-under=100",
+    )
+
     session.run("coverage", "erase")
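The comment in the new block explains the intent: a test helper that never runs, or a test that is never collected, shows up as uncovered, so requiring 100% coverage of tests/unit/* catches dead test code. For readers who prefer the library API over the CLI, roughly the same gate could be expressed as below; this is a sketch using coverage.py's Python API, not what the nox session actually runs:

    # Hedged Python-API equivalent of the CLI gate above. The nox session
    # shells out to `coverage report`; it does not run this code.
    import sys

    import coverage

    cov = coverage.Coverage()
    cov.load()  # read the .coverage data file left behind by the test run
    total = cov.report(include=["tests/unit/*"], show_missing=True)
    if total < 100.0:
        sys.exit(2)  # mirrors --fail-under=100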

pandas_gbq/constants.py (+12, new file)

@@ -0,0 +1,12 @@
+# Copyright (c) 2024 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# BigQuery uses powers of 2 in calculating data sizes. See:
+# https://cloud.google.com/bigquery/pricing#data The documentation uses
+# GiB rather than GB to disambiguate from the alternative base 10 units.
+# https://en.wikipedia.org/wiki/Byte#Multiple-byte_units
+BYTES_IN_KIB = 1024
+BYTES_IN_MIB = 1024 * BYTES_IN_KIB
+BYTES_IN_GIB = 1024 * BYTES_IN_MIB
+BYTES_TO_RECOMMEND_BIGFRAMES = BYTES_IN_GIB
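To make the GiB-vs-GB comment concrete: these constants encode base-2 units, which diverge from base-10 units by roughly 7% at this scale. A quick check using the names defined above (the byte count matches the 2 GiB table used in the unit tests below):

    # Base-2 (GiB) versus base-10 (GB) for the same byte count.
    BYTES_IN_KIB = 1024
    BYTES_IN_MIB = 1024 * BYTES_IN_KIB
    BYTES_IN_GIB = 1024 * BYTES_IN_MIB  # 1,073,741,824

    num_bytes = 2 * BYTES_IN_GIB
    print(f"{num_bytes / BYTES_IN_GIB:.1f} GiB")  # 2.0 GiB
    print(f"{num_bytes / 1e9:.2f} GB")            # 2.15 GB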

pandas_gbq/exceptions.py (+4)

@@ -28,6 +28,10 @@ class InvalidPrivateKeyFormat(ValueError):
     """
 
 
+class LargeResultsWarning(UserWarning):
+    """Raise when results are beyond that recommended for pandas DataFrame."""
+
+
 class PerformanceWarning(RuntimeWarning):
     """
     Raised when a performance-related feature is requested, but unsupported.
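Because the warning gets its own class, users can silence just this recommendation without muting other warnings. A short sketch of the opt-out that the warning message in gbq.py (below) spells out; the query is a placeholder:

    # Opting out of the large-results recommendation, as the warning
    # message itself suggests. The query is a placeholder.
    import warnings

    import pandas_gbq
    import pandas_gbq.exceptions

    warnings.simplefilter(
        "ignore", category=pandas_gbq.exceptions.LargeResultsWarning
    )
    df = pandas_gbq.read_gbq("SELECT * FROM `my-project.my_dataset.big_table`")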

pandas_gbq/features.py (-10)

@@ -9,7 +9,6 @@
 BIGQUERY_QUERY_AND_WAIT_VERSION = "3.14.0"
 PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
 PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
-PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0"
 
 
 class Features:
@@ -82,14 +81,5 @@ def pandas_has_boolean_dtype(self):
         desired_version = packaging.version.parse(PANDAS_BOOLEAN_DTYPE_VERSION)
         return self.pandas_installed_version >= desired_version
 
-    @property
-    def pandas_has_parquet_with_lossless_timestamp(self):
-        import packaging.version
-
-        desired_version = packaging.version.parse(
-            PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION
-        )
-        return self.pandas_installed_version >= desired_version
-
 
 FEATURES = Features()

pandas_gbq/gbq.py (+46, -13)

@@ -19,6 +19,8 @@
 if typing.TYPE_CHECKING:  # pragma: NO COVER
     import pandas
 
+import pandas_gbq.constants
+import pandas_gbq.exceptions
 from pandas_gbq.exceptions import GenericGBQException, QueryTimeout
 from pandas_gbq.features import FEATURES
 import pandas_gbq.query
@@ -478,6 +480,35 @@ def _download_results(
         if max_results is not None:
             create_bqstorage_client = False
 
+        # If we're downloading a large table, BigQuery DataFrames might be a
+        # better fit. Not all code paths will populate rows_iter._table, but
+        # if it's not populated that means we are working with a small result
+        # set.
+        if (table_ref := getattr(rows_iter, "_table", None)) is not None:
+            table = self.client.get_table(table_ref)
+            if (
+                isinstance((num_bytes := table.num_bytes), int)
+                and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES
+            ):
+                num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
+                warnings.warn(
+                    f"Recommendation: Your results are {num_gib:.1f} GiB. "
+                    "Consider using BigQuery DataFrames "
+                    "(https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction) "
+                    "to process large results with pandas compatible APIs with transparent SQL "
+                    "pushdown to BigQuery engine. This provides an opportunity to save on costs "
+                    "and improve performance. "
+                    "Please reach out to [email protected] with any "
+                    "questions or concerns. To disable this message, run "
+                    "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)",
+                    category=pandas_gbq.exceptions.LargeResultsWarning,
+                    # user's code
+                    # -> read_gbq
+                    # -> run_query
+                    # -> download_results
+                    stacklevel=4,
+                )
+
         try:
             schema_fields = [field.to_api_repr() for field in rows_iter.schema]
             conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
@@ -663,18 +694,25 @@ def read_gbq(
     *,
     col_order=None,
 ):
-    r"""Load data from Google BigQuery using google-cloud-python
-
-    The main method a user calls to execute a Query in Google BigQuery
-    and read results into a pandas DataFrame.
+    r"""Read data from Google BigQuery to a pandas DataFrame.
 
-    This method uses the Google Cloud client library to make requests to
-    Google BigQuery, documented `here
-    <https://googleapis.dev/python/bigquery/latest/index.html>`__.
+    Run a SQL query in BigQuery or read directly from a table, using
+    the `Python client library for BigQuery
+    <https://cloud.google.com/python/docs/reference/bigquery/latest/index.html>`__
+    and the client library for `BigQuery Storage
+    <https://cloud.google.com/python/docs/reference/bigquerystorage/latest>`__
+    to make API requests.
 
     See the :ref:`How to authenticate with Google BigQuery <authentication>`
     guide for authentication instructions.
 
+    .. note::
+        Consider using `BigQuery DataFrames
+        <https://cloud.google.com/bigquery/docs/dataframes-quickstart>`__ to
+        process large results with pandas compatible APIs that run in the
+        BigQuery SQL query engine. This provides an opportunity to save on
+        costs and improve performance.
+
     Parameters
     ----------
     query_or_table : str
@@ -1050,12 +1088,7 @@ def to_gbq(
     )
 
     if api_method == "default":
-        # Avoid using parquet if pandas doesn't support lossless conversions to
-        # parquet timestamp. See: https://stackoverflow.com/a/69758676/101923
-        if FEATURES.pandas_has_parquet_with_lossless_timestamp:
-            api_method = "load_parquet"
-        else:
-            api_method = "load_csv"
+        api_method = "load_parquet"
 
     if chunksize is not None:
         if api_method == "load_parquet":
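The `stacklevel=4` argument and the call-chain comment above deserve a note: `warnings.warn` attributes the warning to a frame that many levels up the stack, so the message points at the user's `read_gbq` call rather than at library internals. A self-contained illustration of the mechanism, with invented function names rather than pandas-gbq's real internals:

    # Standalone illustration of stacklevel; names are illustrative only.
    import warnings

    def download_results():
        # Frames counted from here: 1=this function, 2=run_query,
        # 3=read_gbq, 4=the caller's code, where the warning should point.
        warnings.warn("results are large", UserWarning, stacklevel=4)

    def run_query():
        download_results()

    def read_gbq():
        run_query()

    read_gbq()  # Python reports the warning against this line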

tests/unit/test_gbq.py (+75, -24)

@@ -6,17 +6,22 @@
 
 import copy
 import datetime
+import re
 from unittest import mock
+import warnings
 
 import google.api_core.exceptions
 import google.cloud.bigquery
+import google.cloud.bigquery.table
 import numpy
 import packaging.version
 import pandas
 from pandas import DataFrame
 import pytest
 
 from pandas_gbq import gbq
+import pandas_gbq.constants
+import pandas_gbq.exceptions
 import pandas_gbq.features
 from pandas_gbq.features import FEATURES
 
@@ -147,6 +152,62 @@ def test__transform_read_gbq_configuration_makes_copy(original, expected):
     assert did_change == should_change
 
 
+def test_GbqConnector_download_results_warns_for_large_tables(default_bigquery_client):
+    gbq._test_google_api_imports()
+    connector = _make_connector()
+    rows_iter = mock.create_autospec(
+        google.cloud.bigquery.table.RowIterator, instance=True
+    )
+    table = google.cloud.bigquery.Table.from_api_repr(
+        {
+            "tableReference": {
+                "projectId": "my-proj",
+                "datasetId": "my-dset",
+                "tableId": "my_tbl",
+            },
+            "numBytes": 2 * pandas_gbq.constants.BYTES_IN_GIB,
+        },
+    )
+    rows_iter._table = table
+    default_bigquery_client.get_table.reset_mock(side_effect=True)
+    default_bigquery_client.get_table.return_value = table
+
+    with pytest.warns(
+        pandas_gbq.exceptions.LargeResultsWarning,
+        match=re.escape("Your results are 2.0 GiB. Consider using BigQuery DataFrames"),
+    ):
+        connector._download_results(rows_iter)
+
+
+def test_GbqConnector_download_results_doesnt_warn_for_small_tables(
+    default_bigquery_client,
+):
+    gbq._test_google_api_imports()
+    connector = _make_connector()
+    rows_iter = mock.create_autospec(
+        google.cloud.bigquery.table.RowIterator, instance=True
+    )
+    table = google.cloud.bigquery.Table.from_api_repr(
+        {
+            "tableReference": {
+                "projectId": "my-proj",
+                "datasetId": "my-dset",
+                "tableId": "my_tbl",
+            },
+            "numBytes": 999 * pandas_gbq.constants.BYTES_IN_MIB,
+        },
+    )
+    rows_iter._table = table
+    default_bigquery_client.get_table.reset_mock(side_effect=True)
+    default_bigquery_client.get_table.return_value = table
+
+    with warnings.catch_warnings():
+        warnings.simplefilter(
+            "error", category=pandas_gbq.exceptions.LargeResultsWarning
+        )
+        connector._download_results(rows_iter)
+
+
 def test_GbqConnector_get_client_w_new_bq(mock_bigquery_client):
     gbq._test_google_api_imports()
     pytest.importorskip("google.api_core.client_info")
@@ -191,16 +252,13 @@ def test_to_gbq_with_chunksize_warns_deprecation(
     api_method, warning_message, warning_type
 ):
     with pytest.warns(warning_type, match=warning_message):
-        try:
-            gbq.to_gbq(
-                DataFrame([[1]]),
-                "dataset.tablename",
-                project_id="my-project",
-                api_method=api_method,
-                chunksize=100,
-            )
-        except gbq.TableCreationError:
-            pass
+        gbq.to_gbq(
+            DataFrame([[1]]),
+            "dataset.tablename",
+            project_id="my-project",
+            api_method=api_method,
+            chunksize=100,
+        )
 
 
 @pytest.mark.parametrize(["verbose"], [(True,), (False,)])
@@ -211,15 +269,12 @@ def test_to_gbq_with_verbose_new_pandas_warns_deprecation(monkeypatch, verbose):
         mock.PropertyMock(return_value=True),
     )
     with pytest.warns(FutureWarning, match="verbose is deprecated"):
-        try:
-            gbq.to_gbq(
-                DataFrame([[1]]),
-                "dataset.tablename",
-                project_id="my-project",
-                verbose=verbose,
-            )
-        except gbq.TableCreationError:
-            pass
+        gbq.to_gbq(
+            DataFrame([[1]]),
+            "dataset.tablename",
+            project_id="my-project",
+            verbose=verbose,
+        )
 
 
 def test_to_gbq_with_private_key_raises_notimplementederror():
@@ -233,11 +288,7 @@ def test_to_gbq_with_private_key_raises_notimplementederror():
 
 
 def test_to_gbq_doesnt_run_query(mock_bigquery_client):
-    try:
-        gbq.to_gbq(DataFrame([[1]]), "dataset.tablename", project_id="my-project")
-    except gbq.TableCreationError:
-        pass
-
+    gbq.to_gbq(DataFrame([[1]]), "dataset.tablename", project_id="my-project")
    mock_bigquery_client.query.assert_not_called()
243294

tests/unit/test_to_gbq.py (+1, -4)

@@ -8,14 +8,11 @@
 import pytest
 
 from pandas_gbq import gbq
-from pandas_gbq.features import FEATURES
 
 
 @pytest.fixture
 def expected_load_method(mock_bigquery_client):
-    if FEATURES.pandas_has_parquet_with_lossless_timestamp:
-        return mock_bigquery_client.load_table_from_dataframe
-    return mock_bigquery_client.load_table_from_file
+    return mock_bigquery_client.load_table_from_dataframe
 
 
 def test_to_gbq_create_dataset_with_location(mock_bigquery_client):
