Commit 8ec63a4

ENH: Add table_schema parameter for user-defined BigQuery schema (#46)
1 parent cec8c86 commit 8ec63a4

File tree: 3 files changed, +57 −2 lines

docs/source/changelog.rst (+1)

@@ -6,6 +6,7 @@ Changelog
 
 - Resolve issue where the optional ``--noauth_local_webserver`` command line argument would not be propagated during the authentication process. (:issue:`35`)
 - Drop support for Python 3.4 (:issue:`40`)
+- Add support for a passed schema in :func:`to_gbq` instead of inferring the schema from the passed ``DataFrame`` with ``DataFrame.dtypes`` (:issue:`46`)
 
 0.1.6 / 2017-05-03
 ------------------

pandas_gbq/gbq.py (+14 −2)

@@ -768,7 +768,8 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
 
 
 def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
-           verbose=True, reauth=False, if_exists='fail', private_key=None):
+           verbose=True, reauth=False, if_exists='fail', private_key=None,
+           table_schema=None):
     """Write a DataFrame to a Google BigQuery table.
 
     The main method a user calls to export pandas DataFrame contents to
@@ -815,6 +816,14 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
         Service account private key in JSON format. Can be file path
         or string contents. This is useful for remote server
         authentication (eg. jupyter iPython notebook on remote host)
+    table_schema : list of dicts
+        List of BigQuery table fields to which the DataFrame columns
+        conform, e.g. `[{'name': 'col1', 'type': 'STRING'},...]`. If the
+        schema is not provided, it will be generated according to the
+        dtypes of the DataFrame columns. See the BigQuery API
+        documentation on available names of a field.
+
+        .. versionadded:: 0.2.0
     """
 
     if if_exists not in ('fail', 'replace', 'append'):
@@ -831,7 +840,10 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
     table = _Table(project_id, dataset_id, reauth=reauth,
                    private_key=private_key)
 
-    table_schema = _generate_bq_schema(dataframe)
+    if not table_schema:
+        table_schema = _generate_bq_schema(dataframe)
+    else:
+        table_schema = dict(fields=table_schema)
 
     # If table exists, check if_exists parameter
     if table.exists(table_id):
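
Taken together, the change keeps schema inference from ``DataFrame.dtypes`` as the default and simply wraps a user-supplied schema in ``dict(fields=...)``. A minimal usage sketch of the new parameter follows; the project and table names are placeholders and not part of the commit:

import pandas as pd
from pandas_gbq import gbq

df = pd.DataFrame({'col1': ['a', 'b'], 'col2': [1.5, 2.5]})

# Explicit schema: one dict per column, using BigQuery type names.
schema = [{'name': 'col1', 'type': 'STRING'},
          {'name': 'col2', 'type': 'FLOAT'}]

# 'my-project' and 'my_dataset.my_table' are placeholder identifiers.
gbq.to_gbq(df, 'my_dataset.my_table', 'my-project',
           table_schema=schema)

# Omitting table_schema keeps the previous behaviour: the schema is
# inferred from df.dtypes via _generate_bq_schema.
gbq.to_gbq(df, 'my_dataset.other_table', 'my-project', if_exists='replace')

Passing the schema explicitly is mainly useful when the inferred type would not be the desired one, as in the test below where column ``D`` is declared ``TIMESTAMP``.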

pandas_gbq/tests/test_gbq.py (+42)

@@ -1258,6 +1258,48 @@ def test_verify_schema_ignores_field_mode(self):
         assert self.sut.verify_schema(
             self.dataset_prefix + "1", TABLE_ID + test_id, test_schema_2)
 
+    def test_upload_data_with_valid_user_schema(self):
+        # Issue #46; tests scenarios with user-provided
+        # schemas
+        df = tm.makeMixedDataFrame()
+        test_id = "15"
+        test_schema = [{'name': 'A', 'type': 'FLOAT'},
+                       {'name': 'B', 'type': 'FLOAT'},
+                       {'name': 'C', 'type': 'STRING'},
+                       {'name': 'D', 'type': 'TIMESTAMP'}]
+        destination_table = self.destination_table + test_id
+        gbq.to_gbq(df, destination_table, _get_project_id(),
+                   private_key=_get_private_key_path(),
+                   table_schema=test_schema)
+        dataset, table = destination_table.split('.')
+        assert self.table.verify_schema(dataset, table,
+                                        dict(fields=test_schema))
+
+    def test_upload_data_with_invalid_user_schema_raises_error(self):
+        df = tm.makeMixedDataFrame()
+        test_id = "16"
+        test_schema = [{'name': 'A', 'type': 'FLOAT'},
+                       {'name': 'B', 'type': 'FLOAT'},
+                       {'name': 'C', 'type': 'FLOAT'},
+                       {'name': 'D', 'type': 'FLOAT'}]
+        destination_table = self.destination_table + test_id
+        with tm.assertRaises(gbq.StreamingInsertError):
+            gbq.to_gbq(df, destination_table, _get_project_id(),
+                       private_key=_get_private_key_path(),
+                       table_schema=test_schema)
+
+    def test_upload_data_with_missing_schema_fields_raises_error(self):
+        df = tm.makeMixedDataFrame()
+        test_id = "16"
+        test_schema = [{'name': 'A', 'type': 'FLOAT'},
+                       {'name': 'B', 'type': 'FLOAT'},
+                       {'name': 'C', 'type': 'FLOAT'}]
+        destination_table = self.destination_table + test_id
+        with tm.assertRaises(gbq.StreamingInsertError):
+            gbq.to_gbq(df, destination_table, _get_project_id(),
+                       private_key=_get_private_key_path(),
+                       table_schema=test_schema)
+
     def test_list_dataset(self):
         dataset_id = self.dataset_prefix + "1"
         assert dataset_id in self.dataset.datasets()
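
The invalid- and missing-schema tests above rely on BigQuery's streaming insert rejecting mismatched rows server-side (``gbq.StreamingInsertError``). A purely illustrative client-side pre-check along the same lines is sketched below; ``validate_user_schema`` is a hypothetical helper and not part of this commit:

import pandas.util.testing as tm


def validate_user_schema(dataframe, table_schema):
    # Hypothetical helper: ensure every DataFrame column has a matching
    # entry in the user-supplied BigQuery schema before uploading.
    schema_names = {field['name'] for field in table_schema}
    missing = [col for col in dataframe.columns if col not in schema_names]
    if missing:
        raise ValueError(
            'table_schema is missing fields for columns: {0}'.format(missing))


# The schema from test_upload_data_with_missing_schema_fields_raises_error
# omits column 'D', so this raises ValueError locally, before any API call.
df = tm.makeMixedDataFrame()
incomplete_schema = [{'name': 'A', 'type': 'FLOAT'},
                     {'name': 'B', 'type': 'FLOAT'},
                     {'name': 'C', 'type': 'FLOAT'}]
validate_user_schema(df, incomplete_schema)  # raises ValueError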
