Commit f9f2f45

fix: omit NaN values when uploading from insert_rows_from_dataframe (#170)
* fix: omit `NaN` values when uploading from `insert_rows_from_dataframe`

  NaN values are most often used to indicate a NULL value in pandas. Also, even when a column is a floating-point column, the BigQuery streaming API JSON parser doesn't seem to be able to handle NaN literals.

* doc: update docstring to indicate missing NaNs
1 parent 08c12c5 commit f9f2f45
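
The omission works because of an IEEE 754 property: NaN is the only value that compares unequal to itself, so `value != value` is a dependency-free NaN test. A minimal sketch of the check the patch relies on:

```python
import math

nan = float("NaN")

# NaN is the only value for which self-comparison fails (IEEE 754),
# so `x != x` detects NaN without importing math or numpy.
assert nan != nan
assert math.isnan(nan)

# Ordinary values compare equal to themselves and pass through.
assert not (0.25 != 0.25)
assert not ("abc" != "abc")
```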

4 files changed: +110 −9 lines

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -19,6 +19,7 @@
 import logging
 import warnings
 
+import six
 from six.moves import queue
 
 try:
@@ -780,3 +781,14 @@ def download_dataframe_bqstorage(
         selected_fields=selected_fields,
         page_to_item=page_to_item,
     )
+
+
+def dataframe_to_json_generator(dataframe):
+    for row in dataframe.itertuples(index=False, name=None):
+        output = {}
+        for column, value in six.moves.zip(dataframe.columns, row):
+            # Omit NaN values.
+            if value != value:
+                continue
+            output[column] = value
+        yield output
```
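
A quick usage sketch of the new helper (it lives in a private module, so this mirrors how `Client.insert_rows_from_dataframe` consumes it rather than documenting a public API): each row becomes a plain dict, with NaN-valued cells dropped entirely instead of being serialized as invalid JSON `NaN` literals.

```python
import pandas

from google.cloud.bigquery import _pandas_helpers

dataframe = pandas.DataFrame(
    {"string_col": ["alpha", "beta"], "float_col": [0.5, float("NaN")]}
)

rows = list(_pandas_helpers.dataframe_to_json_generator(dataframe))

# The NaN cell in the second row is omitted, so BigQuery will treat
# that column as NULL for that row.
assert rows == [
    {"string_col": "alpha", "float_col": 0.5},
    {"string_col": "beta"},
]
```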

google/cloud/bigquery/client.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -2535,7 +2535,9 @@ def insert_rows_from_dataframe(
             ]):
                 The destination table for the row data, or a reference to it.
             dataframe (pandas.DataFrame):
-                A :class:`~pandas.DataFrame` containing the data to load.
+                A :class:`~pandas.DataFrame` containing the data to load. Any
+                ``NaN`` values present in the dataframe are omitted from the
+                streaming API request(s).
             selected_fields (Sequence[google.cloud.bigquery.schema.SchemaField]):
                 The fields to return. Required if ``table`` is a
                 :class:`~google.cloud.bigquery.table.TableReference`.
@@ -2559,10 +2561,7 @@ def insert_rows_from_dataframe(
         insert_results = []
 
         chunk_count = int(math.ceil(len(dataframe) / chunk_size))
-        rows_iter = (
-            dict(six.moves.zip(dataframe.columns, row))
-            for row in dataframe.itertuples(index=False, name=None)
-        )
+        rows_iter = _pandas_helpers.dataframe_to_json_generator(dataframe)
 
         for _ in range(chunk_count):
             rows_chunk = itertools.islice(rows_iter, chunk_size)
```
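
End to end, the change is transparent to callers. A hedged sketch of how a dataframe with NaN markers would now stream (the project, dataset, and table names are placeholders, and the table is assumed to exist with a matching schema):

```python
import pandas

from google.cloud import bigquery

client = bigquery.Client()
# Hypothetical table; it must already exist with a matching schema.
table = client.get_table("my-project.my_dataset.my_table")

dataframe = pandas.DataFrame(
    {
        "string_col": ["a string", float("NaN")],  # NaN marks a NULL here
        "int_col": [10, 20],
    }
)

# Each NaN cell is now omitted from the row's JSON payload, so the
# streaming insert succeeds and the cell arrives as NULL.
errors = client.insert_rows_from_dataframe(table, dataframe, chunk_size=500)
assert all(not chunk for chunk in errors)
```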

tests/system.py

Lines changed: 26 additions & 4 deletions
```diff
@@ -2335,6 +2335,14 @@ def test_insert_rows_from_dataframe(self):
                     "string_col": "another string",
                     "int_col": 50,
                 },
+                {
+                    "float_col": 6.66,
+                    "bool_col": True,
+                    # Include a NaN value, because pandas often uses NaN as a
+                    # NULL value indicator.
+                    "string_col": float("NaN"),
+                    "int_col": 60,
+                },
             ]
         )
 
@@ -2344,14 +2352,28 @@ def test_insert_rows_from_dataframe(self):
         table = retry_403(Config.CLIENT.create_table)(table_arg)
         self.to_delete.insert(0, table)
 
-        Config.CLIENT.insert_rows_from_dataframe(table, dataframe, chunk_size=3)
+        chunk_errors = Config.CLIENT.insert_rows_from_dataframe(
+            table, dataframe, chunk_size=3
+        )
+        for errors in chunk_errors:
+            assert not errors
 
-        retry = RetryResult(_has_rows, max_tries=8)
-        rows = retry(self._fetch_single_page)(table)
+        # Use query to fetch rows instead of listing directly from the table so
+        # that we get values from the streaming buffer.
+        rows = list(
+            Config.CLIENT.query(
+                "SELECT * FROM `{}.{}.{}`".format(
+                    table.project, table.dataset_id, table.table_id
+                )
+            )
+        )
 
         sorted_rows = sorted(rows, key=operator.attrgetter("int_col"))
         row_tuples = [r.values() for r in sorted_rows]
-        expected = [tuple(data_row) for data_row in dataframe.itertuples(index=False)]
+        expected = [
+            tuple(None if col != col else col for col in data_row)
+            for data_row in dataframe.itertuples(index=False)
+        ]
 
         assert len(row_tuples) == len(expected)
```
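The rewritten `expected` list uses the same self-inequality trick in the other direction: NaN cells in the source dataframe are normalized to `None`, because the omitted values come back from BigQuery as NULLs. A standalone sketch of that normalization:

```python
data_row = (6.66, True, float("NaN"), 60)

# NaN fails self-comparison, so it maps to None; other values pass through.
expected = tuple(None if col != col else col for col in data_row)
assert expected == (6.66, True, None, 60)
```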

tests/unit/test_client.py

Lines changed: 68 additions & 0 deletions
```diff
@@ -5582,6 +5582,74 @@ def test_insert_rows_from_dataframe(self):
             )
             assert call == expected_call
 
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    def test_insert_rows_from_dataframe_nan(self):
+        from google.cloud.bigquery.schema import SchemaField
+        from google.cloud.bigquery.table import Table
+
+        API_PATH = "/projects/{}/datasets/{}/tables/{}/insertAll".format(
+            self.PROJECT, self.DS_ID, self.TABLE_REF.table_id
+        )
+
+        dataframe = pandas.DataFrame(
+            {
+                "str_col": ["abc", "def", float("NaN"), "jkl"],
+                "int_col": [1, float("NaN"), 3, 4],
+                "float_col": [float("NaN"), 0.25, 0.5, 0.125],
+            }
+        )
+
+        # create client
+        creds = _make_credentials()
+        http = object()
+        client = self._make_one(project=self.PROJECT, credentials=creds, _http=http)
+        conn = client._connection = make_connection({}, {})
+
+        # create table
+        schema = [
+            SchemaField("str_col", "STRING"),
+            SchemaField("int_col", "INTEGER"),
+            SchemaField("float_col", "FLOAT"),
+        ]
+        table = Table(self.TABLE_REF, schema=schema)
+
+        with mock.patch("uuid.uuid4", side_effect=map(str, range(len(dataframe)))):
+            error_info = client.insert_rows_from_dataframe(
+                table, dataframe, chunk_size=3, timeout=7.5
+            )
+
+        self.assertEqual(len(error_info), 2)
+        for chunk_errors in error_info:
+            assert chunk_errors == []
+
+        EXPECTED_SENT_DATA = [
+            {
+                "rows": [
+                    {"insertId": "0", "json": {"str_col": "abc", "int_col": 1}},
+                    {"insertId": "1", "json": {"str_col": "def", "float_col": 0.25}},
+                    {"insertId": "2", "json": {"int_col": 3, "float_col": 0.5}},
+                ]
+            },
+            {
+                "rows": [
+                    {
+                        "insertId": "3",
+                        "json": {"str_col": "jkl", "int_col": 4, "float_col": 0.125},
+                    }
+                ]
+            },
+        ]
+
+        actual_calls = conn.api_request.call_args_list
+
+        for call, expected_data in six.moves.zip_longest(
+            actual_calls, EXPECTED_SENT_DATA
+        ):
+            expected_call = mock.call(
+                method="POST", path=API_PATH, data=expected_data, timeout=7.5
+            )
+            assert call == expected_call
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     def test_insert_rows_from_dataframe_many_columns(self):
         from google.cloud.bigquery.schema import SchemaField
```
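
The two expected `insertAll` calls fall out of the chunking loop in `client.py`: with `chunk_size=3` and four rows, `math.ceil(4 / 3)` gives two chunks, and `itertools.islice` carves them off a single shared iterator. A minimal sketch of that pattern (Python 3 division assumed):

```python
import itertools
import math

rows = [{"int_col": value} for value in range(4)]
chunk_size = 3

chunk_count = int(math.ceil(len(rows) / chunk_size))  # ceil(4 / 3) == 2
rows_iter = iter(rows)

chunks = [
    list(itertools.islice(rows_iter, chunk_size)) for _ in range(chunk_count)
]

# The shared iterator advances across chunks: three rows, then one.
assert [len(chunk) for chunk in chunks] == [3, 1]
```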
