Skip to content

Commit b54f867

Browse files
grooveygr, tseaver, tswast, busunkim96
authored
perf: remove redundant array deepcopy (#26)
* perf(bigquery): remove redundant array deepcopy. deepcopy can be a very costly operation when considering large arrays with complex nested objects. Refactor helpers to allow recursive conversion without copying arrays. * Add check to ignore REPEATED mode. * Update google/cloud/bigquery/_helpers.py. Co-authored-by: Bu Sun Kim <[email protected]> Co-authored-by: Tres Seaver <[email protected]> Co-authored-by: Tim Swast <[email protected]> Co-authored-by: Bu Sun Kim <[email protected]>
1 parent d1eb8b3 commit b54f867

File tree

2 files changed

+64
-10
lines changed

2 files changed

+64
-10
lines changed

google/cloud/bigquery/_helpers.py

+29-10
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
"""Shared helper functions for BigQuery API classes."""
1616

1717
import base64
18-
import copy
1918
import datetime
2019
import decimal
2120
import re
@@ -397,13 +396,9 @@ def _repeated_field_to_json(field, row_value):
397396
Returns:
398397
List[Any]: A list of JSON-serializable objects.
399398
"""
400-
# Remove the REPEATED, but keep the other fields. This allows us to process
401-
# each item as if it were a top-level field.
402-
item_field = copy.deepcopy(field)
403-
item_field._mode = "NULLABLE"
404399
values = []
405400
for item in row_value:
406-
values.append(_field_to_json(item_field, item))
401+
values.append(_single_field_to_json(field, item))
407402
return values
408403

409404

@@ -462,6 +457,33 @@ def _record_field_to_json(fields, row_value):
462457
return record
463458

464459

460+
def _single_field_to_json(field, row_value):
461+
"""Convert a single field into JSON-serializable values.
462+
463+
Ignores mode so that this can function for ARRAY / REPEATING fields
464+
without requiring a deepcopy of the field. See:
465+
https://github.com/googleapis/python-bigquery/issues/6
466+
467+
Args:
468+
field (google.cloud.bigquery.schema.SchemaField):
469+
The SchemaField to use for type conversion and field name.
470+
471+
row_value (Any):
472+
Scalar or Struct to be inserted. The type
473+
is inferred from the SchemaField's field_type.
474+
475+
Returns:
476+
Any: A JSON-serializable object.
477+
"""
478+
if row_value is None:
479+
return None
480+
481+
if field.field_type == "RECORD":
482+
return _record_field_to_json(field.fields, row_value)
483+
484+
return _scalar_field_to_json(field, row_value)
485+
486+
465487
def _field_to_json(field, row_value):
466488
"""Convert a field into JSON-serializable values.
467489
@@ -483,10 +505,7 @@ def _field_to_json(field, row_value):
483505
if field.mode == "REPEATED":
484506
return _repeated_field_to_json(field, row_value)
485507

486-
if field.field_type == "RECORD":
487-
return _record_field_to_json(field.fields, row_value)
488-
489-
return _scalar_field_to_json(field, row_value)
508+
return _single_field_to_json(field, row_value)
490509

491510

492511
def _snake_to_camel_case(value):

tests/unit/test__helpers.py

+35
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,41 @@ def test_w_known_field_type(self):
806806
self.assertEqual(converted, str(original))
807807

808808

809+
class Test_single_field_to_json(unittest.TestCase):
810+
def _call_fut(self, field, value):
811+
from google.cloud.bigquery._helpers import _single_field_to_json
812+
813+
return _single_field_to_json(field, value)
814+
815+
def test_w_none(self):
816+
field = _make_field("INT64")
817+
original = None
818+
converted = self._call_fut(field, original)
819+
self.assertIsNone(converted)
820+
821+
def test_w_record(self):
822+
subfields = [
823+
_make_field("INT64", name="one"),
824+
_make_field("STRING", name="two"),
825+
]
826+
field = _make_field("RECORD", fields=subfields)
827+
original = {"one": 42, "two": "two"}
828+
converted = self._call_fut(field, original)
829+
self.assertEqual(converted, {"one": "42", "two": "two"})
830+
831+
def test_w_scalar(self):
832+
field = _make_field("INT64")
833+
original = 42
834+
converted = self._call_fut(field, original)
835+
self.assertEqual(converted, str(original))
836+
837+
def test_w_scalar_ignores_mode(self):
838+
field = _make_field("STRING", mode="REPEATED")
839+
original = "hello world"
840+
converted = self._call_fut(field, original)
841+
self.assertEqual(converted, original)
842+
843+
809844
class Test_repeated_field_to_json(unittest.TestCase):
810845
def _call_fut(self, field, value):
811846
from google.cloud.bigquery._helpers import _repeated_field_to_json

0 commit comments

Comments
 (0)