Skip to content

Commit d792ce0

Browse files
authored
feat: add support for Parquet options (#679)
* feat: add support for Parquet options, for load jobs and external table configs.
* Simplify ParquetOptions.to_api_repr() (co-authored by Tres Seaver).
* Expose ParquetOptions in the top-level namespace.
* Parquet options should be reflected in the generic options attribute.
1 parent a0a9fa2 commit d792ce0

File tree

7 files changed

+306
-1
lines changed

7 files changed

+306
-1
lines changed

google/cloud/bigquery/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from google.cloud.bigquery.external_config import CSVOptions
4848
from google.cloud.bigquery.external_config import GoogleSheetsOptions
4949
from google.cloud.bigquery.external_config import ExternalSourceFormat
50+
from google.cloud.bigquery.format_options import ParquetOptions
5051
from google.cloud.bigquery.job import Compression
5152
from google.cloud.bigquery.job import CopyJob
5253
from google.cloud.bigquery.job import CopyJobConfig
@@ -136,6 +137,7 @@
136137
"BigtableColumn",
137138
"CSVOptions",
138139
"GoogleSheetsOptions",
140+
"ParquetOptions",
139141
"DEFAULT_RETRY",
140142
# Enum Constants
141143
"enums",

google/cloud/bigquery/external_config.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from google.cloud.bigquery._helpers import _bytes_to_json
2828
from google.cloud.bigquery._helpers import _int_or_none
2929
from google.cloud.bigquery._helpers import _str_or_none
30+
from google.cloud.bigquery.format_options import ParquetOptions
3031
from google.cloud.bigquery.schema import SchemaField
3132

3233

@@ -53,6 +54,12 @@ class ExternalSourceFormat(object):
5354
DATASTORE_BACKUP = "DATASTORE_BACKUP"
5455
"""Specifies datastore backup format"""
5556

57+
ORC = "ORC"
58+
"""Specifies ORC format."""
59+
60+
PARQUET = "PARQUET"
61+
"""Specifies Parquet format."""
62+
5663
BIGTABLE = "BIGTABLE"
5764
"""Specifies Bigtable format."""
5865

@@ -540,7 +547,7 @@ def from_api_repr(cls, resource: dict) -> "GoogleSheetsOptions":
540547
return config
541548

542549

543-
_OPTION_CLASSES = (BigtableOptions, CSVOptions, GoogleSheetsOptions)
550+
_OPTION_CLASSES = (BigtableOptions, CSVOptions, GoogleSheetsOptions, ParquetOptions)
544551

545552

546553
class HivePartitioningOptions(object):
@@ -784,6 +791,25 @@ def schema(self, value):
784791
prop = {"fields": [field.to_api_repr() for field in value]}
785792
self._properties["schema"] = prop
786793

794+
@property
def parquet_options(self):
    """Optional[google.cloud.bigquery.format_options.ParquetOptions]: Extra
    settings that apply when ``sourceFormat`` is PARQUET.

    Evaluates to ``None`` whenever this config's source format is anything
    other than PARQUET; otherwise the stored options object is returned as-is.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.parquet_options
    """
    is_parquet = self.source_format == ExternalSourceFormat.PARQUET
    return self._options if is_parquet else None
805+
806+
@parquet_options.setter
def parquet_options(self, value):
    # Parquet options are only meaningful for a PARQUET source; any other
    # source format is rejected with a TypeError before anything is stored.
    if self.source_format == ExternalSourceFormat.PARQUET:
        self._options = value
        return

    raise TypeError(
        f"Cannot set Parquet options, source format is {self.source_format}"
    )
812+
787813
def to_api_repr(self) -> dict:
788814
"""Build an API representation of this object.
789815
google/cloud/bigquery/format_options.py

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright 2021 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import copy
from typing import Dict
from typing import Optional
17+
18+
19+
class ParquetOptions:
    """Additional options if the PARQUET source format is used.

    Option values live in a plain dict keyed by the API field names
    (``enumAsString``, ``enableListInference``). An option that has never
    been set is absent from that dict and reads back as ``None``.
    """

    _SOURCE_FORMAT = "PARQUET"
    _RESOURCE_NAME = "parquetOptions"

    def __init__(self):
        self._properties = {}

    @property
    def enum_as_string(self) -> Optional[bool]:
        """Indicates whether to infer Parquet ENUM logical type as STRING instead of
        BYTES by default.

        ``None`` if the option has not been set.

        See
        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ParquetOptions.FIELDS.enum_as_string
        """
        return self._properties.get("enumAsString")

    @enum_as_string.setter
    def enum_as_string(self, value: bool) -> None:
        self._properties["enumAsString"] = value

    @property
    def enable_list_inference(self) -> Optional[bool]:
        """Indicates whether to use schema inference specifically for Parquet LIST
        logical type.

        ``None`` if the option has not been set.

        See
        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ParquetOptions.FIELDS.enable_list_inference
        """
        return self._properties.get("enableListInference")

    @enable_list_inference.setter
    def enable_list_inference(self, value: bool) -> None:
        self._properties["enableListInference"] = value

    @classmethod
    def from_api_repr(cls, resource: Dict[str, bool]) -> "ParquetOptions":
        """Factory: construct an instance from a resource dict.

        Args:
            resource (Dict[str, bool]):
                Definition of a :class:`~.format_options.ParquetOptions` instance in
                the same representation as is returned from the API.

        Returns:
            :class:`~.format_options.ParquetOptions`:
                Configuration parsed from ``resource``.
        """
        config = cls()
        # Deep-copy so later changes to the instance never leak back into the
        # caller's dict (and vice versa).
        config._properties = copy.deepcopy(resource)
        return config

    def to_api_repr(self) -> Dict[str, bool]:
        """Build an API representation of this object.

        Returns:
            Dict[str, bool]:
                A dictionary in the format used by the BigQuery API. It is a
                copy, so mutating it does not affect this instance.
        """
        return copy.deepcopy(self._properties)

google/cloud/bigquery/job/load.py

+21
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
1818
from google.cloud.bigquery.external_config import HivePartitioningOptions
19+
from google.cloud.bigquery.format_options import ParquetOptions
1920
from google.cloud.bigquery import _helpers
2021
from google.cloud.bigquery.schema import SchemaField
2122
from google.cloud.bigquery.schema import _to_schema_fields
@@ -439,6 +440,26 @@ def write_disposition(self):
439440
def write_disposition(self, value):
440441
self._set_sub_prop("writeDisposition", value)
441442

443+
@property
def parquet_options(self):
    """Optional[google.cloud.bigquery.format_options.ParquetOptions]: Extra
    settings that apply when ``sourceFormat`` is PARQUET.

    ``None`` when no ``parquetOptions`` sub-property is stored; otherwise an
    options object built from the stored resource via
    ``ParquetOptions.from_api_repr``.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.parquet_options
    """
    resource = self._get_sub_prop("parquetOptions")
    if resource is None:
        return None
    return ParquetOptions.from_api_repr(resource)
455+
456+
@parquet_options.setter
def parquet_options(self, value):
    # Assigning None clears the stored sub-property; any other value is
    # stored in its API representation.
    if value is None:
        self._del_sub_prop("parquetOptions")
        return

    self._set_sub_prop("parquetOptions", value.to_api_repr())
462+
442463

443464
class LoadJob(_AsyncJob):
444465
"""Asynchronous job for loading data into a table.

tests/unit/job/test_load_config.py

+35
Original file line numberDiff line numberDiff line change
@@ -700,3 +700,38 @@ def test_write_disposition_setter(self):
700700
self.assertEqual(
701701
config._properties["load"]["writeDisposition"], write_disposition
702702
)
703+
704+
def test_parquet_options_missing(self):
    # A config with nothing stored under "parquetOptions" reports None.
    target_class = self._get_target_class()
    job_config = target_class()

    self.assertIsNone(job_config.parquet_options)
707+
708+
def test_parquet_options_hit(self):
    # Values placed directly in the raw properties are visible via the getter.
    stored = {"enumAsString": True, "enableListInference": False}
    job_config = self._get_target_class()()
    job_config._properties["load"]["parquetOptions"] = stored

    self.assertTrue(job_config.parquet_options.enum_as_string)
    self.assertFalse(job_config.parquet_options.enable_list_inference)
715+
716+
def test_parquet_options_setter(self):
    # Assigning an options object stores its API representation.
    from google.cloud.bigquery.format_options import ParquetOptions

    api_repr = {"enumAsString": False, "enableListInference": True}
    job_config = self._get_target_class()()

    job_config.parquet_options = ParquetOptions.from_api_repr(api_repr)

    stored = job_config._properties["load"]["parquetOptions"]
    self.assertEqual(stored, {"enumAsString": False, "enableListInference": True})
729+
730+
def test_parquet_options_setter_clearing(self):
    # Assigning None removes the "parquetOptions" entry altogether.
    job_config = self._get_target_class()()
    load_properties = job_config._properties["load"]
    load_properties["parquetOptions"] = {
        "enumAsString": False,
        "enableListInference": True,
    }

    job_config.parquet_options = None

    self.assertNotIn("parquetOptions", job_config._properties["load"])

tests/unit/test_external_config.py

+100
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,106 @@ def test_to_api_repr_bigtable(self):
425425

426426
self.assertEqual(got_resource, exp_resource)
427427

428+
def test_parquet_options_getter(self):
    from google.cloud.bigquery.format_options import ParquetOptions

    source_format = external_config.ExternalSourceFormat.PARQUET
    ec = external_config.ExternalConfig(source_format)
    replacement = ParquetOptions.from_api_repr(
        {"enumAsString": True, "enableListInference": False}
    )

    # Before anything is assigned, both option values read as None.
    self.assertIsNone(ec.parquet_options.enum_as_string)
    self.assertIsNone(ec.parquet_options.enable_list_inference)

    ec._options = replacement

    self.assertTrue(ec.parquet_options.enum_as_string)
    self.assertFalse(ec.parquet_options.enable_list_inference)

    # The Parquet-specific accessor and the generic one expose the same object.
    self.assertIs(ec.parquet_options, ec.options)
447+
448+
def test_parquet_options_getter_non_parquet_format(self):
    # A CSV config has no Parquet options to report.
    csv_format = external_config.ExternalSourceFormat.CSV
    ec = external_config.ExternalConfig(csv_format)

    self.assertIsNone(ec.parquet_options)
451+
452+
def test_parquet_options_setter(self):
    from google.cloud.bigquery.format_options import ParquetOptions

    source_format = external_config.ExternalSourceFormat.PARQUET
    ec = external_config.ExternalConfig(source_format)
    new_options = ParquetOptions.from_api_repr(
        {"enumAsString": False, "enableListInference": True}
    )

    ec.parquet_options = new_options

    # Setting Parquet options should be reflected in the generic options attribute.
    generic_options = ec.options
    self.assertFalse(generic_options.enum_as_string)
    self.assertTrue(generic_options.enable_list_inference)
467+
468+
def test_parquet_options_setter_non_parquet_format(self):
    from google.cloud.bigquery.format_options import ParquetOptions

    new_options = ParquetOptions.from_api_repr(
        {"enumAsString": False, "enableListInference": True}
    )
    csv_format = external_config.ExternalSourceFormat.CSV
    ec = external_config.ExternalConfig(csv_format)

    # Assigning Parquet options to a CSV config must be rejected.
    expected_message = "Cannot set.*source format is CSV"
    with self.assertRaisesRegex(TypeError, expected_message):
        ec.parquet_options = new_options
478+
479+
def test_from_api_repr_parquet(self):
    from google.cloud.bigquery.format_options import ParquetOptions

    parquet_overrides = {
        "sourceFormat": "PARQUET",
        "parquetOptions": {"enumAsString": True, "enableListInference": False},
    }
    resource = _copy_and_update(self.BASE_RESOURCE, parquet_overrides)

    ec = external_config.ExternalConfig.from_api_repr(resource)

    self._verify_base(ec)
    self.assertEqual(ec.source_format, external_config.ExternalSourceFormat.PARQUET)
    self.assertIsInstance(ec.options, ParquetOptions)
    self.assertTrue(ec.parquet_options.enum_as_string)
    self.assertFalse(ec.parquet_options.enable_list_inference)

    # Parsing followed by serializing must reproduce the resource.
    self.assertEqual(ec.to_api_repr(), resource)

    # Repeat the round trip with one option missing from the resource.
    del resource["parquetOptions"]["enableListInference"]
    ec = external_config.ExternalConfig.from_api_repr(resource)
    self.assertIsNone(ec.options.enable_list_inference)
    self.assertEqual(ec.to_api_repr(), resource)
507+
508+
def test_to_api_repr_parquet(self):
    from google.cloud.bigquery.format_options import ParquetOptions

    parquet_format = external_config.ExternalSourceFormat.PARQUET
    ec = external_config.ExternalConfig(parquet_format)
    ec._options = ParquetOptions.from_api_repr(
        {"enumAsString": False, "enableListInference": True}
    )

    got_resource = ec.to_api_repr()

    # The serialized config carries the format plus the Parquet sub-resource.
    exp_resource = {
        "sourceFormat": external_config.ExternalSourceFormat.PARQUET,
        "parquetOptions": {"enumAsString": False, "enableListInference": True},
    }
    self.assertEqual(got_resource, exp_resource)
527+
428528

429529
def _copy_and_update(d, u):
430530
d = copy.deepcopy(d)

tests/unit/test_format_options.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright 2021 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
class TestParquetOptions:
    @staticmethod
    def _get_target_class():
        # Imported lazily so the class under test is resolved at call time.
        from google.cloud.bigquery.format_options import ParquetOptions

        return ParquetOptions

    def test_ctor(self):
        # A freshly constructed instance has neither option set.
        target_class = self._get_target_class()
        options = target_class()

        assert options.enum_as_string is None
        assert options.enable_list_inference is None

    def test_from_api_repr(self):
        resource = {"enumAsString": False, "enableListInference": True}

        options = self._get_target_class().from_api_repr(resource)

        assert not options.enum_as_string
        assert options.enable_list_inference

    def test_to_api_repr(self):
        options = self._get_target_class()()
        options.enum_as_string = True
        options.enable_list_inference = False

        # Attribute names map onto the camelCase API field names.
        expected = {"enumAsString": True, "enableListInference": False}
        assert options.to_api_repr() == expected

0 commit comments

Comments
 (0)