
Commit cf1aadd

Authored by tswast, gcf-owl-bot[bot], and chalmerlowe
fix: to_gbq uses default_type for ambiguous array types and struct field types (#838)
* fix: `to_gbq` uses `default_type` for ambiguous array types and struct field types
* 🦉 Updates from OwlBot post-processor (see https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md)
* fix arrow list(null) case too
* 🦉 Updates from OwlBot post-processor
* lint
* Update pandas_gbq/schema/pandas_to_bigquery.py (Co-authored-by: Chalmer Lowe <[email protected]>)
* Update pandas_gbq/schema/pandas_to_bigquery.py (Co-authored-by: Chalmer Lowe <[email protected]>)
* 🦉 Updates from OwlBot post-processor
* 🦉 Updates from OwlBot post-processor
* remove redundant string check
* Apply suggestions from code review (Co-authored-by: Chalmer Lowe <[email protected]>)
* add docstrings and a few more test cases
* use python 3.10 for docs github action

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Co-authored-by: Chalmer Lowe <[email protected]>
1 parent 5484a8c commit cf1aadd

File tree

7 files changed, +244 -49 lines changed
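In short, the user-visible effect, sketched below. This example is illustrative, not part of the commit; it assumes pandas-gbq at this commit plus the pyarrow and google-cloud-bigquery dependencies. A column that pyarrow can only type as list(null), as in issue #836 referenced in the diffs, previously yielded no usable schema field; it now falls back to the caller's default_type:

import pyarrow
from google.cloud import bigquery
from pandas_gbq.schema import pyarrow_to_bigquery

# An all-null ARRAY column: no value contradicts the default type, so the
# element type falls back to it and the field stays REPEATED.
field = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
    "values", pyarrow.list_(pyarrow.null()), default_type="STRING"
)
assert field == bigquery.SchemaField("values", "STRING", mode="REPEATED")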

.github/workflows/docs.yml

+1 -1

@@ -12,7 +12,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.9"
+          python-version: "3.10"
       - name: Install nox
         run: |
           python -m pip install --upgrade setuptools pip wheel

owlbot.py

+1 -0

@@ -57,6 +57,7 @@
         "noxfile.py",
         "README.rst",
         # exclude this file as we have an alternate prerelease.cfg
+        ".github/workflows/docs.yml",
         ".kokoro/presubmit/prerelease-deps.cfg",
         ".kokoro/presubmit/presubmit.cfg",
     ],

pandas_gbq/schema/pandas_to_bigquery.py

+92 -19

@@ -4,7 +4,7 @@

 import collections.abc
 import datetime
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 import warnings

 import db_dtypes
@@ -28,14 +28,21 @@
 # `docs/source/writing.rst`.
 _PANDAS_DTYPE_TO_BQ = {
     "bool": "BOOLEAN",
+    "boolean": "BOOLEAN",
     "datetime64[ns, UTC]": "TIMESTAMP",
+    "datetime64[us, UTC]": "TIMESTAMP",
     "datetime64[ns]": "DATETIME",
+    "datetime64[us]": "DATETIME",
     "float32": "FLOAT",
     "float64": "FLOAT",
     "int8": "INTEGER",
     "int16": "INTEGER",
     "int32": "INTEGER",
     "int64": "INTEGER",
+    "Int8": "INTEGER",
+    "Int16": "INTEGER",
+    "Int32": "INTEGER",
+    "Int64": "INTEGER",
     "uint8": "INTEGER",
     "uint16": "INTEGER",
     "uint32": "INTEGER",
@@ -103,7 +110,7 @@ def dataframe_to_bigquery_fields(

         # Try to automatically determine the type based on a few rows of the data.
         values = dataframe.reset_index()[column]
-        bq_field = values_to_bigquery_field(column, values)
+        bq_field = values_to_bigquery_field(column, values, default_type=default_type)

         if bq_field:
             bq_schema_out.append(bq_field)
@@ -114,7 +121,9 @@
         arrow_value = pyarrow.array(values)
         bq_field = (
             pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
-                column, arrow_value.type
+                column,
+                arrow_value.type,
+                default_type=default_type,
             )
         )

@@ -151,6 +160,19 @@


 def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from a pandas dtype.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        dtype:
+            A pandas / numpy dtype object.
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is ambiguous like the object dtype.
+    """
     bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)

     if bq_type is not None:
@@ -164,9 +186,44 @@ def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
     return None


-def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
-    if isinstance(value, str):
-        return schema.SchemaField(name, "STRING")
+def value_to_bigquery_field(
+    name: str, value: Any, default_type: Optional[str] = None
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from a single value.
+
+    Args:
+        name:
+            The name of the field.
+        value:
+            The value to infer the type from. If None, the default type is
+            used if available.
+        default_type:
+            The default field type. Defaults to None.
+
+    Returns:
+        The schema field, or None if a type cannot be inferred.
+    """
+
+    # Set the SchemaField datatype to the given default_type if the value
+    # being assessed is None.
+    if value is None:
+        return schema.SchemaField(name, default_type)
+
+    # Map from Python types to BigQuery types. This isn't super exhaustive
+    # because we rely more on pyarrow, which can check more than one value to
+    # determine the type.
+    type_mapping = {
+        str: "STRING",
+    }
+
+    # geopandas and shapely are optional dependencies, so only check if those
+    # are installed.
+    if _BaseGeometry is not None:
+        type_mapping[_BaseGeometry] = "GEOGRAPHY"
+
+    for type_, bq_type in type_mapping.items():
+        if isinstance(value, type_):
+            return schema.SchemaField(name, bq_type)

     # For timezone-naive datetimes, the later pyarrow conversion to try and
     # learn the type add a timezone to such datetimes, causing them to be
@@ -182,35 +239,51 @@ def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
     else:
         return schema.SchemaField(name, "DATETIME")

-    if _BaseGeometry is not None and isinstance(value, _BaseGeometry):
-        return schema.SchemaField(name, "GEOGRAPHY")
-
     return None


-def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]:
+def values_to_bigquery_field(
+    name: str, values: Any, default_type: str = "STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from a list of values.
+
+    This function iterates through the given values to determine the
+    corresponding schema field type.
+
+    Args:
+        name:
+            The name of the field.
+        values:
+            An iterable of values to infer the type from. If all the values
+            are None or the iterable is empty, the function returns None.
+        default_type:
+            The default field type to use if a specific type cannot be
+            determined from the values. Defaults to "STRING".
+
+    Returns:
+        The schema field, or None if a type cannot be inferred.
+    """
     value = pandas_gbq.core.pandas.first_valid(values)

-    # All NULL, type not determinable.
+    # All values came back as NULL, thus type not determinable by this method.
+    # Return None so we can try other methods.
     if value is None:
         return None

-    field = value_to_bigquery_field(name, value)
-    if field is not None:
+    field = value_to_bigquery_field(name, value, default_type=default_type)
+    if field:
         return field

-    if isinstance(value, str):
-        return schema.SchemaField(name, "STRING")
-
-    # Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
-    # which can examine more values to determine all keys.
+    # Check plain ARRAY values here. Exclude mapping types to let STRUCT get
+    # determined by pyarrow, which can examine more values to determine all
+    # keys.
     if isinstance(value, collections.abc.Iterable) and not isinstance(
         value, collections.abc.Mapping
     ):
         # It could be that this value contains all None or is empty, so get the
         # first non-None value we can find.
         valid_item = pandas_gbq.core.pandas.first_array_valid(values)
-        field = value_to_bigquery_field(name, valid_item)
+        field = value_to_bigquery_field(name, valid_item, default_type=default_type)

         if field is not None:
             return schema.SchemaField(name, field.field_type, mode="REPEATED")
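For reference, a minimal sketch (not part of the commit) of the default_type plumbing above for a plain ARRAY column whose items are all None. It assumes `schema` is google.cloud.bigquery's schema module, as in this file, and passes a pandas Series because that is what dataframe_to_bigquery_fields hands to this function:

import pandas
from google.cloud.bigquery import schema
from pandas_gbq.schema.pandas_to_bigquery import values_to_bigquery_field

# Every array item is None, so per-value inference finds nothing; the element
# type falls back to default_type and the field is marked REPEATED.
values = pandas.Series([[None], [None, None]])
field = values_to_bigquery_field("tags", values, default_type="STRING")
assert field == schema.SchemaField("tags", "STRING", mode="REPEATED")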

pandas_gbq/schema/pyarrow_to_bigquery.py

+56 -5

@@ -37,7 +37,31 @@
 }


-def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
+def arrow_type_to_bigquery_field(
+    name, type_, default_type="STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from an arrow type.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        type_:
+            A pyarrow type object.
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is a type that doesn't have a clear mapping in BigQuery.
+
+            null() are assumed to be the ``default_type``, since there are no
+            values that contradict that.
+    """
+    # If a sub-field is the null type, then assume it's the default type, as
+    # that's the best we can do.
+    # https://github.com/googleapis/python-bigquery-pandas/issues/836
+    if pyarrow.types.is_null(type_):
+        return schema.SchemaField(name, default_type)
+
     # Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use
     # a special case to disambiguate them. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/450
@@ -52,22 +76,49 @@ def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
         return schema.SchemaField(name, detected_type)

     if pyarrow.types.is_list(type_):
-        return arrow_list_type_to_bigquery(name, type_)
+        return arrow_list_type_to_bigquery(name, type_, default_type=default_type)

     if pyarrow.types.is_struct(type_):
         inner_fields: list[pyarrow.Field] = []
         struct_type = cast(pyarrow.StructType, type_)
         for field_index in range(struct_type.num_fields):
             field = struct_type[field_index]
-            inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type))
+            inner_fields.append(
+                arrow_type_to_bigquery_field(
+                    field.name, field.type, default_type=default_type
+                )
+            )

         return schema.SchemaField(name, "RECORD", fields=inner_fields)

     return None


-def arrow_list_type_to_bigquery(name, type_) -> Optional[schema.SchemaField]:
-    inner_field = arrow_type_to_bigquery_field(name, type_.value_type)
+def arrow_list_type_to_bigquery(
+    name, type_, default_type="STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from an arrow list type.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        type_:
+            A pyarrow type object.
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is a type that doesn't have a clear mapping in BigQuery.
+
+            null() are assumed to be the ``default_type``, since there are no
+            values that contradict that.
+    """
+    inner_field = arrow_type_to_bigquery_field(
+        name, type_.value_type, default_type=default_type
+    )
+
+    # If this is None, it means we got some type that we can't cleanly map to
+    # a BigQuery type, so bubble that status up.
     if inner_field is None:
         return None
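A hedged sketch (not part of the commit) of the struct path above: a null()-typed sub-field now resolves to the default type instead of making the whole RECORD un-mappable:

import pyarrow
from pandas_gbq.schema import pyarrow_to_bigquery

struct_type = pyarrow.struct(
    [("name", pyarrow.string()), ("payload", pyarrow.null())]
)
field = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
    "record_col", struct_type, default_type="STRING"
)
assert field.field_type == "RECORD"
# The null() sub-field is assumed to be the default type.
assert {f.name: f.field_type for f in field.fields} == {
    "name": "STRING",
    "payload": "STRING",
}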

tests/unit/schema/test_pandas_to_bigquery.py

+40 -9

@@ -21,13 +21,34 @@ def module_under_test():
 def test_dataframe_to_bigquery_fields_w_named_index(module_under_test):
     df_data = collections.OrderedDict(
         [
+            ("str_index", ["a", "b"]),
             ("str_column", ["hello", "world"]),
             ("int_column", [42, 8]),
+            ("nullable_int_column", pandas.Series([42, None], dtype="Int64")),
+            ("uint_column", pandas.Series([7, 13], dtype="uint8")),
             ("bool_column", [True, False]),
+            ("boolean_column", pandas.Series([True, None], dtype="boolean")),
+            (
+                "datetime_column",
+                [
+                    datetime.datetime(1999, 12, 31, 23, 59, 59, 999999),
+                    datetime.datetime(2000, 1, 1, 0, 0, 0),
+                ],
+            ),
+            (
+                "timestamp_column",
+                [
+                    datetime.datetime(
+                        1999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc
+                    ),
+                    datetime.datetime(
+                        2000, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc
+                    ),
+                ],
+            ),
         ]
     )
-    index = pandas.Index(["a", "b"], name="str_index")
-    dataframe = pandas.DataFrame(df_data, index=index)
+    dataframe = pandas.DataFrame(df_data).set_index("str_index", drop=True)

     returned_schema = module_under_test.dataframe_to_bigquery_fields(
         dataframe, [], index=True
@@ -37,27 +58,37 @@ def test_dataframe_to_bigquery_fields_w_named_index(module_under_test):
         schema.SchemaField("str_index", "STRING", "NULLABLE"),
         schema.SchemaField("str_column", "STRING", "NULLABLE"),
         schema.SchemaField("int_column", "INTEGER", "NULLABLE"),
+        schema.SchemaField("nullable_int_column", "INTEGER", "NULLABLE"),
+        schema.SchemaField("uint_column", "INTEGER", "NULLABLE"),
         schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"),
+        schema.SchemaField("boolean_column", "BOOLEAN", "NULLABLE"),
+        schema.SchemaField("datetime_column", "DATETIME", "NULLABLE"),
+        schema.SchemaField("timestamp_column", "TIMESTAMP", "NULLABLE"),
     )
     assert returned_schema == expected_schema


 def test_dataframe_to_bigquery_fields_w_multiindex(module_under_test):
     df_data = collections.OrderedDict(
         [
+            ("str_index", ["a", "a"]),
+            ("int_index", [0, 0]),
+            (
+                "dt_index",
+                [
+                    datetime.datetime(1999, 12, 31, 23, 59, 59, 999999),
+                    datetime.datetime(2000, 1, 1, 0, 0, 0),
+                ],
+            ),
             ("str_column", ["hello", "world"]),
             ("int_column", [42, 8]),
             ("bool_column", [True, False]),
         ]
     )
-    index = pandas.MultiIndex.from_tuples(
-        [
-            ("a", 0, datetime.datetime(1999, 12, 31, 23, 59, 59, 999999)),
-            ("a", 0, datetime.datetime(2000, 1, 1, 0, 0, 0)),
-        ],
-        names=["str_index", "int_index", "dt_index"],
+    dataframe = pandas.DataFrame(df_data).set_index(
+        ["str_index", "int_index", "dt_index"],
+        drop=True,
     )
-    dataframe = pandas.DataFrame(df_data, index=index)

     returned_schema = module_under_test.dataframe_to_bigquery_fields(
         dataframe, [], index=True
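The new nullable-dtype columns exercise the _PANDAS_DTYPE_TO_BQ additions directly. A small sketch (not part of the commit), assuming dtype_to_bigquery_field returns a SchemaField for known dtypes as its new docstring states:

import pandas
from pandas_gbq.schema.pandas_to_bigquery import dtype_to_bigquery_field

# "Int64" (pandas nullable integer) and "boolean" now map via the dtype table
# without falling back to per-value inference.
int_field = dtype_to_bigquery_field(
    "nullable_int_column", pandas.Series([42, None], dtype="Int64").dtype
)
assert int_field.field_type == "INTEGER"

bool_field = dtype_to_bigquery_field(
    "boolean_column", pandas.Series([True, None], dtype="boolean").dtype
)
assert bool_field.field_type == "BOOLEAN"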

tests/unit/schema/test_pyarrow_to_bigquery.py

+8 -10

@@ -42,16 +42,14 @@ def test_arrow_type_to_bigquery_field_scalar_types(pyarrow_type, bigquery_type):


 def test_arrow_type_to_bigquery_field_unknown():
-    assert (
-        pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow.null())
-        is None
-    )
+    assert pyarrow_to_bigquery.arrow_type_to_bigquery_field(
+        "test_name", pyarrow.null(), default_type="DEFAULT_TYPE"
+    ) == bigquery.SchemaField("test_name", "DEFAULT_TYPE")


 def test_arrow_type_to_bigquery_field_list_of_unknown():
-    assert (
-        pyarrow_to_bigquery.arrow_type_to_bigquery_field(
-            "test_name", pyarrow.list_(pyarrow.null())
-        )
-        is None
-    )
+    assert pyarrow_to_bigquery.arrow_type_to_bigquery_field(
+        "test_name",
+        pyarrow.list_(pyarrow.null()),
+        default_type="DEFAULT_TYPE",
+    ) == bigquery.SchemaField("test_name", "DEFAULT_TYPE", mode="REPEATED")
