4
4
5
5
import collections .abc
6
6
import datetime
7
- from typing import Optional , Tuple
7
+ from typing import Any , Optional , Tuple
8
8
import warnings
9
9
10
10
import db_dtypes
28
28
# `docs/source/writing.rst`.
29
29
_PANDAS_DTYPE_TO_BQ = {
30
30
"bool" : "BOOLEAN" ,
31
+ "boolean" : "BOOLEAN" ,
31
32
"datetime64[ns, UTC]" : "TIMESTAMP" ,
33
+ "datetime64[us, UTC]" : "TIMESTAMP" ,
32
34
"datetime64[ns]" : "DATETIME" ,
35
+ "datetime64[us]" : "DATETIME" ,
33
36
"float32" : "FLOAT" ,
34
37
"float64" : "FLOAT" ,
35
38
"int8" : "INTEGER" ,
36
39
"int16" : "INTEGER" ,
37
40
"int32" : "INTEGER" ,
38
41
"int64" : "INTEGER" ,
42
+ "Int8" : "INTEGER" ,
43
+ "Int16" : "INTEGER" ,
44
+ "Int32" : "INTEGER" ,
45
+ "Int64" : "INTEGER" ,
39
46
"uint8" : "INTEGER" ,
40
47
"uint16" : "INTEGER" ,
41
48
"uint32" : "INTEGER" ,
@@ -103,7 +110,7 @@ def dataframe_to_bigquery_fields(
103
110
104
111
# Try to automatically determine the type based on a few rows of the data.
105
112
values = dataframe .reset_index ()[column ]
106
- bq_field = values_to_bigquery_field (column , values )
113
+ bq_field = values_to_bigquery_field (column , values , default_type = default_type )
107
114
108
115
if bq_field :
109
116
bq_schema_out .append (bq_field )
@@ -114,7 +121,9 @@ def dataframe_to_bigquery_fields(
114
121
arrow_value = pyarrow .array (values )
115
122
bq_field = (
116
123
pandas_gbq .schema .pyarrow_to_bigquery .arrow_type_to_bigquery_field (
117
- column , arrow_value .type
124
+ column ,
125
+ arrow_value .type ,
126
+ default_type = default_type ,
118
127
)
119
128
)
120
129
@@ -151,6 +160,19 @@ def dataframe_to_bigquery_fields(
151
160
152
161
153
162
def dtype_to_bigquery_field (name , dtype ) -> Optional [schema .SchemaField ]:
163
+ """Infers the BigQuery schema field type from a pandas dtype.
164
+
165
+ Args:
166
+ name (str):
167
+ Name of the column/field.
168
+ dtype:
169
+ A pandas / numpy dtype object.
170
+
171
+ Returns:
172
+ Optional[schema.SchemaField]:
173
+ The schema field, or None if a type cannot be inferred, such as if
174
+ it is ambiguous like the object dtype.
175
+ """
154
176
bq_type = _PANDAS_DTYPE_TO_BQ .get (dtype .name )
155
177
156
178
if bq_type is not None :
@@ -164,9 +186,44 @@ def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
164
186
return None
165
187
166
188
167
def value_to_bigquery_field(
    name: str, value: Any, default_type: Optional[str] = None
) -> Optional[schema.SchemaField]:
    """Infers the BigQuery schema field type from a single value.

    Args:
        name:
            The name of the field.
        value:
            The value to infer the type from. If None, the default type is
            used if available.
        default_type:
            The default field type. Defaults to None.

    Returns:
        The schema field, or None if a type cannot be inferred.
    """

    # Set the SchemaField datatype to the given default_type if the value
    # being assessed is None.
    if value is None:
        return schema.SchemaField(name, default_type)

    # Map from Python types to BigQuery types. This isn't super exhaustive
    # because we rely more on pyarrow, which can check more than one value to
    # determine the type.
    type_mapping = {
        str: "STRING",
    }

    # geopandas and shapely are optional dependencies, so only check if those
    # are installed.
    if _BaseGeometry is not None:
        type_mapping[_BaseGeometry] = "GEOGRAPHY"

    for type_, bq_type in type_mapping.items():
        if isinstance(value, type_):
            return schema.SchemaField(name, bq_type)

    # For timezone-naive datetimes, the later pyarrow conversion to try and
    # learn the type add a timezone to such datetimes, causing them to be
    # recognized as TIMESTAMP rather than DATETIME. Check the value directly
    # so that timezone-naive datetimes stay DATETIME.
    # NOTE(review): this branch was reconstructed from surrounding context
    # (the comment above and the DATETIME else-arm were visible) — confirm
    # against the original implementation.
    if isinstance(value, datetime.datetime):
        if value.tzinfo is not None:
            return schema.SchemaField(name, "TIMESTAMP")
        else:
            return schema.SchemaField(name, "DATETIME")

    return None
def values_to_bigquery_field(
    name: str, values: Any, default_type: str = "STRING"
) -> Optional[schema.SchemaField]:
    """Infers the BigQuery schema field type from a list of values.

    This function iterates through the given values to determine the
    corresponding schema field type.

    Args:
        name:
            The name of the field.
        values:
            An iterable of values to infer the type from. If all the values
            are None or the iterable is empty, the function returns None.
        default_type:
            The default field type to use if a specific type cannot be
            determined from the values. Defaults to "STRING".

    Returns:
        The schema field, or None if a type cannot be inferred.
    """
    value = pandas_gbq.core.pandas.first_valid(values)

    # All values came back as NULL, thus type not determinable by this method.
    # Return None so we can try other methods.
    if value is None:
        return None

    field = value_to_bigquery_field(name, value, default_type=default_type)
    if field:
        return field

    # Check plain ARRAY values here. Exclude mapping types to let STRUCT get
    # determined by pyarrow, which can examine more values to determine all
    # keys.
    if isinstance(value, collections.abc.Iterable) and not isinstance(
        value, collections.abc.Mapping
    ):
        # It could be that this value contains all None or is empty, so get the
        # first non-None value we can find.
        valid_item = pandas_gbq.core.pandas.first_array_valid(values)
        field = value_to_bigquery_field(name, valid_item, default_type=default_type)

        if field is not None:
            return schema.SchemaField(name, field.field_type, mode="REPEATED")
0 commit comments