Commit 62fd565

fix: use compliant Parquet by default
1 parent ee1e25c · commit 62fd565

2 files changed: +54 -15

google/cloud/bigquery/_pandas_helpers.py

+18 -2

@@ -530,7 +530,13 @@ def dataframe_to_arrow(dataframe, bq_schema):
     return pyarrow.Table.from_arrays(arrow_arrays, names=arrow_names)


-def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SNAPPY"):
+def dataframe_to_parquet(
+    dataframe,
+    bq_schema,
+    filepath,
+    parquet_compression="SNAPPY",
+    parquet_use_compliant_nested_type=True,
+):
     """Write dataframe as a Parquet file, according to the desired BQ schema.

     This function requires the :mod:`pyarrow` package. Arrow is used as an
@@ -551,14 +557,24 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN
             The compression codec to use by the ``pyarrow.parquet.write_table``
             serializing method. Defaults to "SNAPPY".
             https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+        parquet_use_compliant_nested_type (bool):
+            Whether the ``pyarrow.parquet.write_table`` serializing method should write
+            compliant Parquet nested type (lists). Defaults to ``True``.
+            https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
+            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
     """
     pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)

     import pyarrow.parquet

     bq_schema = schema._to_schema_fields(bq_schema)
     arrow_table = dataframe_to_arrow(dataframe, bq_schema)
-    pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression)
+    pyarrow.parquet.write_table(
+        arrow_table,
+        filepath,
+        compression=parquet_compression,
+        use_compliant_nested_type=parquet_use_compliant_nested_type,
+    )


 def _row_iterator_page_to_arrow(page, column_names, arrow_types):
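For context on what the new ``parquet_use_compliant_nested_type=True`` default toggles: pyarrow's ``use_compliant_nested_type`` option controls whether list columns are written with the Parquet-compliant ``list``/``element`` nested structure instead of the legacy ``item`` naming. The following is a minimal sketch, not taken from this repository; it assumes a pyarrow version that exposes ``use_compliant_nested_type`` (4.0+), and the file names are illustrative.

    # Sketch only: compare the Parquet schemas written with and without the flag.
    import pyarrow as pa
    import pyarrow.parquet

    table = pa.table({"tags": pa.array([["a", "b"], ["c"]])})  # a list-typed column

    # Legacy layout: the repeated group's inner field is named "item".
    pyarrow.parquet.write_table(table, "legacy.parquet", use_compliant_nested_type=False)

    # Compliant layout (the new default in this commit): the inner field is named
    # "element", matching the Parquet LogicalTypes spec for nested lists.
    pyarrow.parquet.write_table(table, "compliant.parquet", use_compliant_nested_type=True)

    # Inspect the Parquet schemas that were actually written.
    print(pyarrow.parquet.ParquetFile("legacy.parquet").schema)
    print(pyarrow.parquet.ParquetFile("compliant.parquet").schema)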

google/cloud/bigquery/client.py

+36 -13

@@ -2456,6 +2456,7 @@ def load_table_from_dataframe(
         project: str = None,
         job_config: LoadJobConfig = None,
         parquet_compression: str = "snappy",
+        parquet_use_compliant_nested_type: bool = True,
         timeout: float = DEFAULT_TIMEOUT,
     ) -> job.LoadJob:
         """Upload the contents of a table from a pandas DataFrame.
@@ -2519,18 +2520,34 @@ def load_table_from_dataframe(
                 :attr:`~google.cloud.bigquery.job.SourceFormat.PARQUET` are
                 supported.
             parquet_compression (Optional[str]):
-                [Beta] The compression method to use if intermittently
-                serializing ``dataframe`` to a parquet file.
-
-                The argument is directly passed as the ``compression``
-                argument to the underlying ``pyarrow.parquet.write_table()``
-                method (the default value "snappy" gets converted to uppercase).
-                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
-
-                If the job config schema is missing, the argument is directly
-                passed as the ``compression`` argument to the underlying
-                ``DataFrame.to_parquet()`` method.
-                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+                [Beta] The compression method to use if intermittently
+                serializing ``dataframe`` to a parquet file.
+
+                The argument is directly passed as the ``compression``
+                argument to the underlying ``pyarrow.parquet.write_table()``
+                method (the default value "snappy" gets converted to uppercase).
+                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+
+                If the job config schema is missing, the argument is directly
+                passed as the ``compression`` argument to the underlying
+                ``DataFrame.to_parquet()`` method.
+                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+            parquet_use_compliant_nested_type (bool):
+                Whether the ``pyarrow.parquet.write_table`` serializing method should write
+                compliant Parquet nested type (lists). Defaults to ``True``.
+
+                The argument is directly passed as the ``use_compliant_nested_type``
+                argument to the underlying ``pyarrow.parquet.write_table()``
+                method.
+                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
+
+                If the job config schema is missing, the argument is directly
+                passed as an additional ``kwarg`` argument to the underlying
+                ``DataFrame.to_parquet()`` method.
+                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
+
+                This argument is only present to allow for backwards compatibility with
+                tables created using an old version of this method.
             timeout (Optional[float]):
                 The number of seconds to wait for the underlying HTTP transport
                 before using ``retry``.
@@ -2647,9 +2664,15 @@ def load_table_from_dataframe(
                     job_config.schema,
                     tmppath,
                     parquet_compression=parquet_compression,
+                    parquet_use_compliant_nested_type=parquet_use_compliant_nested_type,
                 )
             else:
-                dataframe.to_parquet(tmppath, compression=parquet_compression)
+                dataframe.to_parquet(
+                    tmppath,
+                    engine="pyarrow",
+                    compression=parquet_compression,
+                    use_compliant_nested_type=parquet_use_compliant_nested_type,
+                )

         else:
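A hedged usage sketch of the client-side change (the project, dataset, and table IDs below are placeholders, not from this commit): after this commit, ``load_table_from_dataframe`` serializes the intermediate Parquet file with compliant nested types by default, and per the new docstring the flag only exists so callers can fall back to the old layout for tables created with an earlier version of this method.

    # Sketch only: destination table IDs and project are illustrative.
    import pandas
    from google.cloud import bigquery

    client = bigquery.Client(project="my-project")
    dataframe = pandas.DataFrame({"tags": [["a", "b"], ["c"]]})

    # Default after this commit: list columns are written with the compliant
    # Parquet nested type before the load job is started.
    job = client.load_table_from_dataframe(dataframe, "my_dataset.my_table")
    job.result()

    # Opt out only for backwards compatibility with a table created by an
    # older version of this method (see the docstring above).
    job = client.load_table_from_dataframe(
        dataframe,
        "my_dataset.my_table_legacy",
        parquet_use_compliant_nested_type=False,
    )
    job.result()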