Skip to content

Commit db50352

Browse files
emkornfield and wesm
emkornfield authored and wesm committed
ARROW-2587: [Python][Parquet] Verify nested data can be written
- Plumbs through engine version - Makes engine version settable via environment variable - Adds unit tests coverage Should also unblock: googleapis/python-bigquery#21 CC @wesm Closes #6751 from emkornfield/add_flag_to_python Authored-by: Micah Kornfield <[email protected]> Signed-off-by: Wes McKinney <[email protected]>
1 parent f140625 commit db50352

File tree

4 files changed

+54
-2
lines changed

4 files changed

+54
-2
lines changed

python/pyarrow/_parquet.pxd

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
376376
Builder* allow_truncated_timestamps()
377377
Builder* disallow_truncated_timestamps()
378378
Builder* store_schema()
379+
Builder* set_engine_version(ArrowWriterEngineVersion version)
379380
shared_ptr[ArrowWriterProperties] build()
380381
c_bool support_deprecated_int96_timestamps()
381382

@@ -437,6 +438,11 @@ cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil:
437438
shared_ptr[SchemaDescriptor]* out)
438439

439440

441+
cdef extern from "parquet/properties.h" namespace "parquet" nogil:
442+
cdef enum ArrowWriterEngineVersion:
443+
V1 "parquet::ArrowWriterProperties::V1",
444+
V2 "parquet::ArrowWriterProperties::V2"
445+
440446
cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
441447
cdef cppclass FileWriter:
442448

python/pyarrow/_parquet.pyx

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1209,6 +1209,7 @@ cdef class ParquetWriter:
12091209
object compression_level
12101210
object version
12111211
object write_statistics
1212+
object writer_engine_version
12121213
int row_group_size
12131214
int64_t data_page_size
12141215

@@ -1221,7 +1222,8 @@ cdef class ParquetWriter:
12211222
data_page_size=None,
12221223
allow_truncated_timestamps=False,
12231224
compression_level=None,
1224-
use_byte_stream_split=False):
1225+
use_byte_stream_split=False,
1226+
writer_engine_version=None):
12251227
cdef:
12261228
shared_ptr[WriterProperties] properties
12271229
c_string c_where
@@ -1247,6 +1249,7 @@ cdef class ParquetWriter:
12471249
self.coerce_timestamps = coerce_timestamps
12481250
self.allow_truncated_timestamps = allow_truncated_timestamps
12491251
self.use_byte_stream_split = use_byte_stream_split
1252+
self.writer_engine_version = writer_engine_version
12501253

12511254
cdef WriterProperties.Builder properties_builder
12521255
self._set_version(&properties_builder)
@@ -1269,6 +1272,7 @@ cdef class ParquetWriter:
12691272
self._set_int96_support(&arrow_properties_builder)
12701273
self._set_coerce_timestamps(&arrow_properties_builder)
12711274
self._set_allow_truncated_timestamps(&arrow_properties_builder)
1275+
self._set_writer_engine_version(&arrow_properties_builder)
12721276

12731277
arrow_properties = arrow_properties_builder.build()
12741278

@@ -1302,6 +1306,14 @@ cdef class ParquetWriter:
13021306
else:
13031307
props.disallow_truncated_timestamps()
13041308

1309+
cdef int _set_writer_engine_version(
1310+
self, ArrowWriterProperties.Builder* props) except -1:
1311+
if self.writer_engine_version == "V1":
1312+
props.set_engine_version(ArrowWriterEngineVersion.V1)
1313+
elif self.writer_engine_version != "V2":
1314+
raise ValueError("Unsupported Writer Engine Version: {0}"
1315+
.format(self.writer_engine_version))
1316+
13051317
cdef int _set_version(self, WriterProperties.Builder* props) except -1:
13061318
if self.version is not None:
13071319
if self.version == "1.0":

python/pyarrow/parquet.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,13 @@ def _sanitize_table(table, new_schema, flavor):
400400
only for some columns. If both dictionary and byte_stream_split are
401401
enabled, then dictionary is preferred.
402402
The byte_stream_split encoding is valid only for floating-point data types
403-
and should be combined with a compression codec."""
403+
and should be combined with a compression codec.
404+
writer_engine_version: str, default "V2"
405+
The engine version to use when writing out Arrow data. V2 supports
406+
all nested types. V1 is legacy and will be removed in a future release.
407+
Setting the environment variable ARROW_PARQUET_WRITER_ENGINE will
408+
override the default.
409+
"""
404410

405411

406412
class ParquetWriter:
@@ -429,6 +435,7 @@ def __init__(self, where, schema, filesystem=None,
429435
use_deprecated_int96_timestamps=None,
430436
compression_level=None,
431437
use_byte_stream_split=False,
438+
writer_engine_version=None,
432439
**options):
433440
if use_deprecated_int96_timestamps is None:
434441
# Use int96 timestamps for Spark
@@ -456,6 +463,7 @@ def __init__(self, where, schema, filesystem=None,
456463
else:
457464
sink = where
458465
self._metadata_collector = options.pop('metadata_collector', None)
466+
engine_version = os.environ.get('ARROW_PARQUET_WRITER_ENGINE', 'V2')
459467
self.writer = _parquet.ParquetWriter(
460468
sink, schema,
461469
version=version,
@@ -465,6 +473,7 @@ def __init__(self, where, schema, filesystem=None,
465473
use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
466474
compression_level=compression_level,
467475
use_byte_stream_split=use_byte_stream_split,
476+
writer_engine_version=engine_version,
468477
**options)
469478
self.is_open = True
470479

python/pyarrow/tests/test_parquet.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,31 @@ def test_pandas_parquet_empty_roundtrip(tempdir):
554554
tm.assert_frame_equal(df, df_read)
555555

556556

557+
@pytest.mark.pandas
558+
def test_pandas_can_write_nested_data(tempdir):
559+
data = {
560+
"agg_col": [
561+
{"page_type": 1},
562+
{"record_type": 1},
563+
{"non_consectutive_home": 0},
564+
],
565+
"uid_first": "1001"
566+
}
567+
df = pd.DataFrame(data=data)
568+
arrow_table = pa.Table.from_pandas(df)
569+
imos = pa.BufferOutputStream()
570+
# This succeeds under V2
571+
_write_table(arrow_table, imos)
572+
573+
# Under V1 it fails.
574+
with pytest.raises(ValueError):
575+
import os
576+
os.environ['ARROW_PARQUET_WRITER_ENGINE'] = 'V1'
577+
imos = pa.BufferOutputStream()
578+
_write_table(arrow_table, imos)
579+
del os.environ['ARROW_PARQUET_WRITER_ENGINE']
580+
581+
557582
@pytest.mark.pandas
558583
def test_pandas_parquet_pyfile_roundtrip(tempdir):
559584
filename = tempdir / 'pandas_pyfile_roundtrip.parquet'

0 commit comments

Comments
 (0)