Commit d485112

fix: read parquet file in chunked mode per row group (#3016)

* fix: read parquet file in chunked mode per row group
* fix: Fix test_empty_parquet
* feat: reduce memory impact of chunked reads

Co-authored-by: Abdel Jaidi <[email protected]>

1 parent 05d0731 commit d485112
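For context on the mechanism: pyarrow's `ParquetFile.iter_batches` yields record batches row group by row group, but the old code drained the whole iterator into a single `Table` before converting to pandas, so the entire file was materialized anyway. A minimal sketch of the difference with plain pyarrow (file name and batch size are illustrative):

```python
import pyarrow as pa
import pyarrow.parquet as pq

pq_file = pq.ParquetFile("example.parquet")  # illustrative local file

# Before: the batch iterator was drained into one Table up front,
# so the whole file sat in memory despite the "chunked" read.
table = pa.Table.from_batches(pq_file.iter_batches(batch_size=65_536))

# After: one batch at a time is materialized and can be dropped
# before the next, bounding peak memory at roughly one row group.
for batch in pq_file.iter_batches(batch_size=65_536):
    chunk_table = pa.Table.from_batches([batch])
    # ... convert to pandas, yield, then release before the next batch
```

At the API level this is the behavior behind `wr.s3.read_parquet(..., chunked=True)`, which yields a stream of DataFrames rather than one large frame.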

File tree: 1 file changed (+26 −23 lines)


awswrangler/s3/_read_parquet.py

Lines changed: 26 additions & 23 deletions
```diff
@@ -247,33 +247,36 @@ def _read_parquet_chunked(
             if pq_file is None:
                 continue
 
-            use_threads_flag: bool = use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
-            chunks = pq_file.iter_batches(
-                batch_size=batch_size, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False
-            )
-
-            schema = pq_file.schema.to_arrow_schema()
+            metadata = pq_file.metadata
+            schema = metadata.schema.to_arrow_schema()
             if columns:
                 schema = pa.schema([schema.field(column) for column in columns], schema.metadata)
 
-            table = _add_table_partitions(
-                table=pa.Table.from_batches(chunks, schema=schema),
-                path=path,
-                path_root=path_root,
-            )
-            df = _table_to_df(table=table, kwargs=arrow_kwargs)
-            if chunked is True:
-                yield df
+            use_threads_flag: bool = use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
+            table_kwargs = {"path": path, "path_root": path_root}
+            if metadata.num_rows > 0:
+                for chunk in pq_file.iter_batches(
+                    batch_size=batch_size, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False
+                ):
+                    table = _add_table_partitions(table=pa.Table.from_batches([chunk], schema=schema), **table_kwargs)
+                    df = _table_to_df(table=table, kwargs=arrow_kwargs)
+                    if chunked is True:
+                        yield df
+                    else:
+                        if next_slice is not None:
+                            df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
+                        while len(df.index) >= chunked:
+                            yield df.iloc[:chunked, :].copy()
+                            df = df.iloc[chunked:, :]
+                        if df.empty:
+                            next_slice = None
+                        else:
+                            next_slice = df
             else:
-                if next_slice is not None:
-                    df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
-                while len(df.index) >= chunked:
-                    yield df.iloc[:chunked, :].copy()
-                    df = df.iloc[chunked:, :]
-                if df.empty:
-                    next_slice = None
-                else:
-                    next_slice = df
+                table = _add_table_partitions(table=pa.Table.from_batches([], schema=schema), **table_kwargs)
+                df = _table_to_df(table=table, kwargs=arrow_kwargs)
+                yield df
+
     if next_slice is not None:
         yield next_slice
```
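When `chunked` is an integer rather than `True`, the rewritten loop re-slices the variable-sized Arrow batches into DataFrames of exactly `chunked` rows, carrying any remainder into the next batch. A standalone sketch of that carry-over logic, lifted from the diff (the function name and sample data are hypothetical):

```python
from typing import Iterable, Iterator, Optional

import pandas as pd


def resize_chunks(frames: Iterable[pd.DataFrame], size: int) -> Iterator[pd.DataFrame]:
    """Re-emit variable-sized frames as chunks of exactly `size` rows."""
    next_slice: Optional[pd.DataFrame] = None  # remainder carried across frames
    for df in frames:
        if next_slice is not None:
            df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
        while len(df.index) >= size:
            yield df.iloc[:size, :].copy()
            df = df.iloc[size:, :]
        next_slice = None if df.empty else df
    if next_slice is not None:  # flush the trailing partial chunk
        yield next_slice


# Batches of 3 and 4 rows, re-chunked to 5 -> one 5-row and one 2-row frame.
chunks = list(resize_chunks([pd.DataFrame({"x": range(3)}), pd.DataFrame({"x": range(4)})], 5))
```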

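The new `else` branch is what fixes `test_empty_parquet`: a file whose metadata reports zero rows now yields a single empty DataFrame carrying the file's schema, instead of yielding nothing. This works because `pa.Table.from_batches` accepts an empty batch list when an explicit schema is supplied; a quick illustration (column names are made up):

```python
import pyarrow as pa

schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
empty = pa.Table.from_batches([], schema=schema)  # zero rows, schema preserved

print(empty.num_rows)                      # 0
print(empty.to_pandas().columns.tolist())  # ['id', 'name']
```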