Commit fecdbd4

feat: reduce memory impact of chunked reads
1 parent 5d51c45 commit fecdbd4

File tree

1 file changed: +22 -29 lines changed

awswrangler/s3/_read_parquet.py

Lines changed: 22 additions & 29 deletions

@@ -247,40 +247,33 @@ def _read_parquet_chunked(
             if pq_file is None:
                 continue

-            schema = pq_file.schema.to_arrow_schema()
+            metadata = pq_file.metadata
+            schema = metadata.schema.to_arrow_schema()
             if columns:
                 schema = pa.schema([schema.field(column) for column in columns], schema.metadata)

             use_threads_flag: bool = use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
-            iterate_at_least_once = False
-            for chunk in pq_file.iter_batches(
-                batch_size=batch_size, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False
-            ):
-                iterate_at_least_once = True
-                table = _add_table_partitions(
-                    table=pa.Table.from_batches([chunk], schema=schema),
-                    path=path,
-                    path_root=path_root,
-                )
-                df = _table_to_df(table=table, kwargs=arrow_kwargs)
-                if chunked is True:
-                    yield df
-                else:
-                    if next_slice is not None:
-                        df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
-                    while len(df.index) >= chunked:
-                        yield df.iloc[:chunked, :].copy()
-                        df = df.iloc[chunked:, :]
-                    if df.empty:
-                        next_slice = None
+            table_kwargs = {"path": path, "path_root": path_root}
+            if metadata.num_rows > 0:
+                for chunk in pq_file.iter_batches(
+                    batch_size=batch_size, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False
+                ):
+                    table = _add_table_partitions(table=pa.Table.from_batches([chunk], schema=schema), **table_kwargs)
+                    df = _table_to_df(table=table, kwargs=arrow_kwargs)
+                    if chunked is True:
+                        yield df
                     else:
-                        next_slice = df
-            if not iterate_at_least_once:
-                table = _add_table_partitions(
-                    table=pa.Table.from_batches([], schema=schema),
-                    path=path,
-                    path_root=path_root,
-                )
+                        if next_slice is not None:
+                            df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
+                        while len(df.index) >= chunked:
+                            yield df.iloc[:chunked, :].copy()
+                            df = df.iloc[chunked:, :]
+                        if df.empty:
+                            next_slice = None
+                        else:
+                            next_slice = df
+            else:
+                table = _add_table_partitions(table=pa.Table.from_batches([], schema=schema), **table_kwargs)
                 df = _table_to_df(table=table, kwargs=arrow_kwargs)
                 yield df
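For context, the pattern the new code relies on: pyarrow parses the Parquet footer once into a FileMetaData object, so pq_file.metadata supplies both the schema (metadata.schema.to_arrow_schema()) and the row count (metadata.num_rows). Checking num_rows before calling iter_batches lets an empty file short-circuit to a single empty frame, which is what makes the old iterate_at_least_once flag unnecessary. A minimal standalone sketch of that pattern using plain pyarrow; the function name iter_parquet_frames and its defaults are illustrative, not part of the commit:

import pyarrow as pa
import pyarrow.parquet as pq

# Illustrative sketch of the commit's pattern; not library code.
def iter_parquet_frames(path, batch_size=65_536, columns=None):
    """Yield pandas DataFrames from one Parquet file, batch by batch."""
    pq_file = pq.ParquetFile(path)
    metadata = pq_file.metadata  # footer parsed once, reused twice below
    schema = metadata.schema.to_arrow_schema()
    if columns:
        schema = pa.schema([schema.field(c) for c in columns], schema.metadata)

    if metadata.num_rows > 0:
        for batch in pq_file.iter_batches(batch_size=batch_size, columns=columns):
            # Only one RecordBatch is materialized at a time, so peak
            # memory tracks the batch size rather than the file size.
            yield pa.Table.from_batches([batch], schema=schema).to_pandas()
    else:
        # Empty file: yield a single empty, correctly typed frame
        # instead of tracking an iterate_at_least_once flag.
        yield pa.Table.from_batches([], schema=schema).to_pandas()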