Skip to content

Commit 9af4854

Browse files
CCLDArjun, AlexWaygood, gpshead
authored
gh-89550: Buffer GzipFile.write to reduce execution time by ~15% (#101251)
Use `io.BufferedWriter` to buffer gzip writes.

Co-authored-by: Alex Waygood <[email protected]>
Co-authored-by: Gregory P. Smith <[email protected]>
1 parent 405eacc commit 9af4854

File tree

2 files changed

+37
-5
lines changed

2 files changed

+37
-5
lines changed

Lib/gzip.py

+35-5
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
_COMPRESS_LEVEL_BEST = 9
2323

2424
READ_BUFFER_SIZE = 128 * 1024
25+
_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE
2526

2627

2728
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
@@ -120,6 +121,21 @@ class BadGzipFile(OSError):
120121
"""Exception raised in some cases for invalid gzip files."""
121122

122123

124+
class _WriteBufferStream(io.RawIOBase):
125+
"""Minimal object to pass WriteBuffer flushes into GzipFile"""
126+
def __init__(self, gzip_file):
127+
self.gzip_file = gzip_file
128+
129+
def write(self, data):
130+
return self.gzip_file._write_raw(data)
131+
132+
def seekable(self):
133+
return False
134+
135+
def writable(self):
136+
return True
137+
138+
123139
class GzipFile(_compression.BaseStream):
124140
"""The GzipFile class simulates most of the methods of a file object with
125141
the exception of the truncate() method.
@@ -184,6 +200,7 @@ def __init__(self, filename=None, mode=None,
184200
if mode is None:
185201
mode = getattr(fileobj, 'mode', 'rb')
186202

203+
187204
if mode.startswith('r'):
188205
self.mode = READ
189206
raw = _GzipReader(fileobj)
@@ -206,6 +223,9 @@ def __init__(self, filename=None, mode=None,
206223
zlib.DEF_MEM_LEVEL,
207224
0)
208225
self._write_mtime = mtime
226+
self._buffer_size = _WRITE_BUFFER_SIZE
227+
self._buffer = io.BufferedWriter(_WriteBufferStream(self),
228+
buffer_size=self._buffer_size)
209229
else:
210230
raise ValueError("Invalid mode: {!r}".format(mode))
211231

@@ -231,6 +251,11 @@ def _init_write(self, filename):
231251
self.bufsize = 0
232252
self.offset = 0 # Current file offset for seek(), tell(), etc
233253

254+
def tell(self):
    """Return the current stream position as reported by the base class.

    The internal buffer is flushed before the inherited tell() is
    consulted -- presumably so that bytes still waiting in the write
    buffer are reflected in the reported position.
    """
    self._check_not_closed()
    # Drain buffered data before asking the base class for the position.
    self._buffer.flush()
    position = super().tell()
    return position
258+
234259
def _write_gzip_header(self, compresslevel):
235260
self.fileobj.write(b'\037\213') # magic header
236261
self.fileobj.write(b'\010') # compression method
@@ -272,6 +297,10 @@ def write(self,data):
272297
if self.fileobj is None:
273298
raise ValueError("write() on closed GzipFile object")
274299

300+
return self._buffer.write(data)
301+
302+
def _write_raw(self, data):
303+
# Called by our self._buffer underlying WriteBufferStream.
275304
if isinstance(data, (bytes, bytearray)):
276305
length = len(data)
277306
else:
@@ -322,16 +351,17 @@ def close(self):
322351
fileobj = self.fileobj
323352
if fileobj is None:
324353
return
325-
self.fileobj = None
326354
try:
327355
if self.mode == WRITE:
356+
self._buffer.flush()
328357
fileobj.write(self.compress.flush())
329358
write32u(fileobj, self.crc)
330359
# self.size may exceed 2 GiB, or even 4 GiB
331360
write32u(fileobj, self.size & 0xffffffff)
332361
elif self.mode == READ:
333362
self._buffer.close()
334363
finally:
364+
self.fileobj = None
335365
myfileobj = self.myfileobj
336366
if myfileobj:
337367
self.myfileobj = None
def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
    """Push all pending written data down to the underlying file object.

    In write mode this happens in three stages:

    1. the internal write buffer is flushed, so every byte passed to
       write() so far has been handed on towards the compressor;
    2. the compressor is flushed using *zlib_mode* and its output is
       written to the underlying file object;
    3. the underlying file object itself is flushed.

    In any other mode this only checks that the file is not closed.

    zlib_mode -- flush mode passed to the compressor's flush() method;
                 defaults to zlib.Z_SYNC_FLUSH.
    """
    self._check_not_closed()
    if self.mode == WRITE:
        # The write buffer must be drained first; flushing the compressor
        # before it would leave the buffered bytes out of this flush.
        self._buffer.flush()
        # Ensure the compressor's buffer is flushed.  Without this step the
        # data stays inside the zlib object until close() and zlib_mode is
        # silently ignored.
        self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()
346376

347377
def fileno(self):
@@ -378,10 +408,10 @@ def seek(self, offset, whence=io.SEEK_SET):
378408
if offset < self.offset:
379409
raise OSError('Negative seek in write mode')
380410
count = offset - self.offset
381-
chunk = b'\0' * 1024
382-
for i in range(count // 1024):
411+
chunk = b'\0' * self._buffer_size
412+
for i in range(count // self._buffer_size):
383413
self.write(chunk)
384-
self.write(b'\0' * (count % 1024))
414+
self.write(b'\0' * (count % self._buffer_size))
385415
elif self.mode == READ:
386416
self._check_not_closed()
387417
return self._buffer.seek(offset, whence)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Decrease execution time of some :mod:`gzip` file writes by 15% by
2+
adding more appropriate buffering.

0 commit comments

Comments
 (0)