From 6a38088f1d76849973e9980ee8d0f871e1eeda54 Mon Sep 17 00:00:00 2001 From: JuniorJPDJ Date: Mon, 31 Aug 2020 05:30:22 +0200 Subject: [PATCH 1/7] better approach for seek in non-compressed ZipExtFile from zipfile --- Lib/zipfile.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index fc6ca65e5ed1e9..54493a05382a26 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -836,9 +836,11 @@ def __init__(self, fileobj, mode, zipinfo, pwd=None, if hasattr(zipinfo, 'CRC'): self._expected_crc = zipinfo.CRC + self._orig_crc = zipinfo.CRC self._running_crc = crc32(b'') else: self._expected_crc = None + self._orig_crc = None self._seekable = False try: @@ -1069,17 +1071,17 @@ def seekable(self): raise ValueError("I/O operation on closed file.") return self._seekable - def seek(self, offset, whence=0): + def seek(self, offset, whence=os.SEEK_SET): if self.closed: raise ValueError("seek on closed file.") if not self._seekable: raise io.UnsupportedOperation("underlying stream is not seekable") curr_pos = self.tell() - if whence == 0: # Seek from start of file + if whence == os.SEEK_SET: new_pos = offset - elif whence == 1: # Seek from current position + elif whence == os.SEEK_CUR: new_pos = curr_pos + offset - elif whence == 2: # Seek from EOF + elif whence == os.SEEK_END: new_pos = self._orig_file_size + offset else: raise ValueError("whence must be os.SEEK_SET (0), " @@ -1102,6 +1104,7 @@ def seek(self, offset, whence=0): # Position is before the current position. Reset the ZipExtFile self._fileobj.seek(self._orig_compress_start) self._running_crc = self._orig_start_crc + self._expected_crc = self._orig_crc self._compress_left = self._orig_compress_size self._left = self._orig_file_size self._readbuffer = b'' @@ -1112,6 +1115,15 @@ def seek(self, offset, whence=0): if self._decrypter is not None: self._init_decrypter() + if read_offset > 0 and self._compress_type == ZIP_STORED and self._decrypter == None: + # disable CRC checking after first seeking - it would be invalid + self._expected_crc = None + + self._fileobj.seek(read_offset, os.SEEK_CUR) + self._left -= read_offset + self._offset = 0 + read_offset = 0 + while read_offset > 0: read_len = min(self.MAX_SEEK_READ, read_offset) self.read(read_len) From 58e3b88b402f517b3e2df7081a4c7517eab4f0fe Mon Sep 17 00:00:00 2001 From: JuniorJPDJ Date: Thu, 26 Aug 2021 03:32:23 +0200 Subject: [PATCH 2/7] fixup! better approach for seek in non-compressed ZipExtFile from zipfile --- Lib/test/test_zipfile.py | 2 ++ Lib/zipfile.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index f4c11d88c8a09f..5a0d51ece2453f 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -2030,6 +2030,7 @@ def test_seek_tell(self): fp.seek(bloc, os.SEEK_CUR) self.assertEqual(fp.tell(), bloc) self.assertEqual(fp.read(5), txt[bloc:bloc+5]) + self.assertEqual(fp.tell(), bloc + 5) fp.seek(0, os.SEEK_END) self.assertEqual(fp.tell(), len(txt)) fp.seek(0, os.SEEK_SET) @@ -2047,6 +2048,7 @@ def test_seek_tell(self): fp.seek(bloc, os.SEEK_CUR) self.assertEqual(fp.tell(), bloc) self.assertEqual(fp.read(5), txt[bloc:bloc+5]) + self.assertEqual(fp.tell(), bloc + 5) fp.seek(0, os.SEEK_END) self.assertEqual(fp.tell(), len(txt)) fp.seek(0, os.SEEK_SET) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 54493a05382a26..869b72ac3ffe79 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -1115,14 +1115,18 @@ def seek(self, offset, whence=os.SEEK_SET): if self._decrypter is not None: self._init_decrypter() - if read_offset > 0 and self._compress_type == ZIP_STORED and self._decrypter == None: + # Fast seek uncompressed unencrypted file + if self._compress_type == ZIP_STORED and self._decrypter == None and read_offset > 0: # disable CRC checking after first seeking - it would be invalid self._expected_crc = None - + # seek actual file taking already buffered data into account + read_offset -= len(self._readbuffer) - self._offset self._fileobj.seek(read_offset, os.SEEK_CUR) self._left -= read_offset - self._offset = 0 read_offset = 0 + # flush read buffer + self._readbuffer = b'' + self._offset = 0 while read_offset > 0: read_len = min(self.MAX_SEEK_READ, read_offset) From 0ce67ae614b6525f982bc7ebf340cd85af232f26 Mon Sep 17 00:00:00 2001 From: JuniorJPDJ Date: Fri, 27 Aug 2021 18:08:06 +0200 Subject: [PATCH 3/7] news --- .../NEWS.d/next/Library/2021-08-27-18-07-35.bpo-44173.oW92Ev.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2021-08-27-18-07-35.bpo-44173.oW92Ev.rst diff --git a/Misc/NEWS.d/next/Library/2021-08-27-18-07-35.bpo-44173.oW92Ev.rst b/Misc/NEWS.d/next/Library/2021-08-27-18-07-35.bpo-44173.oW92Ev.rst new file mode 100644 index 00000000000000..abc98266afb0ce --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-08-27-18-07-35.bpo-44173.oW92Ev.rst @@ -0,0 +1 @@ +Enable fast seeking of uncompressed unencrypted :class:`zipfile.ZipExtFile` From 84aca302fdbef8dbe56be569251802ecb69b7d73 Mon Sep 17 00:00:00 2001 From: JuniorJPDJ Date: Fri, 22 Apr 2022 14:43:29 +0200 Subject: [PATCH 4/7] hide fast stored seek under opt-out flag --- Lib/zipfile.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 869b72ac3ffe79..bb8497d52c557e 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -814,10 +814,11 @@ class ZipExtFile(io.BufferedIOBase): MAX_SEEK_READ = 1 << 24 def __init__(self, fileobj, mode, zipinfo, pwd=None, - close_fileobj=False): + close_fileobj=False, fast_stored_seek_nocrc=True): self._fileobj = fileobj self._pwd = pwd self._close_fileobj = close_fileobj + self._fast_stored_seek_nocrc = fast_stored_seek_nocrc self._compress_type = zipinfo.compress_type self._compress_left = zipinfo.compress_size @@ -1116,7 +1117,8 @@ def seek(self, offset, whence=os.SEEK_SET): self._init_decrypter() # Fast seek uncompressed unencrypted file - if self._compress_type == ZIP_STORED and self._decrypter == None and read_offset > 0: + if self._fast_stored_seek_nocrc and self._compress_type == ZIP_STORED \ + and self._decrypter == None and read_offset > 0: # disable CRC checking after first seeking - it would be invalid self._expected_crc = None # seek actual file taking already buffered data into account @@ -1249,6 +1251,9 @@ class ZipFile: When using ZIP_STORED or ZIP_LZMA this keyword has no effect. When using ZIP_DEFLATED integers 0 through 9 are accepted. When using ZIP_BZIP2 integers 1 through 9 are accepted. + fast_stored_seek_nocrc: if True ZipFile will use fast seeking for + uncompressed (ZIP_STORED) files which also skips + CRC verification after first seek operation. """ @@ -1256,7 +1261,8 @@ class ZipFile: _windows_illegal_name_trans_table = None def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, - compresslevel=None, *, strict_timestamps=True, metadata_encoding=None): + compresslevel=None, *, strict_timestamps=True, metadata_encoding=None, + fast_stored_seek_nocrc=True): """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x', or append 'a'.""" if mode not in ('r', 'w', 'x', 'a'): @@ -1276,6 +1282,7 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, self._comment = b'' self._strict_timestamps = strict_timestamps self.metadata_encoding = metadata_encoding + self._fast_stored_seek_nocrc = fast_stored_seek_nocrc # Check that we don't try to write with nonconforming codecs if self.metadata_encoding and mode != 'r': @@ -1616,7 +1623,7 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): else: pwd = None - return ZipExtFile(zef_file, mode, zinfo, pwd, True) + return ZipExtFile(zef_file, mode, zinfo, pwd, True, self._fast_stored_seek_nocrc) except: zef_file.close() raise From 8a6be474ac722f57d3c57c9aca86babbf2b715e2 Mon Sep 17 00:00:00 2001 From: JuniorJPDJ Date: Wed, 27 Apr 2022 22:44:05 +0200 Subject: [PATCH 5/7] Revert "hide fast stored seek under opt-out flag" This reverts commit 13438487cb14a9947f3fb79faefd61c6ea0123ca. --- Lib/zipfile.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index bb8497d52c557e..869b72ac3ffe79 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -814,11 +814,10 @@ class ZipExtFile(io.BufferedIOBase): MAX_SEEK_READ = 1 << 24 def __init__(self, fileobj, mode, zipinfo, pwd=None, - close_fileobj=False, fast_stored_seek_nocrc=True): + close_fileobj=False): self._fileobj = fileobj self._pwd = pwd self._close_fileobj = close_fileobj - self._fast_stored_seek_nocrc = fast_stored_seek_nocrc self._compress_type = zipinfo.compress_type self._compress_left = zipinfo.compress_size @@ -1117,8 +1116,7 @@ def seek(self, offset, whence=os.SEEK_SET): self._init_decrypter() # Fast seek uncompressed unencrypted file - if self._fast_stored_seek_nocrc and self._compress_type == ZIP_STORED \ - and self._decrypter == None and read_offset > 0: + if self._compress_type == ZIP_STORED and self._decrypter == None and read_offset > 0: # disable CRC checking after first seeking - it would be invalid self._expected_crc = None # seek actual file taking already buffered data into account @@ -1251,9 +1249,6 @@ class ZipFile: When using ZIP_STORED or ZIP_LZMA this keyword has no effect. When using ZIP_DEFLATED integers 0 through 9 are accepted. When using ZIP_BZIP2 integers 1 through 9 are accepted. - fast_stored_seek_nocrc: if True ZipFile will use fast seeking for - uncompressed (ZIP_STORED) files which also skips - CRC verification after first seek operation. """ @@ -1261,8 +1256,7 @@ class ZipFile: _windows_illegal_name_trans_table = None def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, - compresslevel=None, *, strict_timestamps=True, metadata_encoding=None, - fast_stored_seek_nocrc=True): + compresslevel=None, *, strict_timestamps=True, metadata_encoding=None): """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x', or append 'a'.""" if mode not in ('r', 'w', 'x', 'a'): @@ -1282,7 +1276,6 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, self._comment = b'' self._strict_timestamps = strict_timestamps self.metadata_encoding = metadata_encoding - self._fast_stored_seek_nocrc = fast_stored_seek_nocrc # Check that we don't try to write with nonconforming codecs if self.metadata_encoding and mode != 'r': @@ -1623,7 +1616,7 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): else: pwd = None - return ZipExtFile(zef_file, mode, zinfo, pwd, True, self._fast_stored_seek_nocrc) + return ZipExtFile(zef_file, mode, zinfo, pwd, True) except: zef_file.close() raise From 8491b3be9a1f6251b149c9ef5f3dd5628cbf4b62 Mon Sep 17 00:00:00 2001 From: JuniorJPDJ Date: Wed, 27 Apr 2022 22:59:36 +0200 Subject: [PATCH 6/7] fix minor problems found in review --- Lib/zipfile.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 869b72ac3ffe79..841eba845a36f5 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -836,11 +836,9 @@ def __init__(self, fileobj, mode, zipinfo, pwd=None, if hasattr(zipinfo, 'CRC'): self._expected_crc = zipinfo.CRC - self._orig_crc = zipinfo.CRC self._running_crc = crc32(b'') else: self._expected_crc = None - self._orig_crc = None self._seekable = False try: @@ -849,6 +847,7 @@ def __init__(self, fileobj, mode, zipinfo, pwd=None, self._orig_compress_size = zipinfo.compress_size self._orig_file_size = zipinfo.file_size self._orig_start_crc = self._running_crc + self._orig_crc = self._expected_crc self._seekable = True except AttributeError: pass @@ -1116,7 +1115,7 @@ def seek(self, offset, whence=os.SEEK_SET): self._init_decrypter() # Fast seek uncompressed unencrypted file - if self._compress_type == ZIP_STORED and self._decrypter == None and read_offset > 0: + if self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0: # disable CRC checking after first seeking - it would be invalid self._expected_crc = None # seek actual file taking already buffered data into account From 420ae622a3dafdcce1965bb139e2fd116a237192 Mon Sep 17 00:00:00 2001 From: JuniorJPDJ Date: Wed, 27 Apr 2022 23:02:59 +0200 Subject: [PATCH 7/7] move fast seek to skip unneeded code --- Lib/zipfile.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 841eba845a36f5..95f8e5f1246757 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -1095,7 +1095,19 @@ def seek(self, offset, whence=os.SEEK_SET): read_offset = new_pos - curr_pos buff_offset = read_offset + self._offset - if buff_offset >= 0 and buff_offset < len(self._readbuffer): + # Fast seek uncompressed unencrypted file + if self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0: + # disable CRC checking after first seeking - it would be invalid + self._expected_crc = None + # seek actual file taking already buffered data into account + read_offset -= len(self._readbuffer) - self._offset + self._fileobj.seek(read_offset, os.SEEK_CUR) + self._left -= read_offset + read_offset = 0 + # flush read buffer + self._readbuffer = b'' + self._offset = 0 + elif buff_offset >= 0 and buff_offset < len(self._readbuffer): # Just move the _offset index if the new position is in the _readbuffer self._offset = buff_offset read_offset = 0 @@ -1114,19 +1126,6 @@ def seek(self, offset, whence=os.SEEK_SET): if self._decrypter is not None: self._init_decrypter() - # Fast seek uncompressed unencrypted file - if self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0: - # disable CRC checking after first seeking - it would be invalid - self._expected_crc = None - # seek actual file taking already buffered data into account - read_offset -= len(self._readbuffer) - self._offset - self._fileobj.seek(read_offset, os.SEEK_CUR) - self._left -= read_offset - read_offset = 0 - # flush read buffer - self._readbuffer = b'' - self._offset = 0 - while read_offset > 0: read_len = min(self.MAX_SEEK_READ, read_offset) self.read(read_len)