From f18239dc2aeb39d0185d475ec8cbd3aae0b4aee2 Mon Sep 17 00:00:00 2001 From: 5ec1cff <56485584+5ec1cff@users.noreply.github.com> Date: Sat, 21 Dec 2024 09:49:29 +0800 Subject: [PATCH 01/13] Random access uncompressed unencrypted ZipExtFile --- Lib/zipfile/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 6907ae6d5b7464..dc557eaf00b3c6 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1162,13 +1162,16 @@ def seek(self, offset, whence=os.SEEK_SET): self._offset = buff_offset read_offset = 0 # Fast seek uncompressed unencrypted file - elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0: + elif self._compress_type == ZIP_STORED and self._decrypter is None: # disable CRC checking after first seeking - it would be invalid self._expected_crc = None # seek actual file taking already buffered data into account read_offset -= len(self._readbuffer) - self._offset self._fileobj.seek(read_offset, os.SEEK_CUR) self._left -= read_offset + self._compress_left -= read_offset + if self._eof and read_offset < 0: + self._eof = False read_offset = 0 # flush read buffer self._readbuffer = b'' From f90340fcf3d706218afc394d28dfd2a58a3add02 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sat, 21 Dec 2024 03:20:13 +0000 Subject: [PATCH 02/13] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst diff --git a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst new file mode 100644 index 00000000000000..465889f2498cec --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst @@ -0,0 +1 @@ +Support forward seek in uncompressed unencrypted ZipExtFile From 5d23be66b4ad4b74653df6cb7e24a4cbe424df4d Mon Sep 17 00:00:00 2001 From: 5ec1cff <56485584+5ec1cff@users.noreply.github.com> Date: Sat, 21 Dec 2024 19:30:35 +0800 Subject: [PATCH 03/13] Update Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- .../next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst index 465889f2498cec..da2660d118a03b 100644 --- a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst +++ b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst @@ -1 +1 @@ -Support forward seek in uncompressed unencrypted ZipExtFile +Support forward seek in uncompressed unencrypted :class:`!zipfile.ZipExtFile`. From 1a6461094d095843272ddee5e3f649df1d52aef9 Mon Sep 17 00:00:00 2001 From: 5ec1cff <56485584+5ec1cff@users.noreply.github.com> Date: Sat, 21 Dec 2024 22:27:32 +0800 Subject: [PATCH 04/13] Update test_core.py add unittest --- Lib/test/test_zipfile/test_core.py | 94 ++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index c36228c033a414..679dee16633c16 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3447,6 +3447,100 @@ def test_too_short(self): self.assertEqual( b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,))) +class StoreSeekReadTest(unittest.TestCase): + def test_store_seek_read(self): + class StatIO(io.RawIOBase): + def __init__(self, buf: bytes): + self.bytes_read = 0 + self.buffer = buf + self.pos = 0 + self.eof = False + + def readinto(self, buffer, /): + if self.eof: + return 0 + sz = min(len(buffer), len(self.buffer) - self.pos) + buffer[:sz] = self.buffer[self.pos:self.pos + sz] + self.pos += sz + self.bytes_read += sz + self.eof = self.pos == len(self.buffer) + return sz + + def readall(self): + if self.eof: + return b'' + self.eof = True + self.bytes_read += len(self.buffer) - self.pos + ret = self.buffer[self.pos:] + self.pos = len(self.buffer) + return ret + + def seek(self, offset, whence=os.SEEK_SET, /): + if whence == os.SEEK_CUR: + new_pos = self.pos + offset + elif whence == os.SEEK_SET: + new_pos = offset + elif whence == os.SEEK_END: + new_pos = len(self.buffer) + offset + else: + raise ValueError("unsupported whence") + + if new_pos < 0: + new_pos = 0 + elif new_pos > len(self.buffer): + new_pos = len(self.buffer) + + self.eof = new_pos == len(self.buffer) + self.pos = new_pos + return new_pos + + def tell(self): + return self.pos + + def readable(self): + return True + + def writable(self): + return False + + def seekable(self): + return True + + def get_bytes_read(self): + return self.bytes_read + + bio = io.BytesIO() + txt = b'0123456789'*10000 + + # Check seek on a file + with zipfile.ZipFile(bio, "w") as zipf: + zipf.writestr("foo.txt", txt) # 100000 bytes + + sio = StatIO(bio.getvalue()) + with zipfile.ZipFile(sio, "r") as zipf: + with zipf.open("foo.txt", "r") as fp: + br = sio.get_bytes_read() + fp.seek(50000, os.SEEK_CUR) + self.assertEqual(sio.get_bytes_read() - br, 0, 'seek produces extra read!') + + b = fp.read(100) + self.assertEqual(b, txt[:100]) + + # backward seek + # seek length must be more than MIN_READ_SIZE (4096) + br = sio.get_bytes_read() + fp.seek(5000, os.SEEK_CUR) + b = fp.read(100) + self.assertEqual(b, txt[50000:50100]) + self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read more bytes after backward seek!') + + # forward seek + # seek length must be more than MIN_READ_SIZE (4096) + br = sio.get_bytes_read() + fp.seek(-40000, os.SEEK_CUR) + b = fp.read(100) + self.assertEqual(b, txt[10100:10200]) + self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read more bytes after forward seek!') if __name__ == "__main__": unittest.main() From 57cb51c423c2ac8dde55d78431a6b9c3cd95b8a0 Mon Sep 17 00:00:00 2001 From: 5ec1cff <56485584+5ec1cff@users.noreply.github.com> Date: Sun, 22 Dec 2024 10:38:35 +0800 Subject: [PATCH 05/13] update unittest --- Lib/test/test_zipfile/test_core.py | 83 ++++--------------- ...-12-21-03-20-12.gh-issue-128131.QpPmNt.rst | 2 +- 2 files changed, 19 insertions(+), 66 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 679dee16633c16..9229e75a4aa53c 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3447,100 +3447,53 @@ def test_too_short(self): self.assertEqual( b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,))) -class StoreSeekReadTest(unittest.TestCase): - def test_store_seek_read(self): - class StatIO(io.RawIOBase): - def __init__(self, buf: bytes): +class StoredZipExtFileRandomAccessTest(unittest.TestCase): + def test_random_access(self): + from _pyio import BytesIO + class StatIO(BytesIO): + def __init__(self): + super().__init__() self.bytes_read = 0 - self.buffer = buf - self.pos = 0 - self.eof = False - - def readinto(self, buffer, /): - if self.eof: - return 0 - sz = min(len(buffer), len(self.buffer) - self.pos) - buffer[:sz] = self.buffer[self.pos:self.pos + sz] - self.pos += sz - self.bytes_read += sz - self.eof = self.pos == len(self.buffer) - return sz - - def readall(self): - if self.eof: - return b'' - self.eof = True - self.bytes_read += len(self.buffer) - self.pos - ret = self.buffer[self.pos:] - self.pos = len(self.buffer) - return ret - - def seek(self, offset, whence=os.SEEK_SET, /): - if whence == os.SEEK_CUR: - new_pos = self.pos + offset - elif whence == os.SEEK_SET: - new_pos = offset - elif whence == os.SEEK_END: - new_pos = len(self.buffer) + offset - else: - raise ValueError("unsupported whence") - - if new_pos < 0: - new_pos = 0 - elif new_pos > len(self.buffer): - new_pos = len(self.buffer) - - self.eof = new_pos == len(self.buffer) - self.pos = new_pos - return new_pos - - def tell(self): - return self.pos - def readable(self): - return True - - def writable(self): - return False - - def seekable(self): - return True + def read(self, size=-1): + bs = super().read(size) + self.bytes_read += len(bs) + return bs def get_bytes_read(self): return self.bytes_read - bio = io.BytesIO() + sio = StatIO() + # 100000 bytes txt = b'0123456789'*10000 # Check seek on a file - with zipfile.ZipFile(bio, "w") as zipf: - zipf.writestr("foo.txt", txt) # 100000 bytes + with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf: + zipf.writestr("foo.txt", txt) - sio = StatIO(bio.getvalue()) with zipfile.ZipFile(sio, "r") as zipf: with zipf.open("foo.txt", "r") as fp: br = sio.get_bytes_read() fp.seek(50000, os.SEEK_CUR) - self.assertEqual(sio.get_bytes_read() - br, 0, 'seek produces extra read!') + self.assertEqual(sio.get_bytes_read() - br, 0, 'seek produces redundant read!') b = fp.read(100) self.assertEqual(b, txt[:100]) + # seek length must be greater than ZipExtFile.MIN_READ_SIZE (4096) # backward seek - # seek length must be more than MIN_READ_SIZE (4096) br = sio.get_bytes_read() fp.seek(5000, os.SEEK_CUR) b = fp.read(100) self.assertEqual(b, txt[50000:50100]) - self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read more bytes after backward seek!') + self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read redundant bytes during backward seek!') # forward seek - # seek length must be more than MIN_READ_SIZE (4096) br = sio.get_bytes_read() fp.seek(-40000, os.SEEK_CUR) b = fp.read(100) self.assertEqual(b, txt[10100:10200]) - self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read more bytes after forward seek!') + self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read redundant bytes during forward seek!') if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst index da2660d118a03b..d9ead7e768fb9e 100644 --- a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst +++ b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst @@ -1 +1 @@ -Support forward seek in uncompressed unencrypted :class:`!zipfile.ZipExtFile`. +Support fast forward seek in uncompressed unencrypted :class:`!zipfile.ZipExtFile`. From 4d9cea4f36d917b05da49ed016293880936f46e2 Mon Sep 17 00:00:00 2001 From: 5ec1cff Date: Sun, 22 Dec 2024 22:15:12 +0800 Subject: [PATCH 06/13] update unittest --- Lib/test/test_zipfile/test_core.py | 44 +++++++------------ ...-12-21-03-20-12.gh-issue-128131.QpPmNt.rst | 2 +- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 9229e75a4aa53c..21092c96e33b4e 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3447,8 +3447,8 @@ def test_too_short(self): self.assertEqual( b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,))) -class StoredZipExtFileRandomAccessTest(unittest.TestCase): - def test_random_access(self): +class StoredZipExtFileRandomReadTest(unittest.TestCase): + def test_random_read(self): from _pyio import BytesIO class StatIO(BytesIO): def __init__(self): @@ -3460,40 +3460,30 @@ def read(self, size=-1): self.bytes_read += len(bs) return bs - def get_bytes_read(self): - return self.bytes_read - sio = StatIO() - # 100000 bytes - txt = b'0123456789'*10000 + # 20000 bytes + txt = b'0123456789' * 2000 - # Check seek on a file with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf: zipf.writestr("foo.txt", txt) + # check random seek and read on a file with zipfile.ZipFile(sio, "r") as zipf: with zipf.open("foo.txt", "r") as fp: - br = sio.get_bytes_read() - fp.seek(50000, os.SEEK_CUR) - self.assertEqual(sio.get_bytes_read() - br, 0, 'seek produces redundant read!') - - b = fp.read(100) - self.assertEqual(b, txt[:100]) - # seek length must be greater than ZipExtFile.MIN_READ_SIZE (4096) - # backward seek - br = sio.get_bytes_read() - fp.seek(5000, os.SEEK_CUR) - b = fp.read(100) - self.assertEqual(b, txt[50000:50100]) - self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read redundant bytes during backward seek!') - # forward seek - br = sio.get_bytes_read() - fp.seek(-40000, os.SEEK_CUR) - b = fp.read(100) - self.assertEqual(b, txt[10100:10200]) - self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read redundant bytes during forward seek!') + old_count = sio.bytes_read + fp.seek(10002, os.SEEK_CUR) + arr = fp.read(100) + self.assertEqual(arr, txt[10002:10102]) + self.assertLessEqual(sio.bytes_read - old_count, 4096, 'Redundant bytes were read during forward seek and read!') + + # backward seek + old_count = sio.bytes_read + fp.seek(-5003, os.SEEK_CUR) + arr = fp.read(100) + self.assertEqual(arr, txt[5099:5199]) + self.assertLessEqual(sio.bytes_read - old_count, 4096, 'Redundant bytes were read during backward seek and read!') if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst index d9ead7e768fb9e..cb11f6d18b23e4 100644 --- a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst +++ b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst @@ -1 +1 @@ -Support fast forward seek in uncompressed unencrypted :class:`!zipfile.ZipExtFile`. +Completely support random access of uncompressed unencrypted :class:`!zipfile.ZipExtFile`. From 3d37f311b3c43552e6b31043b49369dc2d5fb2d2 Mon Sep 17 00:00:00 2001 From: 5ec1cff Date: Sun, 22 Dec 2024 22:31:26 +0800 Subject: [PATCH 07/13] update unittest --- Lib/test/test_zipfile/test_core.py | 4 ++++ .../Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 21092c96e33b4e..7688e7a35a10be 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3474,14 +3474,18 @@ def read(self, size=-1): # forward seek old_count = sio.bytes_read fp.seek(10002, os.SEEK_CUR) + self.assertEqual(fp.tell(), 10002) arr = fp.read(100) + self.assertEqual(fp.tell(), 10102) self.assertEqual(arr, txt[10002:10102]) self.assertLessEqual(sio.bytes_read - old_count, 4096, 'Redundant bytes were read during forward seek and read!') # backward seek old_count = sio.bytes_read fp.seek(-5003, os.SEEK_CUR) + self.assertEqual(fp.tell(), 5099) arr = fp.read(100) + self.assertEqual(fp.tell(), 5199) self.assertEqual(arr, txt[5099:5199]) self.assertLessEqual(sio.bytes_read - old_count, 4096, 'Redundant bytes were read during backward seek and read!') diff --git a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst index cb11f6d18b23e4..f4c4ebce10729c 100644 --- a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst +++ b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst @@ -1 +1,2 @@ -Completely support random access of uncompressed unencrypted :class:`!zipfile.ZipExtFile`. +Completely support random access of uncompressed unencrypted read-only +zip files obtained by :meth:`ZipFile.open `. From 67f05de35f256811e934a7d2f6de98c6ea311de7 Mon Sep 17 00:00:00 2001 From: 5ec1cff Date: Mon, 23 Dec 2024 10:05:59 +0800 Subject: [PATCH 08/13] update unittest & zipfile --- Lib/test/test_zipfile/test_core.py | 21 ++++++++++++++++++--- Lib/zipfile/__init__.py | 3 +-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 7688e7a35a10be..0a95660a1969ab 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3464,30 +3464,45 @@ def read(self, size=-1): # 20000 bytes txt = b'0123456789' * 2000 + # seek length must be greater than ZipExtFile.MIN_READ_SIZE (4096) + min_size = zipfile.ZipExtFile.MIN_READ_SIZE + self.assertGreaterEqual(min_size, 100) + with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf: zipf.writestr("foo.txt", txt) # check random seek and read on a file with zipfile.ZipFile(sio, "r") as zipf: with zipf.open("foo.txt", "r") as fp: - # seek length must be greater than ZipExtFile.MIN_READ_SIZE (4096) # forward seek old_count = sio.bytes_read fp.seek(10002, os.SEEK_CUR) self.assertEqual(fp.tell(), 10002) + self.assertEqual(fp._left, fp._compress_left) arr = fp.read(100) self.assertEqual(fp.tell(), 10102) self.assertEqual(arr, txt[10002:10102]) - self.assertLessEqual(sio.bytes_read - old_count, 4096, 'Redundant bytes were read during forward seek and read!') + self.assertEqual(fp._left, fp._compress_left) + d = sio.bytes_read - old_count + self.assertLessEqual(d, min_size) # backward seek old_count = sio.bytes_read fp.seek(-5003, os.SEEK_CUR) self.assertEqual(fp.tell(), 5099) + self.assertEqual(fp._left, fp._compress_left) arr = fp.read(100) self.assertEqual(fp.tell(), 5199) self.assertEqual(arr, txt[5099:5199]) - self.assertLessEqual(sio.bytes_read - old_count, 4096, 'Redundant bytes were read during backward seek and read!') + self.assertEqual(fp._left, fp._compress_left) + d = sio.bytes_read - old_count + self.assertLessEqual(d, min_size) + + # eof flags test + fp.seek(0, os.SEEK_END) + self.assertTrue(fp._eof) + fp.seek(12345, os.SEEK_SET) + self.assertFalse(fp._eof) if __name__ == "__main__": unittest.main() diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index dc557eaf00b3c6..26e899658bab50 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1170,8 +1170,7 @@ def seek(self, offset, whence=os.SEEK_SET): self._fileobj.seek(read_offset, os.SEEK_CUR) self._left -= read_offset self._compress_left -= read_offset - if self._eof and read_offset < 0: - self._eof = False + self._eof = self._left <= 0 read_offset = 0 # flush read buffer self._readbuffer = b'' From 25f0a7efd9e2d32caf088a2201bea2b3f03c55ca Mon Sep 17 00:00:00 2001 From: 5ec1cff Date: Tue, 24 Dec 2024 10:42:40 +0800 Subject: [PATCH 09/13] add comments --- Lib/test/test_zipfile/test_core.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 0a95660a1969ab..599467e4a8366b 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3464,8 +3464,14 @@ def read(self, size=-1): # 20000 bytes txt = b'0123456789' * 2000 - # seek length must be greater than ZipExtFile.MIN_READ_SIZE (4096) + # The seek length must be greater than ZipExtFile.MIN_READ_SIZE (4096) + # as `ZipExtFile._read2()` reads in blocks of this size and we need to + # seek out of the buffered data min_size = zipfile.ZipExtFile.MIN_READ_SIZE + self.assertGreaterEqual(10002, min_size) + self.assertGreaterEqual(5003, min_size) + # The read length must be less than MIN_READ_SIZE, since we assume that + # only 1 block is read in the test. self.assertGreaterEqual(min_size, 100) with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf: @@ -3474,6 +3480,9 @@ def read(self, size=-1): # check random seek and read on a file with zipfile.ZipFile(sio, "r") as zipf: with zipf.open("foo.txt", "r") as fp: + # Test this optimized read hasn't rewound and read from the + # start of the file (as in the case of the unoptimized path) + # forward seek old_count = sio.bytes_read fp.seek(10002, os.SEEK_CUR) From f2f2374c297312e78c7660307937d906e066c8c3 Mon Sep 17 00:00:00 2001 From: 5ec1cff <56485584+5ec1cff@users.noreply.github.com> Date: Wed, 25 Dec 2024 10:56:50 +0800 Subject: [PATCH 10/13] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Lib/test/test_zipfile/test_core.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 599467e4a8366b..92758ed14b9ad5 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3464,15 +3464,15 @@ def read(self, size=-1): # 20000 bytes txt = b'0123456789' * 2000 - # The seek length must be greater than ZipExtFile.MIN_READ_SIZE (4096) - # as `ZipExtFile._read2()` reads in blocks of this size and we need to - # seek out of the buffered data + # The seek length must be greater than ZipExtFile.MIN_READ_SIZE + # as `ZipExtFile._read2()` reads in blocks of this size and we + # need to seek out of the buffered data min_size = zipfile.ZipExtFile.MIN_READ_SIZE - self.assertGreaterEqual(10002, min_size) - self.assertGreaterEqual(5003, min_size) + self.assertGreaterEqual(10002, min_size) # for forward seek test + self.assertGreaterEqual(5003, min_size) # for backward seek test # The read length must be less than MIN_READ_SIZE, since we assume that # only 1 block is read in the test. - self.assertGreaterEqual(min_size, 100) + self.assertGreaterEqual(min_size, 100) # for read() calls with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf: zipf.writestr("foo.txt", txt) @@ -3498,7 +3498,7 @@ def read(self, size=-1): # backward seek old_count = sio.bytes_read fp.seek(-5003, os.SEEK_CUR) - self.assertEqual(fp.tell(), 5099) + self.assertEqual(fp.tell(), 5099) # 5099 = 10102 - 5003 self.assertEqual(fp._left, fp._compress_left) arr = fp.read(100) self.assertEqual(fp.tell(), 5199) From a2c903721b750bcb052fb0c73afbad3e220c27f3 Mon Sep 17 00:00:00 2001 From: 5ec1cff Date: Wed, 25 Dec 2024 10:59:14 +0800 Subject: [PATCH 11/13] make StatIO non-local --- Lib/test/test_zipfile/test_core.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 92758ed14b9ad5..2f9f8a2b23751e 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -26,6 +26,7 @@ from test.support.os_helper import ( TESTFN, unlink, rmtree, temp_dir, temp_cwd, fd_count, FakePath ) +import _pyio TESTFN2 = TESTFN + "2" @@ -3447,18 +3448,20 @@ def test_too_short(self): self.assertEqual( b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,))) + +class StatIO(_pyio.BytesIO): + def __init__(self): + super().__init__() + self.bytes_read = 0 + + def read(self, size=-1): + bs = super().read(size) + self.bytes_read += len(bs) + return bs + + class StoredZipExtFileRandomReadTest(unittest.TestCase): def test_random_read(self): - from _pyio import BytesIO - class StatIO(BytesIO): - def __init__(self): - super().__init__() - self.bytes_read = 0 - - def read(self, size=-1): - bs = super().read(size) - self.bytes_read += len(bs) - return bs sio = StatIO() # 20000 bytes From 6a55aadf03658f2fbac85f8cd61235da01d48b28 Mon Sep 17 00:00:00 2001 From: 5ec1cff Date: Thu, 26 Dec 2024 19:40:37 +0800 Subject: [PATCH 12/13] add suggested changes --- Lib/test/test_zipfile/test_core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 2f9f8a2b23751e..fda4870bb12419 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1,3 +1,4 @@ +import _pyio import array import contextlib import importlib.util @@ -26,7 +27,6 @@ from test.support.os_helper import ( TESTFN, unlink, rmtree, temp_dir, temp_cwd, fd_count, FakePath ) -import _pyio TESTFN2 = TESTFN + "2" @@ -3450,6 +3450,8 @@ def test_too_short(self): class StatIO(_pyio.BytesIO): + """Buffer which remembers the number of bytes that were read.""" + def __init__(self): super().__init__() self.bytes_read = 0 @@ -3516,5 +3518,6 @@ def test_random_read(self): fp.seek(12345, os.SEEK_SET) self.assertFalse(fp._eof) + if __name__ == "__main__": unittest.main() From 123900503ad734cd5f69b7e8cf7d046a2655cb33 Mon Sep 17 00:00:00 2001 From: 5ec1cff Date: Thu, 2 Jan 2025 10:46:20 +0800 Subject: [PATCH 13/13] update --- Lib/test/test_zipfile/test_core.py | 54 ++++++++++++++++++------------ Lib/zipfile/__init__.py | 2 +- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index fda4870bb12419..9746877c36a1a2 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3463,7 +3463,9 @@ def read(self, size=-1): class StoredZipExtFileRandomReadTest(unittest.TestCase): - def test_random_read(self): + """Tests whether an uncompressed, unencrypted zip entry can be randomly + seek and read without reading redundant bytes.""" + def test_stored_seek_and_read(self): sio = StatIO() # 20000 bytes @@ -3472,12 +3474,13 @@ def test_random_read(self): # The seek length must be greater than ZipExtFile.MIN_READ_SIZE # as `ZipExtFile._read2()` reads in blocks of this size and we # need to seek out of the buffered data - min_size = zipfile.ZipExtFile.MIN_READ_SIZE - self.assertGreaterEqual(10002, min_size) # for forward seek test - self.assertGreaterEqual(5003, min_size) # for backward seek test + read_buffer_size = zipfile.ZipExtFile.MIN_READ_SIZE + self.assertGreaterEqual(10002, read_buffer_size) # for forward seek test + self.assertGreaterEqual(5003, read_buffer_size) # for backward seek test # The read length must be less than MIN_READ_SIZE, since we assume that # only 1 block is read in the test. - self.assertGreaterEqual(min_size, 100) # for read() calls + read_length = 100 + self.assertGreaterEqual(read_buffer_size, read_length) # for read() calls with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf: zipf.writestr("foo.txt", txt) @@ -3490,33 +3493,42 @@ def test_random_read(self): # forward seek old_count = sio.bytes_read - fp.seek(10002, os.SEEK_CUR) - self.assertEqual(fp.tell(), 10002) + forward_seek_len = 10002 + current_pos = 0 + fp.seek(forward_seek_len, os.SEEK_CUR) + current_pos += forward_seek_len + self.assertEqual(fp.tell(), current_pos) self.assertEqual(fp._left, fp._compress_left) - arr = fp.read(100) - self.assertEqual(fp.tell(), 10102) - self.assertEqual(arr, txt[10002:10102]) + arr = fp.read(read_length) + current_pos += read_length + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(arr, txt[current_pos - read_length:current_pos]) self.assertEqual(fp._left, fp._compress_left) - d = sio.bytes_read - old_count - self.assertLessEqual(d, min_size) + read_count = sio.bytes_read - old_count + self.assertLessEqual(read_count, read_buffer_size) # backward seek old_count = sio.bytes_read - fp.seek(-5003, os.SEEK_CUR) - self.assertEqual(fp.tell(), 5099) # 5099 = 10102 - 5003 + backward_seek_len = 5003 + fp.seek(-backward_seek_len, os.SEEK_CUR) + current_pos -= backward_seek_len + self.assertEqual(fp.tell(), current_pos) self.assertEqual(fp._left, fp._compress_left) - arr = fp.read(100) - self.assertEqual(fp.tell(), 5199) - self.assertEqual(arr, txt[5099:5199]) + arr = fp.read(read_length) + current_pos += read_length + self.assertEqual(fp.tell(), current_pos) + self.assertEqual(arr, txt[current_pos - read_length:current_pos]) self.assertEqual(fp._left, fp._compress_left) - d = sio.bytes_read - old_count - self.assertLessEqual(d, min_size) + read_count = sio.bytes_read - old_count + self.assertLessEqual(read_count, read_buffer_size) # eof flags test fp.seek(0, os.SEEK_END) - self.assertTrue(fp._eof) fp.seek(12345, os.SEEK_SET) - self.assertFalse(fp._eof) + current_pos = 12345 + arr = fp.read(read_length) + current_pos += read_length + self.assertEqual(arr, txt[current_pos - read_length:current_pos]) if __name__ == "__main__": diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 26e899658bab50..ed70885b74d331 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1162,7 +1162,7 @@ def seek(self, offset, whence=os.SEEK_SET): self._offset = buff_offset read_offset = 0 # Fast seek uncompressed unencrypted file - elif self._compress_type == ZIP_STORED and self._decrypter is None: + elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset != 0: # disable CRC checking after first seeking - it would be invalid self._expected_crc = None # seek actual file taking already buffered data into account