Skip to content

Commit 03b2ecf

Browse files
miss-islington5ec1cffblurb-it[bot]picnixz
authored
[3.13] GH-128131: Completely support random read access of uncompressed unencrypted files in ZipFile (GH-128143) (#129091)
GH-128131: Completely support random read access of uncompressed unencrypted files in ZipFile (GH-128143) (cherry picked from commit dda02eb) Co-authored-by: 5ec1cff <[email protected]> Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> Co-authored-by: Bénédikt Tran <[email protected]>
1 parent a1c48a7 commit 03b2ecf

File tree

3 files changed

+88
-1
lines changed

3 files changed

+88
-1
lines changed

Lib/test/test_zipfile/test_core.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import _pyio
12
import array
23
import contextlib
34
import importlib.util
@@ -3454,5 +3455,87 @@ def test_too_short(self):
34543455
b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,)))
34553456

34563457

3458+
class StatIO(_pyio.BytesIO):
    """Buffer which remembers the number of bytes that were read.

    ``bytes_read`` starts at 0 and is increased by the length of every
    chunk returned from :meth:`read`, so a caller can measure how much
    data was actually pulled out of the buffer between two points.
    """

    def __init__(self):
        super().__init__()
        # Running total of the bytes returned by read() so far.
        self.bytes_read = 0

    def read(self, size=-1):
        """Delegate to the parent ``read`` and record the returned length."""
        bs = super().read(size)
        self.bytes_read += len(bs)
        return bs
3469+
3470+
3471+
class StoredZipExtFileRandomReadTest(unittest.TestCase):
    """Tests whether an uncompressed, unencrypted zip entry can be randomly
    sought and read without reading redundant bytes."""

    def test_stored_seek_and_read(self):
        """Seek forward, backward and back from EOF inside a stored entry.

        After every seek + read pair the test checks the reported position,
        the returned data, that ``_left`` and ``_compress_left`` agree, and
        that the underlying buffer served at most one read block.
        """
        sio = StatIO()
        # 20000 bytes
        txt = b'0123456789' * 2000

        # Defined once so the precondition checks below always describe
        # the exact values used by the seeks further down.
        forward_seek_len = 10002
        backward_seek_len = 5003

        # The seek length must be greater than ZipExtFile.MIN_READ_SIZE
        # as `ZipExtFile._read2()` reads in blocks of this size and we
        # need to seek out of the buffered data
        read_buffer_size = zipfile.ZipExtFile.MIN_READ_SIZE
        # for forward seek test
        self.assertGreaterEqual(forward_seek_len, read_buffer_size)
        # for backward seek test
        self.assertGreaterEqual(backward_seek_len, read_buffer_size)
        # The read length must be less than MIN_READ_SIZE, since we assume that
        # only 1 block is read in the test.
        read_length = 100
        # for read() calls
        self.assertGreaterEqual(read_buffer_size, read_length)

        with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf:
            zipf.writestr("foo.txt", txt)

        # check random seek and read on a file
        with zipfile.ZipFile(sio, "r") as zipf:
            with zipf.open("foo.txt", "r") as fp:
                # Test this optimized read hasn't rewound and read from the
                # start of the file (as in the case of the unoptimized path)

                # forward seek
                old_count = sio.bytes_read
                current_pos = 0
                fp.seek(forward_seek_len, os.SEEK_CUR)
                current_pos += forward_seek_len
                self.assertEqual(fp.tell(), current_pos)
                self.assertEqual(fp._left, fp._compress_left)
                arr = fp.read(read_length)
                current_pos += read_length
                self.assertEqual(fp.tell(), current_pos)
                self.assertEqual(arr, txt[current_pos - read_length:current_pos])
                self.assertEqual(fp._left, fp._compress_left)
                read_count = sio.bytes_read - old_count
                self.assertLessEqual(read_count, read_buffer_size)

                # backward seek
                old_count = sio.bytes_read
                fp.seek(-backward_seek_len, os.SEEK_CUR)
                current_pos -= backward_seek_len
                self.assertEqual(fp.tell(), current_pos)
                self.assertEqual(fp._left, fp._compress_left)
                arr = fp.read(read_length)
                current_pos += read_length
                self.assertEqual(fp.tell(), current_pos)
                self.assertEqual(arr, txt[current_pos - read_length:current_pos])
                self.assertEqual(fp._left, fp._compress_left)
                read_count = sio.bytes_read - old_count
                self.assertLessEqual(read_count, read_buffer_size)

                # eof flags test: after seeking to the end, an absolute
                # seek back into the data must still allow reading
                fp.seek(0, os.SEEK_END)
                fp.seek(12345, os.SEEK_SET)
                current_pos = 12345
                arr = fp.read(read_length)
                current_pos += read_length
                self.assertEqual(arr, txt[current_pos - read_length:current_pos])
3539+
34573540
if __name__ == "__main__":
34583541
unittest.main()

Lib/zipfile/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1163,13 +1163,15 @@ def seek(self, offset, whence=os.SEEK_SET):
11631163
self._offset = buff_offset
11641164
read_offset = 0
11651165
# Fast seek uncompressed unencrypted file
1166-
elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0:
1166+
elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset != 0:
11671167
# disable CRC checking after first seeking - it would be invalid
11681168
self._expected_crc = None
11691169
# seek actual file taking already buffered data into account
11701170
read_offset -= len(self._readbuffer) - self._offset
11711171
self._fileobj.seek(read_offset, os.SEEK_CUR)
11721172
self._left -= read_offset
1173+
self._compress_left -= read_offset
1174+
self._eof = self._left <= 0
11731175
read_offset = 0
11741176
# flush read buffer
11751177
self._readbuffer = b''
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Completely support random access of uncompressed unencrypted read-only
2+
zip files obtained by :meth:`ZipFile.open <zipfile.ZipFile.open>`.

0 commit comments

Comments
 (0)