
Commit 840fdf5

🐛 archiving_utils creates deterministic zip archives (#6472)

GitHK (Andrei Neagu) and Andrei Neagu authored
Co-authored-by: Andrei Neagu <[email protected]>

1 parent 369777e · commit 840fdf5

File tree

24 files changed: +196 -31 lines changed
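For background on the fix: `zipfile.ZipFile` stores each member's last-modified timestamp (and OS-specific metadata) in the archive, so zipping byte-identical content at two different times produces archives with different hashes. `repro-zipfile`'s `ReproducibleZipFile` is a drop-in replacement that normalizes that metadata. Below is a minimal sketch of the behavior this commit relies on; the `my_data` directory is hypothetical, and the 2-second sleep just matches zip's timestamp resolution:

```python
import hashlib
import time
import zipfile
from pathlib import Path

from repro_zipfile import ReproducibleZipFile  # pip install repro-zipfile


def make_zip(zip_cls, src_dir: Path, destination: Path) -> str:
    # sort the paths so the member order is stable across runs
    with zip_cls(destination, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for path in sorted(src_dir.rglob("*")):
            if path.is_file():
                zf.write(path, path.relative_to(src_dir))
    return hashlib.sha256(destination.read_bytes()).hexdigest()


src = Path("my_data")  # hypothetical directory with some files in it
for zip_cls in (zipfile.ZipFile, ReproducibleZipFile):
    first = make_zip(zip_cls, src, Path("first.zip"))
    time.sleep(2)  # zip timestamps have 2-second resolution
    for path in src.rglob("*"):
        path.touch()  # same bytes, newer mtime
    second = make_zip(zip_cls, src, Path("second.zip"))
    # expected: ZipFile -> hashes differ; ReproducibleZipFile -> hashes match
    print(zip_cls.__name__, first == second)
```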

packages/aws-library/requirements/_base.txt

Lines changed: 2 additions & 0 deletions
```diff
@@ -224,6 +224,8 @@ referencing==0.29.3
 # -c requirements/../../../packages/service-library/requirements/./constraints.txt
 # jsonschema
 # jsonschema-specifications
+repro-zipfile==0.3.1
+# via -r requirements/../../../packages/service-library/requirements/_base.in
 requests==2.32.3
 # via opentelemetry-exporter-otlp-proto-http
 rich==13.8.1
```

packages/service-library/requirements/_base.in

Lines changed: 1 addition & 0 deletions
```diff
@@ -24,6 +24,7 @@ pydantic
 pyinstrument
 pyyaml
 redis
+repro-zipfile
 tenacity
 toolz
 tqdm
```

packages/service-library/requirements/_base.txt

Lines changed: 2 additions & 0 deletions
```diff
@@ -172,6 +172,8 @@ referencing==0.29.3
 # -c requirements/./constraints.txt
 # jsonschema
 # jsonschema-specifications
+repro-zipfile==0.3.1
+# via -r requirements/_base.in
 requests==2.32.3
 # via opentelemetry-exporter-otlp-proto-http
 rich==13.8.1
```

packages/service-library/requirements/_test.in

Lines changed: 2 additions & 0 deletions
```diff
@@ -17,7 +17,9 @@ coverage
 docker
 faker
 flaky
+numpy
 openapi-spec-validator
+pillow
 pytest
 pytest-aiohttp
 pytest-asyncio
```

packages/service-library/requirements/_test.txt

Lines changed: 4 additions & 0 deletions
```diff
@@ -121,6 +121,8 @@ mypy==1.11.2
 # via sqlalchemy
 mypy-extensions==1.0.0
 # via mypy
+numpy==2.1.1
+# via -r requirements/_test.in
 openapi-schema-validator==0.6.2
 # via
 # -c requirements/_aiohttp.txt
@@ -137,6 +139,8 @@ pathable==0.4.3
 # via
 # -c requirements/_aiohttp.txt
 # jsonschema-path
+pillow==10.4.0
+# via -r requirements/_test.in
 pluggy==1.5.0
 # via pytest
 pprintpp==0.4.0
```

packages/service-library/src/servicelib/archiving_utils.py

Lines changed: 37 additions & 30 deletions
```diff
@@ -4,13 +4,16 @@
 import logging
 import types
 import zipfile
-from contextlib import AsyncExitStack, contextmanager
+from collections.abc import Awaitable, Callable, Iterator
+from contextlib import AsyncExitStack, contextmanager, suppress
 from functools import partial
 from pathlib import Path
-from typing import Any, Awaitable, Callable, Final, Iterator
+from typing import Any, Final
 
 import tqdm
 from models_library.basic_types import IDStr
+from pydantic import NonNegativeFloat
+from repro_zipfile import ReproducibleZipFile  # type: ignore[import-untyped]
 from tqdm.contrib.logging import logging_redirect_tqdm, tqdm_logging_redirect
 
 from .file_utils import remove_directory
@@ -21,8 +24,9 @@
 _MIN: Final[int] = 60  # secs
 _MAX_UNARCHIVING_WORKER_COUNT: Final[int] = 2
 _CHUNK_SIZE: Final[int] = 1024 * 8
+_UNIT_MULTIPLIER: Final[NonNegativeFloat] = 1024.0
 
-log = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 class ArchiveError(Exception):
@@ -35,10 +39,10 @@ def _human_readable_size(size, decimal_places=3):
     human_readable_file_size = float(size)
     unit = "B"
     for t_unit in ["B", "KiB", "MiB", "GiB", "TiB"]:
-        if human_readable_file_size < 1024.0:
+        if human_readable_file_size < _UNIT_MULTIPLIER:
             unit = t_unit
             break
-        human_readable_file_size /= 1024.0
+        human_readable_file_size /= _UNIT_MULTIPLIER
 
     return f"{human_readable_file_size:.{decimal_places}f}{unit}"
 
@@ -56,19 +60,21 @@ def _iter_files_to_compress(
     dir_path: Path, exclude_patterns: set[str] | None
 ) -> Iterator[Path]:
     exclude_patterns = exclude_patterns if exclude_patterns else set()
-    for path in dir_path.rglob("*"):
+    # NOTE: make sure to sort paths, otherwise between different runs
+    # the zip will have a different structure and hash
+    for path in sorted(dir_path.rglob("*")):
         if path.is_file() and not any(
             fnmatch.fnmatch(f"{path}", x) for x in exclude_patterns
         ):
             yield path
 
 
 def _strip_directory_from_path(input_path: Path, to_strip: Path) -> Path:
-    _to_strip = f"{str(to_strip)}/"
+    _to_strip = f"{to_strip}/"
     return Path(str(input_path).replace(_to_strip, ""))
 
 
-class _FastZipFileReader(zipfile.ZipFile):
+class _FastZipFileReader(ReproducibleZipFile):
     """
     Used to gain a speed boost of several orders of magnitude.
@@ -86,7 +92,7 @@ class _FastZipFileReader(zipfile.ZipFile):
     files contained in the archive.
     """
 
-    def _RealGetContents(self):
+    def _RealGetContents(self):  # noqa: N802
         """method disabled"""
 
 
@@ -107,7 +113,7 @@ def _zipfile_single_file_extract_worker(
     zip_file_path: Path,
     file_in_archive: zipfile.ZipInfo,
     destination_folder: Path,
-    is_dir: bool,
+    is_dir: bool,  # noqa: FBT001
 ) -> Path:
     """Extracts file_in_archive from the archive zip_file_path -> destination_folder/file_in_archive
 
@@ -129,7 +135,7 @@ def _zipfile_single_file_extract_worker(
         desc=desc,
         **(
             _TQDM_FILE_OPTIONS
-            | dict(miniters=_compute_tqdm_miniters(file_in_archive.file_size))
+            | {"miniters": _compute_tqdm_miniters(file_in_archive.file_size)}
         ),
     ) as pbar:
         while chunk := zip_fp.read(_CHUNK_SIZE):
@@ -139,7 +145,7 @@
 
 
 def _ensure_destination_subdirectories_exist(
-    zip_file_handler: zipfile.ZipFile, destination_folder: Path
+    zip_file_handler: ReproducibleZipFile, destination_folder: Path
 ) -> None:
     # assemble full destination paths
     full_destination_paths = {
@@ -177,7 +183,7 @@ async def unarchive_dir(
     )
     async with AsyncExitStack() as zip_stack:
        zip_file_handler = zip_stack.enter_context(
-            zipfile.ZipFile(  # pylint: disable=consider-using-with
+            ReproducibleZipFile(  # pylint: disable=consider-using-with
                 archive_to_extract,
                 mode="r",
             )
@@ -232,7 +238,7 @@ async def unarchive_dir(
                 extracted_path = await future
                 extracted_file_size = extracted_path.stat().st_size
                 if tqdm_progress.update(extracted_file_size) and log_cb:
-                    with log_catch(log, reraise=False):
+                    with log_catch(_logger, reraise=False):
                         await log_cb(f"{tqdm_progress}")
                 await sub_prog.update(extracted_file_size)
                 extracted_paths.append(extracted_path)
@@ -266,34 +272,37 @@ async def unarchive_dir(
 
 @contextmanager
 def _progress_enabled_zip_write_handler(
-    zip_file_handler: zipfile.ZipFile, progress_bar: tqdm.tqdm
-) -> Iterator[zipfile.ZipFile]:
+    zip_file_handler: ReproducibleZipFile, progress_bar: tqdm.tqdm
+) -> Iterator[ReproducibleZipFile]:
     """This function overrides the default zip write fct to allow to get progress using tqdm library"""
 
     def _write_with_progress(
-        original_write_fct, self, data, pbar  # pylint: disable=unused-argument
+        original_write_fct,
+        self,  # pylint: disable=unused-argument # noqa: ARG001
+        data,
+        pbar,
     ):
         pbar.update(len(data))
         return original_write_fct(data)
 
     # Replace original write() with a wrapper to track progress
     assert zip_file_handler.fp  # nosec
     old_write_method = zip_file_handler.fp.write
-    zip_file_handler.fp.write = types.MethodType(  # type: ignore[assignment]
+    zip_file_handler.fp.write = types.MethodType(
         partial(_write_with_progress, old_write_method, pbar=progress_bar),
         zip_file_handler.fp,
     )
     try:
         yield zip_file_handler
     finally:
-        zip_file_handler.fp.write = old_write_method  # type: ignore[method-assign]
+        zip_file_handler.fp.write = old_write_method
 
 
 def _add_to_archive(
     dir_to_compress: Path,
     destination: Path,
-    compress: bool,
-    store_relative_path: bool,
+    compress: bool,  # noqa: FBT001
+    store_relative_path: bool,  # noqa: FBT001
     update_progress,
     loop,
     exclude_patterns: set[str] | None = None,
@@ -308,11 +317,10 @@ def _add_to_archive(
         desc=f"{desc}\n",
         total=folder_size_bytes,
         **(
-            _TQDM_FILE_OPTIONS
-            | dict(miniters=_compute_tqdm_miniters(folder_size_bytes))
+            _TQDM_FILE_OPTIONS | {"miniters": _compute_tqdm_miniters(folder_size_bytes)}
         ),
     ) as progress_bar, _progress_enabled_zip_write_handler(
-        zipfile.ZipFile(destination, "w", compression=compression), progress_bar
+        ReproducibleZipFile(destination, "w", compression=compression), progress_bar
     ) as zip_file_handler:
         for file_to_add in _iter_files_to_compress(dir_to_compress, exclude_patterns):
             progress_bar.set_description(f"{desc}/{file_to_add.name}\n")
@@ -393,10 +401,11 @@ async def archive_dir(
         if destination.is_file():
             destination.unlink(missing_ok=True)
 
-        raise ArchiveError(
+        msg = (
             f"Failed archiving {dir_to_compress} -> {destination} due to {type(err)}."
             f"Details: {err}"
-        ) from err
+        )
+        raise ArchiveError(msg) from err
 
     except BaseException:
         if destination.is_file():
@@ -453,11 +462,9 @@ def prune(self, exclude: set[Path]) -> None:
             if path.is_file():
                 path.unlink()
             elif path.is_dir():
-                try:
+                # prevents deleting non-empty folders
+                with suppress(OSError):
                     path.rmdir()
-                except OSError:
-                    # prevents deleting non-empty folders
-                    pass
 
         # second pass to delete empty folders
         # after deleting files, some folders might have been left empty
```
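Beyond the `ReproducibleZipFile` swap, the refactored `_progress_enabled_zip_write_handler` keeps a pattern worth calling out: it temporarily rebinds `write` on the zip's underlying file object so every chunk written also advances the tqdm bar, then restores the original in a `finally`. A self-contained sketch of that rebinding technique, with illustrative names that are not part of the library:

```python
import types
from collections.abc import Iterator
from contextlib import contextmanager
from functools import partial

import tqdm


@contextmanager
def track_writes(fp, pbar: tqdm.tqdm) -> Iterator[None]:
    """Temporarily replace fp.write so each call also updates pbar."""

    def _write_with_progress(original_write, self, data, pbar):  # noqa: ARG001
        pbar.update(len(data))
        return original_write(data)

    original_write = fp.write
    # Bind the wrapper to fp; the instance attribute shadows the real method
    fp.write = types.MethodType(
        partial(_write_with_progress, original_write, pbar=pbar), fp
    )
    try:
        yield
    finally:
        fp.write = original_write  # always restore the original write


# Usage: each write advances the bar by the number of bytes written
with open("example.bin", "wb") as fp, tqdm.tqdm(
    total=3 * 5, unit="B", unit_scale=True
) as pbar, track_writes(fp, pbar):
    for _ in range(3):
        fp.write(b"hello")
```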

packages/service-library/tests/test_archiving_utils.py

Lines changed: 91 additions & 1 deletion
```diff
@@ -11,17 +11,20 @@
 import secrets
 import string
 import tempfile
+from collections.abc import AsyncIterable, Callable, Iterable, Iterator
 from concurrent.futures import ProcessPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Callable, Iterable, Iterator
 
+import numpy
 import pytest
 from faker import Faker
+from PIL import Image
 from pydantic import ByteSize, parse_obj_as
 from pytest_benchmark.plugin import BenchmarkFixture
 from servicelib import archiving_utils
 from servicelib.archiving_utils import ArchiveError, archive_dir, unarchive_dir
+from servicelib.file_utils import remove_directory
 
 
 def _print_tree(path: Path, level=0):
@@ -596,3 +599,90 @@ def run_async_test(*args, **kwargs):
     )
 
     benchmark(run_async_test)
+
+
+def _touch_all_files_in_path(path_to_archive: Path) -> None:
+    for path in path_to_archive.rglob("*"):
+        print("touching", path)
+        path.touch()
+
+
+@pytest.fixture
+async def mixed_file_types(tmp_path: Path, faker: Faker) -> AsyncIterable[Path]:
+    base_dir = tmp_path / "mixed_types_dir"
+    base_dir.mkdir()
+
+    # mixed small text files and binary files
+    (base_dir / "empty").mkdir()
+    (base_dir / "d1").mkdir()
+    (base_dir / "d1" / "f1.txt").write_text(faker.text())
+    (base_dir / "d1" / "b2.bin").write_bytes(faker.json_bytes())
+    (base_dir / "d1" / "sd1").mkdir()
+    (base_dir / "d1" / "sd1" / "f1.txt").write_text(faker.text())
+    (base_dir / "d1" / "sd1" / "b2.bin").write_bytes(faker.json_bytes())
+    (base_dir / "images").mkdir()
+
+    # images cause issues with zipping, below content produced different
+    # hashes for zip files
+    for i in range(2):
+        image_dir = base_dir / f"images{i}"
+        image_dir.mkdir()
+        for n in range(50):
+            a = numpy.random.rand(900, 900, 3) * 255  # noqa: NPY002
+            im_out = Image.fromarray(a.astype("uint8")).convert("RGB")
+            image_path = image_dir / f"out{n}.jpg"
+            im_out.save(image_path)
+
+    print("mixed_types_dir ---")
+    _print_tree(base_dir)
+
+    yield base_dir
+
+    await remove_directory(base_dir)
+    assert not base_dir.exists()
+
+
+@pytest.mark.parametrize(
+    "store_relative_path, compress",
+    [
+        # test that all possible combinations still work
+        pytest.param(False, False, id="no_relative_path_no_compress"),
+        pytest.param(False, True, id="no_relative_path_with_compression"),
+        pytest.param(True, False, id="nodeports_options"),
+        pytest.param(True, True, id="with_relative_path_with_compression"),
+    ],
+)
+async def test_regression_archive_hash_does_not_change(
+    mixed_file_types: Path,
+    tmp_path: Path,
+    store_relative_path: bool,
+    compress: bool,
+):
+    destination_path = tmp_path / "archives_to_compare"
+    destination_path.mkdir(parents=True, exist_ok=True)
+
+    first_archive = destination_path / "first"
+    second_archive = destination_path / "second"
+    assert not first_archive.exists()
+    assert not second_archive.exists()
+    assert first_archive != second_archive
+
+    await archive_dir(
+        mixed_file_types,
+        first_archive,
+        compress=compress,
+        store_relative_path=store_relative_path,
+    )
+
+    _touch_all_files_in_path(mixed_file_types)
+
+    await archive_dir(
+        mixed_file_types,
+        second_archive,
+        compress=compress,
+        store_relative_path=store_relative_path,
+    )
+
+    _, first_hash = _compute_hash(first_archive)
+    _, second_hash = _compute_hash(second_archive)
+    assert first_hash == second_hash
```
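`_compute_hash` is defined earlier in the test module and is not part of this diff; the regression test only needs it to return the archive path together with a digest of its bytes. A rough, hypothetical equivalent assuming a synchronous chunked SHA-256 (the real helper may differ, e.g. it could be async):

```python
import hashlib
from pathlib import Path


def _compute_hash(file_path: Path) -> tuple[Path, str]:
    # stream in chunks so large archives are not loaded into memory at once
    digest = hashlib.sha256()
    with file_path.open("rb") as fp:
        while chunk := fp.read(1024 * 8):
            digest.update(chunk)
    return file_path, digest.hexdigest()
```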

packages/simcore-sdk/requirements/_base.txt

Lines changed: 2 additions & 0 deletions
```diff
@@ -247,6 +247,8 @@ referencing==0.29.3
 # -c requirements/../../../packages/service-library/requirements/./constraints.txt
 # jsonschema
 # jsonschema-specifications
+repro-zipfile==0.3.1
+# via -r requirements/../../../packages/service-library/requirements/_base.in
 requests==2.32.3
 # via opentelemetry-exporter-otlp-proto-http
 rich==13.8.1
```

services/api-server/requirements/_base.txt

Lines changed: 4 additions & 0 deletions
```diff
@@ -467,6 +467,10 @@ redis==5.0.4
 # -c requirements/../../../requirements/constraints.txt
 # -r requirements/../../../packages/service-library/requirements/_base.in
 # -r requirements/../../../packages/simcore-sdk/requirements/../../../packages/service-library/requirements/_base.in
+repro-zipfile==0.3.1
+# via
+#   -r requirements/../../../packages/service-library/requirements/_base.in
+#   -r requirements/../../../packages/simcore-sdk/requirements/../../../packages/service-library/requirements/_base.in
 requests==2.32.3
 # via opentelemetry-exporter-otlp-proto-http
 rich==13.7.1
```
