
♻️ export data feature enhancements #7498


Merged
@@ -1,8 +1,7 @@
 from common_library.errors_classes import OsparcErrorMixin


-class StorageRuntimeError(OsparcErrorMixin, RuntimeError):
-    ...
+class StorageRuntimeError(OsparcErrorMixin, RuntimeError): ...


 class ConfigurationError(StorageRuntimeError):
@@ -45,3 +44,7 @@ class InvalidFileIdentifierError(AccessLayerError):

 class DatCoreCredentialsMissingError(StorageRuntimeError):
     msg_template: str = "DatCore credentials are incomplete. TIP: Check your settings"
+
+
+class SelectionNotAllowedError(StorageRuntimeError):
+    msg_template: str = "Selection='{selection}' must be from the same folder"
11 changes: 9 additions & 2 deletions services/storage/src/simcore_service_storage/simcore_s3_dsm.py
@@ -61,6 +61,7 @@
     LinkAlreadyExistsError,
     ProjectAccessRightError,
     ProjectNotFoundError,
+    SelectionNotAllowedError,
 )
 from .models import (
     DatasetMetaData,
@@ -81,9 +82,11 @@
 from .modules.s3 import get_s3_client
 from .utils.s3_utils import S3TransferDataCB
 from .utils.simcore_s3_dsm_utils import (
+    UserSelectionStr,
     compute_file_id_prefix,
     create_and_upload_export,
     create_random_export_name,
+    ensure_user_selection_from_same_base_directory,
     expand_directory,
     get_accessible_project_ids,
     get_directory_file_id,
@@ -1249,7 +1252,11 @@ async def create_s3_export(
         *,
         progress_bar: ProgressBarData,
     ) -> StorageFileID:
-        source_object_keys: set[StorageFileID] = set()
+        source_object_keys: set[tuple[UserSelectionStr, StorageFileID]] = set()
+
+        # ensure all selected items have the same parent
+        if not ensure_user_selection_from_same_base_directory(object_keys):
+            raise SelectionNotAllowedError(selection=object_keys)

         # check access rights
         for object_key in object_keys:
@@ -1279,7 +1286,7 @@ async def create_s3_export(
             self.simcore_bucket_name, object_key
         ):
             for entry in meta_data_files:
-                source_object_keys.add(entry.object_key)
+                source_object_keys.add((object_key, entry.object_key))

         _logger.debug(
             "User selection '%s' includes '%s' files",
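
Why the set now holds pairs: each expanded object key is stored together with the user selection it came from, so the export step can later derive archive paths relative to that selection. A sketch with hypothetical keys:

# Hypothetical keys, for illustration only: exporting the folder
# "proj-1/node-a" expands into its contained files, each paired with the
# originating selection so relative archive paths can be derived later.
source_object_keys: set[tuple[str, str]] = {
    ("proj-1/node-a", "proj-1/node-a/inputs.json"),
    ("proj-1/node-a", "proj-1/node-a/results/output.csv"),
}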
services/storage/src/simcore_service_storage/utils/simcore_s3_dsm_utils.py
@@ -1,10 +1,12 @@
 from contextlib import suppress
 from pathlib import Path
+from typing import TypeAlias
 from uuid import uuid4

 import orjson
 from aws_library.s3 import S3MetaData, SimcoreS3API
 from aws_library.s3._constants import STREAM_READER_CHUNK_SIZE
+from aws_library.s3._models import S3ObjectKey
 from models_library.api_schemas_storage.storage_schemas import S3BucketName
 from models_library.projects import ProjectID
 from models_library.projects_nodes_io import (
@@ -143,20 +145,40 @@ def create_random_export_name(user_id: UserID) -> StorageFileID:
     )


+def ensure_user_selection_from_same_base_directory(
+    object_keys: list[S3ObjectKey],
+) -> bool:
+    parents = [Path(x).parent for x in object_keys]
+    return len(set(parents)) <= 1
+
+
+UserSelectionStr: TypeAlias = str
+
+
+def _base_path_parent(base_path: UserSelectionStr, s3_object: S3ObjectKey) -> str:
+    base_path_parent_path = Path(base_path).parent
+    s3_object_path = Path(s3_object)
+    if base_path_parent_path == s3_object_path:
+        return s3_object_path.name
+
+    result = s3_object_path.relative_to(base_path_parent_path)
+    return f"{result}"
+
+
 async def create_and_upload_export(
     s3_client: SimcoreS3API,
     bucket: S3BucketName,
     *,
-    source_object_keys: set[StorageFileID],
+    source_object_keys: set[tuple[UserSelectionStr, StorageFileID]],
     destination_object_keys: StorageFileID,
     progress_bar: ProgressBarData,
 ) -> None:
     archive_entries: ArchiveEntries = [
         (
-            s3_object,
+            _base_path_parent(selection, s3_object),
             await s3_client.get_bytes_streamer_from_object(bucket, s3_object),
         )
-        for s3_object in source_object_keys
+        for (selection, s3_object) in source_object_keys
     ]

     async with progress_bar:
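
In short: the guard accepts a selection only when every key shares a single parent directory, and _base_path_parent turns an object key into an archive entry name rooted at the selected item. A few sanity checks, with values mirroring the parametrized tests further down:

from simcore_service_storage.utils.simcore_s3_dsm_utils import (
    _base_path_parent,
    ensure_user_selection_from_same_base_directory,
)

# both files sit directly in the (implicit) root -> same parent, allowed
assert ensure_user_selection_from_same_base_directory(["a.txt", "b.txt"])
# parents "firsta" and "second" differ -> rejected
assert not ensure_user_selection_from_same_base_directory(["firsta/file", "second/file"])

# the selected folder "c" becomes the archive's top-level directory
assert _base_path_parent("a/b/c", "a/b/c/d/e/f/g") == "c/d/e/f/g"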
13 changes: 8 additions & 5 deletions services/storage/tests/unit/test_rpc_handlers_simcore_s3.py
@@ -592,17 +592,20 @@ async def test_start_export_data(
 ):
     _, src_projects_list = await random_project_with_files(project_params)

-    paths_to_export: set[SimcoreS3FileID] = set()
+    all_available_files: set[SimcoreS3FileID] = set()
     for x in src_projects_list.values():
-        paths_to_export |= x.keys()
+        all_available_files |= x.keys()
+
+    nodes_in_project_to_export = {
+        TypeAdapter(PathToExport).validate_python("/".join(Path(x).parts[0:2]))
+        for x in all_available_files
+    }

     result = await _request_start_export_data(
         storage_rabbitmq_rpc_client,
         user_id,
         product_name,
-        paths_to_export=[
-            TypeAdapter(PathToExport).validate_python(x) for x in paths_to_export
-        ],
+        paths_to_export=list(nodes_in_project_to_export),
     )

     assert re.fullmatch(
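
The test now requests exports at node level rather than per file; the first two path parts of a file ID identify the project and node. A small sketch with hypothetical IDs:

from pathlib import Path

# File IDs have the shape "<project_id>/<node_id>/<path inside the node>";
# the first two parts give the node-level path to export.
file_id = "8ad2f0a3/3b1c-node/outputs/result.csv"
assert "/".join(Path(file_id).parts[0:2]) == "8ad2f0a3/3b1c-node"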
6 changes: 5 additions & 1 deletion services/storage/tests/unit/test_simcore_s3_dsm.py
@@ -243,7 +243,11 @@ async def test_create_s3_export(
     cleanup_files_closure: Callable[[SimcoreS3FileID], None],
 ):
     initial_fmd_count = await _get_fmds_count(sqlalchemy_async_engine)
-    selection_to_export = _get_folder_and_files_selection(paths_for_export)
+    all_files_to_export = _get_folder_and_files_selection(paths_for_export)
+    selection_to_export = {
+        S3ObjectKey(project_id)
+        for project_id in {Path(p).parents[-2] for p in all_files_to_export}
+    }

     reports: list[ProgressReport] = []

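Why parents[-2] yields the project directory: Path.parents runs from the immediate parent up to the root, so the second-to-last entry is the top-level path component:

from pathlib import Path

# parents of "project_id/node_id/file" are, in order:
#   Path("project_id/node_id"), Path("project_id"), Path(".")
# so parents[-2] is the top-level (project) directory.
assert Path("project_id/node_id/file").parents[-2] == Path("project_id")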
56 changes: 55 additions & 1 deletion services/storage/tests/unit/test_simcore_s3_dsm_utils.py
@@ -1,5 +1,13 @@
+from pathlib import Path
+
 import pytest
-from simcore_service_storage.utils.simcore_s3_dsm_utils import compute_file_id_prefix
+from aws_library.s3._models import S3ObjectKey
+from simcore_service_storage.utils.simcore_s3_dsm_utils import (
+    UserSelectionStr,
+    _base_path_parent,
+    compute_file_id_prefix,
+    ensure_user_selection_from_same_base_directory,
+)


 @pytest.mark.parametrize(
@@ -19,3 +27,49 @@
 )
 def test_compute_file_id_prefix(file_id, levels, expected):
     assert compute_file_id_prefix(file_id, levels) == expected
+
+
+_FOLDERS_PATH = Path("nested/folders/path")
+
+
+@pytest.mark.parametrize(
+    "selection, s3_object, expected",
+    [
+        ("single_file", "single_file", "single_file"),
+        ("single_folder", "single_folder", "single_folder"),
+        ("a/b/c", "a/b/c/d/e/f/g", "c/d/e/f/g"),
+        (_FOLDERS_PATH / "folder", _FOLDERS_PATH / "folder", "folder"),
+        (_FOLDERS_PATH / "a_file.txt", _FOLDERS_PATH / "a_file.txt", "a_file.txt"),
+        (_FOLDERS_PATH, _FOLDERS_PATH / "with/some/content", "path/with/some/content"),
+    ],
+)
+def test__base_path_parent(selection: Path | str, s3_object: Path, expected: str):
+    assert (
+        _base_path_parent(UserSelectionStr(f"{selection}"), S3ObjectKey(f"{s3_object}"))
+        == expected
+    )
+
+
+@pytest.mark.parametrize(
+    "user_selection, expected",
+    [
+        ([], True),
+        (["folder"], True),
+        (["folder", "folder"], True),
+        (["", ""], True),
+        ([""], True),
+        ([_FOLDERS_PATH / "a", _FOLDERS_PATH / "b"], True),
+        (["a.txt", "b.txt"], True),
+        (["a/a.txt"], True),
+        # not same parent
+        (["firsta/file", "second/file"], False),
+        (["a/a.txt", "a.txt", "c.txt", "a/d.txt"], False),
+    ],
+)
+def test_ensure_user_selection_from_same_base_directory(
+    user_selection: list[S3ObjectKey | Path], expected: bool
+):
+    assert (
+        ensure_user_selection_from_same_base_directory([f"{x}" for x in user_selection])
+        == expected
+    )