diff --git a/api/specs/web-server/_storage.py b/api/specs/web-server/_storage.py index 50e1eaeb5fd..4daa159fc29 100644 --- a/api/specs/web-server/_storage.py +++ b/api/specs/web-server/_storage.py @@ -4,10 +4,11 @@ # pylint: disable=too-many-arguments -from typing import TypeAlias +from typing import Annotated, TypeAlias from uuid import UUID -from fastapi import APIRouter, Query, status +from fastapi import APIRouter, Depends, Query, status +from fastapi_pagination.cursor import CursorPage from models_library.api_schemas_storage.storage_schemas import ( FileLocation, FileMetaDataGet, @@ -20,9 +21,11 @@ ) from models_library.api_schemas_webserver.storage import ( DataExportPost, + ListPathsQueryParams, StorageAsyncJobGet, StorageAsyncJobResult, StorageAsyncJobStatus, + StorageLocationPathParams, ) from models_library.generics import Envelope from models_library.projects_nodes_io import LocationID @@ -53,12 +56,25 @@ async def list_storage_locations(): """Returns the list of available storage locations""" +@router.get( + "/storage/locations/{location_id}/paths", + response_model=CursorPage[FileMetaDataGet], +) +async def list_storage_paths( + _path: Annotated[StorageLocationPathParams, Depends()], + _query: Annotated[ListPathsQueryParams, Depends()], +): + """Lists the files/directories in WorkingDirectory""" + + @router.get( "/storage/locations/{location_id}/datasets", response_model=Envelope[list[DatasetMetaData]], description="Get datasets metadata", ) -async def list_datasets_metadata(location_id: LocationID): +async def list_datasets_metadata( + _path: Annotated[StorageLocationPathParams, Depends()], +): """returns all the top level datasets a user has access to""" @@ -68,7 +84,7 @@ async def list_datasets_metadata(location_id: LocationID): description="Get datasets metadata", ) async def get_files_metadata( - location_id: LocationID, + _path: Annotated[StorageLocationPathParams, Depends()], uuid_filter: str = "", expand_dirs: bool = Query( True, diff --git a/api/specs/web-server/requirements.txt b/api/specs/web-server/requirements.txt index 62bea8dd0a9..8ffca6a489d 100644 --- a/api/specs/web-server/requirements.txt +++ b/api/specs/web-server/requirements.txt @@ -3,6 +3,7 @@ --constraint ../../../requirements/constraints.txt fastapi +fastapi-pagination jsonref pydantic pydantic-extra-types diff --git a/packages/aws-library/src/aws_library/s3/_client.py b/packages/aws-library/src/aws_library/s3/_client.py index e62d55d2791..992b3cbc48f 100644 --- a/packages/aws-library/src/aws_library/s3/_client.py +++ b/packages/aws-library/src/aws_library/s3/_client.py @@ -28,19 +28,28 @@ from settings_library.s3 import S3Settings from types_aiobotocore_s3 import S3Client from types_aiobotocore_s3.literals import BucketLocationConstraintType -from types_aiobotocore_s3.type_defs import ObjectIdentifierTypeDef +from types_aiobotocore_s3.type_defs import ( + ListObjectsV2RequestRequestTypeDef, + ObjectIdentifierTypeDef, +) -from ._constants import MULTIPART_COPY_THRESHOLD, MULTIPART_UPLOADS_MIN_TOTAL_SIZE +from ._constants import ( + MULTIPART_COPY_THRESHOLD, + MULTIPART_UPLOADS_MIN_TOTAL_SIZE, + S3_OBJECT_DELIMITER, +) from ._error_handler import s3_exception_handler, s3_exception_handler_async_gen from ._errors import S3DestinationNotEmptyError, S3KeyNotFoundError from ._models import ( MultiPartUploadLinks, + PathCursor, S3DirectoryMetaData, S3MetaData, S3ObjectKey, + S3ObjectPrefix, UploadID, ) -from ._utils import compute_num_file_chunks +from ._utils import compute_num_file_chunks, 
create_final_prefix _logger = logging.getLogger(__name__) @@ -167,7 +176,99 @@ async def get_directory_metadata( size = 0 async for s3_object in self._list_all_objects(bucket=bucket, prefix=prefix): size += s3_object.size - return S3DirectoryMetaData(size=size) + return S3DirectoryMetaData(prefix=S3ObjectPrefix(prefix), size=ByteSize(size)) + + @s3_exception_handler(_logger) + async def count_objects( + self, + *, + bucket: S3BucketName, + prefix: S3ObjectPrefix | None, + start_after: S3ObjectKey | None, + is_partial_prefix: bool = False, + use_delimiter: bool = True, + ) -> int: + """returns the number of entries in the bucket, defined + by prefix and start_after same as list_objects + """ + paginator = self._client.get_paginator("list_objects_v2") + total_count = 0 + async for page in paginator.paginate( + Bucket=bucket, + Prefix=create_final_prefix(prefix, is_partial_prefix=is_partial_prefix), + StartAfter=start_after or "", + Delimiter=S3_OBJECT_DELIMITER if use_delimiter else "", + ): + total_count += page.get("KeyCount", 0) + return total_count + + @s3_exception_handler(_logger) + async def list_objects( + self, + *, + bucket: S3BucketName, + prefix: S3ObjectPrefix | None, + start_after: S3ObjectKey | None, + limit: int = _MAX_ITEMS_PER_PAGE, + next_cursor: PathCursor | None = None, + is_partial_prefix: bool = False, + ) -> tuple[list[S3MetaData | S3DirectoryMetaData], PathCursor | None]: + """returns a number of entries in the bucket, defined by limit + the entries are sorted alphabetically by key. If a cursor is returned + then the client can call the function again with the cursor to get the + next entries. + + the first entry is defined by start_after + if start_after is None, the first entry is the first one in the bucket + if prefix is not None, only entries with the given prefix are returned + if prefix is None, all entries in the bucket are returned + if next_cursor is set, then the call will return the next entries after the cursor + if is_partial_prefix is set then the prefix is not auto-delimited + (if False equivalent to `ls /home/user/` + if True equivalent to `ls /home/user*`) + limit must be >= 1 and <= _AWS_MAX_ITEMS_PER_PAGE + + Raises: + ValueError: in case of invalid limit + """ + if limit < 1: + msg = "num_objects must be >= 1" + raise ValueError(msg) + if limit > _AWS_MAX_ITEMS_PER_PAGE: + msg = f"num_objects must be <= {_AWS_MAX_ITEMS_PER_PAGE}" + raise ValueError(msg) + + list_config: ListObjectsV2RequestRequestTypeDef = { + "Bucket": bucket, + "Prefix": create_final_prefix(prefix, is_partial_prefix=is_partial_prefix), + "MaxKeys": limit, + "Delimiter": S3_OBJECT_DELIMITER, + } + if start_after: + list_config["StartAfter"] = start_after + if next_cursor: + list_config["ContinuationToken"] = next_cursor + listed_objects = await self._client.list_objects_v2(**list_config) + found_objects: list[S3MetaData | S3DirectoryMetaData] = [] + if "CommonPrefixes" in listed_objects: + # we have folders here + list_subfolders = listed_objects["CommonPrefixes"] + found_objects.extend( + S3DirectoryMetaData.model_construct( + prefix=S3ObjectPrefix(subfolder["Prefix"], size=None) + ) + for subfolder in list_subfolders + if "Prefix" in subfolder + ) + if "Contents" in listed_objects: + found_objects.extend( + S3MetaData.from_botocore_list_objects(obj) + for obj in listed_objects["Contents"] + ) + next_cursor = None + if listed_objects["IsTruncated"]: + next_cursor = listed_objects["NextContinuationToken"] + return found_objects, next_cursor 
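A minimal usage sketch for the cursor-based listing introduced above (illustrative only; it assumes an already-initialised SimcoreS3API instance named s3_api and an existing bucket, and uses only the signatures defined in this diff). Passing the returned PathCursor back as next_cursor resumes the listing exactly where the previous page stopped, since it maps directly to S3's ContinuationToken:

    from pathlib import Path

    async def walk_one_level(s3_api, bucket, prefix: Path):
        entries, cursor = [], None
        while True:
            page, cursor = await s3_api.list_objects(
                bucket=bucket,
                prefix=prefix,
                start_after=None,
                limit=50,
                next_cursor=cursor,
            )
            entries.extend(page)  # mix of S3MetaData and S3DirectoryMetaData
            if cursor is None:  # no NextContinuationToken returned -> last page
                return entries
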
@s3_exception_handler_async_gen(_logger) async def list_objects_paginated( @@ -459,7 +560,7 @@ async def copy_objects_recursively( dst_metadata = await self.get_directory_metadata( bucket=bucket, prefix=dst_prefix ) - if dst_metadata.size > 0: + if dst_metadata.size and dst_metadata.size > 0: raise S3DestinationNotEmptyError(dst_prefix=dst_prefix) await limited_gather( *[ diff --git a/packages/aws-library/src/aws_library/s3/_constants.py b/packages/aws-library/src/aws_library/s3/_constants.py index a94cd555f43..882c02774d2 100644 --- a/packages/aws-library/src/aws_library/s3/_constants.py +++ b/packages/aws-library/src/aws_library/s3/_constants.py @@ -12,3 +12,4 @@ PRESIGNED_LINK_MAX_SIZE: Final[ByteSize] = TypeAdapter(ByteSize).validate_python("5GiB") S3_MAX_FILE_SIZE: Final[ByteSize] = TypeAdapter(ByteSize).validate_python("5TiB") +S3_OBJECT_DELIMITER: Final[str] = "/" diff --git a/packages/aws-library/src/aws_library/s3/_models.py b/packages/aws-library/src/aws_library/s3/_models.py index f02a4765fad..4d722386526 100644 --- a/packages/aws-library/src/aws_library/s3/_models.py +++ b/packages/aws-library/src/aws_library/s3/_models.py @@ -1,23 +1,24 @@ import datetime -from dataclasses import dataclass -from typing import TypeAlias +from pathlib import Path +from typing import TypeAlias, cast from models_library.api_schemas_storage.storage_schemas import ETag from models_library.basic_types import SHA256Str -from pydantic import AnyUrl, BaseModel, ByteSize +from pydantic import AnyUrl, BaseModel, ByteSize, Field from types_aiobotocore_s3.type_defs import HeadObjectOutputTypeDef, ObjectTypeDef S3ObjectKey: TypeAlias = str +S3ObjectPrefix: TypeAlias = Path UploadID: TypeAlias = str +PathCursor: TypeAlias = str -@dataclass(frozen=True, slots=True, kw_only=True) -class S3MetaData: +class S3MetaData(BaseModel, frozen=True): object_key: S3ObjectKey last_modified: datetime.datetime e_tag: ETag sha256_checksum: SHA256Str | None - size: int + size: ByteSize @staticmethod def from_botocore_head_object( @@ -27,12 +28,8 @@ def from_botocore_head_object( object_key=object_key, last_modified=obj["LastModified"], e_tag=obj["ETag"].strip('"'), - sha256_checksum=( - SHA256Str(obj.get("ChecksumSHA256")) - if obj.get("ChecksumSHA256") - else None - ), - size=obj["ContentLength"], + sha256_checksum=obj.get("ChecksumSHA256"), + size=ByteSize(obj["ContentLength"]), ) @staticmethod @@ -47,18 +44,22 @@ def from_botocore_list_objects( object_key=obj["Key"], last_modified=obj["LastModified"], e_tag=obj["ETag"].strip('"'), - sha256_checksum=( - SHA256Str(obj.get("ChecksumSHA256")) - if obj.get("ChecksumSHA256") - else None - ), - size=obj["Size"], + sha256_checksum=cast(SHA256Str | None, obj.get("ChecksumSHA256")), + size=ByteSize(obj["Size"]), ) + def as_path(self) -> Path: + return Path(self.object_key) -@dataclass(frozen=True) -class S3DirectoryMetaData: - size: int + +class S3DirectoryMetaData(BaseModel, frozen=True): + prefix: S3ObjectPrefix + size: ByteSize | None = Field( + ..., description="Size of the directory if computed, None if unknown" + ) + + def as_path(self) -> Path: + return self.prefix class MultiPartUploadLinks(BaseModel): diff --git a/packages/aws-library/src/aws_library/s3/_utils.py b/packages/aws-library/src/aws_library/s3/_utils.py index 96ad59f57d3..51024f0f15a 100644 --- a/packages/aws-library/src/aws_library/s3/_utils.py +++ b/packages/aws-library/src/aws_library/s3/_utils.py @@ -2,6 +2,9 @@ from pydantic import ByteSize, TypeAdapter +from ._constants import S3_OBJECT_DELIMITER +from 
._models import S3ObjectPrefix + _MULTIPART_MAX_NUMBER_OF_PARTS: Final[int] = 10000 # this is artifically defined, if possible we keep a maximum number of requests for parallel @@ -34,3 +37,15 @@ def compute_num_file_chunks(file_size: ByteSize) -> tuple[int, ByteSize]: raise ValueError( msg, ) + + +def create_final_prefix( + prefix: S3ObjectPrefix | None, *, is_partial_prefix: bool +) -> str: + final_prefix = f"{prefix}" if prefix else "" + if prefix and not is_partial_prefix: + final_prefix = ( + f"{final_prefix.rstrip(S3_OBJECT_DELIMITER)}{S3_OBJECT_DELIMITER}" + ) + + return final_prefix diff --git a/packages/aws-library/tests/test_s3_client.py b/packages/aws-library/tests/test_s3_client.py index cca7a19fd78..e239861696d 100644 --- a/packages/aws-library/tests/test_s3_client.py +++ b/packages/aws-library/tests/test_s3_client.py @@ -18,14 +18,14 @@ from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, Final from unittest.mock import AsyncMock, Mock import aiofiles import botocore.exceptions import pytest from aiohttp import ClientSession -from aws_library.s3._client import S3ObjectKey, SimcoreS3API +from aws_library.s3._client import _AWS_MAX_ITEMS_PER_PAGE, S3ObjectKey, SimcoreS3API from aws_library.s3._constants import ( MULTIPART_COPY_THRESHOLD, MULTIPART_UPLOADS_MIN_TOTAL_SIZE, @@ -36,7 +36,7 @@ S3KeyNotFoundError, S3UploadNotFoundError, ) -from aws_library.s3._models import MultiPartUploadLinks +from aws_library.s3._models import MultiPartUploadLinks, S3DirectoryMetaData, S3MetaData from faker import Faker from models_library.api_schemas_storage.storage_schemas import ( S3BucketName, @@ -44,7 +44,7 @@ ) from models_library.basic_types import SHA256Str from moto.server import ThreadedMotoServer -from pydantic import AnyUrl, ByteSize, TypeAdapter +from pydantic import AnyUrl, ByteSize, NonNegativeInt, TypeAdapter from pytest_benchmark.plugin import BenchmarkFixture from pytest_mock import MockerFixture from pytest_simcore.helpers.comparing import ( @@ -368,35 +368,50 @@ def set_log_levels_for_noisy_libraries() -> None: @pytest.fixture -async def with_uploaded_folder_on_s3( +async def create_folder_on_s3( create_folder_of_size_with_multiple_files: Callable[ - [ByteSize, ByteSize, ByteSize, Path | None], Path + [ByteSize, ByteSize, ByteSize, Path | None, NonNegativeInt | None], Path ], upload_file: Callable[[Path, Path], Awaitable[UploadedFile]], directory_size: ByteSize, min_file_size: ByteSize, max_file_size: ByteSize, + depth: NonNegativeInt | None, +) -> Callable[[], Awaitable[list[UploadedFile]]]: + async def _() -> list[UploadedFile]: + # create random files of random size and upload to S3 + folder = create_folder_of_size_with_multiple_files( + ByteSize(directory_size), + ByteSize(min_file_size), + ByteSize(max_file_size), + None, + depth, + ) + list_uploaded_files = [] + + with log_context(logging.INFO, msg=f"uploading {folder}") as ctx: + list_uploaded_files = [ + await uploaded_file + async for uploaded_file in limited_as_completed( + ( + upload_file(file, folder.parent) + for file in folder.rglob("*") + if file.is_file() + ), + limit=20, + ) + ] + ctx.logger.info("uploaded %s files", len(list_uploaded_files)) + return list_uploaded_files + + return _ + + +@pytest.fixture +async def with_uploaded_folder_on_s3( + create_folder_on_s3: Callable[[], Awaitable[list[UploadedFile]]], ) -> list[UploadedFile]: - # create random files of random size and upload to S3 - folder = 
create_folder_of_size_with_multiple_files( - ByteSize(directory_size), ByteSize(min_file_size), ByteSize(max_file_size), None - ) - list_uploaded_files = [] - - with log_context(logging.INFO, msg=f"uploading {folder}") as ctx: - list_uploaded_files = [ - await uploaded_file - async for uploaded_file in limited_as_completed( - ( - upload_file(file, folder.parent) - for file in folder.rglob("*") - if file.is_file() - ), - limit=20, - ) - ] - ctx.logger.info("uploaded %s files", len(list_uploaded_files)) - return list_uploaded_files + return await create_folder_on_s3() @pytest.fixture @@ -438,9 +453,10 @@ async def _copier(src_prefix: str, dst_prefix: str) -> str: src_directory_metadata = await simcore_s3_api.get_directory_metadata( bucket=with_s3_bucket, prefix=src_prefix ) + assert src_directory_metadata.size is not None with log_context( logging.INFO, - msg=f"copying {src_prefix} [{ByteSize(src_directory_metadata.size).human_readable()}] to {dst_prefix}", + msg=f"copying {src_prefix} [{src_directory_metadata.size.human_readable()}] to {dst_prefix}", ) as ctx: progress_cb = _CopyProgressCallback( file_size=src_directory_metadata.size, @@ -519,6 +535,270 @@ async def test_http_check_bucket_connected( ) +_ROOT_LEVEL: Final[int] = -2 + + +def _get_paths_with_prefix( + uploaded_files: list[UploadedFile], *, prefix_level: int, path_prefix: Path | None +) -> tuple[set[Path], set[Path]]: + def _filter_by_prefix(uploaded_file: UploadedFile) -> bool: + return Path(uploaded_file.s3_key).is_relative_to(path_prefix or "") + + directories = { + Path(file.s3_key).parents[_ROOT_LEVEL - prefix_level] + for file in filter(_filter_by_prefix, uploaded_files) + if Path(file.s3_key).parent != path_prefix + } + files = { + Path(file.s3_key) + for file in filter(_filter_by_prefix, uploaded_files) + if Path(file.s3_key).parent == path_prefix + } + return directories, files + + +@pytest.mark.parametrize( + "directory_size, min_file_size, max_file_size, depth", + [ + ( + TypeAdapter(ByteSize).validate_python("1Mib"), + TypeAdapter(ByteSize).validate_python("1B"), + TypeAdapter(ByteSize).validate_python("10Kib"), + None, + ) + ], + ids=byte_size_ids, +) +async def test_count_objects( + mocked_s3_server_envs: EnvVarsDict, + with_s3_bucket: S3BucketName, + with_uploaded_folder_on_s3: list[UploadedFile], + simcore_s3_api: SimcoreS3API, +): + # assert pre-conditions + assert len(with_uploaded_folder_on_s3) >= 1, "wrong initialization of test!" 
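The expectations built in this test (and the listing tests below) hinge on S3's delimiter semantics: with Delimiter="/", every key below the first "/" after the prefix collapses into a single CommonPrefixes entry. A self-contained illustration of that grouping, mirroring what _get_paths_with_prefix computes on the client side (plain Python, no S3 involved; the key names are made up):

    keys = ["project/node-a/file1", "project/node-a/file2", "project/file3"]
    prefix = "project/"
    tails = [k[len(prefix):] for k in keys]
    # one entry per sub-"folder", plus the files sitting directly under the prefix
    directories = {t.split("/", 1)[0] + "/" for t in tails if "/" in t}
    files = {t for t in tails if "/" not in t}
    assert directories == {"node-a/"}
    assert files == {"file3"}
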
+ + def find_deepest_file(files: list[UploadedFile]) -> Path: + return Path(max(files, key=lambda f: f.s3_key.count("/")).s3_key) + + deepest_file_path = find_deepest_file(with_uploaded_folder_on_s3) + prefixes = deepest_file_path.parents[0].parts + + # Start from the root and go down to the directory containing the deepest file + for level in range(len(prefixes)): + current_prefix = ( + Path(prefixes[0]).joinpath(*prefixes[1:level]) if level > 0 else None + ) + + directories, files = _get_paths_with_prefix( + with_uploaded_folder_on_s3, prefix_level=level, path_prefix=current_prefix + ) + all_paths = directories | files + + num_objects = await simcore_s3_api.count_objects( + bucket=with_s3_bucket, prefix=current_prefix, start_after=None + ) + assert num_objects == len(all_paths) + + # get number on root is 1 + got = await simcore_s3_api.count_objects( + bucket=with_s3_bucket, prefix=None, start_after=None + ) + assert got == len(directories) + + +@pytest.mark.parametrize( + "directory_size, min_file_size, max_file_size, depth", + [ + ( + TypeAdapter(ByteSize).validate_python("1Mib"), + TypeAdapter(ByteSize).validate_python("1B"), + TypeAdapter(ByteSize).validate_python("10Kib"), + None, + ) + ], + ids=byte_size_ids, +) +async def test_list_objects_prefix( + mocked_s3_server_envs: EnvVarsDict, + with_s3_bucket: S3BucketName, + with_uploaded_folder_on_s3: list[UploadedFile], + simcore_s3_api: SimcoreS3API, +): + # assert pre-conditions + assert len(with_uploaded_folder_on_s3) >= 1, "wrong initialization of test!" + + def find_deepest_file(files: list[UploadedFile]) -> Path: + return Path(max(files, key=lambda f: f.s3_key.count("/")).s3_key) + + deepest_file_path = find_deepest_file(with_uploaded_folder_on_s3) + prefixes = deepest_file_path.parents[0].parts + + # Start from the root and go down to the directory containing the deepest file + for level in range(len(prefixes)): + current_prefix = ( + Path(prefixes[0]).joinpath(*prefixes[1:level]) if level > 0 else None + ) + + directories, files = _get_paths_with_prefix( + with_uploaded_folder_on_s3, prefix_level=level, path_prefix=current_prefix + ) + all_paths = directories | files + + objects, next_cursor = await simcore_s3_api.list_objects( + bucket=with_s3_bucket, prefix=current_prefix, start_after=None + ) + assert next_cursor is None + assert len(objects) == len(all_paths) + assert {_.as_path() for _ in objects} == all_paths + + # Check files and directories are correctly separated + received_files = {_ for _ in objects if isinstance(_, S3MetaData)} + received_directories = { + _ for _ in objects if isinstance(_, S3DirectoryMetaData) + } + assert len(received_files) == len(files) + assert len(received_directories) == len(directories) + + +async def test_list_objects_pagination_num_objects_limits( + faker: Faker, + mocked_s3_server_envs: EnvVarsDict, + with_s3_bucket: S3BucketName, + simcore_s3_api: SimcoreS3API, +): + with pytest.raises(ValueError, match=r"num_objects must be >= 1"): + await simcore_s3_api.list_objects( + bucket=with_s3_bucket, + prefix=None, + start_after=None, + limit=faker.pyint(max_value=0), + ) + + with pytest.raises(ValueError, match=r"num_objects must be <= \d+"): + await simcore_s3_api.list_objects( + bucket=with_s3_bucket, + prefix=None, + start_after=None, + limit=_AWS_MAX_ITEMS_PER_PAGE + 1, + ) + + +@pytest.mark.parametrize( + "directory_size, min_file_size, max_file_size, depth", + [ + ( + TypeAdapter(ByteSize).validate_python("1Mib"), + TypeAdapter(ByteSize).validate_python("1B"), + 
TypeAdapter(ByteSize).validate_python("10Kib"), + 0, + ) + ], + ids=byte_size_ids, +) +@pytest.mark.parametrize("limit", [10, 50, 300], ids=lambda x: f"limit={x}") +async def test_list_objects_pagination( + mocked_s3_server_envs: EnvVarsDict, + with_s3_bucket: S3BucketName, + with_uploaded_folder_on_s3: list[UploadedFile], + simcore_s3_api: SimcoreS3API, + limit: int, +): + total_num_files = len(with_uploaded_folder_on_s3) + # pre-condition + directories, files = _get_paths_with_prefix( + with_uploaded_folder_on_s3, prefix_level=0, path_prefix=None + ) + assert len(directories) == 1, "test pre-condition not fulfilled!" + assert not files + + first_level_prefix = next(iter(directories)) + first_level_directories, first_level_files = _get_paths_with_prefix( + with_uploaded_folder_on_s3, prefix_level=1, path_prefix=first_level_prefix + ) + assert ( + not first_level_directories + ), "test pre-condition not fulfilled, there should be only files for this test" + assert len(first_level_files) == total_num_files + + # now we will fetch the file objects according to the given limit + num_fetch = int(round(total_num_files / limit + 0.5)) + assert num_fetch >= 1 + start_after_key = None + for i in range(num_fetch - 1): + objects, next_cursor = await simcore_s3_api.list_objects( + bucket=with_s3_bucket, + prefix=first_level_prefix, + start_after=start_after_key, + limit=limit, + ) + assert len(objects) == limit, f"fetch {i} returned a wrong number of objects" + assert isinstance(objects[-1], S3MetaData) + start_after_key = objects[-1].object_key + # last fetch + objects, next_cursor = await simcore_s3_api.list_objects( + bucket=with_s3_bucket, + prefix=first_level_prefix, + start_after=start_after_key, + limit=limit, + ) + assert next_cursor is None + assert len(objects) == (total_num_files - (num_fetch - 1) * limit) + + +@pytest.mark.parametrize( + "directory_size, min_file_size, max_file_size, depth", + [ + ( + TypeAdapter(ByteSize).validate_python("1Mib"), + TypeAdapter(ByteSize).validate_python("1B"), + TypeAdapter(ByteSize).validate_python("10Kib"), + 0, + ) + ], + ids=byte_size_ids, +) +async def test_list_objects_partial_prefix( + mocked_s3_server_envs: EnvVarsDict, + with_s3_bucket: S3BucketName, + with_uploaded_folder_on_s3: list[UploadedFile], + simcore_s3_api: SimcoreS3API, +): + total_num_files = len(with_uploaded_folder_on_s3) + # pre-condition + directories, files = _get_paths_with_prefix( + with_uploaded_folder_on_s3, prefix_level=0, path_prefix=None + ) + assert len(directories) == 1, "test pre-condition not fulfilled!" 
+ assert not files + + first_level_prefix = next(iter(directories)) + first_level_directories, first_level_files = _get_paths_with_prefix( + with_uploaded_folder_on_s3, prefix_level=1, path_prefix=first_level_prefix + ) + assert ( + not first_level_directories + ), "test pre-condition not fulfilled, there should be only files for this test" + assert len(first_level_files) == total_num_files + + a_random_file = random.choice(list(first_level_files)) # noqa: S311 + a_partial_prefix = a_random_file.name[0:1] + expected_files = { + file for file in first_level_files if file.name.startswith(a_partial_prefix) + } + + # now we will fetch the file objects according to the given limit + objects, next_cursor = await simcore_s3_api.list_objects( + bucket=with_s3_bucket, + prefix=first_level_prefix / a_partial_prefix, + start_after=None, + is_partial_prefix=True, + ) + assert next_cursor is None + assert len(objects) == len(expected_files) + assert {_.as_path() for _ in objects} == expected_files + + async def test_get_file_metadata( mocked_s3_server_envs: EnvVarsDict, with_s3_bucket: S3BucketName, @@ -1126,12 +1406,13 @@ async def test_copy_file_invalid_raises( @pytest.mark.parametrize( - "directory_size, min_file_size, max_file_size", + "directory_size, min_file_size, max_file_size, depth", [ ( TypeAdapter(ByteSize).validate_python("1Mib"), TypeAdapter(ByteSize).validate_python("1B"), TypeAdapter(ByteSize).validate_python("10Kib"), + None, ) ], ids=byte_size_ids, @@ -1152,12 +1433,13 @@ async def test_get_directory_metadata( @pytest.mark.parametrize( - "directory_size, min_file_size, max_file_size", + "directory_size, min_file_size, max_file_size, depth", [ ( TypeAdapter(ByteSize).validate_python("1Mib"), TypeAdapter(ByteSize).validate_python("1B"), TypeAdapter(ByteSize).validate_python("10Kib"), + None, ) ], ids=byte_size_ids, @@ -1184,12 +1466,13 @@ async def test_get_directory_metadata_raises( @pytest.mark.parametrize( - "directory_size, min_file_size, max_file_size", + "directory_size, min_file_size, max_file_size, depth", [ ( TypeAdapter(ByteSize).validate_python("1Mib"), TypeAdapter(ByteSize).validate_python("1B"), TypeAdapter(ByteSize).validate_python("10Kib"), + None, ) ], ids=byte_size_ids, @@ -1220,12 +1503,13 @@ async def test_delete_file_recursively( @pytest.mark.parametrize( - "directory_size, min_file_size, max_file_size", + "directory_size, min_file_size, max_file_size, depth", [ ( TypeAdapter(ByteSize).validate_python("1Mib"), TypeAdapter(ByteSize).validate_python("1B"), TypeAdapter(ByteSize).validate_python("10Kib"), + None, ) ], ids=byte_size_ids, @@ -1258,12 +1542,13 @@ async def test_delete_file_recursively_raises( @pytest.mark.parametrize( - "directory_size, min_file_size, max_file_size", + "directory_size, min_file_size, max_file_size, depth", [ ( TypeAdapter(ByteSize).validate_python("1Mib"), TypeAdapter(ByteSize).validate_python("1B"), TypeAdapter(ByteSize).validate_python("10Kib"), + None, ) ], ids=byte_size_ids, @@ -1351,7 +1636,6 @@ def test_upload_file_performance( upload_file: Callable[[Path, Path | None], Awaitable[UploadedFile]], benchmark: BenchmarkFixture, ): - # create random files of random size and upload to S3 file = create_file_of_size(file_size) @@ -1362,17 +1646,19 @@ def run_async_test(*args, **kwargs) -> None: @pytest.mark.parametrize( - "directory_size, min_file_size, max_file_size", + "directory_size, min_file_size, max_file_size, depth", [ ( TypeAdapter(ByteSize).validate_python("1Mib"), TypeAdapter(ByteSize).validate_python("1B"), 
TypeAdapter(ByteSize).validate_python("10Kib"), + None, ), ( TypeAdapter(ByteSize).validate_python("500Mib"), TypeAdapter(ByteSize).validate_python("10Mib"), TypeAdapter(ByteSize).validate_python("50Mib"), + None, ), ], ids=byte_size_ids, diff --git a/packages/models-library/src/models_library/api_schemas_storage/storage_schemas.py b/packages/models-library/src/models_library/api_schemas_storage/storage_schemas.py index 2faa2462cfd..11c7025e2ac 100644 --- a/packages/models-library/src/models_library/api_schemas_storage/storage_schemas.py +++ b/packages/models-library/src/models_library/api_schemas_storage/storage_schemas.py @@ -8,13 +8,10 @@ from datetime import datetime from enum import Enum - -# /data-export +from pathlib import Path from typing import Annotated, Any, Literal, Self, TypeAlias from uuid import UUID -from models_library.projects import ProjectID -from models_library.users import UserID from pydantic import ( BaseModel, ByteSize, @@ -26,11 +23,18 @@ field_validator, model_validator, ) +from pydantic.config import JsonDict from pydantic.networks import AnyUrl -from ..basic_regex import DATCORE_DATASET_NAME_RE, S3_BUCKET_NAME_RE +from ..basic_regex import ( + DATCORE_COLLECTION_NAME_RE, + DATCORE_DATASET_NAME_RE, + DATCORE_FILE_ID_RE, + S3_BUCKET_NAME_RE, +) from ..basic_types import SHA256Str from ..generics import ListModel +from ..projects import ProjectID from ..projects_nodes_io import ( LocationID, LocationName, @@ -38,6 +42,7 @@ SimcoreS3FileID, StorageFileID, ) +from ..users import UserID ETag: TypeAlias = str @@ -46,6 +51,12 @@ DatCoreDatasetName: TypeAlias = Annotated[ str, StringConstraints(pattern=DATCORE_DATASET_NAME_RE) ] +DatCoreCollectionName: TypeAlias = Annotated[ + str, StringConstraints(pattern=DATCORE_COLLECTION_NAME_RE) +] +DatCorePackageName: TypeAlias = Annotated[ + str, StringConstraints(pattern=DATCORE_FILE_ID_RE) +] # / @@ -61,14 +72,20 @@ class FileLocation(BaseModel): name: LocationName id: LocationID + @staticmethod + def _update_json_schema_extra(schema: JsonDict) -> None: + schema.update( + { + "examples": [ + {"name": "simcore.s3", "id": 0}, + {"name": "datcore", "id": 1}, + ] + } + ) + model_config = ConfigDict( extra="forbid", - json_schema_extra={ - "examples": [ - {"name": "simcore.s3", "id": 0}, - {"name": "datcore", "id": 1}, - ] - }, + json_schema_extra=_update_json_schema_extra, ) @@ -79,35 +96,42 @@ class FileLocation(BaseModel): class DatasetMetaDataGet(BaseModel): dataset_id: UUID | DatCoreDatasetName display_name: str + + @staticmethod + def _update_json_schema_extra(schema: JsonDict) -> None: + schema.update( + { + "examples": [ + # simcore dataset + { + "dataset_id": "74a84992-8c99-47de-b88a-311c068055ea", + "display_name": "api", + }, + { + "dataset_id": "1c46752c-b096-11ea-a3c4-02420a00392e", + "display_name": "Octave JupyterLab", + }, + { + "dataset_id": "2de04d1a-f346-11ea-9c22-02420a00085a", + "display_name": "Sleepers", + }, + # datcore datasets + { + "dataset_id": "N:dataset:be862eb8-861e-4b36-afc3-997329dd02bf", + "display_name": "simcore-testing-bucket", + }, + { + "dataset_id": "N:dataset:9ad8adb0-8ea2-4be6-bc45-ecbec7546393", + "display_name": "YetAnotherTest", + }, + ] + } + ) + model_config = ConfigDict( extra="forbid", from_attributes=True, - json_schema_extra={ - "examples": [ - # simcore dataset - { - "dataset_id": "74a84992-8c99-47de-b88a-311c068055ea", - "display_name": "api", - }, - { - "dataset_id": "1c46752c-b096-11ea-a3c4-02420a00392e", - "display_name": "Octave JupyterLab", - }, - { - "dataset_id": 
"2de04d1a-f346-11ea-9c22-02420a00085a", - "display_name": "Sleepers", - }, - # datcore datasets - { - "dataset_id": "N:dataset:be862eb8-861e-4b36-afc3-997329dd02bf", - "display_name": "simcore-testing-bucket", - }, - { - "dataset_id": "N:dataset:9ad8adb0-8ea2-4be6-bc45-ecbec7546393", - "display_name": "YetAnotherTest", - }, - ] - }, + json_schema_extra=_update_json_schema_extra, ) @@ -171,84 +195,90 @@ class FileMetaDataGet(BaseModel): description="SHA256 message digest of the file content. Main purpose: cheap lookup.", ) + @staticmethod + def _update_json_schema_extra(schema: JsonDict) -> None: + schema.update( + { + "examples": [ + # typical S3 entry + { + "created_at": "2020-06-17 12:28:55.705340", + "entity_tag": "8711cf258714b2de5498f5a5ef48cc7b", + "file_id": "1c46752c-b096-11ea-a3c4-02420a00392e/e603724d-4af1-52a1-b866-0d4b792f8c4a/work.zip", + "file_name": "work.zip", + "file_size": 17866343, + "file_uuid": "1c46752c-b096-11ea-a3c4-02420a00392e/e603724d-4af1-52a1-b866-0d4b792f8c4a/work.zip", + "is_soft_link": False, + "last_modified": "2020-06-22 13:48:13.398000+00:00", + "location_id": 0, + "node_name": "JupyterLab Octave", + "project_name": "Octave JupyterLab", + }, + # typical directory entry + { + "created_at": "2020-06-17 12:28:55.705340", + "entity_tag": "8711cf258714b2de5498f5a5ef48cc7b", + "file_id": "9a759caa-9890-4537-8c26-8edefb7a4d7c/be165f45-ddbf-4911-a04d-bc0b885914ef/workspace", + "file_name": "workspace", + "file_size": -1, + "file_uuid": "9a759caa-9890-4537-8c26-8edefb7a4d7c/be165f45-ddbf-4911-a04d-bc0b885914ef/workspace", + "is_soft_link": False, + "last_modified": "2020-06-22 13:48:13.398000+00:00", + "location_id": 0, + "node_name": None, + "project_name": None, + "is_directory": True, + }, + # api entry (not soft link) + { + "created_at": "2020-06-17 12:28:55.705340", + "entity_tag": "8711cf258714b2de5498f5a5ef48cc7b", + "file_id": "api/7b6b4e3d-39ae-3559-8765-4f815a49984e/tmpf_qatpzx", + "file_name": "tmpf_qatpzx", + "file_size": 86, + "file_uuid": "api/7b6b4e3d-39ae-3559-8765-4f815a49984e/tmpf_qatpzx", + "is_soft_link": False, + "last_modified": "2020-06-22 13:48:13.398000+00:00", + "location_id": 0, + "node_name": None, + "project_name": None, + }, + # api entry (soft link) + { + "created_at": "2020-06-17 12:28:55.705340", + "entity_tag": "36aa3644f526655a6f557207e4fd25b8", + "file_id": "api/6f788ad9-0ad8-3d0d-9722-72f08c24a212/output_data.json", + "file_name": "output_data.json", + "file_size": 183, + "file_uuid": "api/6f788ad9-0ad8-3d0d-9722-72f08c24a212/output_data.json", + "is_soft_link": True, + "last_modified": "2020-06-22 13:48:13.398000+00:00", + "location_id": 0, + "node_name": None, + "project_name": None, + }, + # datcore entry + { + "created_at": "2020-05-28T15:48:34.386302+00:00", + "entity_tag": None, + "file_id": "N:package:ce145b61-7e4f-470b-a113-033653e86d3d", + "file_name": "templatetemplate.json", + "file_size": 238, + "file_uuid": "Kember Cardiac Nerve Model/templatetemplate.json", + "is_soft_link": False, + "last_modified": "2020-05-28T15:48:37.507387+00:00", + "location_id": 1, + "node_name": None, + "project_name": None, + }, + ] + } + ) + model_config = ConfigDict( extra="ignore", from_attributes=True, - json_schema_extra={ - "examples": [ - # typical S3 entry - { - "created_at": "2020-06-17 12:28:55.705340", - "entity_tag": "8711cf258714b2de5498f5a5ef48cc7b", - "file_id": "1c46752c-b096-11ea-a3c4-02420a00392e/e603724d-4af1-52a1-b866-0d4b792f8c4a/work.zip", - "file_name": "work.zip", - "file_size": 17866343, - "file_uuid": 
"1c46752c-b096-11ea-a3c4-02420a00392e/e603724d-4af1-52a1-b866-0d4b792f8c4a/work.zip", - "is_soft_link": False, - "last_modified": "2020-06-22 13:48:13.398000+00:00", - "location_id": 0, - "node_name": "JupyterLab Octave", - "project_name": "Octave JupyterLab", - }, - # typical directory entry - { - "created_at": "2020-06-17 12:28:55.705340", - "entity_tag": "8711cf258714b2de5498f5a5ef48cc7b", - "file_id": "9a759caa-9890-4537-8c26-8edefb7a4d7c/be165f45-ddbf-4911-a04d-bc0b885914ef/workspace", - "file_name": "workspace", - "file_size": -1, - "file_uuid": "9a759caa-9890-4537-8c26-8edefb7a4d7c/be165f45-ddbf-4911-a04d-bc0b885914ef/workspace", - "is_soft_link": False, - "last_modified": "2020-06-22 13:48:13.398000+00:00", - "location_id": 0, - "node_name": None, - "project_name": None, - "is_directory": True, - }, - # api entry (not soft link) - { - "created_at": "2020-06-17 12:28:55.705340", - "entity_tag": "8711cf258714b2de5498f5a5ef48cc7b", - "file_id": "api/7b6b4e3d-39ae-3559-8765-4f815a49984e/tmpf_qatpzx", - "file_name": "tmpf_qatpzx", - "file_size": 86, - "file_uuid": "api/7b6b4e3d-39ae-3559-8765-4f815a49984e/tmpf_qatpzx", - "is_soft_link": False, - "last_modified": "2020-06-22 13:48:13.398000+00:00", - "location_id": 0, - "node_name": None, - "project_name": None, - }, - # api entry (soft link) - { - "created_at": "2020-06-17 12:28:55.705340", - "entity_tag": "36aa3644f526655a6f557207e4fd25b8", - "file_id": "api/6f788ad9-0ad8-3d0d-9722-72f08c24a212/output_data.json", - "file_name": "output_data.json", - "file_size": 183, - "file_uuid": "api/6f788ad9-0ad8-3d0d-9722-72f08c24a212/output_data.json", - "is_soft_link": True, - "last_modified": "2020-06-22 13:48:13.398000+00:00", - "location_id": 0, - "node_name": None, - "project_name": None, - }, - # datcore entry - { - "created_at": "2020-05-28T15:48:34.386302+00:00", - "entity_tag": None, - "file_id": "N:package:ce145b61-7e4f-470b-a113-033653e86d3d", - "file_name": "templatetemplate.json", - "file_size": 238, - "file_uuid": "Kember Cardiac Nerve Model/templatetemplate.json", - "is_soft_link": False, - "last_modified": "2020-05-28T15:48:37.507387+00:00", - "location_id": 1, - "node_name": None, - "project_name": None, - }, - ] - }, + json_schema_extra=_update_json_schema_extra, ) @field_validator("location_id", mode="before") @@ -263,9 +293,6 @@ class FileMetaDataArray(RootModel[list[FileMetaDataGet]]): root: list[FileMetaDataGet] = Field(default_factory=list) -# /locations/{location_id}/files/{file_id} - - class LinkType(str, Enum): PRESIGNED = "PRESIGNED" S3 = "S3" @@ -285,23 +312,28 @@ class FileUploadSchema(BaseModel): urls: list[AnyUrl] links: FileUploadLinks - model_config = ConfigDict( - extra="forbid", - json_schema_extra={ - "examples": [ - # typical S3 entry - { - "chunk_size": "10000000", - "urls": [ - "https://s3.amazonaws.com/bucket-name/key-name?AWSAccessKeyId=AKIAIOSFODNN7EXAMPLE&Expires=1698298164&Signature=WObYM%2F%2B4t7O3%2FZS3Kegb%2Bc4%3D", - ], - "links": { - "abort_upload": "https://storage.com:3021/bucket-name/key-name:abort", - "complete_upload": "https://storage.com:3021/bucket-name/key-name:complete", + @staticmethod + def _update_json_schema_extra(schema: JsonDict) -> None: + schema.update( + { + "examples": [ + # typical S3 entry + { + "chunk_size": "10000000", + "urls": [ + "https://s3.amazonaws.com/bucket-name/key-name?AWSAccessKeyId=AKIAIOSFODNN7EXAMPLE&Expires=1698298164&Signature=WObYM%2F%2B4t7O3%2FZS3Kegb%2Bc4%3D", + ], + "links": { + "abort_upload": "https://storage.com:3021/bucket-name/key-name:abort", + 
"complete_upload": "https://storage.com:3021/bucket-name/key-name:complete", + }, }, - }, - ] - }, + ] + } + ) + + model_config = ConfigDict( + extra="forbid", json_schema_extra=_update_json_schema_extra ) @@ -370,3 +402,56 @@ def ensure_consistent_entries(self: Self) -> Self: class SoftCopyBody(BaseModel): link_id: SimcoreS3FileID + + +class PathMetaDataGet(BaseModel): + path: Annotated[Path, Field(description="the path to the current path")] + display_path: Annotated[ + Path, Field(description="the path to display with UUID replaced") + ] + + file_meta_data: Annotated[ + FileMetaDataGet | None, + Field(description="if filled, this is the file meta data of the s3 object"), + ] = None + + @staticmethod + def _update_json_schema_extra(schema: JsonDict) -> None: + schema.update( + { + "examples": [ + # ls no filter + { + "path": "f8da77a9-24b9-4eab-aee7-1f0608da1e3e", + "display_path": "my amazing project", + }, + # ls f8da77a9-24b9-4eab-aee7-1f0608da1e3e + { + "path": "f8da77a9-24b9-4eab-aee7-1f0608da1e3e/2f94f80f-633e-4dfa-a983-226b7babe3d7", + "display_path": "my amazing project/awesome node", + }, + # ls f8da77a9-24b9-4eab-aee7-1f0608da1e3e/2f94f80f-633e-4dfa-a983-226b7babe3d7 + { + "path": "f8da77a9-24b9-4eab-aee7-1f0608da1e3e/2f94f80f-633e-4dfa-a983-226b7babe3d7/outputs", + "display_path": "my amazing project/awesome node/outputs", + }, + # ls f8da77a9-24b9-4eab-aee7-1f0608da1e3e/2f94f80f-633e-4dfa-a983-226b7babe3d7/outputs + { + "path": "f8da77a9-24b9-4eab-aee7-1f0608da1e3e/2f94f80f-633e-4dfa-a983-226b7babe3d7/outputs/output5", + "display_path": "my amazing project/awesome node/outputs/output5", + }, + # ls f8da77a9-24b9-4eab-aee7-1f0608da1e3e/2f94f80f-633e-4dfa-a983-226b7babe3d7/outputs/output_5 + { + "path": f"f8da77a9-24b9-4eab-aee7-1f0608da1e3e/2f94f80f-633e-4dfa-a983-226b7babe3d7/outputs/output5/{FileMetaDataGet.model_json_schema()['examples'][0]['file_name']}", + "display_path": f"my amazing project/awesome node/outputs/output5/{FileMetaDataGet.model_json_schema()['examples'][0]['file_name']}", + "file_meta_data": FileMetaDataGet.model_json_schema()[ + "examples" + ][0], + }, + ] + } + ) + + model_config = ConfigDict( + extra="forbid", json_schema_extra=_update_json_schema_extra + ) diff --git a/packages/models-library/src/models_library/api_schemas_webserver/storage.py b/packages/models-library/src/models_library/api_schemas_webserver/storage.py index 10808b69049..0721d153db5 100644 --- a/packages/models-library/src/models_library/api_schemas_webserver/storage.py +++ b/packages/models-library/src/models_library/api_schemas_webserver/storage.py @@ -2,6 +2,8 @@ from pathlib import Path from typing import Any +from pydantic import BaseModel + from ..api_schemas_rpc_async_jobs.async_jobs import ( AsyncJobGet, AsyncJobId, @@ -11,10 +13,19 @@ from ..api_schemas_storage.data_export_async_jobs import DataExportTaskStartInput from ..progress_bar import ProgressReport from ..projects_nodes_io import LocationID +from ..rest_pagination import CursorQueryParameters from ..users import UserID from ._base import InputSchema, OutputSchema +class StorageLocationPathParams(BaseModel): + location_id: LocationID + + +class ListPathsQueryParams(InputSchema, CursorQueryParameters): + file_filter: Path | None = None + + class DataExportPost(InputSchema): paths: list[Path] diff --git a/packages/models-library/src/models_library/basic_regex.py b/packages/models-library/src/models_library/basic_regex.py index b65c0fd1fe1..33cff4a2f7d 100644 --- a/packages/models-library/src/models_library/basic_regex.py 
+++ b/packages/models-library/src/models_library/basic_regex.py @@ -1,8 +1,8 @@ -""" Regular expressions patterns to build pydantic contrained strings +"""Regular expressions patterns to build pydantic contrained strings - - Variants of the patterns with 'Named Groups' captured are suffixed with NG_RE +- Variants of the patterns with 'Named Groups' captured are suffixed with NG_RE - SEE tests_basic_regex.py for examples +SEE tests_basic_regex.py for examples """ # TODO: for every pattern we should have a formatter function # NOTE: some sites to manualy check ideas @@ -56,6 +56,7 @@ # Datcore file ID DATCORE_FILE_ID_RE = rf"^N:package:{UUID_RE_BASE}$" DATCORE_DATASET_NAME_RE = rf"^N:dataset:{UUID_RE_BASE}$" +DATCORE_COLLECTION_NAME_RE = rf"^N:collection:{UUID_RE_BASE}$" TWILIO_ALPHANUMERIC_SENDER_ID_RE = r"(?!^\d+$)^[a-zA-Z0-9\s]{2,11}$" diff --git a/packages/models-library/src/models_library/rest_pagination.py b/packages/models-library/src/models_library/rest_pagination.py index b2c82726798..5601a5968da 100644 --- a/packages/models-library/src/models_library/rest_pagination.py +++ b/packages/models-library/src/models_library/rest_pagination.py @@ -31,6 +31,23 @@ ).validate_python(20) +class CursorQueryParameters(RequestParameters): + """Use as pagination options in query parameters""" + + size: PageLimitInt = Field( + default=TypeAdapter(PageLimitInt).validate_python( + DEFAULT_NUMBER_OF_ITEMS_PER_PAGE + ), + description="maximum number of items to return (pagination)", + ) + cursor: Annotated[ + str | None, + Field( + description="unique identifier that represent the position in the dataset" + ), + ] = None + + class PageQueryParameters(RequestParameters): """Use as pagination options in query parameters""" diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/storage_utils.py b/packages/pytest-simcore/src/pytest_simcore/helpers/storage_utils.py index 25f3b2b6e49..f889cf31a77 100644 --- a/packages/pytest-simcore/src/pytest_simcore/helpers/storage_utils.py +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/storage_utils.py @@ -1,10 +1,13 @@ import logging import os +from dataclasses import dataclass from pathlib import Path from typing import Any, TypedDict import sqlalchemy as sa +from faker import Faker from models_library.basic_types import SHA256Str +from pydantic import ByteSize from simcore_postgres_database.storage_models import projects from sqlalchemy.ext.asyncio import AsyncEngine @@ -33,3 +36,21 @@ async def get_updated_project( class FileIDDict(TypedDict): path: Path sha256_checksum: SHA256Str + + +@dataclass(frozen=True, kw_only=True, slots=True) +class ProjectWithFilesParams: + num_nodes: int + allowed_file_sizes: tuple[ByteSize, ...] + workspace_files_count: int + allowed_file_checksums: tuple[SHA256Str, ...] 
= None # type: ignore # NOTE: OK for testing + + def __post_init__(self): + if self.allowed_file_checksums is None: + # generate some random checksums for the corresponding file sizes + faker = Faker() + checksums = tuple(faker.sha256() for _ in self.allowed_file_sizes) + object.__setattr__(self, "allowed_file_checksums", checksums) + + def __repr__(self) -> str: + return f"ProjectWithFilesParams: #nodes={self.num_nodes}, file sizes={[_.human_readable() for _ in self.allowed_file_sizes]}" diff --git a/packages/pytest-simcore/src/pytest_simcore/services_api_mocks_for_aiohttp_clients.py b/packages/pytest-simcore/src/pytest_simcore/services_api_mocks_for_aiohttp_clients.py index da657de6917..7b7e9746170 100644 --- a/packages/pytest-simcore/src/pytest_simcore/services_api_mocks_for_aiohttp_clients.py +++ b/packages/pytest-simcore/src/pytest_simcore/services_api_mocks_for_aiohttp_clients.py @@ -297,9 +297,7 @@ async def storage_v0_service_mock( aioresponses_mocker.get( get_file_metadata_pattern, status=status.HTTP_200_OK, - payload={ - "data": FileMetaDataGet.model_config["json_schema_extra"]["examples"][0] - }, + payload={"data": FileMetaDataGet.model_json_schema()["examples"][0]}, repeat=True, ) aioresponses_mocker.get( diff --git a/packages/pytest-simcore/src/pytest_simcore/simcore_storage_data_models.py b/packages/pytest-simcore/src/pytest_simcore/simcore_storage_data_models.py index b172b3b34df..4ca55f24bd6 100644 --- a/packages/pytest-simcore/src/pytest_simcore/simcore_storage_data_models.py +++ b/packages/pytest-simcore/src/pytest_simcore/simcore_storage_data_models.py @@ -2,29 +2,23 @@ # pylint: disable=unused-argument # pylint: disable=unused-variable -from collections import deque from collections.abc import AsyncIterator, Awaitable, Callable from contextlib import asynccontextmanager -from pathlib import Path -from random import choice, randint -from typing import Any, cast +from typing import Any import pytest import sqlalchemy as sa from faker import Faker -from models_library.basic_types import SHA256Str from models_library.projects import ProjectID -from models_library.projects_nodes_io import NodeID, SimcoreS3FileID, StorageFileID +from models_library.projects_nodes_io import NodeID from models_library.users import UserID -from pydantic import ByteSize, TypeAdapter -from servicelib.utils import limited_gather +from pydantic import TypeAdapter from simcore_postgres_database.models.project_to_groups import project_to_groups from simcore_postgres_database.storage_models import projects, users from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.ext.asyncio import AsyncConnection, AsyncEngine from .helpers.faker_factories import random_project, random_user -from .helpers.storage_utils import FileIDDict, get_updated_project @asynccontextmanager @@ -257,134 +251,3 @@ async def _creator( return new_node_id return _creator - - -async def _upload_file_and_update_project( - project_id: ProjectID, - node_id: NodeID, - *, - file_name: str | None, - file_id: StorageFileID | None, - file_sizes: tuple[ByteSize, ...], - file_checksums: tuple[SHA256Str, ...], - node_to_files_mapping: dict[NodeID, dict[SimcoreS3FileID, FileIDDict]], - upload_file: Callable[..., Awaitable[tuple[Path, SimcoreS3FileID]]], - create_simcore_file_id: Callable[ - [ProjectID, NodeID, str, Path | None], SimcoreS3FileID - ], - faker: Faker, -) -> None: - if file_name is None: - file_name = faker.file_name() - file_id = create_simcore_file_id(project_id, node_id, file_name, None) - checksum: 
SHA256Str = choice(file_checksums) # noqa: S311 - src_file, _ = await upload_file( - file_size=choice(file_sizes), # noqa: S311 - file_name=file_name, - file_id=file_id, - sha256_checksum=checksum, - ) - assert file_name is not None - assert file_id is not None - node_to_files_mapping[node_id][file_id] = { - "path": src_file, - "sha256_checksum": checksum, - } - - -@pytest.fixture -async def random_project_with_files( - sqlalchemy_async_engine: AsyncEngine, - create_project: Callable[..., Awaitable[dict[str, Any]]], - create_project_node: Callable[..., Awaitable[NodeID]], - create_simcore_file_id: Callable[ - [ProjectID, NodeID, str, Path | None], SimcoreS3FileID - ], - upload_file: Callable[..., Awaitable[tuple[Path, SimcoreS3FileID]]], - faker: Faker, -) -> Callable[ - [int, tuple[ByteSize, ...], tuple[SHA256Str, ...]], - Awaitable[tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]]]], -]: - async def _creator( - num_nodes: int = 12, - file_sizes: tuple[ByteSize, ...] = ( - TypeAdapter(ByteSize).validate_python("7Mib"), - TypeAdapter(ByteSize).validate_python("110Mib"), - TypeAdapter(ByteSize).validate_python("1Mib"), - ), - file_checksums: tuple[SHA256Str, ...] = ( - TypeAdapter(SHA256Str).validate_python( - "311e2e130d83cfea9c3b7560699c221b0b7f9e5d58b02870bd52b695d8b4aabd" - ), - TypeAdapter(SHA256Str).validate_python( - "08e297db979d3c84f6b072c2a1e269e8aa04e82714ca7b295933a0c9c0f62b2e" - ), - TypeAdapter(SHA256Str).validate_python( - "488f3b57932803bbf644593bd46d95599b1d4da1d63bc020d7ebe6f1c255f7f3" - ), - ), - ) -> tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]]]: - assert len(file_sizes) == len(file_checksums) - project = await create_project(name="random-project") - node_to_files_mapping: dict[NodeID, dict[SimcoreS3FileID, FileIDDict]] = {} - upload_tasks: deque[Awaitable] = deque() - for _node_index in range(num_nodes): - # Create a node with outputs (files and others) - project_id = ProjectID(project["uuid"]) - node_id = cast(NodeID, faker.uuid4(cast_to=None)) - output3_file_name = faker.file_name() - output3_file_id = create_simcore_file_id( - project_id, node_id, output3_file_name, Path("outputs/output_3") - ) - created_node_id = await create_project_node( - ProjectID(project["uuid"]), - node_id, - outputs={ - "output_1": faker.pyint(), - "output_2": faker.pystr(), - "output_3": f"{output3_file_id}", - }, - ) - assert created_node_id == node_id - - node_to_files_mapping[created_node_id] = {} - upload_tasks.append( - _upload_file_and_update_project( - project_id, - node_id, - file_name=output3_file_name, - file_id=output3_file_id, - file_sizes=file_sizes, - file_checksums=file_checksums, - upload_file=upload_file, - create_simcore_file_id=create_simcore_file_id, - faker=faker, - node_to_files_mapping=node_to_files_mapping, - ) - ) - - # add a few random files in the node workspace - upload_tasks.extend( - [ - _upload_file_and_update_project( - project_id, - node_id, - file_name=None, - file_id=None, - file_sizes=file_sizes, - file_checksums=file_checksums, - upload_file=upload_file, - create_simcore_file_id=create_simcore_file_id, - faker=faker, - node_to_files_mapping=node_to_files_mapping, - ) - for _ in range(randint(0, 3)) # noqa: S311 - ] - ) - await limited_gather(*upload_tasks, limit=10) - - project = await get_updated_project(sqlalchemy_async_engine, project["uuid"]) - return project, node_to_files_mapping - - return _creator diff --git a/packages/pytest-simcore/src/pytest_simcore/simcore_storage_service.py 
b/packages/pytest-simcore/src/pytest_simcore/simcore_storage_service.py index c0c6c26fbe1..02e3ddbc167 100644 --- a/packages/pytest-simcore/src/pytest_simcore/simcore_storage_service.py +++ b/packages/pytest-simcore/src/pytest_simcore/simcore_storage_service.py @@ -4,6 +4,7 @@ import os from collections.abc import Callable, Iterable from copy import deepcopy +from pathlib import Path import aiohttp import pytest @@ -12,7 +13,6 @@ from models_library.projects_nodes_io import NodeID, SimcoreS3FileID from pydantic import TypeAdapter from pytest_mock import MockerFixture -from servicelib.minio_utils import ServiceRetryPolicyUponInitialization from yarl import URL from .helpers.docker import get_service_published_port @@ -60,24 +60,34 @@ async def storage_service( return storage_endpoint -# TODO: this can be used by ANY of the simcore services! -@tenacity.retry(**ServiceRetryPolicyUponInitialization().kwargs) +@tenacity.retry( + wait=tenacity.wait_fixed(1), + stop=tenacity.stop_after_delay(30), + reraise=True, +) async def wait_till_storage_responsive(storage_endpoint: URL): - async with aiohttp.ClientSession() as session: - async with session.get(storage_endpoint.with_path("/v0/")) as resp: - assert resp.status == 200 - data = await resp.json() - assert "data" in data - assert data["data"] is not None + async with ( + aiohttp.ClientSession() as session, + session.get(storage_endpoint.with_path("/v0/")) as resp, + ): + assert resp.status == 200 + data = await resp.json() + assert "data" in data + assert data["data"] is not None @pytest.fixture def create_simcore_file_id() -> Callable[[ProjectID, NodeID, str], SimcoreS3FileID]: def _creator( - project_id: ProjectID, node_id: NodeID, file_name: str + project_id: ProjectID, + node_id: NodeID, + file_name: str, + file_base_path: Path | None = None, ) -> SimcoreS3FileID: - return TypeAdapter(SimcoreS3FileID).validate_python( - f"{project_id}/{node_id}/{file_name}" - ) + s3_file_name = file_name + if file_base_path: + s3_file_name = f"{file_base_path / file_name}" + clean_path = Path(f"{project_id}/{node_id}/{s3_file_name}") + return TypeAdapter(SimcoreS3FileID).validate_python(f"{clean_path}") return _creator diff --git a/packages/simcore-sdk/tests/unit/test_node_ports_v2_port.py b/packages/simcore-sdk/tests/unit/test_node_ports_v2_port.py index 516c828266f..6817d788faa 100644 --- a/packages/simcore-sdk/tests/unit/test_node_ports_v2_port.py +++ b/packages/simcore-sdk/tests/unit/test_node_ports_v2_port.py @@ -219,7 +219,7 @@ async def mock_filemanager(mocker: MockerFixture, e_tag: str, faker: Faker) -> N mocker.patch( "simcore_sdk.node_ports_common.filemanager._get_file_meta_data", return_value=TypeAdapter(FileMetaDataGet).validate_python( - FileMetaDataGet.model_config["json_schema_extra"]["examples"][0], + FileMetaDataGet.model_json_schema()["examples"][0], ), ) mocker.patch( diff --git a/packages/simcore-sdk/tests/unit/test_storage_client.py b/packages/simcore-sdk/tests/unit/test_storage_client.py index b02c8b2244b..feb61ed2042 100644 --- a/packages/simcore-sdk/tests/unit/test_storage_client.py +++ b/packages/simcore-sdk/tests/unit/test_storage_client.py @@ -179,7 +179,7 @@ async def test_get_file_metada( ) assert file_metadata assert file_metadata == FileMetaDataGet.model_validate( - FileMetaDataGet.model_config["json_schema_extra"]["examples"][0] + FileMetaDataGet.model_json_schema()["examples"][0] ) diff --git a/services/api-server/tests/unit/test_models_schemas_files.py b/services/api-server/tests/unit/test_models_schemas_files.py index 
3a57327e324..bd7cfddfaf8 100644 --- a/services/api-server/tests/unit/test_models_schemas_files.py +++ b/services/api-server/tests/unit/test_models_schemas_files.py @@ -83,7 +83,7 @@ async def test_create_filemetadata_from_starlette_uploadfile( def test_convert_between_file_models(): storage_file_meta = StorageFileMetaData( - **StorageFileMetaData.model_config["json_schema_extra"]["examples"][1] + **StorageFileMetaData.model_json_schema()["examples"][1] ) storage_file_meta.file_id = TypeAdapter(StorageFileID).validate_python( f"api/{uuid4()}/extensionless" diff --git a/services/autoscaling/requirements/_base.txt b/services/autoscaling/requirements/_base.txt index e1fb697d2a2..e6d9bfc6ca5 100644 --- a/services/autoscaling/requirements/_base.txt +++ b/services/autoscaling/requirements/_base.txt @@ -752,13 +752,13 @@ typer==0.15.1 # -r requirements/../../../packages/aws-library/requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/../../../packages/settings-library/requirements/_base.in -types-aiobotocore==2.15.2.post3 +types-aiobotocore==2.19.0 # via -r requirements/../../../packages/aws-library/requirements/_base.in -types-aiobotocore-ec2==2.15.2 +types-aiobotocore-ec2==2.19.0 # via types-aiobotocore -types-aiobotocore-s3==2.15.2.post1 +types-aiobotocore-s3==2.19.0 # via types-aiobotocore -types-aiobotocore-ssm==2.15.2 +types-aiobotocore-ssm==2.19.0 # via types-aiobotocore types-awscrt==0.23.3 # via botocore-stubs diff --git a/services/autoscaling/requirements/_test.txt b/services/autoscaling/requirements/_test.txt index 9cfd356c089..56c4a30e303 100644 --- a/services/autoscaling/requirements/_test.txt +++ b/services/autoscaling/requirements/_test.txt @@ -298,21 +298,21 @@ sympy==1.13.3 # via cfn-lint termcolor==2.5.0 # via pytest-sugar -types-aiobotocore==2.15.2.post3 +types-aiobotocore==2.19.0 # via # -c requirements/_base.txt # -r requirements/_test.in -types-aiobotocore-ec2==2.15.2 +types-aiobotocore-ec2==2.19.0 # via # -c requirements/_base.txt # types-aiobotocore -types-aiobotocore-iam==2.15.2 +types-aiobotocore-iam==2.19.0 # via types-aiobotocore -types-aiobotocore-s3==2.15.2.post1 +types-aiobotocore-s3==2.19.0 # via # -c requirements/_base.txt # types-aiobotocore -types-aiobotocore-ssm==2.15.2 +types-aiobotocore-ssm==2.19.0 # via # -c requirements/_base.txt # types-aiobotocore diff --git a/services/clusters-keeper/requirements/_base.txt b/services/clusters-keeper/requirements/_base.txt index 19d74db0e9d..c27de4d8e5d 100644 --- a/services/clusters-keeper/requirements/_base.txt +++ b/services/clusters-keeper/requirements/_base.txt @@ -750,13 +750,13 @@ typer==0.15.1 # -r requirements/../../../packages/aws-library/requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/../../../packages/settings-library/requirements/_base.in -types-aiobotocore==2.15.2.post3 +types-aiobotocore==2.19.0 # via -r requirements/../../../packages/aws-library/requirements/_base.in -types-aiobotocore-ec2==2.15.2 +types-aiobotocore-ec2==2.19.0 # via types-aiobotocore -types-aiobotocore-s3==2.15.2.post1 +types-aiobotocore-s3==2.19.0 # via types-aiobotocore -types-aiobotocore-ssm==2.15.2 +types-aiobotocore-ssm==2.19.0 # via types-aiobotocore types-awscrt==0.23.3 # via botocore-stubs diff --git 
a/services/docker-api-proxy/requirements/_test.txt b/services/docker-api-proxy/requirements/_test.txt index 4e719d9681e..3052c7cece0 100644 --- a/services/docker-api-proxy/requirements/_test.txt +++ b/services/docker-api-proxy/requirements/_test.txt @@ -224,6 +224,8 @@ protobuf==5.29.3 # opentelemetry-proto psutil==6.1.1 # via -r requirements/../../../packages/service-library/requirements/_base.in +pycryptodome==3.21.0 + # via stream-zip pydantic==2.10.6 # via # -c requirements/../../../packages/common-library/requirements/../../../requirements/constraints.txt @@ -391,6 +393,8 @@ starlette==0.45.3 # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../requirements/constraints.txt # fastapi +stream-zip==0.0.83 + # via -r requirements/../../../packages/service-library/requirements/_base.in tenacity==9.0.0 # via # -r requirements/../../../packages/service-library/requirements/_base.in diff --git a/services/efs-guardian/requirements/_base.txt b/services/efs-guardian/requirements/_base.txt index 0fba0025cdb..20fca38938e 100644 --- a/services/efs-guardian/requirements/_base.txt +++ b/services/efs-guardian/requirements/_base.txt @@ -754,13 +754,13 @@ typer==0.12.5 # -r requirements/../../../packages/aws-library/requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/../../../packages/settings-library/requirements/_base.in -types-aiobotocore==2.15.2 +types-aiobotocore==2.19.0 # via -r requirements/../../../packages/aws-library/requirements/_base.in -types-aiobotocore-ec2==2.15.2 +types-aiobotocore-ec2==2.19.0 # via types-aiobotocore -types-aiobotocore-s3==2.15.2 +types-aiobotocore-s3==2.19.0 # via types-aiobotocore -types-aiobotocore-ssm==2.15.2 +types-aiobotocore-ssm==2.19.0 # via types-aiobotocore types-awscrt==0.22.0 # via botocore-stubs diff --git a/services/resource-usage-tracker/requirements/_base.txt b/services/resource-usage-tracker/requirements/_base.txt index 429f3e3d53e..f46019e65e0 100644 --- a/services/resource-usage-tracker/requirements/_base.txt +++ b/services/resource-usage-tracker/requirements/_base.txt @@ -802,13 +802,13 @@ typer==0.12.3 # -r requirements/../../../packages/service-library/requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/../../../packages/settings-library/requirements/_base.in # -r requirements/_base.in -types-aiobotocore==2.12.1 +types-aiobotocore==2.19.0 # via -r requirements/../../../packages/aws-library/requirements/_base.in -types-aiobotocore-ec2==2.12.1 +types-aiobotocore-ec2==2.19.0 # via types-aiobotocore -types-aiobotocore-s3==2.12.1 +types-aiobotocore-s3==2.19.0 # via types-aiobotocore -types-aiobotocore-ssm==2.12.3 +types-aiobotocore-ssm==2.19.0 # via types-aiobotocore types-awscrt==0.20.5 # via botocore-stubs diff --git a/services/storage/openapi.json b/services/storage/openapi.json index 605354d0562..79c56f056fa 100644 --- a/services/storage/openapi.json +++ b/services/storage/openapi.json @@ -2,7 +2,7 @@ "openapi": "3.1.0", "info": { "title": "simcore_service_storage", - "description": "Service to auto-scale swarm", + "description": "Service that manages osparc storage backend", "version": "0.6.0" }, "paths": { @@ -979,6 +979,105 @@ } } }, + "/v0/locations/{location_id}/paths": { + "get": { + "tags": [ + "files" + ], + "summary": "List Paths", + "description": "Returns one level of 
files (paginated)", + "operationId": "list_paths_v0_locations__location_id__paths_get", + "parameters": [ + { + "name": "location_id", + "in": "path", + "required": true, + "schema": { + "type": "integer", + "title": "Location Id" + } + }, + { + "name": "user_id", + "in": "query", + "required": true, + "schema": { + "type": "integer", + "exclusiveMinimum": true, + "title": "User Id", + "minimum": 0 + } + }, + { + "name": "file_filter", + "in": "query", + "required": false, + "schema": { + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "null" + } + ], + "title": "File Filter" + } + }, + { + "name": "cursor", + "in": "query", + "required": false, + "schema": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cursor" + } + }, + { + "name": "size", + "in": "query", + "required": false, + "schema": { + "type": "integer", + "maximum": 100, + "minimum": 0, + "default": 50, + "title": "Size" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CursorPage_PathMetaDataGet_" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, "/v0/simcore-s3:access": { "post": { "tags": [ @@ -1320,6 +1419,82 @@ ], "title": "AppStatusCheck" }, + "CursorPage_PathMetaDataGet_": { + "properties": { + "items": { + "items": { + "$ref": "#/components/schemas/PathMetaDataGet" + }, + "type": "array", + "title": "Items" + }, + "total": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Total", + "description": "Total items" + }, + "current_page": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Current Page", + "description": "Cursor to refetch the current page" + }, + "current_page_backwards": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Current Page Backwards", + "description": "Cursor to refetch the current page starting from the last item" + }, + "previous_page": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Previous Page", + "description": "Cursor for the previous page" + }, + "next_page": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Next Page", + "description": "Cursor for the next page" + } + }, + "type": "object", + "required": [ + "items" + ], + "title": "CursorPage[PathMetaDataGet]" + }, "DatasetMetaDataGet": { "properties": { "dataset_id": { @@ -2256,6 +2431,40 @@ ], "title": "LinkType" }, + "PathMetaDataGet": { + "properties": { + "path": { + "type": "string", + "format": "path", + "title": "Path", + "description": "the path to the current path" + }, + "display_path": { + "type": "string", + "format": "path", + "title": "Display Path", + "description": "the path to display with UUID replaced" + }, + "file_meta_data": { + "anyOf": [ + { + "$ref": "#/components/schemas/FileMetaDataGet" + }, + { + "type": "null" + } + ], + "description": "if filled, this is the file meta data of the s3 object" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "path", + "display_path" + ], + "title": "PathMetaDataGet" + }, "S3Settings": { "properties": { "S3_ACCESS_KEY": { diff --git a/services/storage/requirements/_base.txt b/services/storage/requirements/_base.txt index f947f8c7ba6..dac31468043 100644 
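The OpenAPI fragment above describes the new cursor-paginated `GET /v0/locations/{location_id}/paths` route. As a rough sketch of how a consumer is expected to drive it (the storage base URL and the `user_id`/`size` values are assumptions for illustration, not part of this PR), something like the following walks all pages by feeding `next_page` back as the `cursor` query parameter:

```python
# Hypothetical client walking the cursor-paginated paths endpoint.
# Base URL and parameter values are illustrative only.
import httpx

STORAGE_URL = "http://storage:8080"  # assumed address of the storage service


def iter_paths(location_id: int, user_id: int, file_filter: str | None = None):
    """Yield every PathMetaDataGet item, following the `next_page` cursor."""
    params: dict[str, str | int] = {"user_id": user_id, "size": 50}
    if file_filter is not None:
        params["file_filter"] = file_filter
    cursor: str | None = None
    with httpx.Client(base_url=STORAGE_URL) as client:
        while True:
            if cursor:
                params["cursor"] = cursor
            response = client.get(
                f"/v0/locations/{location_id}/paths", params=params
            )
            response.raise_for_status()
            page = response.json()
            yield from page["items"]
            cursor = page.get("next_page")
            if not cursor:  # no further page
                return
```

The value returned in `next_page` is an opaque string produced by `fastapi-pagination`; the client only needs to echo it back unchanged.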
--- a/services/storage/requirements/_base.txt +++ b/services/storage/requirements/_base.txt @@ -831,7 +831,7 @@ types-aiobotocore==2.19.0 # -r requirements/_base.in types-aiobotocore-ec2==2.19.0 # via types-aiobotocore -types-aiobotocore-s3==2.19.0.post1 +types-aiobotocore-s3==2.19.0 # via types-aiobotocore types-aiobotocore-ssm==2.19.0 # via types-aiobotocore diff --git a/services/storage/src/simcore_service_storage/api/rest/_files.py b/services/storage/src/simcore_service_storage/api/rest/_files.py index 3b1bb4c8a46..ca92fb1079f 100644 --- a/services/storage/src/simcore_service_storage/api/rest/_files.py +++ b/services/storage/src/simcore_service_storage/api/rest/_files.py @@ -81,6 +81,7 @@ async def get_file_metadata( user_agent: Annotated[str | None, Header()], request: Request, ): + # NOTE: Used by legacy dynamic services -> MUST BE BACKWARDS COMPATIBLE dsm = get_dsm_provider(request.app).get(location_id) try: data = await dsm.get_file( @@ -133,6 +134,7 @@ async def download_file( query_params: Annotated[FileDownloadQueryParams, Depends()], request: Request, ) -> Envelope[FileDownloadResponse]: + # NOTE: Used by legacy dynamic services -> MUST BE BACKWARDS COMPATIBLE dsm = get_dsm_provider(request.app).get(location_id) link = await dsm.create_file_download_link( query_params.user_id, file_id, query_params.link_type @@ -178,6 +180,7 @@ async def upload_file( Use-case v2.2: if query.file_size > 0 and query.link_type=presigned or None, returns 1 or more presigned links depending on the file size (limited to a single 5TB file) Use-case v2.3: if query.link_type=s3 and query.file_size>=0, returns a single s3 direct link (limited to a single 5TB file) """ + # NOTE: Used by legacy dynamic services with single presigned link -> MUST BE BACKWARDS COMPATIBLE dsm = get_dsm_provider(request.app).get(location_id) links: UploadLinks = await dsm.create_file_upload_links( user_id=query_params.user_id, diff --git a/services/storage/src/simcore_service_storage/api/rest/_locations.py b/services/storage/src/simcore_service_storage/api/rest/_locations.py index 133c65e2005..ec33f8e31c7 100644 --- a/services/storage/src/simcore_service_storage/api/rest/_locations.py +++ b/services/storage/src/simcore_service_storage/api/rest/_locations.py @@ -5,7 +5,6 @@ from models_library.api_schemas_storage.storage_schemas import FileLocation from models_library.generics import Envelope -# Exclusive for simcore-s3 storage ----------------------- from ...dsm import get_dsm_provider from ...models import StorageQueryParamsBase @@ -16,7 +15,6 @@ ) -# HANDLERS --------------------------------------------------- @router.get( "/locations", status_code=status.HTTP_200_OK, diff --git a/services/storage/src/simcore_service_storage/api/rest/_paths.py b/services/storage/src/simcore_service_storage/api/rest/_paths.py new file mode 100644 index 00000000000..3d1d3b64340 --- /dev/null +++ b/services/storage/src/simcore_service_storage/api/rest/_paths.py @@ -0,0 +1,45 @@ +import logging +from pathlib import Path +from typing import Annotated + +from fastapi import APIRouter, Depends +from fastapi_pagination import create_page +from fastapi_pagination.cursor import CursorPage, CursorParams +from models_library.api_schemas_storage.storage_schemas import PathMetaDataGet +from models_library.users import UserID + +from ...dsm_factory import BaseDataManager +from .dependencies.dsm_prodiver import get_data_manager + +_logger = logging.getLogger(__name__) + +router = APIRouter( + tags=[ + "files", + ], +) + + +@router.get( + 
"/locations/{location_id}/paths", + response_model=CursorPage[PathMetaDataGet], +) +async def list_paths( + page_params: Annotated[CursorParams, Depends()], + dsm: Annotated[BaseDataManager, Depends(get_data_manager)], + user_id: UserID, + file_filter: Path | None = None, +): + """Returns one level of files (paginated)""" + items, next_cursor, total_number = await dsm.list_paths( + user_id=user_id, + file_filter=file_filter, + limit=page_params.size, + cursor=page_params.to_raw_params().cursor, + ) + return create_page( + [_.to_api_model() for _ in items], + total=total_number, + params=page_params, + next_=next_cursor, + ) diff --git a/services/storage/src/simcore_service_storage/api/rest/dependencies/__init__.py b/services/storage/src/simcore_service_storage/api/rest/dependencies/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/services/storage/src/simcore_service_storage/api/rest/dependencies/application.py b/services/storage/src/simcore_service_storage/api/rest/dependencies/application.py new file mode 100644 index 00000000000..706818ca793 --- /dev/null +++ b/services/storage/src/simcore_service_storage/api/rest/dependencies/application.py @@ -0,0 +1,18 @@ +# mypy: disable-error-code=truthy-function +from fastapi import Request +from servicelib.fastapi.dependencies import get_app, get_reverse_url_mapper + +from ....core.settings import ApplicationSettings, get_application_settings + + +def get_settings(request: Request) -> ApplicationSettings: + return get_application_settings(request.app) + + +assert get_reverse_url_mapper # nosec +assert get_app # nosec + +__all__: tuple[str, ...] = ( + "get_app", + "get_reverse_url_mapper", +) diff --git a/services/storage/src/simcore_service_storage/api/rest/dependencies/dsm_prodiver.py b/services/storage/src/simcore_service_storage/api/rest/dependencies/dsm_prodiver.py new file mode 100644 index 00000000000..a0f16979e25 --- /dev/null +++ b/services/storage/src/simcore_service_storage/api/rest/dependencies/dsm_prodiver.py @@ -0,0 +1,23 @@ +from typing import Annotated + +from fastapi import Depends, FastAPI +from models_library.projects_nodes_io import LocationID +from servicelib.fastapi.dependencies import get_app + +from ....dsm import get_dsm_provider +from ....dsm_factory import BaseDataManager, DataManagerProvider + + +def get_data_manager_provider( + app: Annotated[FastAPI, Depends(get_app)], +) -> DataManagerProvider: + return get_dsm_provider(app) + + +async def get_data_manager( + location_id: LocationID, + data_manager_provider: Annotated[ + DataManagerProvider, Depends(get_data_manager_provider) + ], +) -> BaseDataManager: + return data_manager_provider.get(location_id) diff --git a/services/storage/src/simcore_service_storage/api/rest/routes.py b/services/storage/src/simcore_service_storage/api/rest/routes.py index bfba2ee5a22..bb77c1d3f77 100644 --- a/services/storage/src/simcore_service_storage/api/rest/routes.py +++ b/services/storage/src/simcore_service_storage/api/rest/routes.py @@ -1,23 +1,15 @@ from fastapi import APIRouter, FastAPI -from . import _datasets, _files, _health, _locations, _simcore_s3 +from . 
import _datasets, _files, _health, _locations, _paths, _simcore_s3 v0_router = APIRouter() -# health health_router = _health.router v0_router.include_router(_health.router) - -# locations v0_router.include_router(_locations.router) - -# datasets v0_router.include_router(_datasets.router) - -# files v0_router.include_router(_files.router) - -# simcore-s3 +v0_router.include_router(_paths.router) v0_router.include_router(_simcore_s3.router) diff --git a/services/storage/src/simcore_service_storage/core/application.py b/services/storage/src/simcore_service_storage/core/application.py index cfb4839079f..bc875ac70ba 100644 --- a/services/storage/src/simcore_service_storage/core/application.py +++ b/services/storage/src/simcore_service_storage/core/application.py @@ -8,6 +8,7 @@ from common_library.basic_types import BootModeEnum from fastapi import FastAPI from fastapi.middleware.gzip import GZipMiddleware +from fastapi_pagination import add_pagination from servicelib.fastapi import timing_middleware from servicelib.fastapi.client_session import setup_client_session from servicelib.fastapi.openapi import override_fastapi_openapi_method @@ -64,13 +65,14 @@ def create_app(settings: ApplicationSettings) -> FastAPI: debug=settings.SC_BOOT_MODE in [BootModeEnum.DEBUG, BootModeEnum.DEVELOPMENT, BootModeEnum.LOCAL], title=APP_NAME, - description="Service to auto-scale swarm", + description="Service that manages osparc storage backend", version=API_VERSION, openapi_url=f"/api/{API_VTAG}/openapi.json", docs_url="/dev/doc", redoc_url=None, # default disabled ) override_fastapi_openapi_method(app) + add_pagination(app) # STATE app.state.settings = settings diff --git a/services/storage/src/simcore_service_storage/datcore_dsm.py b/services/storage/src/simcore_service_storage/datcore_dsm.py index 3c9c50e3acc..b1def97a635 100644 --- a/services/storage/src/simcore_service_storage/datcore_dsm.py +++ b/services/storage/src/simcore_service_storage/datcore_dsm.py @@ -1,8 +1,13 @@ +import contextlib from dataclasses import dataclass +from pathlib import Path +import arrow from fastapi import FastAPI from models_library.api_schemas_storage.storage_schemas import ( + DatCoreCollectionName, DatCoreDatasetName, + DatCorePackageName, LinkType, UploadedPart, ) @@ -10,11 +15,19 @@ from models_library.projects import ProjectID from models_library.projects_nodes_io import LocationID, LocationName, StorageFileID from models_library.users import UserID -from pydantic import AnyUrl, ByteSize +from pydantic import AnyUrl, ByteSize, NonNegativeInt, TypeAdapter, ValidationError from .constants import DATCORE_ID, DATCORE_STR from .dsm_factory import BaseDataManager -from .models import DatasetMetaData, FileMetaData, UploadLinks +from .exceptions.errors import DatCoreCredentialsMissingError +from .models import ( + DatasetMetaData, + FileMetaData, + GenericCursor, + PathMetaData, + TotalNumber, + UploadLinks, +) from .modules.datcore_adapter import datcore_adapter from .modules.datcore_adapter.datcore_adapter_exceptions import ( DatcoreAdapterMultipleFilesError, @@ -22,11 +35,30 @@ from .modules.db.tokens import get_api_token_and_secret +def _check_api_credentials( + api_token: str | None, api_secret: str | None +) -> tuple[str, str]: + if not api_token or not api_secret: + raise DatCoreCredentialsMissingError + assert api_token is not None + assert api_secret is not None + return api_token, api_secret + + +def _is_collection(file_filter: Path) -> bool: + with contextlib.suppress(ValidationError): + 
TypeAdapter(DatCoreCollectionName).validate_python(file_filter.parts[1]) + return True + return False + + @dataclass class DatCoreDataManager(BaseDataManager): app: FastAPI - async def _get_datcore_tokens(self, user_id: UserID): + async def _get_datcore_tokens( + self, user_id: UserID + ) -> tuple[str | None, str | None]: return await get_api_token_and_secret(self.app, user_id) @classmethod @@ -47,14 +79,112 @@ async def authorized(self, user_id: UserID) -> bool: async def list_datasets(self, user_id: UserID) -> list[DatasetMetaData]: api_token, api_secret = await self._get_datcore_tokens(user_id) - return await datcore_adapter.list_datasets(self.app, api_token, api_secret) + api_token, api_secret = _check_api_credentials(api_token, api_secret) + return await datcore_adapter.list_all_datasets(self.app, api_token, api_secret) async def list_files_in_dataset( self, user_id: UserID, dataset_id: str, *, expand_dirs: bool ) -> list[FileMetaData]: api_token, api_secret = await self._get_datcore_tokens(user_id) + api_token, api_secret = _check_api_credentials(api_token, api_secret) return await datcore_adapter.list_all_files_metadatas_in_dataset( - self.app, user_id, api_token, api_secret, DatCoreDatasetName(dataset_id) + self.app, user_id, api_token, api_secret, dataset_id + ) + + async def list_paths( + self, + user_id: UserID, + *, + file_filter: Path | None, + cursor: GenericCursor | None, + limit: NonNegativeInt, + ) -> tuple[list[PathMetaData], GenericCursor | None, TotalNumber | None]: + """returns a page of the file meta data a user has access to""" + api_token, api_secret = await self._get_datcore_tokens(user_id) + api_token, api_secret = _check_api_credentials(api_token, api_secret) + if not file_filter: + datasets, next_cursor, total = await datcore_adapter.list_datasets( + self.app, + api_key=api_token, + api_secret=api_secret, + cursor=cursor, + limit=limit, + ) + return ( + [ + PathMetaData( + path=Path(f"{dataset.dataset_id}"), + display_path=Path(f"{dataset.display_name}"), + location_id=self.location_id, + location=self.location_name, + bucket_name="fake", + project_id=None, + node_id=None, + user_id=user_id, + created_at=arrow.utcnow().datetime, + last_modified=arrow.utcnow().datetime, + file_meta_data=None, + ) + for dataset in datasets + ], + next_cursor, + total, + ) + assert len(file_filter.parts) + + if len(file_filter.parts) == 1: + # this is looking into a dataset + return await datcore_adapter.list_top_level_objects_in_dataset( + self.app, + user_id=user_id, + api_key=api_token, + api_secret=api_secret, + dataset_id=TypeAdapter(DatCoreDatasetName).validate_python( + file_filter.parts[0] + ), + cursor=cursor, + limit=limit, + ) + assert len(file_filter.parts) == 2 + + if _is_collection(file_filter): + # this is a collection + return await datcore_adapter.list_top_level_objects_in_collection( + self.app, + user_id=user_id, + api_key=api_token, + api_secret=api_secret, + dataset_id=TypeAdapter(DatCoreDatasetName).validate_python( + file_filter.parts[0] + ), + collection_id=TypeAdapter(DatCoreCollectionName).validate_python( + file_filter.parts[1] + ), + cursor=cursor, + limit=limit, + ) + assert TypeAdapter(DatCorePackageName).validate_python( + file_filter.parts[1] + ) # nosec + + # only other option is a file or maybe a partial?? 
that would be bad + return ( + [ + await datcore_adapter.get_package_file_as_path( + self.app, + user_id=user_id, + api_key=api_token, + api_secret=api_secret, + dataset_id=TypeAdapter(DatCoreDatasetName).validate_python( + file_filter.parts[0] + ), + package_id=TypeAdapter(DatCorePackageName).validate_python( + file_filter.parts[1] + ), + ) + ], + None, + 1, ) async def list_files( @@ -66,34 +196,37 @@ async def list_files( project_id: ProjectID | None, ) -> list[FileMetaData]: api_token, api_secret = await self._get_datcore_tokens(user_id) + api_token, api_secret = _check_api_credentials(api_token, api_secret) return await datcore_adapter.list_all_datasets_files_metadatas( self.app, user_id, api_token, api_secret ) async def get_file(self, user_id: UserID, file_id: StorageFileID) -> FileMetaData: api_token, api_secret = await self._get_datcore_tokens(user_id) + api_token, api_secret = _check_api_credentials(api_token, api_secret) package_files = await datcore_adapter.get_package_files( - self.app, api_token, api_secret, file_id + self.app, api_key=api_token, api_secret=api_secret, package_id=file_id ) if not len(package_files) == 1: raise DatcoreAdapterMultipleFilesError( msg=f"{len(package_files)} files in package, this breaks the current assumption" ) - resp_data = package_files[0]["content"] + + file = package_files[0] return FileMetaData( file_uuid=file_id, location_id=DATCORE_ID, location=DATCORE_STR, - bucket_name=resp_data["s3bucket"], + bucket_name=file.s3_bucket, object_name=file_id, - file_name=resp_data["filename"], + file_name=file.filename, file_id=file_id, - file_size=resp_data["size"], - created_at=resp_data["createdAt"], - last_modified=resp_data["updatedAt"], + file_size=file.size, + created_at=file.created_at, + last_modified=file.updated_at, project_id=None, node_id=None, user_id=user_id, @@ -128,12 +261,14 @@ async def create_file_download_link( self, user_id: UserID, file_id: StorageFileID, link_type: LinkType ) -> AnyUrl: api_token, api_secret = await self._get_datcore_tokens(user_id) + api_token, api_secret = _check_api_credentials(api_token, api_secret) return await datcore_adapter.get_file_download_presigned_link( self.app, api_token, api_secret, file_id ) async def delete_file(self, user_id: UserID, file_id: StorageFileID) -> None: api_token, api_secret = await self._get_datcore_tokens(user_id) + api_token, api_secret = _check_api_credentials(api_token, api_secret) await datcore_adapter.delete_file(self.app, api_token, api_secret, file_id) diff --git a/services/storage/src/simcore_service_storage/dsm_factory.py b/services/storage/src/simcore_service_storage/dsm_factory.py index a5d579a23ee..28cb4abd98a 100644 --- a/services/storage/src/simcore_service_storage/dsm_factory.py +++ b/services/storage/src/simcore_service_storage/dsm_factory.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from collections.abc import Callable from dataclasses import dataclass, field +from pathlib import Path from fastapi import FastAPI from models_library.api_schemas_storage.storage_schemas import LinkType, UploadedPart @@ -8,9 +9,16 @@ from models_library.projects import ProjectID from models_library.projects_nodes_io import LocationID, LocationName, StorageFileID from models_library.users import UserID -from pydantic import AnyUrl, ByteSize +from pydantic import AnyUrl, ByteSize, NonNegativeInt -from .models import DatasetMetaData, FileMetaData, UploadLinks +from .models import ( + DatasetMetaData, + FileMetaData, + GenericCursor, + PathMetaData, + TotalNumber, + UploadLinks, 
+) class BaseDataManager(ABC): @@ -61,6 +69,17 @@ async def list_files( """returns all the file meta data a user has access to (uuid_filter and or project_id may be used)""" # NOTE: expand_dirs will be replaced by pagination in the future + @abstractmethod + async def list_paths( + self, + user_id: UserID, + *, + file_filter: Path | None, + cursor: GenericCursor | None, + limit: NonNegativeInt, + ) -> tuple[list[PathMetaData], GenericCursor | None, TotalNumber | None]: + """returns a page of the file meta data a user has access to""" + @abstractmethod async def get_file(self, user_id: UserID, file_id: StorageFileID) -> FileMetaData: """returns the file meta data of file_id if user_id has the rights to""" diff --git a/services/storage/src/simcore_service_storage/exceptions/errors.py b/services/storage/src/simcore_service_storage/exceptions/errors.py index 34674f2114e..5856a2fec5b 100644 --- a/services/storage/src/simcore_service_storage/exceptions/errors.py +++ b/services/storage/src/simcore_service_storage/exceptions/errors.py @@ -41,3 +41,7 @@ class AccessLayerError(StorageRuntimeError): class InvalidFileIdentifierError(AccessLayerError): msg_template: str = "Error in {identifier}: {details}" + + +class DatCoreCredentialsMissingError(StorageRuntimeError): + msg_template: str = "DatCore credentials are incomplete. TIP: Check your settings" diff --git a/services/storage/src/simcore_service_storage/exceptions/handlers.py b/services/storage/src/simcore_service_storage/exceptions/handlers.py index cb1af90443a..3c1d0bfa006 100644 --- a/services/storage/src/simcore_service_storage/exceptions/handlers.py +++ b/services/storage/src/simcore_service_storage/exceptions/handlers.py @@ -12,6 +12,7 @@ DatcoreAdapterTimeoutError, ) from .errors import ( + DatCoreCredentialsMissingError, FileAccessRightError, FileMetaDataNotFoundError, InvalidFileIdentifierError, @@ -85,17 +86,11 @@ def set_exception_handlers(app: FastAPI) -> None: envelope_error=True, ), ) - - # SEE https://docs.python.org/3/library/exceptions.html#exception-hierarchy - app.add_exception_handler( - NotImplementedError, - make_http_error_handler_for_exception( - status.HTTP_501_NOT_IMPLEMENTED, NotImplementedError, envelope_error=True - ), - ) app.add_exception_handler( - Exception, + DatCoreCredentialsMissingError, make_http_error_handler_for_exception( - status.HTTP_500_INTERNAL_SERVER_ERROR, Exception, envelope_error=True + status.HTTP_401_UNAUTHORIZED, + DatCoreCredentialsMissingError, + envelope_error=True, ), ) diff --git a/services/storage/src/simcore_service_storage/models.py b/services/storage/src/simcore_service_storage/models.py index 3febefb138f..3c9cfd51732 100644 --- a/services/storage/src/simcore_service_storage/models.py +++ b/services/storage/src/simcore_service_storage/models.py @@ -1,11 +1,13 @@ import datetime import urllib.parse from dataclasses import dataclass -from typing import Annotated, Any, Literal, NamedTuple +from pathlib import Path +from typing import Annotated, Any, Literal, NamedTuple, TypeAlias from uuid import UUID import arrow from aws_library.s3 import UploadID +from aws_library.s3._models import S3DirectoryMetaData, S3MetaData from models_library.api_schemas_storage.storage_schemas import ( UNDEFINED_SIZE, UNDEFINED_SIZE_TYPE, @@ -13,6 +15,7 @@ ETag, FileMetaDataGet, LinkType, + PathMetaDataGet, S3BucketName, ) from models_library.basic_types import SHA256Str @@ -36,6 +39,7 @@ ByteSize, ConfigDict, Field, + NonNegativeInt, PlainSerializer, TypeAdapter, field_validator, @@ -104,6 +108,16 @@ class 
FileMetaData(FileMetaDataGet): user_id: UserID | None sha256_checksum: SHA256Str | None + def update_display_fields(self, id_name_mapping: dict[str, str]) -> None: + if self.project_id: + # NOTE: this is disabled because the project_name is defined in FileMetaDataGet + # pylint: disable=attribute-defined-outside-init + self.project_name = id_name_mapping.get(f"{self.project_id}") + if self.node_id: + # NOTE: this is disabled because the node_name is defined in FileMetaDataGet + # pylint: disable=attribute-defined-outside-init + self.node_name = id_name_mapping.get(f"{self.node_id}") + @classmethod @validate_call def from_simcore_node( @@ -150,6 +164,30 @@ def from_simcore_node( fmd_kwargs.update(**file_meta_data_kwargs) return cls.model_validate(fmd_kwargs) + @classmethod + def from_db_model(cls, x: FileMetaDataAtDB) -> "FileMetaData": + return cls.model_validate( + x.model_dump() + | {"file_uuid": x.file_id, "file_name": x.file_id.split("/")[-1]} + ) + + @classmethod + def from_s3_object_in_dir( + cls, x: S3MetaData, dir_fmd: "FileMetaData" + ) -> "FileMetaData": + return dir_fmd.model_copy( + update={ + "object_name": x.object_key, + "file_id": x.object_key, + "file_size": x.size, + "entity_tag": x.e_tag, + "sha256_checksum": x.sha256_checksum, + "is_directory": False, + "created_at": x.last_modified, + "last_modified": x.last_modified, + } + ) + @dataclass class UploadLinks: @@ -159,7 +197,11 @@ class UploadLinks: class StorageQueryParamsBase(BaseModel): user_id: UserID - model_config = ConfigDict(populate_by_name=True, extra="forbid") + model_config = ConfigDict(populate_by_name=True) + + +class ListPathsQueryParams(StorageQueryParamsBase): + file_filter: Path | None = None class FilesMetadataDatasetQueryParams(StorageQueryParamsBase): @@ -303,3 +345,59 @@ def all(cls) -> "AccessRights": @classmethod def none(cls) -> "AccessRights": return cls(read=False, write=False, delete=False) + + +TotalNumber: TypeAlias = NonNegativeInt +GenericCursor: TypeAlias = str | bytes + + +class PathMetaData(BaseModel): + path: Path + display_path: Path + location_id: LocationID + location: LocationName + bucket_name: str + + project_id: ProjectID | None + node_id: NodeID | None + user_id: UserID | None + created_at: datetime.datetime + last_modified: datetime.datetime + + file_meta_data: FileMetaData | None + + def update_display_fields(self, id_name_mapping: dict[str, str]) -> None: + display_path = f"{self.path}" + for old, new in id_name_mapping.items(): + display_path = display_path.replace(old, new) + self.display_path = Path(display_path) + + if self.file_meta_data: + self.file_meta_data.update_display_fields(id_name_mapping) + + @classmethod + def from_s3_object_in_dir( + cls, s3_object: S3MetaData | S3DirectoryMetaData, dir_fmd: FileMetaData + ) -> "PathMetaData": + return cls( + path=s3_object.as_path(), + display_path=s3_object.as_path(), + location_id=dir_fmd.location_id, + location=dir_fmd.location, + bucket_name=dir_fmd.bucket_name, + user_id=dir_fmd.user_id, + project_id=dir_fmd.project_id, + node_id=dir_fmd.node_id, + created_at=dir_fmd.created_at, + last_modified=dir_fmd.last_modified, + file_meta_data=None + if isinstance(s3_object, S3DirectoryMetaData) + else FileMetaData.from_s3_object_in_dir(s3_object, dir_fmd), + ) + + def to_api_model(self) -> PathMetaDataGet: + return PathMetaDataGet.model_construct( + path=self.path, + display_path=self.display_path, + file_meta_data=self.file_meta_data, + ) diff --git 
a/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter.py b/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter.py index ebcb56cba0c..20a8300a8cd 100644 --- a/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter.py +++ b/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter.py @@ -1,121 +1,45 @@ import logging -from collections.abc import Callable -from math import ceil -from typing import Any, TypeVar, cast +from typing import Any, TypeAlias, cast import httpx from fastapi import FastAPI -from models_library.api_schemas_storage.storage_schemas import DatCoreDatasetName +from fastapi_pagination import Page +from models_library.api_schemas_datcore_adapter.datasets import ( + DatasetMetaData as DatCoreDatasetMetaData, +) +from models_library.api_schemas_datcore_adapter.datasets import ( + FileMetaData as DatCoreFileMetaData, +) +from models_library.api_schemas_datcore_adapter.datasets import PackageMetaData +from models_library.api_schemas_storage.storage_schemas import ( + DatCoreCollectionName, + DatCoreDatasetName, + DatCorePackageName, +) from models_library.users import UserID -from pydantic import AnyUrl, TypeAdapter +from pydantic import AnyUrl, BaseModel, NonNegativeInt, TypeAdapter from servicelib.fastapi.client_session import get_client_session from servicelib.utils import logged_gather from ...constants import DATCORE_ID, DATCORE_STR, MAX_CONCURRENT_REST_CALLS from ...core.settings import get_application_settings -from ...models import DatasetMetaData, FileMetaData -from .datcore_adapter_exceptions import ( - DatcoreAdapterClientError, - DatcoreAdapterError, - DatcoreAdapterTimeoutError, +from ...models import ( + DatasetMetaData, + FileMetaData, + GenericCursor, + PathMetaData, + TotalNumber, +) +from .datcore_adapter_client_utils import request, retrieve_all_pages +from .datcore_adapter_exceptions import DatcoreAdapterError +from .utils import ( + create_path_meta_data_from_datcore_fmd, + create_path_meta_data_from_datcore_package, ) _logger = logging.getLogger(__file__) -class _DatcoreAdapterResponseError(DatcoreAdapterError): - """Basic exception for response errors""" - - def __init__(self, status: int, reason: str) -> None: - self.status = status - self.reason = reason - super().__init__( - msg=f"forwarded call failed with status {status}, reason {reason}" - ) - - -async def _request( - app: FastAPI, - api_key: str, - api_secret: str, - method: str, - path: str, - *, - json: dict[str, Any] | None = None, - params: dict[str, Any] | None = None, - **request_kwargs, -) -> dict[str, Any] | list[dict[str, Any]]: - datcore_adapter_settings = get_application_settings(app).DATCORE_ADAPTER - url = datcore_adapter_settings.endpoint + path - session = get_client_session(app) - - try: - if request_kwargs is None: - request_kwargs = {} - response = await session.request( - method, - url, - headers={ - "x-datcore-api-key": api_key, - "x-datcore-api-secret": api_secret, - }, - json=json, - params=params, - **request_kwargs, - ) - response.raise_for_status() - response_data = response.json() - assert isinstance(response_data, dict | list) # nosec - return response_data - - except httpx.HTTPStatusError as exc: - raise _DatcoreAdapterResponseError( - status=exc.response.status_code, reason=f"{exc}" - ) from exc - - except TimeoutError as exc: - msg = f"datcore-adapter server timed-out: {exc}" - raise DatcoreAdapterTimeoutError(msg) from exc - - except 
httpx.RequestError as exc: - msg = f"unexpected request error: {exc}" - raise DatcoreAdapterClientError(msg) from exc - - -_T = TypeVar("_T") - - -async def _retrieve_all_pages( - app: FastAPI, - api_key: str, - api_secret: str, - method: str, - path: str, - return_type_creator: Callable[..., _T], -) -> list[_T]: - page = 1 - objs = [] - while ( - response := cast( - dict[str, Any], - await _request( - app, api_key, api_secret, method, path, params={"page": page} - ), - ) - ) and response.get("items"): - _logger.debug( - "called %s [%d/%d], received %d objects", - path, - page, - ceil(response.get("total", -1) / response.get("size", 1)), - len(response.get("items", [])), - ) - - objs += [return_type_creator(d) for d in response.get("items", [])] - page += 1 - return objs - - async def check_service_health(app: FastAPI) -> bool: datcore_adapter_settings = get_application_settings(app).DATCORE_ADAPTER url = datcore_adapter_settings.endpoint + "/ready" @@ -134,7 +58,7 @@ async def check_user_can_connect(app: FastAPI, api_key: str, api_secret: str) -> return False try: - await _request(app, api_key, api_secret, "GET", "/user/profile") + await request(app, api_key, api_secret, "GET", "/user/profile") return True except DatcoreAdapterError: return False @@ -143,7 +67,9 @@ async def check_user_can_connect(app: FastAPI, api_key: str, api_secret: str) -> async def list_all_datasets_files_metadatas( app: FastAPI, user_id: UserID, api_key: str, api_secret: str ) -> list[FileMetaData]: - all_datasets: list[DatasetMetaData] = await list_datasets(app, api_key, api_secret) + all_datasets: list[DatasetMetaData] = await list_all_datasets( + app, api_key, api_secret + ) results = await logged_gather( *( list_all_files_metadatas_in_dataset( @@ -176,7 +102,7 @@ async def list_all_files_metadatas_in_dataset( ) -> list[FileMetaData]: all_files: list[dict[str, Any]] = cast( list[dict[str, Any]], - await _request( + await request( app, api_key, api_secret, @@ -206,10 +132,138 @@ async def list_all_files_metadatas_in_dataset( ] -async def list_datasets( +_Size: TypeAlias = NonNegativeInt +_Page: TypeAlias = NonNegativeInt + + +class CursorParameters(BaseModel): + next_page: _Page + size: _Size + + +def _init_pagination( + cursor: GenericCursor | None, limit: NonNegativeInt +) -> tuple[_Page, _Size]: + if cursor is not None: + cursor_params = CursorParameters.model_validate_json(cursor) + return cursor_params.next_page, cursor_params.size + return 1, limit + + +def _create_next_cursor( + total: TotalNumber, page: _Page, size: _Size +) -> GenericCursor | None: + if total > page * size: + return CursorParameters.model_validate( + {"next_page": page + 1, "size": size} + ).model_dump_json() + return None + + +async def _list_top_level_objects( + app: FastAPI, + *, + user_id: UserID, + api_key: str, + api_secret: str, + cursor: GenericCursor | None, + limit: NonNegativeInt, + request_path: str, +) -> tuple[list[PathMetaData], GenericCursor | None, TotalNumber]: + page, size = _init_pagination(cursor, limit) + response = await request( + app, + api_key, + api_secret, + "GET", + request_path, + params={"size": size, "page": page}, + ) + assert isinstance(response, dict) # nosec + file_metadata_page = Page[DatCoreFileMetaData](**response) + entries = file_metadata_page.items + total = file_metadata_page.total + assert isinstance(total, int) # nosec + next_cursor = _create_next_cursor(total, page, size) + + return ( + [create_path_meta_data_from_datcore_fmd(user_id, e) for e in entries], + next_cursor, + total, + ) + + 
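Because the datcore-adapter itself paginates with `page`/`size` while the storage API hands out opaque cursors, the helpers above simply serialize the next `page`/`size` pair as the cursor. A minimal standalone sketch of that round trip (re-implemented here for illustration only, with made-up totals; the real code is the private `_init_pagination`/`_create_next_cursor` pair added in this file):

```python
# Sketch of the page/size-to-cursor scheme used for DatCore listings.
from pydantic import BaseModel, NonNegativeInt


class CursorParameters(BaseModel):
    next_page: NonNegativeInt
    size: NonNegativeInt


def init_pagination(cursor: str | None, limit: int) -> tuple[int, int]:
    # first call: start at page 1 with the requested limit
    if cursor is None:
        return 1, limit
    params = CursorParameters.model_validate_json(cursor)
    return params.next_page, params.size


def create_next_cursor(total: int, page: int, size: int) -> str | None:
    # another page exists only while page * size has not yet covered `total`
    if total > page * size:
        return CursorParameters(next_page=page + 1, size=size).model_dump_json()
    return None


# walking 120 entries with size=50 visits pages 1, 2 and 3, then stops
total, cursor, pages_visited = 120, None, 0
while True:
    page, size = init_pagination(cursor, limit=50)
    pages_visited += 1
    cursor = create_next_cursor(total, page, size)
    if cursor is None:
        break
assert pages_visited == 3
```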
+async def list_top_level_objects_in_dataset( + app: FastAPI, + *, + user_id: UserID, + api_key: str, + api_secret: str, + dataset_id: DatCoreDatasetName, + cursor: GenericCursor | None, + limit: NonNegativeInt, +) -> tuple[list[PathMetaData], GenericCursor | None, TotalNumber]: + return await _list_top_level_objects( + app, + user_id=user_id, + api_key=api_key, + api_secret=api_secret, + cursor=cursor, + limit=limit, + request_path=f"/datasets/{dataset_id}/files", + ) + + +async def list_top_level_objects_in_collection( + app: FastAPI, + *, + user_id: UserID, + api_key: str, + api_secret: str, + dataset_id: DatCoreDatasetName, + collection_id: DatCoreCollectionName, + cursor: GenericCursor | None, + limit: NonNegativeInt, +) -> tuple[list[PathMetaData], GenericCursor | None, TotalNumber]: + return await _list_top_level_objects( + app, + user_id=user_id, + api_key=api_key, + api_secret=api_secret, + cursor=cursor, + limit=limit, + request_path=f"/datasets/{dataset_id}/files/{collection_id}", + ) + + +async def get_package_file_as_path( + app: FastAPI, + *, + user_id: UserID, + api_key: str, + api_secret: str, + dataset_id: DatCoreDatasetName, + package_id: DatCorePackageName, +) -> PathMetaData: + pck_files = await get_package_files( + app, + api_key=api_key, + api_secret=api_secret, + package_id=package_id, + ) + + assert len(pck_files) == 1 # nosec + return create_path_meta_data_from_datcore_package( + user_id, + dataset_id, + pck_files[0], + ) + + +async def list_all_datasets( app: FastAPI, api_key: str, api_secret: str ) -> list[DatasetMetaData]: - all_datasets: list[DatasetMetaData] = await _retrieve_all_pages( + all_datasets: list[DatasetMetaData] = await retrieve_all_pages( app, api_key, api_secret, @@ -221,33 +275,67 @@ async def list_datasets( return all_datasets +async def list_datasets( + app: FastAPI, + *, + api_key: str, + api_secret: str, + cursor: GenericCursor | None, + limit: NonNegativeInt, +) -> tuple[list[DatasetMetaData], GenericCursor | None, TotalNumber]: + page, size = _init_pagination(cursor, limit) + + response = await request( + app, + api_key, + api_secret, + "GET", + "/datasets", + params={"size": size, "page": page}, + ) + assert isinstance(response, dict) # nosec + datasets_page = Page[DatCoreDatasetMetaData](**response) + datasets = datasets_page.items + total = datasets_page.total + + assert isinstance(total, int) # nosec + next_cursor = _create_next_cursor(total, page, size) + + return ( + [ + DatasetMetaData(dataset_id=d.id, display_name=d.display_name) + for d in datasets + ], + next_cursor, + total, + ) + + async def get_file_download_presigned_link( app: FastAPI, api_key: str, api_secret: str, file_id: str ) -> AnyUrl: file_download_data = cast( dict[str, Any], - await _request(app, api_key, api_secret, "GET", f"/files/{file_id}"), + await request(app, api_key, api_secret, "GET", f"/files/{file_id}"), ) - url: AnyUrl = TypeAdapter(AnyUrl).validate_python(file_download_data["link"]) - return url + return TypeAdapter(AnyUrl).validate_python(file_download_data["link"]) async def get_package_files( - app: FastAPI, api_key: str, api_secret: str, package_id: str -) -> list[dict[str, Any]]: - return cast( - list[dict[str, Any]], - await _request( + app: FastAPI, *, api_key: str, api_secret: str, package_id: str +) -> list[PackageMetaData]: + return TypeAdapter(list[PackageMetaData]).validate_python( + await request( app, api_key, api_secret, "GET", f"/packages/{package_id}/files", - ), + ) ) async def delete_file( app: FastAPI, api_key: str, api_secret: str, 
file_id: str ) -> None: - await _request(app, api_key, api_secret, "DELETE", f"/files/{file_id}") + await request(app, api_key, api_secret, "DELETE", f"/files/{file_id}") diff --git a/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter_client_utils.py b/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter_client_utils.py new file mode 100644 index 00000000000..3972d07d72d --- /dev/null +++ b/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter_client_utils.py @@ -0,0 +1,99 @@ +import logging +from collections.abc import Callable +from math import ceil +from typing import Any, TypeVar, cast + +import httpx +from fastapi import FastAPI +from servicelib.fastapi.client_session import get_client_session + +from ...core.settings import get_application_settings +from .datcore_adapter_exceptions import ( + DatcoreAdapterClientError, + DatcoreAdapterResponseError, + DatcoreAdapterTimeoutError, +) + +_logger = logging.getLogger(__file__) + + +async def request( + app: FastAPI, + api_key: str, + api_secret: str, + method: str, + path: str, + *, + json: dict[str, Any] | None = None, + params: dict[str, Any] | None = None, + **request_kwargs, +) -> dict[str, Any] | list[dict[str, Any]]: + datcore_adapter_settings = get_application_settings(app).DATCORE_ADAPTER + url = datcore_adapter_settings.endpoint + path + session = get_client_session(app) + + try: + if request_kwargs is None: + request_kwargs = {} + response = await session.request( + method.upper(), + url, + headers={ + "x-datcore-api-key": api_key, + "x-datcore-api-secret": api_secret, + }, + json=json, + params=params, + **request_kwargs, + ) + response.raise_for_status() + response_data = response.json() + assert isinstance(response_data, dict | list) # nosec + return response_data + + except httpx.HTTPStatusError as exc: + raise DatcoreAdapterResponseError( + status=exc.response.status_code, reason=f"{exc}" + ) from exc + + except TimeoutError as exc: + msg = f"datcore-adapter server timed-out: {exc}" + raise DatcoreAdapterTimeoutError(msg) from exc + + except httpx.RequestError as exc: + msg = f"unexpected request error: {exc}" + raise DatcoreAdapterClientError(msg) from exc + + +_T = TypeVar("_T") + + +async def retrieve_all_pages( + app: FastAPI, + api_key: str, + api_secret: str, + method: str, + path: str, + return_type_creator: Callable[..., _T], +) -> list[_T]: + page = 1 + objs = [] + while ( + response := cast( + dict[str, Any], + await request( + app, api_key, api_secret, method, path, params={"page": page} + ), + ) + ) and response.get("items"): + _logger.debug( + "called %s [%d/%d], received %d objects", + path, + page, + ceil(response.get("total", -1) / response.get("size", 1)), + len(response.get("items", [])), + ) + + objs += [return_type_creator(d) for d in response.get("items", [])] + page += 1 + return objs diff --git a/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter_exceptions.py b/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter_exceptions.py index 03ba1aa603c..2e2ccc8548f 100644 --- a/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter_exceptions.py +++ b/services/storage/src/simcore_service_storage/modules/datcore_adapter/datcore_adapter_exceptions.py @@ -33,3 +33,14 @@ class DatcoreAdapterMultipleFilesError(DatcoreAdapterError): def __init__(self, msg: str) -> None: super().__init__(msg=msg) + + +class 
DatcoreAdapterResponseError(DatcoreAdapterError): + """Basic exception for response errors""" + + def __init__(self, status: int, reason: str) -> None: + self.status = status + self.reason = reason + super().__init__( + msg=f"forwarded call failed with status {status}, reason {reason}" + ) diff --git a/services/storage/src/simcore_service_storage/modules/datcore_adapter/utils.py b/services/storage/src/simcore_service_storage/modules/datcore_adapter/utils.py new file mode 100644 index 00000000000..34a1379f8c0 --- /dev/null +++ b/services/storage/src/simcore_service_storage/modules/datcore_adapter/utils.py @@ -0,0 +1,97 @@ +from pathlib import Path + +from models_library.api_schemas_datcore_adapter.datasets import ( + DataType as DatCoreDataType, +) +from models_library.api_schemas_datcore_adapter.datasets import ( + FileMetaData as DatCoreFileMetaData, +) +from models_library.api_schemas_datcore_adapter.datasets import PackageMetaData +from models_library.api_schemas_storage.storage_schemas import DatCoreDatasetName +from models_library.users import UserID +from pydantic import ByteSize + +from ...constants import DATCORE_ID, DATCORE_STR +from ...models import FileMetaData, PathMetaData + + +def create_fmd_from_datcore_package( + user_id: UserID, pck_metadata: PackageMetaData +) -> FileMetaData: + return FileMetaData( + file_uuid=f"{pck_metadata.package_id}", + location_id=DATCORE_ID, + location=DATCORE_STR, + bucket_name=pck_metadata.s3_bucket, + object_name=f"{pck_metadata.package_id}", + file_name=pck_metadata.name, + file_id=pck_metadata.package_id, + file_size=ByteSize(pck_metadata.size), + created_at=pck_metadata.created_at, + last_modified=pck_metadata.updated_at, + project_id=None, + node_id=None, + user_id=user_id, + is_soft_link=False, + sha256_checksum=None, + ) + + +def create_fmd_from_datcore_fmd( + user_id: UserID, dat_core_fmd: DatCoreFileMetaData +) -> FileMetaData: + return FileMetaData( + file_uuid=f"{dat_core_fmd.path}", + location_id=DATCORE_ID, + location=DATCORE_STR, + bucket_name=dat_core_fmd.dataset_id, + object_name=f"{dat_core_fmd.package_id}", + file_name=dat_core_fmd.name, + file_id=dat_core_fmd.package_id, + file_size=ByteSize(dat_core_fmd.size), + created_at=dat_core_fmd.created_at, + last_modified=dat_core_fmd.last_modified_at, + project_id=None, + node_id=None, + user_id=user_id, + is_soft_link=False, + sha256_checksum=None, + ) + + +def create_path_meta_data_from_datcore_package( + user_id: UserID, dataset_id: DatCoreDatasetName, pck_metadata: PackageMetaData +) -> PathMetaData: + return PathMetaData( + path=Path(dataset_id) / pck_metadata.package_id, + display_path=pck_metadata.display_path, + location_id=DATCORE_ID, + location=DATCORE_STR, + bucket_name=pck_metadata.s3_bucket, + project_id=None, + node_id=None, + user_id=user_id, + created_at=pck_metadata.created_at, + last_modified=pck_metadata.updated_at, + file_meta_data=create_fmd_from_datcore_package(user_id, pck_metadata), + ) + + +def create_path_meta_data_from_datcore_fmd( + user_id: UserID, dat_core_fmd: DatCoreFileMetaData +) -> PathMetaData: + return PathMetaData( + path=Path(dat_core_fmd.dataset_id) / dat_core_fmd.id, + display_path=dat_core_fmd.path, + location_id=DATCORE_ID, + location=DATCORE_STR, + bucket_name=dat_core_fmd.dataset_id, + project_id=None, + node_id=None, + user_id=user_id, + created_at=dat_core_fmd.created_at, + last_modified=dat_core_fmd.last_modified_at, + file_meta_data=None + if dat_core_fmd.data_type == DatCoreDataType.FOLDER + else create_fmd_from_datcore_fmd(user_id, 
dat_core_fmd), + ) diff --git a/services/storage/src/simcore_service_storage/modules/db/file_meta_data.py b/services/storage/src/simcore_service_storage/modules/db/file_meta_data.py index d2aeae30a35..00ebc91f70b 100644 --- a/services/storage/src/simcore_service_storage/modules/db/file_meta_data.py +++ b/services/storage/src/simcore_service_storage/modules/db/file_meta_data.py @@ -1,5 +1,8 @@ +import contextlib import datetime from collections.abc import AsyncGenerator +from pathlib import Path +from typing import TypeAlias import sqlalchemy as sa from models_library.basic_types import SHA256Str @@ -7,13 +10,21 @@ from models_library.projects_nodes_io import NodeID, SimcoreS3FileID from models_library.users import UserID from models_library.utils.fastapi_encoders import jsonable_encoder +from pydantic import BaseModel from simcore_postgres_database.storage_models import file_meta_data from sqlalchemy import and_, literal_column from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.exc import MultipleResultsFound from sqlalchemy.ext.asyncio import AsyncConnection from ...exceptions.errors import FileMetaDataNotFoundError -from ...models import FileMetaData, FileMetaDataAtDB, UserOrProjectFilter +from ...models import ( + FileMetaData, + FileMetaDataAtDB, + GenericCursor, + PathMetaData, + UserOrProjectFilter, +) async def exists(conn: AsyncConnection, file_id: SimcoreS3FileID) -> bool: @@ -136,6 +147,186 @@ async def list_filter_with_partial_file_id( ] +async def try_get_directory( + conn: AsyncConnection, file_filter: Path +) -> FileMetaData | None: + """Check if the given file filter is a directory or is inside a directory.""" + # we might be exactly on a directory or inside it + potential_directories = (file_filter, *file_filter.parents) + with contextlib.suppress(MultipleResultsFound): + for file_id in potential_directories: + # there should be only 1 entry if this is a directory + result = await conn.execute( + sa.select(file_meta_data).where( + file_meta_data.c.file_id == f"{file_id}" + ) + ) + if row := result.one_or_none(): + fmd = FileMetaDataAtDB.model_validate(row) + if fmd.is_directory: + return FileMetaData.from_db_model(fmd) + return None + return None + + +TotalChildren: TypeAlias = int + + +class _PathsCursorParameters(BaseModel): + offset: int + file_prefix: Path | None + project_ids: list[ProjectID] | None + partial: bool + + +def _init_pagination( + cursor: GenericCursor | None, + *, + filter_by_project_ids: list[ProjectID] | None, + filter_by_file_prefix: Path | None, + is_partial_prefix: bool, +) -> _PathsCursorParameters: + if cursor: + return _PathsCursorParameters.model_validate_json(cursor) + return _PathsCursorParameters( + offset=0, + file_prefix=filter_by_file_prefix, + project_ids=filter_by_project_ids, + partial=is_partial_prefix, + ) + + +def _create_next_cursor( + total_count: TotalChildren, limit: int, cursor_params: _PathsCursorParameters +) -> GenericCursor | None: + if cursor_params.offset + limit < total_count: + return cursor_params.model_copy( + update={"offset": cursor_params.offset + limit} + ).model_dump_json() + return None + + +async def list_child_paths( + conn: AsyncConnection, + *, + filter_by_project_ids: list[ProjectID] | None, + filter_by_file_prefix: Path | None, + cursor: GenericCursor | None, + limit: int, + is_partial_prefix: bool, +) -> tuple[list[PathMetaData], GenericCursor | None, TotalChildren]: + """returns a list of FileMetaDataAtDB that are one level deep. + e.g. 
when no filter is used, these are top level objects + """ + + cursor_params = _init_pagination( + cursor, + filter_by_project_ids=filter_by_project_ids, + filter_by_file_prefix=filter_by_file_prefix, + is_partial_prefix=is_partial_prefix, + ) + + if cursor_params.file_prefix: + prefix_levels = len(cursor_params.file_prefix.parts) - 1 + search_prefix = ( + f"{cursor_params.file_prefix}%" + if cursor_params.partial + else f"{cursor_params.file_prefix / '%'}" + ) + search_regex = rf"^[^/]+(?:/[^/]+){{{prefix_levels}}}{'' if cursor_params.partial else '/[^/]+'}" + ranked_files = ( + sa.select( + file_meta_data.c.file_id, + sa.func.substring(file_meta_data.c.file_id, search_regex).label("path"), + sa.func.row_number() + .over( + partition_by=sa.func.substring( + file_meta_data.c.file_id, search_regex + ), + order_by=(file_meta_data.c.file_id.asc(),), + ) + .label("row_num"), + ) + .where( + and_( + file_meta_data.c.file_id.like(search_prefix), + file_meta_data.c.project_id.in_( + [f"{_}" for _ in cursor_params.project_ids] + ) + if cursor_params.project_ids + else True, + ) + ) + .cte("ranked_files") + ) + else: + ranked_files = ( + sa.select( + file_meta_data.c.file_id, + sa.func.split_part(file_meta_data.c.file_id, "/", 1).label("path"), + sa.func.row_number() + .over( + partition_by=sa.func.split_part(file_meta_data.c.file_id, "/", 1), + order_by=(file_meta_data.c.file_id.asc(),), + ) + .label("row_num"), + ) + .where( + file_meta_data.c.project_id.in_( + [f"{_}" for _ in cursor_params.project_ids] + ) + if cursor_params.project_ids + else True + ) + .cte("ranked_files") + ) + + files_query = ( + ( + sa.select(ranked_files, file_meta_data) + .where( + and_( + ranked_files.c.row_num == 1, + ranked_files.c.file_id == file_meta_data.c.file_id, + ) + ) + .order_by(file_meta_data.c.file_id.asc()) + ) + .limit(limit) + .offset(cursor_params.offset) + ) + + total_count = await conn.scalar( + sa.select(sa.func.count()) + .select_from(ranked_files) + .where(ranked_files.c.row_num == 1) + ) + + items = [ + PathMetaData( + path=row.path + or row.file_id, # NOTE: if path_prefix is partial then path is None + display_path=row.path or row.file_id, + location_id=row.location_id, + location=row.location, + bucket_name=row.bucket_name, + project_id=row.project_id, + node_id=row.node_id, + user_id=row.user_id, + created_at=row.created_at, + last_modified=row.last_modified, + file_meta_data=FileMetaData.from_db_model( + FileMetaDataAtDB.model_validate(row) + ) + if row.file_id == row.path and not row.is_directory + else None, + ) + async for row in await conn.stream(files_query) + ] + + return items, _create_next_cursor(total_count, limit, cursor_params), total_count + + async def list_fmds( conn: AsyncConnection, *, diff --git a/services/storage/src/simcore_service_storage/modules/db/projects.py b/services/storage/src/simcore_service_storage/modules/db/projects.py index d5d231a71d4..774931e17c7 100644 --- a/services/storage/src/simcore_service_storage/modules/db/projects.py +++ b/services/storage/src/simcore_service_storage/modules/db/projects.py @@ -2,7 +2,8 @@ from contextlib import suppress import sqlalchemy as sa -from models_library.projects import ProjectAtDB, ProjectID +from models_library.projects import ProjectAtDB, ProjectID, ProjectIDStr +from models_library.projects_nodes_io import NodeIDStr from pydantic import ValidationError from simcore_postgres_database.storage_models import projects from sqlalchemy.ext.asyncio import AsyncConnection @@ -37,3 +38,19 @@ async def project_exists( ) == 1 ) + + 
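The `list_child_paths` query above groups rows by a prefix of `file_id` computed with `substring(file_id FROM <regex>)` and keeps one representative row per group via `row_number()`. To see what that grouping key looks like, here is the same regex evaluated with Python's `re` as a stand-in for the Postgres `substring()` call (file ids and the filter value are made up):

```python
# Stand-in demo for the grouping regex built in list_child_paths.
import re
from pathlib import Path

file_prefix = Path("project-uuid/node-uuid")  # hypothetical file_filter
prefix_levels = len(file_prefix.parts) - 1    # -> 1

# non-partial prefix: group by the first level *below* the prefix (like `ls prefix/`)
full_regex = rf"^[^/]+(?:/[^/]+){{{prefix_levels}}}/[^/]+"
# partial prefix: group by the prefix match itself (like `ls prefix*`)
partial_regex = rf"^[^/]+(?:/[^/]+){{{prefix_levels}}}"

file_ids = [
    "project-uuid/node-uuid/outputs/result.csv",
    "project-uuid/node-uuid/outputs/log.txt",
    "project-uuid/node-uuid/inputs/data.bin",
]
grouping_keys = {re.match(full_regex, fid).group(0) for fid in file_ids}
assert grouping_keys == {
    "project-uuid/node-uuid/outputs",
    "project-uuid/node-uuid/inputs",
}
```

Each distinct key becomes one `PathMetaData` entry, which is how three files collapse into the two child paths `outputs` and `inputs`.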
+async def get_project_id_and_node_id_to_names_map( + conn: AsyncConnection, project_uuids: list[ProjectID] +) -> dict[ProjectID, dict[ProjectIDStr | NodeIDStr, str]]: + mapping = {} + async for row in await conn.stream( + sa.select(projects.c.uuid, projects.c.name, projects.c.workbench).where( + projects.c.uuid.in_(f"{pid}" for pid in project_uuids) + ) + ): + mapping[ProjectID(f"{row.uuid}")] = {f"{row.uuid}": row.name} | { + f"{node_id}": node["label"] for node_id, node in row.workbench.items() + } + + return mapping diff --git a/services/storage/src/simcore_service_storage/simcore_s3_dsm.py b/services/storage/src/simcore_service_storage/simcore_s3_dsm.py index 753e36f9834..4888b695ed0 100644 --- a/services/storage/src/simcore_service_storage/simcore_s3_dsm.py +++ b/services/storage/src/simcore_service_storage/simcore_s3_dsm.py @@ -63,6 +63,9 @@ DatasetMetaData, FileMetaData, FileMetaDataAtDB, + GenericCursor, + PathMetaData, + TotalNumber, UploadLinks, UserOrProjectFilter, ) @@ -79,6 +82,8 @@ compute_file_id_prefix, expand_directory, get_directory_file_id, + list_child_paths_from_repository, + list_child_paths_from_s3, ) from .utils.utils import ( convert_db_to_model, @@ -93,6 +98,37 @@ _logger = logging.getLogger(__name__) +async def _add_frontend_needed_data( + engine: AsyncEngine, + *, + project_ids: list[ProjectID], + data: list[FileMetaData], +) -> list[FileMetaData]: + # artifically fills ['project_name', 'node_name', 'file_id', 'raw_file_path', 'display_file_path'] + # with information from the projects table! + # NOTE: This part with the projects, should be done in the client code not here! + + async with engine.connect() as conn: + prj_names_mapping: dict[ProjectID | NodeID, str] = {} + async for proj_data in projects.list_valid_projects_in(conn, project_ids): + prj_names_mapping |= {proj_data.uuid: proj_data.name} | { + NodeID(node_id): node_data.label + for node_id, node_data in proj_data.workbench.items() + } + + clean_data: list[FileMetaData] = [] + for d in data: + if d.project_id not in prj_names_mapping: + continue + d.project_name = prj_names_mapping[d.project_id] + if d.node_id in prj_names_mapping: + d.node_name = prj_names_mapping[d.node_id] + if d.node_name and d.project_name: + clean_data.append(d) + + return clean_data + + @dataclass class SimcoreS3DataManager(BaseDataManager): engine: AsyncEngine @@ -136,7 +172,86 @@ async def list_files_in_dataset( ) return data - async def list_files( # noqa C901 + async def list_paths( + self, + user_id: UserID, + *, + file_filter: Path | None, + cursor: GenericCursor | None, + limit: NonNegativeInt, + ) -> tuple[list[PathMetaData], GenericCursor | None, TotalNumber | None]: + """returns a page of the file meta data a user has access to""" + + next_cursor: GenericCursor | None = None + total: TotalNumber | None = None + # if we have a file_filter, that means that we have potentially a project ID + project_id = None + with contextlib.suppress(ValueError): + # NOTE: we currently do not support anything else than project_id/node_id/file_path here, sorry chap + project_id = ProjectID(file_filter.parts[0]) if file_filter else None + + async with self.engine.connect() as conn: + if project_id: + project_access_rights = await get_project_access_rights( + conn=conn, user_id=user_id, project_id=project_id + ) + if not project_access_rights.read: + raise ProjectAccessRightError( + access_right="read", project_id=project_id + ) + accessible_projects_ids = [project_id] + else: + accessible_projects_ids = await 
get_readable_project_ids(conn, user_id) + + # check if the file_filter is a directory or inside one + dir_fmd = None + if file_filter: + dir_fmd = await file_meta_data.try_get_directory(conn, file_filter) + + if dir_fmd: + # NOTE: files are not listed in the DB but in S3 only + assert file_filter # nosec + assert project_id # nosec + (paths_metadata, next_cursor) = await list_child_paths_from_s3( + get_s3_client(self.app), + dir_fmd=dir_fmd, + bucket=self.simcore_bucket_name, + file_filter=file_filter, + limit=limit, + cursor=cursor, + ) + else: + # NOTE: files are DB-based + async with self.engine.connect() as conn: + ( + paths_metadata, + next_cursor, + total, + ) = await list_child_paths_from_repository( + conn, + filter_by_project_ids=accessible_projects_ids, + filter_by_file_prefix=file_filter, + limit=limit, + cursor=cursor, + ) + + # extract the returned project_ids + project_ids = list( + {path.project_id for path in paths_metadata if path.project_id is not None} + ) + async with self.engine.connect() as conn: + ids_names_map = await projects.get_project_id_and_node_id_to_names_map( + conn, project_ids + ) + + for path in paths_metadata: + if path.project_id is not None: + id_name_map = ids_names_map.get(path.project_id, {}) + path.update_display_fields(id_name_map) + + return paths_metadata, next_cursor, total + + async def list_files( self, user_id: UserID, *, @@ -172,17 +287,17 @@ async def list_files( # noqa C901 else: accessible_projects_ids = await get_readable_project_ids(conn, user_id) uid = user_id - file_and_directory_meta_data: list[ - FileMetaDataAtDB - ] = await file_meta_data.list_filter_with_partial_file_id( - conn, - user_or_project_filter=UserOrProjectFilter( - user_id=uid, project_ids=accessible_projects_ids - ), - file_id_prefix=None, - is_directory=None, - partial_file_id=uuid_filter, - sha256_checksum=None, + file_and_directory_meta_data = ( + await file_meta_data.list_filter_with_partial_file_id( + conn, + user_or_project_filter=UserOrProjectFilter( + user_id=uid, project_ids=accessible_projects_ids + ), + file_id_prefix=None, + is_directory=None, + partial_file_id=uuid_filter, + sha256_checksum=None, + ) ) # add all the entries from file_meta_data without @@ -199,17 +314,6 @@ async def list_files( # noqa C901 updated_fmd = await self._update_database_from_storage(metadata) data.append(convert_db_to_model(updated_fmd)) - # now parse the project to search for node/project names - async with self.engine.connect() as conn: - prj_names_mapping: dict[ProjectID | NodeID, str] = {} - async for proj_data in projects.list_valid_projects_in( - conn, accessible_projects_ids - ): - prj_names_mapping |= {proj_data.uuid: proj_data.name} | { - NodeID(node_id): node_data.label - for node_id, node_data in proj_data.workbench.items() - } - # expand directories until the max number of files to return is reached directory_expands: list[Coroutine] = [] for metadata in file_and_directory_meta_data: @@ -232,21 +336,9 @@ async def list_files( # noqa C901 ): data.extend(files_in_directory) - # artifically fills ['project_name', 'node_name', 'file_id', 'raw_file_path', 'display_file_path'] - # with information from the projects table! - # NOTE: This part with the projects, should be done in the client code not here! 
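To make the display-name step above concrete: get_project_id_and_node_id_to_names_map returns, per project, a flat mapping from the project uuid and every node uuid to a human-readable name, and list_paths hands the matching sub-mapping to update_display_fields. A short sketch with invented uuids and names; to_display_path is a hypothetical stand-in for the kind of substitution such a mapping enables, not the actual model method.

# invented data; the real keys are ProjectIDStr / NodeIDStr strings
ids_names_map = {
    "aaaaaaaa-0000-4000-8000-000000000001": {
        "aaaaaaaa-0000-4000-8000-000000000001": "My Study",
        "22222222-0000-4000-8000-00000000000b": "sleeper",
    },
}

def to_display_path(raw_path: str, names: dict[str, str]) -> str:
    # hypothetical helper: replace any known uuid component with its display name
    return "/".join(names.get(part, part) for part in raw_path.split("/"))

assert (
    to_display_path(
        "aaaaaaaa-0000-4000-8000-000000000001/22222222-0000-4000-8000-00000000000b/workspace/results.csv",
        ids_names_map["aaaaaaaa-0000-4000-8000-000000000001"],
    )
    == "My Study/sleeper/workspace/results.csv"
)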
- clean_data: list[FileMetaData] = [] - for d in data: - if d.project_id not in prj_names_mapping: - continue - d.project_name = prj_names_mapping[d.project_id] - if d.node_id in prj_names_mapping: - d.node_name = prj_names_mapping[d.node_id] - if d.node_name and d.project_name: - clean_data.append(d) - - data = clean_data - return data + return await _add_frontend_needed_data( + self.engine, project_ids=accessible_projects_ids, data=data + ) async def get_file(self, user_id: UserID, file_id: StorageFileID) -> FileMetaData: async with self.engine.connect() as conn: diff --git a/services/storage/src/simcore_service_storage/utils/simcore_s3_dsm_utils.py b/services/storage/src/simcore_service_storage/utils/simcore_s3_dsm_utils.py index db431b02c01..2243caea7de 100644 --- a/services/storage/src/simcore_service_storage/utils/simcore_s3_dsm_utils.py +++ b/services/storage/src/simcore_service_storage/utils/simcore_s3_dsm_utils.py @@ -1,8 +1,10 @@ from contextlib import suppress from pathlib import Path +import orjson from aws_library.s3 import S3MetaData, SimcoreS3API from models_library.api_schemas_storage.storage_schemas import S3BucketName +from models_library.projects import ProjectID from models_library.projects_nodes_io import ( SimcoreS3DirectoryID, SimcoreS3FileID, @@ -13,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncConnection from ..exceptions.errors import FileMetaDataNotFoundError -from ..models import FileMetaData, FileMetaDataAtDB +from ..models import FileMetaData, FileMetaDataAtDB, GenericCursor, PathMetaData from ..modules.db import file_meta_data from .utils import convert_db_to_model @@ -124,3 +126,84 @@ async def _get_fmd( def compute_file_id_prefix(file_id: str, levels: int): components = file_id.strip("/").split("/") return "/".join(components[:levels]) + + +async def list_child_paths_from_s3( + s3_client: SimcoreS3API, + *, + dir_fmd: FileMetaData, + bucket: S3BucketName, + file_filter: Path, + limit: int, + cursor: GenericCursor | None, +) -> tuple[list[PathMetaData], GenericCursor | None]: + """list direct children given by `file_filter` of a directory. + Tries first using file_filter as a full path, if not results are found will + try using file_filter as a partial prefix. 
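+    The returned cursor is an orjson-encoded mapping that bundles the file_filter
+    it was created for with the S3 continuation token; it must be passed back
+    unchanged to fetch the next page.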
+ """ + objects_cursor = None + if cursor is not None: + cursor_params = orjson.loads(cursor) + assert cursor_params["file_filter"] == f"{file_filter}" # nosec + objects_cursor = cursor_params["objects_next_cursor"] + list_s3_objects, objects_next_cursor = await s3_client.list_objects( + bucket=bucket, + prefix=file_filter, + start_after=None, + limit=limit, + next_cursor=objects_cursor, + is_partial_prefix=False, + ) + if not list_s3_objects: + list_s3_objects, objects_next_cursor = await s3_client.list_objects( + bucket=bucket, + prefix=file_filter, + start_after=None, + limit=limit, + next_cursor=objects_cursor, + is_partial_prefix=True, + ) + + paths_metadata = [ + PathMetaData.from_s3_object_in_dir(s3_object, dir_fmd) + for s3_object in list_s3_objects + ] + next_cursor = None + if objects_next_cursor: + next_cursor = orjson.dumps( + { + "file_filter": f"{file_filter}", + "objects_next_cursor": objects_next_cursor, + } + ) + + return paths_metadata, next_cursor + + +async def list_child_paths_from_repository( + conn: AsyncConnection, + *, + filter_by_project_ids: list[ProjectID] | None, + filter_by_file_prefix: Path | None, + cursor: GenericCursor | None, + limit: int, +) -> tuple[list[PathMetaData], GenericCursor | None, file_meta_data.TotalChildren]: + paths_metadata, next_cursor, total = await file_meta_data.list_child_paths( + conn, + filter_by_project_ids=filter_by_project_ids, + filter_by_file_prefix=filter_by_file_prefix, + limit=limit, + cursor=cursor, + is_partial_prefix=False, + ) + if not paths_metadata: + paths_metadata, next_cursor, total = await file_meta_data.list_child_paths( + conn, + filter_by_project_ids=filter_by_project_ids, + filter_by_file_prefix=filter_by_file_prefix, + limit=limit, + cursor=cursor, + is_partial_prefix=True, + ) + + return paths_metadata, next_cursor, total diff --git a/services/storage/tests/conftest.py b/services/storage/tests/conftest.py index 15a95dd919a..a12a153dbd2 100644 --- a/services/storage/tests/conftest.py +++ b/services/storage/tests/conftest.py @@ -8,9 +8,9 @@ import asyncio import logging +import random import sys from collections.abc import AsyncIterator, Awaitable, Callable -from contextlib import AbstractAsyncContextManager, asynccontextmanager from pathlib import Path from typing import Any, Final, cast @@ -36,7 +36,7 @@ from models_library.basic_types import SHA256Str from models_library.projects import ProjectID from models_library.projects_nodes import NodeID -from models_library.projects_nodes_io import LocationID, SimcoreS3FileID +from models_library.projects_nodes_io import LocationID, SimcoreS3FileID, StorageFileID from models_library.users import UserID from models_library.utils.fastapi_encoders import jsonable_encoder from pydantic import ByteSize, TypeAdapter @@ -46,12 +46,17 @@ from pytest_simcore.helpers.logging_tools import log_context from pytest_simcore.helpers.monkeypatch_envs import delenvs_from_dict, setenvs_from_dict from pytest_simcore.helpers.s3 import upload_file_to_presigned_link -from pytest_simcore.helpers.storage_utils import FileIDDict +from pytest_simcore.helpers.storage_utils import ( + FileIDDict, + ProjectWithFilesParams, + get_updated_project, +) from pytest_simcore.helpers.storage_utils_file_meta_data import ( assert_file_meta_data_in_db, ) from pytest_simcore.helpers.typing_env import EnvVarsDict from servicelib.aiohttp import status +from servicelib.utils import limited_gather from simcore_postgres_database.storage_models import file_meta_data, projects, users from 
simcore_service_storage.core.application import create_app from simcore_service_storage.core.settings import ApplicationSettings @@ -86,6 +91,7 @@ "pytest_simcore.repository_paths", "pytest_simcore.simcore_storage_data_models", "pytest_simcore.simcore_storage_datcore_adapter", + "pytest_simcore.simcore_storage_service", ] CURRENT_DIR = Path(sys.argv[0] if __name__ == "__main__" else __file__).resolve().parent @@ -430,25 +436,6 @@ async def _uploader( return _uploader -@pytest.fixture -def create_simcore_file_id( - faker: Faker, -) -> Callable[[ProjectID, NodeID, str, Path | None], SimcoreS3FileID]: - def _creator( - project_id: ProjectID, - node_id: NodeID, - file_name: str, - file_base_path: Path | None = None, - ) -> SimcoreS3FileID: - s3_file_name = file_name - if file_base_path: - s3_file_name = f"{file_base_path / file_name}" - clean_path = Path(f"{project_id}/{node_id}/{s3_file_name}") - return TypeAdapter(SimcoreS3FileID).validate_python(f"{clean_path}") - - return _creator - - @pytest.fixture async def with_versioning_enabled( s3_client: S3Client, @@ -466,10 +453,10 @@ async def create_empty_directory( create_upload_file_link_v2: Callable[..., Awaitable[FileUploadSchema]], initialized_app: FastAPI, client: httpx.AsyncClient, - project_id: ProjectID, - node_id: NodeID, -) -> Callable[..., Awaitable[FileUploadSchema]]: - async def _directory_creator(dir_name: str): +) -> Callable[[str, ProjectID, NodeID], Awaitable[SimcoreS3FileID]]: + async def _directory_creator( + dir_name: str, project_id: ProjectID, node_id: NodeID + ) -> SimcoreS3FileID: # creating an empty directory goes through the same procedure as uploading a multipart file # done by using 3 calls: # 1. create the link as a directory @@ -527,46 +514,89 @@ async def _directory_creator(dir_name: str): f"--> done waiting, data is completely uploaded [{attempt.retry_state.retry_object.statistics}]", ) - return directory_file_upload + return directory_file_id return _directory_creator +async def _upload_file_to_s3( + s3_client: SimcoreS3API, + faker: Faker, + *, + s3_bucket: S3BucketName, + local_file: Path, + file_id: SimcoreS3FileID, +) -> dict[SHA256Str, FileIDDict]: + await s3_client.upload_file( + bucket=s3_bucket, + file=local_file, + object_key=file_id, + bytes_transfered_cb=None, + ) + return {file_id: FileIDDict(path=local_file, sha256_checksum=f"{faker.sha256()}")} + + @pytest.fixture async def populate_directory( create_file_of_size: Callable[[ByteSize, str | None], Path], storage_s3_client: SimcoreS3API, storage_s3_bucket: S3BucketName, - project_id: ProjectID, - node_id: NodeID, -) -> Callable[..., Awaitable[None]]: + faker: Faker, +) -> Callable[ + [ByteSize, str, ProjectID, NodeID, int, int], + Awaitable[tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]], +]: async def _create_content( file_size_in_dir: ByteSize, dir_name: str, - subdir_count: int = 4, - file_count: int = 5, - ) -> None: - file = create_file_of_size(file_size_in_dir, "some_file") - - async def _create_file(s: int, f: int): - file_name = f"{dir_name}/sub-dir-{s}/file-{f}" - clean_path = Path(f"{project_id}/{node_id}/{file_name}") - await storage_s3_client.upload_file( - bucket=storage_s3_bucket, - file=file, - object_key=TypeAdapter(SimcoreS3FileID).validate_python( - f"{clean_path}" - ), - bytes_transfered_cb=None, + project_id: ProjectID, + node_id: NodeID, + subdir_count: int, + file_count: int, + ) -> tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]: + assert subdir_count >= 1, "cannot use fixture with subdir_count < 1!" 
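+        # files are distributed randomly across the sub-directories created below,
+        # hence the guards on both counts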
+ assert file_count >= 1, "cannot use fixture with file_count < 1!" + + local_file = create_file_of_size(file_size_in_dir, None) + + # Create subdirectories + s3_base_path = Path(f"{project_id}") / f"{node_id}" / dir_name + s3_subdirs = [s3_base_path / f"sub-dir-{i}" for i in range(subdir_count)] + # Randomly distribute files across subdirectories + selected_subdirs = random.choices(s3_subdirs, k=file_count) # noqa: S311 + # Upload to S3 + with log_context( + logging.INFO, + msg=f"Uploading {file_count} files to S3 (each {file_size_in_dir.human_readable()}, total: {ByteSize(file_count * file_size_in_dir).human_readable()})", + ): + results = await asyncio.gather( + *( + _upload_file_to_s3( + storage_s3_client, + faker, + s3_bucket=storage_s3_bucket, + local_file=local_file, + file_id=TypeAdapter(SimcoreS3FileID).validate_python( + f"{selected_subdir / faker.unique.file_name()}" + ), + ) + for selected_subdir in selected_subdirs + ) ) - tasks = [ - _create_file(s, f) for f in range(file_count) for s in range(subdir_count) - ] + assert len(results) == file_count - await asyncio.gather(*tasks) + # check this is true + counted_uploaded_objects = await storage_s3_client.count_objects( + bucket=storage_s3_bucket, + prefix=s3_base_path, + is_partial_prefix=True, + start_after=None, + use_delimiter=False, + ) + assert counted_uploaded_objects == file_count - file.unlink() + return node_id, {k: v for r in results for k, v in r.items()} return _create_content @@ -575,21 +605,16 @@ async def _create_file(s: int, f: int): async def delete_directory( initialized_app: FastAPI, client: httpx.AsyncClient, - storage_s3_client: SimcoreS3API, - storage_s3_bucket: S3BucketName, user_id: UserID, location_id: LocationID, ) -> Callable[..., Awaitable[None]]: - async def _dir_remover(directory_file_upload: FileUploadSchema) -> None: - assert directory_file_upload.urls[0].path - directory_file_id = directory_file_upload.urls[0].path.strip("/") - + async def _dir_remover(directory_s3: StorageFileID) -> None: delete_url = url_from_operation_id( client, initialized_app, "delete_file", location_id=f"{location_id}", - file_id=directory_file_id, + file_id=directory_s3, ).with_query(user_id=user_id) response = await client.delete(f"{delete_url}") @@ -599,7 +624,7 @@ async def _dir_remover(directory_file_upload: FileUploadSchema) -> None: # even if one file is left this will detect it list_files_metadata_url = url_from_operation_id( client, initialized_app, "list_files_metadata", location_id=f"{location_id}" - ).with_query(user_id=user_id, uuid_filter=directory_file_id) + ).with_query(user_id=user_id, uuid_filter=directory_s3) response = await client.get(f"{list_files_metadata_url}") data, error = assert_status(response, status.HTTP_200_OK, list[FileMetaDataGet]) assert error is None @@ -610,48 +635,217 @@ async def _dir_remover(directory_file_upload: FileUploadSchema) -> None: @pytest.fixture async def create_directory_with_files( - create_empty_directory: Callable[..., Awaitable[FileUploadSchema]], - populate_directory: Callable[..., Awaitable[None]], + create_empty_directory: Callable[ + [str, ProjectID, NodeID], Awaitable[SimcoreS3FileID] + ], + populate_directory: Callable[ + [ByteSize, str, ProjectID, NodeID, int, int], + Awaitable[tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]], + ], delete_directory: Callable[..., Awaitable[None]], -) -> Callable[..., AbstractAsyncContextManager[FileUploadSchema]]: - @asynccontextmanager - async def _create_context( - dir_name: str, file_size_in_dir: ByteSize, subdir_count: int, 
file_count: int - ) -> AsyncIterator[FileUploadSchema]: - directory_file_upload: FileUploadSchema = await create_empty_directory( - dir_name=dir_name +) -> AsyncIterator[ + Callable[ + [str, ByteSize, int, int, ProjectID, NodeID], + Awaitable[ + tuple[SimcoreS3FileID, tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]] + ], + ] +]: + uploaded_directories = [] + + async def _( + dir_name: str, + file_size_in_dir: ByteSize, + subdir_count: int, + file_count: int, + project_id: ProjectID, + node_id: NodeID, + ) -> tuple[SimcoreS3FileID, tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]]: + directory_file_id = await create_empty_directory(dir_name, project_id, node_id) + + uploaded_files = await populate_directory( + file_size_in_dir, + dir_name, + project_id, + node_id, + subdir_count, + file_count, ) - await populate_directory( - file_size_in_dir=file_size_in_dir, - dir_name=dir_name, - subdir_count=subdir_count, - file_count=file_count, + uploaded_directories.append(directory_file_id) + + return directory_file_id, uploaded_files + + yield _ + + await asyncio.gather(*(delete_directory(_) for _ in uploaded_directories)) + + +async def _upload_one_file_task( + upload_file: Callable[..., Awaitable[tuple[Path, SimcoreS3FileID]]], + allowed_file_sizes: tuple[ByteSize, ...], + allowed_file_checksums: tuple[SHA256Str, ...], + *, + file_name: str, + file_id: SimcoreS3FileID, + node_id: NodeID, +) -> tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]: + selected_checksum = random.choice(allowed_file_checksums) # noqa: S311 + uploaded_file, uploaded_file_id = await upload_file( + file_size=random.choice(allowed_file_sizes), # noqa: S311 + file_name=file_name, + file_id=file_id, + sha256_checksum=selected_checksum, + ) + assert uploaded_file_id == file_id + return ( + node_id, + { + uploaded_file_id: FileIDDict( + path=uploaded_file, sha256_checksum=selected_checksum + ) + }, + ) + + +async def _upload_folder_task( + create_directory_with_files: Callable[ + ..., + Awaitable[ + tuple[SimcoreS3FileID, tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]] + ], + ], + allowed_file_sizes: tuple[ByteSize, ...], + *, + dir_name: str, + project_id: ProjectID, + node_id: NodeID, + workspace_file_count: int, +) -> tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]: + dir_file_id, node_files_map = await create_directory_with_files( + dir_name=dir_name, + file_size_in_dir=random.choice(allowed_file_sizes), # noqa: S311 + subdir_count=3, + file_count=workspace_file_count, + project_id=project_id, + node_id=node_id, + ) + assert dir_file_id + return node_files_map + + +@pytest.fixture +async def random_project_with_files( + sqlalchemy_async_engine: AsyncEngine, + create_project: Callable[..., Awaitable[dict[str, Any]]], + create_project_node: Callable[..., Awaitable[NodeID]], + create_simcore_file_id: Callable[ + [ProjectID, NodeID, str, Path | None], SimcoreS3FileID + ], + faker: Faker, + create_directory_with_files: Callable[ + ..., + Awaitable[ + tuple[SimcoreS3FileID, tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]] + ], + ], + upload_file: Callable[..., Awaitable[tuple[Path, SimcoreS3FileID]]], +) -> Callable[ + [ProjectWithFilesParams], + Awaitable[tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]]]], +]: + async def _creator( + project_params: ProjectWithFilesParams, + ) -> tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]]]: + assert len(project_params.allowed_file_sizes) == len( + project_params.allowed_file_checksums ) + project = await create_project(name="random-project") + 
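+        # NOTE: uploads are only scheduled into upload_tasks here; they run
+        # concurrently at the end of this creator via limited_gather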
node_to_files_mapping: dict[NodeID, dict[SimcoreS3FileID, FileIDDict]] = {} + upload_tasks = [] + for _ in range(project_params.num_nodes): + # Create a node with outputs (files and others) + project_id = ProjectID(project["uuid"]) + node_id = cast(NodeID, faker.uuid4(cast_to=None)) + node_to_files_mapping[node_id] = {} + output3_file_name = faker.file_name() + output3_file_id = create_simcore_file_id( + project_id, node_id, output3_file_name, Path("outputs/output_3") + ) + created_node_id = await create_project_node( + ProjectID(project["uuid"]), + node_id, + outputs={ + "output_1": faker.pyint(), + "output_2": faker.pystr(), + "output_3": f"{output3_file_id}", + }, + ) + assert created_node_id == node_id + + upload_tasks.append( + _upload_one_file_task( + upload_file, + project_params.allowed_file_sizes, + project_params.allowed_file_checksums, + file_name=output3_file_name, + file_id=output3_file_id, + node_id=node_id, + ) + ) + + # some workspace files (these are not referenced in the file_meta_data, only as a folder) + if project_params.workspace_files_count > 0: + upload_tasks.append( + _upload_folder_task( + create_directory_with_files, + project_params.allowed_file_sizes, + dir_name="workspace", + project_id=project_id, + node_id=node_id, + workspace_file_count=project_params.workspace_files_count, + ) + ) + + # add a few random files in the node root space for good measure + for _ in range(random.randint(1, 3)): # noqa: S311 + root_file_name = faker.file_name() + root_file_id = create_simcore_file_id( + project_id, node_id, root_file_name, None + ) + upload_tasks.append( + _upload_one_file_task( + upload_file, + project_params.allowed_file_sizes, + project_params.allowed_file_checksums, + file_name=root_file_name, + file_id=root_file_id, + node_id=node_id, + ), + ) - yield directory_file_upload + # upload everything of the node + results = await limited_gather(*upload_tasks, limit=10) - await delete_directory(directory_file_upload=directory_file_upload) + for node_id, file_id_to_dict_mapping in results: + for file_id, file_dict in file_id_to_dict_mapping.items(): + node_to_files_mapping[node_id][file_id] = file_dict - return _create_context + project = await get_updated_project(sqlalchemy_async_engine, project["uuid"]) + return project, node_to_files_mapping + + return _creator @pytest.fixture async def with_random_project_with_files( random_project_with_files: Callable[ - ..., + [ProjectWithFilesParams], Awaitable[ - tuple[ - dict[str, Any], - dict[NodeID, dict[SimcoreS3FileID, FileIDDict]], - ] + tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]]] ], ], + project_params: ProjectWithFilesParams, + faker: Faker, ) -> tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]],]: - return await random_project_with_files( - file_sizes=( - TypeAdapter(ByteSize).validate_python("1Mib"), - TypeAdapter(ByteSize).validate_python("2Mib"), - TypeAdapter(ByteSize).validate_python("5Mib"), - ) - ) + return await random_project_with_files(project_params) diff --git a/services/storage/tests/unit/test_data_export.py b/services/storage/tests/unit/test_data_export.py index bb25413e948..7798621f1e8 100644 --- a/services/storage/tests/unit/test_data_export.py +++ b/services/storage/tests/unit/test_data_export.py @@ -1,7 +1,7 @@ # pylint: disable=W0621 # pylint: disable=W0613 +from collections.abc import Awaitable, Callable from pathlib import Path -from typing import Awaitable, Callable import pytest from faker import Faker diff --git 
a/services/storage/tests/unit/test_handlers_files.py b/services/storage/tests/unit/test_handlers_files.py index d6946433614..1e76da434c1 100644 --- a/services/storage/tests/unit/test_handlers_files.py +++ b/services/storage/tests/unit/test_handlers_files.py @@ -12,7 +12,6 @@ import logging import urllib.parse from collections.abc import AsyncIterator, Awaitable, Callable -from contextlib import AbstractAsyncContextManager from dataclasses import dataclass from pathlib import Path from random import choice @@ -38,7 +37,6 @@ SoftCopyBody, UploadedPart, ) -from models_library.basic_types import SHA256Str from models_library.projects import ProjectID from models_library.projects_nodes_io import LocationID, NodeID, SimcoreS3FileID from models_library.users import UserID @@ -50,7 +48,7 @@ from pytest_simcore.helpers.logging_tools import log_context from pytest_simcore.helpers.parametrizations import byte_size_ids from pytest_simcore.helpers.s3 import upload_file_part, upload_file_to_presigned_link -from pytest_simcore.helpers.storage_utils import FileIDDict +from pytest_simcore.helpers.storage_utils import FileIDDict, ProjectWithFilesParams from pytest_simcore.helpers.storage_utils_file_meta_data import ( assert_file_meta_data_in_db, ) @@ -1009,7 +1007,11 @@ async def test_download_file_from_inside_a_directory( file_size: ByteSize, location_id: int, user_id: UserID, - create_empty_directory: Callable[..., Awaitable[FileUploadSchema]], + project_id: ProjectID, + node_id: NodeID, + create_empty_directory: Callable[ + [str, ProjectID, NodeID], Awaitable[SimcoreS3FileID] + ], create_file_of_size: Callable[[ByteSize, str | None], Path], storage_s3_client: SimcoreS3API, storage_s3_bucket: S3BucketName, @@ -1020,10 +1022,7 @@ async def test_download_file_from_inside_a_directory( # upload a file inside a directory and check the download link directory_name = "a-test-dir" - directory_file_upload = await create_empty_directory(directory_name) - - assert directory_file_upload.urls[0].path - dir_path_in_s3 = directory_file_upload.urls[0].path.strip("/") + dir_path_in_s3 = await create_empty_directory(directory_name, project_id, node_id) file_name = "meta_data_entry_is_dir.file" file_to_upload_in_dir = create_file_of_size(file_size, file_name) @@ -1069,15 +1068,16 @@ async def test_download_file_the_file_is_missing_from_the_directory( client: httpx.AsyncClient, location_id: int, user_id: UserID, - create_empty_directory: Callable[..., Awaitable[FileUploadSchema]], + project_id: ProjectID, + node_id: NodeID, + create_empty_directory: Callable[ + [str, ProjectID, NodeID], Awaitable[SimcoreS3FileID] + ], ): # file_meta_data entry corresponds to a directory but file is not present in directory directory_name = "a-second-test-dir" - directory_file_upload = await create_empty_directory(directory_name) - - assert directory_file_upload.urls[0].path - dir_path_in_s3 = directory_file_upload.urls[0].path.strip("/") + dir_path_in_s3 = await create_empty_directory(directory_name, project_id, node_id) missing_s3_file_id = TypeAdapter(SimcoreS3FileID).validate_python( f"{dir_path_in_s3}/missing_inside_dir.file" @@ -1250,9 +1250,7 @@ async def _list_files_legacy( client: httpx.AsyncClient, user_id: UserID, location_id: LocationID, - directory_file_upload: FileUploadSchema, ) -> list[FileMetaDataGet]: - assert directory_file_upload.urls[0].path return await _list_files( initialized_app, client, @@ -1267,9 +1265,7 @@ async def _list_files_and_directories( client: httpx.AsyncClient, user_id: UserID, location_id: LocationID, - 
directory_file_upload: FileUploadSchema, ) -> list[FileMetaDataGet]: - assert directory_file_upload.urls[0].path return await _list_files( initialized_app, client, @@ -1312,7 +1308,7 @@ async def test_is_directory_link_forces_link_type_and_size( assert len(directory_file_upload.urls) == 1 files_and_directories: list[FileMetaDataGet] = await _list_files_and_directories( - initialized_app, client, user_id, location_id, directory_file_upload + initialized_app, client, user_id, location_id ) assert len(files_and_directories) == 1 assert files_and_directories[0].is_directory is True @@ -1348,53 +1344,59 @@ async def test_ensure_expand_dirs_defaults_true( async def test_upload_file_is_directory_and_remove_content( initialized_app: FastAPI, - create_empty_directory: Callable[..., Awaitable[FileUploadSchema]], - populate_directory: Callable[..., Awaitable[None]], + create_empty_directory: Callable[ + [str, ProjectID, NodeID], Awaitable[SimcoreS3FileID] + ], + populate_directory: Callable[ + [ByteSize, str, ProjectID, NodeID, int, int], + Awaitable[tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]], + ], delete_directory: Callable[..., Awaitable[None]], client: httpx.AsyncClient, location_id: LocationID, user_id: UserID, - faker: Faker, + project_id: ProjectID, + node_id: NodeID, ): FILE_SIZE_IN_DIR = TypeAdapter(ByteSize).validate_python("1Mib") DIR_NAME = "some-dir" SUBDIR_COUNT = 4 - FILE_COUNT = 5 + FILE_COUNT = 20 # DIRECTORY CREATION (is empty) - directory_file_upload: FileUploadSchema = await create_empty_directory( - dir_name=DIR_NAME - ) + directory_in_s3 = await create_empty_directory(DIR_NAME, project_id, node_id) files_and_directories: list[FileMetaDataGet] = await _list_files_and_directories( - initialized_app, client, user_id, location_id, directory_file_upload + initialized_app, client, user_id, location_id ) assert len(files_and_directories) == 1 list_of_files: list[FileMetaDataGet] = await _list_files_legacy( - initialized_app, client, user_id, location_id, directory_file_upload + initialized_app, client, user_id, location_id ) assert len(list_of_files) == 0 # DIRECTORY WITH CONTENT await populate_directory( - file_size_in_dir=FILE_SIZE_IN_DIR, - dir_name=DIR_NAME, - subdir_count=SUBDIR_COUNT, - file_count=FILE_COUNT, + FILE_SIZE_IN_DIR, + DIR_NAME, + project_id, + node_id, + SUBDIR_COUNT, + FILE_COUNT, ) files_and_directories: list[FileMetaDataGet] = await _list_files_and_directories( - initialized_app, client, user_id, location_id, directory_file_upload + initialized_app, client, user_id, location_id ) assert len(files_and_directories) == 1 list_of_files: list[FileMetaDataGet] = await _list_files_legacy( - initialized_app, client, user_id, location_id, directory_file_upload + initialized_app, client, user_id, location_id ) - assert len(list_of_files) == SUBDIR_COUNT * FILE_COUNT + assert len(list_of_files) == FILE_COUNT # DELETE NOT EXISTING @@ -1410,10 +1412,10 @@ async def test_upload_file_is_directory_and_remove_content( assert error is None list_of_files: list[FileMetaDataGet] = await _list_files_legacy( - initialized_app, client, user_id, location_id, directory_file_upload + initialized_app, client, user_id, location_id ) - assert len(list_of_files) == SUBDIR_COUNT * FILE_COUNT + assert len(list_of_files) == FILE_COUNT # DELETE ONE FILE FROM THE DIRECTORY @@ -1428,52 +1430,71 @@ async def test_upload_file_is_directory_and_remove_content( _, error = assert_status(response, status.HTTP_204_NO_CONTENT, None) assert error is None - list_of_files: list[FileMetaDataGet] = await 
_list_files_legacy( - initialized_app, client, user_id, location_id, directory_file_upload + list_of_files = await _list_files_legacy( + initialized_app, client, user_id, location_id ) - assert len(list_of_files) == SUBDIR_COUNT * FILE_COUNT - 1 + assert len(list_of_files) == FILE_COUNT - 1 # DIRECTORY REMOVAL - await delete_directory(directory_file_upload=directory_file_upload) + await delete_directory(directory_in_s3) - list_of_files: list[FileMetaDataGet] = await _list_files_legacy( - initialized_app, client, user_id, location_id, directory_file_upload + list_of_files = await _list_files_legacy( + initialized_app, client, user_id, location_id ) assert len(list_of_files) == 0 - files_and_directories: list[FileMetaDataGet] = await _list_files_and_directories( - initialized_app, client, user_id, location_id, directory_file_upload + files_and_directories = await _list_files_and_directories( + initialized_app, client, user_id, location_id ) assert len(files_and_directories) == 0 -@pytest.mark.parametrize("files_in_dir", [1002]) +@pytest.mark.parametrize("files_count", [1002]) async def test_listing_more_than_1000_objects_in_bucket( create_directory_with_files: Callable[ - ..., AbstractAsyncContextManager[FileUploadSchema] + [str, ByteSize, int, int, ProjectID, NodeID], + Awaitable[ + tuple[SimcoreS3FileID, tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]] + ], ], initialized_app: FastAPI, client: httpx.AsyncClient, location_id: LocationID, user_id: UserID, - files_in_dir: int, + project_id: ProjectID, + node_id: NodeID, + files_count: int, ): - async with create_directory_with_files( - dir_name="some-random", - file_size_in_dir=TypeAdapter(ByteSize).validate_python("1"), - subdir_count=1, - file_count=files_in_dir, - ) as directory_file_upload: - list_of_files: list[FileMetaDataGet] = await _list_files_legacy( - initialized_app, client, user_id, location_id, directory_file_upload - ) - # for now no more than 1000 objects will be returned - assert len(list_of_files) == 1000 + SUBDIR_COUNT = 1 + await create_directory_with_files( + "random-directory", + TypeAdapter(ByteSize).validate_python("1"), + SUBDIR_COUNT, + files_count, + project_id, + node_id, + ) + list_of_files = await _list_files_legacy( + initialized_app, client, user_id, location_id + ) + # for now no more than 1000 objects will be returned + assert len(list_of_files) == 1000 @pytest.mark.parametrize("uuid_filter", [True, False]) +@pytest.mark.parametrize( + "project_params", + [ + ProjectWithFilesParams( + num_nodes=1, + allowed_file_sizes=(TypeAdapter(ByteSize).validate_python("1b"),), + workspace_files_count=0, + ), + ], + ids=str, +) async def test_listing_with_project_id_filter( initialized_app: FastAPI, client: httpx.AsyncClient, @@ -1481,26 +1502,16 @@ async def test_listing_with_project_id_filter( user_id: UserID, faker: Faker, random_project_with_files: Callable[ - [int, tuple[ByteSize, ...]], + [ProjectWithFilesParams], Awaitable[ - tuple[ - dict[str, Any], - dict[NodeID, dict[SimcoreS3FileID, FileIDDict]], - ] + tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]]] ], ], uuid_filter: bool, + project_params: ProjectWithFilesParams, ): - src_project, src_projects_list = await random_project_with_files( - num_nodes=1, - file_sizes=(ByteSize(1),), - file_checksums=(TypeAdapter(SHA256Str).validate_python(faker.sha256()),), - ) - _, _ = await random_project_with_files( - num_nodes=1, - file_sizes=(ByteSize(1),), - file_checksums=(TypeAdapter(SHA256Str).validate_python(faker.sha256()),), - ) + src_project, 
src_projects_list = await random_project_with_files(project_params) + _, _ = await random_project_with_files(project_params) assert len(src_projects_list.keys()) > 0 node_id = next(iter(src_projects_list.keys())) project_files_in_db = set(src_projects_list[node_id]) diff --git a/services/storage/tests/unit/test_handlers_paths.py b/services/storage/tests/unit/test_handlers_paths.py new file mode 100644 index 00000000000..5e8640b688c --- /dev/null +++ b/services/storage/tests/unit/test_handlers_paths.py @@ -0,0 +1,363 @@ +# pylint:disable=no-name-in-module +# pylint:disable=protected-access +# pylint:disable=redefined-outer-name +# pylint:disable=too-many-arguments +# pylint:disable=too-many-positional-arguments +# pylint:disable=unused-argument +# pylint:disable=unused-variable + + +import random +from collections.abc import Awaitable, Callable +from pathlib import Path +from typing import Any, TypeAlias + +import httpx +import pytest +from faker import Faker +from fastapi import FastAPI, status +from fastapi_pagination.cursor import CursorPage +from models_library.api_schemas_storage.storage_schemas import PathMetaDataGet +from models_library.projects_nodes_io import LocationID, NodeID, SimcoreS3FileID +from models_library.users import UserID +from pydantic import ByteSize, TypeAdapter +from pytest_simcore.helpers.fastapi import url_from_operation_id +from pytest_simcore.helpers.httpx_assert_checks import assert_status +from pytest_simcore.helpers.storage_utils import FileIDDict, ProjectWithFilesParams + +pytest_simcore_core_services_selection = ["postgres"] +pytest_simcore_ops_services_selection = ["adminer"] + +_IsFile: TypeAlias = bool + + +def _filter_and_group_paths_one_level_deeper( + paths: list[Path], prefix: Path +) -> list[tuple[Path, _IsFile]]: + relative_paths = (path for path in paths if path.is_relative_to(prefix)) + return sorted( + { + ( + (path, len(path.relative_to(prefix).parts) == 1) + if len(path.relative_to(prefix).parts) == 1 + else (prefix / path.relative_to(prefix).parts[0], False) + ) + for path in relative_paths + }, + key=lambda x: x[0], + ) + + +async def _assert_list_paths( + initialized_app: FastAPI, + client: httpx.AsyncClient, + location_id: LocationID, + user_id: UserID, + *, + file_filter: Path | None, + limit: int = 25, + expected_paths: list[tuple[Path, _IsFile]], + check_total: bool = True, +) -> CursorPage[PathMetaDataGet]: + offset = 0 + total_expected = len(expected_paths) + next_cursor = 0 # NOTE: this will initialize + total_received = 0 + while next_cursor is not None: + url = url_from_operation_id( + client, initialized_app, "list_paths", location_id=f"{location_id}" + ).with_query( + user_id=user_id, + size=limit, + ) + if next_cursor: + url = url.update_query(cursor=next_cursor) + + if file_filter is not None: + url = url.update_query(file_filter=f"{file_filter}") + response = await client.get(f"{url}") + + page_of_files, _ = assert_status( + response, + status.HTTP_200_OK, + CursorPage[PathMetaDataGet], + expect_envelope=False, + ) + assert page_of_files + assert len(page_of_files.items) == min(limit, total_expected - offset) + + for (expected_path, is_file), received_path in zip( + expected_paths[offset : offset + limit], page_of_files.items, strict=True + ): + assert received_path.path == expected_path + if is_file: + assert received_path.file_meta_data is not None + else: + assert received_path.file_meta_data is None + + if check_total: + assert page_of_files.total == total_expected + else: + assert page_of_files.total is None + 
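+        # walk the cursor chain; next_page is None once the last page has been fetched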
next_cursor = page_of_files.next_page + total_received += len(page_of_files.items) + offset += limit + assert total_received == total_expected + assert page_of_files.next_page is None + return page_of_files + + +async def test_list_paths_root_folder_of_empty_returns_nothing( + initialized_app: FastAPI, + client: httpx.AsyncClient, + location_id: LocationID, + user_id: UserID, +): + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=None, + expected_paths=[], + ) + + +@pytest.mark.parametrize( + "project_params", + [ + ProjectWithFilesParams( + num_nodes=10, + allowed_file_sizes=(TypeAdapter(ByteSize).validate_python("1b"),), + workspace_files_count=10, + ) + ], + ids=str, +) +async def test_list_paths_pagination( + initialized_app: FastAPI, + client: httpx.AsyncClient, + location_id: LocationID, + user_id: UserID, + with_random_project_with_files: tuple[ + dict[str, Any], + dict[NodeID, dict[SimcoreS3FileID, FileIDDict]], + ], + faker: Faker, +): + project, list_of_files = with_random_project_with_files + num_nodes = len(list(project["workbench"])) + + # ls the nodes (DB-based) + file_filter = Path(project["uuid"]) + expected_paths = sorted( + ((file_filter / node_key, False) for node_key in project["workbench"]), + key=lambda x: x[0], + ) + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=file_filter, + expected_paths=expected_paths, + limit=int(num_nodes / 2 + 0.5), + ) + + # ls in the workspace (S3-based) + # ls in the workspace + selected_node_id = NodeID(random.choice(list(project["workbench"]))) # noqa: S311 + selected_node_s3_keys = [ + Path(s3_object_id) for s3_object_id in list_of_files[selected_node_id] + ] + workspace_file_filter = file_filter / f"{selected_node_id}" / "workspace" + expected_paths = _filter_and_group_paths_one_level_deeper( + selected_node_s3_keys, workspace_file_filter + ) + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=workspace_file_filter, + expected_paths=expected_paths, + limit=1, + check_total=False, + ) + # ls in until we get to some files + while selected_subfolders := [p for p in expected_paths if p[1] is False]: + selected_path_filter = random.choice(selected_subfolders) # noqa: S311 + expected_paths = _filter_and_group_paths_one_level_deeper( + selected_node_s3_keys, selected_path_filter[0] + ) + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=selected_path_filter[0], + expected_paths=expected_paths, + check_total=False, + ) + + +@pytest.mark.parametrize( + "project_params, num_projects", + [ + ( + ProjectWithFilesParams( + num_nodes=3, + allowed_file_sizes=(TypeAdapter(ByteSize).validate_python("1b"),), + workspace_files_count=10, + ), + 3, + ) + ], + ids=str, +) +async def test_list_paths( + initialized_app: FastAPI, + client: httpx.AsyncClient, + location_id: LocationID, + user_id: UserID, + random_project_with_files: Callable[ + [ProjectWithFilesParams], + Awaitable[ + tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]]] + ], + ], + project_params: ProjectWithFilesParams, + num_projects: int, +): + project_to_files_mapping = [ + await random_project_with_files(project_params) for _ in range(num_projects) + ] + project_to_files_mapping.sort(key=lambda x: x[0]["uuid"]) + + # ls root returns our projects + expected_paths = sorted( + ((Path(f"{prj_db['uuid']}"), False) for prj_db, _ in project_to_files_mapping), + key=lambda x: x[0], + ) + await 
_assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=None, + expected_paths=expected_paths, + ) + + # ls with only some part of the path should return only the projects that match + selected_project, selected_project_files = random.choice( # noqa: S311 + project_to_files_mapping + ) + partial_file_filter = Path( + selected_project["uuid"][: len(selected_project["uuid"]) // 2] + ) + partial_expected_paths = [ + p for p in expected_paths if f"{p[0]}".startswith(f"{partial_file_filter}") + ] + + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=partial_file_filter, + expected_paths=partial_expected_paths, + ) + + # now we ls inside one of the projects returns the nodes + file_filter = Path(selected_project["uuid"]) + expected_paths = sorted( + ((file_filter / node_key, False) for node_key in selected_project["workbench"]), + key=lambda x: x[0], + ) + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=file_filter, + expected_paths=expected_paths, + ) + + # now we ls in one of the nodes + selected_node_id = NodeID( + random.choice(list(selected_project["workbench"])) # noqa: S311 + ) + selected_node_s3_keys = [ + Path(s3_object_id) for s3_object_id in selected_project_files[selected_node_id] + ] + file_filter = file_filter / f"{selected_node_id}" + expected_node_files = _filter_and_group_paths_one_level_deeper( + selected_node_s3_keys, + file_filter, + ) + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=file_filter, + expected_paths=expected_node_files, + ) + + # ls in the outputs will list 1 entry which is a folder + node_outputs_file_filter = file_filter / "outputs" + expected_paths = _filter_and_group_paths_one_level_deeper( + selected_node_s3_keys, node_outputs_file_filter + ) + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=node_outputs_file_filter, + expected_paths=expected_paths, + ) + + # ls in output_3 shall reveal the file + node_outputs_file_filter = file_filter / "outputs" / "output_3" + expected_paths = _filter_and_group_paths_one_level_deeper( + selected_node_s3_keys, node_outputs_file_filter + ) + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=node_outputs_file_filter, + expected_paths=expected_paths, + ) + + # ls in the workspace + workspace_file_filter = file_filter / "workspace" + expected_paths = _filter_and_group_paths_one_level_deeper( + selected_node_s3_keys, workspace_file_filter + ) + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=workspace_file_filter, + expected_paths=expected_paths, + check_total=False, + ) + # ls in until we get to some files + while selected_subfolders := [p for p in expected_paths if p[1] is False]: + selected_path_filter = random.choice(selected_subfolders) # noqa: S311 + expected_paths = _filter_and_group_paths_one_level_deeper( + selected_node_s3_keys, selected_path_filter[0] + ) + await _assert_list_paths( + initialized_app, + client, + location_id, + user_id, + file_filter=selected_path_filter[0], + expected_paths=expected_paths, + check_total=False, + ) diff --git a/services/storage/tests/unit/test_handlers_simcore_s3.py b/services/storage/tests/unit/test_handlers_simcore_s3.py index 0ddc39f206d..8449e5ee7c7 100644 --- a/services/storage/tests/unit/test_handlers_simcore_s3.py +++ 
b/services/storage/tests/unit/test_handlers_simcore_s3.py @@ -32,7 +32,11 @@ from pytest_simcore.helpers.fastapi import url_from_operation_id from pytest_simcore.helpers.httpx_assert_checks import assert_status from pytest_simcore.helpers.logging_tools import log_context -from pytest_simcore.helpers.storage_utils import FileIDDict, get_updated_project +from pytest_simcore.helpers.storage_utils import ( + FileIDDict, + ProjectWithFilesParams, + get_updated_project, +) from pytest_simcore.helpers.storage_utils_file_meta_data import ( assert_file_meta_data_in_db, ) @@ -205,6 +209,22 @@ def short_dsm_cleaner_interval(monkeypatch: pytest.MonkeyPatch) -> int: return 1 +@pytest.mark.parametrize( + "project_params", + [ + ProjectWithFilesParams( + num_nodes=1, + allowed_file_sizes=(TypeAdapter(ByteSize).validate_python("210Mib"),), + allowed_file_checksums=( + TypeAdapter(SHA256Str).validate_python( + "0b3216d95ec5a36c120ba16c88911dcf5ff655925d0fbdbc74cf95baf86de6fc" + ), + ), + workspace_files_count=0, + ), + ], + ids=str, +) async def test_copy_folders_from_valid_project_with_one_large_file( initialized_app: FastAPI, short_dsm_cleaner_interval: int, @@ -213,24 +233,15 @@ async def test_copy_folders_from_valid_project_with_one_large_file( create_project: Callable[[], Awaitable[dict[str, Any]]], sqlalchemy_async_engine: AsyncEngine, random_project_with_files: Callable[ - [int, tuple[ByteSize], tuple[SHA256Str]], + [ProjectWithFilesParams], Awaitable[ - tuple[ - dict[str, Any], - dict[NodeID, dict[SimcoreS3FileID, FileIDDict]], - ] + tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]]] ], ], + project_params: ProjectWithFilesParams, ): # 1. create a src project with 1 large file - sha256_checksum: SHA256Str = TypeAdapter(SHA256Str).validate_python( - "0b3216d95ec5a36c120ba16c88911dcf5ff655925d0fbdbc74cf95baf86de6fc" - ) - src_project, src_projects_list = await random_project_with_files( - 1, - (TypeAdapter(ByteSize).validate_python("210Mib"),), - (sha256_checksum,), - ) + src_project, src_projects_list = await random_project_with_files(project_params) # 2. 
create a dst project without files dst_project, nodes_map = clone_project_data(src_project) dst_project = await create_project(**dst_project) @@ -274,6 +285,32 @@ async def test_copy_folders_from_valid_project_with_one_large_file( ) +@pytest.mark.parametrize( + "project_params", + [ + ProjectWithFilesParams( + num_nodes=12, + allowed_file_sizes=( + TypeAdapter(ByteSize).validate_python("7Mib"), + TypeAdapter(ByteSize).validate_python("110Mib"), + TypeAdapter(ByteSize).validate_python("1Mib"), + ), + allowed_file_checksums=( + TypeAdapter(SHA256Str).validate_python( + "311e2e130d83cfea9c3b7560699c221b0b7f9e5d58b02870bd52b695d8b4aabd" + ), + TypeAdapter(SHA256Str).validate_python( + "08e297db979d3c84f6b072c2a1e269e8aa04e82714ca7b295933a0c9c0f62b2e" + ), + TypeAdapter(SHA256Str).validate_python( + "488f3b57932803bbf644593bd46d95599b1d4da1d63bc020d7ebe6f1c255f7f3" + ), + ), + workspace_files_count=0, + ), + ], + ids=str, +) async def test_copy_folders_from_valid_project( short_dsm_cleaner_interval: int, initialized_app: FastAPI, @@ -283,17 +320,15 @@ async def test_copy_folders_from_valid_project( create_simcore_file_id: Callable[[ProjectID, NodeID, str], SimcoreS3FileID], sqlalchemy_async_engine: AsyncEngine, random_project_with_files: Callable[ - ..., + [ProjectWithFilesParams], Awaitable[ - tuple[ - dict[str, Any], - dict[NodeID, dict[SimcoreS3FileID, FileIDDict]], - ] + tuple[dict[str, Any], dict[NodeID, dict[SimcoreS3FileID, FileIDDict]]] ], ], + project_params: ProjectWithFilesParams, ): # 1. create a src project with some files - src_project, src_projects_list = await random_project_with_files() + src_project, src_projects_list = await random_project_with_files(project_params) # 2. create a dst project without files dst_project, nodes_map = clone_project_data(src_project) dst_project = await create_project(**dst_project) @@ -426,6 +461,20 @@ async def test_connect_to_external( print(data) +@pytest.mark.parametrize( + "project_params", + [ + ProjectWithFilesParams( + num_nodes=3, + allowed_file_sizes=( + TypeAdapter(ByteSize).validate_python("7Mib"), + TypeAdapter(ByteSize).validate_python("110Mib"), + TypeAdapter(ByteSize).validate_python("1Mib"), + ), + workspace_files_count=0, + ) + ], +) async def test_create_and_delete_folders_from_project( set_log_levels_for_noisy_libraries: None, initialized_app: FastAPI, @@ -449,6 +498,20 @@ async def test_create_and_delete_folders_from_project( ) +@pytest.mark.parametrize( + "project_params", + [ + ProjectWithFilesParams( + num_nodes=3, + allowed_file_sizes=( + TypeAdapter(ByteSize).validate_python("7Mib"), + TypeAdapter(ByteSize).validate_python("110Mib"), + TypeAdapter(ByteSize).validate_python("1Mib"), + ), + workspace_files_count=0, + ) + ], +) @pytest.mark.parametrize("num_concurrent_calls", [50]) async def test_create_and_delete_folders_from_project_burst( set_log_levels_for_noisy_libraries: None, diff --git a/services/storage/tests/unit/test_simcore_s3_dsm.py b/services/storage/tests/unit/test_simcore_s3_dsm.py index 92f3a9751bb..50b664199d3 100644 --- a/services/storage/tests/unit/test_simcore_s3_dsm.py +++ b/services/storage/tests/unit/test_simcore_s3_dsm.py @@ -2,16 +2,16 @@ # pylint:disable=redefined-outer-name from collections.abc import Awaitable, Callable -from contextlib import AbstractAsyncContextManager from pathlib import Path import pytest from faker import Faker -from models_library.api_schemas_storage.storage_schemas import FileUploadSchema from models_library.basic_types import SHA256Str -from 
models_library.projects_nodes_io import SimcoreS3FileID +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID, SimcoreS3FileID from models_library.users import UserID from pydantic import ByteSize, TypeAdapter +from pytest_simcore.helpers.storage_utils import FileIDDict from simcore_service_storage.models import FileMetaData from simcore_service_storage.modules.db import file_meta_data from simcore_service_storage.modules.s3 import get_s3_client @@ -38,11 +38,16 @@ def copy_transfer_cb(total_bytes_copied: int, *, file_name: str) -> None: async def test__copy_path_s3_s3( simcore_s3_dsm: SimcoreS3DataManager, create_directory_with_files: Callable[ - ..., AbstractAsyncContextManager[FileUploadSchema] + [str, ByteSize, int, int, ProjectID, NodeID], + Awaitable[ + tuple[SimcoreS3FileID, tuple[NodeID, dict[SimcoreS3FileID, FileIDDict]]] + ], ], upload_file: Callable[[ByteSize, str], Awaitable[tuple[Path, SimcoreS3FileID]]], file_size: ByteSize, user_id: UserID, + project_id: ProjectID, + node_id: NodeID, mock_copy_transfer_cb: Callable[..., None], sqlalchemy_async_engine: AsyncEngine, ): @@ -74,24 +79,23 @@ async def _count_files(s3_file_id: SimcoreS3FileID, expected_count: int) -> None # using directory - FILE_COUNT = 4 + FILE_COUNT = 20 SUBDIR_COUNT = 5 - async with create_directory_with_files( + s3_object, _ = await create_directory_with_files( dir_name="some-random", file_size_in_dir=file_size, subdir_count=SUBDIR_COUNT, file_count=FILE_COUNT, - ) as directory_file_upload: - assert len(directory_file_upload.urls) == 1 - assert directory_file_upload.urls[0].path - s3_object = directory_file_upload.urls[0].path.lstrip("/") + project_id=project_id, + node_id=node_id, + ) - s3_file_id_dir_src = TypeAdapter(SimcoreS3FileID).validate_python(s3_object) - s3_file_id_dir_dst = _get_dest_file_id(s3_file_id_dir_src) + s3_file_id_dir_src = TypeAdapter(SimcoreS3FileID).validate_python(s3_object) + s3_file_id_dir_dst = _get_dest_file_id(s3_file_id_dir_src) - await _count_files(s3_file_id_dir_dst, expected_count=0) - await _copy_s3_path(s3_file_id_dir_src) - await _count_files(s3_file_id_dir_dst, expected_count=FILE_COUNT * SUBDIR_COUNT) + await _count_files(s3_file_id_dir_dst, expected_count=0) + await _copy_s3_path(s3_file_id_dir_src) + await _count_files(s3_file_id_dir_dst, expected_count=FILE_COUNT) # using a single file diff --git a/services/web/server/VERSION b/services/web/server/VERSION index cb6b534abe1..7e750b4ebf3 100644 --- a/services/web/server/VERSION +++ b/services/web/server/VERSION @@ -1 +1 @@ -0.59.0 +0.60.0 diff --git a/services/web/server/requirements/_test.in b/services/web/server/requirements/_test.in index 6c0fb8f4cc6..368a9d03f7d 100644 --- a/services/web/server/requirements/_test.in +++ b/services/web/server/requirements/_test.in @@ -16,6 +16,7 @@ coverage docker Faker fastapi[standard] +fastapi-pagination flaky hypothesis jsonref diff --git a/services/web/server/requirements/_test.txt b/services/web/server/requirements/_test.txt index a9e9d7abac3..1d75bc28c13 100644 --- a/services/web/server/requirements/_test.txt +++ b/services/web/server/requirements/_test.txt @@ -14,10 +14,16 @@ alembic==1.8.1 # via # -c requirements/_base.txt # -r requirements/_test.in +annotated-types==0.7.0 + # via + # -c requirements/_base.txt + # pydantic anyio==4.3.0 # via # -c requirements/_base.txt # httpx + # starlette + # watchfiles async-timeout==4.0.3 # via # -c requirements/_base.txt @@ -77,6 +83,8 @@ fastapi==0.115.6 # via -r requirements/_test.in 
fastapi-cli==0.0.5 # via fastapi +fastapi-pagination==0.12.34 + # via -r requirements/_test.in flaky==3.8.1 # via -r requirements/_test.in frozenlist==1.4.1 @@ -92,14 +100,18 @@ h11==0.14.0 # via # -c requirements/_base.txt # httpcore + # uvicorn httpcore==1.0.7 # via # -c requirements/_base.txt # httpx +httptools==0.6.4 + # via uvicorn httpx==0.28.1 # via # -c requirements/../../../../requirements/constraints.txt # -c requirements/_base.txt + # fastapi # respx hypothesis==6.91.0 # via -r requirements/_test.in @@ -109,6 +121,7 @@ idna==3.3 # via # -c requirements/_base.txt # anyio + # email-validator # httpx # requests # yarl @@ -175,6 +188,7 @@ pydantic==2.10.2 # -c requirements/../../../../requirements/constraints.txt # -c requirements/_base.txt # fastapi + # fastapi-pagination pydantic-core==2.27.1 # via # -c requirements/_base.txt @@ -253,6 +267,10 @@ requests==2.32.2 # docker respx==0.22.0 # via -r requirements/_test.in +rich==13.4.2 + # via + # -c requirements/_base.txt + # typer setuptools==69.1.1 # via # -c requirements/_base.txt @@ -310,6 +328,7 @@ typing-extensions==4.12.2 # -c requirements/_base.txt # asyncpg-stubs # fastapi + # fastapi-pagination # mypy # pydantic # pydantic-core diff --git a/services/web/server/setup.cfg b/services/web/server/setup.cfg index 8300bb68d43..6012cf501a1 100644 --- a/services/web/server/setup.cfg +++ b/services/web/server/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.59.0 +current_version = 0.60.0 commit = True message = services/webserver api version: {current_version} → {new_version} tag = False @@ -12,13 +12,13 @@ commit_args = --no-verify [tool:pytest] addopts = --strict-markers asyncio_mode = auto -markers = +markers = slow: marks tests as slow (deselect with '-m "not slow"') acceptance_test: "marks tests as 'acceptance tests' i.e. does the system do what the user expects? Typically those are workflows." 
testit: "marks test to run during development" heavy_load: "mark tests that require large amount of data" [mypy] -plugins = +plugins = pydantic.mypy sqlalchemy.ext.mypy.plugin diff --git a/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml b/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml index f84eec0effe..41cee8c4ba8 100644 --- a/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml +++ b/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml @@ -2,7 +2,7 @@ openapi: 3.1.0 info: title: simcore-service-webserver description: Main service with an interface (http-API & websockets) to the web front-end - version: 0.59.0 + version: 0.60.0 servers: - url: '' description: webserver @@ -6024,6 +6024,54 @@ paths: $ref: '#/components/schemas/FileLocation' type: array title: Response List Storage Locations + /v0/storage/locations/{location_id}/paths: + get: + tags: + - storage + summary: List Storage Paths + description: Lists the files/directories in WorkingDirectory + operationId: list_storage_paths + parameters: + - name: location_id + in: path + required: true + schema: + type: integer + title: Location Id + - name: size + in: query + required: false + schema: + type: integer + minimum: 1 + exclusiveMaximum: true + default: 20 + title: Size + maximum: 50 + - name: cursor + in: query + required: false + schema: + anyOf: + - type: string + - type: 'null' + title: Cursor + - name: fileFilter + in: query + required: false + schema: + anyOf: + - type: string + format: path + - type: 'null' + title: Filefilter + responses: + '200': + description: Successful Response + content: + application/json: + schema: + $ref: '#/components/schemas/CursorPage_FileMetaDataGet_' /v0/storage/locations/{location_id}/datasets: get: tags: @@ -8362,6 +8410,47 @@ components: required: - priceDollars title: CreateWalletPayment + CursorPage_FileMetaDataGet_: + properties: + items: + items: + $ref: '#/components/schemas/FileMetaDataGet' + type: array + title: Items + total: + anyOf: + - type: integer + - type: 'null' + title: Total + description: Total items + current_page: + anyOf: + - type: string + - type: 'null' + title: Current Page + description: Cursor to refetch the current page + current_page_backwards: + anyOf: + - type: string + - type: 'null' + title: Current Page Backwards + description: Cursor to refetch the current page starting from the last item + previous_page: + anyOf: + - type: string + - type: 'null' + title: Previous Page + description: Cursor for the previous page + next_page: + anyOf: + - type: string + - type: 'null' + title: Next Page + description: Cursor for the next page + type: object + required: + - items + title: CursorPage[FileMetaDataGet] DatCoreFileLink: properties: store: diff --git a/services/web/server/src/simcore_service_webserver/storage/_rest.py b/services/web/server/src/simcore_service_webserver/storage/_rest.py index 5dc1fb227fd..d65a15fefc6 100644 --- a/services/web/server/src/simcore_service_webserver/storage/_rest.py +++ b/services/web/server/src/simcore_service_webserver/storage/_rest.py @@ -25,6 +25,7 @@ StorageAsyncJobStatus, ) from models_library.projects_nodes_io import LocationID +from models_library.utils.change_case import camel_to_snake from models_library.utils.fastapi_encoders import jsonable_encoder from pydantic import AnyUrl, BaseModel, ByteSize, TypeAdapter from servicelib.aiohttp import status @@ -45,12 +46,12 @@ ) from servicelib.request_keys import RQT_USERID_KEY from 
servicelib.rest_responses import unwrap_envelope -from simcore_service_webserver.rabbitmq import get_rabbitmq_rpc_client from yarl import URL from .._meta import API_VTAG from ..login.decorators import login_required from ..models import RequestContext +from ..rabbitmq import get_rabbitmq_rpc_client from ..security.decorators import permission_required from ._exception_handlers import handle_data_export_exceptions from .schemas import StorageFileIDStr @@ -91,7 +92,7 @@ def _to_storage_url(request: web.Request) -> URL: return ( url.joinpath(fastapi_encoded_suffix, encoded=True) - .with_query(request.query) + .with_query({camel_to_snake(k): v for k, v in request.query.items()}) .update_query(user_id=userid) ) @@ -123,7 +124,10 @@ class _ResponseTuple(NamedTuple): async def _forward_request_to_storage( - request: web.Request, method: str, body: dict[str, Any] | None = None, **kwargs + request: web.Request, + method: str, + body: dict[str, Any] | None = None, + **kwargs, ) -> _ResponseTuple: url = _to_storage_url(request) session = get_client_session(request.app) @@ -131,10 +135,18 @@ async def _forward_request_to_storage( async with session.request( method.upper(), url, ssl=False, json=body, **kwargs ) as resp: - if resp.status >= status.HTTP_400_BAD_REQUEST: - raise web.HTTPException(reason=await resp.text()) - payload = await resp.json() - return _ResponseTuple(payload=payload, status_code=resp.status) + match resp.status: + case status.HTTP_422_UNPROCESSABLE_ENTITY: + raise web.HTTPUnprocessableEntity( + reason=await resp.text(), content_type=resp.content_type + ) + case status.HTTP_404_NOT_FOUND: + raise web.HTTPNotFound(reason=await resp.text()) + case _ if resp.status >= status.HTTP_400_BAD_REQUEST: + raise web.HTTPError(reason=await resp.text()) + case _: + payload = await resp.json() + return _ResponseTuple(payload=payload, status_code=resp.status) # --------------------------------------------------------------------- @@ -152,6 +164,16 @@ async def list_storage_locations(request: web.Request) -> web.Response: return create_data_response(payload, status=resp_status) +@routes.get( + f"{_storage_locations_prefix}/{{location_id}}/paths", name="list_storage_paths" +) +@login_required +@permission_required("storage.files.*") +async def list_paths(request: web.Request) -> web.Response: + payload, resp_status = await _forward_request_to_storage(request, "GET", body=None) + return create_data_response(payload, status=resp_status) + + @routes.get( _storage_locations_prefix + "/{location_id}/datasets", name="list_datasets_metadata" ) @@ -423,7 +445,6 @@ class _PathParams(BaseModel): @permission_required("storage.files.*") @handle_data_export_exceptions async def get_async_jobs(request: web.Request) -> web.Response: - _req_ctx = RequestContext.model_validate(request) rabbitmq_rpc_client = get_rabbitmq_rpc_client(request.app) @@ -449,7 +470,6 @@ async def get_async_jobs(request: web.Request) -> web.Response: @permission_required("storage.files.*") @handle_data_export_exceptions async def get_async_job_status(request: web.Request) -> web.Response: - _req_ctx = RequestContext.model_validate(request) rabbitmq_rpc_client = get_rabbitmq_rpc_client(request.app) @@ -503,7 +523,6 @@ async def abort_async_job(request: web.Request) -> web.Response: @permission_required("storage.files.*") @handle_data_export_exceptions async def get_async_job_result(request: web.Request) -> web.Response: - _req_ctx = RequestContext.model_validate(request) rabbitmq_rpc_client = get_rabbitmq_rpc_client(request.app) diff --git 
a/services/web/server/tests/unit/with_dbs/01/storage/conftest.py b/services/web/server/tests/unit/with_dbs/01/storage/conftest.py new file mode 100644 index 00000000000..d3bf9a09ed5 --- /dev/null +++ b/services/web/server/tests/unit/with_dbs/01/storage/conftest.py @@ -0,0 +1,313 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable +# pylint: disable=too-many-arguments + +import logging +import random +from collections.abc import Iterator +from pathlib import Path +from threading import Thread +from typing import Annotated + +import pytest +import uvicorn +from faker import Faker +from fastapi import APIRouter, Depends, FastAPI, Request, status +from fastapi_pagination import add_pagination, create_page +from fastapi_pagination.cursor import CursorPage, CursorParams +from models_library.api_schemas_storage.storage_schemas import ( + DatasetMetaDataGet, + FileLocation, + FileMetaDataGet, + FileMetaDataGetv010, + FileUploadCompleteResponse, + FileUploadCompletionBody, + FileUploadSchema, + LinkType, + PathMetaDataGet, +) +from models_library.generics import Envelope +from models_library.projects import ProjectID +from models_library.projects_nodes_io import LocationID, StorageFileID +from models_library.users import UserID +from pydantic import AnyUrl, TypeAdapter +from pytest_simcore.helpers.logging_tools import log_context +from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict +from servicelib.utils import unused_port +from yarl import URL + + +@pytest.fixture(scope="session") +def storage_vtag() -> str: + return "v9" + + +@pytest.fixture(scope="module") +def fake_storage_app(storage_vtag: str) -> FastAPI: # noqa: C901 + app = FastAPI(debug=True) + add_pagination(app) + + router = APIRouter( + prefix=f"/{storage_vtag}", + ) + + @router.get("/") + async def _root(request: Request): + return {"message": "Hello World"} + + @router.get( + "/locations", + status_code=status.HTTP_200_OK, + response_model=Envelope[list[FileLocation]], + ) + async def _list_storage_locations(user_id: UserID, request: Request): + assert "json_schema_extra" in FileLocation.model_config + + return Envelope[list[FileLocation]]( + data=[ + FileLocation.model_validate(e) + for e in FileLocation.model_json_schema()["examples"] + ] + ) + + @router.get( + "/locations/{location_id}/paths", + response_model=CursorPage[PathMetaDataGet], + ) + async def _list_paths( + page_params: Annotated[CursorParams, Depends()], + # dsm: Annotated[BaseDataManager, Depends(get_data_manager)], + user_id: UserID, + file_filter: Path | None = None, + ): + assert user_id + assert "json_schema_extra" in PathMetaDataGet.model_config + + example_index = len(file_filter.parts) if file_filter else 0 + assert example_index < len( + PathMetaDataGet.model_json_schema()["examples"] + ), "fake server unable to server this example" + chosen_example = PathMetaDataGet.model_json_schema()["examples"][example_index] + + return create_page( + random.randint(3, 15) + * [PathMetaDataGet.model_validate(chosen_example)], # noqa: S311 + params=page_params, + next_=None, + ) + + @router.get( + "/locations/{location_id}/files/metadata", + response_model=Envelope[list[FileMetaDataGet]], + ) + async def _list_files_metadata( + user_id: UserID, + request: Request, + uuid_filter: str = "", + project_id: ProjectID | None = None, + expand_dirs: bool = True, + ): + assert "json_schema_extra" in FileMetaDataGet.model_config + + if uuid_filter: + return Envelope[list[FileMetaDataGet]]( + 
data=random.sample( + [ + FileMetaDataGet.model_validate(e) + for e in FileMetaDataGet.model_json_schema()["examples"] + ], + 2, + ) + ) + return Envelope[list[FileMetaDataGet]]( + data=[ + FileMetaDataGet.model_validate(e) + for e in FileMetaDataGet.model_json_schema()["examples"] + ] + ) + + @router.get( + "/locations/{location_id}/files/{file_id:path}/metadata", + response_model=Envelope[FileMetaDataGet] + | Envelope[FileMetaDataGetv010] + | Envelope[dict], + ) + async def _get_file_metadata(user_id: UserID, request: Request): + assert "json_schema_extra" in FileMetaDataGet.model_config + + return Envelope[FileMetaDataGet]( + data=random.choice( # noqa: S311 + [ + FileMetaDataGet.model_validate(e) + for e in FileMetaDataGet.model_json_schema()["examples"] + ] + ) + ) + + @router.get( + "/locations/{location_id}/datasets", + response_model=Envelope[list[DatasetMetaDataGet]], + ) + async def _list_datasets_metadata(user_id: UserID, request: Request): + assert "json_schema_extra" in DatasetMetaDataGet.model_config + + return Envelope[list[DatasetMetaDataGet]]( + data=[ + DatasetMetaDataGet.model_validate(e) + for e in DatasetMetaDataGet.model_json_schema()["examples"] + ] + ) + + @router.get( + "/locations/{location_id}/datasets/{dataset_id}/metadata", + response_model=Envelope[list[FileMetaDataGet]], + ) + async def _list_dataset_files_metadata(user_id: UserID, request: Request): + assert "json_schema_extra" in FileMetaDataGet.model_config + + return Envelope[list[FileMetaDataGet]]( + data=[ + FileMetaDataGet.model_validate(e) + for e in FileMetaDataGet.model_json_schema()["examples"] + ] + ) + + @router.put( + "/locations/{location_id}/files/{file_id:path}", + response_model=Envelope[FileUploadSchema], + ) + async def upload_file( + user_id: UserID, + location_id: LocationID, + file_id: StorageFileID, + request: Request, + link_type: LinkType = LinkType.PRESIGNED, + ): + assert "json_schema_extra" in FileUploadSchema.model_config + + abort_url = ( + URL(f"{request.url}") + .with_path( + request.app.url_path_for( + "abort_upload_file", + location_id=f"{location_id}", + file_id=file_id, + ) + ) + .with_query(user_id=user_id) + ) + + complete_url = ( + URL(f"{request.url}") + .with_path( + request.app.url_path_for( + "complete_upload_file", + location_id=f"{location_id}", + file_id=file_id, + ) + ) + .with_query(user_id=user_id) + ) + response = FileUploadSchema.model_validate( + random.choice( # noqa: S311 + FileUploadSchema.model_json_schema()["examples"] + ) + ) + response.links.abort_upload = TypeAdapter(AnyUrl).validate_python( + f"{abort_url}" + ) + response.links.complete_upload = TypeAdapter(AnyUrl).validate_python( + f"{complete_url}" + ) + + return Envelope[FileUploadSchema](data=response) + + @router.post( + "/locations/{location_id}/files/{file_id:path}:complete", + response_model=Envelope[FileUploadCompleteResponse], + status_code=status.HTTP_202_ACCEPTED, + ) + async def complete_upload_file( + user_id: UserID, + location_id: LocationID, + file_id: StorageFileID, + body: FileUploadCompletionBody, + request: Request, + ): + ... + + @router.post( + "/locations/{location_id}/files/{file_id:path}:abort", + status_code=status.HTTP_204_NO_CONTENT, + ) + async def abort_upload_file( + user_id: UserID, + location_id: LocationID, + file_id: StorageFileID, + request: Request, + ): + ... 
+ + app.include_router(router) + + return app + + +@pytest.fixture(scope="module") +def fake_storage_server( + storage_vtag: str, + fake_storage_app: FastAPI, + # app_environment: EnvVarsDict, +) -> Iterator[URL]: + storage_port = unused_port() + with log_context( + logging.INFO, + msg=f"with fake storage server on 127.0.0.1:{storage_port}/{storage_vtag}", + ) as ctx: + config = uvicorn.Config( + fake_storage_app, + host="127.0.0.1", + port=storage_port, + log_level="error", + ) + server = uvicorn.Server(config) + + thread = Thread(target=server.run) + thread.daemon = True + thread.start() + + ctx.logger.info( + "health at : %s", + f"http://127.0.0.1:{storage_port}/{storage_vtag}", + ) + + yield URL(f"http://127.0.0.1:{storage_port}") + + server.should_exit = True + thread.join(timeout=10) + + +@pytest.fixture +def app_environment( + storage_vtag: str, + fake_storage_server: URL, + app_environment: dict[str, str], + monkeypatch: pytest.MonkeyPatch, +) -> dict[str, str]: + # NOTE: overrides app_environment + + return app_environment | setenvs_from_dict( + monkeypatch, + { + "STORAGE_PORT": f"{fake_storage_server.port}", + "STORAGE_VTAG": storage_vtag, + "WEBSERVER_DB_LISTENER": "0", + "WEBSERVER_GARBAGE_COLLECTOR": "null", + }, + ) + + +@pytest.fixture +def location_id(faker: Faker) -> LocationID: + return TypeAdapter(LocationID).validate_python(faker.pyint(min_value=0)) diff --git a/services/web/server/tests/unit/with_dbs/01/storage/test_storage.py b/services/web/server/tests/unit/with_dbs/01/storage/test_storage.py new file mode 100644 index 00000000000..13d124b0f64 --- /dev/null +++ b/services/web/server/tests/unit/with_dbs/01/storage/test_storage.py @@ -0,0 +1,253 @@ +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +# pylint: disable=unused-variable +# pylint: disable=too-many-arguments + +from typing import Any +from urllib.parse import quote + +import pytest +from aiohttp.test_utils import TestClient +from faker import Faker +from fastapi_pagination.cursor import CursorPage +from models_library.api_schemas_storage.storage_schemas import ( + DatasetMetaDataGet, + FileLocation, + FileMetaDataGet, + FileUploadSchema, + PathMetaDataGet, +) +from models_library.projects_nodes_io import LocationID, StorageFileID +from pydantic import TypeAdapter +from pytest_simcore.helpers.assert_checks import assert_status +from servicelib.aiohttp import status +from simcore_postgres_database.models.users import UserRole + +API_VERSION = "v0" + + +PREFIX = "/" + API_VERSION + "/storage" + + +@pytest.mark.parametrize( + "user_role,expected", + [ + (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), + (UserRole.GUEST, status.HTTP_200_OK), + (UserRole.USER, status.HTTP_200_OK), + (UserRole.TESTER, status.HTTP_200_OK), + ], +) +async def test_list_storage_locations( + client: TestClient, + logged_user: dict[str, Any], + expected: int, +): + url = "/v0/storage/locations" + assert url.startswith(PREFIX) + + resp = await client.get(url, params={"user_id": logged_user["id"]}) + data, error = await assert_status(resp, expected) + + if not error: + assert "json_schema_extra" in FileLocation.model_config + + assert len(data) == len(FileLocation.model_json_schema()["examples"]) + assert data == FileLocation.model_json_schema()["examples"] + + +@pytest.mark.parametrize( + "user_role,expected", + [ + (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), + (UserRole.GUEST, status.HTTP_200_OK), + (UserRole.USER, status.HTTP_200_OK), + (UserRole.TESTER, status.HTTP_200_OK), + ], +) +async def 
test_list_storage_paths( + client: TestClient, + logged_user: dict[str, Any], + expected: int, + location_id: LocationID, +): + assert client.app + url = client.app.router["list_storage_paths"].url_for(location_id=f"{location_id}") + + resp = await client.get(f"{url}", params={"user_id": logged_user["id"]}) + data, error = await assert_status(resp, expected) + if not error: + TypeAdapter(CursorPage[PathMetaDataGet]).validate_python(data) + + +@pytest.mark.parametrize( + "user_role,expected", + [ + (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), + (UserRole.GUEST, status.HTTP_200_OK), + (UserRole.USER, status.HTTP_200_OK), + (UserRole.TESTER, status.HTTP_200_OK), + ], +) +async def test_list_datasets_metadata( + client: TestClient, + logged_user: dict[str, Any], + expected: int, +): + url = "/v0/storage/locations/0/datasets" + assert url.startswith(PREFIX) + assert client.app + _url = client.app.router["list_datasets_metadata"].url_for(location_id="0") + + assert url == str(_url) + + resp = await client.get(url, params={"user_id": logged_user["id"]}) + data, error = await assert_status(resp, expected) + + if not error: + assert "json_schema_extra" in DatasetMetaDataGet.model_config + + assert len(data) == len(DatasetMetaDataGet.model_json_schema()["examples"]) + assert data == DatasetMetaDataGet.model_json_schema()["examples"] + + +@pytest.mark.parametrize( + "user_role,expected", + [ + (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), + (UserRole.GUEST, status.HTTP_200_OK), + (UserRole.USER, status.HTTP_200_OK), + (UserRole.TESTER, status.HTTP_200_OK), + ], +) +async def test_list_dataset_files_metadata( + client: TestClient, + logged_user: dict[str, Any], + expected: int, +): + url = "/v0/storage/locations/0/datasets/N:asdfsdf/metadata" + assert url.startswith(PREFIX) + assert client.app + _url = client.app.router["list_dataset_files_metadata"].url_for( + location_id="0", dataset_id="N:asdfsdf" + ) + + assert url == str(_url) + + resp = await client.get(url, params={"user_id": logged_user["id"]}) + data, error = await assert_status(resp, expected) + + if not error: + assert "json_schema_extra" in FileMetaDataGet.model_config + + assert len(data) == len(FileMetaDataGet.model_json_schema()["examples"]) + assert data == [ + FileMetaDataGet.model_validate(e).model_dump(mode="json") + for e in FileMetaDataGet.model_json_schema()["examples"] + ] + + +@pytest.mark.parametrize( + "user_role,expected", + [ + (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), + (UserRole.GUEST, status.HTTP_200_OK), + (UserRole.USER, status.HTTP_200_OK), + (UserRole.TESTER, status.HTTP_200_OK), + ], +) +async def test_storage_file_meta( + client: TestClient, + logged_user: dict[str, Any], + expected: int, + faker: Faker, +): + # tests redirect of path with quotes in path + file_id = f"{faker.uuid4()}/{faker.uuid4()}/a/b/c/d/e/dat" + quoted_file_id = quote(file_id, safe="") + url = f"/v0/storage/locations/0/files/{quoted_file_id}/metadata" + + assert url.startswith(PREFIX) + + resp = await client.get(url, params={"user_id": logged_user["id"]}) + data, error = await assert_status(resp, expected) + + if not error: + assert data + model = FileMetaDataGet.model_validate(data) + assert model + + +@pytest.mark.parametrize( + "user_role,expected", + [ + (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), + (UserRole.GUEST, status.HTTP_200_OK), + (UserRole.USER, status.HTTP_200_OK), + (UserRole.TESTER, status.HTTP_200_OK), + ], +) +async def test_storage_list_filter( + client: TestClient, + logged_user: dict[str, 
Any], + expected: int, +): + # tests composition of 2 queries + file_id = "a/b/c/d/e/dat" + url = "/v0/storage/locations/0/files/metadata?uuid_filter={}".format( + quote(file_id, safe="") + ) + + assert url.startswith(PREFIX) + + resp = await client.get(url, params={"user_id": logged_user["id"]}) + data, error = await assert_status(resp, expected) + + if not error: + assert len(data) == 2 + for item in data: + model = FileMetaDataGet.model_validate(item) + assert model + + +@pytest.fixture +def file_id(faker: Faker) -> StorageFileID: + return TypeAdapter(StorageFileID).validate_python( + f"{faker.uuid4()}/{faker.uuid4()}/{faker.file_name()} with spaces.dat" + ) + + +@pytest.mark.parametrize( + "user_role,expected", + [ + (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), + (UserRole.GUEST, status.HTTP_200_OK), + (UserRole.USER, status.HTTP_200_OK), + (UserRole.TESTER, status.HTTP_200_OK), + ], +) +async def test_upload_file( + client: TestClient, + logged_user: dict[str, Any], + expected: int, + file_id: StorageFileID, +): + url = f"/v0/storage/locations/0/files/{quote(file_id, safe='')}" + + assert url.startswith(PREFIX) + + resp = await client.put(url, params={"user_id": logged_user["id"]}) + data, error = await assert_status(resp, expected) + if not error: + assert not error + assert data + file_upload_schema = FileUploadSchema.model_validate(data) + + # let's abort + resp = await client.post( + f"{file_upload_schema.links.abort_upload.path}", + params={"user_id": logged_user["id"]}, + ) + data, error = await assert_status(resp, status.HTTP_204_NO_CONTENT) + assert not error + assert not data diff --git a/services/web/server/tests/unit/with_dbs/03/test_storage_handlers.py b/services/web/server/tests/unit/with_dbs/01/storage/test_storage_handlers.py similarity index 98% rename from services/web/server/tests/unit/with_dbs/03/test_storage_handlers.py rename to services/web/server/tests/unit/with_dbs/01/storage/test_storage_handlers.py index 57e16128c53..61909c8742d 100644 --- a/services/web/server/tests/unit/with_dbs/03/test_storage_handlers.py +++ b/services/web/server/tests/unit/with_dbs/01/storage/test_storage_handlers.py @@ -30,8 +30,6 @@ def app_environment( return app_environment | setenvs_from_dict( monkeypatch, { - "WEBSERVER_DB_LISTENER": "0", - "WEBSERVER_GARBAGE_COLLECTOR": "null", "STORAGE_HOST": "fake-storage", }, ) diff --git a/services/web/server/tests/unit/with_dbs/03/test_storage_rpc.py b/services/web/server/tests/unit/with_dbs/01/storage/test_storage_rpc.py similarity index 100% rename from services/web/server/tests/unit/with_dbs/03/test_storage_rpc.py rename to services/web/server/tests/unit/with_dbs/01/storage/test_storage_rpc.py diff --git a/services/web/server/tests/unit/with_dbs/01/test_storage.py b/services/web/server/tests/unit/with_dbs/01/test_storage.py deleted file mode 100644 index 02b1945f885..00000000000 --- a/services/web/server/tests/unit/with_dbs/01/test_storage.py +++ /dev/null @@ -1,533 +0,0 @@ -# pylint: disable=redefined-outer-name -# pylint: disable=unused-argument -# pylint: disable=unused-variable -# pylint: disable=too-many-arguments - -import logging -import random -from collections.abc import Iterator -from threading import Thread -from typing import Any -from urllib.parse import quote - -import pytest -import uvicorn -from aiohttp.test_utils import TestClient -from faker import Faker -from fastapi import APIRouter, FastAPI, Request -from models_library.api_schemas_storage.storage_schemas import ( - DatasetMetaDataGet, - FileLocation, - 
FileMetaDataGet, - FileMetaDataGetv010, - FileUploadCompleteResponse, - FileUploadCompletionBody, - FileUploadSchema, - LinkType, -) -from models_library.generics import Envelope -from models_library.projects import ProjectID -from models_library.projects_nodes_io import LocationID, StorageFileID -from models_library.users import UserID -from pydantic import AnyUrl, TypeAdapter -from pytest_simcore.helpers.assert_checks import assert_status -from pytest_simcore.helpers.logging_tools import log_context -from servicelib.aiohttp import status -from servicelib.utils import unused_port -from simcore_postgres_database.models.users import UserRole -from yarl import URL - -API_VERSION = "v0" - - -@pytest.fixture(scope="session") -def storage_vtag() -> str: - return "v9" - - -@pytest.fixture(scope="module") -def fake_storage_app(storage_vtag: str) -> FastAPI: - app = FastAPI(debug=True) - router = APIRouter( - prefix=f"/{storage_vtag}", - ) - - @router.get("/") - async def _root(request: Request): - return {"message": "Hello World"} - - @router.get( - "/locations", - status_code=status.HTTP_200_OK, - response_model=Envelope[list[FileLocation]], - ) - async def _list_storage_locations(user_id: UserID, request: Request): - assert "json_schema_extra" in FileLocation.model_config - assert isinstance(FileLocation.model_config["json_schema_extra"], dict) - assert isinstance( - FileLocation.model_config["json_schema_extra"]["examples"], list - ) - - return Envelope[list[FileLocation]]( - data=[ - FileLocation.model_validate(e) - for e in FileLocation.model_config["json_schema_extra"]["examples"] - ] - ) - - @router.get( - "/locations/{location_id}/files/metadata", - response_model=Envelope[list[FileMetaDataGet]], - ) - async def _list_files_metadata( - user_id: UserID, - request: Request, - uuid_filter: str = "", - project_id: ProjectID | None = None, - expand_dirs: bool = True, - ): - assert "json_schema_extra" in FileMetaDataGet.model_config - assert isinstance(FileMetaDataGet.model_config["json_schema_extra"], dict) - assert isinstance( - FileMetaDataGet.model_config["json_schema_extra"]["examples"], list - ) - if uuid_filter: - return Envelope[list[FileMetaDataGet]]( - data=random.sample( - [ - FileMetaDataGet.model_validate(e) - for e in FileMetaDataGet.model_config["json_schema_extra"][ - "examples" - ] - ], - 2, - ) - ) - return Envelope[list[FileMetaDataGet]]( - data=[ - FileMetaDataGet.model_validate(e) - for e in FileMetaDataGet.model_config["json_schema_extra"]["examples"] - ] - ) - - @router.get( - "/locations/{location_id}/files/{file_id:path}/metadata", - response_model=Envelope[FileMetaDataGet] - | Envelope[FileMetaDataGetv010] - | Envelope[dict], - ) - async def _get_file_metadata(user_id: UserID, request: Request): - assert "json_schema_extra" in FileMetaDataGet.model_config - assert isinstance(FileMetaDataGet.model_config["json_schema_extra"], dict) - assert isinstance( - FileMetaDataGet.model_config["json_schema_extra"]["examples"], list - ) - return Envelope[FileMetaDataGet]( - data=random.choice( # noqa: S311 - [ - FileMetaDataGet.model_validate(e) - for e in FileMetaDataGet.model_config["json_schema_extra"][ - "examples" - ] - ] - ) - ) - - @router.get( - "/locations/{location_id}/datasets", - response_model=Envelope[list[DatasetMetaDataGet]], - ) - async def _list_datasets_metadata(user_id: UserID, request: Request): - assert "json_schema_extra" in DatasetMetaDataGet.model_config - assert isinstance(DatasetMetaDataGet.model_config["json_schema_extra"], dict) - assert isinstance( - 
DatasetMetaDataGet.model_config["json_schema_extra"]["examples"], list - ) - return Envelope[list[DatasetMetaDataGet]]( - data=[ - DatasetMetaDataGet.model_validate(e) - for e in DatasetMetaDataGet.model_config["json_schema_extra"][ - "examples" - ] - ] - ) - - @router.get( - "/locations/{location_id}/datasets/{dataset_id}/metadata", - response_model=Envelope[list[FileMetaDataGet]], - ) - async def _list_dataset_files_metadata(user_id: UserID, request: Request): - assert "json_schema_extra" in FileMetaDataGet.model_config - assert isinstance(FileMetaDataGet.model_config["json_schema_extra"], dict) - assert isinstance( - FileMetaDataGet.model_config["json_schema_extra"]["examples"], list - ) - return Envelope[list[FileMetaDataGet]]( - data=[ - FileMetaDataGet.model_validate(e) - for e in FileMetaDataGet.model_config["json_schema_extra"]["examples"] - ] - ) - - @router.put( - "/locations/{location_id}/files/{file_id:path}", - response_model=Envelope[FileUploadSchema], - ) - async def upload_file( - user_id: UserID, - location_id: LocationID, - file_id: StorageFileID, - request: Request, - link_type: LinkType = LinkType.PRESIGNED, - ): - assert "json_schema_extra" in FileUploadSchema.model_config - assert isinstance(FileUploadSchema.model_config["json_schema_extra"], dict) - assert isinstance( - FileUploadSchema.model_config["json_schema_extra"]["examples"], list - ) - - abort_url = ( - URL(f"{request.url}") - .with_path( - request.app.url_path_for( - "abort_upload_file", - location_id=f"{location_id}", - file_id=file_id, - ) - ) - .with_query(user_id=user_id) - ) - - complete_url = ( - URL(f"{request.url}") - .with_path( - request.app.url_path_for( - "complete_upload_file", - location_id=f"{location_id}", - file_id=file_id, - ) - ) - .with_query(user_id=user_id) - ) - response = FileUploadSchema.model_validate( - random.choice( # noqa: S311 - FileUploadSchema.model_config["json_schema_extra"]["examples"] - ) - ) - response.links.abort_upload = TypeAdapter(AnyUrl).validate_python( - f"{abort_url}" - ) - response.links.complete_upload = TypeAdapter(AnyUrl).validate_python( - f"{complete_url}" - ) - - return Envelope[FileUploadSchema](data=response) - - @router.post( - "/locations/{location_id}/files/{file_id:path}:complete", - response_model=Envelope[FileUploadCompleteResponse], - status_code=status.HTTP_202_ACCEPTED, - ) - async def complete_upload_file( - user_id: UserID, - location_id: LocationID, - file_id: StorageFileID, - body: FileUploadCompletionBody, - request: Request, - ): - ... - - @router.post( - "/locations/{location_id}/files/{file_id:path}:abort", - status_code=status.HTTP_204_NO_CONTENT, - ) - async def abort_upload_file( - user_id: UserID, - location_id: LocationID, - file_id: StorageFileID, - request: Request, - ): - ... 
- - app.include_router(router) - - return app - - -@pytest.fixture(scope="module") -def fake_storage_server( - storage_vtag: str, - fake_storage_app: FastAPI, - # app_environment: EnvVarsDict, -) -> Iterator[URL]: - storage_port = unused_port() - with log_context( - logging.INFO, - msg=f"with fake storage server on 127.0.0.1:{storage_port}/{storage_vtag}", - ) as ctx: - config = uvicorn.Config( - fake_storage_app, - host="127.0.0.1", - port=storage_port, - log_level="error", - ) - server = uvicorn.Server(config) - - thread = Thread(target=server.run) - thread.daemon = True - thread.start() - - ctx.logger.info( - "health at : %s", - f"http://127.0.0.1:{storage_port}/{storage_vtag}", - ) - - yield URL(f"http://127.0.0.1:{storage_port}") - - server.should_exit = True - thread.join(timeout=10) - - -@pytest.fixture -def app_environment( - storage_vtag: str, - fake_storage_server: URL, - app_environment: dict[str, str], - monkeypatch: pytest.MonkeyPatch, -) -> dict[str, str]: - # NOTE: overrides app_environment - monkeypatch.setenv("STORAGE_PORT", f"{fake_storage_server.port}") - monkeypatch.setenv("STORAGE_VTAG", storage_vtag) - monkeypatch.setenv("WEBSERVER_GARBAGE_COLLECTOR", "null") - return app_environment | {"WEBSERVER_GARBAGE_COLLECTOR": "null"} - - -# -------------------------------------------------------------------------- -PREFIX = "/" + API_VERSION + "/storage" - - -@pytest.mark.parametrize( - "user_role,expected", - [ - (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), - (UserRole.GUEST, status.HTTP_200_OK), - (UserRole.USER, status.HTTP_200_OK), - (UserRole.TESTER, status.HTTP_200_OK), - ], -) -async def test_list_storage_locations( - client: TestClient, - logged_user: dict[str, Any], - expected: int, -): - url = "/v0/storage/locations" - assert url.startswith(PREFIX) - - resp = await client.get(url, params={"user_id": logged_user["id"]}) - data, error = await assert_status(resp, expected) - - if not error: - assert "json_schema_extra" in FileLocation.model_config - assert isinstance(FileLocation.model_config["json_schema_extra"], dict) - assert isinstance( - FileLocation.model_config["json_schema_extra"]["examples"], list - ) - assert len(data) == len( - FileLocation.model_config["json_schema_extra"]["examples"] - ) - assert data == FileLocation.model_config["json_schema_extra"]["examples"] - - -@pytest.mark.parametrize( - "user_role,expected", - [ - (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), - (UserRole.GUEST, status.HTTP_200_OK), - (UserRole.USER, status.HTTP_200_OK), - (UserRole.TESTER, status.HTTP_200_OK), - ], -) -async def test_list_datasets_metadata( - client: TestClient, - logged_user: dict[str, Any], - expected: int, -): - url = "/v0/storage/locations/0/datasets" - assert url.startswith(PREFIX) - assert client.app - _url = client.app.router["list_datasets_metadata"].url_for(location_id="0") - - assert url == str(_url) - - resp = await client.get(url, params={"user_id": logged_user["id"]}) - data, error = await assert_status(resp, expected) - - if not error: - assert "json_schema_extra" in DatasetMetaDataGet.model_config - assert isinstance(DatasetMetaDataGet.model_config["json_schema_extra"], dict) - assert isinstance( - DatasetMetaDataGet.model_config["json_schema_extra"]["examples"], list - ) - - assert len(data) == len( - DatasetMetaDataGet.model_config["json_schema_extra"]["examples"] - ) - assert data == DatasetMetaDataGet.model_config["json_schema_extra"]["examples"] - - -@pytest.mark.parametrize( - "user_role,expected", - [ - (UserRole.ANONYMOUS, 
status.HTTP_401_UNAUTHORIZED), - (UserRole.GUEST, status.HTTP_200_OK), - (UserRole.USER, status.HTTP_200_OK), - (UserRole.TESTER, status.HTTP_200_OK), - ], -) -async def test_list_dataset_files_metadata( - client: TestClient, - logged_user: dict[str, Any], - expected: int, -): - url = "/v0/storage/locations/0/datasets/N:asdfsdf/metadata" - assert url.startswith(PREFIX) - assert client.app - _url = client.app.router["list_dataset_files_metadata"].url_for( - location_id="0", dataset_id="N:asdfsdf" - ) - - assert url == str(_url) - - resp = await client.get(url, params={"user_id": logged_user["id"]}) - data, error = await assert_status(resp, expected) - - if not error: - assert "json_schema_extra" in FileMetaDataGet.model_config - assert isinstance(FileMetaDataGet.model_config["json_schema_extra"], dict) - assert isinstance( - FileMetaDataGet.model_config["json_schema_extra"]["examples"], list - ) - assert len(data) == len( - FileMetaDataGet.model_config["json_schema_extra"]["examples"] - ) - assert data == [ - FileMetaDataGet.model_validate(e).model_dump(mode="json") - for e in FileMetaDataGet.model_config["json_schema_extra"]["examples"] - ] - - -@pytest.mark.parametrize( - "user_role,expected", - [ - (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), - (UserRole.GUEST, status.HTTP_200_OK), - (UserRole.USER, status.HTTP_200_OK), - (UserRole.TESTER, status.HTTP_200_OK), - ], -) -async def test_storage_file_meta( - client: TestClient, - logged_user: dict[str, Any], - expected: int, - faker: Faker, -): - # tests redirect of path with quotes in path - file_id = f"{faker.uuid4()}/{faker.uuid4()}/a/b/c/d/e/dat" - quoted_file_id = quote(file_id, safe="") - url = f"/v0/storage/locations/0/files/{quoted_file_id}/metadata" - - assert url.startswith(PREFIX) - - resp = await client.get(url, params={"user_id": logged_user["id"]}) - data, error = await assert_status(resp, expected) - - if not error: - assert "json_schema_extra" in FileMetaDataGet.model_config - assert isinstance(FileMetaDataGet.model_config["json_schema_extra"], dict) - assert isinstance( - FileMetaDataGet.model_config["json_schema_extra"]["examples"], list - ) - - assert data - model = FileMetaDataGet.model_validate(data) - assert model - - -@pytest.mark.parametrize( - "user_role,expected", - [ - (UserRole.ANONYMOUS, status.HTTP_401_UNAUTHORIZED), - (UserRole.GUEST, status.HTTP_200_OK), - (UserRole.USER, status.HTTP_200_OK), - (UserRole.TESTER, status.HTTP_200_OK), - ], -) -async def test_storage_list_filter( - client: TestClient, - logged_user: dict[str, Any], - expected: int, -): - # tests composition of 2 queries - file_id = "a/b/c/d/e/dat" - url = "/v0/storage/locations/0/files/metadata?uuid_filter={}".format( - quote(file_id, safe="") - ) - - assert url.startswith(PREFIX) - - resp = await client.get(url, params={"user_id": logged_user["id"]}) - data, error = await assert_status(resp, expected) - - if not error: - assert "json_schema_extra" in FileMetaDataGet.model_config - assert isinstance(FileMetaDataGet.model_config["json_schema_extra"], dict) - assert isinstance( - FileMetaDataGet.model_config["json_schema_extra"]["examples"], list - ) - - assert len(data) == 2 - for item in data: - model = FileMetaDataGet.model_validate(item) - assert model - - -@pytest.fixture -def file_id(faker: Faker) -> StorageFileID: - return TypeAdapter(StorageFileID).validate_python( - f"{faker.uuid4()}/{faker.uuid4()}/{faker.file_name()} with spaces.dat" - ) - - -@pytest.mark.parametrize( - "user_role,expected", - [ - # (UserRole.ANONYMOUS, 
status.HTTP_401_UNAUTHORIZED), - # (UserRole.GUEST, status.HTTP_200_OK), - (UserRole.USER, status.HTTP_200_OK), - # (UserRole.TESTER, status.HTTP_200_OK), - ], -) -async def test_upload_file( - client: TestClient, - logged_user: dict[str, Any], - expected: int, - file_id: StorageFileID, -): - url = f"/v0/storage/locations/0/files/{quote(file_id, safe='')}" - - assert url.startswith(PREFIX) - - resp = await client.put(url, params={"user_id": logged_user["id"]}) - data, error = await assert_status(resp, expected) - assert not error - assert data - file_upload_schema = FileUploadSchema.model_validate(data) - - # let's abort - resp = await client.post( - f"{file_upload_schema.links.abort_upload.path}", - params={"user_id": logged_user["id"]}, - ) - data, error = await assert_status(resp, status.HTTP_204_NO_CONTENT) - assert not error - assert not data
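
The new GET /v0/storage/locations/{location_id}/paths route is cursor-paginated: each response carries an "items" list plus a "next_page" cursor, and a client re-issues the request with cursor=<next_page> until no cursor is returned. Below is a minimal client-side sketch of that loop, assuming an already authenticated aiohttp.ClientSession and the webserver's usual {"data": ...} envelope; the helper name iter_storage_paths and the base_url/page_size parameters are illustrative and not part of the API.

from typing import Any

import aiohttp


async def iter_storage_paths(
    session: aiohttp.ClientSession,
    base_url: str,
    location_id: int,
    page_size: int = 20,  # spec above: 1 <= size <= 50, default 20
) -> list[dict[str, Any]]:
    """Collects every path entry by following the cursor until it is exhausted."""
    entries: list[dict[str, Any]] = []
    cursor: str | None = None
    while True:
        params: dict[str, Any] = {"size": page_size}
        if cursor:
            params["cursor"] = cursor
        async with session.get(
            f"{base_url}/v0/storage/locations/{location_id}/paths", params=params
        ) as resp:
            resp.raise_for_status()
            # webserver wraps the CursorPage in its {"data": ...} envelope
            page = (await resp.json())["data"]
        entries.extend(page["items"])
        cursor = page.get("next_page")
        if not cursor:  # no further pages
            return entries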
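
In _to_storage_url the forwarded query keys are now passed through camel_to_snake because the webserver front-end API exposes camelCase query parameters (e.g. fileFilter in the OpenAPI spec above) while the storage service expects snake_case (file_filter in the fake storage app). The snippet below is an illustrative regex-based equivalent of that conversion, shown only to make the key mapping concrete; it is not the models_library implementation.

import re


def camel_to_snake_example(name: str) -> str:
    """camelCase -> snake_case; already lower-case keys pass through unchanged."""
    partial = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", partial).lower()


assert camel_to_snake_example("fileFilter") == "file_filter"
assert camel_to_snake_example("uuidFilter") == "uuid_filter"
assert camel_to_snake_example("size") == "size"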
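
The fake storage app in conftest.py builds its paths response with fastapi-pagination's cursor flavour (add_pagination, CursorPage, CursorParams, create_page). The following is a stripped-down, self-contained sketch of how those pieces fit together outside the fixture; the Item model, the /items route and the fixed single page (next_=None) are illustrative only.

from typing import Annotated

from fastapi import Depends, FastAPI
from fastapi_pagination import add_pagination, create_page
from fastapi_pagination.cursor import CursorPage, CursorParams
from pydantic import BaseModel


class Item(BaseModel):
    name: str


app = FastAPI()
add_pagination(app)  # installs the pagination machinery on the app

_ITEMS = [Item(name=f"item-{i}") for i in range(100)]


@app.get("/items", response_model=CursorPage[Item])
async def list_items(page_params: Annotated[CursorParams, Depends()]):
    # a real endpoint would slice by the incoming cursor; this sketch always
    # returns the first page and reports no further pages (next_=None)
    return create_page(_ITEMS[: page_params.size], params=page_params, next_=None)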