diff --git a/api/specs/webserver/openapi-projects.yaml b/api/specs/webserver/openapi-projects.yaml index fe4d61f6e1f..4ec50db7bef 100644 --- a/api/specs/webserver/openapi-projects.yaml +++ b/api/specs/webserver/openapi-projects.yaml @@ -201,11 +201,6 @@ paths: required: true schema: type: string - - name: compressed - in: query - required: false - schema: - type: boolean post: tags: - exporter @@ -222,6 +217,31 @@ paths: default: $ref: "#/components/responses/DefaultErrorResponse" + /projects/{project_id}/duplicate: + parameters: + - name: project_id + in: path + required: true + schema: + type: string + post: + tags: + - exporter + summary: duplicates an existing project + operationId: duplicate_project + responses: + "200": + description: project was duplicated correctly + content: + application/json: + schema: + type: object + properties: + uuid: + type: string + default: + $ref: "#/components/responses/DefaultErrorResponse" + /projects/import: post: tags: @@ -238,13 +258,15 @@ paths: type: string format: binary responses: - "201": + "200": description: creates a new project from an archive content: application/json: schema: - #TODO: change this with an OK response - $ref: "#/components/schemas/ProjectEnveloped" + type: object + properties: + uuid: + type: string default: $ref: "#/components/responses/DefaultErrorResponse" diff --git a/api/specs/webserver/openapi.yaml b/api/specs/webserver/openapi.yaml index 0e67db99218..a97911790d4 100644 --- a/api/specs/webserver/openapi.yaml +++ b/api/specs/webserver/openapi.yaml @@ -171,6 +171,9 @@ paths: /projects/{project_id}:xport: $ref: "./openapi-projects.yaml#/paths/~1projects~1{project_id}~1xport" + /projects/{project_id}:duplicate: + $ref: "./openapi-projects.yaml#/paths/~1projects~1{project_id}~1duplicate" + /projects:import: $ref: "./openapi-projects.yaml#/paths/~1projects~1import" diff --git a/packages/models-library/src/models_library/projects.py b/packages/models-library/src/models_library/projects.py index 30233ba5b39..296716657f8 100644 --- a/packages/models-library/src/models_library/projects.py +++ b/packages/models-library/src/models_library/projects.py @@ -87,7 +87,9 @@ class Project(BaseModel): ui: Optional[StudyUI] = None # Quality - quality: Dict[str, Any] = {} + quality: Dict[str, Any] = Field( + {}, description="stores the study quality assessment" + ) # Dev only dev: Optional[Dict] = Field(description="object used for development purposes only") diff --git a/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml b/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml index aaa9aa9f8f3..433f4c28991 100644 --- a/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml +++ b/services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml @@ -10590,11 +10590,6 @@ paths: required: true schema: type: string - - name: compressed - in: query - required: false - schema: - type: boolean post: tags: - exporter @@ -10691,6 +10686,111 @@ paths: message: Password is not secure field: pasword status: 400 + '/projects/{project_id}:duplicate': + parameters: + - name: project_id + in: path + required: true + schema: + type: string + post: + tags: + - exporter + summary: duplicates an existing project + operationId: duplicate_project + responses: + '200': + description: project was duplicated correctly + content: + application/json: + schema: + type: object + properties: + uuid: + type: string + default: + description: Default http error response body + content: + application/json: + schema: + 
type: object + required: + - error + properties: + data: + nullable: true + default: null + error: + type: object + nullable: true + properties: + logs: + description: log messages + type: array + items: + type: object + properties: + level: + description: log level + type: string + default: INFO + enum: + - DEBUG + - WARNING + - INFO + - ERROR + message: + description: 'log message. If logger is USER, then it MUST be human readable' + type: string + logger: + description: name of the logger receiving this message + type: string + required: + - message + example: + message: 'Hi there, Mr user' + level: INFO + logger: user-logger + errors: + description: errors metadata + type: array + items: + type: object + required: + - code + - message + properties: + code: + type: string + description: Typically the name of the exception that produced it otherwise some known error code + message: + type: string + description: Error message specific to this item + resource: + type: string + description: API resource affected by this error + field: + type: string + description: Specific field within the resource + status: + description: HTTP error code + type: integer + example: + BadRequestError: + logs: + - message: Requested information is incomplete or malformed + level: ERROR + - message: Invalid email and password + level: ERROR + logger: USER + errors: + - code: InvalidEmail + message: Email is malformed + field: email + - code: UnsavePassword + message: Password is not secure + field: pasword + status: 400 '/projects:import': post: tags: @@ -10707,489 +10807,15 @@ paths: type: string format: binary responses: - '201': + '200': description: creates a new project from an archive content: application/json: schema: type: object - required: - - data properties: - data: - allOf: - - title: simcore project - description: Description of a simcore project - type: object - additionalProperties: false - required: - - uuid - - name - - description - - prjOwner - - accessRights - - creationDate - - lastChangeDate - - thumbnail - - workbench - properties: - uuid: - type: string - format: uuid - description: project unique identifier - example: 07640335-a91f-468c-ab69-a374fa82078d - name: - type: string - description: project name - example: Temporal Distortion Simulator - description: - type: string - description: longer one-line description about the project - example: Dabbling in temporal transitions ... 
- prjOwner: - type: string - format: email - description: user email - accessRights: - type: object - description: object containing the GroupID as key and read/write/execution permissions as value - x-patternProperties: - ^\S+$: - type: object - description: the group id - additionalProperties: false - required: - - read - - write - - delete - properties: - read: - type: boolean - description: gives read access - write: - type: boolean - description: gives write access - delete: - type: boolean - description: gives deletion rights - additionalProperties: true - creationDate: - type: string - description: project creation date - pattern: '\d{4}-(12|11|10|0?[1-9])-(31|30|[0-2]?\d)T(2[0-3]|1\d|0?[0-9])(:(\d|[0-5]\d)){2}(\.\d{3})?Z' - example: '2018-07-01T11:13:43Z' - lastChangeDate: - type: string - description: last save date - pattern: '\d{4}-(12|11|10|0?[1-9])-(31|30|[0-2]?\d)T(2[0-3]|1\d|0?[0-9])(:(\d|[0-5]\d)){2}(\.\d{3})?Z' - example: '2018-07-01T11:13:43Z' - thumbnail: - type: string - minLength: 0 - maxLength: 2083 - format: uri - description: url of the latest screenshot of the project - example: 'https://placeimg.com/171/96/tech/grayscale/?0.jpg' - workbench: - type: object - x-patternProperties: - '^[0-9a-fA-F]{8}-?[0-9a-fA-F]{4}-?4[0-9a-fA-F]{3}-?[89abAB][0-9a-fA-F]{3}-?[0-9a-fA-F]{12}$': - type: object - additionalProperties: false - required: - - key - - version - - label - properties: - key: - type: string - description: distinctive name for the node based on the docker registry path - pattern: '^(simcore)/(services)/(comp|dynamic|frontend)(/[^\s/]+)+$' - example: - - simcore/services/comp/sleeper - - simcore/services/dynamic/3dviewer - - simcore/services/frontend/file-picker - version: - type: string - description: semantic version number of the node - pattern: '^(0|[1-9]\d*)(\.(0|[1-9]\d*)){2}(-(0|[1-9]\d*|\d*[-a-zA-Z][-\da-zA-Z]*)(\.(0|[1-9]\d*|\d*[-a-zA-Z][-\da-zA-Z]*))*)?(\+[-\da-zA-Z]+(\.[-\da-zA-Z-]+)*)?$' - example: - - 1.0.0 - - 0.0.1 - label: - type: string - description: The short name of the node - example: - - JupyterLab - progress: - type: number - maximum: 100 - minimum: 0 - description: the node progress value - thumbnail: - minLength: 0 - maxLength: 2083 - format: uri - type: string - description: url of the latest screenshot of the node - example: - - 'https://placeimg.com/171/96/tech/grayscale/?0.jpg' - runHash: - description: the hex digest of the resolved inputs +outputs hash at the time when the last outputs were generated - type: - - string - - 'null' - example: - - a4337bc45a8fc544c03f52dc550cd6e1e87021bc896588bd79e901e2 - inputs: - type: object - description: values of input properties - patternProperties: - '^[-_a-zA-Z0-9]+$': - oneOf: - - type: - - integer - - boolean - - string - - number - - 'null' - - type: object - additionalProperties: false - required: - - nodeUuid - - output - properties: - nodeUuid: - type: string - format: uuid - output: - type: string - pattern: '^[-_a-zA-Z0-9]+$' - - type: object - additionalProperties: false - required: - - store - - path - properties: - store: - type: - - string - - integer - dataset: - type: string - path: - type: string - label: - type: string - eTag: - type: string - - type: object - additionalProperties: false - required: - - downloadLink - properties: - downloadLink: - minLength: 1 - maxLength: 65536 - type: string - format: uri - label: - type: string - inputAccess: - description: map with key - access level pairs - type: object - patternProperties: - '^[-_a-zA-Z0-9]+$': - type: string - enum: - - 
Invisible - - ReadOnly - - ReadAndWrite - default: ReadAndWrite - example: - - ReadOnly - inputNodes: - type: array - items: - type: string - format: uuid - description: node IDs of where the node is connected to - example: - - nodeUuid1 - - nodeUuid2 - outputs: - default: {} - type: object - patternProperties: - '^[-_a-zA-Z0-9]+$': - oneOf: - - type: - - integer - - boolean - - string - - number - - 'null' - - type: object - additionalProperties: false - required: - - store - - path - properties: - store: - type: - - string - - integer - dataset: - type: string - path: - type: string - label: - type: string - eTag: - type: string - - type: object - additionalProperties: false - required: - - downloadLink - properties: - downloadLink: - minLength: 1 - maxLength: 65536 - type: string - format: uri - label: - type: string - outputNode: - type: boolean - deprecated: true - outputNodes: - type: array - items: - type: string - format: uuid - description: Used in group-nodes. Node IDs of those connected to the output - example: - - nodeUuid1 - - nodeUuid2 - parent: - type: - - 'null' - - string - format: uuid - description: Parent's (group-nodes') node ID s. - example: - - nodeUuid1 - - nodeUuid2 - position: - type: object - additionalProperties: false - required: - - x - - 'y' - properties: - x: - type: integer - description: The x position - example: - - '12' - 'y': - type: integer - description: The y position - example: - - '15' - deprecated: true - state: - title: RunningState - description: the node's running state - default: NOT_STARTED - enum: - - UNKNOWN - - NOT_STARTED - - PUBLISHED - - PENDING - - STARTED - - RETRY - - SUCCESS - - FAILED - - ABORTED - type: string - additionalProperties: true - ui: - type: object - additionalProperties: true - properties: - workbench: - type: object - x-patternProperties: - '^[0-9a-fA-F]{8}-?[0-9a-fA-F]{4}-?4[0-9a-fA-F]{3}-?[89abAB][0-9a-fA-F]{3}-?[0-9a-fA-F]{12}$': - type: object - additionalProperties: false - required: - - position - properties: - position: - type: object - additionalProperties: false - required: - - x - - 'y' - properties: - x: - type: integer - description: The x position - example: - - '12' - 'y': - type: integer - description: The y position - example: - - '15' - additionalProperties: true - slideshow: - type: object - x-patternProperties: - '^[0-9a-fA-F]{8}-?[0-9a-fA-F]{4}-?4[0-9a-fA-F]{3}-?[89abAB][0-9a-fA-F]{3}-?[0-9a-fA-F]{12}$': - type: object - additionalProperties: false - required: - - position - properties: - position: - type: integer - description: Slide's position - example: - - 0 - - 2 - additionalProperties: true - currentNodeId: - type: string - format: uuid - tags: - type: array - items: - type: integer - classifiers: - type: array - description: Contains the reference to the project classifiers - items: - type: string - example: 'some:id:to:a:classifier' - dev: - type: object - description: object used for development purposes only - state: - title: State - description: Project state - anyOf: - - nullable: true - - title: ProjectState - type: object - additionalProperties: false - properties: - locked: - title: Locked - description: The project lock state - allOf: - - title: ProjectLocked - type: object - additionalProperties: false - properties: - value: - title: Value - description: True if the project is locked by another user - type: boolean - owner: - title: Owner - description: The user that owns the lock - allOf: - - title: Owner - type: object - additionalProperties: false - properties: - user_id: - 
title: User Id - type: integer - description: Owner's identifier when registered in the user's database table - example: - - 2 - first_name: - title: First Name - description: Owner first name - example: - - John - type: string - last_name: - title: Last Name - description: Owner last name - example: - - Smith - type: string - required: - - user_id - - first_name - - last_name - required: - - value - state: - title: State - description: The project running state - allOf: - - title: ProjectRunningState - type: object - additionalProperties: false - properties: - value: - title: RunningState - description: An enumeration. - enum: - - UNKNOWN - - NOT_STARTED - - PUBLISHED - - PENDING - - STARTED - - RETRY - - SUCCESS - - FAILED - - ABORTED - type: string - required: - - value - required: - - locked - - state - quality: - type: object - title: Quality - description: Object containing Quality Assessment related data - - type: object - properties: - state: - type: object - required: - - locked - properties: - locked: - type: object - description: describes the project lock state - required: - - value - properties: - value: - type: boolean - description: true if the project is locked - owner: - type: object - properties: - first_name: - type: string - last_name: - type: string - required: - - firstName - - lastName - error: - nullable: true - default: null + uuid: + type: string default: description: Default http error response body content: diff --git a/services/web/server/src/simcore_service_webserver/exporter/archiving.py b/services/web/server/src/simcore_service_webserver/exporter/archiving.py index 2b6a461cb9d..512465803d6 100644 --- a/services/web/server/src/simcore_service_webserver/exporter/archiving.py +++ b/services/web/server/src/simcore_service_webserver/exporter/archiving.py @@ -1,9 +1,9 @@ import asyncio import logging -import shutil -from concurrent.futures import ProcessPoolExecutor +import zipfile from pathlib import Path -from typing import Any, Callable, Tuple +from concurrent.futures import ProcessPoolExecutor +from typing import Tuple, Iterator from passlib import pwd @@ -14,12 +14,114 @@ log = logging.getLogger(__name__) -def get_random_chars(length: int) -> str: +def _full_file_path_from_dir_and_subdirs(dir_path: Path) -> Iterator[Path]: + for path in dir_path.rglob("*"): + if path.is_file(): + yield path + + +def _strip_directory_from_path(input_path: Path, to_strip: Path) -> Path: + to_strip = f"{str(to_strip)}/" + return Path(str(input_path).replace(to_strip, "")) + + +def _zipfile_single_file_extract_worker( + zip_file_path: Path, file_in_archive: str, destination_folder: Path +) -> None: + with open(zip_file_path, "rb") as f: + zf = zipfile.ZipFile(f) + zf.extract(file_in_archive, destination_folder) + + +def ensure_destination_subdirectories_exist( + zip_file_handler: zipfile.ZipFile, destination_folder: Path +) -> None: + # assemble full destination paths + full_destination_paths = { + destination_folder / entry.filename for entry in zip_file_handler.infolist() + } + # extract all possible subdirectories + subdirectories = {x.parent for x in full_destination_paths} + # create all subdirectories before extracting + for subdirectory in subdirectories: + Path(subdirectory).mkdir(parents=True, exist_ok=True) + + +async def unarchive_dir(archive_to_extract: Path, destination_folder: Path) -> None: + try: + with zipfile.ZipFile(archive_to_extract, mode="r") as zip_file_handler: + with ProcessPoolExecutor() as pool: + loop = asyncio.get_event_loop() + + # running in process 
pool is not ideal for concurrency issues
+                # to avoid race conditions all subdirectories where files will be extracted need to exist
+                # creating them before the extraction is under way avoids the issue
+                # the following avoids race conditions while unzipping in parallel
+                ensure_destination_subdirectories_exist(
+                    zip_file_handler=zip_file_handler,
+                    destination_folder=destination_folder,
+                )
+
+                tasks = [
+                    loop.run_in_executor(
+                        pool,
+                        _zipfile_single_file_extract_worker,
+                        archive_to_extract,
+                        zip_entry.filename,
+                        destination_folder,
+                    )
+                    for zip_entry in zip_file_handler.infolist()
+                ]
+
+                await asyncio.gather(*tasks)
+    except Exception as e:
+        message = f"There was an error while extracting directory '{archive_to_extract}' to '{destination_folder}'"
+        log.exception(message)
+        raise ExporterException(f"{message} {e}") from e
+
+
+def _serial_add_to_archive(
+    dir_to_compress: Path, destination: Path, compress: bool, store_relative_path: bool
+) -> bool:
+    compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED
+    with zipfile.ZipFile(destination, "w", compression=compression) as zip_file_handler:
+        files_to_compress_generator = _full_file_path_from_dir_and_subdirs(
+            dir_to_compress
+        )
+        for file_to_add in files_to_compress_generator:
+            try:
+                file_name_in_archive = (
+                    _strip_directory_from_path(file_to_add, dir_to_compress)
+                    if store_relative_path
+                    else file_to_add
+                )
+                zip_file_handler.write(file_to_add, file_name_in_archive)
+            except ValueError:
+                log.exception("Could not write files to archive, please check logs")
+                return False
+    return True
+
+
+async def archive_dir(
+    dir_to_compress: Path, destination: Path, compress: bool, store_relative_path: bool
+) -> bool:
+    with ProcessPoolExecutor(max_workers=1) as pool:
+        return await asyncio.get_event_loop().run_in_executor(
+            pool,
+            _serial_add_to_archive,
+            dir_to_compress,
+            destination,
+            compress,
+            store_relative_path,
+        )
+
+
+def _get_random_chars(length: int) -> str:
     return pwd.genword(entropy=52, charset="hex")[:length]
 
 
-def get_osparc_export_name(sha256_sum: str, algorithm: Algorithm) -> str:
-    return f"{get_random_chars(4)}#{algorithm.name}={sha256_sum}.osparc"
+def _get_osparc_export_name(sha256_sum: str, algorithm: Algorithm) -> str:
+    return f"{_get_random_chars(4)}#{algorithm.name}={sha256_sum}.osparc"
 
 
 def validate_osparc_import_name(file_name: str) -> Tuple[Algorithm, str]:
@@ -59,53 +161,36 @@ def search_for_unzipped_path(search_path: Path) -> Path:
     return search_path / found_dirs[0]
 
 
-async def run_in_process_pool(function: Callable, *args: Tuple[Any]) -> Any:
-    try:
-        with ProcessPoolExecutor(max_workers=1) as pool:
-            return await asyncio.get_event_loop().run_in_executor(pool, function, *args)
-    except asyncio.CancelledError:
-        # making sure this error gets propagated correctly
-        raise
-    except Exception as e:
-        reason = str(e)
-        log.warning("During %s call there was an error: %s", function.__name__, reason)
-        raise ExporterException(reason) from e
-
-
-async def zip_folder(project_id: str, input_path: Path) -> Path:
+async def zip_folder(folder_to_zip: Path, destination_folder: Path) -> Path:
     """Zips a folder and returns the path to the new archive"""
 
-    zip_file = Path(input_path.parent) / f"{project_id}.zip"
-    if zip_file.is_file():
+    archived_file = destination_folder / "archive.zip"
+    if archived_file.is_file():
         raise ExporterException(
-            f"Cannot archive because file already exists '{str(zip_file)}'"
+            f"Cannot archive '{folder_to_zip}' because '{str(archived_file)}' already exists"
         )
 
-    
await run_in_process_pool( - shutil.make_archive, # callable - Path(input_path.parent) / project_id, # base_name - "zip", # format - input_path.parent, # root_dir - project_id, # base_dir + await archive_dir( + dir_to_compress=folder_to_zip, + destination=archived_file, + compress=True, + store_relative_path=True, ) # compute checksum and rename - sha256_sum = await checksum(file_path=zip_file, algorithm=Algorithm.SHA256) + sha256_sum = await checksum(file_path=archived_file, algorithm=Algorithm.SHA256) # opsarc_formatted_name= "4_rand_chars#sha256_sum.osparc" - osparc_formatted_name = Path(input_path.parent) / get_osparc_export_name( + osparc_formatted_name = Path(folder_to_zip) / _get_osparc_export_name( sha256_sum=sha256_sum, algorithm=Algorithm.SHA256 ) - await rename(zip_file, osparc_formatted_name) + await rename(archived_file, osparc_formatted_name) return osparc_formatted_name -async def unzip_folder(input_path: Path) -> Path: - await run_in_process_pool( - shutil.unpack_archive, # callable - str(input_path), # filename - str(input_path.parent), # extract_dir - "zip", # format +async def unzip_folder(archive_to_extract: Path, destination_folder: Path) -> Path: + await unarchive_dir( + archive_to_extract=archive_to_extract, destination_folder=destination_folder ) - return search_for_unzipped_path(input_path.parent) + return search_for_unzipped_path(destination_folder) diff --git a/services/web/server/src/simcore_service_webserver/exporter/export_import.py b/services/web/server/src/simcore_service_webserver/exporter/export_import.py index 6a8a8b75473..01f0863322f 100644 --- a/services/web/server/src/simcore_service_webserver/exporter/export_import.py +++ b/services/web/server/src/simcore_service_webserver/exporter/export_import.py @@ -27,7 +27,8 @@ async def study_export( returns: directory if archive is True else a compressed archive is returned """ # storage area for the project data - destination = Path(tmp_dir) / project_id + base_temp_dir = Path(tmp_dir) + destination = base_temp_dir / project_id destination.mkdir(parents=True, exist_ok=True) # The formatter will always be chosen to be the highest availabel version @@ -41,7 +42,10 @@ async def study_export( return destination # an archive is always produced when compression is active - archive_path = await zip_folder(project_id=project_id, input_path=destination) + + archive_path = await zip_folder( + folder_to_zip=base_temp_dir, destination_folder=base_temp_dir + ) return archive_path @@ -58,7 +62,8 @@ async def study_import( the imported project's uuid. 
""" # Storing file to disk - upload_file_name = Path(temp_dir) / "uploaded.zip" + base_temp_dir = Path(temp_dir) + upload_file_name = base_temp_dir / "uploaded.zip" # upload and verify checksum original_file_name = file_field.filename @@ -85,7 +90,16 @@ async def study_import( f"{digest_from_filename}, upload_digest={upload_digest}" ) - unzipped_root_folder = await unzip_folder(upload_file_name) + unzipped_root_folder = await unzip_folder( + archive_to_extract=upload_file_name, destination_folder=base_temp_dir + ) formatter: BaseFormatter = await validate_manifest(unzipped_root_folder) return await formatter.validate_and_import_directory(app=app, user_id=user_id) + + +async def study_duplicate( + app: web.Application, user_id: int, exported_project_path: Path +) -> str: + formatter: BaseFormatter = await validate_manifest(exported_project_path) + return await formatter.validate_and_import_directory(app=app, user_id=user_id) \ No newline at end of file diff --git a/services/web/server/src/simcore_service_webserver/exporter/formatters/formatter_v1.py b/services/web/server/src/simcore_service_webserver/exporter/formatters/formatter_v1.py index 304a6f50dc8..f2616454a4a 100644 --- a/services/web/server/src/simcore_service_webserver/exporter/formatters/formatter_v1.py +++ b/services/web/server/src/simcore_service_webserver/exporter/formatters/formatter_v1.py @@ -283,6 +283,7 @@ async def import_files_and_validate_project( dev=shuffled_project_file.dev, workbench=shuffled_project_file.workbench, ui=shuffled_project_file.ui, + quality=shuffled_project_file.quality, ) project_uuid = str(project.uuid) diff --git a/services/web/server/src/simcore_service_webserver/exporter/request_handlers.py b/services/web/server/src/simcore_service_webserver/exporter/request_handlers.py index 50f7e7aa5d1..6219062f4ba 100644 --- a/services/web/server/src/simcore_service_webserver/exporter/request_handlers.py +++ b/services/web/server/src/simcore_service_webserver/exporter/request_handlers.py @@ -1,4 +1,5 @@ import logging +from tempfile import TemporaryDirectory from aiohttp import web from aiohttp.web_request import FileField @@ -7,7 +8,7 @@ from ..security_decorators import permission_required from .config import get_settings from .exceptions import ExporterException -from .export_import import study_export, study_import +from .export_import import study_export, study_import, study_duplicate from .utils import CleanupFileResponse, get_empty_tmp_dir, remove_dir ONE_GB: int = 1024 * 1024 * 1024 @@ -88,4 +89,31 @@ async def import_project(request: web.Request): return dict(uuid=imported_project_uuid) -rest_handler_functions = {fun.__name__: fun for fun in [export_project, import_project]} +@login_required +@permission_required("project.duplicate") +async def duplicate_project(request: web.Request): + user_id = request[RQT_USERID_KEY] + project_uuid = request.match_info.get("project_id") + + with TemporaryDirectory() as temp_dir: + exported_project_path = await study_export( + app=request.app, + tmp_dir=temp_dir, + project_id=project_uuid, + user_id=user_id, + archive=False, + ) + log.info("Study to duplicate '%s'", exported_project_path) + + # return the duplicated study ID + duplicated_project_uuid = await study_duplicate( + app=request.app, + user_id=user_id, + exported_project_path=exported_project_path, + ) + return dict(uuid=duplicated_project_uuid) + + +rest_handler_functions = { + fun.__name__: fun for fun in {export_project, import_project, duplicate_project} +} \ No newline at end of file diff --git 
a/services/web/server/src/simcore_service_webserver/security_roles.py b/services/web/server/src/simcore_service_webserver/security_roles.py index e2104db3fb7..321e9dd2c78 100644 --- a/services/web/server/src/simcore_service_webserver/security_roles.py +++ b/services/web/server/src/simcore_service_webserver/security_roles.py @@ -49,6 +49,7 @@ "project.close", "project.delete", # "study.node.create", "project.export", + "project.duplicate", "project.import", "project.access_rights.update", # "study.node.delete", diff --git a/services/web/server/tests/integration/test_exporter.py b/services/web/server/tests/integration/test_exporter.py index b45dd67e34b..5eb6d383434 100644 --- a/services/web/server/tests/integration/test_exporter.py +++ b/services/web/server/tests/integration/test_exporter.py @@ -3,11 +3,15 @@ import json import logging import sys +import asyncio import tempfile from contextlib import contextmanager from copy import deepcopy +import operator +import itertools from pathlib import Path -from typing import Any, Dict, List, Set +from typing import Any, Dict, List, Set, Callable, Tuple +from collections import deque import aiofiles import aiohttp @@ -32,6 +36,10 @@ from simcore_service_webserver.session import setup_session from simcore_service_webserver.socketio import setup_socketio from simcore_service_webserver.users import setup_users +from simcore_service_webserver.storage_handlers import get_file_download_url +from simcore_service_webserver.storage import setup_storage +from simcore_service_webserver.exporter.file_downloader import ParallelDownloader +from simcore_service_webserver.exporter.async_hashing import Algorithm, checksum from yarl import URL log = logging.getLogger(__name__) @@ -44,7 +52,7 @@ "postgres", "storage", ] -ops_services = ["minio", "adminer"] +ops_services = ["minio"] CURRENT_DIR = Path(sys.argv[0] if __name__ == "__main__" else __file__).resolve().parent @@ -55,7 +63,14 @@ # store only lowercase "v1", "v2", etc... 
SUPPORTED_EXPORTER_VERSIONS = {"v1"} -KEYS_TO_IGNORE_FROM_COMPARISON = {"id", "uuid", "creation_date", "last_change_date"} +REMAPPING_KEY = "__reverse__remapping__dict__key__" +KEYS_TO_IGNORE_FROM_COMPARISON = { + "id", + "uuid", + "creation_date", + "last_change_date", + REMAPPING_KEY, +} @pytest.fixture @@ -138,6 +153,7 @@ def client( setup_director(app) setup_director_v2(app) setup_exporter(app) + setup_storage(app) assert setup_resource_manager(app) yield loop.run_until_complete( @@ -231,7 +247,9 @@ async def query_project_from_db( return dict(project) -def replace_uuids_with_sequences(project: Dict[str, Any]) -> Dict[str, Any]: +def replace_uuids_with_sequences(original_project: Dict[str, Any]) -> Dict[str, Any]: + # first make a copy + project = deepcopy(original_project) workbench = project["workbench"] ui = project["ui"] @@ -258,6 +276,8 @@ def replace_uuids_with_sequences(project: Dict[str, Any]) -> Dict[str, Any]: project["workbench"] = json.loads(str_workbench) project["ui"] = json.loads(str_ui) + # store for later usage + project[REMAPPING_KEY] = remapping_dict return project @@ -269,6 +289,108 @@ def dict_without_keys(dict_data: Dict[str, Any], keys: Set[str]) -> Dict[str, An return result +def assert_combined_entires_condition( + *entries: Any, condition_operator: Callable +) -> None: + """Ensures the condition_operator is True for all unique combinations""" + for combination in itertools.combinations(entries, 2): + assert condition_operator(combination[0], combination[1]) is True + + +def extract_original_files_for_node_sequence( + project: Dict[str, Any], normalized_project: Dict[str, Any] +) -> Dict[str, Dict[str, str]]: + """ + Extracts path and store from ouput_1 field of each node and + returns mapped to the normalized data node keys for simpler comparison + """ + results = {} + reverse_search_dict = normalized_project[REMAPPING_KEY] + + for uuid_key, node in project["workbench"].items(): + output_1 = node["outputs"]["output_1"] + sequence_key = reverse_search_dict[uuid_key] + results[sequence_key] = {"store": output_1["store"], "path": output_1["path"]} + + return results + + +async def extract_download_links_from_storage( + app: aiohttp.web.Application, + original_files: Dict[str, Dict[str, str]], + user_id: str, +) -> Dict[str, str]: + async def _get_mapped_link( + seq_key: str, location_id: str, raw_file_path: str + ) -> Tuple[str, str]: + link = await get_file_download_url( + app=app, + location_id=location_id, + fileId=raw_file_path, + user_id=user_id, + ) + return seq_key, link + + tasks = deque() + for seq_key, data in original_files.items(): + tasks.append( + _get_mapped_link( + seq_key=seq_key, + location_id=data["store"], + raw_file_path=data["path"], + ) + ) + + results = await asyncio.gather(*tasks) + + return {x[0]: x[1] for x in results} + + +async def download_files_and_get_checksums( + app: aiohttp.web.Application, download_links: Dict[str, str] +) -> Dict[str, str]: + with tempfile.TemporaryDirectory() as store_dir: + download_paths = {} + parallel_downloader = ParallelDownloader() + for seq_id, url in download_links.items(): + download_path = Path(store_dir) / seq_id + await parallel_downloader.append_file(link=url, download_path=download_path) + download_paths[seq_id] = download_path + + await parallel_downloader.download_files(app=app) + + # compute checksums for each downloaded file + checksums = {} + for seq_id, download_path in download_paths.items(): + checksums[seq_id] = await checksum( + file_path=download_path, algorithm=Algorithm.SHA256 + ) 
+ + return checksums + + +async def get_checksmus_for_files_in_storage( + app: aiohttp.web.Application, + project: Dict[str, Any], + normalized_project: Dict[str, Any], + user_id: str, +) -> Dict[str, str]: + original_files = extract_original_files_for_node_sequence( + project=project, normalized_project=normalized_project + ) + + download_links = await extract_download_links_from_storage( + app=app, original_files=original_files, user_id=user_id + ) + + files_checksums = await download_files_and_get_checksums( + app=app, + download_links=download_links, + ) + + return files_checksums + + ################ end utils @@ -287,7 +409,8 @@ async def import_study_from_file(client, file_path: Path) -> str: @pytest.mark.parametrize("export_version", get_exported_projects()) -async def test_import_export_import( +async def test_import_export_import_duplicate( + loop, client, push_services_to_registry, socketio_client, @@ -298,9 +421,12 @@ async def test_import_export_import( simcore_services, monkey_patch_aiohttp_request_url, ): - """Check that the full import -> export -> import cycle produces the same result in the DB""" + """ + Checks if the full "import -> export -> import -> duplicate" cycle + produces the same result in the DB. + """ - _ = await login_user(client) + user = await login_user(client) export_file_name = export_version.name version_from_name = export_file_name.split("#")[0] @@ -335,15 +461,73 @@ async def test_import_export_import( client, downloaded_file_path ) + # duplicate newly imported project + url_duplicate = client.app.router["duplicate_project"].url_for( + project_id=imported_project_uuid + ) + assert url_duplicate == URL( + API_PREFIX + f"/projects/{imported_project_uuid}:duplicate" + ) + async with await client.post(url_duplicate, timeout=10) as duplicate_response: + assert duplicate_response.status == 200, await duplicate_response.text() + reply_data = await duplicate_response.json() + assert reply_data.get("data") is not None + + duplicated_project_uuid = reply_data["data"]["uuid"] + imported_project = await query_project_from_db(db_engine, imported_project_uuid) reimported_project = await query_project_from_db(db_engine, reimported_project_uuid) + duplicated_project = await query_project_from_db(db_engine, duplicated_project_uuid) # uuids are changed each time the project is imported, need to normalize them normalized_imported_project = replace_uuids_with_sequences(imported_project) normalized_reimported_project = replace_uuids_with_sequences(reimported_project) + normalized_duplicated_project = replace_uuids_with_sequences(duplicated_project) + + # ensure values are different + for key in KEYS_TO_IGNORE_FROM_COMPARISON: + assert_combined_entires_condition( + normalized_imported_project[key], + normalized_reimported_project[key], + normalized_duplicated_project[key], + condition_operator=operator.ne, + ) + + # assert same structure in both directories + assert_combined_entires_condition( + dict_without_keys(normalized_imported_project, KEYS_TO_IGNORE_FROM_COMPARISON), + dict_without_keys( + normalized_reimported_project, KEYS_TO_IGNORE_FROM_COMPARISON + ), + dict_without_keys( + normalized_duplicated_project, KEYS_TO_IGNORE_FROM_COMPARISON + ), + condition_operator=operator.eq, + ) + + # check files in storage fingerprint matches + imported_files_checksums = await get_checksmus_for_files_in_storage( + app=client.app, + project=imported_project, + normalized_project=normalized_imported_project, + user_id=user["id"], + ) + reimported_files_checksums = await 
get_checksmus_for_files_in_storage( + app=client.app, + project=reimported_project, + normalized_project=normalized_reimported_project, + user_id=user["id"], + ) + duplicated_files_checksums = await get_checksmus_for_files_in_storage( + app=client.app, + project=duplicated_project, + normalized_project=normalized_duplicated_project, + user_id=user["id"], + ) - assert dict_without_keys( - normalized_imported_project, KEYS_TO_IGNORE_FROM_COMPARISON - ) == dict_without_keys( - normalized_reimported_project, KEYS_TO_IGNORE_FROM_COMPARISON + assert_combined_entires_condition( + imported_files_checksums, + reimported_files_checksums, + duplicated_files_checksums, + condition_operator=operator.eq, ) diff --git a/services/web/server/tests/unit/isolated/test_exporter_archiving.py b/services/web/server/tests/unit/isolated/test_exporter_archiving.py index aff49064e52..6361dfd113f 100644 --- a/services/web/server/tests/unit/isolated/test_exporter_archiving.py +++ b/services/web/server/tests/unit/isolated/test_exporter_archiving.py @@ -4,7 +4,15 @@ import sys import tempfile import uuid +import hashlib +import random from pathlib import Path +import asyncio +from typing import Set, List, Dict, Iterator +from concurrent.futures import ProcessPoolExecutor +import string +import secrets + import pytest from simcore_service_webserver.exporter.archiving import ( @@ -50,9 +58,18 @@ def returncode(self): @pytest.fixture -def temp_dir() -> Path: - with tempfile.TemporaryDirectory() as dir_path: - yield Path(dir_path) +def temp_dir(tmpdir) -> Path: + # cast to Path object + yield Path(tmpdir) + + +@pytest.fixture +def temp_dir2() -> Path: + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + extract_dir_path = temp_dir_path / "extract_dir" + extract_dir_path.mkdir(parents=True, exist_ok=True) + yield extract_dir_path @pytest.fixture @@ -68,10 +85,66 @@ def project_uuid(): return str(uuid.uuid4()) +@pytest.fixture +def dir_with_random_content() -> Path: + def random_string(length: int) -> str: + return "".join(secrets.choice(string.ascii_letters) for i in range(length)) + + def make_files_in_dir(dir_path: Path, file_count: int) -> None: + for _ in range(file_count): + (dir_path / f"{random_string(8)}.bin").write_bytes( + os.urandom(random.randint(1, 10)) + ) + + def ensure_dir(path_to_ensure: Path) -> Path: + path_to_ensure.mkdir(parents=True, exist_ok=True) + return path_to_ensure + + def make_subdirectory_with_content(subdir_name: Path, max_file_count: int) -> None: + subdir_name = ensure_dir(subdir_name) + make_files_in_dir( + dir_path=subdir_name, + file_count=random.randint(1, max_file_count), + ) + + def make_subdirectories_with_content( + subdir_name: Path, max_subdirectories_count: int, max_file_count: int + ) -> None: + subdirectories_count = random.randint(1, max_subdirectories_count) + for _ in range(subdirectories_count): + make_subdirectory_with_content( + subdir_name=subdir_name / f"{random_string(4)}", + max_file_count=max_file_count, + ) + + def get_dirs_and_subdris_in_path(path_to_scan: Path) -> Iterator[Path]: + return [path for path in path_to_scan.rglob("*") if path.is_dir()] + + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + data_container = ensure_dir(temp_dir_path / "study_data") + + make_subdirectories_with_content( + subdir_name=data_container, max_subdirectories_count=5, max_file_count=5 + ) + make_files_in_dir(dir_path=data_container, file_count=5) + + # creates a good amount of files + for _ in range(4): + for 
subdirectory_path in get_dirs_and_subdris_in_path(data_container): + make_subdirectories_with_content( + subdir_name=subdirectory_path, + max_subdirectories_count=3, + max_file_count=3, + ) + + yield temp_dir_path + + def temp_dir_with_existing_archive(temp_dir, project_uui) -> Path: nested_dir = temp_dir / "nested" nested_dir.mkdir(parents=True, exist_ok=True) - nested_file = nested_dir.parent / f"{project_uui}.zip" + nested_file = nested_dir / "archive.zip" nested_file.write_text("some_data") return nested_dir @@ -98,6 +171,80 @@ def temp_dir_to_compress_with_too_many_targets(temp_dir, project_uuid) -> Path: return nested_dir +def strip_directory_from_path(input_path: Path, to_strip: Path) -> Path: + to_strip = f"{str(to_strip)}/" + return Path(str(input_path).replace(to_strip, "")) + + +def get_all_files_in_dir(dir_path: Path) -> Set[Path]: + return { + strip_directory_from_path(x, dir_path) + for x in dir_path.rglob("*") + if x.is_file() + } + + +def _compute_hash(file_path: Path) -> str: + with open(file_path, "rb") as file_to_hash: + file_hash = hashlib.md5() + chunk = file_to_hash.read(8192) + while chunk: + file_hash.update(chunk) + chunk = file_to_hash.read(8192) + + return file_path, file_hash.hexdigest() + + +async def compute_hashes(file_paths: List[Path]) -> Dict[Path, str]: + """given a list of files computes hashes for the files on a process pool""" + + loop = asyncio.get_event_loop() + + with ProcessPoolExecutor() as prcess_pool_executor: + tasks = [ + loop.run_in_executor(prcess_pool_executor, _compute_hash, file_path) + for file_path in file_paths + ] + return {k: v for k, v in await asyncio.gather(*tasks)} + + +def full_file_path_from_dir_and_subdirs(dir_path: Path) -> List[Path]: + return [x for x in dir_path.rglob("*") if x.is_file()] + + +async def assert_same_directory_content( + dir_to_compress: Path, output_dir: Path +) -> None: + input_set = get_all_files_in_dir(dir_to_compress) + output_set = get_all_files_in_dir(output_dir) + assert ( + input_set == output_set + ), f"There following files are missing {input_set - output_set}" + + # computing the hashes for dir_to_compress and map in a dict + # with the name starting from the root of the directory and md5sum + dir_to_compress_hashes = { + strip_directory_from_path(k, dir_to_compress): v + for k, v in ( + await compute_hashes(full_file_path_from_dir_and_subdirs(dir_to_compress)) + ).items() + } + + # computing the hashes for output_dir and map in a dict + # with the name starting from the root of the directory and md5sum + output_dir_hashes = { + strip_directory_from_path(k, output_dir): v + for k, v in ( + await compute_hashes(full_file_path_from_dir_and_subdirs(output_dir)) + ).items() + } + + # finally check if hashes are mapped 1 to 1 in order to verify + # that the compress/decompress worked correctly + for key in dir_to_compress_hashes: + assert dir_to_compress_hashes[key] == output_dir_hashes[key] + + # end utils @@ -153,72 +300,40 @@ def test_validate_osparc_file_name_too_many_shasums(): ) -async def test_error_during_compression(temp_dir, project_uuid): +async def test_error_during_decompression(loop): with pytest.raises(ExporterException) as exc_info: - await zip_folder(project_id=project_uuid, input_path=temp_dir) + await unzip_folder(Path("/i/do/not/exist"), "/") assert exc_info.type is ExporterException - assert ( - exc_info.value.args[0] - == f"[Errno 2] No such file or directory: '{project_uuid}'" + assert exc_info.value.args[0] == ( + "There was an error while extracting directory '/i/do/not/exist' 
" + "to '/' [Errno 2] No such file or directory: '/i/do/not/exist'" ) -async def test_error_during_decompression(): - with pytest.raises(ExporterException) as exc_info: - await unzip_folder(Path("/i/do/not/exist")) - - assert exc_info.type is ExporterException - assert exc_info.value.args[0] == "/i/do/not/exist is not a zip file" - - -async def test_archive_already_exists(temp_dir, project_uuid): +async def test_archive_already_exists(loop, temp_dir, project_uuid): tmp_dir_to_compress = temp_dir_with_existing_archive(temp_dir, project_uuid) with pytest.raises(ExporterException) as exc_info: - await zip_folder(project_id=project_uuid, input_path=tmp_dir_to_compress) + await zip_folder( + folder_to_zip=tmp_dir_to_compress, destination_folder=tmp_dir_to_compress + ) assert exc_info.type is ExporterException assert ( exc_info.value.args[0] - == f"Cannot archive because file already exists '{str(temp_dir)}/{project_uuid}.zip'" - ) - - -@pytest.mark.parametrize("no_compresion", [True, False]) -async def test_zip_unzip_folder( - temp_dir, project_uuid, no_compresion, monkey_patch_asyncio_subporcess -): - tmp_dir_to_compress = temp_dir_to_compress(temp_dir, project_uuid) - file_in_archive = tmp_dir_to_compress / "random_file.txt" - data_before_compression = file_in_archive.read_text() - - archive_path = await zip_folder( - project_id=project_uuid, input_path=tmp_dir_to_compress + == f"Cannot archive '{temp_dir}/nested' because '{str(temp_dir)}/nested/archive.zip' already exists" ) - str_archive_path = str(archive_path) - - assert ".osparc" in str_archive_path - assert "#" in str_archive_path - - os.system(f"rm -rf {str(tmp_dir_to_compress)}") - - await unzip_folder(archive_path) - - data_after_decompression = file_in_archive.read_text() - - assert data_before_compression == data_after_decompression - async def test_unzip_found_too_many_project_targets( - temp_dir, project_uuid, monkey_patch_asyncio_subporcess + loop, temp_dir, project_uuid, monkey_patch_asyncio_subporcess ): tmp_dir_to_compress = temp_dir_to_compress_with_too_many_targets( temp_dir, project_uuid ) archive_path = await zip_folder( - project_id=project_uuid, input_path=tmp_dir_to_compress + folder_to_zip=tmp_dir_to_compress, destination_folder=tmp_dir_to_compress ) str_archive_path = str(archive_path) @@ -229,9 +344,30 @@ async def test_unzip_found_too_many_project_targets( os.system(f"rm -rf {str(tmp_dir_to_compress)}") with pytest.raises(ExporterException) as exc_info: - await unzip_folder(archive_path) + await unzip_folder(archive_path, archive_path.parent) assert exc_info.type is ExporterException assert exc_info.value.args[0].startswith( - "Unexpected number of directories after unzipping" + "There was an error while extracting directory" + ) + + +async def test_same_dir_structure_after_compress_decompress( + loop, dir_with_random_content: Path, temp_dir2: Path +): + zip_archive = await zip_folder( + folder_to_zip=dir_with_random_content, + destination_folder=dir_with_random_content, + ) + + unzipped_content = await unzip_folder( + archive_to_extract=zip_archive, destination_folder=temp_dir2 + ) + zip_archive.unlink() + (unzipped_content.parent / "archive.zip").unlink() + + print(unzipped_content.parent) + print(dir_with_random_content) + await assert_same_directory_content( + dir_with_random_content, unzipped_content.parent )
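
Usage note (not part of the patch): a minimal sketch of how the new zip_folder/unzip_folder helpers in exporter/archiving.py can be exercised, following the same call pattern used by study_export/study_import and the archiving tests. It assumes simcore_service_webserver and its dependencies are importable; the directory layout and file names below are illustrative only.

import asyncio
from pathlib import Path
from tempfile import TemporaryDirectory

from simcore_service_webserver.exporter.archiving import unzip_folder, zip_folder


async def _roundtrip() -> None:
    with TemporaryDirectory() as workspace, TemporaryDirectory() as extract_to:
        # zip_folder expects the folder it archives to contain a single
        # top-level project directory (the layout study_export prepares)
        project_dir = Path(workspace) / "some-project-uuid"  # illustrative name
        project_dir.mkdir()
        (project_dir / "workbench.json").write_text("{}")

        # creates archive.zip in the destination, checksums it and renames it
        # to "<4_random_chars>#SHA256=<digest>.osparc"
        osparc_archive = await zip_folder(
            folder_to_zip=Path(workspace), destination_folder=Path(workspace)
        )

        # extracts all archive entries in parallel and returns the single
        # unzipped project directory found under the destination
        unzipped_project_dir = await unzip_folder(
            archive_to_extract=osparc_archive, destination_folder=Path(extract_to)
        )
        assert (unzipped_project_dir / "workbench.json").exists()


if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(_roundtrip())

Note that zip_folder refuses to overwrite an existing archive.zip in the destination folder, so repeated exports should each use a fresh temporary directory, as the request handlers above do.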