Skip to content

Commit 7dd5842

Browse files
authored
🎨E2E: Fail fast conditions + websocket logging in case of error (#7463)
1 parent c1f0a35 commit 7dd5842

File tree

7 files changed

+210
-97
lines changed

7 files changed

+210
-97
lines changed

packages/pytest-simcore/src/pytest_simcore/helpers/playwright.py

+118-32
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# pylint:disable=unused-variable
2+
# pylint:disable=unused-argument
3+
# pylint:disable=redefined-outer-name
4+
# pylint:disable=too-many-instance-attributes
5+
16
import contextlib
27
import json
38
import logging
@@ -8,12 +13,25 @@
813
from dataclasses import dataclass, field
914
from datetime import UTC, datetime, timedelta
1015
from enum import Enum, unique
16+
from pathlib import Path
1117
from typing import Any, Final
1218

19+
import pytest
1320
from playwright._impl._sync_base import EventContextManager
14-
from playwright.sync_api import APIRequestContext, FrameLocator, Locator, Page, Request
21+
from playwright.sync_api import (
22+
APIRequestContext,
23+
)
24+
from playwright.sync_api import Error as PlaywrightError
25+
from playwright.sync_api import (
26+
FrameLocator,
27+
Locator,
28+
Page,
29+
Request,
30+
)
1531
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
16-
from playwright.sync_api import WebSocket
32+
from playwright.sync_api import (
33+
WebSocket,
34+
)
1735
from pydantic import AnyUrl
1836

1937
from .logging_tools import log_context
@@ -63,12 +81,15 @@ class NodeProgressType(str, Enum):
6381
# NOTE: this is a partial duplicate of models_library/rabbitmq_messages.py
6482
# It must remain as such until that module is pydantic V2 compatible
6583
CLUSTER_UP_SCALING = "CLUSTER_UP_SCALING"
66-
SERVICE_INPUTS_PULLING = "SERVICE_INPUTS_PULLING"
6784
SIDECARS_PULLING = "SIDECARS_PULLING"
85+
SERVICE_INPUTS_PULLING = "SERVICE_INPUTS_PULLING"
6886
SERVICE_OUTPUTS_PULLING = "SERVICE_OUTPUTS_PULLING"
6987
SERVICE_STATE_PULLING = "SERVICE_STATE_PULLING"
7088
SERVICE_IMAGES_PULLING = "SERVICE_IMAGES_PULLING"
7189
SERVICE_CONTAINERS_STARTING = "SERVICE_CONTAINERS_STARTING"
90+
SERVICE_STATE_PUSHING = "SERVICE_STATE_PUSHING"
91+
SERVICE_OUTPUTS_PUSHING = "SERVICE_OUTPUTS_PUSHING"
92+
PROJECT_CLOSING = "PROJECT_CLOSING"
7293

7394
@classmethod
7495
def required_types_for_started_service(cls) -> set["NodeProgressType"]:
@@ -94,6 +115,7 @@ class _OSparcMessages(str, Enum):
94115
SERVICE_DISK_USAGE = "serviceDiskUsage"
95116
WALLET_OSPARC_CREDITS_UPDATED = "walletOsparcCreditsUpdated"
96117
LOGGER = "logger"
118+
SERVICE_STATUS = "serviceStatus"
97119

98120

99121
@dataclass(frozen=True, slots=True, kw_only=True)
@@ -107,6 +129,9 @@ class SocketIOEvent:
107129
name: str
108130
obj: dict[str, Any]
109131

132+
def to_json(self) -> str:
133+
return json.dumps({"name": self.name, "obj": self.obj})
134+
110135

111136
SOCKETIO_MESSAGE_PREFIX: Final[str] = "42"
112137

@@ -286,23 +311,45 @@ def __call__(self, message: str) -> None:
286311
print("WS Message:", decoded_message.name, decoded_message.obj)
287312

288313

314+
_FAIL_FAST_DYNAMIC_SERVICE_STATES: Final[tuple[str, ...]] = ("idle", "failed")
315+
316+
289317
@dataclass
290318
class SocketIONodeProgressCompleteWaiter:
291319
node_id: str
292320
logger: logging.Logger
293321
product_url: AnyUrl
294322
api_request_context: APIRequestContext
295323
is_service_legacy: bool
324+
assertion_output_folder: Path
296325
_current_progress: dict[NodeProgressType, float] = field(
297326
default_factory=defaultdict
298327
)
299328
_last_poll_timestamp: datetime = field(default_factory=lambda: datetime.now(tz=UTC))
329+
_received_messages: list[SocketIOEvent] = field(default_factory=list)
330+
_service_ready: bool = False
300331

301332
def __call__(self, message: str) -> bool:
302333
# socket.io encodes messages like so
303334
# https://stackoverflow.com/questions/24564877/what-do-these-numbers-mean-in-socket-io-payload
304335
if message.startswith(SOCKETIO_MESSAGE_PREFIX):
305336
decoded_message = decode_socketio_42_message(message)
337+
self._received_messages.append(decoded_message)
338+
if (
339+
(decoded_message.name == _OSparcMessages.SERVICE_STATUS.value)
340+
and (decoded_message.obj["service_uuid"] == self.node_id)
341+
and (
342+
decoded_message.obj["service_state"]
343+
in _FAIL_FAST_DYNAMIC_SERVICE_STATES
344+
)
345+
):
346+
# NOTE: this is a fail fast for dynamic services that fail to start
347+
self.logger.error(
348+
"node %s failed with state %s, failing fast",
349+
self.node_id,
350+
decoded_message.obj["service_state"],
351+
)
352+
return True
306353
if decoded_message.name == _OSparcMessages.NODE_PROGRESS.value:
307354
node_progress_event = retrieve_node_progress_from_decoded_message(
308355
decoded_message
@@ -327,48 +374,56 @@ def __call__(self, message: str) -> bool:
327374
len(NodeProgressType.required_types_for_started_service()),
328375
f"{json.dumps({k: round(v, 2) for k, v in self._current_progress.items()})}",
329376
)
330-
331-
return self.got_expected_node_progress_types() and all(
377+
self._service_ready = self.got_expected_node_progress_types() and all(
332378
round(progress, 1) == 1.0
333379
for progress in self._current_progress.values()
334380
)
381+
return self._service_ready
335382

336383
_current_timestamp = datetime.now(UTC)
337384
if _current_timestamp - self._last_poll_timestamp > timedelta(seconds=5):
385+
# NOTE: we might have missed some websocket messages, and we check if the service is ready
338386
if self.is_service_legacy:
339387
url = f"https://{self.get_partial_product_url()}x/{self.node_id}/"
340388
else:
341389
url = (
342390
f"https://{self.node_id}.services.{self.get_partial_product_url()}"
343391
)
344-
with contextlib.suppress(PlaywrightTimeoutError):
345-
response = self.api_request_context.get(url, timeout=1000)
346-
level = logging.DEBUG
347-
if (response.status >= 400) and (response.status not in (502, 503)):
348-
level = logging.ERROR
392+
response = None
393+
with contextlib.suppress(
394+
PlaywrightTimeoutError, TimeoutError, PlaywrightError
395+
):
396+
response = self.api_request_context.get(url, timeout=5000)
397+
if response:
349398
self.logger.log(
350-
level,
399+
(
400+
logging.ERROR
401+
if (response.status >= 400)
402+
and (response.status not in (502, 503))
403+
else logging.DEBUG
404+
),
351405
"Querying service endpoint in case we missed some websocket messages. Url: %s Response: '%s' TIP: %s",
352406
url,
353407
f"{response.status}: {response.text()}",
354408
(
355-
"We are emulating the frontend; a 5XX response is acceptable if the service is not yet ready."
409+
"We are emulating the frontend; a 502/503 response is acceptable if the service is not yet ready."
356410
),
357411
)
358412

359413
if response.status <= 400:
360-
# NOTE: If the response status is less than 400, it means that the backend is ready (There are some services that respond with a 3XX)
414+
# NOTE: If the response status is less than 400, it means that the service is ready (There are some services that respond with a 3XX)
361415
if self.got_expected_node_progress_types():
362416
self.logger.warning(
363417
"⚠️ Progress bar didn't receive 100 percent but service is already running: %s. TIP: we missed some websocket messages! ⚠️", # https://github.com/ITISFoundation/osparc-simcore/issues/6449
364418
self.get_current_progress(),
365419
)
420+
self._service_ready = True
366421
return True
367-
self._last_poll_timestamp = datetime.now(UTC)
422+
self._last_poll_timestamp = datetime.now(UTC)
368423

369424
return False
370425

371-
def got_expected_node_progress_types(self):
426+
def got_expected_node_progress_types(self) -> bool:
372427
return all(
373428
progress_type in self._current_progress
374429
for progress_type in NodeProgressType.required_types_for_started_service()
@@ -377,9 +432,35 @@ def got_expected_node_progress_types(self):
377432
def get_current_progress(self):
378433
return self._current_progress.values()
379434

380-
def get_partial_product_url(self):
435+
def get_partial_product_url(self) -> str:
381436
return f"{self.product_url}".split("//")[1]
382437

438+
@property
439+
def number_received_messages(self) -> int:
440+
return len(self._received_messages)
441+
442+
def assert_service_ready(self) -> None:
443+
if not self._service_ready:
444+
with self.assertion_output_folder.joinpath("websocket.json").open("w") as f:
445+
f.writelines("[")
446+
f.writelines(
447+
f"{msg.to_json()}," for msg in self._received_messages[:-1]
448+
)
449+
f.writelines(
450+
f"{self._received_messages[-1].to_json()}"
451+
) # no comma for last element
452+
f.writelines("]")
453+
assert self._service_ready, (
454+
f"the service failed and received {self.number_received_messages} websocket messages while waiting!"
455+
"\nTIP: check websocket.log for detailed information in the test-results folder!"
456+
)
457+
458+
459+
_FAIL_FAST_COMPUTATIONAL_STATES: Final[tuple[RunningState, ...]] = (
460+
RunningState.FAILED,
461+
RunningState.ABORTED,
462+
)
463+
383464

384465
def wait_for_pipeline_state(
385466
current_state: RunningState,
@@ -397,13 +478,22 @@ def wait_for_pipeline_state(
397478
f"pipeline is now in {current_state=}",
398479
),
399480
):
400-
waiter = SocketIOProjectStateUpdatedWaiter(expected_states=expected_states)
481+
waiter = SocketIOProjectStateUpdatedWaiter(
482+
expected_states=expected_states + _FAIL_FAST_COMPUTATIONAL_STATES
483+
)
401484
with websocket.expect_event(
402485
"framereceived", waiter, timeout=timeout_ms
403486
) as event:
404487
current_state = retrieve_project_state_from_decoded_message(
405488
decode_socketio_42_message(event.value)
406489
)
490+
if (
491+
current_state in _FAIL_FAST_COMPUTATIONAL_STATES
492+
and current_state not in expected_states
493+
):
494+
pytest.fail(
495+
f"Pipeline failed with state {current_state}. Expected one of {expected_states}"
496+
)
407497
return current_state
408498

409499

@@ -437,6 +527,7 @@ def expected_service_running(
437527
press_start_button: bool,
438528
product_url: AnyUrl,
439529
is_service_legacy: bool,
530+
assertion_output_folder: Path,
440531
) -> Generator[ServiceRunning, None, None]:
441532
with log_context(
442533
logging.INFO, msg=f"Waiting for node to run. Timeout: {timeout}"
@@ -447,25 +538,16 @@ def expected_service_running(
447538
product_url=product_url,
448539
api_request_context=page.request,
449540
is_service_legacy=is_service_legacy,
541+
assertion_output_folder=assertion_output_folder,
450542
)
451543
service_running = ServiceRunning(iframe_locator=None)
452544

453-
try:
454-
with websocket.expect_event("framereceived", waiter, timeout=timeout):
455-
if press_start_button:
456-
_trigger_service_start(page, node_id)
457-
458-
yield service_running
459-
460-
except PlaywrightTimeoutError:
461-
if waiter.got_expected_node_progress_types():
462-
ctx.logger.warning(
463-
"⚠️ Progress bar didn't receive 100 percent but all expected node-progress-types are in place: %s ⚠️", # https://github.com/ITISFoundation/osparc-simcore/issues/6449
464-
waiter.get_current_progress(),
465-
)
466-
else:
467-
raise
545+
with websocket.expect_event("framereceived", waiter, timeout=timeout):
546+
if press_start_button:
547+
_trigger_service_start(page, node_id)
468548

549+
yield service_running
550+
waiter.assert_service_ready()
469551
service_running.iframe_locator = page.frame_locator(
470552
f'[osparc-test-id="iframe_{node_id}"]'
471553
)
@@ -480,6 +562,7 @@ def wait_for_service_running(
480562
press_start_button: bool,
481563
product_url: AnyUrl,
482564
is_service_legacy: bool,
565+
assertion_output_folder: Path,
483566
) -> FrameLocator:
484567
"""NOTE: if the service was already started this will not work as some of the required websocket events will not be emitted again
485568
In which case this will need further adjutment"""
@@ -493,10 +576,13 @@ def wait_for_service_running(
493576
product_url=product_url,
494577
api_request_context=page.request,
495578
is_service_legacy=is_service_legacy,
579+
assertion_output_folder=assertion_output_folder,
496580
)
497581
with websocket.expect_event("framereceived", waiter, timeout=timeout):
498582
if press_start_button:
499583
_trigger_service_start(page, node_id)
584+
585+
waiter.assert_service_ready()
500586
return page.frame_locator(f'[osparc-test-id="iframe_{node_id}"]')
501587

502588

packages/pytest-simcore/src/pytest_simcore/helpers/playwright_sim4life.py

+3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33
import re
44
from dataclasses import dataclass
5+
from pathlib import Path
56
from typing import Final, TypedDict
67

78
import arrow
@@ -107,6 +108,7 @@ def wait_for_launched_s4l(
107108
copy_workspace: bool,
108109
product_url: AnyUrl,
109110
is_service_legacy: bool,
111+
assertion_output_folder: Path,
110112
) -> WaitForS4LDict:
111113
with log_context(logging.INFO, "launch S4L") as ctx:
112114
predicate = S4LWaitForWebsocket(logger=ctx.logger)
@@ -134,6 +136,7 @@ def wait_for_launched_s4l(
134136
press_start_button=False,
135137
product_url=product_url,
136138
is_service_legacy=is_service_legacy,
139+
assertion_output_folder=assertion_output_folder,
137140
)
138141
s4l_websocket = ws_info.value
139142
ctx.logger.info("acquired S4L websocket!")

0 commit comments

Comments
 (0)