Skip to content

Commit 9380819

Browse files
authored
🐛 Fixes catalog's synchronization background task continues errors due to faulty service info (#6344)
1 parent 3457e01 commit 9380819

File tree

6 files changed

+80
-35
lines changed

6 files changed

+80
-35
lines changed

packages/service-library/src/servicelib/rabbitmq/rpc_interfaces/catalog/services.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ async def _call(
5454
user_id=user_id,
5555
limit=limit,
5656
offset=offset,
57-
timeout_s=20 * RPC_REQUEST_DEFAULT_TIMEOUT_S,
57+
timeout_s=40 * RPC_REQUEST_DEFAULT_TIMEOUT_S,
5858
)
5959

6060
result = await _call(

services/catalog/src/simcore_service_catalog/core/background_tasks.py

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@
1515
from pprint import pformat
1616
from typing import Final
1717

18-
from fastapi import FastAPI
18+
from fastapi import FastAPI, HTTPException
1919
from models_library.services import ServiceMetaDataPublished
2020
from models_library.services_types import ServiceKey, ServiceVersion
2121
from packaging.version import Version
22+
from pydantic import ValidationError
2223
from simcore_service_catalog.api.dependencies.director import get_director_api
2324
from simcore_service_catalog.services import manifest
25+
from sqlalchemy.exc import SQLAlchemyError
2426
from sqlalchemy.ext.asyncio import AsyncEngine
2527

2628
from ..db.repositories.groups import GroupsRepository
@@ -62,32 +64,45 @@ def _by_version(t: tuple[ServiceKey, ServiceVersion]) -> Version:
6264
sorted_services = sorted(service_keys, key=_by_version)
6365

6466
for service_key, service_version in sorted_services:
67+
6568
service_metadata: ServiceMetaDataPublished = services_in_registry[
6669
(service_key, service_version)
6770
]
68-
## Set deprecation date to null (is valid date value for postgres)
71+
try:
72+
## Set deprecation date to null (is valid date value for postgres)
6973

70-
# DEFAULT policies
71-
(
72-
owner_gid,
73-
service_access_rights,
74-
) = await access_rights.evaluate_default_policy(app, service_metadata)
74+
# DEFAULT policies
75+
(
76+
owner_gid,
77+
service_access_rights,
78+
) = await access_rights.evaluate_default_policy(app, service_metadata)
7579

76-
# AUTO-UPGRADE PATCH policy
77-
inherited_access_rights = await access_rights.evaluate_auto_upgrade_policy(
78-
service_metadata, services_repo
79-
)
80+
# AUTO-UPGRADE PATCH policy
81+
inherited_access_rights = await access_rights.evaluate_auto_upgrade_policy(
82+
service_metadata, services_repo
83+
)
8084

81-
service_access_rights += inherited_access_rights
82-
service_access_rights = access_rights.reduce_access_rights(
83-
service_access_rights
84-
)
85+
service_access_rights += inherited_access_rights
86+
service_access_rights = access_rights.reduce_access_rights(
87+
service_access_rights
88+
)
8589

86-
# set the service in the DB
87-
await services_repo.create_or_update_service(
88-
ServiceMetaDataAtDB(**service_metadata.dict(), owner=owner_gid),
89-
service_access_rights,
90-
)
90+
# set the service in the DB
91+
await services_repo.create_or_update_service(
92+
ServiceMetaDataAtDB(**service_metadata.dict(), owner=owner_gid),
93+
service_access_rights,
94+
)
95+
96+
except (HTTPException, ValidationError, SQLAlchemyError) as err:
97+
# Resilient to single failures: an error in an individual (service_key, service_version) pair should not prevent the evaluation of the rest
98+
# nor stop the background task from running.
99+
# SEE https://github.com/ITISFoundation/osparc-simcore/issues/6318
100+
_logger.warning(
101+
"Skipping '%s:%s' due to %s",
102+
service_key,
103+
service_version,
104+
err,
105+
)
91106

92107

93108
async def _ensure_registry_and_database_are_synced(app: FastAPI) -> None:

services/catalog/src/simcore_service_catalog/exceptions/errors.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,13 @@ class UninitializedGroupError(RepositoryError):
1616
msg_tempalte = "{group} groups was never initialized"
1717

1818

19-
class DirectorUnresponsiveError(CatalogBaseError):
19+
class BaseDirectorError(CatalogBaseError):
20+
...
21+
22+
23+
class DirectorUnresponsiveError(BaseDirectorError):
2024
msg_template = "Director-v0 is not responsive"
25+
26+
27+
class DirectorStatusError(BaseDirectorError):
28+
...

services/catalog/src/simcore_service_catalog/services/access_rights.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import logging
66
import operator
77
from collections.abc import Callable
8-
from datetime import datetime, timezone
8+
from datetime import UTC, datetime
99
from typing import Any, cast
1010
from urllib.parse import quote_plus
1111

@@ -25,9 +25,7 @@
2525

2626
_logger = logging.getLogger(__name__)
2727

28-
_LEGACY_SERVICES_DATE: datetime = datetime(
29-
year=2020, month=8, day=19, tzinfo=timezone.utc
30-
)
28+
_LEGACY_SERVICES_DATE: datetime = datetime(year=2020, month=8, day=19, tzinfo=UTC)
3129

3230

3331
def _is_frontend_service(service: ServiceMetaDataPublished) -> bool:
@@ -46,9 +44,6 @@ async def _is_old_service(app: FastAPI, service: ServiceMetaDataPublished) -> bo
4644
)
4745
if not data or "build_date" not in data:
4846
return True
49-
50-
_logger.debug("retrieved service extras are %s", data)
51-
5247
service_build_data = arrow.get(data["build_date"]).datetime
5348
return bool(service_build_data < _LEGACY_SERVICES_DATE)
5449

@@ -63,6 +58,12 @@ async def evaluate_default_policy(
6358
1. All services published in osparc prior to 19.08.2020 will be visible to everyone (referred to as 'old service').
6459
2. Services published after 19.08.2020 will be visible ONLY to their owner
6560
3. Front-end services grant execute-access to everyone
61+
62+
63+
Raises:
64+
HTTPException: from calls to director's rest API. Maps director errors into catalog's server error
65+
SQLAlchemyError: from access to pg database
66+
ValidationError: from pydantic model errors
6667
"""
6768
db_engine: AsyncEngine = app.state.engine
6869

services/catalog/tests/unit/with_dbs/test_core_background_task__sync.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
from typing import Any
1010

1111
import pytest
12-
from fastapi import FastAPI
12+
from fastapi import FastAPI, HTTPException, status
13+
from pytest_mock import MockerFixture
1314
from respx.router import MockRouter
1415
from simcore_postgres_database.models.services import services_meta_data
1516
from simcore_service_catalog.core.background_tasks import _run_sync_services
@@ -40,6 +41,7 @@ async def cleanup_service_meta_data_db_content(sqlalchemy_async_engine: AsyncEng
4041
await conn.execute(services_meta_data.delete())
4142

4243

44+
@pytest.mark.parametrize("director_fails", [False, True])
4345
async def test_registry_sync_task(
4446
background_tasks_setup_disabled: None,
4547
rabbitmq_and_rpc_setup_disabled: None,
@@ -49,10 +51,20 @@ async def test_registry_sync_task(
4951
app: FastAPI,
5052
services_repo: ServicesRepository,
5153
cleanup_service_meta_data_db_content: None,
54+
mocker: MockerFixture,
55+
director_fails: bool,
5256
):
53-
5457
assert app.state
5558

59+
if director_fails:
60+
# Emulates issue https://github.com/ITISFoundation/osparc-simcore/issues/6318
61+
mocker.patch(
62+
"simcore_service_catalog.services.access_rights._is_old_service",
63+
side_effect=HTTPException(
64+
status_code=status.HTTP_404_NOT_FOUND, detail="fake director error"
65+
),
66+
)
67+
5668
service_key = expected_director_list_services[0]["key"]
5769
service_version = expected_director_list_services[0]["version"]
5870

@@ -75,6 +87,10 @@ async def test_registry_sync_task(
7587
key=service_key,
7688
version=service_version,
7789
)
78-
assert got_from_db
79-
assert got_from_db.key == service_key
80-
assert got_from_db.version == service_version
90+
91+
if director_fails:
92+
assert not got_from_db
93+
else:
94+
assert got_from_db
95+
assert got_from_db.key == service_key
96+
assert got_from_db.version == service_version

services/web/server/docker/boot.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,14 @@ echo "$INFO" "Selected config ${APP_CONFIG}"
3838
echo "$INFO" "Log-level app/server: $APP_LOG_LEVEL/$SERVER_LOG_LEVEL"
3939

4040
# NOTE: the number of workers ```(2 x $num_cores) + 1``` is
41-
# the official recommendation [https://docs.gunicorn.org/en/latest/design.html#how-many-workers]
41+
# the official recommendation https://docs.gunicorn.org/en/latest/design.html#how-many-workers
4242
# For now we set it to 1 to check what happens with websockets
43+
#
4344
# SEE also https://docs.aiohttp.org/en/stable/deployment.html#start-gunicorn
45+
#
46+
# NOTE: the GUNICORN_CMD_ARGS environment variable also affects gunicorn
47+
# SEE https://docs.gunicorn.org/en/latest/settings.html#settings
48+
echo "$INFO" "GUNICORN_CMD_ARGS: $GUNICORN_CMD_ARGS"
4449

4550
if [ "${SC_BOOT_MODE}" = "debug" ]; then
4651
# NOTE: ptvsd is programmatically enabled inside of the service

0 commit comments

Comments
 (0)