Commit d8c8f6c

GitHK, Andrei Neagu, and mergify[bot] authored
🐛 Fixed issue where GC stopping too early (#7457)
Co-authored-by: Andrei Neagu <[email protected]>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
1 parent f09b9c2 commit d8c8f6c

File tree: 2 files changed (+78 −80 lines)

services/web/server/src/simcore_service_webserver/garbage_collector/_core.py

+7 −8

@@ -1,12 +1,9 @@
-""" Core implementation of garbage collector
-
-
-"""
+"""Core implementation of garbage collector"""
 
 import logging
 
 from aiohttp import web
-from servicelib.logging_utils import log_context
+from servicelib.logging_utils import log_catch, log_context
 
 from ..resource_manager.registry import RedisResourceRegistry, get_registry
 from ._core_disconnected import remove_disconnected_user_resources
@@ -39,21 +36,23 @@ async def collect_garbage(app: web.Application):
     """
     registry: RedisResourceRegistry = get_registry(app)
 
-    with log_context(
+    with log_catch(_logger, reraise=False), log_context(
         _logger, logging.INFO, "Step 1: Removes disconnected user sessions"
     ):
         # Triggers signal to close possible pending opened projects
         # Removes disconnected GUEST users after they finished their sessions
         await remove_disconnected_user_resources(registry, app)
 
-    with log_context(
+    with log_catch(_logger, reraise=False), log_context(
         _logger, logging.INFO, "Step 2: Removes users manually marked for removal"
     ):
         # if a user was manually marked as GUEST it needs to be
         # removed together with all the associated projects
         await remove_users_manually_marked_as_guests(registry, app)
 
-    with log_context(_logger, logging.INFO, "Step 3: Removes orphaned services"):
+    with log_catch(_logger, reraise=False), log_context(
+        _logger, logging.INFO, "Step 3: Removes orphaned services"
+    ):
         # For various reasons, some services remain pending after
         # the projects are closed or the user was disconencted.
         # This will close and remove all these services from
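
Note on the change above: each GC step is now entered through log_catch(_logger, reraise=False), so an exception raised in one step is logged and swallowed instead of propagating and aborting the remaining steps, which is what the commit title refers to as the GC stopping too early. The snippet below is a minimal, self-contained sketch of that pattern; the log_catch here is a hypothetical stand-in built with contextlib, not the actual servicelib.logging_utils implementation.

import logging
from contextlib import contextmanager

_logger = logging.getLogger(__name__)


@contextmanager
def log_catch(logger: logging.Logger, *, reraise: bool = True):
    # Hypothetical stand-in for servicelib.logging_utils.log_catch:
    # log any exception raised inside the block and optionally swallow it.
    try:
        yield
    except Exception:
        logger.exception("step failed")
        if reraise:
            raise


def collect_garbage_sketch() -> None:
    # Before the fix an exception in step 1 propagated, so steps 2 and 3
    # never ran. With reraise=False every step is isolated and the
    # remaining steps still execute.
    with log_catch(_logger, reraise=False):
        raise RuntimeError("step 1 blew up")  # logged, then swallowed
    with log_catch(_logger, reraise=False):
        _logger.info("step 2 still runs")
    with log_catch(_logger, reraise=False):
        _logger.info("step 3 still runs")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    collect_garbage_sketch()

Running the sketch logs the step-1 failure with a traceback and still prints the step-2 and step-3 messages.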

services/web/server/src/simcore_service_webserver/garbage_collector/_core_orphans.py

+71 −72

@@ -12,7 +12,7 @@
 from servicelib.common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE
 from servicelib.logging_errors import create_troubleshotting_log_kwargs
 from servicelib.logging_utils import log_catch, log_context
-from servicelib.utils import limited_as_completed, logged_gather
+from servicelib.utils import limited_as_completed, limited_gather
 
 from ..dynamic_scheduler import api as dynamic_scheduler_service
 from ..projects._projects_service import (
@@ -50,10 +50,10 @@ async def _remove_service(
     except (UserNotFoundError, ValueError):
         save_service_state = False
 
-    with log_context(
+    with log_catch(_logger, reraise=False), log_context(
         _logger,
         logging.INFO,
-        msg=f"removing {(service.node_uuid, service.host)} with {save_service_state=}",
+        f"removing {(service.node_uuid, service.host)} with {save_service_state=}",
     ):
         await dynamic_scheduler_service.stop_dynamic_service(
             app,
@@ -89,73 +89,72 @@ async def remove_orphaned_services(
     # otherwise if some time goes it could very well be that there are new projects opened
     # in between and the GC would remove services that actually should be running.
 
-    with log_catch(_logger, reraise=False):
-        running_services = await dynamic_scheduler_service.list_dynamic_services(app)
-        if not running_services:
-            # nothing to do
-            return
-        _logger.debug(
-            "Actual running dynamic services: %s",
-            [(x.node_uuid, x.host) for x in running_services],
-        )
-        running_services_by_id: dict[NodeID, DynamicServiceGet] = {
-            service.node_uuid: service for service in running_services
-        }
-
-        known_opened_project_ids = await _list_opened_project_ids(registry)
-
-        # NOTE: Always skip orphan repmoval when `list_node_ids_in_project` raises an error.
-        # Why? If a service is running but the nodes form the correspondign project cannot be listed,
-        # the service will be considered as orphaned and closed.
-        potentially_running_service_ids: list[set[NodeID]] = []
-        async for project_nodes_future in limited_as_completed(
-            (
-                list_node_ids_in_project(app, project_id)
-                for project_id in known_opened_project_ids
-            ),
-            limit=_MAX_CONCURRENT_CALLS,
-        ):
-            try:
-                project_nodes = await project_nodes_future
-                potentially_running_service_ids.append(project_nodes)
-            except BaseException as e:  # pylint:disable=broad-exception-caught
-                _logger.warning(
-                    create_troubleshotting_log_kwargs(
-                        (
-                            "Skipping orpahn services removal, call to "
-                            "`list_node_ids_in_project` raised"
-                        ),
-                        error=e,
-                        error_context={
-                            "running_services": running_services,
-                            "running_services_by_id": running_services_by_id,
-                            "known_opened_project_ids": known_opened_project_ids,
-                        },
+    running_services = await dynamic_scheduler_service.list_dynamic_services(app)
+    if not running_services:
+        # nothing to do
+        return
+    _logger.debug(
+        "Actual running dynamic services: %s",
+        [(x.node_uuid, x.host) for x in running_services],
+    )
+    running_services_by_id: dict[NodeID, DynamicServiceGet] = {
+        service.node_uuid: service for service in running_services
+    }
+
+    known_opened_project_ids = await _list_opened_project_ids(registry)
+
+    # NOTE: Always skip orphan repmoval when `list_node_ids_in_project` raises an error.
+    # Why? If a service is running but the nodes form the correspondign project cannot be listed,
+    # the service will be considered as orphaned and closed.
+    potentially_running_service_ids: list[set[NodeID]] = []
+    async for project_nodes_future in limited_as_completed(
+        (
+            list_node_ids_in_project(app, project_id)
+            for project_id in known_opened_project_ids
+        ),
+        limit=_MAX_CONCURRENT_CALLS,
+    ):
+        try:
+            project_nodes = await project_nodes_future
+            potentially_running_service_ids.append(project_nodes)
+        except BaseException as e:  # pylint:disable=broad-exception-caught
+            _logger.warning(
+                create_troubleshotting_log_kwargs(
+                    (
+                        "Skipping orpahn services removal, call to "
+                        "`list_node_ids_in_project` raised"
                     ),
-                    exc_info=True,
-                )
-                return
-
-        potentially_running_service_ids_set: set[NodeID] = set().union(
-            *(node_id for node_id in potentially_running_service_ids)
-        )
-        _logger.debug(
-            "Allowed service UUIDs from known opened projects: %s",
-            potentially_running_service_ids_set,
-        )
-
-        # compute the difference to find the orphaned services
-        orphaned_running_service_ids = (
-            set(running_services_by_id) - potentially_running_service_ids_set
-        )
-        _logger.debug("Found orphaned services: %s", orphaned_running_service_ids)
-        # NOTE: no need to not reraise here, since we catch everything above
-        # and logged_gather first runs everything
-        await logged_gather(
-            *(
-                _remove_service(app, node_id, running_services_by_id[node_id])
-                for node_id in orphaned_running_service_ids
-            ),
-            log=_logger,
-            max_concurrency=_MAX_CONCURRENT_CALLS,
-        )
+                    error=e,
+                    error_context={
+                        "running_services": running_services,
+                        "running_services_by_id": running_services_by_id,
+                        "known_opened_project_ids": known_opened_project_ids,
+                    },
+                ),
+                exc_info=True,
+            )
+            continue
+
+    potentially_running_service_ids_set: set[NodeID] = set().union(
+        *(node_id for node_id in potentially_running_service_ids)
+    )
+    _logger.debug(
+        "Allowed service UUIDs from known opened projects: %s",
+        potentially_running_service_ids_set,
+    )
+
+    # compute the difference to find the orphaned services
+    orphaned_running_service_ids = (
+        set(running_services_by_id) - potentially_running_service_ids_set
+    )
+    _logger.debug("Found orphaned services: %s", orphaned_running_service_ids)
+    # NOTE: no need to not reraise here, since we catch everything above
+    # and logged_gather first runs everything
+    await limited_gather(
+        *(
+            _remove_service(app, node_id, running_services_by_id[node_id])
+            for node_id in orphaned_running_service_ids
+        ),
+        log=_logger,
+        limit=_MAX_CONCURRENT_CALLS,
+    )
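
Note on the hunk above: the early return inside the except block became continue, so a failing list_node_ids_in_project call for one project no longer ends the whole pass; the remaining lookups still contribute to the allowed set. The orphan detection itself is a plain set difference: node IDs of running services minus the union of node IDs referenced by known opened projects. Below is a small illustrative sketch of that computation with simplified names; NodeID is just a UUID here, not the models_library type.

import uuid

# Simplified stand-ins: in the real code NodeID comes from models_library
# and the dict values are DynamicServiceGet models.
NodeID = uuid.UUID


def find_orphans(
    running_services_by_id: dict[NodeID, str],
    potentially_running_service_ids: list[set[NodeID]],
) -> set[NodeID]:
    # Union of node IDs that belong to the known opened projects.
    allowed: set[NodeID] = set().union(*potentially_running_service_ids)
    # Anything running that is not referenced by an opened project is orphaned.
    return set(running_services_by_id) - allowed


if __name__ == "__main__":
    a, b, c = uuid.uuid4(), uuid.uuid4(), uuid.uuid4()
    running = {a: "service-a", b: "service-b", c: "service-c"}
    opened_projects_nodes = [{a}, {b}]
    # service-c is running but not in any opened project, so it is an orphan
    assert find_orphans(running, opened_projects_nodes) == {c}
    print("orphans:", find_orphans(running, opened_projects_nodes))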

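The same hunk also swaps logged_gather(..., max_concurrency=_MAX_CONCURRENT_CALLS) for limited_gather(..., limit=_MAX_CONCURRENT_CALLS) when stopping the orphaned services. Both are servicelib helpers; judging from the call sites in the diff, they await a batch of coroutines while capping how many run at once. Below is a rough, self-contained sketch of bounded-concurrency gathering with an asyncio.Semaphore; it only illustrates the idea and is not servicelib's implementation.

import asyncio
from collections.abc import Awaitable
from typing import TypeVar

T = TypeVar("T")


async def bounded_gather(*awaitables: Awaitable[T], limit: int) -> list[T]:
    # Run all awaitables concurrently, but never more than `limit` at a time.
    semaphore = asyncio.Semaphore(limit)

    async def _run(aw: Awaitable[T]) -> T:
        async with semaphore:
            return await aw

    return await asyncio.gather(*(_run(aw) for aw in awaitables))


async def _stop_service(name: str) -> str:
    await asyncio.sleep(0.1)  # pretend this is a slow stop call
    return f"stopped {name}"


if __name__ == "__main__":
    results = asyncio.run(
        bounded_gather(*(_stop_service(f"svc-{i}") for i in range(5)), limit=2)
    )
    print(results)

With limit=2, at most two of the stop calls are in flight at any moment.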