@@ -18,6 +18,7 @@
 from aws_library.ec2._errors import EC2TooManyInstancesError
 from fastapi import FastAPI
 from models_library.generated_models.docker_rest_api import Node, NodeState
+from models_library.rabbitmq_messages import ProgressType
 from servicelib.logging_utils import log_catch, log_context
 from servicelib.utils import limited_gather
 from servicelib.utils_formatting import timedelta_as_minute_second
@@ -51,7 +52,11 @@
     get_deactivated_buffer_ec2_tags,
     is_buffer_machine,
 )
-from ..utils.rabbitmq import post_autoscaling_status_message
+from ..utils.rabbitmq import (
+    post_autoscaling_status_message,
+    post_tasks_log_message,
+    post_tasks_progress_message,
+)
 from .auto_scaling_mode_base import BaseAutoscaling
 from .docker import get_docker_client
 from .ec2 import get_ec2_client
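Note (not part of the diff): judging from the call sites below, the two helpers newly imported from ..utils.rabbitmq take keyword-only arguments. The stubs here are a sketch inferred from this diff, not a copy of the actual module; the element type of tasks depends on the autoscaling mode (Docker service tasks or Dask tasks), and the default for level is an assumption.

# Sketch only: signatures inferred from the call sites in this diff.
import logging

from fastapi import FastAPI
from models_library.rabbitmq_messages import ProgressType


async def post_tasks_log_message(
    app: FastAPI, *, tasks: list, message: str, level: int = logging.INFO
) -> None:
    """Publish `message` over RabbitMQ for every task that triggered the scaling action."""


async def post_tasks_progress_message(
    app: FastAPI, *, tasks: list, progress: float, progress_type: ProgressType
) -> None:
    """Publish a progress update (0.0 to 1.0) of `progress_type` for every task."""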
@@ -354,7 +359,6 @@ def _as_selection(instance_type: EC2InstanceType) -> int:
 
 async def _activate_and_notify(
     app: FastAPI,
-    auto_scaling_mode: BaseAutoscaling,
     drained_node: AssociatedInstance,
 ) -> AssociatedInstance:
     app_settings = get_application_settings(app)
@@ -363,14 +367,17 @@ async def _activate_and_notify(
         utils_docker.set_node_osparc_ready(
             app_settings, docker_client, drained_node.node, ready=True
         ),
-        auto_scaling_mode.log_message_from_tasks(
+        post_tasks_log_message(
             app,
-            drained_node.assigned_tasks,
-            "cluster adjusted, service should start shortly...",
+            tasks=drained_node.assigned_tasks,
+            message="cluster adjusted, service should start shortly...",
             level=logging.INFO,
         ),
-        auto_scaling_mode.progress_message_from_tasks(
-            app, drained_node.assigned_tasks, progress=1.0
+        post_tasks_progress_message(
+            app,
+            tasks=drained_node.assigned_tasks,
+            progress=1.0,
+            progress_type=ProgressType.CLUSTER_UP_SCALING,
         ),
     )
     return dataclasses.replace(drained_node, node=updated_node)
@@ -379,7 +386,6 @@ async def _activate_and_notify(
 async def _activate_drained_nodes(
     app: FastAPI,
     cluster: Cluster,
-    auto_scaling_mode: BaseAutoscaling,
 ) -> Cluster:
     nodes_to_activate = [
         node
@@ -396,10 +402,7 @@ async def _activate_drained_nodes(
         f"activate {len(nodes_to_activate)} drained nodes {[n.ec2_instance.id for n in nodes_to_activate]}",
     ):
         activated_nodes = await asyncio.gather(
-            *(
-                _activate_and_notify(app, auto_scaling_mode, node)
-                for node in nodes_to_activate
-            )
+            *(_activate_and_notify(app, node) for node in nodes_to_activate)
         )
         new_active_node_ids = {node.ec2_instance.id for node in activated_nodes}
         remaining_drained_nodes = [
@@ -787,10 +790,10 @@ async def _launch_instances(
             app, needed_instances, new_instance_tags
         )
     except EC2TooManyInstancesError:
-        await auto_scaling_mode.log_message_from_tasks(
+        await post_tasks_log_message(
             app,
-            tasks,
-            "The maximum number of machines in the cluster was reached. Please wait for your running jobs "
+            tasks=tasks,
+            message="The maximum number of machines in the cluster was reached. Please wait for your running jobs "
             "to complete and try again later or contact osparc support if this issue does not resolve.",
             level=logging.ERROR,
         )
@@ -829,10 +832,10 @@ async def _launch_instances(
     new_pending_instances: list[EC2InstanceData] = []
     for r in results:
         if isinstance(r, EC2TooManyInstancesError):
-            await auto_scaling_mode.log_message_from_tasks(
+            await post_tasks_log_message(
                 app,
-                tasks,
-                "Exceptionally high load on computational cluster, please try again later.",
+                tasks=tasks,
+                message="Exceptionally high load on computational cluster, please try again later.",
                 level=logging.ERROR,
             )
         elif isinstance(r, BaseException):
@@ -847,14 +850,14 @@ async def _launch_instances(
         f"{sum(n for n in capped_needed_machines.values())} new machines launched"
         ", it might take up to 3 minutes to start, Please wait..."
     )
-    await auto_scaling_mode.log_message_from_tasks(
-        app, tasks, log_message, level=logging.INFO
+    await post_tasks_log_message(
+        app, tasks=tasks, message=log_message, level=logging.INFO
     )
     if last_issue:
-        await auto_scaling_mode.log_message_from_tasks(
+        await post_tasks_log_message(
             app,
-            tasks,
-            "Unexpected issues detected, probably due to high load, please contact support",
+            tasks=tasks,
+            message="Unexpected issues detected, probably due to high load, please contact support",
             level=logging.ERROR,
         )
 
@@ -1064,7 +1067,6 @@ async def _try_scale_down_cluster(app: FastAPI, cluster: Cluster) -> Cluster:
 async def _notify_based_on_machine_type(
     app: FastAPI,
     instances: list[AssociatedInstance] | list[NonAssociatedInstance],
-    auto_scaling_mode: BaseAutoscaling,
     *,
     message: str,
 ) -> None:
@@ -1088,24 +1090,22 @@ async def _notify_based_on_machine_type(
             f" est. remaining time: {timedelta_as_minute_second(estimated_time_to_completion)})...please wait..."
         )
         if tasks:
-            await auto_scaling_mode.log_message_from_tasks(
-                app, tasks, message=msg, level=logging.INFO
+            await post_tasks_log_message(
+                app, tasks=tasks, message=msg, level=logging.INFO
             )
-            await auto_scaling_mode.progress_message_from_tasks(
+            await post_tasks_progress_message(
                 app,
-                tasks,
+                tasks=tasks,
                 progress=time_since_launch.total_seconds()
                 / instance_max_time_to_start.total_seconds(),
+                progress_type=ProgressType.CLUSTER_UP_SCALING,
             )
 
 
-async def _notify_machine_creation_progress(
-    app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling
-) -> None:
+async def _notify_machine_creation_progress(app: FastAPI, cluster: Cluster) -> None:
     await _notify_based_on_machine_type(
         app,
         cluster.pending_ec2s,
-        auto_scaling_mode,
         message="waiting for machine to join cluster",
     )
 
@@ -1191,10 +1191,10 @@ async def _scale_up_cluster(
     if needed_ec2_instances := await _find_needed_instances(
        app, unassigned_tasks, allowed_instance_types, cluster, auto_scaling_mode
     ):
-        await auto_scaling_mode.log_message_from_tasks(
+        await post_tasks_log_message(
            app,
-            unassigned_tasks,
-            "service is pending due to missing resources, scaling up cluster now...",
+            tasks=unassigned_tasks,
+            message="service is pending due to missing resources, scaling up cluster now...",
            level=logging.INFO,
        )
        new_pending_instances = await _launch_instances(
@@ -1228,7 +1228,7 @@ async def _autoscale_cluster(
     )
 
     # 2. activate available drained nodes to cover some of the tasks
-    cluster = await _activate_drained_nodes(app, cluster, auto_scaling_mode)
+    cluster = await _activate_drained_nodes(app, cluster)
 
     # 3. start warm buffer instances to cover the remaining tasks
     cluster = await _start_warm_buffer_instances(app, cluster, auto_scaling_mode)
@@ -1301,5 +1301,5 @@ async def auto_scale_cluster(
     )
 
     # notify
-    await _notify_machine_creation_progress(app, cluster, auto_scaling_mode)
+    await _notify_machine_creation_progress(app, cluster)
     await _notify_autoscaling_status(app, cluster, auto_scaling_mode)
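Closing note, also not part of the diff: a plausible shape for these helpers is a fan-out of one RabbitMQ message per task. The sketch below assumes a per-task publisher (the placeholder _post_single_task_log is a hypothetical name); the real body in ..utils.rabbitmq may differ.

# Illustration only: an assumed fan-out body for post_tasks_log_message.
import asyncio
import logging

from fastapi import FastAPI

_logger = logging.getLogger(__name__)


async def _post_single_task_log(app: FastAPI, task, message: str, level: int) -> None:
    # placeholder for the real per-task RabbitMQ publisher (hypothetical)
    _logger.log(level, "would publish to task %s: %s", task, message)


async def post_tasks_log_message(
    app: FastAPI, *, tasks: list, message: str, level: int = logging.INFO
) -> None:
    # broadcast the same log line to all tasks concurrently
    await asyncio.gather(
        *(_post_single_task_log(app, task, message, level) for task in tasks)
    )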