27
27
import org .elasticsearch .cluster .service .ClusterService ;
28
28
import org .elasticsearch .common .inject .Inject ;
29
29
import org .elasticsearch .common .settings .Settings ;
30
+ import org .elasticsearch .common .unit .ByteSizeValue ;
30
31
import org .elasticsearch .common .util .set .Sets ;
31
32
import org .elasticsearch .core .TimeValue ;
32
33
import org .elasticsearch .index .query .QueryBuilders ;
62
63
import org .elasticsearch .xpack .ml .inference .allocation .TrainedModelAllocationService ;
63
64
import org .elasticsearch .xpack .ml .inference .persistence .ChunkedTrainedModelRestorer ;
64
65
import org .elasticsearch .xpack .ml .inference .persistence .TrainedModelDefinitionDoc ;
66
+ import org .elasticsearch .xpack .ml .job .NodeLoadDetector ;
65
67
import org .elasticsearch .xpack .ml .process .MlMemoryTracker ;
66
68
67
69
import java .util .Collections ;
70
72
import java .util .List ;
71
73
import java .util .Map ;
72
74
import java .util .Objects ;
75
+ import java .util .OptionalLong ;
73
76
import java .util .Set ;
74
77
import java .util .function .Predicate ;
75
78
import java .util .stream .Collectors ;
@@ -89,6 +92,7 @@ public class TransportStartTrainedModelDeploymentAction extends TransportMasterN
89
92
private final NamedXContentRegistry xContentRegistry ;
90
93
private final MlMemoryTracker memoryTracker ;
91
94
protected volatile int maxLazyMLNodes ;
95
+ protected volatile long maxMLNodeSize ;
92
96
93
97
@ Inject
94
98
public TransportStartTrainedModelDeploymentAction (
@@ -121,13 +125,19 @@ public TransportStartTrainedModelDeploymentAction(
121
125
this .memoryTracker = Objects .requireNonNull (memoryTracker );
122
126
this .trainedModelAllocationService = Objects .requireNonNull (trainedModelAllocationService );
123
127
this .maxLazyMLNodes = MachineLearning .MAX_LAZY_ML_NODES .get (settings );
128
+ this .maxMLNodeSize = MachineLearning .MAX_ML_NODE_SIZE .get (settings ).getBytes ();
124
129
clusterService .getClusterSettings ().addSettingsUpdateConsumer (MachineLearning .MAX_LAZY_ML_NODES , this ::setMaxLazyMLNodes );
130
+ clusterService .getClusterSettings ().addSettingsUpdateConsumer (MachineLearning .MAX_ML_NODE_SIZE , this ::setMaxMLNodeSize );
125
131
}
126
132
127
133
/**
 * Settings-update callback for {@code MachineLearning.MAX_LAZY_ML_NODES}; it is registered with the
 * cluster settings in this class's constructor. Overwrites the volatile {@code maxLazyMLNodes} field
 * with the new value.
 *
 * @param value the updated setting value
 */
private void setMaxLazyMLNodes (int value ) {
128
134
this .maxLazyMLNodes = value ;
129
135
}
130
136
137
/**
 * Settings-update callback for {@code MachineLearning.MAX_ML_NODE_SIZE}; it is registered with the
 * cluster settings in this class's constructor. Stores the new limit as a raw byte count in the
 * volatile {@code maxMLNodeSize} field, mirroring how the constructor initialises that field.
 *
 * @param value the updated setting value; converted with {@code getBytes()} before being stored
 */
+ private void setMaxMLNodeSize (ByteSizeValue value ) {
138
+ this .maxMLNodeSize = value .getBytes ();
139
+ }
140
+
131
141
@ Override
132
142
protected void masterOperation (
133
143
Task task ,
@@ -241,7 +251,7 @@ private void waitForDeploymentState(
241
251
AllocationStatus .State state ,
242
252
ActionListener <CreateTrainedModelAllocationAction .Response > listener
243
253
) {
244
- DeploymentStartedPredicate predicate = new DeploymentStartedPredicate (modelId , state , maxLazyMLNodes );
254
+ DeploymentStartedPredicate predicate = new DeploymentStartedPredicate (modelId , state , maxLazyMLNodes , maxMLNodeSize );
245
255
trainedModelAllocationService .waitForAllocationCondition (
246
256
modelId ,
247
257
predicate ,
@@ -402,11 +412,13 @@ private static class DeploymentStartedPredicate implements Predicate<ClusterStat
402
412
private final String modelId ;
403
413
private final AllocationStatus .State waitForState ;
404
414
private final int maxLazyMLNodes ;
415
+ private final long maxMLNodeSize ;
405
416
406
/**
 * Creates the predicate that is handed to {@code waitForAllocationCondition} when waiting for a
 * deployment to reach a state.
 *
 * The diff replaces the three-argument signature with a four-argument one that also carries
 * {@code maxMLNodeSize}.
 *
 * @param modelId        the model id; rejected via {@code ExceptionsHelper.requireNonNull} when null
 * @param waitForState   the allocation state to wait for; stored as given
 * @param maxLazyMLNodes lazy ML node limit; {@code test} compares it with the number of candidate nodes
 * @param maxMLNodeSize  maximum ML node size in bytes (the caller passes a value obtained from
 *                       {@code getBytes()}); {@code test} compares it with the smallest candidate node size
 */
- DeploymentStartedPredicate (String modelId , AllocationStatus .State waitForState , int maxLazyMLNodes ) {
417
+ DeploymentStartedPredicate (String modelId , AllocationStatus .State waitForState , int maxLazyMLNodes , long maxMLNodeSize ) {
407
418
this .modelId = ExceptionsHelper .requireNonNull (modelId , "model_id" );
408
419
this .waitForState = waitForState ;
409
420
this .maxLazyMLNodes = maxLazyMLNodes ;
421
+ this .maxMLNodeSize = maxMLNodeSize ;
410
422
}
411
423
412
424
@ Override
@@ -445,9 +457,14 @@ public boolean test(ClusterState clusterState) {
445
457
.filter (d -> nodesShuttingDown .contains (d .getId ()) == false )
446
458
.filter (TaskParams ::mayAllocateToNode )
447
459
.collect (Collectors .toList ());
460
+ OptionalLong smallestMLNode = nodes .stream ().map (NodeLoadDetector ::getNodeSize ).flatMapToLong (OptionalLong ::stream ).min ();
448
461
449
462
// No nodes allocated at all!
450
- if (nodesAndState .isEmpty () && maxLazyMLNodes <= nodes .size ()) {
463
+ if (nodesAndState .isEmpty ()
464
+ // We cannot scale horizontally
465
+ && maxLazyMLNodes <= nodes .size ()
466
+ // We cannot scale vertically
467
+ && (smallestMLNode .isEmpty () || smallestMLNode .getAsLong () >= maxMLNodeSize )) {
451
468
String msg = "Could not start deployment because no suitable nodes were found, allocation explanation ["
452
469
+ trainedModelAllocation .getReason ()
453
470
+ "]" ;
0 commit comments