@@ -278,7 +278,7 @@ public void testSelectLeastLoadedMlNode_maxConcurrentOpeningJobs() {
278
278
nodeAttr , Collections .emptySet (), Version .CURRENT ))
279
279
.build ();
280
280
281
- PersistentTasksCustomMetaData .Builder tasksBuilder = PersistentTasksCustomMetaData .builder ();
281
+ PersistentTasksCustomMetaData .Builder tasksBuilder = PersistentTasksCustomMetaData .builder ();
282
282
addJobTask ("job_id1" , "_node_id1" , null , tasksBuilder );
283
283
addJobTask ("job_id2" , "_node_id1" , null , tasksBuilder );
284
284
addJobTask ("job_id3" , "_node_id2" , null , tasksBuilder );
@@ -333,6 +333,55 @@ public void testSelectLeastLoadedMlNode_maxConcurrentOpeningJobs() {
333
333
assertTrue (result .getExplanation ().contains ("because node exceeds [2] the maximum number of jobs [2] in opening state" ));
334
334
}
335
335
336
+ public void testSelectLeastLoadedMlNode_concurrentOpeningJobsAndStaleFailedJob () {
337
+ Map <String , String > nodeAttr = new HashMap <>();
338
+ nodeAttr .put (MachineLearning .ML_ENABLED_NODE_ATTR , "true" );
339
+ DiscoveryNodes nodes = DiscoveryNodes .builder ()
340
+ .add (new DiscoveryNode ("_node_name1" , "_node_id1" , new TransportAddress (InetAddress .getLoopbackAddress (), 9300 ),
341
+ nodeAttr , Collections .emptySet (), Version .CURRENT ))
342
+ .add (new DiscoveryNode ("_node_name2" , "_node_id2" , new TransportAddress (InetAddress .getLoopbackAddress (), 9301 ),
343
+ nodeAttr , Collections .emptySet (), Version .CURRENT ))
344
+ .add (new DiscoveryNode ("_node_name3" , "_node_id3" , new TransportAddress (InetAddress .getLoopbackAddress (), 9302 ),
345
+ nodeAttr , Collections .emptySet (), Version .CURRENT ))
346
+ .build ();
347
+
348
+ PersistentTasksCustomMetaData .Builder tasksBuilder = PersistentTasksCustomMetaData .builder ();
349
+ addJobTask ("job_id1" , "_node_id1" , JobState .fromString ("failed" ), tasksBuilder );
350
+ // This will make the allocation stale for job_id1
351
+ tasksBuilder .reassignTask (MlMetadata .jobTaskId ("job_id1" ), new Assignment ("_node_id1" , "test assignment" ));
352
+ addJobTask ("job_id2" , "_node_id1" , null , tasksBuilder );
353
+ addJobTask ("job_id3" , "_node_id2" , null , tasksBuilder );
354
+ addJobTask ("job_id4" , "_node_id2" , null , tasksBuilder );
355
+ addJobTask ("job_id5" , "_node_id3" , null , tasksBuilder );
356
+ addJobTask ("job_id6" , "_node_id3" , null , tasksBuilder );
357
+ PersistentTasksCustomMetaData tasks = tasksBuilder .build ();
358
+
359
+ ClusterState .Builder csBuilder = ClusterState .builder (new ClusterName ("_name" ));
360
+ csBuilder .nodes (nodes );
361
+ MetaData .Builder metaData = MetaData .builder ();
362
+ RoutingTable .Builder routingTable = RoutingTable .builder ();
363
+ addJobAndIndices (metaData , routingTable , "job_id1" , "job_id2" , "job_id3" , "job_id4" , "job_id5" , "job_id6" , "job_id7" , "job_id8" );
364
+ csBuilder .routingTable (routingTable .build ());
365
+ metaData .putCustom (PersistentTasksCustomMetaData .TYPE , tasks );
366
+ csBuilder .metaData (metaData );
367
+
368
+ ClusterState cs = csBuilder .build ();
369
+ // Allocation won't be possible if the stale failed job is treated as opening
370
+ Assignment result = TransportOpenJobAction .selectLeastLoadedMlNode ("job_id7" , cs , 2 , 10 , 30 , logger );
371
+ assertEquals ("_node_id1" , result .getExecutorNode ());
372
+
373
+ tasksBuilder = PersistentTasksCustomMetaData .builder (tasks );
374
+ addJobTask ("job_id7" , "_node_id1" , null , tasksBuilder );
375
+ tasks = tasksBuilder .build ();
376
+
377
+ csBuilder = ClusterState .builder (cs );
378
+ csBuilder .metaData (MetaData .builder (cs .metaData ()).putCustom (PersistentTasksCustomMetaData .TYPE , tasks ));
379
+ cs = csBuilder .build ();
380
+ result = TransportOpenJobAction .selectLeastLoadedMlNode ("job_id8" , cs , 2 , 10 , 30 , logger );
381
+ assertNull ("no node selected, because OPENING state" , result .getExecutorNode ());
382
+ assertTrue (result .getExplanation ().contains ("because node exceeds [2] the maximum number of jobs [2] in opening state" ));
383
+ }
384
+
336
385
public void testSelectLeastLoadedMlNode_noCompatibleJobTypeNodes () {
337
386
Map <String , String > nodeAttr = new HashMap <>();
338
387
nodeAttr .put (MachineLearning .ML_ENABLED_NODE_ATTR , "true" );
0 commit comments