@@ -55,6 +55,9 @@ const chaosMonkey = new ChaosMonkey(
55
55
! ! process . env . CHAOS_MONKEY_DISABLE_DELAYS
56
56
) ;
57
57
58
+ class CheckpointReadinessTimeoutError extends Error { }
59
+ class CheckpointCancelError extends Error { }
60
+
58
61
class TaskCoordinator {
59
62
#httpServer: ReturnType < typeof createServer > ;
60
63
#checkpointer = new Checkpointer ( {
@@ -398,9 +401,14 @@ class TaskCoordinator {
398
401
399
402
let timeout : NodeJS . Timeout | undefined = undefined ;
400
403
404
+ const CHECKPOINTABLE_TIMEOUT_SECONDS = 20 ;
405
+
401
406
const isCheckpointable = new Promise ( ( resolve , reject ) => {
402
407
// We set a reasonable timeout to prevent waiting forever
403
- timeout = setTimeout ( ( ) => reject ( "timeout" ) , 20_000 ) ;
408
+ timeout = setTimeout (
409
+ ( ) => reject ( new CheckpointReadinessTimeoutError ( ) ) ,
410
+ CHECKPOINTABLE_TIMEOUT_SECONDS * 1000
411
+ ) ;
404
412
405
413
this . #checkpointableTasks. set ( socket . data . runId , { resolve, reject } ) ;
406
414
} ) ;
@@ -415,10 +423,24 @@ class TaskCoordinator {
415
423
} catch ( error ) {
416
424
logger . error ( "Error while waiting for checkpointable state" , { error } ) ;
417
425
418
- await crashRun ( {
419
- name : "ReadyForCheckpointError" ,
420
- message : `Failed to become checkpointable for ${ reason } ` ,
421
- } ) ;
426
+ if ( error instanceof CheckpointReadinessTimeoutError ) {
427
+ await crashRun ( {
428
+ name : error . name ,
429
+ message : `Failed to become checkpointable in ${ CHECKPOINTABLE_TIMEOUT_SECONDS } s for ${ reason } ` ,
430
+ } ) ;
431
+
432
+ return {
433
+ success : false ,
434
+ reason : "timeout" ,
435
+ } ;
436
+ }
437
+
438
+ if ( error instanceof CheckpointCancelError ) {
439
+ return {
440
+ success : false ,
441
+ reason : "canceled" ,
442
+ } ;
443
+ }
422
444
423
445
return {
424
446
success : false ,
@@ -1065,7 +1087,7 @@ class TaskCoordinator {
1065
1087
1066
1088
if ( checkpointWait ) {
1067
1089
// Stop waiting for task to reach checkpointable state
1068
- checkpointWait . reject ( "Checkpoint cancelled" ) ;
1090
+ checkpointWait . reject ( new CheckpointCancelError ( ) ) ;
1069
1091
}
1070
1092
1071
1093
// Cancel checkpointing procedure
0 commit comments