Skip to content

Commit d673fdc

Browse files
authored
Add a configurable escalation behaviour (#162)
* Add a configurable escalation behaviour # Motivation The service group is often running multiple services and orchestrates shutdown for them. We have seen that sometimes some services never shutdown nor do they respond to task cancellation properly. This can become quite problematic when the whole application is waiting for a service to cancel and otherwise appears to be healthy but in reality can't serve any traffic. # Modification This PR adds a new configuration to escalate both graceful shutdown and cancellation. The escalation order is graceful shutdown -> task cancellation -> `fatalError`. The `fatalError` acts a last resort to make sure applications are never stuck. * George review * Fix some smaller issues
1 parent b71a961 commit d673fdc

File tree

3 files changed

+273
-17
lines changed

3 files changed

+273
-17
lines changed

Sources/ServiceLifecycle/ServiceGroup.swift

Lines changed: 112 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ public actor ServiceGroup: Sendable {
3333
private let logger: Logger
3434
/// The logging configuration.
3535
private let loggingConfiguration: ServiceGroupConfiguration.LoggingConfiguration
36+
/// The maximum amount of time that graceful shutdown is allowed to take.
37+
private let maximumGracefulShutdownDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?
38+
/// The maximum amount of time that task cancellation is allowed to take.
39+
private let maximumCancellationDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?
3640
/// The signals that lead to graceful shutdown.
3741
private let gracefulShutdownSignals: [UnixSignal]
3842
/// The signals that lead to cancellation.
@@ -57,6 +61,8 @@ public actor ServiceGroup: Sendable {
5761
self.cancellationSignals = configuration.cancellationSignals
5862
self.logger = configuration.logger
5963
self.loggingConfiguration = configuration.logging
64+
self.maximumGracefulShutdownDuration = configuration._maximumGracefulShutdownDuration
65+
self.maximumCancellationDuration = configuration._maximumCancellationDuration
6066
}
6167

6268
/// Initializes a new ``ServiceGroup``.
@@ -94,6 +100,8 @@ public actor ServiceGroup: Sendable {
94100
self.cancellationSignals = configuration.cancellationSignals
95101
self.logger = logger
96102
self.loggingConfiguration = configuration.logging
103+
self.maximumGracefulShutdownDuration = configuration._maximumGracefulShutdownDuration
104+
self.maximumCancellationDuration = configuration._maximumCancellationDuration
97105
}
98106

99107
/// Runs all the services by spinning up a child task per service.
@@ -176,6 +184,8 @@ public actor ServiceGroup: Sendable {
176184
case signalSequenceFinished
177185
case gracefulShutdownCaught
178186
case gracefulShutdownFinished
187+
case gracefulShutdownTimedOut
188+
case cancellationCaught
179189
}
180190

181191
private func _run(
@@ -191,6 +201,10 @@ public actor ServiceGroup: Sendable {
191201
]
192202
)
193203

204+
// A task that is spawned when we got cancelled or
205+
// we cancel the task group to keep track of a timeout.
206+
var cancellationTimeoutTask: Task<Void, Never>?
207+
194208
// Using a result here since we want a task group that has non-throwing child tasks
195209
// but the body itself is throwing
196210
let result = try await withThrowingTaskGroup(of: ChildTaskResult.self, returning: Result<Void, Error>.self) { group in
@@ -267,6 +281,13 @@ public actor ServiceGroup: Sendable {
267281
}
268282
}
269283

284+
group.addTask {
285+
// This child task is waiting forever until the group gets cancelled.
286+
let (stream, _) = AsyncStream.makeStream(of: Void.self)
287+
await stream.first { _ in true }
288+
return .cancellationCaught
289+
}
290+
270291
// We are storing the services in an optional array now. When a slot in the array is
271292
// empty it indicates that the service has been shutdown.
272293
var services = services.map { Optional($0) }
@@ -293,7 +314,7 @@ public actor ServiceGroup: Sendable {
293314
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
294315
]
295316
)
296-
group.cancelAll()
317+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
297318
return .failure(ServiceGroupError.serviceFinishedUnexpectedly())
298319

299320
case .gracefullyShutdownGroup:
@@ -307,6 +328,7 @@ public actor ServiceGroup: Sendable {
307328
do {
308329
try await self.shutdownGracefully(
309330
services: services,
331+
cancellationTimeoutTask: &cancellationTimeoutTask,
310332
group: &group,
311333
gracefulShutdownManagers: gracefulShutdownManagers
312334
)
@@ -327,7 +349,7 @@ public actor ServiceGroup: Sendable {
327349
self.logger.debug(
328350
"All services finished."
329351
)
330-
group.cancelAll()
352+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
331353
return .success(())
332354
}
333355
}
@@ -342,7 +364,7 @@ public actor ServiceGroup: Sendable {
342364
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
343365
]
344366
)
345-
group.cancelAll()
367+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
346368
return .failure(serviceError)
347369

348370
case .gracefullyShutdownGroup:
@@ -358,6 +380,7 @@ public actor ServiceGroup: Sendable {
358380
do {
359381
try await self.shutdownGracefully(
360382
services: services,
383+
cancellationTimeoutTask: &cancellationTimeoutTask,
361384
group: &group,
362385
gracefulShutdownManagers: gracefulShutdownManagers
363386
)
@@ -381,7 +404,7 @@ public actor ServiceGroup: Sendable {
381404
"All services finished."
382405
)
383406

384-
group.cancelAll()
407+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
385408
return .success(())
386409
}
387410
}
@@ -398,6 +421,7 @@ public actor ServiceGroup: Sendable {
398421
do {
399422
try await self.shutdownGracefully(
400423
services: services,
424+
cancellationTimeoutTask: &cancellationTimeoutTask,
401425
group: &group,
402426
gracefulShutdownManagers: gracefulShutdownManagers
403427
)
@@ -413,7 +437,7 @@ public actor ServiceGroup: Sendable {
413437
]
414438
)
415439

416-
group.cancelAll()
440+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
417441
}
418442

419443
case .gracefulShutdownCaught:
@@ -423,19 +447,29 @@ public actor ServiceGroup: Sendable {
423447
do {
424448
try await self.shutdownGracefully(
425449
services: services,
450+
cancellationTimeoutTask: &cancellationTimeoutTask,
426451
group: &group,
427452
gracefulShutdownManagers: gracefulShutdownManagers
428453
)
429454
} catch {
430455
return .failure(error)
431456
}
432457

458+
case .cancellationCaught:
459+
// We caught cancellation in our child task so we have to spawn
460+
// our cancellation timeout task if needed
461+
self.logger.debug("Caught cancellation.")
462+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
463+
433464
case .signalSequenceFinished, .gracefulShutdownFinished:
434465
// This can happen when we are either cancelling everything or
435466
// when the user did not specify any shutdown signals. We just have to tolerate
436467
// this.
437468
continue
438469

470+
case .gracefulShutdownTimedOut:
471+
fatalError("Received gracefulShutdownTimedOut but never triggered a graceful shutdown")
472+
439473
case nil:
440474
fatalError("Invalid result from group.next(). We checked if the group is empty before and still got nil")
441475
}
@@ -447,18 +481,30 @@ public actor ServiceGroup: Sendable {
447481
self.logger.debug(
448482
"Service lifecycle ended"
449483
)
484+
cancellationTimeoutTask?.cancel()
450485
try result.get()
451486
}
452487

453488
private func shutdownGracefully(
454489
services: [ServiceGroupConfiguration.ServiceConfiguration?],
490+
cancellationTimeoutTask: inout Task<Void, Never>?,
455491
group: inout ThrowingTaskGroup<ChildTaskResult, Error>,
456492
gracefulShutdownManagers: [GracefulShutdownManager]
457493
) async throws {
458494
guard case .running = self.state else {
459495
fatalError("Unexpected state")
460496
}
461497

498+
if #available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *), let maximumGracefulShutdownDuration = self.maximumGracefulShutdownDuration {
499+
group.addTask {
500+
try? await Task.sleep(for: Duration(
501+
secondsComponent: maximumGracefulShutdownDuration.secondsComponent,
502+
attosecondsComponent: maximumGracefulShutdownDuration.attosecondsComponent
503+
))
504+
return .gracefulShutdownTimedOut
505+
}
506+
}
507+
462508
// We are storing the first error of a service that threw here.
463509
var error: Error?
464510

@@ -509,7 +555,7 @@ public actor ServiceGroup: Sendable {
509555
]
510556
)
511557

512-
group.cancelAll()
558+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
513559
throw ServiceGroupError.serviceFinishedUnexpectedly()
514560
}
515561

@@ -561,9 +607,26 @@ public actor ServiceGroup: Sendable {
561607
]
562608
)
563609

564-
group.cancelAll()
610+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
565611
}
566612

613+
case .gracefulShutdownTimedOut:
614+
// Gracefully shutting down took longer than the user configured
615+
// so we have to escalate it now.
616+
self.logger.debug(
617+
"Graceful shutdown took longer than allowed by the configuration. Cancelling the group now.",
618+
metadata: [
619+
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
620+
]
621+
)
622+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
623+
624+
case .cancellationCaught:
625+
// We caught cancellation in our child task so we have to spawn
626+
// our cancellation timeout task if needed
627+
self.logger.debug("Caught cancellation.")
628+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
629+
567630
case .signalSequenceFinished, .gracefulShutdownCaught, .gracefulShutdownFinished:
568631
// We just have to tolerate this since signals and parent graceful shutdowns downs can race.
569632
continue
@@ -575,7 +638,9 @@ public actor ServiceGroup: Sendable {
575638

576639
// If we hit this then all services are shutdown. The only thing remaining
577640
// are the tasks that listen to the various graceful shutdown signals. We
578-
// just have to cancel those
641+
// just have to cancel those.
642+
// In this case we don't have to spawn our cancellation timeout task since
643+
// we are sure all other child tasks are handling cancellation appropriately.
579644
group.cancelAll()
580645

581646
// If we saw an error during graceful shutdown from a service that triggers graceful
@@ -584,6 +649,45 @@ public actor ServiceGroup: Sendable {
584649
throw error
585650
}
586651
}
652+
653+
private func cancelGroupAndSpawnTimeoutIfNeeded(
654+
group: inout ThrowingTaskGroup<ChildTaskResult, Error>,
655+
cancellationTimeoutTask: inout Task<Void, Never>?
656+
) {
657+
guard cancellationTimeoutTask == nil else {
658+
// We already have a cancellation timeout task running.
659+
self.logger.debug(
660+
"Task cancellation timeout task already running."
661+
)
662+
return
663+
}
664+
group.cancelAll()
665+
666+
if #available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *), let maximumCancellationDuration = self.maximumCancellationDuration {
667+
// We have to spawn an unstructured task here because the call to our `run`
668+
// method might have already been cancelled and we need to protect the sleep
669+
// from being cancelled.
670+
cancellationTimeoutTask = Task {
671+
do {
672+
self.logger.debug(
673+
"Task cancellation timeout task started."
674+
)
675+
try await Task.sleep(for: Duration(
676+
secondsComponent: maximumCancellationDuration.secondsComponent,
677+
attosecondsComponent: maximumCancellationDuration.attosecondsComponent
678+
))
679+
self.logger.debug(
680+
"Cancellation took longer than allowed by the configuration."
681+
)
682+
fatalError("Cancellation took longer than allowed by the configuration.")
683+
} catch {
684+
// We got cancelled so our services must have finished up.
685+
}
686+
}
687+
} else {
688+
cancellationTimeoutTask = nil
689+
}
690+
}
587691
}
588692

589693
// This should be removed once we support Swift 5.9+

Sources/ServiceLifecycle/ServiceGroupConfiguration.swift

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,61 @@ public struct ServiceGroupConfiguration: Sendable {
111111
/// The group's logging configuration.
112112
public var logging = LoggingConfiguration()
113113

114+
/// The maximum amount of time that graceful shutdown is allowed to take.
115+
///
116+
/// After this time has elapsed graceful shutdown will be escalated to task cancellation.
117+
@available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
118+
public var maximumGracefulShutdownDuration: Duration? {
119+
get {
120+
if let maximumGracefulShutdownDuration = self._maximumGracefulShutdownDuration {
121+
return .init(
122+
secondsComponent: maximumGracefulShutdownDuration.secondsComponent,
123+
attosecondsComponent: maximumGracefulShutdownDuration.attosecondsComponent
124+
)
125+
} else {
126+
return nil
127+
}
128+
}
129+
set {
130+
if let newValue = newValue {
131+
self._maximumGracefulShutdownDuration = (newValue.components.seconds, newValue.components.attoseconds)
132+
} else {
133+
self._maximumGracefulShutdownDuration = nil
134+
}
135+
}
136+
}
137+
138+
internal var _maximumGracefulShutdownDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?
139+
140+
/// The maximum amount of time that task cancellation is allowed to take.
141+
///
142+
/// After this time has elapsed task cancellation will be escalated to a `fatalError`.
143+
///
144+
/// - Important: This setting is useful to guarantee that your application will exit at some point and
145+
/// should be used to identify APIs that are not properly implementing task cancellation.
146+
@available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
147+
public var maximumCancellationDuration: Duration? {
148+
get {
149+
if let maximumCancellationDuration = self._maximumCancellationDuration {
150+
return .init(
151+
secondsComponent: maximumCancellationDuration.secondsComponent,
152+
attosecondsComponent: maximumCancellationDuration.attosecondsComponent
153+
)
154+
} else {
155+
return nil
156+
}
157+
}
158+
set {
159+
if let newValue = newValue {
160+
self._maximumCancellationDuration = (newValue.components.seconds, newValue.components.attoseconds)
161+
} else {
162+
self._maximumCancellationDuration = nil
163+
}
164+
}
165+
}
166+
167+
internal var _maximumCancellationDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?
168+
114169
/// Initializes a new ``ServiceGroupConfiguration``.
115170
///
116171
/// - Parameters:

0 commit comments

Comments
 (0)