Skip to content

Commit dd104ab

Browse files
10000-kicsviri
authored and committed
feat: support for graceful shutdown based on configuration (#2479)
--------- Signed-off-by: 10000-ki <[email protected]>
1 parent 657301f commit dd104ab

File tree

7 files changed

+72
-42
lines changed

7 files changed

+72
-42
lines changed

Diff for: docs/content/en/docs/patterns-and-best-practices/_index.md

+12
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,15 @@ might be a permission issue for some resources in another namespace.
120120
The `stopOnInformerErrorDuringStartup` has implication on [cache sync timeout](https://github.com/java-operator-sdk/java-operator-sdk/blob/114c4312c32b34688811df8dd7cea275878c9e73/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/config/ConfigurationService.java#L177-L179)
121121
behavior. If `true`, the operator will stop on cache sync timeout. If `false`, after the timeout the controller will start
122122
reconciling resources even if one or more event source caches did not sync yet.
123+
124+
## Graceful Shutdown
125+
126+
You can give the reconciler enough time to finish processing in-flight events before the operator shuts down.
127+
The configuration is simple. You just need to set an appropriate duration value for `reconciliationTerminationTimeout` using `ConfigurationServiceOverrider`.
128+
129+
```java
130+
final var overridden = new ConfigurationServiceOverrider(config)
131+
.withReconciliationTerminationTimeout(Duration.ofSeconds(5));
132+
133+
final var operator = new Operator(overridden);
134+
```

Diff for: operator-framework-core/src/main/java/io/javaoperatorsdk/operator/Operator.java

+8-10
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ private static ConfigurationService initConfigurationService(KubernetesClient cl
100100
@SuppressWarnings("unused")
101101
public void installShutdownHook(Duration gracefulShutdownTimeout) {
102102
if (!leaderElectionManager.isLeaderElectionEnabled()) {
103-
Runtime.getRuntime().addShutdownHook(new Thread(() -> stop(gracefulShutdownTimeout)));
103+
Runtime.getRuntime().addShutdownHook(new Thread(this::stop));
104104
} else {
105105
log.warn("Leader election is on, shutdown hook will not be installed.");
106106
}
@@ -145,15 +145,18 @@ public synchronized void start() {
145145
}
146146
}
147147

148-
public void stop(Duration gracefulShutdownTimeout) throws OperatorException {
148+
@Override
149+
public void stop() throws OperatorException {
150+
Duration reconciliationTerminationTimeout =
151+
configurationService.reconciliationTerminationTimeout();
149152
if (!started) {
150153
return;
151154
}
152-
log.info(
153-
"Operator SDK {} is shutting down...", configurationService.getVersion().getSdkVersion());
155+
log.info("Operator SDK {} is shutting down...",
156+
configurationService.getVersion().getSdkVersion());
154157
controllerManager.stop();
155158

156-
configurationService.getExecutorServiceManager().stop(gracefulShutdownTimeout);
159+
configurationService.getExecutorServiceManager().stop(reconciliationTerminationTimeout);
157160
leaderElectionManager.stop();
158161
if (configurationService.closeClientOnStop()) {
159162
getKubernetesClient().close();
@@ -162,11 +165,6 @@ public void stop(Duration gracefulShutdownTimeout) throws OperatorException {
162165
started = false;
163166
}
164167

165-
@Override
166-
public void stop() throws OperatorException {
167-
stop(Duration.ZERO);
168-
}
169-
170168
/**
171169
* Add a registration request for the specified reconciler with this operator. The effective
172170
* registration of the reconciler is delayed till the operator is started.

Diff for: operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/config/ConfigurationService.java

+27-20
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,8 @@ static ConfigurationService newOverriddenConfigurationService(
101101
*
102102
* @param reconciler the reconciler we want the configuration of
103103
* @param <R> the {@code CustomResource} type associated with the specified reconciler
104-
* @return the {@link ControllerConfiguration} associated with the specified reconciler or {@code
105-
* null} if no configuration exists for the reconciler
104+
* @return the {@link ControllerConfiguration} associated with the specified reconciler or
105+
* {@code null} if no configuration exists for the reconciler
106106
*/
107107
<R extends HasMetadata> ControllerConfiguration<R> getConfigurationFor(Reconciler<R> reconciler);
108108

@@ -211,7 +211,7 @@ default int concurrentWorkflowExecutorThreads() {
211211

212212
/**
213213
* Override to provide a custom {@link Metrics} implementation
214-
*
214+
*
215215
* @return the {@link Metrics} implementation
216216
*/
217217
default Metrics getMetrics() {
@@ -221,7 +221,7 @@ default Metrics getMetrics() {
221221
/**
222222
* Override to provide a custom {@link ExecutorService} implementation to change how threads
223223
* handle concurrent reconciliations
224-
*
224+
*
225225
* @return the {@link ExecutorService} implementation to use for concurrent reconciliation
226226
* processing
227227
*/
@@ -232,7 +232,7 @@ default ExecutorService getExecutorService() {
232232
/**
233233
* Override to provide a custom {@link ExecutorService} implementation to change how dependent
234234
* workflows are processed in parallel
235-
*
235+
*
236236
* @return the {@link ExecutorService} implementation to use for dependent workflow processing
237237
*/
238238
default ExecutorService getWorkflowExecutorService() {
@@ -242,7 +242,7 @@ default ExecutorService getWorkflowExecutorService() {
242242
/**
243243
* Determines whether the associated Kubernetes client should be closed when the associated
244244
* {@link io.javaoperatorsdk.operator.Operator} is stopped.
245-
*
245+
*
246246
* @return {@code true} if the Kubernetes client should be closed on stop, {@code false} otherwise
247247
*/
248248
default boolean closeClientOnStop() {
@@ -252,7 +252,7 @@ default boolean closeClientOnStop() {
252252
/**
253253
* Override to provide a custom {@link DependentResourceFactory} implementation to change how
254254
* {@link io.javaoperatorsdk.operator.api.reconciler.dependent.DependentResource} are instantiated
255-
*
255+
*
256256
* @return the custom {@link DependentResourceFactory} implementation
257257
*/
258258
@SuppressWarnings("rawtypes")
@@ -264,7 +264,7 @@ default DependentResourceFactory dependentResourceFactory() {
264264
* Retrieves the optional {@link LeaderElectionConfiguration} to specify how the associated
265265
* {@link io.javaoperatorsdk.operator.Operator} handles leader election to ensure only one
266266
* instance of the operator runs on the cluster at any given time
267-
*
267+
*
268268
* @return the {@link LeaderElectionConfiguration}
269269
*/
270270
default Optional<LeaderElectionConfiguration> getLeaderElectionConfiguration() {
@@ -299,6 +299,17 @@ default Duration cacheSyncTimeout() {
299299
return Duration.ofMinutes(2);
300300
}
301301

302+
/**
303+
* This is the timeout value that allows the reconciliation threads to gracefully shut down. If no
304+
* value is set, the default is immediate shutdown.
305+
*
306+
* @return The duration of time to wait before terminating the reconciliation threads
307+
* @since 5.0.0
308+
*/
309+
default Duration reconciliationTerminationTimeout() {
310+
return Duration.ZERO;
311+
}
312+
302313
/**
303314
* Handler for an informer stop. An informer stops if there is a non-recoverable error, e.g. on receiving
304315
* a resource that cannot be deserialized.
@@ -326,7 +337,7 @@ default Optional<InformerStoppedHandler> getInformerStoppedHandler() {
326337
* Override to provide a custom {@link ManagedWorkflowFactory} implementation to change how
327338
* {@link io.javaoperatorsdk.operator.processing.dependent.workflow.ManagedWorkflow} are
328339
* instantiated
329-
*
340+
*
330341
* @return the custom {@link ManagedWorkflowFactory} implementation
331342
*/
332343
@SuppressWarnings("rawtypes")
@@ -336,7 +347,7 @@ default ManagedWorkflowFactory getWorkflowFactory() {
336347

337348
/**
338349
* Override to provide a custom {@link ExecutorServiceManager} implementation
339-
*
350+
*
340351
* @return the custom {@link ExecutorServiceManager} implementation
341352
*/
342353
default ExecutorServiceManager getExecutorServiceManager() {
@@ -353,9 +364,8 @@ default ExecutorServiceManager getExecutorServiceManager() {
353364
* SSA based create/update can be still used with the legacy matching, just overriding the match
354365
* method of Kubernetes Dependent Resource.
355366
*
356-
* @since 4.4.0
357-
*
358367
* @return if SSA should be used for dependent resources
368+
* @since 4.4.0
359369
*/
360370
default boolean ssaBasedCreateUpdateMatchForDependentResources() {
361371
return true;
@@ -383,9 +393,8 @@ default Set<Class<? extends HasMetadata>> defaultNonSSAResource() {
383393
* <p>
384394
* Disable this if you want to react to your own dependent resource updates
385395
*
386-
* @since 4.5.0
387-
*
388396
* @return if special annotation should be used for dependent resource to filter events
397+
* @since 4.5.0
389398
*/
390399
default boolean previousAnnotationForDependentResourcesEventFiltering() {
391400
return true;
@@ -400,9 +409,8 @@ default boolean previousAnnotationForDependentResourcesEventFiltering() {
400409
* logic, and you want to further minimize the amount of work done / updates issued by the
401410
* operator.
402411
*
403-
* @since 4.5.0
404-
*
405412
* @return if resource version should be parsed (as integer)
413+
* @since 4.5.0
406414
*/
407415
default boolean parseResourceVersionsForEventFilteringAndCaching() {
408416
return false;
@@ -415,8 +423,8 @@ default boolean parseResourceVersionsForEventFilteringAndCaching() {
415423
*
416424
* @return {@code true} if Server-Side Apply (SSA) should be used when patching the primary
417425
* resources, {@code false} otherwise
418-
* @since 5.0.0
419426
* @see ConfigurationServiceOverrider#withUseSSAToPatchPrimaryResource(boolean)
427+
* @since 5.0.0
420428
*/
421429
default boolean useSSAToPatchPrimaryResource() {
422430
return true;
@@ -427,18 +435,17 @@ default boolean useSSAToPatchPrimaryResource() {
427435
* Determines whether resources retrieved from caches such as via calls to
428436
* {@link Context#getSecondaryResource(Class)} should be defensively cloned first.
429437
* </p>
430-
*
438+
*
431439
* <p>
432440
* Defensive cloning to prevent problematic cache modifications (modifying the resource would
433441
* otherwise modify the stored copy in the cache) was transparently done in previous JOSDK
434442
* versions. This might have performance consequences and, with the more prevalent use of
435443
* Server-Side Apply, where you should create a new copy of your resource with only modified
436444
* fields, such modifications of these resources are less likely to occur.
437445
* </p>
438-
*
446+
*
439447
* @return {@code true} if resources should be defensively cloned before returning them from
440448
* caches, {@code false} otherwise
441-
*
442449
* @since 5.0.0
443450
*/
444451
default boolean cloneSecondaryResourcesWhenGettingFromCache() {

Diff for: operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/config/ConfigurationServiceOverrider.java

+13
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ public class ConfigurationServiceOverrider {
3232
private InformerStoppedHandler informerStoppedHandler;
3333
private Boolean stopOnInformerErrorDuringStartup;
3434
private Duration cacheSyncTimeout;
35+
private Duration reconciliationTerminationTimeout;
3536
private Boolean ssaBasedCreateUpdateMatchForDependentResources;
3637
private Set<Class<? extends HasMetadata>> defaultNonSSAResource;
3738
private Boolean previousAnnotationForDependentResources;
@@ -127,6 +128,12 @@ public ConfigurationServiceOverrider withCacheSyncTimeout(Duration cacheSyncTime
127128
return this;
128129
}
129130

131+
public ConfigurationServiceOverrider withReconciliationTerminationTimeout(
132+
Duration reconciliationTerminationTimeout) {
133+
this.reconciliationTerminationTimeout = reconciliationTerminationTimeout;
134+
return this;
135+
}
136+
130137
public ConfigurationServiceOverrider withSSABasedCreateUpdateMatchForDependentResources(
131138
boolean value) {
132139
this.ssaBasedCreateUpdateMatchForDependentResources = value;
@@ -251,6 +258,12 @@ public Duration cacheSyncTimeout() {
251258
return overriddenValueOrDefault(cacheSyncTimeout, ConfigurationService::cacheSyncTimeout);
252259
}
253260

261+
@Override
262+
public Duration reconciliationTerminationTimeout() {
263+
return overriddenValueOrDefault(reconciliationTerminationTimeout,
264+
ConfigurationService::reconciliationTerminationTimeout);
265+
}
266+
254267
@Override
255268
public boolean ssaBasedCreateUpdateMatchForDependentResources() {
256269
return overriddenValueOrDefault(ssaBasedCreateUpdateMatchForDependentResources,

Diff for: operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/config/ConfigurationServiceOverriderTest.java

+4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package io.javaoperatorsdk.operator.api.config;
22

3+
import java.time.Duration;
34
import java.util.Optional;
45
import java.util.concurrent.Executors;
56

@@ -63,6 +64,7 @@ public <R extends HasMetadata> R clone(R object) {
6364
.withLeaderElectionConfiguration(new LeaderElectionConfiguration("newLease", "newLeaseNS"))
6465
.withInformerStoppedHandler((informer, ex) -> {
6566
})
67+
.withReconciliationTerminationTimeout(Duration.ofSeconds(30))
6668
.build();
6769

6870
assertNotEquals(config.closeClientOnStop(), overridden.closeClientOnStop());
@@ -77,6 +79,8 @@ public <R extends HasMetadata> R clone(R object) {
7779
overridden.getLeaderElectionConfiguration());
7880
assertNotEquals(config.getInformerStoppedHandler(),
7981
overridden.getLeaderElectionConfiguration());
82+
assertNotEquals(config.reconciliationTerminationTimeout(),
83+
overridden.reconciliationTerminationTimeout());
8084
}
8185

8286
}

Diff for: operator-framework/src/test/java/io/javaoperatorsdk/operator/GracefulStopIT.java

+6-11
Original file line numberDiff line numberDiff line change
@@ -18,26 +18,21 @@
1818
public class GracefulStopIT {
1919

2020
public static final String TEST_1 = "test1";
21-
public static final String TEST_2 = "test2";
2221

2322
@RegisterExtension
2423
LocallyRunOperatorExtension operator =
2524
LocallyRunOperatorExtension.builder()
26-
.withConfigurationService(o -> o.withCloseClientOnStop(false))
25+
.withConfigurationService(o -> o.withCloseClientOnStop(false)
26+
.withReconciliationTerminationTimeout(Duration.ofMillis(RECONCILER_SLEEP)))
2727
.withReconciler(new GracefulStopTestReconciler())
2828
.build();
2929

3030
@Test
31-
void stopsGracefullyWIthTimeout() {
32-
testGracefulStop(TEST_1, RECONCILER_SLEEP, 2);
31+
void stopsGracefullyWithTimeoutConfiguration() {
32+
testGracefulStop(TEST_1, 2);
3333
}
3434

35-
@Test
36-
void stopsGracefullyWithExpiredTimeout() {
37-
testGracefulStop(TEST_2, RECONCILER_SLEEP / 5, 1);
38-
}
39-
40-
private void testGracefulStop(String resourceName, int stopTimeout, int expectedFinalGeneration) {
35+
private void testGracefulStop(String resourceName, int expectedFinalGeneration) {
4136
var testRes = operator.create(testResource(resourceName));
4237
await().untilAsserted(() -> {
4338
var r = operator.get(GracefulStopTestCustomResource.class, resourceName);
@@ -54,7 +49,7 @@ private void testGracefulStop(String resourceName, int stopTimeout, int expected
5449
() -> assertThat(operator.getReconcilerOfType(GracefulStopTestReconciler.class)
5550
.getNumberOfExecutions()).isEqualTo(2));
5651

57-
operator.getOperator().stop(Duration.ofMillis(stopTimeout));
52+
operator.getOperator().stop();
5853

5954
await().untilAsserted(() -> {
6055
var r = operator.get(GracefulStopTestCustomResource.class, resourceName);

Diff for: operator-framework/src/test/java/io/javaoperatorsdk/operator/InformerRelatedBehaviorITS.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ void beforeEach(TestInfo testInfo) {
7676
@AfterEach
7777
void cleanup() {
7878
if (operator != null) {
79-
operator.stop(Duration.ofSeconds(1));
79+
operator.stop();
8080
}
8181
adminClient.resource(dependentConfigMap()).delete();
8282
adminClient.resource(testCustomResource()).delete();
@@ -321,6 +321,7 @@ Operator startOperator(boolean stopOnInformerErrorDuringStartup, boolean addStop
321321
co.withKubernetesClient(clientUsingServiceAccount());
322322
co.withStopOnInformerErrorDuringStartup(stopOnInformerErrorDuringStartup);
323323
co.withCacheSyncTimeout(Duration.ofMillis(3000));
324+
co.withReconciliationTerminationTimeout(Duration.ofSeconds(1));
324325
if (addStopHandler) {
325326
co.withInformerStoppedHandler((informer, ex) -> replacementStopHandlerCalled = true);
326327
}

0 commit comments

Comments
 (0)