UPSTREAM: <drop>: leaderelection settings

thetechnick · openshift-merge-bot[bot] · commit 8f2307c3f127 · 2025-01-29T18:36:25.000Z
Extensive e2e tests revealed that our component might run into
leader election timeouts during cluster bootstrap, causing
sporadic alerts to be generated.

This commit uses the recommended settings for leader election:
LeaseDuration: 15s -> 137s
RenewDeadline: 10s -> 107s
RetryPeriod:    2s ->  26s

Warning: This will increase the potential downtime of components
to 163s in the worst case (LeaseDuration + RetryPeriod = 137s + 26s),
up from 17s (15s + 2s).
diff --git a/catalogd/cmd/catalogd/main.go b/catalogd/cmd/catalogd/main.go
@@ -42,6 +42,7 @@ import (
 	_ "k8s.io/client-go/plugin/pkg/client/auth"
 	"k8s.io/klog/v2"
 	"k8s.io/klog/v2/textlogger"
+	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	crcache "sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/certwatcher"
@@ -231,8 +232,14 @@ func main() {
 		HealthProbeBindAddress: probeAddr,
 		LeaderElection:         enableLeaderElection,
 		LeaderElectionID:       "catalogd-operator-lock",
-		WebhookServer:          webhookServer,
-		Cache:                  cacheOptions,
+		// Recommended Leader Election values
+		// https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption
+		LeaseDuration: ptr.To(137 * time.Second),
+		RenewDeadline: ptr.To(107 * time.Second),
+		RetryPeriod:   ptr.To(26 * time.Second),
+
+		WebhookServer: webhookServer,
+		Cache:         cacheOptions,
 	})
 	if err != nil {
 		setupLog.Error(err, "unable to create manager")
diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go
@@ -40,6 +40,7 @@ import (
 	_ "k8s.io/client-go/plugin/pkg/client/auth"
 	"k8s.io/klog/v2"
 	"k8s.io/klog/v2/textlogger"
+	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	crcache "sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/certwatcher"
@@ -229,7 +230,13 @@ func main() {
 		HealthProbeBindAddress: probeAddr,
 		LeaderElection:         enableLeaderElection,
 		LeaderElectionID:       "9c4404e7.operatorframework.io",
-		Cache:                  cacheOptions,
+		// Recommended Leader Election values
+		// https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption
+		LeaseDuration: ptr.To(137 * time.Second),
+		RenewDeadline: ptr.To(107 * time.Second),
+		RetryPeriod:   ptr.To(26 * time.Second),
+
+		Cache: cacheOptions,
 		// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
 		// when the Manager ends. This requires the binary to immediately end when the
 		// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly