openshift · wking · Apr 11, 2025 · wking · Apr 15, 2025 · wking
diff --git a/manifests/0000_90_kube-controller-manager-operator_05_alerts.yaml b/manifests/0000_90_kube-controller-manager-operator_05_alerts.yaml
@@ -25,7 +25,8 @@ spec:
         - alert: PodDisruptionBudgetAtLimit
           annotations:
             summary: The pod disruption budget is preventing further disruption to pods.
-            description: The pod disruption budget is at the minimum disruptions allowed level. The number of current healthy pods is equal to the desired healthy pods.
+            description: |-
+              The {{ $labels.poddisruptionbudget }} pod disruption budget in the {{ $labels.namespace }} namespace is at the maximum allowed disruption. The number of current healthy pods is equal to the desired healthy pods.{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url))) 0}} For more information refer to {{ label "url" (first $console_url) }}/k8s/ns/{{ $labels.namespace }}/poddisruptionbudgets/{{ $labels.poddisruptionbudget }}{{ end }}{{ end }}
             runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
           expr: |
             max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
@@ -35,17 +36,19 @@ spec:
         - alert: PodDisruptionBudgetLimit
           annotations:
             summary: The pod disruption budget registers insufficient amount of pods.
-            description: The pod disruption budget is below the minimum disruptions allowed level and is not satisfied. The number of current healthy pods is less than the desired healthy pods.
+            description: |-
+              The {{ $labels.poddisruptionbudget }} pod disruption budget in the {{ $labels.namespace }} namespace exceeds the maximum allowed disruption and is not satisfied. The number of current healthy pods is {{ $value }} less than the desired healthy pods.{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url))) 0}} For more information refer to {{ label "url" (first $console_url) }}/k8s/ns/{{ $labels.namespace }}/poddisruptionbudgets/{{ $labels.poddisruptionbudget }}{{ end }}{{ end }}
             runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetLimit.md
           expr: |
-            max by (namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy < kube_poddisruptionbudget_status_desired_healthy)
+            max by (namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_desired_healthy - kube_poddisruptionbudget_status_current_healthy) > 0
           for: 15m
           labels:
             severity: critical
         - alert: GarbageCollectorSyncFailed
           annotations:
             summary: There was a problem with syncing the resources for garbage collection.
-            description: Garbage Collector had a problem with syncing and monitoring the available resources. Please see KubeControllerManager logs for more details.
+            description: |-
+              Garbage Collector had a problem with syncing and monitoring the available resources. Please see KubeControllerManager logs for more details: 'oc -n {{ $labels.namespace }} logs -c {{ $labels.container }} {{ $labels.pod }}'.{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url))) 0}} For more information refer to {{ label "url" (first $console_url) }}/k8s/ns/{{ $labels.namespace }}/pods/{{ $labels.pod }}/logs?container={{ $labels.container }}{{ end }}{{ end }}
             runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/GarbageCollectorSyncFailed.md
           expr: |
             rate(garbagecollector_controller_resources_sync_error_total{}[5m]) > 0