Skip to content

Commit 316223f

Browse files
committed
[ops] WebApp: Fix WebAppServicesCrashlooping
1 parent 3b70e47 commit 316223f

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

operations/observability/mixins/meta/rules/components/server/alerts.libsonnet

+3-2
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,9 @@
134134
},
135135
{
136136
alert: 'WebAppServicesCrashlooping',
137-
expr: 'sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80',
138-
'for': '15m',
137+
// Reasoning: alert if any container in a matching pod has restarted more than 3 times within a 5-minute window.
138+
expr: 'increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3',
139+
'for': '5m',
139140
labels: {
140141
// sent to the team-internal channel until we have fine-tuned it
141142
severity: 'warning',

0 commit comments

Comments
 (0)