Skip to content

Commit cda76ee

Browse files
simonpasquier authored and tmshort committed
Make alerting rules resilient to scrape failures
This change tightens the alerting rules to avoid resetting the alerts upon transient scrape failures. It also removes the `message` annotation in favor of the `description` annotation which is more commonly used by the Prometheus community. Signed-off-by: Simon Pasquier <[email protected]>
1 parent fbd6f95 commit cda76ee

File tree

1 file changed

+5
-10
lines changed

1 file changed

+5
-10
lines changed

deploy/chart/templates/0000_90_olm_01-prometheus-rule.yaml

+5-10
Original file line number | Diff line number | Diff line change
@@ -12,25 +12,21 @@ spec:
1212
- name: olm.csv_abnormal.rules
1313
rules:
1414
- alert: CsvAbnormalFailedOver2Min
15-
expr: csv_abnormal{phase=~"^Failed$"}
15+
expr: last_over_time(csv_abnormal{phase="Failed"}[5m])
1616
for: 2m
1717
labels:
1818
severity: warning
19-
namespace: "{{ "{{ $labels.namespace }}" }}"
2019
annotations:
2120
summary: CSV failed for over 2 minutes
22-
description: Fires whenever a CSV has been in the failed phase for more than 2 minutes.
23-
message: Failed to install Operator {{ printf "{{ $labels.name }}" }} version {{ printf "{{ $labels.version }}" }}. Reason-{{ printf "{{ $labels.reason }}" }}
21+
description: Failed to install Operator {{ printf "{{ $labels.name }}" }} version {{ printf "{{ $labels.version }}" }}. Reason-{{ printf "{{ $labels.reason }}" }}
2422
- alert: CsvAbnormalOver30Min
25-
expr: csv_abnormal{phase=~"(^Replacing$|^Pending$|^Deleting$|^Unknown$)"}
23+
expr: last_over_time(csv_abnormal{phase=~"(Replacing|Pending|Deleting|Unknown)"}[5m])
2624
for: 30m
2725
labels:
2826
severity: warning
29-
namespace: "{{ "{{ $labels.namespace }}" }}"
3027
annotations:
3128
summary: CSV abnormal for over 30 minutes
32-
description: Fires whenever a CSV is in the Replacing, Pending, Deleting, or Unknown phase for more than 30 minutes.
33-
message: Failed to install Operator {{ printf "{{ $labels.name }}" }} version {{ printf "{{ $labels.version }}" }}. Phase-{{ printf "{{ $labels.phase }}" }} Reason-{{ printf "{{ $labels.reason }}" }}
29+
description: Failed to install Operator {{ printf "{{ $labels.name }}" }} version {{ printf "{{ $labels.version }}" }}. Phase-{{ printf "{{ $labels.phase }}" }} Reason-{{ printf "{{ $labels.reason }}" }}
3430
- name: olm.installplan.rules
3531
rules:
3632
- alert: InstallPlanStepAppliedWithWarnings
@@ -39,6 +35,5 @@ spec:
3935
severity: warning
4036
annotations:
4137
summary: API returned a warning when modifying an operator
42-
description: Fires whenever the API server returns a warning when attempting to modify an operator.
43-
message: The API server returned a warning during installation or upgrade of an operator. An Event with reason "AppliedWithWarnings" has been created with complete details, including a reference to the InstallPlan step that generated the warning.
38+
description: The API server returned a warning during installation or upgrade of an operator. An Event with reason "AppliedWithWarnings" has been created with complete details, including a reference to the InstallPlan step that generated the warning.
4439
{{ end }}

0 commit comments

Comments
 (0)