@@ -63,129 +63,7 @@ spec:
     - expr: count without(instance, pod, node) (up == 0)
       record: count:up0
   - name: alertmanager.rules
-    rules:
-    - alert: AlertmanagerFailedReload
-      annotations:
-        description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerFailedReload.md
-        summary: Reloading an Alertmanager configuration has failed.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring-satellite"}[5m]) == 0
-      for: 10m
-      labels:
-        severity: critical
-    - alert: AlertmanagerMembersInconsistent
-      annotations:
-        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerMembersInconsistent.md
-        summary: A member of an Alertmanager cluster has not found all other cluster members.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring-satellite"}[5m])
-          < on (cluster) group_left
-        count by (cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring-satellite"}[5m]))
-      for: 15m
-      labels:
-        severity: critical
-    - alert: AlertmanagerFailedToSendAlerts
-      annotations:
-        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerFailedToSendAlerts.md
-        summary: An Alertmanager instance failed to send notifications.
-      expr: |
-        (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring-satellite"}[5m])
-        /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring-satellite"}[5m])
-        )
-        > 0.01
-      for: 5m
-      labels:
-        severity: warning
-    - alert: AlertmanagerClusterFailedToSendAlerts
-      annotations:
-        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterFailedToSendAlerts.md
-        summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
-      expr: |
-        min by (cluster, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring-satellite", integration=~`slack|pagerduty`}[5m])
-        /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring-satellite", integration=~`slack|pagerduty`}[5m])
-        )
-        > 0.01
-      for: 5m
-      labels:
-        severity: critical
-    - alert: AlertmanagerClusterFailedToSendAlerts
-      annotations:
-        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterFailedToSendAlerts.md
-        summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
-      expr: |
-        min by (cluster, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring-satellite", integration!~`slack|pagerduty`}[5m])
-        /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring-satellite", integration!~`slack|pagerduty`}[5m])
-        )
-        > 0.01
-      for: 5m
-      labels:
-        severity: warning
-    - alert: AlertmanagerConfigInconsistent
-      annotations:
-        description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerConfigInconsistent.md
-        summary: Alertmanager instances within the same cluster have different configurations.
-      expr: |
-        count by (cluster) (
-          count_values by (cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring-satellite"})
-        )
-        != 1
-      for: 20m
-      labels:
-        severity: critical
-    - alert: AlertmanagerClusterDown
-      annotations:
-        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterDown.md
-        summary: Half or more of the Alertmanager instances within the same cluster are down.
-      expr: |
-        (
-          count by (cluster) (
-            avg_over_time(up{job="alertmanager-main",namespace="monitoring-satellite"}[5m]) < 0.5
-          )
-        /
-          count by (cluster) (
-            up{job="alertmanager-main",namespace="monitoring-satellite"}
-          )
-        )
-        >= 0.5
-      for: 5m
-      labels:
-        severity: critical
-    - alert: AlertmanagerClusterCrashlooping
-      annotations:
-        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterCrashlooping.md
-        summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
-      expr: |
-        (
-          count by (cluster) (
-            changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring-satellite"}[10m]) > 4
-          )
-        /
-          count by (cluster) (
-            up{job="alertmanager-main",namespace="monitoring-satellite"}
-          )
-        )
-        >= 0.5
-      for: 5m
-      labels:
-        severity: critical
+    rules: []
   - name: kube-state-metrics
     rules: []
   - name: kubernetes-apps