Commit 60f17bb

Remove alertmanager alerts (#296)
Signed-off-by: ArthurSens <[email protected]>
Co-authored-by: Aleksandar Aleksandrov <[email protected]>
1 parent fd73f04 commit 60f17bb

2 files changed: +11 -123 lines

lib/alert-filter.libsonnet (+10)

@@ -68,6 +68,16 @@ local unwatedAlerts = [
   'KubeletDown', // Re-added to platform-mixin
   'KubeClientErrors',
 
+  // From Alertmanager
+  'AlertmanagerFailedReload', // Re-added to platform-mixin
+  'AlertmanagerFailedToSendAlerts', // Re-added to platform-mixin
+  'AlertmanagerMembersInconsistent',
+  'AlertmanagerClusterFailedToSendAlerts',
+  'AlertmanagerConfigInconsistent',
+  'AlertmanagerClusterDown',
+  'AlertmanagerClusterCrashlooping',
+
+
   // From kube-state-metrics
   'KubeStateMetricsWatchErrors', // Re-added to platform-mixin
   'KubeStateMetricsShardingMismatch',

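For context on how this deny-list takes effect, here is a minimal jsonnet sketch of the filtering pattern: any alerting rule whose name appears in unwatedAlerts is dropped from the generated rule groups, while recording rules pass through. The helper names (filterOutAlerts, filterRuleGroups) and the group shape are illustrative assumptions, not the actual API of this repository.

// Minimal sketch (assumption): drop alerting rules whose names are on the
// deny-list; recording rules and other fields pass through untouched.
local unwatedAlerts = [
  'AlertmanagerMembersInconsistent',
  'AlertmanagerClusterDown',
  // ... remaining entries from alert-filter.libsonnet
];

// Hypothetical helper: filter a single rule group.
local filterOutAlerts(group) = group {
  rules: [
    rule
    for rule in super.rules
    if !std.objectHas(rule, 'alert') || !std.member(unwatedAlerts, rule.alert)
  ],
};

{
  // Hypothetical entry point: apply the filter to every group in a spec.
  filterRuleGroups(spec):: spec {
    groups: [filterOutAlerts(g) for g in super.groups],
  },
}

With every alertmanager.* alert now on the list, the generated alertmanager.rules group collapses to rules: [], which is exactly what the rules.yaml hunk below shows.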
monitoring-satellite/manifests/kube-prometheus-rules/rules.yaml (+1 -123)

@@ -63,129 +63,7 @@ spec:
   - expr: count without(instance, pod, node) (up == 0)
     record: count:up0
   - name: alertmanager.rules
-    rules:
-    - alert: AlertmanagerFailedReload
-      annotations:
-        description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerFailedReload.md
-        summary: Reloading an Alertmanager configuration has failed.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring-satellite"}[5m]) == 0
-      for: 10m
-      labels:
-        severity: critical
-    - alert: AlertmanagerMembersInconsistent
-      annotations:
-        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerMembersInconsistent.md
-        summary: A member of an Alertmanager cluster has not found all other cluster members.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-          max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring-satellite"}[5m])
-        < on (cluster) group_left
-          count by (cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring-satellite"}[5m]))
-      for: 15m
-      labels:
-        severity: critical
-    - alert: AlertmanagerFailedToSendAlerts
-      annotations:
-        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerFailedToSendAlerts.md
-        summary: An Alertmanager instance failed to send notifications.
-      expr: |
-        (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring-satellite"}[5m])
-        /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring-satellite"}[5m])
-        )
-        > 0.01
-      for: 5m
-      labels:
-        severity: warning
-    - alert: AlertmanagerClusterFailedToSendAlerts
-      annotations:
-        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterFailedToSendAlerts.md
-        summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
-      expr: |
-        min by (cluster, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring-satellite", integration=~`slack|pagerduty`}[5m])
-        /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring-satellite", integration=~`slack|pagerduty`}[5m])
-        )
-        > 0.01
-      for: 5m
-      labels:
-        severity: critical
-    - alert: AlertmanagerClusterFailedToSendAlerts
-      annotations:
-        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterFailedToSendAlerts.md
-        summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
-      expr: |
-        min by (cluster, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring-satellite", integration!~`slack|pagerduty`}[5m])
-        /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring-satellite", integration!~`slack|pagerduty`}[5m])
-        )
-        > 0.01
-      for: 5m
-      labels:
-        severity: warning
-    - alert: AlertmanagerConfigInconsistent
-      annotations:
-        description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerConfigInconsistent.md
-        summary: Alertmanager instances within the same cluster have different configurations.
-      expr: |
-        count by (cluster) (
-          count_values by (cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring-satellite"})
-        )
-        != 1
-      for: 20m
-      labels:
-        severity: critical
-    - alert: AlertmanagerClusterDown
-      annotations:
-        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterDown.md
-        summary: Half or more of the Alertmanager instances within the same cluster are down.
-      expr: |
-        (
-          count by (cluster) (
-            avg_over_time(up{job="alertmanager-main",namespace="monitoring-satellite"}[5m]) < 0.5
-          )
-        /
-          count by (cluster) (
-            up{job="alertmanager-main",namespace="monitoring-satellite"}
-          )
-        )
-        >= 0.5
-      for: 5m
-      labels:
-        severity: critical
-    - alert: AlertmanagerClusterCrashlooping
-      annotations:
-        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
-        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AlertmanagerClusterCrashlooping.md
-        summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
-      expr: |
-        (
-          count by (cluster) (
-            changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring-satellite"}[10m]) > 4
-          )
-        /
-          count by (cluster) (
-            up{job="alertmanager-main",namespace="monitoring-satellite"}
-          )
-        )
-        >= 0.5
-      for: 5m
-      labels:
-        severity: critical
+    rules: []
   - name: kube-state-metrics
     rules: []
   - name: kubernetes-apps
