Skip to content

Commit aee56a5

Browse files
ArthurSens authored and roboquat committed
Add alerts related to kubernetes resources
Signed-off-by: ArthurSens <[email protected]>
1 parent 07af344 commit aee56a5

File tree

1 file changed

+135
-0
lines changed
  • operations/observability/mixins/platform/rules/kubernetes

1 file changed

+135
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License-AGPL.txt in the project root for license information.

# PrometheusRule (monitoring.coreos.com/v1) that bundles Kubernetes resource
# alerts into a single rule group named "kubernetes". Every alert carries the
# label `team: platform` plus a `severity` of either `warning` or `critical`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # NOTE(review): these labels are presumably what the Prometheus instance's
  # rule selector matches on - confirm against the Prometheus resource.
  labels:
    app.kubernetes.io/name: kubernetes
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: kubernetes-monitoring-rules
  namespace: monitoring-satellite
spec:
  groups:
    - name: kubernetes
      rules:
        # Desired scheduled pods exceed currently scheduled pods for 10 minutes.
        - alert: KubeDaemonSetNotScheduled
          annotations:
            description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
            summary: DaemonSet pods are not scheduled.
          expr: |
            kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
              -
            kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
          for: 10m
          labels:
            severity: warning
            team: platform

        # A still-active Job started more than 43200 seconds (12 hours) ago.
        # There is no `for:` clause; the elapsed-time threshold lives in the
        # expression itself and must stay in sync with the description.
        - alert: KubeJobNotCompleted
          annotations:
            description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete.'
            summary: Job did not complete in time
          expr: |
            time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"}
              and
            kube_job_status_active{job="kube-state-metrics"} > 0) > 43200
          labels:
            severity: warning
            team: platform

        # kube_job_failed has been above zero for 15 minutes.
        - alert: KubeJobFailed
          annotations:
            description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.'
            summary: Job failed to complete.
          expr: |
            kube_job_failed{job="kube-state-metrics"} > 0
          for: 15m
          labels:
            severity: warning
            team: platform

        # Total CPU requests exceed the allocatable CPU that would remain after
        # losing the largest node (sum - max). The second clause suppresses
        # the alert when that remaining capacity is not positive, e.g. when
        # only one node reports allocatable CPU.
        - alert: KubeCPUOvercommit
          annotations:
            description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.'
            summary: Cluster has overcommitted CPU resource requests.
          expr: |
            sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
            and
            (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
          for: 10m
          labels:
            severity: warning
            team: platform

        # Same check as KubeCPUOvercommit, applied to memory requests.
        - alert: KubeMemoryOvercommit
          annotations:
            description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.'
            summary: Cluster has overcommitted memory resource requests.
          expr: |
            sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
            and
            (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
          for: 10m
          labels:
            severity: warning
            team: platform

        # Less than 3% of a volume's capacity is available while the volume
        # has some used bytes. Claims with access mode ReadOnlyMany and claims
        # labelled excluded_from_alerts="true" are filtered out via `unless`.
        - alert: KubePersistentVolumeFillingUp
          annotations:
            description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.'
            summary: PersistentVolume is filling up.
          expr: |
            (
              kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
                /
              kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
            ) < 0.03
            and
            kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
          for: 1m
          labels:
            severity: critical
            team: platform

        # A PersistentVolume has been in phase Failed or Pending for 5 minutes.
        - alert: KubePersistentVolumeErrors
          annotations:
            description: 'The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.'
            summary: PersistentVolume is having issues with provisioning.
          expr: |
            kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
          for: 5m
          labels:
            severity: critical
            team: platform

        # git_version is reduced to its "v<major>.<minor>" prefix, then the
        # number of distinct prefixes per cluster is counted; more than one
        # fires the alert. Jobs kube-dns and coredns are excluded.
        # `[.]` matches a literal dot without needing backslash escaping
        # inside the PromQL string.
        - alert: KubeVersionMismatch
          annotations:
            description: 'There are {{ $value }} different semantic versions of Kubernetes components running.'
            summary: Different semantic versions of Kubernetes components running.
          expr: |
            count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*[.][0-9]*).*"))) > 1
          for: 15m
          labels:
            severity: warning
            team: platform

        # The Ready condition's status="true" series has been 0 for 15
        # minutes; the `for:` value must match the duration in the description.
        - alert: KubeNodeNotReady
          annotations:
            description: '{{ $labels.node }} has been unready for more than 15 minutes.'
            summary: Node is not ready.
          expr: |
            kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
          for: 15m
          labels:
            severity: critical
            team: platform

        # No kubelet target with up == 1 has existed for 15 minutes.
        - alert: KubeletDown
          annotations:
            description: Kubelet has disappeared from Prometheus target discovery.
            summary: Target disappeared from Prometheus target discovery.
          expr: |
            absent(up{job="kubelet", metrics_path="/metrics"} == 1)
          for: 15m
          labels:
            severity: critical
            team: platform

0 commit comments

Comments
 (0)