# kubernetes.yaml
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License-AGPL.txt in the project root for license information.
# PrometheusRule defining the "kubernetes" alert group: DaemonSet scheduling,
# Job completion/failure, PersistentVolume health, component version skew,
# node readiness and kubelet availability.
# Every rule carries `team: platform` and a `severity` of warning or critical.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # NOTE(review): `prometheus` / `role` are presumably the labels a Prometheus
  # ruleSelector matches on to load this object - confirm against the
  # Prometheus resource in this deployment.
  labels:
    app.kubernetes.io/name: kubernetes
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: kubernetes-monitoring-rules
  namespace: monitoring-satellite
spec:
  groups:
    - name: kubernetes
      rules:
        # Desired minus currently scheduled DaemonSet pods stays above zero
        # for 10 minutes.
        - alert: KubeDaemonSetNotScheduled
          annotations:
            description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
            summary: DaemonSet pods are not scheduled.
          expr: |
            kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
            -
            kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
          for: 10m
          labels:
            severity: warning
            team: platform
        # A Job that is still active started more than 43200 seconds (12 hours)
        # ago. The same 43200 literal is rendered in the description, so keep
        # the two in sync. No `for:` is set on this rule.
        - alert: KubeJobNotCompleted
          annotations:
            description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete.
            summary: Job did not complete in time
          expr: |
            time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"}
            and
            kube_job_status_active{job="kube-state-metrics"} > 0) > 43200
          labels:
            severity: warning
            team: platform
        # kube_job_failed stays above zero for 15 minutes.
        - alert: KubeJobFailed
          annotations:
            description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
            summary: Job failed to complete.
          expr: |
            kube_job_failed{job="kube-state-metrics"} > 0
          for: 15m
          labels:
            severity: critical
            team: platform
        # Available/capacity ratio of a volume is below 0.03 (3%) while the
        # volume has used bytes. Claims with access_mode="ReadOnlyMany" or the
        # label excluded_from_alerts="true" are excluded via `unless`.
        - alert: KubePersistentVolumeFillingUp
          annotations:
            description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
            summary: PersistentVolume is filling up.
          expr: |
            (
            kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
            /
            kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
            ) < 0.03
            and
            kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
          for: 1m
          labels:
            severity: critical
            team: platform
        # A PersistentVolume reports phase Failed or Pending for 5 minutes.
        - alert: KubePersistentVolumeErrors
          annotations:
            description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
            summary: PersistentVolume is having issues with provisioning.
          expr: |
            kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
          for: 5m
          labels:
            severity: critical
            team: platform
        # More than one distinct (truncated) git_version per cluster for
        # 15 minutes; series from jobs kube-dns and coredns are ignored.
        # label_replace rewrites git_version to the first capture group of the
        # regex before counting.
        # NOTE(review): the `.` between the two digit groups in the regex is
        # unescaped, so it matches any character rather than only a literal dot.
        - alert: KubeVersionMismatch
          annotations:
            description: There are {{ $value }} different semantic versions of Kubernetes components running.
            summary: Different semantic versions of Kubernetes components running.
          expr: |
            count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
          for: 15m
          labels:
            severity: warning
            team: platform
        # The condition="Ready", status="true" series of a node equals 0 for
        # 15 minutes (matches the duration quoted in the description).
        - alert: KubeNodeNotReady
          annotations:
            description: '{{ $labels.node }} has been unready for more than 15 minutes.'
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/KubeNodeNotReady.md
            summary: Node is not ready.
          expr: |
            kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
          for: 15m
          labels:
            severity: critical
            team: platform
        # No `up` series with value 1 exists for job="kubelet",
        # metrics_path="/metrics" for 15 minutes.
        - alert: KubeletDown
          annotations:
            description: Kubelet has disappeared from Prometheus target discovery.
            summary: Target disappeared from Prometheus target discovery.
          expr: |
            absent(up{job="kubelet", metrics_path="/metrics"} == 1)
          for: 15m
          labels:
            severity: critical
            team: platform