-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
Copy pathalerts.libsonnet
141 lines (140 loc) · 5.63 KB
/
alerts.libsonnet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/**
* Copyright (c) 2021 Gitpod GmbH. All rights reserved.
* Licensed under the MIT License. See License-MIT.txt in the project root for license information.
*/
{
prometheusAlerts+:: {
groups+: [
{
name: 'gitpod-component-workspace-alerts',
rules: [
{
  // Critical alert: for a single container waiting reason, more than 5
  // series remain after joining waiting reasons with regular workspace pod
  // labels, and that condition has held for 20 minutes.
  alert: 'GitpodWorkspaceStuckOnStarting',
  labels: {
    severity: 'critical',
  },
  'for': '20m',
  annotations: {
    runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceStuckOnStarting.md',
    // Wording follows the strict `> 5` comparison used in `expr`
    // (the alert does not fire at exactly 5).
    summary: 'more than 5 workspaces are stuck on starting',
    description: '{{ printf "%.2f" $value }} regular workspaces are stuck on starting for more than 20 minutes. Current status: "{{ $labels.reason }}"',
  },
  // Joins waiting-reason series with pod labels on `pod`, restricted to
  // component="workspace" / workspace_type="regular", then counts per `reason`.
  expr: |||
    count(
      kube_pod_container_status_waiting_reason * on(pod) group_left kube_pod_labels{component="workspace", workspace_type="regular"}
    ) by (reason) > 5
  |||,
},
{
  // Critical alert: the summed STOPPING-phase gauge for REGULAR workspaces
  // stays above 5 for 20 minutes.
  alert: 'GitpodWorkspaceStuckOnStopping',
  labels: {
    severity: 'critical',
  },
  'for': '20m',
  annotations: {
    runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceStuckOnStopping.md',
    // Wording follows the strict `> 5` comparison used in `expr`.
    summary: 'more than 5 workspaces are stuck on stopping',
    // The expression selects on the `type` label and only drops `phase`,
    // so `type` is the label available to the template here.
    description: '{{ printf "%.2f" $value }} {{ $labels.type }} workspaces are stuck on stopping for more than 20 minutes.',
  },
  // Sums over `phase` only; every other label (including `type`) is kept.
  expr: |||
    sum(
      gitpod_ws_manager_workspace_phase_total{type="REGULAR", phase="STOPPING"}
    ) without(phase) > 5
  |||,
},
{
  // Critical alert on the per-second rate of REGULAR workspace stops with
  // reason="failed", measured over a 5 minute window. No `for` clause is set.
  alert: 'GitpodWorkspaceHighFailureRate',
  expr: |||
    rate(gitpod_ws_manager_workspace_stops_total{reason="failed", type="REGULAR"}[5m]) >= 1
  |||,
  labels: { severity: 'critical' },
  annotations: {
    summary: 'Workspaces are failing',
    description: 'Multiple workspaces are failing for the last 5 minutes',
    runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceHighFailureRate.md',
  },
},
{
  // Warning alert: the total rate of bridge status updates is zero while the
  // total rate of StartWorkspace gRPC calls is non-zero, for 10 minutes.
  alert: 'GitpodWorkspaceStatusUpdatesCeased',
  // Both sides are label-less sums, so the AND matches them directly.
  expr: |||
    sum(rate(gitpod_ws_manager_bridge_status_updates_total[1m])) == 0 AND sum(rate(grpc_client_handled_total{grpc_method="StartWorkspace", grpc_service="wsman.WorkspaceManager"}[1m])) != 0
  |||,
  'for': '10m',
  labels: { severity: 'warning' },
  annotations: {
    summary: 'meta has not seen a workspace update in the last 10 minutes despite starting workspaces',
    description: 'meta has not seen a workspace update in the last 10 minutes despite starting workspaces',
    // No runbook is linked for this alert; the literal value is kept as-is.
    runbook_url: 'none',
  },
},
{
  // Critical alert: the not-active percentage metric is above 0.15 while the
  // summed workspace activity total is above 100, for 15 minutes.
  alert: 'GitpodWorkspaceTooManyRegularNotActive',
  expr: |||
    gitpod_workspace_regular_not_active_percentage > 0.15 AND sum(gitpod_ws_manager_workspace_activity_total) > 100
  |||,
  'for': '15m',
  labels: { severity: 'critical' },
  annotations: {
    summary: 'too many running but inactive workspaces',
    description: 'too many running but inactive workspaces',
    runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceTooManyRegularNotActive.md',
  },
},
{
  // Critical alert: the not-active percentage has been above zero while the
  // total rate of REGULAR workspace startups is zero, for 10 minutes.
  alert: 'GitpodWorkspacesNotStarting',
  labels: {
    severity: 'critical',
  },
  'for': '10m',
  annotations: {
    // NOTE(review): the runbook file name is singular ("WorkspaceNotStarting")
    // while the alert name is plural — confirm the link resolves.
    runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceNotStarting.md',
    summary: 'workspaces are not starting',
    description: 'inactive regular workspaces exists but workspaces are not being started',
  },
  // The startup rate is aggregated with sum() and matched with `on()` so the
  // AND does not depend on the two sides sharing identical label sets. An
  // unaggregated rate always carries at least the `type` label from its
  // selector, which the default AND matching would require on the left-hand
  // side as well.
  expr: |||
    avg_over_time(gitpod_workspace_regular_not_active_percentage[1m]) > 0
    AND on()
    sum(rate(gitpod_ws_manager_workspace_startup_seconds_sum{type="REGULAR"}[1m])) == 0
  |||,
},
{
  // Critical alert: a PENDING-phase series for REGULAR workspaces stays
  // above 20 for 15 minutes. The comparison is applied per series.
  alert: 'GitpodTooManyWorkspacesInPending',
  expr: |||
    gitpod_ws_manager_workspace_phase_total{phase="PENDING", type="REGULAR"} > 20
  |||,
  'for': '15m',
  labels: { severity: 'critical' },
  annotations: {
    summary: 'workspaces are in pending phase',
    description: 'regular workspaces are stuck in pending phase',
    runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodTooManyWorkspacesInPending.md',
  },
},
{
  // Critical alert: a PENDING-phase series for PREBUILD workspaces stays
  // above 20 for 15 minutes. The comparison is applied per series.
  alert: 'GitpodTooManyPrebuildsInPending',
  labels: {
    severity: 'critical',
  },
  'for': '15m',
  annotations: {
    runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodTooManyPrebuildsInPending.md',
    // Names prebuilds explicitly so the notification is distinguishable from
    // GitpodTooManyWorkspacesInPending, which selects type="REGULAR".
    summary: 'prebuilds are in pending phase',
    description: 'prebuilds are stuck in pending phase',
  },
  expr: |||
    gitpod_ws_manager_workspace_phase_total{phase="PENDING", type="PREBUILD"} > 20
  |||,
},
],
},
],
},
}