@@ -862,11 +862,31 @@ spec:
862
862
record: cluster:capacity_cpu_cores:sum
863
863
- expr : |
864
864
clamp_max(
865
- (
866
- label_replace( ( ( sum (node_cpu_info) by (instance, package, core) ) > 1 ), "label_node_hyperthread_enabled", "true", "instance", "(.*)" )
867
- or on (instance, package)
868
- label_replace( ( ( sum (node_cpu_info) by (instance, package, core) ) <= 1 ), "label_node_hyperthread_enabled", "false", "instance", "(.*)" )
869
- ), 1
865
+ label_replace(
866
+ sum by(instance, package, core) (
867
+ node_cpu_info{core!="",package!=""}
868
+ or
869
+ # Assume core = cpu and package = 0 for platforms that don't expose core/package labels.
870
+ label_replace(label_join(node_cpu_info{core="",package=""}, "core", "", "cpu"), "package", "0", "package", "")
871
+ ) > 1,
872
+ "label_node_hyperthread_enabled",
873
+ "true",
874
+ "instance",
875
+ "(.*)"
876
+ ) or on (instance, package)
877
+ label_replace(
878
+ sum by(instance, package, core) (
879
+ label_replace(node_cpu_info{core!="",package!=""}
880
+ or
881
+ # Assume core = cpu and package = 0 for platforms that don't expose core/package labels.
882
+ label_join(node_cpu_info{core="",package=""}, "core", "", "cpu"), "package", "0", "package", "")
883
+ ) <= 1,
884
+ "label_node_hyperthread_enabled",
885
+ "false",
886
+ "instance",
887
+ "(.*)"
888
+ ),
889
+ 1
870
890
)
871
891
record: cluster:cpu_core_hyperthreading
872
892
- expr : |
@@ -1175,7 +1195,7 @@ spec:
1175
1195
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
1176
1196
summary : Network interface is reporting many receive errors.
1177
1197
expr : |
1178
- rate (node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
1198
+ increase (node_network_receive_errs_total[2m]) > 10
1179
1199
for : 1h
1180
1200
labels :
1181
1201
severity : warning
@@ -1185,7 +1205,7 @@ spec:
1185
1205
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
1186
1206
summary : Network interface is reporting many transmit errors.
1187
1207
expr : |
1188
- rate (node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
1208
+ increase (node_network_transmit_errs_total[2m]) > 10
1189
1209
for : 1h
1190
1210
labels :
1191
1211
severity : warning
@@ -1232,8 +1252,6 @@ spec:
1232
1252
summary : Clock not synchronising.
1233
1253
expr : |
1234
1254
min_over_time(node_timex_sync_status[5m]) == 0
1235
- and
1236
- node_timex_maxerror_seconds >= 16
1237
1255
for : 10m
1238
1256
labels :
1239
1257
severity : warning
@@ -2106,9 +2124,7 @@ spec:
2106
2124
rules :
2107
2125
- alert : etcdMembersDown
2108
2126
annotations :
2109
- description : ' etcd cluster "{{ $labels.job }}": members are down ({{ $value
2110
- }}).'
2111
- summary : etcd cluster members are down.
2127
+ message : ' etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
2112
2128
expr : |
2113
2129
max without (endpoint) (
2114
2130
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
@@ -2123,41 +2139,36 @@ spec:
2123
2139
severity : critical
2124
2140
- alert : etcdInsufficientMembers
2125
2141
annotations :
2126
- description : ' etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
2142
+ message : ' etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
2127
2143
}}).'
2128
- summary : etcd cluster has insufficient number of members.
2129
2144
expr : |
2130
2145
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
2131
2146
for : 3m
2132
2147
labels :
2133
2148
severity : critical
2134
2149
- alert : etcdNoLeader
2135
2150
annotations :
2136
- description : ' etcd cluster "{{ $labels.job }}": member {{ $labels.instance
2137
- }} has no leader.'
2138
- summary : etcd cluster has no leader.
2151
+ message : ' etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
2152
+ has no leader.'
2139
2153
expr : |
2140
2154
etcd_server_has_leader{job=~".*etcd.*"} == 0
2141
2155
for : 1m
2142
2156
labels :
2143
2157
severity : critical
2144
2158
- alert : etcdHighNumberOfLeaderChanges
2145
2159
annotations :
2146
- description : ' etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
2147
- within the last 15 minutes. Frequent elections may be a sign of insufficient
2148
- resources, high network latency, or disruptions by other components and
2149
- should be investigated.'
2150
- summary : etcd cluster has high number of leader changes.
2160
+ message : ' etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within
2161
+ the last 15 minutes. Frequent elections may be a sign of insufficient resources,
2162
+ high network latency, or disruptions by other components and should be investigated.'
2151
2163
expr : |
2152
2164
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
2153
2165
for : 5m
2154
2166
labels :
2155
2167
severity : warning
2156
2168
- alert : etcdGRPCRequestsSlow
2157
2169
annotations :
2158
- description : ' etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
2170
+ message : ' etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
2159
2171
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
2160
- summary : etcd grpc requests are slow
2161
2172
expr : |
2162
2173
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
2163
2174
> 0.15
@@ -2166,10 +2177,8 @@ spec:
2166
2177
severity : critical
2167
2178
- alert : etcdMemberCommunicationSlow
2168
2179
annotations :
2169
- description : ' etcd cluster "{{ $labels.job }}": member communication with
2170
- {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
2171
- }}.'
2172
- summary : etcd cluster member communication is slow.
2180
+ message : ' etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
2181
+ }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
2173
2182
expr : |
2174
2183
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
2175
2184
> 0.15
@@ -2178,40 +2187,27 @@ spec:
2178
2187
severity : warning
2179
2188
- alert : etcdHighNumberOfFailedProposals
2180
2189
annotations :
2181
- description : ' etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
2190
+ message : ' etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
2182
2191
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
2183
- summary : etcd cluster has high number of proposal failures.
2184
2192
expr : |
2185
2193
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
2186
2194
for : 15m
2187
2195
labels :
2188
2196
severity : warning
2189
2197
- alert : etcdHighFsyncDurations
2190
2198
annotations :
2191
- description : ' etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
2199
+ message : ' etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
2192
2200
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
2193
- summary : etcd cluster 99th percentile fsync durations are too high.
2194
2201
expr : |
2195
2202
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
2196
2203
> 0.5
2197
2204
for : 10m
2198
2205
labels :
2199
2206
severity : warning
2200
- - alert : etcdHighFsyncDurations
2201
- annotations :
2202
- message : ' etcd cluster "{{ $labels.job }}": 99th percentile fync durations
2203
- are {{ $value }}s on etcd instance {{ $labels.instance }}.'
2204
- expr : |
2205
- histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
2206
- > 1
2207
- for : 10m
2208
- labels :
2209
- severity : critical
2210
2207
- alert : etcdHighCommitDurations
2211
2208
annotations :
2212
- description : ' etcd cluster "{{ $labels.job }}": 99th percentile commit durations
2209
+ message : ' etcd cluster "{{ $labels.job }}": 99th percentile commit durations
2213
2210
{{ $value }}s on etcd instance {{ $labels.instance }}.'
2214
- summary : etcd cluster 99th percentile commit durations are too high.
2215
2211
expr : |
2216
2212
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
2217
2213
> 0.25
@@ -2220,9 +2216,8 @@ spec:
2220
2216
severity : warning
2221
2217
- alert : etcdHighNumberOfFailedHTTPRequests
2222
2218
annotations :
2223
- description : ' {{ $value }}% of requests for {{ $labels.method }} failed on
2224
- etcd instance {{ $labels.instance }}'
2225
- summary : etcd has high number of failed HTTP requests.
2219
+ message : ' {{ $value }}% of requests for {{ $labels.method }} failed on etcd
2220
+ instance {{ $labels.instance }}'
2226
2221
expr : |
2227
2222
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
2228
2223
without (code) > 0.01
@@ -2231,9 +2226,8 @@ spec:
2231
2226
severity : warning
2232
2227
- alert : etcdHighNumberOfFailedHTTPRequests
2233
2228
annotations :
2234
- description : ' {{ $value }}% of requests for {{ $labels.method }} failed on
2235
- etcd instance {{ $labels.instance }}.'
2236
- summary : etcd has high number of failed HTTP requests.
2229
+ message : ' {{ $value }}% of requests for {{ $labels.method }} failed on etcd
2230
+ instance {{ $labels.instance }}.'
2237
2231
expr : |
2238
2232
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
2239
2233
without (code) > 0.05
@@ -2242,32 +2236,11 @@ spec:
2242
2236
severity : critical
2243
2237
- alert : etcdHTTPRequestsSlow
2244
2238
annotations :
2245
- description : etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
2239
+ message : etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
2246
2240
}} are slow.
2247
- summary : etcd instance HTTP requests are slow.
2248
2241
expr : |
2249
2242
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
2250
2243
> 0.15
2251
2244
for : 10m
2252
2245
labels :
2253
2246
severity : warning
2254
- - alert : etcdBackendQuotaLowSpace
2255
- annotations :
2256
- message : ' etcd cluster "{{ $labels.job }}": database size exceeds the defined
2257
- quota on etcd instance {{ $labels.instance }}, please defrag or increase
2258
- the quota as the writes to etcd will be disabled when it is full.'
2259
- expr : |
2260
- (etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
2261
- for : 10m
2262
- labels :
2263
- severity : critical
2264
- - alert : etcdExcessiveDatabaseGrowth
2265
- annotations :
2266
- message : ' etcd cluster "{{ $labels.job }}": Observed surge in etcd writes
2267
- leading to 50% increase in database size over the past four hours on etcd
2268
- instance {{ $labels.instance }}, please check as it might be disruptive.'
2269
- expr : |
2270
- increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
2271
- for : 10m
2272
- labels :
2273
- severity : warning
0 commit comments