Skip to content

Commit edbd400

Browse files
committed
assets,pkg: regenerate
Signed-off-by: Simon Pasquier <[email protected]>
1 parent d6102e6 commit edbd400

File tree

2 files changed: 49 additions and 76 deletions.

assets/prometheus-k8s/rules.yaml

Lines changed: 45 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -862,11 +862,31 @@ spec:
862862
record: cluster:capacity_cpu_cores:sum
863863
- expr: |
864864
clamp_max(
865-
(
866-
label_replace( ( ( sum (node_cpu_info) by (instance, package, core) ) > 1 ), "label_node_hyperthread_enabled", "true", "instance", "(.*)" )
867-
or on (instance, package)
868-
label_replace( ( ( sum (node_cpu_info) by (instance, package, core) ) <= 1 ), "label_node_hyperthread_enabled", "false", "instance", "(.*)" )
869-
), 1
865+
label_replace(
866+
sum by(instance, package, core) (
867+
node_cpu_info{core!="",package!=""}
868+
or
869+
# Assume core = cpu and package = 0 for platforms that don't expose core/package labels.
870+
label_replace(label_join(node_cpu_info{core="",package=""}, "core", "", "cpu"), "package", "0", "package", "")
871+
) > 1,
872+
"label_node_hyperthread_enabled",
873+
"true",
874+
"instance",
875+
"(.*)"
876+
) or on (instance, package)
877+
label_replace(
878+
sum by(instance, package, core) (
879+
label_replace(node_cpu_info{core!="",package!=""}
880+
or
881+
# Assume core = cpu and package = 0 for platforms that don't expose core/package labels.
882+
label_join(node_cpu_info{core="",package=""}, "core", "", "cpu"), "package", "0", "package", "")
883+
) <= 1,
884+
"label_node_hyperthread_enabled",
885+
"false",
886+
"instance",
887+
"(.*)"
888+
),
889+
1
870890
)
871891
record: cluster:cpu_core_hyperthreading
872892
- expr: |
@@ -1175,7 +1195,7 @@ spec:
11751195
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
11761196
summary: Network interface is reporting many receive errors.
11771197
expr: |
1178-
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
1198+
increase(node_network_receive_errs_total[2m]) > 10
11791199
for: 1h
11801200
labels:
11811201
severity: warning
@@ -1185,7 +1205,7 @@ spec:
11851205
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
11861206
summary: Network interface is reporting many transmit errors.
11871207
expr: |
1188-
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
1208+
increase(node_network_transmit_errs_total[2m]) > 10
11891209
for: 1h
11901210
labels:
11911211
severity: warning
@@ -1232,8 +1252,6 @@ spec:
12321252
summary: Clock not synchronising.
12331253
expr: |
12341254
min_over_time(node_timex_sync_status[5m]) == 0
1235-
and
1236-
node_timex_maxerror_seconds >= 16
12371255
for: 10m
12381256
labels:
12391257
severity: warning
@@ -2106,9 +2124,7 @@ spec:
21062124
rules:
21072125
- alert: etcdMembersDown
21082126
annotations:
2109-
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
2110-
}}).'
2111-
summary: etcd cluster members are down.
2127+
message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
21122128
expr: |
21132129
max without (endpoint) (
21142130
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
@@ -2123,41 +2139,36 @@ spec:
21232139
severity: critical
21242140
- alert: etcdInsufficientMembers
21252141
annotations:
2126-
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
2142+
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
21272143
}}).'
2128-
summary: etcd cluster has insufficient number of members.
21292144
expr: |
21302145
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
21312146
for: 3m
21322147
labels:
21332148
severity: critical
21342149
- alert: etcdNoLeader
21352150
annotations:
2136-
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance
2137-
}} has no leader.'
2138-
summary: etcd cluster has no leader.
2151+
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
2152+
has no leader.'
21392153
expr: |
21402154
etcd_server_has_leader{job=~".*etcd.*"} == 0
21412155
for: 1m
21422156
labels:
21432157
severity: critical
21442158
- alert: etcdHighNumberOfLeaderChanges
21452159
annotations:
2146-
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
2147-
within the last 15 minutes. Frequent elections may be a sign of insufficient
2148-
resources, high network latency, or disruptions by other components and
2149-
should be investigated.'
2150-
summary: etcd cluster has high number of leader changes.
2160+
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within
2161+
the last 15 minutes. Frequent elections may be a sign of insufficient resources,
2162+
high network latency, or disruptions by other components and should be investigated.'
21512163
expr: |
21522164
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
21532165
for: 5m
21542166
labels:
21552167
severity: warning
21562168
- alert: etcdGRPCRequestsSlow
21572169
annotations:
2158-
description: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
2170+
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
21592171
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
2160-
summary: etcd grpc requests are slow
21612172
expr: |
21622173
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
21632174
> 0.15
@@ -2166,10 +2177,8 @@ spec:
21662177
severity: critical
21672178
- alert: etcdMemberCommunicationSlow
21682179
annotations:
2169-
description: 'etcd cluster "{{ $labels.job }}": member communication with
2170-
{{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
2171-
}}.'
2172-
summary: etcd cluster member communication is slow.
2180+
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
2181+
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
21732182
expr: |
21742183
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
21752184
> 0.15
@@ -2178,40 +2187,27 @@ spec:
21782187
severity: warning
21792188
- alert: etcdHighNumberOfFailedProposals
21802189
annotations:
2181-
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
2190+
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
21822191
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
2183-
summary: etcd cluster has high number of proposal failures.
21842192
expr: |
21852193
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
21862194
for: 15m
21872195
labels:
21882196
severity: warning
21892197
- alert: etcdHighFsyncDurations
21902198
annotations:
2191-
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
2199+
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations
21922200
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
2193-
summary: etcd cluster 99th percentile fsync durations are too high.
21942201
expr: |
21952202
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
21962203
> 0.5
21972204
for: 10m
21982205
labels:
21992206
severity: warning
2200-
- alert: etcdHighFsyncDurations
2201-
annotations:
2202-
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations
2203-
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
2204-
expr: |
2205-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
2206-
> 1
2207-
for: 10m
2208-
labels:
2209-
severity: critical
22102207
- alert: etcdHighCommitDurations
22112208
annotations:
2212-
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
2209+
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
22132210
{{ $value }}s on etcd instance {{ $labels.instance }}.'
2214-
summary: etcd cluster 99th percentile commit durations are too high.
22152211
expr: |
22162212
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
22172213
> 0.25
@@ -2220,9 +2216,8 @@ spec:
22202216
severity: warning
22212217
- alert: etcdHighNumberOfFailedHTTPRequests
22222218
annotations:
2223-
description: '{{ $value }}% of requests for {{ $labels.method }} failed on
2224-
etcd instance {{ $labels.instance }}'
2225-
summary: etcd has high number of failed HTTP requests.
2219+
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
2220+
instance {{ $labels.instance }}'
22262221
expr: |
22272222
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
22282223
without (code) > 0.01
@@ -2231,9 +2226,8 @@ spec:
22312226
severity: warning
22322227
- alert: etcdHighNumberOfFailedHTTPRequests
22332228
annotations:
2234-
description: '{{ $value }}% of requests for {{ $labels.method }} failed on
2235-
etcd instance {{ $labels.instance }}.'
2236-
summary: etcd has high number of failed HTTP requests.
2229+
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
2230+
instance {{ $labels.instance }}.'
22372231
expr: |
22382232
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
22392233
without (code) > 0.05
@@ -2242,32 +2236,11 @@ spec:
22422236
severity: critical
22432237
- alert: etcdHTTPRequestsSlow
22442238
annotations:
2245-
description: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
2239+
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
22462240
}} are slow.
2247-
summary: etcd instance HTTP requests are slow.
22482241
expr: |
22492242
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
22502243
> 0.15
22512244
for: 10m
22522245
labels:
22532246
severity: warning
2254-
- alert: etcdBackendQuotaLowSpace
2255-
annotations:
2256-
message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined
2257-
quota on etcd instance {{ $labels.instance }}, please defrag or increase
2258-
the quota as the writes to etcd will be disabled when it is full.'
2259-
expr: |
2260-
(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
2261-
for: 10m
2262-
labels:
2263-
severity: critical
2264-
- alert: etcdExcessiveDatabaseGrowth
2265-
annotations:
2266-
message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes
2267-
leading to 50% increase in database size over the past four hours on etcd
2268-
instance {{ $labels.instance }}, please check as it might be disruptive.'
2269-
expr: |
2270-
increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
2271-
for: 10m
2272-
labels:
2273-
severity: warning

pkg/manifests/bindata.go

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)