Skip to content

Commit 01162dd

Browse files
Merge pull request #1090 from dgrisonnet/bz-1921335
Bug 1921335: Fix and adjust ThanosSidecarUnhealthy alert
2 parents 66bbb2e + 711146a commit 01162dd

File tree

6 files changed

+57
-24
lines changed

6 files changed

+57
-24
lines changed

Diff for: assets/prometheus-k8s/prometheus-rule.yaml

+19-8
Original file line number | Diff line number | Diff line change
@@ -239,24 +239,35 @@ spec:
239239
for: 15m
240240
labels:
241241
severity: critical
242-
- name: thanos-sidecar.rules
242+
- name: thanos-sidecar
243243
rules:
244244
- alert: ThanosSidecarPrometheusDown
245245
annotations:
246-
description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect
246+
description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} cannot connect
247247
to Prometheus.
248248
summary: Thanos Sidecar cannot connect to Prometheus
249249
expr: |
250-
sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"} == 0)
251-
for: 5m
250+
sum by (job, instance) (thanos_sidecar_prometheus_up{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"} == 0)
251+
for: 1h
252252
labels:
253-
severity: critical
253+
severity: warning
254+
- alert: ThanosSidecarBucketOperationsFailed
255+
annotations:
256+
description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} bucket operations
257+
are failing
258+
summary: Thanos Sidecar bucket operations are failing
259+
expr: |
260+
rate(thanos_objstore_bucket_operation_failures_total{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}[5m]) > 0
261+
for: 1h
262+
labels:
263+
severity: warning
254264
- alert: ThanosSidecarUnhealthy
255265
annotations:
256266
description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for
257-
{{ $value }} seconds.
267+
more than {{ $value }} seconds.
258268
summary: Thanos Sidecar is unhealthy.
259269
expr: |
260-
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}) by (job, pod) >= 600
270+
time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"})) by (job,pod) >= 240
271+
for: 1h
261272
labels:
262-
severity: critical
273+
severity: warning

Diff for: assets/thanos-querier/prometheus-rule.yaml

+1-1
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ metadata:
1010
namespace: openshift-monitoring
1111
spec:
1212
groups:
13-
- name: thanos-query.rules
13+
- name: thanos-query
1414
rules:
1515
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
1616
annotations:

Diff for: assets/thanos-ruler/thanos-ruler-prometheus-rule.yaml

+8-12
Original file line number | Diff line number | Diff line change
@@ -5,12 +5,11 @@ metadata:
55
namespace: openshift-user-workload-monitoring
66
spec:
77
groups:
8-
- name: thanos-rule.rules
8+
- name: thanos-rule
99
rules:
1010
- alert: ThanosRuleQueueIsDroppingAlerts
1111
annotations:
12-
description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue
13-
alerts.
12+
description: Thanos Rule {{$labels.job}} is failing to queue alerts.
1413
summary: Thanos Rule is failing to queue alerts.
1514
expr: |
1615
sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0
@@ -19,8 +18,7 @@ spec:
1918
severity: critical
2019
- alert: ThanosRuleSenderIsFailingAlerts
2120
annotations:
22-
description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send
23-
alerts to alertmanager.
21+
description: Thanos Rule {{$labels.job}} is failing to send alerts to alertmanager.
2422
summary: Thanos Rule is failing to send alerts to alertmanager.
2523
expr: |
2624
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0
@@ -29,8 +27,7 @@ spec:
2927
severity: critical
3028
- alert: ThanosRuleHighRuleEvaluationFailures
3129
annotations:
32-
description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate
33-
rules.
30+
description: Thanos Rule {{$labels.job}} is failing to evaluate rules.
3431
summary: Thanos Rule is failing to evaluate rules.
3532
expr: |
3633
(
@@ -44,8 +41,7 @@ spec:
4441
severity: critical
4542
- alert: ThanosRuleHighRuleEvaluationWarnings
4643
annotations:
47-
description: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of
48-
evaluation warnings.
44+
description: Thanos Rule {{$labels.job}} has high number of evaluation warnings.
4945
summary: Thanos Rule has high number of evaluation warnings.
5046
expr: |
5147
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m])) > 0
@@ -54,14 +50,14 @@ spec:
5450
severity: info
5551
- alert: ThanosRuleRuleEvaluationLatencyHigh
5652
annotations:
57-
description: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation
53+
description: Thanos Rule {{$labels.job}}/{{$labels.instance}} has higher evaluation
5854
latency than interval for {{$labels.rule_group}}.
5955
summary: Thanos Rule has high rule evaluation latency.
6056
expr: |
6157
(
62-
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job="thanos-ruler"})
58+
sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job="thanos-ruler"})
6359
>
64-
sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"})
60+
sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"})
6561
)
6662
for: 5m
6763
labels:

Diff for: jsonnet/jsonnetfile.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@
6464
"subdir": "mixin"
6565
}
6666
},
67-
"version": "release-0.17"
67+
"version": "95b2fe908b29977a20f695afac33f6257bb2d3ea"
6868
}
6969
],
7070
"legacyImports": true

Diff for: jsonnet/jsonnetfile.lock.json

+2-2
Original file line number | Diff line number | Diff line change
@@ -203,8 +203,8 @@
203203
"subdir": "mixin"
204204
}
205205
},
206-
"version": "37e6ef61566c7c70793ba6d128f00c4c66cb2402",
207-
"sum": "OptiWUMOHFrRGTZhSfxV1RCeXZ90qsefGNTD4lDYVG0="
206+
"version": "95b2fe908b29977a20f695afac33f6257bb2d3ea",
207+
"sum": "bE5eEPMulYMtz0lJSxDVnCXVTg/lqSSS5aEUUH3V19A="
208208
}
209209
],
210210
"legacyImports": false

Diff for: jsonnet/patch-rules.libsonnet

+26
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,32 @@ local patchedRules = [
7474
},
7575
],
7676
},
77+
{
78+
name: 'thanos-sidecar',
79+
rules: [
80+
{
81+
alert: 'ThanosSidecarPrometheusDown',
82+
'for': '1h',
83+
labels: {
84+
severity: 'warning',
85+
},
86+
},
87+
{
88+
alert: 'ThanosSidecarBucketOperationsFailed',
89+
'for': '1h',
90+
labels: {
91+
severity: 'warning',
92+
},
93+
},
94+
{
95+
alert: 'ThanosSidecarUnhealthy',
96+
'for': '1h',
97+
labels: {
98+
severity: 'warning',
99+
},
100+
},
101+
],
102+
},
77103
];
78104

79105
local patchOrExcludeRule(rule, ruleSet, operation) =

0 commit comments

Comments
 (0)