
Commit 6ad5bdb

Merge pull request #1087 from simonpasquier/bz1934163
Bug 1934163: adjust Thanos querier alerting rules
2 parents: ff5fe73 + 70e4331

3 files changed: +44, -45 lines


Diff for: CHANGELOG.md (+8)

@@ -1,5 +1,13 @@
 # Note: This CHANGELOG is only for the monitoring team to track all monitoring related changes. Please see OpenShift release notes for official changes.
 
+## 4.8
+
+- [#1087](https://github.com/openshift/cluster-monitoring-operator/pull/1087) Decrease alert severity to "warning" for ThanosQueryHttpRequestQueryErrorRateHigh and ThanosQueryHttpRequestQueryRangeErrorRateHigh alerts.
+- [#1087](https://github.com/openshift/cluster-monitoring-operator/pull/1087) Increase "for" duration to 1 hour for all Thanos query alerts.
+- [#1087](https://github.com/openshift/cluster-monitoring-operator/pull/1087) Remove ThanosQueryInstantLatencyHigh and ThanosQueryRangeLatencyHigh alerts.
+- [#1090](https://github.com/openshift/cluster-monitoring-operator/pull/1090) Decrease alert severity to "warning" for all Thanos sidecar alerts.
+- [#1090](https://github.com/openshift/cluster-monitoring-operator/pull/1090) Increase "for" duration to 1 hour for all Thanos sidecar alerts.
+
 ## 4.7
 
 - [#963](https://github.com/openshift/cluster-monitoring-operator/pull/963) bump mixins to include new etcd alerts

Diff for: assets/thanos-querier/prometheus-rule.yaml (+7, -35)

@@ -23,9 +23,9 @@ spec:
         /
           sum(rate(http_requests_total{job="thanos-querier", handler="query"}[5m]))
         ) * 100 > 5
-      for: 5m
+      for: 1h
       labels:
-        severity: critical
+        severity: warning
     - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
       annotations:
         description: Thanos Query {{$labels.job}} is failing to handle {{ $value |
@@ -37,9 +37,9 @@ spec:
         /
           sum(rate(http_requests_total{job="thanos-querier", handler="query_range"}[5m]))
         ) * 100 > 5
-      for: 5m
+      for: 1h
       labels:
-        severity: critical
+        severity: warning
     - alert: ThanosQueryGrpcServerErrorRate
       annotations:
         description: Thanos Query {{$labels.job}} is failing to handle {{ $value |
@@ -52,7 +52,7 @@ spec:
           sum by (job) (rate(grpc_server_started_total{job="thanos-querier"}[5m]))
         * 100 > 5
         )
-      for: 5m
+      for: 1h
       labels:
         severity: warning
     - alert: ThanosQueryGrpcClientErrorRate
@@ -66,7 +66,7 @@ spec:
         /
           sum by (job) (rate(grpc_client_started_total{job="thanos-querier"}[5m]))
         ) * 100 > 5
-      for: 5m
+      for: 1h
       labels:
         severity: warning
     - alert: ThanosQueryHighDNSFailures
@@ -80,34 +80,6 @@ spec:
         /
           sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job="thanos-querier"}[5m]))
         ) * 100 > 1
-      for: 15m
+      for: 1h
       labels:
         severity: warning
-    - alert: ThanosQueryInstantLatencyHigh
-      annotations:
-        description: Thanos Query {{$labels.job}} has a 99th percentile latency of
-          {{ $value }} seconds for instant queries.
-        summary: Thanos Query has high latency for queries.
-      expr: |
-        (
-          histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job="thanos-querier", handler="query"}[5m]))) > 40
-        and
-          sum by (job) (rate(http_request_duration_seconds_bucket{job="thanos-querier", handler="query"}[5m])) > 0
-        )
-      for: 10m
-      labels:
-        severity: critical
-    - alert: ThanosQueryRangeLatencyHigh
-      annotations:
-        description: Thanos Query {{$labels.job}} has a 99th percentile latency of
-          {{ $value }} seconds for range queries.
-        summary: Thanos Query has high latency for queries.
-      expr: |
-        (
-          histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job="thanos-querier", handler="query_range"}[5m]))) > 90
-        and
-          sum by (job) (rate(http_request_duration_seconds_count{job="thanos-querier", handler="query_range"}[5m])) > 0
-        )
-      for: 10m
-      labels:
-        severity: critical

Diff for: jsonnet/patch-rules.libsonnet (+29, -10)

@@ -58,6 +58,13 @@ local excludedRules = [
       { alert: 'KubeDeploymentReplicasMismatch' },
     ],
   },
+  {
+    name: 'thanos-query',
+    rules: [
+      { alert: 'ThanosQueryInstantLatencyHigh' },
+      { alert: 'ThanosQueryRangeLatencyHigh' },
+    ],
+  },
 ];
 
 local patchedRules = [
@@ -78,21 +85,19 @@ local patchedRules = [
     name: 'thanos-sidecar',
     rules: [
       {
-        alert: 'ThanosSidecarPrometheusDown',
-        'for': '1h',
-        labels: {
-          severity: 'warning',
-        },
-      },
-      {
-        alert: 'ThanosSidecarBucketOperationsFailed',
+        alert: '',
         'for': '1h',
         labels: {
           severity: 'warning',
         },
       },
+    ],
+  },
+  {
+    name: 'thanos-query',
+    rules: [
       {
-        alert: 'ThanosSidecarUnhealthy',
+        alert: '',
         'for': '1h',
         labels: {
           severity: 'warning',
@@ -108,7 +113,12 @@ local patchOrExcludeRule(rule, ruleSet, operation) =
   else if (('alert' in rule && 'alert' in ruleSet[0]) && std.startsWith(rule.alert, ruleSet[0].alert)) ||
           (('record' in rule && 'record' in ruleSet[0]) && std.startsWith(rule.record, ruleSet[0].record)) then
     if operation == 'patch' then
-      [std.mergePatch(rule, ruleSet[0])]
+      local patch = {
+        [k]: ruleSet[0][k]
+        for k in std.objectFields(ruleSet[0])
+        if k != 'alert' && k != 'record'
+      };
+      [std.mergePatch(rule, patch)]
     else
       []
   else
@@ -123,6 +133,9 @@ local patchOrExcludeRuleGroup(group, groupSet, operation) =
     [] + patchOrExcludeRuleGroup(group, groupSet[1:], operation);
 
 {
+  // excludedRules removes upstream rules that we don't want to carry in CMO.
+  // It can remove specific rules from a rules group (see excludedRules) or a
+  // whole rules group (see excludedRuleGroups).
   excludeRules(o): {
     local exclude(o) = o {
       [if (o.kind == 'PrometheusRule') then 'spec']+: {
@@ -142,6 +155,12 @@ local patchOrExcludeRuleGroup(group, groupSet, operation) =
       for k in std.objectFields(o)
     },
 
+  // patchRules adapts upstream rules to comply with OpenShift requirements
+  // (such as extending the for duration, changing alert severity, and so on).
+  // The patches are defined in the patchedRules array where each item contains
+  // the name of the affected group and the list of patches keyed by their
+  // 'alert' or 'record' identifier. The function will apply the patch to every
+  // alerting/recording rule in the group whose name starts by the identifier.
   patchRules(o): {
     local patch(o) = o {
       [if (o.kind == 'PrometheusRule') then 'spec']+: {
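
The new comments describe the patching contract, and the rewritten patchOrExcludeRule now strips the 'alert'/'record' selector keys before merging. Because matching uses std.startsWith, the empty alert: '' entries added for the thanos-sidecar and thanos-query groups act as catch-all selectors for every rule in those groups. A minimal, standalone jsonnet sketch of that behaviour follows; the rule and patchEntry literals are illustrative (values taken from the ThanosQueryHighDNSFailures hunk above), not code from the repository.

// Illustrative sketch only; evaluate with the jsonnet CLI.
// 'rule' mimics an upstream alerting rule; 'patchEntry' mirrors the shape of
// the items in the patchedRules array shown in the diff.
local rule = {
  alert: 'ThanosQueryHighDNSFailures',
  'for': '15m',
  labels: { severity: 'warning' },
};
local patchEntry = { alert: '', 'for': '1h', labels: { severity: 'warning' } };

// The 'alert'/'record' keys are only selectors, so they are dropped before
// merging; this is the same filtering the new 'local patch = ...' block does.
local patch = {
  [k]: patchEntry[k]
  for k in std.objectFields(patchEntry)
  if k != 'alert' && k != 'record'
};

// std.startsWith(x, '') is true for any x, so an empty selector matches
// every alert in the group.
assert std.startsWith(rule.alert, patchEntry.alert);

std.mergePatch(rule, patch)
// evaluates to { alert: 'ThanosQueryHighDNSFailures', 'for': '1h',
//                labels: { severity: 'warning' } }

Evaluating this should print the rule with 'for' bumped to '1h', consistent with the rendered assets/thanos-querier/prometheus-rule.yaml hunk above. Keeping the prefix match also means a more specific selector (for example alert: 'ThanosQuery', a hypothetical entry) would still target only the rules whose names start with it.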
