Skip to content

Commit 26d187d

Browse files
authored
Merge pull request #81 from appuio/feat/silence-handle-delayed-worker-pools
Add support for creating a separate silence for delayed machineconfigpool maintenance
2 parents 3ca4db0 + f09fb9c commit 26d187d

File tree

37 files changed

+11712
-9
lines changed

37 files changed

+11712
-9
lines changed

.cruft.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"name": "openshift-upgrade-controller",
88
"slug": "openshift-upgrade-controller",
99
"parameter_key": "openshift_upgrade_controller",
10-
"test_cases": "defaults",
10+
"test_cases": "defaults delayed-pool-silence",
1111
"add_lib": "n",
1212
"add_pp": "n",
1313
"add_golden": "y",
@@ -24,7 +24,8 @@
2424
"github_owner": "appuio",
2525
"github_name": "component-openshift-upgrade-controller",
2626
"github_url": "https://github.com/appuio/component-openshift-upgrade-controller",
27-
"_template": "https://github.com/projectsyn/commodore-component-template.git"
27+
"_template": "https://github.com/projectsyn/commodore-component-template.git",
28+
"_commit": "98d16f99766e6c6d97322dbe42e058f0e2bf73d0"
2829
}
2930
},
3031
"directory": null

.github/workflows/test.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ jobs:
3333
matrix:
3434
instance:
3535
- defaults
36+
- delayed-pool-silence
3637
defaults:
3738
run:
3839
working-directory: ${{ env.COMPONENT_NAME }}
@@ -48,6 +49,7 @@ jobs:
4849
matrix:
4950
instance:
5051
- defaults
52+
- delayed-pool-silence
5153
defaults:
5254
run:
5355
working-directory: ${{ env.COMPONENT_NAME }}

Makefile.vars.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,4 @@ KUBENT_IMAGE ?= ghcr.io/doitintl/kube-no-trouble:latest
5050
KUBENT_DOCKER ?= $(DOCKER_CMD) $(DOCKER_ARGS) $(root_volume) --entrypoint=/app/kubent $(KUBENT_IMAGE)
5151

5252
instance ?= defaults
53-
test_instances = tests/defaults.yml
53+
test_instances = tests/defaults.yml tests/delayed-pool-silence.yml

class/defaults.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ parameters:
4444
alert_matchers: {}
4545
silence_timeout_hours: 12
4646
silence_after_finish_minutes: 30
47+
handle_delayed_worker_pools: false
4748
additional_job_configuration:
4849
metadata: {}
4950
spec:

component/scripts/silence.sh

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,23 @@ set -xeuo pipefail
33

44
job_name="$JOB_metadata_name"
55

6+
if [ "${EVENT_name}" = "\"MachineConfigPoolUnpause\"" ] && [ "${EVENT_reason}" = "\"Completed\"" ]; then
7+
echo "Upgrade completed without MCP upgrade. Not creating a new silence when unpausing MCPs."
8+
exit 0
9+
fi
10+
611
curl_opts=( "https://${ALERTMANAGER_HOST}.${ALERTMANAGER_NAMESPACE}.svc.cluster.local:9095/api/v2/silences" --cacert /etc/ssl/certs/serving-certs/service-ca.crt --header 'Content-Type: application/json' --header "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" --resolve "${ALERTMANAGER_HOST}.${ALERTMANAGER_NAMESPACE}.svc.cluster.local:9095:$(getent hosts "${ALERTMANAGER_OPERATED_SERVICE}.${ALERTMANAGER_NAMESPACE}.svc.cluster.local" | awk '{print $1}' | head -n 1)" --silent )
712

813
startsAt="$(date -u +'%Y-%m-%dT%H:%M:%S' --date '-5 minutes')"
14+
# We use SILENCE_TIMEOUT_HOURS regardless of whether we create a silence for
15+
# the initial maintenance or for a delayed worker pool maintenance, when we
16+
# trigger on `UpgradeComplete` and `MachineConfigPoolUnpause`.
917
endsAt="$(date -u +'%Y-%m-%dT%H:%M:%S' --date "+${SILENCE_TIMEOUT_HOURS} hours")"
1018

11-
if [ "${EVENT_name}" = "\"Finish\"" ]; then
19+
# Expire silence on Finish or UpgradeComplete events. Also use
20+
# SILENCE_AFTER_FINISH_MINUTES when expiring silence before paused pools are
21+
# updated.
22+
if [ "${EVENT_name}" = "\"Finish\"" ] || [ "${EVENT_name}" = "\"UpgradeComplete\"" ]; then
1223
endsAt="$(date -u +'%Y-%m-%dT%H:%M:%S' --date "+${SILENCE_AFTER_FINISH_MINUTES} minutes")"
1324
fi
1425

component/silence.libsonnet

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,20 @@ local certcm = kube.ConfigMap('maintenance-silence-certs') + namespace {
8181
data:: {},
8282
};
8383

84+
local events = if params.upgrade_silence.handle_delayed_worker_pools then [
85+
'Start',
86+
'UpgradeComplete',
87+
'MachineConfigPoolUnpause',
88+
'Finish',
89+
] else [
90+
'Start',
91+
'Finish',
92+
];
93+
8494
local ujh = kube._Object('managedupgrade.appuio.io/v1beta1', 'UpgradeJobHook', 'maintenance-silence') + namespace {
8595
spec+: {
8696
selector: params.upgrade_silence.upgrade_job_selector,
87-
events: [
88-
'Start',
89-
'Finish',
90-
],
97+
events: events,
9198
template+: {
9299
spec+: {
93100
template+: {

docs/modules/ROOT/pages/references/parameters.adoc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,26 @@ default:: `30`
276276

277277
The duration, in minutes, to wait after the upgrade job has finished before expiring the silence.
278278

279+
=== `upgrade_silence.handle_delayed_worker_pools`
280+
281+
[horizontal]
282+
type:: bool
283+
default:: `false`
284+
285+
Whether to create separate silences for the initial maintenance and the delayed maintenance of one or more MachineConfigPools.
286+
287+
If set to `true`, the upgrade silence `UpgradeJobHook` is executed for the events `UpgradeComplete` (when the upgrade is complete except for delayed MachineConfigPools) and `MachineConfigPoolUnpause` (when the delayed MachineConfigPool maintenance starts) in addition to the `Start` and `Finish` events.
288+
289+
When the hook script runs for the `UpgradeComplete` event, it expires the silence after `upgrade_silence.silence_after_finish_minutes` minutes.
290+
When the hook script runs for the `MachineConfigPoolUnpause` event, it creates a silence which ends after `upgrade_silence.silence_timeout_hours` hours.
291+
292+
The `Start` and `Finish` logic remains unchanged. When the `handle_delayed_worker_pools` parameter is `true`, the `Finish` run expires the silence created by the run for `MachineConfigPoolUnpause`.
293+
294+
[NOTE]
295+
====
296+
Currently, the upgrade-controller doesn't have an event for `MachineConfigPoolComplete`.
297+
Enabling this parameter may produce undesired silences for configurations which have different delays for multiple delayed MachineConfigPools.
298+
====
279299

280300
=== `upgrade_silence.additional_job_configuration`
281301

tests/delayed-pool-silence.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
parameters:
2+
openshift_upgrade_controller:
3+
upgrade_configs:
4+
appuio-monday-afternoon:
5+
spec:
6+
maxSchedulingDelay: 1h
7+
maxUpgradeStartDelay: 1h
8+
schedule:
9+
cron: "0 10 * * 2"
10+
location: Europe/Zurich
11+
jobTemplate:
12+
metadata:
13+
labels:
14+
upgradeconfig/name: appuio-monday-afternoon
15+
spec:
16+
config:
17+
upgradeTimeout: 12h
18+
preUpgradeHealthChecks:
19+
timeout: 1h
20+
postUpgradeHealthChecks:
21+
timeout: 1h
22+
upgrade_silence:
23+
upgrade_job_selector:
24+
matchLabels: ${openshift_upgrade_controller:upgrade_configs:appuio-monday-afternoon:spec:jobTemplate:metadata:labels}
25+
alert_matchers:
26+
"only maintenance without SLOs":
27+
matchers:
28+
- name: alertname
29+
value: "Watchdog"
30+
isRegex: false
31+
isEqual: false
32+
- name: Maintenance
33+
value: "true"
34+
isRegex: false
35+
isEqual: false
36+
additional_job_configuration:
37+
metadata: {}
38+
spec: {}
39+
handle_delayed_worker_pools: true

tests/golden/defaults/openshift-upgrade-controller/openshift-upgrade-controller/90_upgrade_silence.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ rules:
2222
resources:
2323
- alertmanagers/api
2424
verbs:
25+
- create
2526
- get
2627
---
2728
apiVersion: rbac.authorization.k8s.io/v1
@@ -66,12 +67,23 @@ data:
6667
6768
job_name="$JOB_metadata_name"
6869
70+
if [ "${EVENT_name}" = "\"MachineConfigPoolUnpause\"" ] && [ "${EVENT_reason}" = "\"Completed\"" ]; then
71+
echo "Upgrade completed without MCP upgrade. Not creating a new silence when unpausing MCPs."
72+
exit 0
73+
fi
74+
6975
curl_opts=( "https://${ALERTMANAGER_HOST}.${ALERTMANAGER_NAMESPACE}.svc.cluster.local:9095/api/v2/silences" --cacert /etc/ssl/certs/serving-certs/service-ca.crt --header 'Content-Type: application/json' --header "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" --resolve "${ALERTMANAGER_HOST}.${ALERTMANAGER_NAMESPACE}.svc.cluster.local:9095:$(getent hosts "${ALERTMANAGER_OPERATED_SERVICE}.${ALERTMANAGER_NAMESPACE}.svc.cluster.local" | awk '{print $1}' | head -n 1)" --silent )
7076
7177
startsAt="$(date -u +'%Y-%m-%dT%H:%M:%S' --date '-5 minutes')"
78+
# We use SILENCE_TIMEOUT_HOURS regardless of whether we create a silence for
79+
# the initial maintenance or for a delayed worker pool maintenance, when we
80+
# trigger on `UpgradeComplete` and `MachineConfigPoolUnpause`.
7281
endsAt="$(date -u +'%Y-%m-%dT%H:%M:%S' --date "+${SILENCE_TIMEOUT_HOURS} hours")"
7382
74-
if [ "${EVENT_name}" = "\"Finish\"" ]; then
83+
# Expire silence on Finish or UpgradeComplete events. Also use
84+
# SILENCE_AFTER_FINISH_MINUTES when expiring silence before paused pools are
85+
# updated.
86+
if [ "${EVENT_name}" = "\"Finish\"" ] || [ "${EVENT_name}" = "\"UpgradeComplete\"" ]; then
7587
endsAt="$(date -u +'%Y-%m-%dT%H:%M:%S' --date "+${SILENCE_AFTER_FINISH_MINUTES} minutes")"
7688
fi
7789

tests/golden/delayed-pool-silence/openshift-upgrade-controller/apps/openshift-upgrade-controller.yaml

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
annotations: {}
5+
labels:
6+
name: openshift-upgrade-controller
7+
name: openshift-upgrade-controller
8+
namespace: appuio-openshift-upgrade-controller
9+
spec:
10+
groups:
11+
- name: drain.alerts
12+
rules:
13+
- alert: MaintenanceInProgress
14+
annotations:
15+
description: Cluster is currently upgrading
16+
message: An OpenShift upgrade is in progress on this cluster
17+
summary: Cluster is currently upgrading
18+
expr: |
19+
max(max_over_time(openshift_upgrade_controller_upgradejob_state{state="active"} [10m])) > 0
20+
for: 0m
21+
labels:
22+
Maintenance: 'true'
23+
severity: info
24+
syn: 'true'
25+
syn_component: openshift-upgrade-controller
26+
- alert: NodeDrainStuck
27+
annotations:
28+
description: Node {{$labels.node}} is draining for more than 10 minutes.
29+
message: Node {{$labels.node}} is draining for more than 10 minutes.
30+
runbook_url: https://hub.syn.tools/openshift-upgrade-controller/runbooks/NodeDrainStuck.html
31+
summary: Node is draining for more than 10 minutes.
32+
expr: |
33+
openshift_upgrade_controller_node_draining == 1
34+
for: 15m
35+
labels:
36+
Maintenance: 'true'
37+
severity: warning
38+
syn: 'true'
39+
syn_component: openshift-upgrade-controller
40+
- alert: PausedMachineConfigPool
41+
annotations:
42+
description: |
43+
MachineConfigPool {{$labels.pool}} is paused. A paused MachineConfigPool will likely block the next maintenance.
44+
message: MachineConfigPool {{$labels.pool}} is paused.
45+
runbook_url: https://hub.syn.tools/openshift-upgrade-controller/runbooks/PausedMachineConfigPool.html
46+
summary: Paused MachineConfigPool
47+
expr: |
48+
group(openshift_upgrade_controller_machine_config_pools_paused > 0) by (pool) unless on() group(openshift_upgrade_controller_upgradejob_state{state=~"active|paused"})
49+
for: 2h
50+
labels:
51+
severity: warning
52+
syn: 'true'
53+
syn_component: openshift-upgrade-controller
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
apiVersion: managedupgrade.appuio.io/v1beta1
2+
kind: UpgradeConfig
3+
metadata:
4+
annotations: {}
5+
labels:
6+
name: appuio-monday-afternoon
7+
name: appuio-monday-afternoon
8+
namespace: appuio-openshift-upgrade-controller
9+
spec:
10+
jobTemplate:
11+
metadata:
12+
labels:
13+
upgradeconfig/name: appuio-monday-afternoon
14+
spec:
15+
config:
16+
postUpgradeHealthChecks:
17+
timeout: 1h
18+
preUpgradeHealthChecks:
19+
timeout: 1h
20+
upgradeTimeout: 12h
21+
maxSchedulingDelay: 1h
22+
maxUpgradeStartDelay: 1h
23+
schedule:
24+
cron: 0 10 * * 2
25+
location: Europe/Zurich

tests/golden/delayed-pool-silence/openshift-upgrade-controller/openshift-upgrade-controller/22_upgradejobhooks.yaml

Whitespace-only changes.

tests/golden/delayed-pool-silence/openshift-upgrade-controller/openshift-upgrade-controller/24_upgradesuspensionwindows.yaml

Whitespace-only changes.

tests/golden/delayed-pool-silence/openshift-upgrade-controller/openshift-upgrade-controller/26_nodeforcedrains.yaml

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
apiVersion: rbac.authorization.k8s.io/v1
2+
kind: ClusterRole
3+
metadata:
4+
annotations: {}
5+
labels:
6+
name: syn-openshift-upgrade-controller-view
7+
rbac.authorization.k8s.io/aggregate-to-admin: 'true'
8+
rbac.authorization.k8s.io/aggregate-to-edit: 'true'
9+
rbac.authorization.k8s.io/aggregate-to-view: 'true'
10+
name: syn:openshift-upgrade-controller:view
11+
rules:
12+
- apiGroups:
13+
- managedupgrade.appuio.io
14+
resources:
15+
- clusterversions
16+
- upgradeconfigs
17+
- upgradejobs
18+
- upgradejobhooks
19+
verbs:
20+
- get
21+
- list
22+
- watch
23+
---
24+
apiVersion: rbac.authorization.k8s.io/v1
25+
kind: ClusterRole
26+
metadata:
27+
annotations: {}
28+
labels:
29+
name: syn-openshift-upgrade-controller-edit
30+
rbac.authorization.k8s.io/aggregate-to-admin: 'true'
31+
rbac.authorization.k8s.io/aggregate-to-edit: 'true'
32+
name: syn:openshift-upgrade-controller:edit
33+
rules:
34+
- apiGroups:
35+
- managedupgrade.appuio.io
36+
resources:
37+
- clusterversions
38+
- upgradeconfigs
39+
- upgradejobs
40+
- upgradejobhooks
41+
verbs:
42+
- create
43+
- delete
44+
- deletecollection
45+
- patch
46+
- update

tests/golden/delayed-pool-silence/openshift-upgrade-controller/openshift-upgrade-controller/90_admin_ack.yaml

Whitespace-only changes.

0 commit comments

Comments
 (0)