Skip to content

Commit 2e39812

Browse files
idanovindaFxKuhughcapet
authored
Implement major upgrade result annotations (zalando#2727)
Co-authored-by: Felix Kunde <[email protected]> Co-authored-by: Polina Bungina <[email protected]>
1 parent a09b765 commit 2e39812

File tree

3 files changed

+134
-16
lines changed

3 files changed

+134
-16
lines changed

docs/administrator.md

+6
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,12 @@ It is also possible to define `maintenanceWindows` in the Postgres manifest to
8585
better control when such automated upgrades should take place after increasing
8686
the version.
8787

88+
### Upgrade annotations
89+
90+
When a major version upgrade is executed, the operator sets an annotation on the PostgreSQL resource: `last-major-upgrade-success` if the upgrade succeeds, or `last-major-upgrade-failure` if it fails. The value of the annotation is a timestamp indicating when the upgrade was attempted, e.g. `2024-01-02T15:04:05Z`.
91+
92+
If a PostgreSQL resource contains a failure annotation, the operator will not attempt to retry the upgrade during a sync event. To remove the failure annotation, revert the PostgreSQL version in the manifest back to the version the cluster is currently running. On the next sync the operator detects that no upgrade is pending and removes the failure annotation, after which a new upgrade can be requested.
93+
8894
## Non-default cluster domain
8995

9096
If your cluster uses a DNS domain other than the default `cluster.local`, this

e2e/tests/test_e2e.py

+64-15
Original file line numberDiff line numberDiff line change
@@ -1185,13 +1185,19 @@ def get_docker_image():
11851185
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
11861186
def test_major_version_upgrade(self):
11871187
"""
1188-
Test major version upgrade
1188+
Test major version upgrade: with full upgrade, maintenance window, and annotation
11891189
"""
11901190
def check_version():
    """Return the major Postgres version of pod acid-upgrade-test-0.

    Reads ``server_version`` from the ``k8s.patroni_rest`` response for the
    pod (defaulting to 0 when the key is missing) and reduces it to the
    major version number.
    """
    patroni_status = k8s.patroni_rest("acid-upgrade-test-0", "")
    server_version = patroni_status.get("server_version", 0)
    # server_version is a number such as 130004; drop the minor part
    return server_version // 10000
11941194

1195+
def get_annotations():
    """Return the annotations of the acid-upgrade-test postgresql resource.

    Fetches the custom object from the "default" namespace through the
    Kubernetes custom objects API and returns its ``metadata.annotations``
    mapping. An empty dict is returned when the resource carries no
    annotations, so callers can safely use ``.get()`` on the result.
    """
    pg_manifest = k8s.api.custom_objects_api.get_namespaced_custom_object(
        "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test")
    # "annotations" may be absent from metadata (or null) when none are set;
    # normalise that to an empty dict instead of raising KeyError
    return pg_manifest.get("metadata", {}).get("annotations") or {}
1200+
11951201
k8s = self.k8s
11961202
cluster_label = 'application=spilo,cluster-name=acid-upgrade-test'
11971203

@@ -1209,68 +1215,111 @@ def check_version():
12091215

12101216
master_nodes, _ = k8s.get_cluster_nodes(cluster_labels=cluster_label)
12111217
# should upgrade immediately
1212-
pg_patch_version_14 = {
1218+
pg_patch_version_13 = {
12131219
"spec": {
12141220
"postgresql": {
1215-
"version": "14"
1221+
"version": "13"
12161222
}
12171223
}
12181224
}
12191225
k8s.api.custom_objects_api.patch_namespaced_custom_object(
1220-
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14)
1226+
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_13)
12211227
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
12221228

1223-
# should have finish failover
12241229
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
12251230
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
12261231
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
1227-
self.eventuallyEqual(check_version, 14, "Version should be upgraded from 12 to 14")
1232+
self.eventuallyEqual(check_version, 13, "Version should be upgraded from 12 to 13")
1233+
1234+
# check if annotation for last upgrade's success is set
1235+
annotations = get_annotations()
1236+
self.assertIsNotNone(annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not set")
12281237

12291238
# should not upgrade because current time is not in maintenanceWindow
12301239
current_time = datetime.now()
12311240
maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
1232-
pg_patch_version_15 = {
1241+
pg_patch_version_14 = {
12331242
"spec": {
12341243
"postgresql": {
1235-
"version": "15"
1244+
"version": "14"
12361245
},
12371246
"maintenanceWindows": [
12381247
maintenance_window_future
12391248
]
12401249
}
12411250
}
12421251
k8s.api.custom_objects_api.patch_namespaced_custom_object(
1243-
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
1252+
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14)
12441253
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
12451254

1246-
# should have finish failover
12471255
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
12481256
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
12491257
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
1250-
self.eventuallyEqual(check_version, 14, "Version should not be upgraded")
1258+
self.eventuallyEqual(check_version, 13, "Version should not be upgraded")
1259+
1260+
second_annotations = get_annotations()
1261+
self.assertIsNone(second_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure should not be set")
12511262

12521263
# change the version again to trigger operator sync
12531264
maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
1254-
pg_patch_version_16 = {
1265+
pg_patch_version_15 = {
12551266
"spec": {
12561267
"postgresql": {
1257-
"version": "16"
1268+
"version": "15"
12581269
},
12591270
"maintenanceWindows": [
12601271
maintenance_window_current
12611272
]
12621273
}
12631274
}
12641275

1276+
k8s.api.custom_objects_api.patch_namespaced_custom_object(
1277+
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
1278+
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
1279+
1280+
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
1281+
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
1282+
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
1283+
self.eventuallyEqual(check_version, 15, "Version should be upgraded from 13 to 15")
1284+
1285+
# check if annotation for last upgrade's success is updated after second upgrade
1286+
third_annotations = get_annotations()
1287+
self.assertIsNotNone(third_annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not set")
1288+
self.assertNotEqual(annotations.get("last-major-upgrade-success"), third_annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not updated")
1289+
1290+
# test upgrade with failed upgrade annotation
1291+
pg_patch_version_16 = {
1292+
"metadata": {
1293+
"annotations": {
1294+
"last-major-upgrade-failure": "2024-01-02T15:04:05Z"
1295+
},
1296+
},
1297+
"spec": {
1298+
"postgresql": {
1299+
"version": "16"
1300+
},
1301+
},
1302+
}
12651303
k8s.api.custom_objects_api.patch_namespaced_custom_object(
12661304
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
12671305
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
12681306

1269-
# should have finish failover
1307+
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
1308+
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
1309+
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
1310+
self.eventuallyEqual(check_version, 15, "Version should not be upgraded because annotation for last upgrade's failure is set")
1311+
1312+
# change the version back to 15 and should remove failure annotation
1313+
k8s.api.custom_objects_api.patch_namespaced_custom_object(
1314+
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
1315+
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
1316+
12701317
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
12711318
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
12721319
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
1273-
self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")
1320+
1321+
fourth_annotations = get_annotations()
1322+
self.assertIsNone(fourth_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure is not removed")
12741323

12751324
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
12761325
def test_persistent_volume_claim_retention_policy(self):

pkg/cluster/majorversionupgrade.go

+64-1
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
package cluster
22

33
import (
4+
"context"
5+
"encoding/json"
46
"fmt"
57
"strings"
68

79
"github.com/zalando/postgres-operator/pkg/spec"
810
"github.com/zalando/postgres-operator/pkg/util"
911
v1 "k8s.io/api/core/v1"
12+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
"k8s.io/apimachinery/pkg/types"
1014
)
1115

1216
// VersionMap Map of version numbers
@@ -18,6 +22,11 @@ var VersionMap = map[string]int{
1822
"16": 160000,
1923
}
2024

25+
const (
26+
majorVersionUpgradeSuccessAnnotation = "last-major-upgrade-success"
27+
majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure"
28+
)
29+
2130
// IsBiggerPostgresVersion Compare two Postgres version numbers
2231
func IsBiggerPostgresVersion(old string, new string) bool {
2332
oldN := VersionMap[old]
@@ -54,6 +63,47 @@ func (c *Cluster) isUpgradeAllowedForTeam(owningTeam string) bool {
5463
return util.SliceContains(allowedTeams, owningTeam)
5564
}
5665

66+
func (c *Cluster) annotatePostgresResource(isSuccess bool) error {
67+
annotations := make(map[string]string)
68+
currentTime := metav1.Now().Format("2006-01-02T15:04:05Z")
69+
if isSuccess {
70+
annotations[majorVersionUpgradeSuccessAnnotation] = currentTime
71+
} else {
72+
annotations[majorVersionUpgradeFailureAnnotation] = currentTime
73+
}
74+
patchData, err := metaAnnotationsPatch(annotations)
75+
if err != nil {
76+
c.logger.Errorf("could not form patch for %s postgresql resource: %v", c.Name, err)
77+
return err
78+
}
79+
_, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.MergePatchType, patchData, metav1.PatchOptions{})
80+
if err != nil {
81+
c.logger.Errorf("failed to patch annotations to postgresql resource: %v", err)
82+
return err
83+
}
84+
return nil
85+
}
86+
87+
func (c *Cluster) removeFailuresAnnotation() error {
88+
annotationToRemove := []map[string]string{
89+
{
90+
"op": "remove",
91+
"path": fmt.Sprintf("/metadata/annotations/%s", majorVersionUpgradeFailureAnnotation),
92+
},
93+
}
94+
removePatch, err := json.Marshal(annotationToRemove)
95+
if err != nil {
96+
c.logger.Errorf("could not form removal patch for %s postgresql resource: %v", c.Name, err)
97+
return err
98+
}
99+
_, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.JSONPatchType, removePatch, metav1.PatchOptions{})
100+
if err != nil {
101+
c.logger.Errorf("failed to remove annotations from postgresql resource: %v", err)
102+
return err
103+
}
104+
return nil
105+
}
106+
57107
/*
58108
Execute upgrade when mode is set to manual or full or when the owning team is allowed for upgrade (and mode is "off").
59109
@@ -69,10 +119,19 @@ func (c *Cluster) majorVersionUpgrade() error {
69119
desiredVersion := c.GetDesiredMajorVersionAsInt()
70120

71121
if c.currentMajorVersion >= desiredVersion {
122+
if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists { // if failure annotation exists, remove it
123+
c.removeFailuresAnnotation()
124+
c.logger.Infof("removing failure annotation as the cluster is already up to date")
125+
}
72126
c.logger.Infof("cluster version up to date. current: %d, min desired: %d", c.currentMajorVersion, desiredVersion)
73127
return nil
74128
}
75129

130+
if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists {
131+
c.logger.Infof("last major upgrade failed, skipping upgrade")
132+
return nil
133+
}
134+
76135
if !isInMainternanceWindow(c.Spec.MaintenanceWindows) {
77136
c.logger.Infof("skipping major version upgrade, not in maintenance window")
78137
return nil
@@ -107,6 +166,7 @@ func (c *Cluster) majorVersionUpgrade() error {
107166
return nil
108167
}
109168

169+
isUpgradeSuccess := true
110170
numberOfPods := len(pods)
111171
if allRunning && masterPod != nil {
112172
c.logger.Infof("healthy cluster ready to upgrade, current: %d desired: %d", c.currentMajorVersion, desiredVersion)
@@ -132,11 +192,14 @@ func (c *Cluster) majorVersionUpgrade() error {
132192
result, err = c.ExecCommand(podName, "/bin/su", "postgres", "-c", upgradeCommand)
133193
}
134194
if err != nil {
195+
isUpgradeSuccess = false
196+
c.annotatePostgresResource(isUpgradeSuccess)
135197
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, err)
136198
return err
137199
}
138-
c.logger.Infof("upgrade action triggered and command completed: %s", result[:100])
139200

201+
c.annotatePostgresResource(isUpgradeSuccess)
202+
c.logger.Infof("upgrade action triggered and command completed: %s", result[:100])
140203
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "upgrade from %d to %d finished", c.currentMajorVersion, desiredVersion)
141204
}
142205
}

0 commit comments

Comments
 (0)