@@ -43,7 +43,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 
 		masters := masterNodes(oc)
 		// Need one node to back up from and another to restore to
-		o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 
 		// Pick one node to back up on
 		backupNode := masters[0]
@@ -124,7 +124,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		o.Expect(err).ToNot(o.HaveOccurred())
 
 		masters := masterNodes(oc)
-		o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 		backupNode := masters[0]
 		framework.Logf("Selecting node %q as the backup host", backupNode.Name)
 		recoveryNode := masters[1]
@@ -151,11 +151,6 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 
 		// we should come back with a single etcd static pod
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), 1)
-
-		// TODO(thomas): since we're bumping resources, that should not be necessary anymore
-		// err = runOVNRepairCommands(oc, recoveryNode, nonRecoveryNodes)
-		// o.Expect(err).ToNot(o.HaveOccurred())
-
 		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
 		// CEO will bring back the other etcd static pods again
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
@@ -165,3 +160,62 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		assertPostBackupResourcesAreNotFound(oc)
 	})
 })
+
+var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h]", func() {
+	defer g.GinkgoRecover()
+
+	f := framework.NewDefaultFramework("recovery")
+	f.SkipNamespaceCreation = true
+	oc := exutil.NewCLIWithoutNamespace("recovery")
+
+	g.AfterEach(func() {
+		g.GinkgoT().Log("turning the quorum guard back on")
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": false}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure this test also ends with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+	})
+
+	g.It("[Feature:EtcdRecovery][Disruptive] Recover with quorum restore", func() {
+		// ensure the CEO can still act without quorum, doing it first so the CEO can cycle while we install ssh keys
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": true}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure each test starts with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		err = InstallSSHKeyOnControlPlaneNodes(oc)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		masters := masterNodes(oc)
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
+		recoveryNode := masters[2]
+
+		err = runQuorumRestoreScript(oc, recoveryNode)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
+		// CEO will bring back the other etcd static pods again
+		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
+		waitForOperatorsToSettle()
+	})
+})