
Commit f355db7

Merge pull request #29169 from tjungblu/quorum_restore_test
ETCD-657: add simple quorum restore test
2 parents: 92addf5 + 2829d3c · commit f355db7

File tree: 3 files changed, +108 −7 lines

  test/extended/dr/common.go
  test/extended/dr/recovery.go
  test/extended/util/annotate/generated/zz_generated.annotations.go

test/extended/dr/common.go (+45)

@@ -599,6 +599,51 @@ EOF`, sshKeyDance, strings.Join(nonRecoveryIps, " "), restoreInternalIp, backupI
 	return runPod(oc, pod)
 }
 
+func runQuorumRestoreScript(oc *exutil.CLI, restoreNode *corev1.Node) error {
+	const name = "quorum-repair-etcd-pod"
+	framework.Logf("running quorum restore script on node: %v", restoreNode.Name)
+
+	restoreScript := fmt.Sprintf(`
+#!/bin/bash
+set -exuo pipefail
+
+# ssh key dance
+%s
+
+TARGET_NODE_NAME=%s
+ssh -i $P_KEY -o StrictHostKeyChecking=no -q core@${TARGET_NODE_NAME} <<EOF
+sudo /usr/local/bin/quorum-restore.sh
+# this will cause the pod to disappear effectively, must be the last statement
+sudo systemctl restart kubelet.service
+
+EOF`, sshKeyDance, internalIP(restoreNode))
+
+	podSpec := applycorev1.PodSpec().WithHostNetwork(true).WithRestartPolicy(corev1.RestartPolicyOnFailure)
+	podSpec.Containers = []applycorev1.ContainerApplyConfiguration{
+		*applycorev1.Container().
+			WithName("cluster-restore").
+			WithSecurityContext(applycorev1.SecurityContext().WithPrivileged(true)).
+			WithImage(image.ShellImage()).
+			WithVolumeMounts(
+				applycorev1.VolumeMount().WithName("keys").WithMountPath(sshPath),
+			).
+			WithCommand("/bin/bash", "-c", restoreScript),
+	}
+
+	podSpec.NodeSelector = map[string]string{"kubernetes.io/hostname": restoreNode.Labels["kubernetes.io/hostname"]}
+	podSpec.Tolerations = []applycorev1.TolerationApplyConfiguration{
+		*applycorev1.Toleration().WithKey("node-role.kubernetes.io/master").WithOperator(corev1.TolerationOpExists).WithEffect(corev1.TaintEffectNoSchedule),
+	}
+
+	podSpec.Volumes = []applycorev1.VolumeApplyConfiguration{
+		*applycorev1.Volume().WithName("keys").WithSecret(applycorev1.SecretVolumeSource().WithSecretName("dr-ssh")),
+	}
+
+	pod := applycorev1.Pod(name, openshiftEtcdNamespace).WithSpec(podSpec)
+	// we only run the pod and not wait for it, as it will not be tracked after the control plane comes back
+	return runPod(oc, pod)
+}
+
 func runPodAndWaitForSuccess(oc *exutil.CLI, pod *applycorev1.PodApplyConfiguration) error {
 	err := runPod(oc, pod)
 	if err != nil {
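For orientation, here is a minimal sketch of how this new helper is driven from the test case added in recovery.go below; the surrounding Ginkgo/Gomega wiring is elided, and only identifiers that appear in this diff are used.

// Sketch only: mirrors the call sequence in the new quorum-restore test below.
// The restore pod is deliberately fire-and-forget, because the kubelet restart
// on the target node tears it down before the control plane comes back.
masters := masterNodes(oc)
o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
recoveryNode := masters[2]

err := runQuorumRestoreScript(oc, recoveryNode)
o.Expect(err).ToNot(o.HaveOccurred())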

test/extended/dr/recovery.go (+61 −7)

@@ -43,7 +43,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 
 		masters := masterNodes(oc)
 		// Need one node to back up from and another to restore to
-		o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 
 		// Pick one node to back up on
 		backupNode := masters[0]
@@ -124,7 +124,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		o.Expect(err).ToNot(o.HaveOccurred())
 
 		masters := masterNodes(oc)
-		o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 		backupNode := masters[0]
 		framework.Logf("Selecting node %q as the backup host", backupNode.Name)
 		recoveryNode := masters[1]
@@ -151,11 +151,6 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 
 		// we should come back with a single etcd static pod
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), 1)
-
-		// TODO(thomas): since we're bumping resources, that should not be necessary anymore
-		// err = runOVNRepairCommands(oc, recoveryNode, nonRecoveryNodes)
-		// o.Expect(err).ToNot(o.HaveOccurred())
-
 		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
 		// CEO will bring back the other etcd static pods again
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
@@ -165,3 +160,62 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		assertPostBackupResourcesAreNotFound(oc)
 	})
 })
+
+var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h]", func() {
+	defer g.GinkgoRecover()
+
+	f := framework.NewDefaultFramework("recovery")
+	f.SkipNamespaceCreation = true
+	oc := exutil.NewCLIWithoutNamespace("recovery")
+
+	g.AfterEach(func() {
+		g.GinkgoT().Log("turning the quorum guard back on")
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": false}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure this test also ends with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+	})
+
+	g.It("[Feature:EtcdRecovery][Disruptive] Recover with quorum restore", func() {
+		// ensure the CEO can still act without quorum, doing it first so the CEO can cycle while we install ssh keys
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": true}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure each test starts with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		err = InstallSSHKeyOnControlPlaneNodes(oc)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		masters := masterNodes(oc)
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
+		recoveryNode := masters[2]
+
+		err = runQuorumRestoreScript(oc, recoveryNode)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
+		// CEO will bring back the other etcd static pods again
+		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
+		waitForOperatorsToSettle()
+	})
+})
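As a side note, the quorum-guard override is merge-patched twice with opposite values (once in the It block, once in AfterEach). A small helper could consolidate the two call sites; the sketch below assumes only the clients and imports already used in this diff, and the name setQuorumGuardDisabled is illustrative, not part of this PR.

// Hypothetical consolidation of the unsupportedConfigOverrides toggle used
// above; setQuorumGuardDisabled is not part of this PR.
func setQuorumGuardDisabled(ctx context.Context, oc *exutil.CLI, disabled bool) error {
	data := fmt.Sprintf(
		`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": %t}}}`,
		disabled)
	// merge-patch the cluster-scoped etcd operator resource, exactly as the test does
	_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(
		ctx, "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
	return err
}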

test/extended/util/annotate/generated/zz_generated.annotations.go (+2)

(generated file; diff not rendered)
