
Commit f355db7

Merge pull request #29169 from tjungblu/quorum_restore_test
ETCD-657: add simple quorum restore test
2 parents: 92addf5 + 2829d3c · commit f355db7

File tree: 3 files changed, +108 −7 lines

  test/extended/dr/common.go
  test/extended/dr/recovery.go
  test/extended/util/annotate/generated/zz_generated.annotations.go

test/extended/dr/common.go (+45)

@@ -599,6 +599,51 @@ EOF`, sshKeyDance, strings.Join(nonRecoveryIps, " "), restoreInternalIp, backupI
 	return runPod(oc, pod)
 }
 
+func runQuorumRestoreScript(oc *exutil.CLI, restoreNode *corev1.Node) error {
+	const name = "quorum-repair-etcd-pod"
+	framework.Logf("running quorum restore script on node: %v", restoreNode.Name)
+
+	restoreScript := fmt.Sprintf(`
+#!/bin/bash
+set -exuo pipefail
+
+# ssh key dance
+%s
+
+TARGET_NODE_NAME=%s
+ssh -i $P_KEY -o StrictHostKeyChecking=no -q core@${TARGET_NODE_NAME} <<EOF
+sudo /usr/local/bin/quorum-restore.sh
+# this will cause the pod to disappear effectively, must be the last statement
+sudo systemctl restart kubelet.service
+
+EOF`, sshKeyDance, internalIP(restoreNode))
+
+	podSpec := applycorev1.PodSpec().WithHostNetwork(true).WithRestartPolicy(corev1.RestartPolicyOnFailure)
+	podSpec.Containers = []applycorev1.ContainerApplyConfiguration{
+		*applycorev1.Container().
+			WithName("cluster-restore").
+			WithSecurityContext(applycorev1.SecurityContext().WithPrivileged(true)).
+			WithImage(image.ShellImage()).
+			WithVolumeMounts(
+				applycorev1.VolumeMount().WithName("keys").WithMountPath(sshPath),
+			).
+			WithCommand("/bin/bash", "-c", restoreScript),
+	}
+
+	podSpec.NodeSelector = map[string]string{"kubernetes.io/hostname": restoreNode.Labels["kubernetes.io/hostname"]}
+	podSpec.Tolerations = []applycorev1.TolerationApplyConfiguration{
+		*applycorev1.Toleration().WithKey("node-role.kubernetes.io/master").WithOperator(corev1.TolerationOpExists).WithEffect(corev1.TaintEffectNoSchedule),
+	}
+
+	podSpec.Volumes = []applycorev1.VolumeApplyConfiguration{
+		*applycorev1.Volume().WithName("keys").WithSecret(applycorev1.SecretVolumeSource().WithSecretName("dr-ssh")),
+	}
+
+	pod := applycorev1.Pod(name, openshiftEtcdNamespace).WithSpec(podSpec)
+	// we only run the pod and not wait for it, as it will not be tracked after the control plane comes back
+	return runPod(oc, pod)
+}
+
 func runPodAndWaitForSuccess(oc *exutil.CLI, pod *applycorev1.PodApplyConfiguration) error {
 	err := runPod(oc, pod)
 	if err != nil {
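For orientation, here is a minimal sketch of how this new helper is driven from the test case added in recovery.go below; the surrounding Ginkgo/Gomega wiring is elided, and only identifiers that appear in this diff are used.

// Sketch only: mirrors the call sequence in the new quorum-restore test below.
// The restore pod is deliberately fire-and-forget, because the kubelet restart
// on the target node tears it down before the control plane comes back.
masters := masterNodes(oc)
o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
recoveryNode := masters[2]

err := runQuorumRestoreScript(oc, recoveryNode)
o.Expect(err).ToNot(o.HaveOccurred())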

test/extended/dr/recovery.go (+61 −7)

@@ -43,7 +43,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 
 		masters := masterNodes(oc)
 		// Need one node to back up from and another to restore to
-		o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 
 		// Pick one node to back up on
 		backupNode := masters[0]
@@ -124,7 +124,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		o.Expect(err).ToNot(o.HaveOccurred())
 
 		masters := masterNodes(oc)
-		o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
 		backupNode := masters[0]
 		framework.Logf("Selecting node %q as the backup host", backupNode.Name)
 		recoveryNode := masters[1]
@@ -151,11 +151,6 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 
 		// we should come back with a single etcd static pod
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), 1)
-
-		// TODO(thomas): since we're bumping resources, that should not be necessary anymore
-		// err = runOVNRepairCommands(oc, recoveryNode, nonRecoveryNodes)
-		// o.Expect(err).ToNot(o.HaveOccurred())
-
 		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
 		// CEO will bring back the other etcd static pods again
 		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
@@ -165,3 +160,62 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/re
 		assertPostBackupResourcesAreNotFound(oc)
 	})
 })
+
+var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Suite:openshift/etcd/recovery][Timeout:1h]", func() {
+	defer g.GinkgoRecover()
+
+	f := framework.NewDefaultFramework("recovery")
+	f.SkipNamespaceCreation = true
+	oc := exutil.NewCLIWithoutNamespace("recovery")
+
+	g.AfterEach(func() {
+		g.GinkgoT().Log("turning the quorum guard back on")
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": false}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure this test also ends with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+	})
+
+	g.It("[Feature:EtcdRecovery][Disruptive] Recover with quorum restore", func() {
+		// ensure the CEO can still act without quorum, doing it first so the CEO can cycle while we install ssh keys
+		data := fmt.Sprintf(`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": true}}}`)
+		_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		// we need to ensure each test starts with a stable revision for api and etcd
+		g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+		err = waitForApiServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for APIServer pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		g.GinkgoT().Log("waiting for etcd to stabilize on the same revision")
+		err = waitForEtcdToStabilizeOnTheSameRevision(g.GinkgoT(), oc)
+		err = errors.Wrap(err, "cleanup timed out waiting for etcd pods to stabilize on the same revision")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		err = InstallSSHKeyOnControlPlaneNodes(oc)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		masters := masterNodes(oc)
+		o.Expect(len(masters)).To(o.BeNumerically(">=", 3))
+		recoveryNode := masters[2]
+
+		err = runQuorumRestoreScript(oc, recoveryNode)
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
+		// CEO will bring back the other etcd static pods again
+		waitForReadyEtcdStaticPods(oc.AdminKubeClient(), len(masters))
+		waitForOperatorsToSettle()
+	})
+})
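As a side note, the quorum-guard override is merge-patched twice with opposite values (once in the It block, once in AfterEach). A small helper could consolidate the two call sites; the sketch below assumes only the clients and imports already used in this diff, and the name setQuorumGuardDisabled is illustrative, not part of this PR.

// Hypothetical consolidation of the unsupportedConfigOverrides toggle used
// above; setQuorumGuardDisabled is not part of this PR.
func setQuorumGuardDisabled(ctx context.Context, oc *exutil.CLI, disabled bool) error {
	data := fmt.Sprintf(
		`{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": %t}}}`,
		disabled)
	// merge-patch the cluster-scoped etcd operator resource, exactly as the test does
	_, err := oc.AdminOperatorClient().OperatorV1().Etcds().Patch(
		ctx, "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{})
	return err
}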

test/extended/util/annotate/generated/zz_generated.annotations.go (+2)

(generated file; diff not rendered)
