diff --git a/controlplane/kubeadm/internal/cluster.go b/controlplane/kubeadm/internal/cluster.go index f4b0b7fac16e..43be58ec1aa3 100644 --- a/controlplane/kubeadm/internal/cluster.go +++ b/controlplane/kubeadm/internal/cluster.go @@ -181,7 +181,7 @@ func (m *Management) getApiServerEtcdClientCert(ctx context.Context, clusterKey type healthCheck func(context.Context) (HealthCheckResult, error) -// HealthCheck will run a generic health check function and report any errors discovered. +// healthCheck will run a generic health check function and report any errors discovered. // In addition to the health check, it also ensures there is a 1;1 match between nodes and machines. func (m *Management) healthCheck(ctx context.Context, check healthCheck, clusterKey client.ObjectKey) error { var errorList []error diff --git a/controlplane/kubeadm/internal/etcd/etcd.go b/controlplane/kubeadm/internal/etcd/etcd.go index a71fbb898de7..ffb87c02279d 100644 --- a/controlplane/kubeadm/internal/etcd/etcd.go +++ b/controlplane/kubeadm/internal/etcd/etcd.go @@ -24,6 +24,7 @@ import ( "github.com/pkg/errors" "go.etcd.io/etcd/clientv3" + "go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes" "go.etcd.io/etcd/etcdserver/etcdserverpb" "google.golang.org/grpc" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/proxy" @@ -46,6 +47,7 @@ type etcd interface { MemberUpdate(ctx context.Context, id uint64, peerURLs []string) (*clientv3.MemberUpdateResponse, error) MoveLeader(ctx context.Context, id uint64) (*clientv3.MoveLeaderResponse, error) Status(ctx context.Context, endpoint string) (*clientv3.StatusResponse, error) + Get(ctx context.Context, key string, opts ...clientv3.OpOption) (*clientv3.GetResponse, error) } // Client wraps an etcd client formatting its output to something more consumable. @@ -234,3 +236,16 @@ func (c *Client) Alarms(ctx context.Context) ([]MemberAlarm, error) { return memberAlarms, nil } + +// IsEndpointHealthy checks the healthiness of endpoints specified in endpoints during Client creation. +// Using the same logic used in etcdctl health command: +// "get a random key. As long as we can get the response without an error, the endpoint is health." +func (c *Client) IsEndpointHealthy(ctx context.Context) error { + _, err := c.EtcdClient.Get(ctx, "health") + + // permission denied is OK since it reports the endpoint is working, no matter of the grants on the specific key + if err == nil || err == rpctypes.ErrPermissionDenied { + return nil + } + return err +} diff --git a/controlplane/kubeadm/internal/etcd/etcd_test.go b/controlplane/kubeadm/internal/etcd/etcd_test.go index a4c219792dc2..49d0d05b9554 100644 --- a/controlplane/kubeadm/internal/etcd/etcd_test.go +++ b/controlplane/kubeadm/internal/etcd/etcd_test.go @@ -85,6 +85,7 @@ func TestEtcdMembers_WithSuccess(t *testing.T) { MemberRemoveResponse: &clientv3.MemberRemoveResponse{}, AlarmResponse: &clientv3.AlarmResponse{}, StatusResponse: &clientv3.StatusResponse{}, + GetResponse: &clientv3.GetResponse{}, } client, err := newEtcdClient(ctx, fakeEtcdClient) diff --git a/controlplane/kubeadm/internal/etcd/fake/client.go b/controlplane/kubeadm/internal/etcd/fake/client.go index 98ce83c9a2bb..42a48cfbe6a5 100644 --- a/controlplane/kubeadm/internal/etcd/fake/client.go +++ b/controlplane/kubeadm/internal/etcd/fake/client.go @@ -30,6 +30,7 @@ type FakeEtcdClient struct { MemberUpdateResponse *clientv3.MemberUpdateResponse MoveLeaderResponse *clientv3.MoveLeaderResponse StatusResponse *clientv3.StatusResponse + GetResponse *clientv3.GetResponse ErrorResponse error MovedLeader uint64 RemovedMember uint64 @@ -65,3 +66,7 @@ func (c *FakeEtcdClient) MemberUpdate(_ context.Context, _ uint64, _ []string) ( func (c *FakeEtcdClient) Status(_ context.Context, _ string) (*clientv3.StatusResponse, error) { return c.StatusResponse, nil } + +func (c *FakeEtcdClient) Get(ctx context.Context, key string, opts ...clientv3.OpOption) (*clientv3.GetResponse, error) { + return c.GetResponse, nil +} diff --git a/controlplane/kubeadm/internal/etcd_client_generator.go b/controlplane/kubeadm/internal/etcd_client_generator.go index e709a3bae48d..4b5680baca8d 100644 --- a/controlplane/kubeadm/internal/etcd_client_generator.go +++ b/controlplane/kubeadm/internal/etcd_client_generator.go @@ -21,10 +21,9 @@ import ( "crypto/tls" "github.com/pkg/errors" - kerrors "k8s.io/apimachinery/pkg/util/errors" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/rest" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/proxy" diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd.go b/controlplane/kubeadm/internal/workload_cluster_etcd.go index 78261db27911..de6a76647dd1 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd.go @@ -49,6 +49,8 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) expectedMembers := 0 response := make(map[string]error) + var memberList []*etcd.Member + nodesReportSameMemberList := true for _, node := range controlPlaneNodes.Items { name := node.Name response[name] = nil @@ -85,6 +87,12 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) } defer etcdClient.Close() + // Check healthiness of the etcd endpoint by trying to get a random key. + if err := etcdClient.IsEndpointHealthy(ctx); err != nil { + response[name] = errors.Wrap(err, "etcd member is unhealthy") + continue + } + // List etcd members. This checks that the member is healthy, because the request goes through consensus. members, err := etcdClient.Members(ctx) if err != nil { @@ -92,7 +100,13 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) continue } + memberList = members + member := etcdutil.MemberForName(members, name) + if member == nil { + response[name] = errors.Wrap(err, "failed to find the member for the machine") + continue + } // Check that the member reports no alarms. if len(member.Alarms) > 0 { @@ -116,9 +130,10 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) } else { unknownMembers := memberIDSet.Difference(knownMemberIDSet) if unknownMembers.Len() > 0 { + nodesReportSameMemberList = false response[name] = errors.Errorf("etcd member reports members IDs %v, but all previously seen etcd members reported member IDs %v", memberIDSet.UnsortedList(), knownMemberIDSet.UnsortedList()) + continue } - continue } } @@ -130,6 +145,17 @@ func (w *Workload) EtcdIsHealthy(ctx context.Context) (HealthCheckResult, error) return response, errors.Errorf("there are %d healthy etcd pods, but %d etcd members", expectedMembers, len(knownMemberIDSet)) } + // If the nodes are reporting a consistent member list make a further check ensuring that all the etcd members + // have a corresponding node (so there should not be "external" etcd members). + if nodesReportSameMemberList { + for _, m := range memberList { + // NOTE: we are using response for this check because it has a 1:1 correspondence with control plane nodes and map does support existence check + if _, ok := response[m.Name]; !ok { + return response, errors.Errorf("etcd member %q does not have a corresponding control plane nodes", m.Name) + } + } + } + return response, nil } diff --git a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go index 90c049761ee1..f5a8d36475b5 100644 --- a/controlplane/kubeadm/internal/workload_cluster_etcd_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_etcd_test.go @@ -70,10 +70,12 @@ func TestWorkload_EtcdIsHealthy(t *testing.T) { AlarmResponse: &clientv3.AlarmResponse{ Alarms: []*pb.AlarmMember{}, }, + GetResponse: &clientv3.GetResponse{}, }, }, }, } + ctx := context.Background() health, err := workload.EtcdIsHealthy(ctx) g.Expect(err).NotTo(HaveOccurred())