Skip to content

✨ Update MHC with v1Beta2 status #11290

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 52 additions & 3 deletions api/v1beta1/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,22 +200,71 @@ const (
// Note: this could happen when creating the machine. However, this state should be treated as an error if it lasts indefinitely.
MachineNodeDoesNotExistV1Beta2Reason = ObjectDoesNotExistV1Beta2Reason

// MachineNodeDeletedV1Beta2Reason surfaces when the node hosted on the machine has been deleted.
// MachineNodeDeletedV1Beta2Reason surfaces when the node hosted on the machine has been deleted.
// Note: controllers can't identify if the Node was deleted by the controller itself, e.g.
// during the deletion workflow, or by a user.
MachineNodeDeletedV1Beta2Reason = ObjectDeletedV1Beta2Reason
)

// Machine's HealthCheckSucceeded and OwnerRemediated conditions and corresponding reasons that will be used in v1Beta2 API version.
// Note: HealthCheckSucceeded and OwnerRemediated condition are set by the MachineHealthCheck controller.
// Machine's HealthCheckSucceeded condition and corresponding reasons that will be used in v1Beta2 API version.
// Note: HealthCheckSucceeded condition is set by the MachineHealthCheck controller.
const (
// MachineHealthCheckSucceededV1Beta2Condition is true if MHC instances targeting this machine report the Machine
// is healthy according to the definition of healthy present in the spec of the MachineHealthCheck object.
MachineHealthCheckSucceededV1Beta2Condition = "HealthCheckSucceeded"

// MachineHealthCheckSucceededV1Beta2Reason surfaces when a machine passes all the health checks defined by a MachineHealthCheck object.
// Note: this reason intentionally uses the same string value as the condition type.
MachineHealthCheckSucceededV1Beta2Reason = "HealthCheckSucceeded"

// MachineHealthCheckUnhealthyNodeV1Beta2Reason surfaces when the node hosted on the machine does not pass the health checks
// defined by a MachineHealthCheck object.
MachineHealthCheckUnhealthyNodeV1Beta2Reason = "UnhealthyNode"

// MachineHealthCheckNodeStartupTimeoutV1Beta2Reason surfaces when the node hosted on the machine does not appear within
// the timeout defined by a MachineHealthCheck object.
MachineHealthCheckNodeStartupTimeoutV1Beta2Reason = "NodeStartupTimeout"

// MachineHealthCheckNodeDeletedV1Beta2Reason surfaces when a MachineHealthCheck detects that the node hosted on the
// machine has been deleted while the Machine is still running.
MachineHealthCheckNodeDeletedV1Beta2Reason = "NodeDeleted"

// MachineHealthCheckHasRemediateAnnotationV1Beta2Reason surfaces when a MachineHealthCheck detects a machine manually remediated
// via the remediate-machine annotation.
MachineHealthCheckHasRemediateAnnotationV1Beta2Reason = "HasRemediateAnnotation"
)

// Machine's OwnerRemediated condition and corresponding reasons that will be used in v1Beta2 API version.
// Note: OwnerRemediated condition is initially set by the MachineHealthCheck controller; then it is up to the Machine's
// owner controller to update or delete this condition.
const (
// MachineOwnerRemediatedV1Beta2Condition is only present if MHC instances targeting this machine
// determine that the controller owning this machine should perform remediation.
MachineOwnerRemediatedV1Beta2Condition = "OwnerRemediated"

// MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason surfaces when the machine is waiting for the owner controller
// to start remediation.
MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason = "WaitingForRemediation"
)

// Machine's ExternallyRemediated condition and corresponding reasons that will be used in v1Beta2 API version.
// Note: ExternallyRemediated condition is initially set by the MachineHealthCheck controller; then it is up to the external
// remediation controller to update or delete this condition.
const (
// MachineExternallyRemediatedV1Beta2Condition is only present if MHC instances targeting this machine
// determine that an external controller should perform remediation.
MachineExternallyRemediatedV1Beta2Condition = "ExternallyRemediated"

// MachineExternallyRemediatedWaitingForRemediationV1Beta2Reason surfaces when the machine is waiting for the
// external remediation controller to start remediation.
MachineExternallyRemediatedWaitingForRemediationV1Beta2Reason = "WaitingForRemediation"

// MachineExternallyRemediatedRemediationTemplateNotFoundV1Beta2Reason surfaces when the MachineHealthCheck cannot
// find the template for an external remediation request.
MachineExternallyRemediatedRemediationTemplateNotFoundV1Beta2Reason = "RemediationTemplateNotFound"

// MachineExternallyRemediatedRemediationRequestCreationFailedV1Beta2Reason surfaces when the MachineHealthCheck cannot
// create a request for the external remediation controller.
MachineExternallyRemediatedRemediationRequestCreationFailedV1Beta2Reason = "RemediationRequestCreationFailed"
)

// Machine's Deleting condition and corresponding reasons that will be used in v1Beta2 API version.
Expand Down
21 changes: 21 additions & 0 deletions api/v1beta1/machinehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,27 @@ import (
"k8s.io/apimachinery/pkg/util/intstr"
)

// MachineHealthCheck's RemediationAllowed condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// MachineHealthCheckRemediationAllowedV1Beta2Condition surfaces whether the MachineHealthCheck is
// allowed to remediate any Machines or whether it is blocked from remediating any further.
MachineHealthCheckRemediationAllowedV1Beta2Condition = "RemediationAllowed"

// MachineHealthCheckTooManyUnhealthyV1Beta2Reason is the reason used when too many Machines are unhealthy and
// the MachineHealthCheck is blocked from performing any further remediation.
MachineHealthCheckTooManyUnhealthyV1Beta2Reason = "TooManyUnhealthy"

// MachineHealthCheckRemediationAllowedV1Beta2Reason is the reason used when the number of unhealthy machines
// is within the limits defined by the MachineHealthCheck, and thus remediation is allowed.
// Note: this reason uses the same string value as the condition type.
MachineHealthCheckRemediationAllowedV1Beta2Reason = "RemediationAllowed"
)

// MachineHealthCheck's Paused condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// MachineHealthCheckPausedV1Beta2Condition is true if this MachineHealthCheck or the Cluster it belongs to is paused.
// It is an alias of the shared PausedV1Beta2Condition.
MachineHealthCheckPausedV1Beta2Condition = PausedV1Beta2Condition
)

var (
// DefaultNodeStartupTimeout is the time allowed for a node to start up.
// Can be made longer as part of spec if required for particular provider.
Expand Down
10 changes: 0 additions & 10 deletions api/v1beta1/v1beta2_condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,16 +225,6 @@ const (
ClusterPausedV1Beta2Condition = PausedV1Beta2Condition
)

// Conditions that will be used for the MachineHealthCheck object in v1Beta2 API version.
const (
// MachineHealthCheckRemediationAllowedV1Beta2Condition surfaces whether the MachineHealthCheck is
// allowed to remediate any Machines or whether it is blocked from remediating any further.
MachineHealthCheckRemediationAllowedV1Beta2Condition = "RemediationAllowed"

// MachineHealthCheckPausedV1Beta2Condition is true if this MachineHealthCheck or the Cluster it belongs to is paused.
MachineHealthCheckPausedV1Beta2Condition = PausedV1Beta2Condition
)

// Conditions that will be used for the ClusterClass object in v1Beta2 API version.
const (
// ClusterClassVariablesReadyV1Beta2Condition is true if the ClusterClass variables, including both inline and external
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import (
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/conditions"
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/cluster-api/util/predicates"
)
Expand Down Expand Up @@ -279,6 +280,13 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster
Message: message,
})

v1beta2conditions.Set(m, metav1.Condition{
Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckTooManyUnhealthyV1Beta2Reason,
Message: message,
})

// If there are no unhealthy targets, skip publishing the `RemediationRestricted` event to avoid misleading users.
if len(unhealthy) != 0 {
r.recorder.Event(
Expand Down Expand Up @@ -321,6 +329,12 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster
m.Status.RemediationsAllowed = remediationCount
conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)

v1beta2conditions.Set(m, metav1.Condition{
Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason,
})

errList := r.patchUnhealthyTargets(ctx, logger, unhealthy, cluster, m)
errList = append(errList, r.patchHealthyTargets(ctx, logger, healthy, m)...)

Expand Down Expand Up @@ -399,6 +413,13 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg
from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace)
if err != nil {
conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailableCondition, clusterv1.ExternalRemediationTemplateNotFoundReason, clusterv1.ConditionSeverityError, err.Error())

v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineExternallyRemediatedRemediationTemplateNotFoundV1Beta2Reason,
Message: fmt.Sprintf("error retrieving remediation template %s %s", m.Spec.RemediationTemplate.Kind, klog.KRef(t.Machine.Namespace, m.Spec.RemediationTemplate.Name)),
})
errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
return errList
}
Expand Down Expand Up @@ -428,16 +449,37 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg
// Create the external clone.
if err := r.Client.Create(ctx, to); err != nil {
conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailableCondition, clusterv1.ExternalRemediationRequestCreationFailedReason, clusterv1.ConditionSeverityError, err.Error())

v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineExternallyRemediatedRemediationRequestCreationFailedV1Beta2Reason,
Message: "Please check controller logs for errors",
})
errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName))
return errList
}

v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineExternallyRemediatedWaitingForRemediationV1Beta2Reason,
})
} else {
logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
// NOTE: MHC is responsible for creating MachineOwnerRemediatedCondition if missing or to trigger another remediation if the previous one is completed;
// instead, if a remediation is already in progress, the remediation owner is responsible for completing the process and MHC should not overwrite the condition.
if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
}

if ownerRemediatedCondition := v1beta2conditions.Get(t.Machine, clusterv1.MachineOwnerRemediatedV1Beta2Condition); ownerRemediatedCondition == nil || ownerRemediatedCondition.Status == metav1.ConditionTrue {
v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
})
}
}
}

Expand Down
Loading
Loading