sigs.k8s.io/cluster-api@v1.7.1/controlplane/kubeadm/internal/controllers/remediation.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controllers
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"time"
    24  
    25  	"github.com/blang/semver/v4"
    26  	"github.com/go-logr/logr"
    27  	"github.com/pkg/errors"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	kerrors "k8s.io/apimachinery/pkg/util/errors"
    30  	"k8s.io/klog/v2"
    31  	ctrl "sigs.k8s.io/controller-runtime"
    32  
    33  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    34  	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
    35  	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
    36  	"sigs.k8s.io/cluster-api/util/annotations"
    37  	"sigs.k8s.io/cluster-api/util/collections"
    38  	"sigs.k8s.io/cluster-api/util/conditions"
    39  	"sigs.k8s.io/cluster-api/util/patch"
    40  )
    41  
    42  // reconcileUnhealthyMachines tries to remediate KubeadmControlPlane unhealthy machines
    43  // based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate
    44  func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *internal.ControlPlane) (ret ctrl.Result, retErr error) {
    45  	log := ctrl.LoggerFrom(ctx)
    46  	reconciliationTime := time.Now().UTC()
    47  
    48  	// Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1)
    49  	// if the underlying machine is now back to healthy / not deleting.
    50  	errList := []error{}
    51  	healthyMachines := controlPlane.HealthyMachinesByMachineHealthCheck()
    52  	for _, m := range healthyMachines {
    53  		if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) &&
    54  			conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) &&
    55  			m.DeletionTimestamp.IsZero() {
    56  			patchHelper, err := patch.NewHelper(m, r.Client)
    57  			if err != nil {
    58  				errList = append(errList, err)
    59  				continue
    60  			}
    61  
    62  			conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition)
    63  
    64  			if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
    65  				clusterv1.MachineOwnerRemediatedCondition,
    66  			}}); err != nil {
    67  				errList = append(errList, err)
    68  			}
    69  		}
    70  	}
    71  	if len(errList) > 0 {
    72  		return ctrl.Result{}, kerrors.NewAggregate(errList)
    73  	}
    74  
    75  	// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
    76  	// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
    77  	unhealthyMachines := controlPlane.UnhealthyMachinesByMachineHealthCheck()
    78  
    79  	// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
    80  	if len(unhealthyMachines) == 0 {
    81  		return ctrl.Result{}, nil
    82  	}
    83  
    84  	// Select the machine to be remediated, which is the oldest machine marked as unhealthy not yet provisioned (if any)
    85  	// or the oldest machine marked as unhealthy.
    86  	//
    87  	// NOTE: The current solution is considered acceptable for the most frequent use case (only one unhealthy machine),
    88  	// however, in the future this could potentially be improved for the scenario where more than one unhealthy machine exists
    89  	// by considering which machine has lower impact on etcd quorum.
    90  	machineToBeRemediated := getMachineToBeRemediated(unhealthyMachines)
    91  
    92  	// Returns if the machine is in the process of being deleted.
    93  	if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() {
    94  		return ctrl.Result{}, nil
    95  	}
    96  
    97  	log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized)
    98  
    99  	// Returns if another remediation is in progress but the new Machine is not yet created.
   100  	// Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated
   101  	// is being deleted to avoid unnecessary logs if no further remediation should be done.
   102  	if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
   103  		log.Info("Another remediation is already in progress. Skipping remediation.")
   104  		return ctrl.Result{}, nil
   105  	}
   106  
   107  	patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
   108  	if err != nil {
   109  		return ctrl.Result{}, err
   110  	}
   111  
   112  	defer func() {
   113  		// Always attempt to Patch the Machine conditions after each reconcileUnhealthyMachines.
   114  		if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
   115  			clusterv1.MachineOwnerRemediatedCondition,
   116  		}}); err != nil {
   117  			log.Error(err, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name)
   118  			if retErr == nil {
   119  				retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
   120  			}
   121  		}
   122  	}()
   123  
   124  	// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
   125  	// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.
   126  
   127  	// Check if KCP is allowed to remediate considering retry limits:
   128  	// - Remediation cannot happen because retryPeriod is not yet expired.
   129  	// - KCP already reached MaxRetries limit.
   130  	remediationInProgressData, canRemediate, err := r.checkRetryLimits(log, machineToBeRemediated, controlPlane, reconciliationTime)
   131  	if err != nil {
   132  		return ctrl.Result{}, err
   133  	}
   134  	if !canRemediate {
   135  		// NOTE: log lines and conditions surfacing why it is not possible to remediate are set by checkRetryLimits.
   136  		return ctrl.Result{}, nil
   137  	}
   138  
   139  	if controlPlane.KCP.Status.Initialized {
   140  		// Executes checks that apply only if the control plane is already initialized; in this case KCP can
   141  		// remediate only if it can safely assume that the operation preserves the operation state of the
   142  		// existing cluster (or at least it doesn't make it worse).
   143  
   144  		// The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance.
   145  		if controlPlane.Machines.Len() <= 1 {
   146  			log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len())
   147  			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1")
   148  			return ctrl.Result{}, nil
   149  		}
   150  
   151  		// The cluster MUST NOT have healthy machines still being provisioned. This rule prevents KCP taking actions while the cluster is in a transitional state.
   152  		if controlPlane.HasHealthyMachineStillProvisioning() {
   153  			log.Info("A control plane machine needs remediation, but there are other control-plane machines being provisioned. Skipping remediation")
   154  			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine provisioning to complete before triggering remediation")
   155  			return ctrl.Result{}, nil
   156  		}
   157  
   158  		// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
   159  		if controlPlane.HasDeletingMachine() {
   160  			log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation")
   161  			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
   162  			return ctrl.Result{}, nil
   163  		}
   164  
   165  		// Remediation MUST preserve etcd quorum. This rule ensures that KCP will not remove a member that would result in etcd
   166  		// losing a majority of members and thus become unable to field new requests.
   167  		if controlPlane.IsEtcdManaged() {
   168  			canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated)
   169  			if err != nil {
   170  				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
   171  				return ctrl.Result{}, err
   172  			}
   173  			if !canSafelyRemediate {
   174  				log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation")
   175  				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
   176  				return ctrl.Result{}, nil
   177  			}
   178  		}
   179  
   180  		// Start remediating the unhealthy control plane machine by deleting it.
   181  		// A new machine will come up completing the operation as part of the regular reconcile.
   182  
   183  		// If the control plane is initialized, before deleting the machine:
   184  		// - if the machine hosts the etcd leader, forward etcd leadership to another machine.
   185  		// - delete the etcd member hosted on the machine being deleted.
   186  		// - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0)
   187  		workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
   188  		if err != nil {
   189  			log.Error(err, "Failed to create client to workload cluster")
   190  			return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
   191  		}
   192  
   193  		// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
   194  		if controlPlane.IsEtcdManaged() {
   195  			etcdLeaderCandidate := controlPlane.HealthyMachinesByMachineHealthCheck().Newest()
   196  			if etcdLeaderCandidate == nil {
   197  				log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
   198  				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
   199  					"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation")
   200  				return ctrl.Result{}, nil
   201  			}
   202  			if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
   203  				log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate))
   204  				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
   205  				return ctrl.Result{}, err
   206  			}
   207  			if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
   208  				log.Error(err, "Failed to remove etcd member for machine")
   209  				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
   210  				return ctrl.Result{}, err
   211  			}
   212  		}
   213  
   214  		parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
   215  		if err != nil {
   216  			return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version)
   217  		}
   218  
   219  		if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil {
   220  			log.Error(err, "Failed to remove machine from kubeadm ConfigMap")
   221  			return ctrl.Result{}, err
   222  		}
   223  	}
   224  
   225  	// Delete the machine
   226  	if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil {
   227  		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
   228  		return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
   229  	}
   230  
   231  	// Surface the operation is in progress.
   232  	log.Info("Remediating unhealthy machine")
   233  	conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   234  
   235  	// Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation.
   236  	remediationInProgressValue, err := remediationInProgressData.Marshal()
   237  	if err != nil {
   238  		return ctrl.Result{}, err
   239  	}
   240  
   241  	// Set annotations tracking remediation details so they can be picked up by the machine
   242  	// that will be created as part of the scale up action that completes the remediation.
   243  	annotations.AddAnnotations(controlPlane.KCP, map[string]string{
   244  		controlplanev1.RemediationInProgressAnnotation: remediationInProgressValue,
   245  	})
   246  
   247  	return ctrl.Result{Requeue: true}, nil
   248  }
   249  
   250  // Gets the machine to be remediated, which is the oldest machine marked as unhealthy not yet provisioned (if any)
   251  // or the oldest machine marked as unhealthy.
   252  func getMachineToBeRemediated(unhealthyMachines collections.Machines) *clusterv1.Machine {
   253  	machineToBeRemediated := unhealthyMachines.Filter(collections.Not(collections.HasNode())).Oldest()
   254  	if machineToBeRemediated == nil {
   255  		machineToBeRemediated = unhealthyMachines.Oldest()
   256  	}
   257  	return machineToBeRemediated
   258  }
   259  
   260  // checkRetryLimits checks if KCP is allowed to remediate considering retry limits:
   261  // - Remediation cannot happen because retryPeriod is not yet expired.
   262  // - KCP already reached the maximum number of retries for a machine.
   263  // NOTE: Counting the number of retries is required In order to prevent infinite remediation e.g. in case the
   264  // first Control Plane machine is failing due to quota issue.
   265  func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machineToBeRemediated *clusterv1.Machine, controlPlane *internal.ControlPlane, reconciliationTime time.Time) (*RemediationData, bool, error) {
   266  	// Get last remediation info from the machine.
   267  	var lastRemediationData *RemediationData
   268  	if value, ok := machineToBeRemediated.Annotations[controlplanev1.RemediationForAnnotation]; ok {
   269  		l, err := RemediationDataFromAnnotation(value)
   270  		if err != nil {
   271  			return nil, false, err
   272  		}
   273  		lastRemediationData = l
   274  	}
   275  
   276  	remediationInProgressData := &RemediationData{
   277  		Machine:    machineToBeRemediated.Name,
   278  		Timestamp:  metav1.Time{Time: reconciliationTime},
   279  		RetryCount: 0,
   280  	}
   281  
   282  	// If there is no last remediation, this is the first try of a new retry sequence.
   283  	if lastRemediationData == nil {
   284  		return remediationInProgressData, true, nil
   285  	}
   286  
   287  	// Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults.
   288  	minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod
   289  	if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil {
   290  		minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration
   291  	}
   292  	retryPeriod := time.Duration(0)
   293  	if controlPlane.KCP.Spec.RemediationStrategy != nil {
   294  		retryPeriod = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration
   295  	}
   296  
   297  	// Gets the timestamp of the last remediation; if missing, default to a value
   298  	// that ensures both MinHealthyPeriod and RetryPeriod are expired.
   299  	// NOTE: this could potentially lead to executing more retries than expected or to executing retries before than
   300  	// expected, but this is considered acceptable when the system recovers from someone/something changes or deletes
   301  	// the RemediationForAnnotation on Machines.
   302  	lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod))
   303  	if !lastRemediationData.Timestamp.IsZero() {
   304  		lastRemediationTime = lastRemediationData.Timestamp.Time
   305  	}
   306  
   307  	// Once we get here we already know that there was a last remediation for the Machine.
   308  	// If the current remediation is happening before minHealthyPeriod is expired, then KCP considers this
   309  	// as a remediation for the same previously unhealthy machine.
   310  	// NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp),
   311  	// this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case.
   312  	var retryForSameMachineInProgress bool
   313  	if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) {
   314  		retryForSameMachineInProgress = true
   315  		log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine))
   316  	}
   317  
   318  	// If the retry for the same machine is not in progress, this is the first try of a new retry sequence.
   319  	if !retryForSameMachineInProgress {
   320  		return remediationInProgressData, true, nil
   321  	}
   322  
   323  	// If the remediation is for the same machine, carry over the retry count.
   324  	remediationInProgressData.RetryCount = lastRemediationData.RetryCount
   325  
   326  	// Check if remediation can happen because retryPeriod is passed.
   327  	if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) {
   328  		log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation", retryPeriod))
   329  		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryPeriod)", retryPeriod)
   330  		return remediationInProgressData, false, nil
   331  	}
   332  
   333  	// Check if remediation can happen because of maxRetry is not reached yet, if defined.
   334  	if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MaxRetry != nil {
   335  		maxRetry := int(*controlPlane.KCP.Spec.RemediationStrategy.MaxRetry)
   336  		if remediationInProgressData.RetryCount >= maxRetry {
   337  			log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed %d times (MaxRetry %d). Skipping remediation", remediationInProgressData.RetryCount, maxRetry))
   338  			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed %d times (MaxRetry)", maxRetry)
   339  			return remediationInProgressData, false, nil
   340  		}
   341  	}
   342  
   343  	// All the check passed, increase the remediation retry count.
   344  	remediationInProgressData.RetryCount++
   345  
   346  	return remediationInProgressData, true, nil
   347  }
   348  
   349  // canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated
   350  // without loosing etcd quorum.
   351  //
   352  // The answer mostly depend on the existence of other failing members on top of the one being deleted, and according
   353  // to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance):
   354  //   - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target
   355  //     cluster size after deletion is 2, fault tolerance 0)
   356  //   - 5 CP cluster tolerates 1 additional failing members on top of the one being deleted (the target
   357  //     cluster size after deletion is 4, fault tolerance 1)
   358  //   - 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target
   359  //     cluster size after deletion is 6, fault tolerance 2)
   360  //   - etc.
   361  //
   362  // NOTE: this func assumes the list of members in sync with the list of machines/nodes, it is required to call reconcileEtcdMembers
   363  // as well as reconcileControlPlaneConditions before this.
   364  func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) {
   365  	log := ctrl.LoggerFrom(ctx)
   366  
   367  	workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
   368  	if err != nil {
   369  		return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name)
   370  	}
   371  
   372  	// Gets the etcd status
   373  
   374  	// This makes it possible to have a set of etcd members status different from the MHC unhealthy/unhealthy conditions.
   375  	etcdMembers, err := workloadCluster.EtcdMembers(ctx)
   376  	if err != nil {
   377  		return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name)
   378  	}
   379  
   380  	currentTotalMembers := len(etcdMembers)
   381  
   382  	log.Info("etcd cluster before remediation",
   383  		"currentTotalMembers", currentTotalMembers,
   384  		"currentMembers", etcdMembers)
   385  
   386  	// Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated.
   387  	targetTotalMembers := 0
   388  	targetUnhealthyMembers := 0
   389  
   390  	healthyMembers := []string{}
   391  	unhealthyMembers := []string{}
   392  	for _, etcdMember := range etcdMembers {
   393  		// Skip the machine to be deleted because it won't be part of the target etcd cluster.
   394  		if machineToBeRemediated.Status.NodeRef != nil && machineToBeRemediated.Status.NodeRef.Name == etcdMember {
   395  			continue
   396  		}
   397  
   398  		// Include the member in the target etcd cluster.
   399  		targetTotalMembers++
   400  
   401  		// Search for the machine corresponding to the etcd member.
   402  		var machine *clusterv1.Machine
   403  		for _, m := range controlPlane.Machines {
   404  			if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember {
   405  				machine = m
   406  				break
   407  			}
   408  		}
   409  
   410  		// If an etcd member does not have a corresponding machine it is not possible to retrieve etcd member health,
   411  		// so KCP is assuming the worst scenario and considering the member unhealthy.
   412  		//
   413  		// NOTE: This should not happen given that KCP is running reconcileEtcdMembers before calling this method.
   414  		if machine == nil {
   415  			log.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember)
   416  			targetUnhealthyMembers++
   417  			unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (no machine)", etcdMember))
   418  			continue
   419  		}
   420  
   421  		// Check member health as reported by machine's health conditions
   422  		if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) {
   423  			targetUnhealthyMembers++
   424  			unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
   425  			continue
   426  		}
   427  
   428  		healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
   429  	}
   430  
   431  	// See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for fault tolerance formula explanation.
   432  	targetQuorum := (targetTotalMembers / 2.0) + 1
   433  	canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum
   434  
   435  	log.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name),
   436  		"healthyMembers", healthyMembers,
   437  		"unhealthyMembers", unhealthyMembers,
   438  		"targetTotalMembers", targetTotalMembers,
   439  		"targetQuorum", targetQuorum,
   440  		"targetUnhealthyMembers", targetUnhealthyMembers,
   441  		"canSafelyRemediate", canSafelyRemediate)
   442  
   443  	return canSafelyRemediate, nil
   444  }
   445  
// RemediationData struct is used to keep track of information stored in the RemediationInProgressAnnotation in KCP
// during remediation and then into the RemediationForAnnotation on the replacement machine once it is created.
//
// The annotation value is the JSON encoding of this struct (see Marshal and RemediationDataFromAnnotation),
// so the json tags below define the stored format.
type RemediationData struct {
	// Machine is the machine name of the latest machine being remediated.
	Machine string `json:"machine"`

	// Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC.
	// A zero Timestamp is treated by checkRetryLimits as "unknown".
	Timestamp metav1.Time `json:"timestamp"`

	// RetryCount used to keep track of remediation retry for the last remediated machine.
	// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
	RetryCount int `json:"retryCount"`
}
   459  
   460  // RemediationDataFromAnnotation gets RemediationData from an annotation value.
   461  func RemediationDataFromAnnotation(value string) (*RemediationData, error) {
   462  	ret := &RemediationData{}
   463  	if err := json.Unmarshal([]byte(value), ret); err != nil {
   464  		return nil, errors.Wrapf(err, "failed to unmarshal value %s for %s annotation", value, clusterv1.RemediationInProgressReason)
   465  	}
   466  	return ret, nil
   467  }
   468  
   469  // Marshal an RemediationData into an annotation value.
   470  func (r *RemediationData) Marshal() (string, error) {
   471  	b, err := json.Marshal(r)
   472  	if err != nil {
   473  		return "", errors.Wrapf(err, "failed to marshal value for %s annotation", clusterv1.RemediationInProgressReason)
   474  	}
   475  	return string(b), nil
   476  }
   477  
   478  // ToStatus converts a RemediationData into a LastRemediationStatus struct.
   479  func (r *RemediationData) ToStatus() *controlplanev1.LastRemediationStatus {
   480  	return &controlplanev1.LastRemediationStatus{
   481  		Machine:    r.Machine,
   482  		Timestamp:  r.Timestamp,
   483  		RetryCount: int32(r.RetryCount),
   484  	}
   485  }