sigs.k8s.io/cluster-api@v1.6.3/controlplane/kubeadm/internal/controllers/remediation.go

sigs.k8s.io/cluster-api@v1.6.3/controlplane/kubeadm/internal/controllers/remediation.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controllers
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"time"
    24  
    25  	"github.com/blang/semver/v4"
    26  	"github.com/go-logr/logr"
    27  	"github.com/pkg/errors"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	kerrors "k8s.io/apimachinery/pkg/util/errors"
    30  	"k8s.io/klog/v2"
    31  	ctrl "sigs.k8s.io/controller-runtime"
    32  
    33  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    34  	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
    35  	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
    36  	"sigs.k8s.io/cluster-api/util/annotations"
    37  	"sigs.k8s.io/cluster-api/util/conditions"
    38  	"sigs.k8s.io/cluster-api/util/patch"
    39  )
    40  
// reconcileUnhealthyMachines tries to remediate KubeadmControlPlane unhealthy machines
// based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate
//
// The function proceeds as follows:
//  1. For every healthy machine that is not being deleted and has MachineHealthCheckSucceeded=True and
//     MachineOwnerRemediated=False, the stale MachineOwnerRemediated condition is deleted and patched.
//  2. The oldest unhealthy machine is selected; nothing is done if there is none, if it is already being deleted,
//     or if the KCP object carries the RemediationInProgressAnnotation.
//  3. Retry limits are checked via checkRetryLimits.
//  4. If the control plane is initialized, preflight checks are run (more than one replica, no machine being deleted,
//     etcd quorum preserved when etcd is managed); then etcd leadership is forwarded, the etcd member is removed and
//     the machine is removed from the kubeadm ConfigMap.
//  5. The machine is deleted, MachineOwnerRemediated is marked False with RemediationInProgressReason, and the
//     RemediationInProgressAnnotation is added to the in-memory controlPlane.KCP object.
//
// Whenever a machine has been selected for remediation, a deferred patch writes the MachineOwnerRemediated condition
// of that machine on every return path; a failure of that patch is surfaced through retErr only if no other error
// is already being returned.
//
// On successful remediation it returns ctrl.Result{Requeue: true}; in every other case it returns an empty ctrl.Result.
func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *internal.ControlPlane) (ret ctrl.Result, retErr error) {
	log := ctrl.LoggerFrom(ctx)
	// Single reference time used for all the retry-limit computations of this reconcile.
	reconciliationTime := time.Now().UTC()

	// Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1)
	// if the underlying machine is now back to healthy / not deleting.
	// Errors are collected so that one failing machine does not prevent the cleanup of the others.
	errList := []error{}
	healthyMachines := controlPlane.HealthyMachines()
	for _, m := range healthyMachines {
		if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) &&
			conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) &&
			m.DeletionTimestamp.IsZero() {
			patchHelper, err := patch.NewHelper(m, r.Client)
			if err != nil {
				errList = append(errList, errors.Wrapf(err, "failed to get PatchHelper for machine %s", m.Name))
				continue
			}

			conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition)

			if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
				clusterv1.MachineOwnerRemediatedCondition,
			}}); err != nil {
				errList = append(errList, errors.Wrapf(err, "failed to patch machine %s", m.Name))
			}
		}
	}
	if len(errList) > 0 {
		return ctrl.Result{}, kerrors.NewAggregate(errList)
	}

	// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
	// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
	unhealthyMachines := controlPlane.UnhealthyMachines()

	// If there are no unhealthy machines, return so KCP can proceed with other operations (empty ctrl.Result, nil error).
	if len(unhealthyMachines) == 0 {
		return ctrl.Result{}, nil
	}

	// Select the machine to be remediated, which is the oldest machine marked as unhealthy.
	//
	// NOTE: The current solution is considered acceptable for the most frequent use case (only one unhealthy machine),
	// however, in the future this could potentially be improved for the scenario where more than one unhealthy machine exists
	// by considering which machine has lower impact on etcd quorum.
	machineToBeRemediated := unhealthyMachines.Oldest()

	// Returns if the machine is in the process of being deleted.
	if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() {
		return ctrl.Result{}, nil
	}

	log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized)

	// Returns if another remediation is in progress but the new Machine is not yet created.
	// Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated
	// is being deleted to avoid unnecessary logs if no further remediation should be done.
	if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
		log.Info("Another remediation is already in progress. Skipping remediation.")
		return ctrl.Result{}, nil
	}

	// The patch helper is created before any condition is changed, so that the deferred Patch below
	// can write whatever this function sets on the machine afterwards.
	patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
	if err != nil {
		return ctrl.Result{}, err
	}

	defer func() {
		// Always attempt to Patch the Machine conditions after each reconcileUnhealthyMachines.
		if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
			clusterv1.MachineOwnerRemediatedCondition,
		}}); err != nil {
			log.Error(err, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name)
			// Do not hide an error already being returned by the function body.
			if retErr == nil {
				retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
			}
		}
	}()

	// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
	// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.

	// Check if KCP is allowed to remediate considering retry limits:
	// - Remediation cannot happen because retryPeriod is not yet expired.
	// - KCP already reached MaxRetries limit.
	remediationInProgressData, canRemediate, err := r.checkRetryLimits(log, machineToBeRemediated, controlPlane, reconciliationTime)
	if err != nil {
		return ctrl.Result{}, err
	}
	if !canRemediate {
		// NOTE: log lines and conditions surfacing why it is not possible to remediate are set by checkRetryLimits.
		return ctrl.Result{}, nil
	}

	if controlPlane.KCP.Status.Initialized {
		// Executes checks that apply only if the control plane is already initialized; in this case KCP can
		// remediate only if it can safely assume that the operation preserves the operation state of the
		// existing cluster (or at least it doesn't make it worse).

		// The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance.
		if controlPlane.Machines.Len() <= 1 {
			log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len())
			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1")
			return ctrl.Result{}, nil
		}

		// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
		if controlPlane.HasDeletingMachine() {
			log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation")
			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
			return ctrl.Result{}, nil
		}

		// Remediation MUST preserve etcd quorum. This rule ensures that KCP will not remove a member that would result in etcd
		// losing a majority of members and thus become unable to field new requests.
		if controlPlane.IsEtcdManaged() {
			canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated)
			if err != nil {
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
				return ctrl.Result{}, err
			}
			if !canSafelyRemediate {
				log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation")
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
				return ctrl.Result{}, nil
			}
		}

		// Start remediating the unhealthy control plane machine by deleting it.
		// A new machine will come up completing the operation as part of the regular reconcile.

		// If the control plane is initialized, before deleting the machine:
		// - if the machine hosts the etcd leader, forward etcd leadership to another machine.
		// - delete the etcd member hosted on the machine being deleted.
		// - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0)
		workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
		if err != nil {
			log.Error(err, "Failed to create client to workload cluster")
			return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
		}

		// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
		if controlPlane.IsEtcdManaged() {
			etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
			if etcdLeaderCandidate == nil {
				log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
					"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation")
				return ctrl.Result{}, nil
			}
			if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
				log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate))
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
				return ctrl.Result{}, err
			}
			if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
				log.Error(err, "Failed to remove etcd member for machine")
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
				return ctrl.Result{}, err
			}
		}

		// The parsed Kubernetes version is passed to RemoveMachineFromKubeadmConfigMap below.
		parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
		if err != nil {
			return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version)
		}

		if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil {
			log.Error(err, "Failed to remove machine from kubeadm ConfigMap")
			return ctrl.Result{}, err
		}
	}

	// Delete the machine
	if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil {
		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
		return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
	}

	// Surface the operation is in progress.
	log.Info("Remediating unhealthy machine")
	conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")

	// Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation.
	remediationInProgressValue, err := remediationInProgressData.Marshal()
	if err != nil {
		return ctrl.Result{}, err
	}

	// Set annotations tracking remediation details so they can be picked up by the machine
	// that will be created as part of the scale up action that completes the remediation.
	// NOTE(review): only the in-memory KCP object is modified here; persisting it is presumably done by the caller — confirm.
	annotations.AddAnnotations(controlPlane.KCP, map[string]string{
		controlplanev1.RemediationInProgressAnnotation: remediationInProgressValue,
	})

	return ctrl.Result{Requeue: true}, nil
}
   240  
   241  // checkRetryLimits checks if KCP is allowed to remediate considering retry limits:
   242  // - Remediation cannot happen because retryPeriod is not yet expired.
   243  // - KCP already reached the maximum number of retries for a machine.
   244  // NOTE: Counting the number of retries is required In order to prevent infinite remediation e.g. in case the
   245  // first Control Plane machine is failing due to quota issue.
   246  func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machineToBeRemediated *clusterv1.Machine, controlPlane *internal.ControlPlane, reconciliationTime time.Time) (*RemediationData, bool, error) {
   247  	// Get last remediation info from the machine.
   248  	var lastRemediationData *RemediationData
   249  	if value, ok := machineToBeRemediated.Annotations[controlplanev1.RemediationForAnnotation]; ok {
   250  		l, err := RemediationDataFromAnnotation(value)
   251  		if err != nil {
   252  			return nil, false, err
   253  		}
   254  		lastRemediationData = l
   255  	}
   256  
   257  	remediationInProgressData := &RemediationData{
   258  		Machine:    machineToBeRemediated.Name,
   259  		Timestamp:  metav1.Time{Time: reconciliationTime},
   260  		RetryCount: 0,
   261  	}
   262  
   263  	// If there is no last remediation, this is the first try of a new retry sequence.
   264  	if lastRemediationData == nil {
   265  		return remediationInProgressData, true, nil
   266  	}
   267  
   268  	// Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults.
   269  	minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod
   270  	if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil {
   271  		minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration
   272  	}
   273  	retryPeriod := time.Duration(0)
   274  	if controlPlane.KCP.Spec.RemediationStrategy != nil {
   275  		retryPeriod = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration
   276  	}
   277  
   278  	// Gets the timestamp of the last remediation; if missing, default to a value
   279  	// that ensures both MinHealthyPeriod and RetryPeriod are expired.
   280  	// NOTE: this could potentially lead to executing more retries than expected or to executing retries before than
   281  	// expected, but this is considered acceptable when the system recovers from someone/something changes or deletes
   282  	// the RemediationForAnnotation on Machines.
   283  	lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod))
   284  	if !lastRemediationData.Timestamp.IsZero() {
   285  		lastRemediationTime = lastRemediationData.Timestamp.Time
   286  	}
   287  
   288  	// Once we get here we already know that there was a last remediation for the Machine.
   289  	// If the current remediation is happening before minHealthyPeriod is expired, then KCP considers this
   290  	// as a remediation for the same previously unhealthy machine.
   291  	// NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp),
   292  	// this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case.
   293  	var retryForSameMachineInProgress bool
   294  	if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) {
   295  		retryForSameMachineInProgress = true
   296  		log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine))
   297  	}
   298  
   299  	// If the retry for the same machine is not in progress, this is the first try of a new retry sequence.
   300  	if !retryForSameMachineInProgress {
   301  		return remediationInProgressData, true, nil
   302  	}
   303  
   304  	// If the remediation is for the same machine, carry over the retry count.
   305  	remediationInProgressData.RetryCount = lastRemediationData.RetryCount
   306  
   307  	// Check if remediation can happen because retryPeriod is passed.
   308  	if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) {
   309  		log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation", retryPeriod))
   310  		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryPeriod)", retryPeriod)
   311  		return remediationInProgressData, false, nil
   312  	}
   313  
   314  	// Check if remediation can happen because of maxRetry is not reached yet, if defined.
   315  	if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MaxRetry != nil {
   316  		maxRetry := int(*controlPlane.KCP.Spec.RemediationStrategy.MaxRetry)
   317  		if remediationInProgressData.RetryCount >= maxRetry {
   318  			log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed %d times (MaxRetry %d). Skipping remediation", remediationInProgressData.RetryCount, maxRetry))
   319  			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed %d times (MaxRetry)", maxRetry)
   320  			return remediationInProgressData, false, nil
   321  		}
   322  	}
   323  
   324  	// All the check passed, increase the remediation retry count.
   325  	remediationInProgressData.RetryCount++
   326  
   327  	return remediationInProgressData, true, nil
   328  }
   329  
   330  // max calculates the maximum duration.
   331  func max(x, y time.Duration) time.Duration {
   332  	if x < y {
   333  		return y
   334  	}
   335  	return x
   336  }
   337  
   338  // canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated
   339  // without loosing etcd quorum.
   340  //
   341  // The answer mostly depend on the existence of other failing members on top of the one being deleted, and according
   342  // to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance):
   343  //   - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target
   344  //     cluster size after deletion is 2, fault tolerance 0)
   345  //   - 5 CP cluster tolerates 1 additional failing members on top of the one being deleted (the target
   346  //     cluster size after deletion is 4, fault tolerance 1)
   347  //   - 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target
   348  //     cluster size after deletion is 6, fault tolerance 2)
   349  //   - etc.
   350  //
   351  // NOTE: this func assumes the list of members in sync with the list of machines/nodes, it is required to call reconcileEtcdMembers
   352  // as well as reconcileControlPlaneConditions before this.
   353  func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) {
   354  	log := ctrl.LoggerFrom(ctx)
   355  
   356  	workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
   357  	if err != nil {
   358  		return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name)
   359  	}
   360  
   361  	// Gets the etcd status
   362  
   363  	// This makes it possible to have a set of etcd members status different from the MHC unhealthy/unhealthy conditions.
   364  	etcdMembers, err := workloadCluster.EtcdMembers(ctx)
   365  	if err != nil {
   366  		return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name)
   367  	}
   368  
   369  	currentTotalMembers := len(etcdMembers)
   370  
   371  	log.Info("etcd cluster before remediation",
   372  		"currentTotalMembers", currentTotalMembers,
   373  		"currentMembers", etcdMembers)
   374  
   375  	// Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated.
   376  	targetTotalMembers := 0
   377  	targetUnhealthyMembers := 0
   378  
   379  	healthyMembers := []string{}
   380  	unhealthyMembers := []string{}
   381  	for _, etcdMember := range etcdMembers {
   382  		// Skip the machine to be deleted because it won't be part of the target etcd cluster.
   383  		if machineToBeRemediated.Status.NodeRef != nil && machineToBeRemediated.Status.NodeRef.Name == etcdMember {
   384  			continue
   385  		}
   386  
   387  		// Include the member in the target etcd cluster.
   388  		targetTotalMembers++
   389  
   390  		// Search for the machine corresponding to the etcd member.
   391  		var machine *clusterv1.Machine
   392  		for _, m := range controlPlane.Machines {
   393  			if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember {
   394  				machine = m
   395  				break
   396  			}
   397  		}
   398  
   399  		// If an etcd member does not have a corresponding machine it is not possible to retrieve etcd member health,
   400  		// so KCP is assuming the worst scenario and considering the member unhealthy.
   401  		//
   402  		// NOTE: This should not happen given that KCP is running reconcileEtcdMembers before calling this method.
   403  		if machine == nil {
   404  			log.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember)
   405  			targetUnhealthyMembers++
   406  			unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (no machine)", etcdMember))
   407  			continue
   408  		}
   409  
   410  		// Check member health as reported by machine's health conditions
   411  		if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) {
   412  			targetUnhealthyMembers++
   413  			unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
   414  			continue
   415  		}
   416  
   417  		healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
   418  	}
   419  
   420  	// See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for fault tolerance formula explanation.
   421  	targetQuorum := (targetTotalMembers / 2.0) + 1
   422  	canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum
   423  
   424  	log.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name),
   425  		"healthyMembers", healthyMembers,
   426  		"unhealthyMembers", unhealthyMembers,
   427  		"targetTotalMembers", targetTotalMembers,
   428  		"targetQuorum", targetQuorum,
   429  		"targetUnhealthyMembers", targetUnhealthyMembers,
   430  		"canSafelyRemediate", canSafelyRemediate)
   431  
   432  	return canSafelyRemediate, nil
   433  }
   434  
// RemediationData struct is used to keep track of information stored in the RemediationInProgressAnnotation in KCP
// during remediation and then into the RemediationForAnnotation on the replacement machine once it is created.
// It is serialized to/from the annotation value as JSON (see Marshal and RemediationDataFromAnnotation).
type RemediationData struct {
	// Machine is the machine name of the latest machine being remediated.
	Machine string `json:"machine"`

	// Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC.
	// A zero Timestamp is treated by checkRetryLimits as "unknown".
	Timestamp metav1.Time `json:"timestamp"`

	// RetryCount used to keep track of remediation retry for the last remediated machine.
	// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
	// It is 0 for the first remediation of a retry sequence.
	RetryCount int `json:"retryCount"`
}
   448  
   449  // RemediationDataFromAnnotation gets RemediationData from an annotation value.
   450  func RemediationDataFromAnnotation(value string) (*RemediationData, error) {
   451  	ret := &RemediationData{}
   452  	if err := json.Unmarshal([]byte(value), ret); err != nil {
   453  		return nil, errors.Wrapf(err, "failed to unmarshal value %s for %s annotation", value, clusterv1.RemediationInProgressReason)
   454  	}
   455  	return ret, nil
   456  }
   457  
   458  // Marshal an RemediationData into an annotation value.
   459  func (r *RemediationData) Marshal() (string, error) {
   460  	b, err := json.Marshal(r)
   461  	if err != nil {
   462  		return "", errors.Wrapf(err, "failed to marshal value for %s annotation", clusterv1.RemediationInProgressReason)
   463  	}
   464  	return string(b), nil
   465  }
   466  
   467  // ToStatus converts a RemediationData into a LastRemediationStatus struct.
   468  func (r *RemediationData) ToStatus() *controlplanev1.LastRemediationStatus {
   469  	return &controlplanev1.LastRemediationStatus{
   470  		Machine:    r.Machine,
   471  		Timestamp:  r.Timestamp,
   472  		RetryCount: int32(r.RetryCount),
   473  	}
   474  }