sigs.k8s.io/cluster-api@v1.7.1/controlplane/kubeadm/internal/controllers/remediation.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"
	"encoding/json"
	"fmt"
	"time"

	"github.com/blang/semver/v4"
	"github.com/go-logr/logr"
	"github.com/pkg/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/collections"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
)

// reconcileUnhealthyMachines tries to remediate KubeadmControlPlane unhealthy machines
// based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate
func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *internal.ControlPlane) (ret ctrl.Result, retErr error) {
	log := ctrl.LoggerFrom(ctx)
	reconciliationTime := time.Now().UTC()

	// Clean up pending remediation actions not completed for any reason (e.g. the number of current replicas is less than or equal to 1)
	// if the underlying machine is now back to healthy / not deleting.
	errList := []error{}
	healthyMachines := controlPlane.HealthyMachinesByMachineHealthCheck()
	for _, m := range healthyMachines {
		if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) &&
			conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) &&
			m.DeletionTimestamp.IsZero() {
			patchHelper, err := patch.NewHelper(m, r.Client)
			if err != nil {
				errList = append(errList, err)
				continue
			}

			conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition)

			if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
				clusterv1.MachineOwnerRemediatedCondition,
			}}); err != nil {
				errList = append(errList, err)
			}
		}
	}
	if len(errList) > 0 {
		return ctrl.Result{}, kerrors.NewAggregate(errList)
	}

	// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
	// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
	unhealthyMachines := controlPlane.UnhealthyMachinesByMachineHealthCheck()

	// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
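	// Note: an empty ctrl.Result with a nil error signals that no remediation work is pending; a requeue is
	// only requested at the end of this function, after a machine has actually been deleted.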
	if len(unhealthyMachines) == 0 {
		return ctrl.Result{}, nil
	}

	// Select the machine to be remediated, which is the oldest machine marked as unhealthy not yet provisioned (if any)
	// or the oldest machine marked as unhealthy.
	//
	// NOTE: The current solution is considered acceptable for the most frequent use case (only one unhealthy machine),
	// however, in the future this could potentially be improved for the scenario where more than one unhealthy machine exists
	// by considering which machine has lower impact on etcd quorum.
	machineToBeRemediated := getMachineToBeRemediated(unhealthyMachines)

	// Returns if the machine is in the process of being deleted.
	if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() {
		return ctrl.Result{}, nil
	}

	log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized)

	// Returns if another remediation is in progress but the new Machine is not yet created.
	// Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated
	// is being deleted to avoid unnecessary logs if no further remediation should be done.
	if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
		log.Info("Another remediation is already in progress. Skipping remediation.")
		return ctrl.Result{}, nil
	}

	patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
	if err != nil {
		return ctrl.Result{}, err
	}

	defer func() {
		// Always attempt to patch the Machine conditions after each reconcileUnhealthyMachines.
		if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
			clusterv1.MachineOwnerRemediatedCondition,
		}}); err != nil {
			log.Error(err, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name)
			if retErr == nil {
				retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
			}
		}
	}()

	// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
	// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.

	// Check if KCP is allowed to remediate considering retry limits:
	// - Remediation cannot happen because retryPeriod has not yet expired.
	// - KCP already reached the MaxRetries limit.
	remediationInProgressData, canRemediate, err := r.checkRetryLimits(log, machineToBeRemediated, controlPlane, reconciliationTime)
	if err != nil {
		return ctrl.Result{}, err
	}
	if !canRemediate {
		// NOTE: log lines and conditions surfacing why it is not possible to remediate are set by checkRetryLimits.
		return ctrl.Result{}, nil
	}

	if controlPlane.KCP.Status.Initialized {
		// Executes checks that apply only if the control plane is already initialized; in this case KCP can
		// remediate only if it can safely assume that the operation preserves the operational state of the
		// existing cluster (or at least it doesn't make it worse).

		// The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance.
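		// Note: with a single replica, deleting the unhealthy machine would leave the cluster without any
		// control plane machine (and, for managed etcd, without any etcd member), so remediation is
		// intentionally skipped and the reason is surfaced in the condition below.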
		if controlPlane.Machines.Len() <= 1 {
			log.Info("A control plane machine needs remediation, but the number of current replicas is less than or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len())
			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less than or equal to 1")
			return ctrl.Result{}, nil
		}

		// The cluster MUST NOT have healthy machines still being provisioned. This rule prevents KCP from taking actions while the cluster is in a transitional state.
		if controlPlane.HasHealthyMachineStillProvisioning() {
			log.Info("A control plane machine needs remediation, but there are other control-plane machines being provisioned. Skipping remediation")
			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine provisioning to complete before triggering remediation")
			return ctrl.Result{}, nil
		}

		// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP from taking actions while the cluster is in a transitional state.
		if controlPlane.HasDeletingMachine() {
			log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation")
			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
			return ctrl.Result{}, nil
		}

		// Remediation MUST preserve etcd quorum. This rule ensures that KCP will not remove a member that would result in etcd
		// losing a majority of members and thus becoming unable to field new requests.
		if controlPlane.IsEtcdManaged() {
			canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated)
			if err != nil {
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
				return ctrl.Result{}, err
			}
			if !canSafelyRemediate {
				log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation")
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd losing quorum")
				return ctrl.Result{}, nil
			}
		}

		// Start remediating the unhealthy control plane machine by deleting it.
		// A new machine will come up completing the operation as part of the regular reconcile.

		// If the control plane is initialized, before deleting the machine:
		// - if the machine hosts the etcd leader, forward etcd leadership to another machine.
		// - delete the etcd member hosted on the machine being deleted.
		// - remove the etcd member from the kubeadm config map (only for Kubernetes versions older than v1.22.0).
		workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
		if err != nil {
			log.Error(err, "Failed to create client to workload cluster")
			return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
		}

		// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
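		// Note: forwarding leadership before removing the member avoids deleting the current etcd leader,
		// which would otherwise force an unplanned leader election; the newest healthy machine is used as
		// the candidate.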
		if controlPlane.IsEtcdManaged() {
			etcdLeaderCandidate := controlPlane.HealthyMachinesByMachineHealthCheck().Newest()
			if etcdLeaderCandidate == nil {
				log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
					"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation")
				return ctrl.Result{}, nil
			}
			if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
				log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate))
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
				return ctrl.Result{}, err
			}
			if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
				log.Error(err, "Failed to remove etcd member for machine")
				conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
				return ctrl.Result{}, err
			}
		}

		parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
		if err != nil {
			return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version)
		}

		if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil {
			log.Error(err, "Failed to remove machine from kubeadm ConfigMap")
			return ctrl.Result{}, err
		}
	}

	// Delete the machine
	if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil {
		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
		return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
	}

	// Surface that the operation is in progress.
	log.Info("Remediating unhealthy machine")
	conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")

	// Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation.
	remediationInProgressValue, err := remediationInProgressData.Marshal()
	if err != nil {
		return ctrl.Result{}, err
	}

	// Set annotations tracking remediation details so they can be picked up by the machine
	// that will be created as part of the scale up action that completes the remediation.
	annotations.AddAnnotations(controlPlane.KCP, map[string]string{
		controlplanev1.RemediationInProgressAnnotation: remediationInProgressValue,
	})

	return ctrl.Result{Requeue: true}, nil
}

// getMachineToBeRemediated returns the machine to be remediated, which is the oldest machine marked as unhealthy
// not yet provisioned (if any) or the oldest machine marked as unhealthy.
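//
// Hypothetical example: given two unhealthy machines m1 (older, already has a node) and m2 (newer, still
// without a node), m2 is selected because machines that never completed provisioning are remediated first;
// only when every unhealthy machine has a node does the choice fall back to the oldest one.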
func getMachineToBeRemediated(unhealthyMachines collections.Machines) *clusterv1.Machine {
	machineToBeRemediated := unhealthyMachines.Filter(collections.Not(collections.HasNode())).Oldest()
	if machineToBeRemediated == nil {
		machineToBeRemediated = unhealthyMachines.Oldest()
	}
	return machineToBeRemediated
}

// checkRetryLimits checks if KCP is allowed to remediate considering retry limits:
// - Remediation cannot happen because retryPeriod has not yet expired.
// - KCP already reached the maximum number of retries for a machine.
// NOTE: Counting the number of retries is required in order to prevent infinite remediation, e.g. in case the
// first control plane machine is failing due to a quota issue.
func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machineToBeRemediated *clusterv1.Machine, controlPlane *internal.ControlPlane, reconciliationTime time.Time) (*RemediationData, bool, error) {
	// Get last remediation info from the machine.
	var lastRemediationData *RemediationData
	if value, ok := machineToBeRemediated.Annotations[controlplanev1.RemediationForAnnotation]; ok {
		l, err := RemediationDataFromAnnotation(value)
		if err != nil {
			return nil, false, err
		}
		lastRemediationData = l
	}

	remediationInProgressData := &RemediationData{
		Machine:    machineToBeRemediated.Name,
		Timestamp:  metav1.Time{Time: reconciliationTime},
		RetryCount: 0,
	}

	// If there is no last remediation, this is the first try of a new retry sequence.
	if lastRemediationData == nil {
		return remediationInProgressData, true, nil
	}

	// Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or uses defaults.
	minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod
	if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil {
		minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration
	}
	retryPeriod := time.Duration(0)
	if controlPlane.KCP.Spec.RemediationStrategy != nil {
		retryPeriod = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration
	}

	// Gets the timestamp of the last remediation; if missing, default to a value
	// that ensures both MinHealthyPeriod and RetryPeriod are expired.
	// NOTE: this could potentially lead to executing more retries than expected or to executing retries earlier than
	// expected, but this is considered acceptable when the system recovers from someone/something changing or deleting
	// the RemediationForAnnotation on Machines.
	lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod))
	if !lastRemediationData.Timestamp.IsZero() {
		lastRemediationTime = lastRemediationData.Timestamp.Time
	}

	// Once we get here we already know that there was a last remediation for the Machine.
	// If the current remediation is happening before minHealthyPeriod is expired, then KCP considers this
	// as a remediation for the same previously unhealthy machine.
	// NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp),
	// this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case.
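	// Illustrative example (assumed values): with MinHealthyPeriod of 1h, a last remediation at 10:00 and this
	// reconciliation running at 10:30, the new failure is treated as a retry for the same underlying problem;
	// at 11:30 it would instead start a new retry sequence with RetryCount 0.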
	var retryForSameMachineInProgress bool
	if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) {
		retryForSameMachineInProgress = true
		log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine))
	}

	// If the retry for the same machine is not in progress, this is the first try of a new retry sequence.
	if !retryForSameMachineInProgress {
		return remediationInProgressData, true, nil
	}

	// If the remediation is for the same machine, carry over the retry count.
	remediationInProgressData.RetryCount = lastRemediationData.RetryCount

	// Check if remediation can happen because retryPeriod has passed.
	if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) {
		log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed within the last %s. Skipping remediation", retryPeriod))
		conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed within the last %s (RetryPeriod)", retryPeriod)
		return remediationInProgressData, false, nil
	}

	// Check if remediation can happen because maxRetry is not reached yet, if defined.
	if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MaxRetry != nil {
		maxRetry := int(*controlPlane.KCP.Spec.RemediationStrategy.MaxRetry)
		if remediationInProgressData.RetryCount >= maxRetry {
			log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed %d times (MaxRetry %d). Skipping remediation", remediationInProgressData.RetryCount, maxRetry))
			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed %d times (MaxRetry)", maxRetry)
			return remediationInProgressData, false, nil
		}
	}

	// All checks passed; increase the remediation retry count.
	remediationInProgressData.RetryCount++

	return remediationInProgressData, true, nil
}

// canSafelyRemoveEtcdMember assesses if it is possible to remove the member hosted on the machine to be remediated
// without losing etcd quorum.
//
// The answer mostly depends on the existence of other failing members on top of the one being deleted, and according
// to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance):
//   - a 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target
//     cluster size after deletion is 2, fault tolerance 0)
//   - a 5 CP cluster tolerates 1 additional failing member on top of the one being deleted (the target
//     cluster size after deletion is 4, fault tolerance 1)
//   - a 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target
//     cluster size after deletion is 6, fault tolerance 2)
//   - etc.
//
// NOTE: this func assumes the list of members is in sync with the list of machines/nodes; it is required to call
// reconcileEtcdMembers as well as reconcileControlPlaneConditions before this.
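//
// Worked example (hypothetical): remediating one machine of a 5 CP cluster where one other member is also
// unhealthy leaves a projected cluster of 4 members with 1 unhealthy; quorum for 4 members is 3 and the 3
// remaining healthy members still satisfy it, so remediation can proceed. With two other unhealthy members
// it could not.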
func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) {
	log := ctrl.LoggerFrom(ctx)

	workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
	if err != nil {
		return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name)
	}

	// Gets the etcd status.
	// This makes it possible to have a set of etcd member statuses different from the MHC healthy/unhealthy conditions.
	etcdMembers, err := workloadCluster.EtcdMembers(ctx)
	if err != nil {
		return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name)
	}

	currentTotalMembers := len(etcdMembers)

	log.Info("etcd cluster before remediation",
		"currentTotalMembers", currentTotalMembers,
		"currentMembers", etcdMembers)

	// Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated.
	targetTotalMembers := 0
	targetUnhealthyMembers := 0

	healthyMembers := []string{}
	unhealthyMembers := []string{}
	for _, etcdMember := range etcdMembers {
		// Skip the machine to be deleted because it won't be part of the target etcd cluster.
		if machineToBeRemediated.Status.NodeRef != nil && machineToBeRemediated.Status.NodeRef.Name == etcdMember {
			continue
		}

		// Include the member in the target etcd cluster.
		targetTotalMembers++

		// Search for the machine corresponding to the etcd member.
		var machine *clusterv1.Machine
		for _, m := range controlPlane.Machines {
			if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember {
				machine = m
				break
			}
		}

		// If an etcd member does not have a corresponding machine it is not possible to retrieve etcd member health,
		// so KCP is assuming the worst scenario and considering the member unhealthy.
		//
		// NOTE: This should not happen given that KCP is running reconcileEtcdMembers before calling this method.
		if machine == nil {
			log.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember)
			targetUnhealthyMembers++
			unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (no machine)", etcdMember))
			continue
		}

		// Check member health as reported by the machine's health conditions.
		if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) {
			targetUnhealthyMembers++
			unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
			continue
		}

		healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
	}

	// See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for the fault tolerance formula explanation.
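	// Note: this is integer division (the untyped constant 2.0 is used as an int), e.g. with 4 remaining
	// members targetQuorum is (4/2)+1 = 3, and with 3 remaining members it is (3/2)+1 = 2.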
	targetQuorum := (targetTotalMembers / 2.0) + 1
	canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum

	log.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name),
		"healthyMembers", healthyMembers,
		"unhealthyMembers", unhealthyMembers,
		"targetTotalMembers", targetTotalMembers,
		"targetQuorum", targetQuorum,
		"targetUnhealthyMembers", targetUnhealthyMembers,
		"canSafelyRemediate", canSafelyRemediate)

	return canSafelyRemediate, nil
}

// RemediationData struct is used to keep track of information stored in the RemediationInProgressAnnotation in KCP
// during remediation and then in the RemediationForAnnotation on the replacement machine once it is created.
type RemediationData struct {
	// Machine is the machine name of the latest machine being remediated.
	Machine string `json:"machine"`

	// Timestamp is when the last remediation happened. It is represented in RFC3339 form and is in UTC.
	Timestamp metav1.Time `json:"timestamp"`

	// RetryCount is used to keep track of remediation retries for the last remediated machine.
	// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
	RetryCount int `json:"retryCount"`
}

// RemediationDataFromAnnotation gets RemediationData from an annotation value.
func RemediationDataFromAnnotation(value string) (*RemediationData, error) {
	ret := &RemediationData{}
	if err := json.Unmarshal([]byte(value), ret); err != nil {
		return nil, errors.Wrapf(err, "failed to unmarshal value %s for %s annotation", value, clusterv1.RemediationInProgressReason)
	}
	return ret, nil
}

// Marshal serializes a RemediationData into an annotation value.
func (r *RemediationData) Marshal() (string, error) {
	b, err := json.Marshal(r)
	if err != nil {
		return "", errors.Wrapf(err, "failed to marshal value for %s annotation", clusterv1.RemediationInProgressReason)
	}
	return string(b), nil
}

// ToStatus converts a RemediationData into a LastRemediationStatus struct.
func (r *RemediationData) ToStatus() *controlplanev1.LastRemediationStatus {
	return &controlplanev1.LastRemediationStatus{
		Machine:    r.Machine,
		Timestamp:  r.Timestamp,
		RetryCount: int32(r.RetryCount),
	}
}
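// Illustrative (hypothetical) annotation value handled by Marshal / RemediationDataFromAnnotation:
//
//	{"machine":"cp-machine-abc","timestamp":"2024-01-01T10:00:00Z","retryCount":1}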