sigs.k8s.io/cluster-api@v1.6.3/controlplane/kubeadm/internal/controllers/remediation.go (about) 1 /* 2 Copyright 2020 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package controllers 18 19 import ( 20 "context" 21 "encoding/json" 22 "fmt" 23 "time" 24 25 "github.com/blang/semver/v4" 26 "github.com/go-logr/logr" 27 "github.com/pkg/errors" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 kerrors "k8s.io/apimachinery/pkg/util/errors" 30 "k8s.io/klog/v2" 31 ctrl "sigs.k8s.io/controller-runtime" 32 33 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 34 controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" 35 "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" 36 "sigs.k8s.io/cluster-api/util/annotations" 37 "sigs.k8s.io/cluster-api/util/conditions" 38 "sigs.k8s.io/cluster-api/util/patch" 39 ) 40 41 // reconcileUnhealthyMachines tries to remediate KubeadmControlPlane unhealthy machines 42 // based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate 43 func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *internal.ControlPlane) (ret ctrl.Result, retErr error) { 44 log := ctrl.LoggerFrom(ctx) 45 reconciliationTime := time.Now().UTC() 46 47 // Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1) 48 // if the underlying machine is now back to healthy / not deleting. 49 errList := []error{} 50 healthyMachines := controlPlane.HealthyMachines() 51 for _, m := range healthyMachines { 52 if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) && 53 conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) && 54 m.DeletionTimestamp.IsZero() { 55 patchHelper, err := patch.NewHelper(m, r.Client) 56 if err != nil { 57 errList = append(errList, errors.Wrapf(err, "failed to get PatchHelper for machine %s", m.Name)) 58 continue 59 } 60 61 conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition) 62 63 if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ 64 clusterv1.MachineOwnerRemediatedCondition, 65 }}); err != nil { 66 errList = append(errList, errors.Wrapf(err, "failed to patch machine %s", m.Name)) 67 } 68 } 69 } 70 if len(errList) > 0 { 71 return ctrl.Result{}, kerrors.NewAggregate(errList) 72 } 73 74 // Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine) 75 // and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation. 76 unhealthyMachines := controlPlane.UnhealthyMachines() 77 78 // If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil). 79 if len(unhealthyMachines) == 0 { 80 return ctrl.Result{}, nil 81 } 82 83 // Select the machine to be remediated, which is the oldest machine marked as unhealthy. 84 // 85 // NOTE: The current solution is considered acceptable for the most frequent use case (only one unhealthy machine), 86 // however, in the future this could potentially be improved for the scenario where more than one unhealthy machine exists 87 // by considering which machine has lower impact on etcd quorum. 88 machineToBeRemediated := unhealthyMachines.Oldest() 89 90 // Returns if the machine is in the process of being deleted. 91 if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() { 92 return ctrl.Result{}, nil 93 } 94 95 log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized) 96 97 // Returns if another remediation is in progress but the new Machine is not yet created. 98 // Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated 99 // is being deleted to avoid unnecessary logs if no further remediation should be done. 100 if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok { 101 log.Info("Another remediation is already in progress. Skipping remediation.") 102 return ctrl.Result{}, nil 103 } 104 105 patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client) 106 if err != nil { 107 return ctrl.Result{}, err 108 } 109 110 defer func() { 111 // Always attempt to Patch the Machine conditions after each reconcileUnhealthyMachines. 112 if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ 113 clusterv1.MachineOwnerRemediatedCondition, 114 }}); err != nil { 115 log.Error(err, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name) 116 if retErr == nil { 117 retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name) 118 } 119 } 120 }() 121 122 // Before starting remediation, run preflight checks in order to verify it is safe to remediate. 123 // If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition. 124 125 // Check if KCP is allowed to remediate considering retry limits: 126 // - Remediation cannot happen because retryPeriod is not yet expired. 127 // - KCP already reached MaxRetries limit. 128 remediationInProgressData, canRemediate, err := r.checkRetryLimits(log, machineToBeRemediated, controlPlane, reconciliationTime) 129 if err != nil { 130 return ctrl.Result{}, err 131 } 132 if !canRemediate { 133 // NOTE: log lines and conditions surfacing why it is not possible to remediate are set by checkRetryLimits. 134 return ctrl.Result{}, nil 135 } 136 137 if controlPlane.KCP.Status.Initialized { 138 // Executes checks that apply only if the control plane is already initialized; in this case KCP can 139 // remediate only if it can safely assume that the operation preserves the operation state of the 140 // existing cluster (or at least it doesn't make it worse). 141 142 // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. 143 if controlPlane.Machines.Len() <= 1 { 144 log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len()) 145 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") 146 return ctrl.Result{}, nil 147 } 148 149 // The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state. 150 if controlPlane.HasDeletingMachine() { 151 log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation") 152 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") 153 return ctrl.Result{}, nil 154 } 155 156 // Remediation MUST preserve etcd quorum. This rule ensures that KCP will not remove a member that would result in etcd 157 // losing a majority of members and thus become unable to field new requests. 158 if controlPlane.IsEtcdManaged() { 159 canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated) 160 if err != nil { 161 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) 162 return ctrl.Result{}, err 163 } 164 if !canSafelyRemediate { 165 log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation") 166 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") 167 return ctrl.Result{}, nil 168 } 169 } 170 171 // Start remediating the unhealthy control plane machine by deleting it. 172 // A new machine will come up completing the operation as part of the regular reconcile. 173 174 // If the control plane is initialized, before deleting the machine: 175 // - if the machine hosts the etcd leader, forward etcd leadership to another machine. 176 // - delete the etcd member hosted on the machine being deleted. 177 // - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0) 178 workloadCluster, err := controlPlane.GetWorkloadCluster(ctx) 179 if err != nil { 180 log.Error(err, "Failed to create client to workload cluster") 181 return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") 182 } 183 184 // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. 185 if controlPlane.IsEtcdManaged() { 186 etcdLeaderCandidate := controlPlane.HealthyMachines().Newest() 187 if etcdLeaderCandidate == nil { 188 log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") 189 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, 190 "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") 191 return ctrl.Result{}, nil 192 } 193 if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil { 194 log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate)) 195 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) 196 return ctrl.Result{}, err 197 } 198 if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil { 199 log.Error(err, "Failed to remove etcd member for machine") 200 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) 201 return ctrl.Result{}, err 202 } 203 } 204 205 parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) 206 if err != nil { 207 return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version) 208 } 209 210 if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil { 211 log.Error(err, "Failed to remove machine from kubeadm ConfigMap") 212 return ctrl.Result{}, err 213 } 214 } 215 216 // Delete the machine 217 if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil { 218 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) 219 return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name) 220 } 221 222 // Surface the operation is in progress. 223 log.Info("Remediating unhealthy machine") 224 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") 225 226 // Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation. 227 remediationInProgressValue, err := remediationInProgressData.Marshal() 228 if err != nil { 229 return ctrl.Result{}, err 230 } 231 232 // Set annotations tracking remediation details so they can be picked up by the machine 233 // that will be created as part of the scale up action that completes the remediation. 234 annotations.AddAnnotations(controlPlane.KCP, map[string]string{ 235 controlplanev1.RemediationInProgressAnnotation: remediationInProgressValue, 236 }) 237 238 return ctrl.Result{Requeue: true}, nil 239 } 240 241 // checkRetryLimits checks if KCP is allowed to remediate considering retry limits: 242 // - Remediation cannot happen because retryPeriod is not yet expired. 243 // - KCP already reached the maximum number of retries for a machine. 244 // NOTE: Counting the number of retries is required In order to prevent infinite remediation e.g. in case the 245 // first Control Plane machine is failing due to quota issue. 246 func (r *KubeadmControlPlaneReconciler) checkRetryLimits(log logr.Logger, machineToBeRemediated *clusterv1.Machine, controlPlane *internal.ControlPlane, reconciliationTime time.Time) (*RemediationData, bool, error) { 247 // Get last remediation info from the machine. 248 var lastRemediationData *RemediationData 249 if value, ok := machineToBeRemediated.Annotations[controlplanev1.RemediationForAnnotation]; ok { 250 l, err := RemediationDataFromAnnotation(value) 251 if err != nil { 252 return nil, false, err 253 } 254 lastRemediationData = l 255 } 256 257 remediationInProgressData := &RemediationData{ 258 Machine: machineToBeRemediated.Name, 259 Timestamp: metav1.Time{Time: reconciliationTime}, 260 RetryCount: 0, 261 } 262 263 // If there is no last remediation, this is the first try of a new retry sequence. 264 if lastRemediationData == nil { 265 return remediationInProgressData, true, nil 266 } 267 268 // Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults. 269 minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod 270 if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil { 271 minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration 272 } 273 retryPeriod := time.Duration(0) 274 if controlPlane.KCP.Spec.RemediationStrategy != nil { 275 retryPeriod = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration 276 } 277 278 // Gets the timestamp of the last remediation; if missing, default to a value 279 // that ensures both MinHealthyPeriod and RetryPeriod are expired. 280 // NOTE: this could potentially lead to executing more retries than expected or to executing retries before than 281 // expected, but this is considered acceptable when the system recovers from someone/something changes or deletes 282 // the RemediationForAnnotation on Machines. 283 lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod)) 284 if !lastRemediationData.Timestamp.IsZero() { 285 lastRemediationTime = lastRemediationData.Timestamp.Time 286 } 287 288 // Once we get here we already know that there was a last remediation for the Machine. 289 // If the current remediation is happening before minHealthyPeriod is expired, then KCP considers this 290 // as a remediation for the same previously unhealthy machine. 291 // NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp), 292 // this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case. 293 var retryForSameMachineInProgress bool 294 if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) { 295 retryForSameMachineInProgress = true 296 log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine)) 297 } 298 299 // If the retry for the same machine is not in progress, this is the first try of a new retry sequence. 300 if !retryForSameMachineInProgress { 301 return remediationInProgressData, true, nil 302 } 303 304 // If the remediation is for the same machine, carry over the retry count. 305 remediationInProgressData.RetryCount = lastRemediationData.RetryCount 306 307 // Check if remediation can happen because retryPeriod is passed. 308 if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) { 309 log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation", retryPeriod)) 310 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryPeriod)", retryPeriod) 311 return remediationInProgressData, false, nil 312 } 313 314 // Check if remediation can happen because of maxRetry is not reached yet, if defined. 315 if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MaxRetry != nil { 316 maxRetry := int(*controlPlane.KCP.Spec.RemediationStrategy.MaxRetry) 317 if remediationInProgressData.RetryCount >= maxRetry { 318 log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed %d times (MaxRetry %d). Skipping remediation", remediationInProgressData.RetryCount, maxRetry)) 319 conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed %d times (MaxRetry)", maxRetry) 320 return remediationInProgressData, false, nil 321 } 322 } 323 324 // All the check passed, increase the remediation retry count. 325 remediationInProgressData.RetryCount++ 326 327 return remediationInProgressData, true, nil 328 } 329 330 // max calculates the maximum duration. 331 func max(x, y time.Duration) time.Duration { 332 if x < y { 333 return y 334 } 335 return x 336 } 337 338 // canSafelyRemoveEtcdMember assess if it is possible to remove the member hosted on the machine to be remediated 339 // without loosing etcd quorum. 340 // 341 // The answer mostly depend on the existence of other failing members on top of the one being deleted, and according 342 // to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance): 343 // - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target 344 // cluster size after deletion is 2, fault tolerance 0) 345 // - 5 CP cluster tolerates 1 additional failing members on top of the one being deleted (the target 346 // cluster size after deletion is 4, fault tolerance 1) 347 // - 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target 348 // cluster size after deletion is 6, fault tolerance 2) 349 // - etc. 350 // 351 // NOTE: this func assumes the list of members in sync with the list of machines/nodes, it is required to call reconcileEtcdMembers 352 // as well as reconcileControlPlaneConditions before this. 353 func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) { 354 log := ctrl.LoggerFrom(ctx) 355 356 workloadCluster, err := controlPlane.GetWorkloadCluster(ctx) 357 if err != nil { 358 return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name) 359 } 360 361 // Gets the etcd status 362 363 // This makes it possible to have a set of etcd members status different from the MHC unhealthy/unhealthy conditions. 364 etcdMembers, err := workloadCluster.EtcdMembers(ctx) 365 if err != nil { 366 return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name) 367 } 368 369 currentTotalMembers := len(etcdMembers) 370 371 log.Info("etcd cluster before remediation", 372 "currentTotalMembers", currentTotalMembers, 373 "currentMembers", etcdMembers) 374 375 // Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated. 376 targetTotalMembers := 0 377 targetUnhealthyMembers := 0 378 379 healthyMembers := []string{} 380 unhealthyMembers := []string{} 381 for _, etcdMember := range etcdMembers { 382 // Skip the machine to be deleted because it won't be part of the target etcd cluster. 383 if machineToBeRemediated.Status.NodeRef != nil && machineToBeRemediated.Status.NodeRef.Name == etcdMember { 384 continue 385 } 386 387 // Include the member in the target etcd cluster. 388 targetTotalMembers++ 389 390 // Search for the machine corresponding to the etcd member. 391 var machine *clusterv1.Machine 392 for _, m := range controlPlane.Machines { 393 if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember { 394 machine = m 395 break 396 } 397 } 398 399 // If an etcd member does not have a corresponding machine it is not possible to retrieve etcd member health, 400 // so KCP is assuming the worst scenario and considering the member unhealthy. 401 // 402 // NOTE: This should not happen given that KCP is running reconcileEtcdMembers before calling this method. 403 if machine == nil { 404 log.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember) 405 targetUnhealthyMembers++ 406 unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (no machine)", etcdMember)) 407 continue 408 } 409 410 // Check member health as reported by machine's health conditions 411 if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) { 412 targetUnhealthyMembers++ 413 unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name)) 414 continue 415 } 416 417 healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name)) 418 } 419 420 // See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for fault tolerance formula explanation. 421 targetQuorum := (targetTotalMembers / 2.0) + 1 422 canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum 423 424 log.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name), 425 "healthyMembers", healthyMembers, 426 "unhealthyMembers", unhealthyMembers, 427 "targetTotalMembers", targetTotalMembers, 428 "targetQuorum", targetQuorum, 429 "targetUnhealthyMembers", targetUnhealthyMembers, 430 "canSafelyRemediate", canSafelyRemediate) 431 432 return canSafelyRemediate, nil 433 } 434 435 // RemediationData struct is used to keep track of information stored in the RemediationInProgressAnnotation in KCP 436 // during remediation and then into the RemediationForAnnotation on the replacement machine once it is created. 437 type RemediationData struct { 438 // Machine is the machine name of the latest machine being remediated. 439 Machine string `json:"machine"` 440 441 // Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC. 442 Timestamp metav1.Time `json:"timestamp"` 443 444 // RetryCount used to keep track of remediation retry for the last remediated machine. 445 // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails. 446 RetryCount int `json:"retryCount"` 447 } 448 449 // RemediationDataFromAnnotation gets RemediationData from an annotation value. 450 func RemediationDataFromAnnotation(value string) (*RemediationData, error) { 451 ret := &RemediationData{} 452 if err := json.Unmarshal([]byte(value), ret); err != nil { 453 return nil, errors.Wrapf(err, "failed to unmarshal value %s for %s annotation", value, clusterv1.RemediationInProgressReason) 454 } 455 return ret, nil 456 } 457 458 // Marshal an RemediationData into an annotation value. 459 func (r *RemediationData) Marshal() (string, error) { 460 b, err := json.Marshal(r) 461 if err != nil { 462 return "", errors.Wrapf(err, "failed to marshal value for %s annotation", clusterv1.RemediationInProgressReason) 463 } 464 return string(b), nil 465 } 466 467 // ToStatus converts a RemediationData into a LastRemediationStatus struct. 468 func (r *RemediationData) ToStatus() *controlplanev1.LastRemediationStatus { 469 return &controlplanev1.LastRemediationStatus{ 470 Machine: r.Machine, 471 Timestamp: r.Timestamp, 472 RetryCount: int32(r.RetryCount), 473 } 474 }