sigs.k8s.io/cluster-api@v1.7.1/internal/controllers/machineset/machineset_controller.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machineset

import (
	"context"
	"fmt"
	"strings"
	"time"

	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/labels"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apiserver/pkg/storage/names"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/handler"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/controllers/external"
	"sigs.k8s.io/cluster-api/controllers/noderefutil"
	"sigs.k8s.io/cluster-api/controllers/remote"
	"sigs.k8s.io/cluster-api/internal/contract"
	"sigs.k8s.io/cluster-api/internal/controllers/machine"
	"sigs.k8s.io/cluster-api/internal/util/ssa"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/collections"
	"sigs.k8s.io/cluster-api/util/conditions"
	utilconversion "sigs.k8s.io/cluster-api/util/conversion"
	"sigs.k8s.io/cluster-api/util/labels/format"
	clog "sigs.k8s.io/cluster-api/util/log"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/cluster-api/util/predicates"
)

var (
	// machineSetKind contains the schema.GroupVersionKind for the MachineSet type.
	machineSetKind = clusterv1.GroupVersion.WithKind("MachineSet")

	// stateConfirmationTimeout is the amount of time allowed to wait for desired state.
	stateConfirmationTimeout = 10 * time.Second

	// stateConfirmationInterval is the amount of time between polling for the desired state.
	// The polling is against a local memory cache.
	stateConfirmationInterval = 100 * time.Millisecond
)

const machineSetManagerName = "capi-machineset"

// +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io;bootstrap.cluster.x-k8s.io,resources=*,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinesets;machinesets/status;machinesets/finalizers,verbs=get;list;watch;create;update;patch;delete

// Reconciler reconciles a MachineSet object.
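// It creates and deletes Machines so that the number of Machines matches Spec.Replicas, adopts
// orphaned Machines that match its selector, propagates in-place mutable fields to Machines and
// their BootstrapConfig/InfrastructureMachine objects via Server-Side-Apply, and deletes Machines
// that the MachineHealthCheck controller has marked for remediation.
//
// Illustrative sketch only (the surrounding variable names are assumptions, not taken from this
// file): the reconciler is typically wired into a controller-runtime manager roughly like
//
//	r := &Reconciler{
//		Client:                    mgr.GetClient(),
//		UnstructuredCachingClient: unstructuredCachingClient,
//		APIReader:                 mgr.GetAPIReader(),
//		Tracker:                   clusterCacheTracker,
//		WatchFilterValue:          watchFilterValue,
//	}
//	if err := r.SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: concurrency}); err != nil {
//		// handle setup error
//	}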
type Reconciler struct {
	Client                    client.Client
	UnstructuredCachingClient client.Client
	APIReader                 client.Reader
	Tracker                   *remote.ClusterCacheTracker

	// WatchFilterValue is the label value used to filter events prior to reconciliation.
	WatchFilterValue string

	ssaCache ssa.Cache
	recorder record.EventRecorder
}

func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
	clusterToMachineSets, err := util.ClusterToTypedObjectsMapper(mgr.GetClient(), &clusterv1.MachineSetList{}, mgr.GetScheme())
	if err != nil {
		return err
	}

	err = ctrl.NewControllerManagedBy(mgr).
		For(&clusterv1.MachineSet{}).
		Owns(&clusterv1.Machine{}).
		Watches(
			&clusterv1.Machine{},
			handler.EnqueueRequestsFromMapFunc(r.MachineToMachineSets),
		).
		WithOptions(options).
		WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)).
		Watches(
			&clusterv1.Cluster{},
			handler.EnqueueRequestsFromMapFunc(clusterToMachineSets),
			builder.WithPredicates(
				// TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources?
				predicates.All(ctrl.LoggerFrom(ctx),
					predicates.ClusterUnpaused(ctrl.LoggerFrom(ctx)),
					predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue),
				),
			),
		).Complete(r)
	if err != nil {
		return errors.Wrap(err, "failed setting up with a controller manager")
	}

	r.recorder = mgr.GetEventRecorderFor("machineset-controller")
	r.ssaCache = ssa.NewCache()
	return nil
}

func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
	machineSet := &clusterv1.MachineSet{}
	if err := r.Client.Get(ctx, req.NamespacedName, machineSet); err != nil {
		if apierrors.IsNotFound(err) {
			// Object not found, return. Created objects are automatically garbage collected.
			// For additional cleanup logic use finalizers.
			return ctrl.Result{}, nil
		}
		// Error reading the object - requeue the request.
		return ctrl.Result{}, err
	}

	// AddOwners adds the owners of MachineSet as k/v pairs to the logger.
	// Specifically, it will add MachineDeployment.
	ctx, log, err := clog.AddOwners(ctx, r.Client, machineSet)
	if err != nil {
		return ctrl.Result{}, err
	}

	log = log.WithValues("Cluster", klog.KRef(machineSet.ObjectMeta.Namespace, machineSet.Spec.ClusterName))
	ctx = ctrl.LoggerInto(ctx, log)

	cluster, err := util.GetClusterByName(ctx, r.Client, machineSet.ObjectMeta.Namespace, machineSet.Spec.ClusterName)
	if err != nil {
		return ctrl.Result{}, err
	}

	// Return early if the object or Cluster is paused.
	if annotations.IsPaused(cluster, machineSet) {
		log.Info("Reconciliation is paused for this object")
		return ctrl.Result{}, nil
	}

	// Initialize the patch helper.
	patchHelper, err := patch.NewHelper(machineSet, r.Client)
	if err != nil {
		return ctrl.Result{}, err
	}

	defer func() {
		// Always attempt to patch the object and status after each reconciliation.
		if err := patchMachineSet(ctx, patchHelper, machineSet); err != nil {
			reterr = kerrors.NewAggregate([]error{reterr, err})
		}
	}()

	// Ignore deleted MachineSets; this can happen when foregroundDeletion
	// is enabled.
	if !machineSet.DeletionTimestamp.IsZero() {
		return ctrl.Result{}, nil
	}

	result, err := r.reconcile(ctx, cluster, machineSet)
	if err != nil {
		// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
		// the current cluster because of concurrent access.
		if errors.Is(err, remote.ErrClusterLocked) {
			log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
			return ctrl.Result{RequeueAfter: time.Minute}, nil
		}
		r.recorder.Eventf(machineSet, corev1.EventTypeWarning, "ReconcileError", "%v", err)
	}
	return result, err
}

func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet *clusterv1.MachineSet, options ...patch.Option) error {
	// Always update the readyCondition by summarizing the state of other conditions.
	conditions.SetSummary(machineSet,
		conditions.WithConditions(
			clusterv1.MachinesCreatedCondition,
			clusterv1.ResizedCondition,
			clusterv1.MachinesReadyCondition,
		),
	)

	// Patch the object, ignoring conflicts on the conditions owned by this controller.
	options = append(options,
		patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
			clusterv1.ReadyCondition,
			clusterv1.MachinesCreatedCondition,
			clusterv1.ResizedCondition,
			clusterv1.MachinesReadyCondition,
		}},
	)
	return patchHelper.Patch(ctx, machineSet, options...)
}

func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, machineSet *clusterv1.MachineSet) (ctrl.Result, error) {
	log := ctrl.LoggerFrom(ctx)

	// Reconcile and retrieve the Cluster object.
	if machineSet.Labels == nil {
		machineSet.Labels = make(map[string]string)
	}
	machineSet.Labels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName

	// If the MachineSet is a stand-alone one, meaning it did not originate from a MachineDeployment,
	// then set it as directly owned by the Cluster (if not already present).
	if r.shouldAdopt(machineSet) {
		machineSet.SetOwnerReferences(util.EnsureOwnerRef(machineSet.GetOwnerReferences(), metav1.OwnerReference{
			APIVersion: clusterv1.GroupVersion.String(),
			Kind:       "Cluster",
			Name:       cluster.Name,
			UID:        cluster.UID,
		}))
	}

	// Make sure to reconcile the external infrastructure reference.
	if err := reconcileExternalTemplateReference(ctx, r.UnstructuredCachingClient, cluster, &machineSet.Spec.Template.Spec.InfrastructureRef); err != nil {
		return ctrl.Result{}, err
	}
	// Make sure to reconcile the external bootstrap reference, if any.
	if machineSet.Spec.Template.Spec.Bootstrap.ConfigRef != nil {
		if err := reconcileExternalTemplateReference(ctx, r.UnstructuredCachingClient, cluster, machineSet.Spec.Template.Spec.Bootstrap.ConfigRef); err != nil {
			return ctrl.Result{}, err
		}
	}

	// Make sure the selector and template are scoped to the same cluster.
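	// Note: Enforcing the ClusterNameLabel on both the selector and the template (below) guarantees
	// that Machines created from this template always match the MachineSet's selector within the
	// owning Cluster.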
	if machineSet.Spec.Selector.MatchLabels == nil {
		machineSet.Spec.Selector.MatchLabels = make(map[string]string)
	}

	if machineSet.Spec.Template.Labels == nil {
		machineSet.Spec.Template.Labels = make(map[string]string)
	}

	machineSet.Spec.Selector.MatchLabels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName
	machineSet.Spec.Template.Labels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName

	selectorMap, err := metav1.LabelSelectorAsMap(&machineSet.Spec.Selector)
	if err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "failed to convert MachineSet %q label selector to a map", machineSet.Name)
	}

	// Get all Machines linked to this MachineSet.
	allMachines := &clusterv1.MachineList{}
	err = r.Client.List(ctx,
		allMachines,
		client.InNamespace(machineSet.Namespace),
		client.MatchingLabels(selectorMap),
	)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "failed to list machines")
	}

	// Filter out irrelevant machines (i.e. IsControlledBy something else) and claim orphaned machines.
	// Machines in deleted state are deliberately not excluded https://github.com/kubernetes-sigs/cluster-api/pull/3434.
	filteredMachines := make([]*clusterv1.Machine, 0, len(allMachines.Items))
	for idx := range allMachines.Items {
		machine := &allMachines.Items[idx]
		log := log.WithValues("Machine", klog.KObj(machine))
		if shouldExcludeMachine(machineSet, machine) {
			continue
		}

		// Attempt to adopt machine if it meets previous conditions and it has no controller references.
		if metav1.GetControllerOf(machine) == nil {
			if err := r.adoptOrphan(ctx, machineSet, machine); err != nil {
				log.Error(err, "Failed to adopt Machine")
				r.recorder.Eventf(machineSet, corev1.EventTypeWarning, "FailedAdopt", "Failed to adopt Machine %q: %v", machine.Name, err)
				continue
			}
			log.Info("Adopted Machine")
			r.recorder.Eventf(machineSet, corev1.EventTypeNormal, "SuccessfulAdopt", "Adopted Machine %q", machine.Name)
		}

		filteredMachines = append(filteredMachines, machine)
	}

	result := ctrl.Result{}

	reconcileUnhealthyMachinesResult, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, filteredMachines)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "failed to reconcile unhealthy machines")
	}
	result = util.LowestNonZeroResult(result, reconcileUnhealthyMachinesResult)

	if err := r.syncMachines(ctx, machineSet, filteredMachines); err != nil {
		return ctrl.Result{}, errors.Wrap(err, "failed to update Machines")
	}

	syncReplicasResult, syncErr := r.syncReplicas(ctx, cluster, machineSet, filteredMachines)
	result = util.LowestNonZeroResult(result, syncReplicasResult)

	// Always update status as machines come up or die.
	if err := r.updateStatus(ctx, cluster, machineSet, filteredMachines); err != nil {
		return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate([]error{err, syncErr}), "failed to update MachineSet's Status")
	}

	if syncErr != nil {
		return ctrl.Result{}, errors.Wrapf(syncErr, "failed to sync MachineSet replicas")
	}

	var replicas int32
	if machineSet.Spec.Replicas != nil {
		replicas = *machineSet.Spec.Replicas
	}

	// Resync the MachineSet after MinReadySeconds as a last line of defense to guard against clock-skew.
	// Clock-skew is an issue as it may impact whether an available replica is counted as a ready replica.
	// A replica is available if the amount of time since last transition exceeds MinReadySeconds.
	// If there was a clock skew, checking whether the amount of time since last transition to ready state
	// exceeds MinReadySeconds could be incorrect.
	// To avoid an available replica stuck in the ready state, we force a reconcile after MinReadySeconds,
	// at which point it should confirm any available replica to be available.
	if machineSet.Spec.MinReadySeconds > 0 &&
		machineSet.Status.ReadyReplicas == replicas &&
		machineSet.Status.AvailableReplicas != replicas {
		minReadyResult := ctrl.Result{RequeueAfter: time.Duration(machineSet.Spec.MinReadySeconds) * time.Second}
		result = util.LowestNonZeroResult(result, minReadyResult)
		return result, nil
	}

	// Quickly reconcile until the nodes become Ready.
	if machineSet.Status.ReadyReplicas != replicas {
		result = util.LowestNonZeroResult(result, ctrl.Result{RequeueAfter: 15 * time.Second})
		return result, nil
	}

	return result, nil
}

// syncMachines updates Machines, InfrastructureMachine and BootstrapConfig to propagate in-place mutable fields
// from the MachineSet.
// Note: It also cleans up managed fields of all Machines so that Machines that were
// created/patched before (< v1.4.0) the controller adopted Server-Side-Apply (SSA) can also work with SSA.
// Note: For InfrastructureMachines and BootstrapConfigs it also drops ownership of "metadata.labels" and
// "metadata.annotations" from "manager" so that "capi-machineset" can own these fields and can work with SSA.
// Otherwise fields would be co-owned by our "old" "manager" and "capi-machineset" and then we would not be
// able to e.g. drop labels and annotations.
func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.MachineSet, machines []*clusterv1.Machine) error {
	log := ctrl.LoggerFrom(ctx)
	for i := range machines {
		m := machines[i]
		// If the machine is already being deleted, we don't need to update it.
		if !m.DeletionTimestamp.IsZero() {
			continue
		}

		// Cleanup managed fields of all Machines.
		// We do this so that Machines that were created/patched before the controller adopted Server-Side-Apply (SSA)
		// (< v1.4.0) can also work with SSA. Otherwise, fields would be co-owned by our "old" "manager" and
		// "capi-machineset" and then we would not be able to e.g. drop labels and annotations.
		if err := ssa.CleanUpManagedFieldsForSSAAdoption(ctx, r.Client, m, machineSetManagerName); err != nil {
			return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the Machine %q", m.Name)
		}

		// Update Machine to propagate in-place mutable fields from the MachineSet.
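		// Note: computeDesiredMachine reuses the name, UID, infrastructureRef and bootstrap.configRef of the
		// existing Machine, so the Server-Side-Apply below updates the existing object instead of creating a new one.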
		updatedMachine := r.computeDesiredMachine(machineSet, m)
		err := ssa.Patch(ctx, r.Client, machineSetManagerName, updatedMachine, ssa.WithCachingProxy{Cache: r.ssaCache, Original: m})
		if err != nil {
			log.Error(err, "failed to update Machine", "Machine", klog.KObj(updatedMachine))
			return errors.Wrapf(err, "failed to update Machine %q", klog.KObj(updatedMachine))
		}
		machines[i] = updatedMachine

		infraMachine, err := external.Get(ctx, r.UnstructuredCachingClient, &updatedMachine.Spec.InfrastructureRef, updatedMachine.Namespace)
		if err != nil {
			return errors.Wrapf(err, "failed to get InfrastructureMachine %s",
				klog.KRef(updatedMachine.Spec.InfrastructureRef.Namespace, updatedMachine.Spec.InfrastructureRef.Name))
		}
		// Cleanup managed fields of all InfrastructureMachines to drop ownership of labels and annotations
		// from "manager". We do this so that InfrastructureMachines that are created using the Create method
		// can also work with SSA. Otherwise, labels and annotations would be co-owned by our "old" "manager"
		// and "capi-machineset" and then we would not be able to e.g. drop labels and annotations.
		labelsAndAnnotationsManagedFieldPaths := []contract.Path{
			{"f:metadata", "f:annotations"},
			{"f:metadata", "f:labels"},
		}
		if err := ssa.DropManagedFields(ctx, r.Client, infraMachine, machineSetManagerName, labelsAndAnnotationsManagedFieldPaths); err != nil {
			return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the InfrastructureMachine %s", klog.KObj(infraMachine))
		}
		// Update in-place mutating fields on InfrastructureMachine.
		if err := r.updateExternalObject(ctx, infraMachine, machineSet); err != nil {
			return errors.Wrapf(err, "failed to update InfrastructureMachine %s", klog.KObj(infraMachine))
		}

		if updatedMachine.Spec.Bootstrap.ConfigRef != nil {
			bootstrapConfig, err := external.Get(ctx, r.UnstructuredCachingClient, updatedMachine.Spec.Bootstrap.ConfigRef, updatedMachine.Namespace)
			if err != nil {
				return errors.Wrapf(err, "failed to get BootstrapConfig %s",
					klog.KRef(updatedMachine.Spec.Bootstrap.ConfigRef.Namespace, updatedMachine.Spec.Bootstrap.ConfigRef.Name))
			}
			// Cleanup managed fields of all BootstrapConfigs to drop ownership of labels and annotations
			// from "manager". We do this so that BootstrapConfigs that are created using the Create method
			// can also work with SSA. Otherwise, labels and annotations would be co-owned by our "old" "manager"
			// and "capi-machineset" and then we would not be able to e.g. drop labels and annotations.
			if err := ssa.DropManagedFields(ctx, r.Client, bootstrapConfig, machineSetManagerName, labelsAndAnnotationsManagedFieldPaths); err != nil {
				return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the BootstrapConfig %s", klog.KObj(bootstrapConfig))
			}
			// Update in-place mutating fields on BootstrapConfig.
			if err := r.updateExternalObject(ctx, bootstrapConfig, machineSet); err != nil {
				return errors.Wrapf(err, "failed to update BootstrapConfig %s", klog.KObj(bootstrapConfig))
			}
		}
	}
	return nil
}

// syncReplicas scales Machine resources up or down.
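// When there are fewer Machines than Spec.Replicas, it creates new Machines (together with their
// BootstrapConfig and InfrastructureMachine objects cloned from the templates); when there are more,
// it deletes the lowest-priority Machines as determined by the MachineSet's DeletePolicy.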
func (r *Reconciler) syncReplicas(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, machines []*clusterv1.Machine) (ctrl.Result, error) {
	log := ctrl.LoggerFrom(ctx)
	if ms.Spec.Replicas == nil {
		return ctrl.Result{}, errors.Errorf("the Replicas field in Spec for machineset %v is nil, this should not be allowed", ms.Name)
	}
	diff := len(machines) - int(*(ms.Spec.Replicas))
	switch {
	case diff < 0:
		diff *= -1
		log.Info(fmt.Sprintf("MachineSet is scaling up to %d replicas by creating %d machines", *(ms.Spec.Replicas), diff), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines))
		if ms.Annotations != nil {
			if _, ok := ms.Annotations[clusterv1.DisableMachineCreateAnnotation]; ok {
				log.Info("Automatic creation of new machines disabled for machine set")
				return ctrl.Result{}, nil
			}
		}

		result, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up")
		if err != nil || !result.IsZero() {
			if err != nil {
				// If the error is not nil use that as the message for the condition.
				preflightCheckErrMessage = err.Error()
			}
			conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, preflightCheckErrMessage)
			return result, err
		}

		var (
			machineList []*clusterv1.Machine
			errs        []error
		)

		for i := 0; i < diff; i++ {
			// Create a new logger so the global logger is not modified.
			log := log
			machine := r.computeDesiredMachine(ms, nil)
			// Clone and set the infrastructure and bootstrap references.
			var (
				infraRef, bootstrapRef *corev1.ObjectReference
				err                    error
			)

			// Create the BootstrapConfig if necessary.
			if ms.Spec.Template.Spec.Bootstrap.ConfigRef != nil {
				bootstrapRef, err = external.CreateFromTemplate(ctx, &external.CreateFromTemplateInput{
					Client:      r.UnstructuredCachingClient,
					TemplateRef: ms.Spec.Template.Spec.Bootstrap.ConfigRef,
					Namespace:   machine.Namespace,
					Name:        machine.Name,
					ClusterName: machine.Spec.ClusterName,
					Labels:      machine.Labels,
					Annotations: machine.Annotations,
					OwnerRef: &metav1.OwnerReference{
						APIVersion: clusterv1.GroupVersion.String(),
						Kind:       "MachineSet",
						Name:       ms.Name,
						UID:        ms.UID,
					},
				})
				if err != nil {
					conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.BootstrapTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error())
					return ctrl.Result{}, errors.Wrapf(err, "failed to clone bootstrap configuration from %s %s while creating a machine",
						ms.Spec.Template.Spec.Bootstrap.ConfigRef.Kind,
						klog.KRef(ms.Spec.Template.Spec.Bootstrap.ConfigRef.Namespace, ms.Spec.Template.Spec.Bootstrap.ConfigRef.Name))
				}
				machine.Spec.Bootstrap.ConfigRef = bootstrapRef
				log = log.WithValues(bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name))
			}

			// Create the InfraMachine.
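			// Note: Unlike the bootstrap config above, the infrastructure reference is mandatory,
			// so an InfrastructureMachine is always cloned from the referenced template.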
			infraRef, err = external.CreateFromTemplate(ctx, &external.CreateFromTemplateInput{
				Client:      r.UnstructuredCachingClient,
				TemplateRef: &ms.Spec.Template.Spec.InfrastructureRef,
				Namespace:   machine.Namespace,
				Name:        machine.Name,
				ClusterName: machine.Spec.ClusterName,
				Labels:      machine.Labels,
				Annotations: machine.Annotations,
				OwnerRef: &metav1.OwnerReference{
					APIVersion: clusterv1.GroupVersion.String(),
					Kind:       "MachineSet",
					Name:       ms.Name,
					UID:        ms.UID,
				},
			})
			if err != nil {
				conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.InfrastructureTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error())
				return ctrl.Result{}, errors.Wrapf(err, "failed to clone infrastructure machine from %s %s while creating a machine",
					ms.Spec.Template.Spec.InfrastructureRef.Kind,
					klog.KRef(ms.Spec.Template.Spec.InfrastructureRef.Namespace, ms.Spec.Template.Spec.InfrastructureRef.Name))
			}
			log = log.WithValues(infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name))
			machine.Spec.InfrastructureRef = *infraRef

			// Create the Machine.
			if err := ssa.Patch(ctx, r.Client, machineSetManagerName, machine); err != nil {
				log.Error(err, "Error while creating a machine")
				r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedCreate", "Failed to create machine: %v", err)
				errs = append(errs, err)
				conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.MachineCreationFailedReason,
					clusterv1.ConditionSeverityError, err.Error())

				// Try to cleanup the external objects if the Machine creation failed.
				if err := r.Client.Delete(ctx, util.ObjectReferenceToUnstructured(*infraRef)); !apierrors.IsNotFound(err) {
					log.Error(err, "Failed to cleanup infrastructure machine object after Machine creation error", infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name))
				}
				if bootstrapRef != nil {
					if err := r.Client.Delete(ctx, util.ObjectReferenceToUnstructured(*bootstrapRef)); !apierrors.IsNotFound(err) {
						log.Error(err, "Failed to cleanup bootstrap configuration object after Machine creation error", bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name))
					}
				}
				continue
			}

			log.Info(fmt.Sprintf("Created machine %d of %d", i+1, diff), "Machine", klog.KObj(machine))
			r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulCreate", "Created machine %q", machine.Name)
			machineList = append(machineList, machine)
		}

		if len(errs) > 0 {
			return ctrl.Result{}, kerrors.NewAggregate(errs)
		}
		return ctrl.Result{}, r.waitForMachineCreation(ctx, machineList)
	case diff > 0:
		log.Info(fmt.Sprintf("MachineSet is scaling down to %d replicas by deleting %d machines", *(ms.Spec.Replicas), diff), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines), "deletePolicy", ms.Spec.DeletePolicy)

		deletePriorityFunc, err := getDeletePriorityFunc(ms)
		if err != nil {
			return ctrl.Result{}, err
		}

		var errs []error
		machinesToDelete := getMachinesToDeletePrioritized(machines, diff, deletePriorityFunc)
		for i, machine := range machinesToDelete {
			log := log.WithValues("Machine", klog.KObj(machine))
			if machine.GetDeletionTimestamp().IsZero() {
				log.Info(fmt.Sprintf("Deleting machine %d of %d", i+1, diff))
				if err := r.Client.Delete(ctx, machine); err != nil {
					log.Error(err, "Unable to delete Machine")
					r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedDelete", "Failed to delete machine %q: %v", machine.Name, err)
					errs = append(errs, err)
					continue
				}
				r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted machine %q", machine.Name)
			} else {
				log.Info(fmt.Sprintf("Waiting for machine %d of %d to be deleted", i+1, diff))
			}
		}

		if len(errs) > 0 {
			return ctrl.Result{}, kerrors.NewAggregate(errs)
		}
		return ctrl.Result{}, r.waitForMachineDeletion(ctx, machinesToDelete)
	}

	return ctrl.Result{}, nil
}

// computeDesiredMachine computes the desired Machine.
// This Machine will be used during reconciliation to:
// * create a Machine
// * update an existing Machine
// Because we are using Server-Side-Apply we always have to calculate the full object.
// There are small differences in how we calculate the Machine depending on whether it
// is a create or update. Example: for a new Machine we have to calculate a new name,
// while for an existing Machine we have to use the name of the existing Machine.
func (r *Reconciler) computeDesiredMachine(machineSet *clusterv1.MachineSet, existingMachine *clusterv1.Machine) *clusterv1.Machine {
	desiredMachine := &clusterv1.Machine{
		TypeMeta: metav1.TypeMeta{
			APIVersion: clusterv1.GroupVersion.String(),
			Kind:       "Machine",
		},
		ObjectMeta: metav1.ObjectMeta{
			Name:      names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%s-", machineSet.Name)),
			Namespace: machineSet.Namespace,
			// Note: By setting the ownerRef on creation we signal to the Machine controller that this is not a stand-alone Machine.
			OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(machineSet, machineSetKind)},
			Labels:          map[string]string{},
			Annotations:     map[string]string{},
			Finalizers:      []string{clusterv1.MachineFinalizer},
		},
		Spec: *machineSet.Spec.Template.Spec.DeepCopy(),
	}
	// Set ClusterName.
	desiredMachine.Spec.ClusterName = machineSet.Spec.ClusterName

	// Clean up the refs to the incorrect objects.
	// The InfrastructureRef and the Bootstrap.ConfigRef in Machine should point to the InfrastructureMachine
	// and the BootstrapConfig objects. In the MachineSet these values point to the InfrastructureMachineTemplate
	// and the BootstrapConfigTemplate. Drop the values that were copied over from the MachineSet during DeepCopy
	// to make sure we do not point to incorrect refs.
	// Note: During Machine creation, these refs will be updated with the correct values after the corresponding
	// objects are created.
	desiredMachine.Spec.InfrastructureRef = corev1.ObjectReference{}
	desiredMachine.Spec.Bootstrap.ConfigRef = nil

	// If we are updating an existing Machine, reuse the name, uid, infrastructureRef and bootstrap.configRef
	// from the existingMachine.
	// Note: we use UID to force SSA to update the existing Machine and to not accidentally create a new Machine.
	// infrastructureRef and bootstrap.configRef remain the same for an existing Machine.
	if existingMachine != nil {
		desiredMachine.SetName(existingMachine.Name)
		desiredMachine.SetUID(existingMachine.UID)
		desiredMachine.Spec.Bootstrap.ConfigRef = existingMachine.Spec.Bootstrap.ConfigRef
		desiredMachine.Spec.InfrastructureRef = existingMachine.Spec.InfrastructureRef
	}

	// Set the in-place mutable fields.
	// When we create a new Machine we will just create the Machine with those fields.
	// When we update an existing Machine we will update the fields on the existing Machine (in-place mutate).

	// Set Labels.
	desiredMachine.Labels = machineLabelsFromMachineSet(machineSet)

	// Set Annotations.
	desiredMachine.Annotations = machineAnnotationsFromMachineSet(machineSet)

	// Set all other in-place mutable fields.
	desiredMachine.Spec.NodeDrainTimeout = machineSet.Spec.Template.Spec.NodeDrainTimeout
	desiredMachine.Spec.NodeDeletionTimeout = machineSet.Spec.Template.Spec.NodeDeletionTimeout
	desiredMachine.Spec.NodeVolumeDetachTimeout = machineSet.Spec.Template.Spec.NodeVolumeDetachTimeout

	return desiredMachine
}

// updateExternalObject updates the external object passed in with the
// updated labels and annotations from the MachineSet.
func (r *Reconciler) updateExternalObject(ctx context.Context, obj client.Object, machineSet *clusterv1.MachineSet) error {
	updatedObject := &unstructured.Unstructured{}
	updatedObject.SetGroupVersionKind(obj.GetObjectKind().GroupVersionKind())
	updatedObject.SetNamespace(obj.GetNamespace())
	updatedObject.SetName(obj.GetName())
	// Set the UID to ensure that Server-Side-Apply only performs an update
	// and does not perform an accidental create.
	updatedObject.SetUID(obj.GetUID())

	updatedObject.SetLabels(machineLabelsFromMachineSet(machineSet))
	updatedObject.SetAnnotations(machineAnnotationsFromMachineSet(machineSet))

	if err := ssa.Patch(ctx, r.Client, machineSetManagerName, updatedObject, ssa.WithCachingProxy{Cache: r.ssaCache, Original: obj}); err != nil {
		return errors.Wrapf(err, "failed to update %s", klog.KObj(obj))
	}
	return nil
}

// machineLabelsFromMachineSet computes the labels the Machine created from this MachineSet should have.
func machineLabelsFromMachineSet(machineSet *clusterv1.MachineSet) map[string]string {
	machineLabels := map[string]string{}
	// Note: We can't just set `machineSet.Spec.Template.Labels` directly and thus "share" the labels
	// map between Machine and machineSet.Spec.Template.Labels. This would mean that adding the
	// MachineSetNameLabel and MachineDeploymentNameLabel later on the Machine would also add the labels
	// to machineSet.Spec.Template.Labels and thus modify the labels of the MachineSet.
	for k, v := range machineSet.Spec.Template.Labels {
		machineLabels[k] = v
	}
	// Always set the MachineSetNameLabel.
	// Note: If a client tries to create a MachineSet without a selector, the MachineSet webhook
	// will add this label automatically. But we want this label to always be present even if the MachineSet
	// has a selector which doesn't include it. Therefore, we have to set it here explicitly.
	machineLabels[clusterv1.MachineSetNameLabel] = format.MustFormatValue(machineSet.Name)
	// Propagate the MachineDeploymentNameLabel from MachineSet to Machine if it exists.
	if mdName, ok := machineSet.Labels[clusterv1.MachineDeploymentNameLabel]; ok {
		machineLabels[clusterv1.MachineDeploymentNameLabel] = mdName
	}
	return machineLabels
}

// machineAnnotationsFromMachineSet computes the annotations the Machine created from this MachineSet should have.
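// Note: As with labels, the annotations are copied into a fresh map so the MachineSet's template map is never mutated.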
func machineAnnotationsFromMachineSet(machineSet *clusterv1.MachineSet) map[string]string {
	annotations := map[string]string{}
	for k, v := range machineSet.Spec.Template.Annotations {
		annotations[k] = v
	}
	return annotations
}

// shouldExcludeMachine returns true if the machine should be filtered out, false otherwise.
func shouldExcludeMachine(machineSet *clusterv1.MachineSet, machine *clusterv1.Machine) bool {
	if metav1.GetControllerOf(machine) != nil && !metav1.IsControlledBy(machine, machineSet) {
		return true
	}

	return false
}

// adoptOrphan sets the MachineSet as a controller OwnerReference to the Machine.
func (r *Reconciler) adoptOrphan(ctx context.Context, machineSet *clusterv1.MachineSet, machine *clusterv1.Machine) error {
	patch := client.MergeFrom(machine.DeepCopy())
	newRef := *metav1.NewControllerRef(machineSet, machineSetKind)
	machine.SetOwnerReferences(util.EnsureOwnerRef(machine.GetOwnerReferences(), newRef))
	return r.Client.Patch(ctx, machine, patch)
}

func (r *Reconciler) waitForMachineCreation(ctx context.Context, machineList []*clusterv1.Machine) error {
	log := ctrl.LoggerFrom(ctx)

	for i := 0; i < len(machineList); i++ {
		machine := machineList[i]
		pollErr := wait.PollUntilContextTimeout(ctx, stateConfirmationInterval, stateConfirmationTimeout, true, func(ctx context.Context) (bool, error) {
			key := client.ObjectKey{Namespace: machine.Namespace, Name: machine.Name}
			if err := r.Client.Get(ctx, key, &clusterv1.Machine{}); err != nil {
				if apierrors.IsNotFound(err) {
					return false, nil
				}
				return false, err
			}

			return true, nil
		})

		if pollErr != nil {
			log.Error(pollErr, "Failed waiting for machine object to be created")
			return errors.Wrap(pollErr, "failed waiting for machine object to be created")
		}
	}

	return nil
}

func (r *Reconciler) waitForMachineDeletion(ctx context.Context, machineList []*clusterv1.Machine) error {
	log := ctrl.LoggerFrom(ctx)

	for i := 0; i < len(machineList); i++ {
		machine := machineList[i]
		pollErr := wait.PollUntilContextTimeout(ctx, stateConfirmationInterval, stateConfirmationTimeout, true, func(ctx context.Context) (bool, error) {
			m := &clusterv1.Machine{}
			key := client.ObjectKey{Namespace: machine.Namespace, Name: machine.Name}
			err := r.Client.Get(ctx, key, m)
			if apierrors.IsNotFound(err) || !m.DeletionTimestamp.IsZero() {
				return true, nil
			}
			return false, err
		})

		if pollErr != nil {
			log.Error(pollErr, "Failed waiting for machine object to be deleted")
			return errors.Wrap(pollErr, "failed waiting for machine object to be deleted")
		}
	}
	return nil
}

// MachineToMachineSets is a handler.ToRequestsFunc to be used to enqueue requests for reconciliation
// for MachineSets that might adopt an orphaned Machine.
func (r *Reconciler) MachineToMachineSets(ctx context.Context, o client.Object) []ctrl.Request {
	result := []ctrl.Request{}

	m, ok := o.(*clusterv1.Machine)
	if !ok {
		panic(fmt.Sprintf("Expected a Machine but got a %T", o))
	}

	log := ctrl.LoggerFrom(ctx, "Machine", klog.KObj(m))

	// Check if the controller reference is already set and
	// return an empty result when one is found.
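	// A Machine that already has a controller owner is managed by a MachineSet (or another controller)
	// and therefore must not be considered for adoption.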
	for _, ref := range m.ObjectMeta.GetOwnerReferences() {
		if ref.Controller != nil && *ref.Controller {
			return result
		}
	}

	mss, err := r.getMachineSetsForMachine(ctx, m)
	if err != nil {
		log.Error(err, "Failed getting MachineSets for Machine")
		return nil
	}
	if len(mss) == 0 {
		return nil
	}

	for _, ms := range mss {
		name := client.ObjectKey{Namespace: ms.Namespace, Name: ms.Name}
		result = append(result, ctrl.Request{NamespacedName: name})
	}

	return result
}

func (r *Reconciler) getMachineSetsForMachine(ctx context.Context, m *clusterv1.Machine) ([]*clusterv1.MachineSet, error) {
	if len(m.Labels) == 0 {
		return nil, fmt.Errorf("machine %v has no labels, this is unexpected", client.ObjectKeyFromObject(m))
	}

	msList := &clusterv1.MachineSetList{}
	if err := r.Client.List(ctx, msList, client.InNamespace(m.Namespace)); err != nil {
		return nil, errors.Wrapf(err, "failed to list MachineSets")
	}

	var mss []*clusterv1.MachineSet
	for idx := range msList.Items {
		ms := &msList.Items[idx]
		if machine.HasMatchingLabels(ms.Spec.Selector, m.Labels) {
			mss = append(mss, ms)
		}
	}

	return mss, nil
}

// shouldAdopt returns true if the MachineSet should be adopted as a stand-alone MachineSet directly owned by the Cluster.
func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool {
	// If the MachineSet is controlled by a MachineDeployment, or if it is a stand-alone MachineSet directly owned by the Cluster, then no-op.
	if util.HasOwner(ms.GetOwnerReferences(), clusterv1.GroupVersion.String(), []string{"MachineDeployment", "Cluster"}) {
		return false
	}

	// If the MachineSet originated from a MachineDeployment object, it should not be adopted directly by the Cluster as a stand-alone MachineSet.
	// Note: this is required because after restore from a backup both the MachineSet controller and the
	// MachineDeployment controller are racing to adopt MachineSets, see https://github.com/kubernetes-sigs/cluster-api/issues/7529
	if _, ok := ms.Labels[clusterv1.MachineDeploymentNameLabel]; ok {
		return false
	}
	return true
}

// updateStatus updates the Status field for the MachineSet.
// It checks for the current state of the replicas and updates the Status of the MachineSet.
func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error {
	log := ctrl.LoggerFrom(ctx)
	newStatus := ms.Status.DeepCopy()

	// Copy label selector to its status counterpart in string format.
	// This is necessary for CRDs including scale subresources.
	selector, err := metav1.LabelSelectorAsSelector(&ms.Spec.Selector)
	if err != nil {
		return errors.Wrapf(err, "failed to update status for MachineSet %s/%s", ms.Namespace, ms.Name)
	}
	newStatus.Selector = selector.String()

	// Count the machines whose labels match the labels of this MachineSet's machine template; the
	// matching machines may have more labels than the template. Because the template labels are a
	// superset of the MachineSet's selector, any machine matching the template labels is necessarily
	// part of filteredMachines.
	fullyLabeledReplicasCount := 0
	readyReplicasCount := 0
	availableReplicasCount := 0
	desiredReplicas := *ms.Spec.Replicas
	templateLabel := labels.Set(ms.Spec.Template.Labels).AsSelectorPreValidated()

	for _, machine := range filteredMachines {
		log := log.WithValues("Machine", klog.KObj(machine))

		if templateLabel.Matches(labels.Set(machine.Labels)) {
			fullyLabeledReplicasCount++
		}

		if machine.Status.NodeRef == nil {
			log.V(4).Info("Waiting for the machine controller to set status.NodeRef on the Machine")
			continue
		}

		node, err := r.getMachineNode(ctx, cluster, machine)
		if err != nil && machine.GetDeletionTimestamp().IsZero() {
			log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
			continue
		}

		if noderefutil.IsNodeReady(node) {
			readyReplicasCount++
			if noderefutil.IsNodeAvailable(node, ms.Spec.MinReadySeconds, metav1.Now()) {
				availableReplicasCount++
			}
		} else if machine.GetDeletionTimestamp().IsZero() {
			log.V(4).Info("Waiting for the Kubernetes node on the machine to report ready state")
		}
	}

	newStatus.Replicas = int32(len(filteredMachines))
	newStatus.FullyLabeledReplicas = int32(fullyLabeledReplicasCount)
	newStatus.ReadyReplicas = int32(readyReplicasCount)
	newStatus.AvailableReplicas = int32(availableReplicasCount)

	// Copy the newly calculated status into the machineset
	if ms.Status.Replicas != newStatus.Replicas ||
		ms.Status.FullyLabeledReplicas != newStatus.FullyLabeledReplicas ||
		ms.Status.ReadyReplicas != newStatus.ReadyReplicas ||
		ms.Status.AvailableReplicas != newStatus.AvailableReplicas ||
		ms.Generation != ms.Status.ObservedGeneration {
		log.V(4).Info("Updating status: " +
			fmt.Sprintf("replicas %d->%d (need %d), ", ms.Status.Replicas, newStatus.Replicas, desiredReplicas) +
			fmt.Sprintf("fullyLabeledReplicas %d->%d, ", ms.Status.FullyLabeledReplicas, newStatus.FullyLabeledReplicas) +
			fmt.Sprintf("readyReplicas %d->%d, ", ms.Status.ReadyReplicas, newStatus.ReadyReplicas) +
			fmt.Sprintf("availableReplicas %d->%d, ", ms.Status.AvailableReplicas, newStatus.AvailableReplicas) +
			fmt.Sprintf("observedGeneration %v->%v", ms.Status.ObservedGeneration, ms.Generation))

		// Save the generation number we acted on, otherwise we might wrongfully indicate
		// that we've seen a spec update when we retry.
		newStatus.ObservedGeneration = ms.Generation
		newStatus.DeepCopyInto(&ms.Status)
	}
	switch {
	// We are scaling up
	case newStatus.Replicas < desiredReplicas:
		conditions.MarkFalse(ms, clusterv1.ResizedCondition, clusterv1.ScalingUpReason, clusterv1.ConditionSeverityWarning, "Scaling up MachineSet to %d replicas (actual %d)", desiredReplicas, newStatus.Replicas)
	// We are scaling down
	case newStatus.Replicas > desiredReplicas:
		conditions.MarkFalse(ms, clusterv1.ResizedCondition, clusterv1.ScalingDownReason, clusterv1.ConditionSeverityWarning, "Scaling down MachineSet to %d replicas (actual %d)", desiredReplicas, newStatus.Replicas)
		// This means that there was no error in generating the desired number of machine objects
		conditions.MarkTrue(ms, clusterv1.MachinesCreatedCondition)
	default:
		// Make sure last resize operation is marked as completed.
		// NOTE: we are checking the number of machines ready so we report resize completed only when the machines
		// are actually provisioned (vs reporting completed immediately after the last machine object is created). This convention is also used by KCP.
		if newStatus.ReadyReplicas == newStatus.Replicas {
			if conditions.IsFalse(ms, clusterv1.ResizedCondition) {
				log.Info("All the replicas are ready", "replicas", newStatus.ReadyReplicas)
			}
			conditions.MarkTrue(ms, clusterv1.ResizedCondition)
		}
		// This means that there was no error in generating the desired number of machine objects
		conditions.MarkTrue(ms, clusterv1.MachinesCreatedCondition)
	}

	// Aggregate the operational state of all the machines; while aggregating we are adding the
	// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
	conditions.SetAggregate(ms, clusterv1.MachinesReadyCondition, collections.FromMachines(filteredMachines...).ConditionGetters(), conditions.AddSourceRef())

	return nil
}

func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (*corev1.Node, error) {
	remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
	if err != nil {
		return nil, err
	}
	node := &corev1.Node{}
	if err := remoteClient.Get(ctx, client.ObjectKey{Name: machine.Status.NodeRef.Name}, node); err != nil {
		return nil, errors.Wrapf(err, "error retrieving node %s for machine %s/%s", machine.Status.NodeRef.Name, machine.Namespace, machine.Name)
	}
	return node, nil
}

func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) (ctrl.Result, error) {
	log := ctrl.LoggerFrom(ctx)
	// List all unhealthy machines.
	machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines))
	for _, m := range filteredMachines {
		// filteredMachines contains machines in deleting status to calculate correct status.
		// Skip remediation for those in deleting status.
		if !m.DeletionTimestamp.IsZero() {
			continue
		}
		if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) {
			machinesToRemediate = append(machinesToRemediate, m)
		}
	}

	// If there are no machines to remediate return early.
	if len(machinesToRemediate) == 0 {
		return ctrl.Result{}, nil
	}

	preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine Remediation")
	if err != nil {
		// If err is not nil use that as the preflightCheckErrMessage
		preflightCheckErrMessage = err.Error()
	}

	preflightChecksFailed := err != nil || !preflightChecksResult.IsZero()
	if preflightChecksFailed {
		// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
		// WaitingForRemediationReason reason.
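		// The unhealthy Machines are not deleted here; remediation is retried on a later reconcile
		// once the preflight checks pass.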
		var errs []error
		for _, m := range machinesToRemediate {
			patchHelper, err := patch.NewHelper(m, r.Client)
			if err != nil {
				errs = append(errs, err)
				continue
			}
			conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, preflightCheckErrMessage)
			if err := patchHelper.Patch(ctx, m); err != nil {
				errs = append(errs, err)
			}
		}

		if len(errs) > 0 {
			return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch unhealthy Machines")
		}
		return preflightChecksResult, nil
	}

	// PreflightChecks passed, so it is safe to remediate unhealthy machines.
	// Remediate unhealthy machines by deleting them.
	var errs []error
	for _, m := range machinesToRemediate {
		log.Info(fmt.Sprintf("Deleting Machine %s because it was marked as unhealthy by the MachineHealthCheck controller", klog.KObj(m)))
		patch := client.MergeFrom(m.DeepCopy())
		if err := r.Client.Delete(ctx, m); err != nil {
			errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m)))
			continue
		}
		conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition)
		if err := r.Client.Status().Patch(ctx, m, patch); err != nil && !apierrors.IsNotFound(err) {
			errs = append(errs, errors.Wrapf(err, "failed to update status of Machine %s", klog.KObj(m)))
		}
	}

	if len(errs) > 0 {
		return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines")
	}

	return ctrl.Result{}, nil
}

func reconcileExternalTemplateReference(ctx context.Context, c client.Client, cluster *clusterv1.Cluster, ref *corev1.ObjectReference) error {
	if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) {
		return nil
	}

	if err := utilconversion.UpdateReferenceAPIContract(ctx, c, ref); err != nil {
		return err
	}

	obj, err := external.Get(ctx, c, ref, cluster.Namespace)
	if err != nil {
		return err
	}

	patchHelper, err := patch.NewHelper(obj, c)
	if err != nil {
		return err
	}

	obj.SetOwnerReferences(util.EnsureOwnerRef(obj.GetOwnerReferences(), metav1.OwnerReference{
		APIVersion: clusterv1.GroupVersion.String(),
		Kind:       "Cluster",
		Name:       cluster.Name,
		UID:        cluster.UID,
	}))

	return patchHelper.Patch(ctx, obj)
}