sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/machineset/machineset_controller.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machineset

import (
	"context"
	"fmt"
	"strings"
	"time"

	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/labels"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apiserver/pkg/storage/names"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/handler"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/controllers/external"
	"sigs.k8s.io/cluster-api/controllers/noderefutil"
	"sigs.k8s.io/cluster-api/controllers/remote"
	"sigs.k8s.io/cluster-api/internal/contract"
	"sigs.k8s.io/cluster-api/internal/controllers/machine"
	"sigs.k8s.io/cluster-api/internal/util/ssa"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/collections"
	"sigs.k8s.io/cluster-api/util/conditions"
	utilconversion "sigs.k8s.io/cluster-api/util/conversion"
	"sigs.k8s.io/cluster-api/util/labels/format"
	clog "sigs.k8s.io/cluster-api/util/log"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/cluster-api/util/predicates"
)

var (
	// machineSetKind contains the schema.GroupVersionKind for the MachineSet type.
	machineSetKind = clusterv1.GroupVersion.WithKind("MachineSet")

	// stateConfirmationTimeout is the amount of time allowed to wait for desired state.
	stateConfirmationTimeout = 10 * time.Second

	// stateConfirmationInterval is the amount of time between polling for the desired state.
	// The polling is against a local memory cache.
	stateConfirmationInterval = 100 * time.Millisecond
)

const machineSetManagerName = "capi-machineset"

// +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io;bootstrap.cluster.x-k8s.io,resources=*,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinesets;machinesets/status;machinesets/finalizers,verbs=get;list;watch;create;update;patch;delete

// Reconciler reconciles a MachineSet object.
type Reconciler struct {
	Client                    client.Client
	UnstructuredCachingClient client.Client
	APIReader                 client.Reader
	Tracker                   *remote.ClusterCacheTracker

	// WatchFilterValue is the label value used to filter events prior to reconciliation.
	WatchFilterValue string

	ssaCache ssa.Cache
	recorder record.EventRecorder
}

func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
	clusterToMachineSets, err := util.ClusterToTypedObjectsMapper(mgr.GetClient(), &clusterv1.MachineSetList{}, mgr.GetScheme())
	if err != nil {
		return err
	}

	err = ctrl.NewControllerManagedBy(mgr).
		For(&clusterv1.MachineSet{}).
		Owns(&clusterv1.Machine{}).
		Watches(
			&clusterv1.Machine{},
			handler.EnqueueRequestsFromMapFunc(r.MachineToMachineSets),
		).
		WithOptions(options).
		WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)).
		Watches(
			&clusterv1.Cluster{},
			handler.EnqueueRequestsFromMapFunc(clusterToMachineSets),
			builder.WithPredicates(
				// TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources?
				predicates.All(ctrl.LoggerFrom(ctx),
					predicates.ClusterUnpaused(ctrl.LoggerFrom(ctx)),
					predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue),
				),
			),
		).Complete(r)
	if err != nil {
		return errors.Wrap(err, "failed setting up with a controller manager")
	}

	r.recorder = mgr.GetEventRecorderFor("machineset-controller")
	r.ssaCache = ssa.NewCache()
	return nil
}

func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
	machineSet := &clusterv1.MachineSet{}
	if err := r.Client.Get(ctx, req.NamespacedName, machineSet); err != nil {
		if apierrors.IsNotFound(err) {
			// Object not found, return. Created objects are automatically garbage collected.
			// For additional cleanup logic use finalizers.
			return ctrl.Result{}, nil
		}
		// Error reading the object - requeue the request.
		return ctrl.Result{}, err
	}

	// AddOwners adds the owners of MachineSet as k/v pairs to the logger.
	// Specifically, it will add MachineDeployment.
	ctx, log, err := clog.AddOwners(ctx, r.Client, machineSet)
	if err != nil {
		return ctrl.Result{}, err
	}

	log = log.WithValues("Cluster", klog.KRef(machineSet.ObjectMeta.Namespace, machineSet.Spec.ClusterName))
	ctx = ctrl.LoggerInto(ctx, log)

	cluster, err := util.GetClusterByName(ctx, r.Client, machineSet.ObjectMeta.Namespace, machineSet.Spec.ClusterName)
	if err != nil {
		return ctrl.Result{}, err
	}

	// Return early if the object or Cluster is paused.
	if annotations.IsPaused(cluster, machineSet) {
		log.Info("Reconciliation is paused for this object")
		return ctrl.Result{}, nil
	}

	// Initialize the patch helper.
	patchHelper, err := patch.NewHelper(machineSet, r.Client)
	if err != nil {
		return ctrl.Result{}, err
	}

	defer func() {
		// Always attempt to patch the object and status after each reconciliation.
		if err := patchMachineSet(ctx, patchHelper, machineSet); err != nil {
			reterr = kerrors.NewAggregate([]error{reterr, err})
		}
	}()

	// Ignore deleted MachineSets; this can happen when foregroundDeletion is enabled.
	if !machineSet.DeletionTimestamp.IsZero() {
		return ctrl.Result{}, nil
	}

	result, err := r.reconcile(ctx, cluster, machineSet)
	if err != nil {
		// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
		// the current cluster because of concurrent access.
		if errors.Is(err, remote.ErrClusterLocked) {
			log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
			return ctrl.Result{Requeue: true}, nil
		}
		r.recorder.Eventf(machineSet, corev1.EventTypeWarning, "ReconcileError", "%v", err)
	}
	return result, err
}

func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet *clusterv1.MachineSet, options ...patch.Option) error {
	// Always update the readyCondition by summarizing the state of other conditions.
	conditions.SetSummary(machineSet,
		conditions.WithConditions(
			clusterv1.MachinesCreatedCondition,
			clusterv1.ResizedCondition,
			clusterv1.MachinesReadyCondition,
		),
	)

	// Patch the object, ignoring conflicts on the conditions owned by this controller.
	options = append(options,
		patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
			clusterv1.ReadyCondition,
			clusterv1.MachinesCreatedCondition,
			clusterv1.ResizedCondition,
			clusterv1.MachinesReadyCondition,
		}},
	)
	return patchHelper.Patch(ctx, machineSet, options...)
}

func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, machineSet *clusterv1.MachineSet) (ctrl.Result, error) {
	log := ctrl.LoggerFrom(ctx)

	// Reconcile and retrieve the Cluster object.
	if machineSet.Labels == nil {
		machineSet.Labels = make(map[string]string)
	}
	machineSet.Labels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName

	// If the MachineSet is a stand-alone one, meaning it did not originate from a MachineDeployment, then set it as directly
	// owned by the Cluster (if not already present).
	if r.shouldAdopt(machineSet) {
		machineSet.SetOwnerReferences(util.EnsureOwnerRef(machineSet.GetOwnerReferences(), metav1.OwnerReference{
			APIVersion: clusterv1.GroupVersion.String(),
			Kind:       "Cluster",
			Name:       cluster.Name,
			UID:        cluster.UID,
		}))
	}

	// Make sure to reconcile the external infrastructure reference.
	if err := reconcileExternalTemplateReference(ctx, r.UnstructuredCachingClient, cluster, &machineSet.Spec.Template.Spec.InfrastructureRef); err != nil {
		return ctrl.Result{}, err
	}
	// Make sure to reconcile the external bootstrap reference, if any.
	if machineSet.Spec.Template.Spec.Bootstrap.ConfigRef != nil {
		if err := reconcileExternalTemplateReference(ctx, r.UnstructuredCachingClient, cluster, machineSet.Spec.Template.Spec.Bootstrap.ConfigRef); err != nil {
			return ctrl.Result{}, err
		}
	}

	// Make sure the selector and the template are in the same cluster.
	if machineSet.Spec.Selector.MatchLabels == nil {
		machineSet.Spec.Selector.MatchLabels = make(map[string]string)
	}

	if machineSet.Spec.Template.Labels == nil {
		machineSet.Spec.Template.Labels = make(map[string]string)
	}

	machineSet.Spec.Selector.MatchLabels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName
	machineSet.Spec.Template.Labels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName

	selectorMap, err := metav1.LabelSelectorAsMap(&machineSet.Spec.Selector)
	if err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "failed to convert MachineSet %q label selector to a map", machineSet.Name)
	}

	// Get all Machines linked to this MachineSet.
	allMachines := &clusterv1.MachineList{}
	err = r.Client.List(ctx,
		allMachines,
		client.InNamespace(machineSet.Namespace),
		client.MatchingLabels(selectorMap),
	)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "failed to list machines")
	}

	// Filter out irrelevant machines (i.e. IsControlledBy something else) and claim orphaned machines.
	// Machines in deleted state are deliberately not excluded https://github.com/kubernetes-sigs/cluster-api/pull/3434.
	filteredMachines := make([]*clusterv1.Machine, 0, len(allMachines.Items))
	for idx := range allMachines.Items {
		machine := &allMachines.Items[idx]
		log := log.WithValues("Machine", klog.KObj(machine))
		if shouldExcludeMachine(machineSet, machine) {
			continue
		}

		// Attempt to adopt the machine if it meets the previous conditions and it has no controller references.
		if metav1.GetControllerOf(machine) == nil {
			if err := r.adoptOrphan(ctx, machineSet, machine); err != nil {
				log.Error(err, "Failed to adopt Machine")
				r.recorder.Eventf(machineSet, corev1.EventTypeWarning, "FailedAdopt", "Failed to adopt Machine %q: %v", machine.Name, err)
				continue
			}
			log.Info("Adopted Machine")
			r.recorder.Eventf(machineSet, corev1.EventTypeNormal, "SuccessfulAdopt", "Adopted Machine %q", machine.Name)
		}

		filteredMachines = append(filteredMachines, machine)
	}

	result := ctrl.Result{}

	reconcileUnhealthyMachinesResult, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, filteredMachines)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "failed to reconcile unhealthy machines")
	}
	result = util.LowestNonZeroResult(result, reconcileUnhealthyMachinesResult)

	if err := r.syncMachines(ctx, machineSet, filteredMachines); err != nil {
		return ctrl.Result{}, errors.Wrap(err, "failed to update Machines")
	}

	syncReplicasResult, syncErr := r.syncReplicas(ctx, cluster, machineSet, filteredMachines)
	result = util.LowestNonZeroResult(result, syncReplicasResult)

	// Always update the status as machines come up or die.
	if err := r.updateStatus(ctx, cluster, machineSet, filteredMachines); err != nil {
		return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate([]error{err, syncErr}), "failed to update MachineSet's Status")
	}

	if syncErr != nil {
		return ctrl.Result{}, errors.Wrapf(syncErr, "failed to sync MachineSet replicas")
	}

	var replicas int32
	if machineSet.Spec.Replicas != nil {
		replicas = *machineSet.Spec.Replicas
	}

	// Resync the MachineSet after MinReadySeconds as a last line of defense to guard against clock-skew.
	// Clock-skew is an issue as it may impact whether an available replica is counted as a ready replica.
	// A replica is available if the amount of time since last transition exceeds MinReadySeconds.
	// If there was a clock skew, checking whether the amount of time since last transition to ready state
	// exceeds MinReadySeconds could be incorrect.
	// To avoid an available replica stuck in the ready state, we force a reconcile after MinReadySeconds,
	// at which point it should confirm any available replica to be available.
	if machineSet.Spec.MinReadySeconds > 0 &&
		machineSet.Status.ReadyReplicas == replicas &&
		machineSet.Status.AvailableReplicas != replicas {
		minReadyResult := ctrl.Result{RequeueAfter: time.Duration(machineSet.Spec.MinReadySeconds) * time.Second}
		result = util.LowestNonZeroResult(result, minReadyResult)
		return result, nil
	}

	// Quickly reconcile until the nodes become Ready.
	if machineSet.Status.ReadyReplicas != replicas {
		result = util.LowestNonZeroResult(result, ctrl.Result{RequeueAfter: 15 * time.Second})
		return result, nil
	}

	return result, nil
}

// syncMachines updates Machines, InfrastructureMachine and BootstrapConfig to propagate in-place mutable fields
// from the MachineSet.
// Note: It also cleans up managed fields of all Machines so that Machines that were
// created/patched before (< v1.4.0) the controller adopted Server-Side-Apply (SSA) can also work with SSA.
// Note: For InfrastructureMachines and BootstrapConfigs it also drops ownership of "metadata.labels" and
// "metadata.annotations" from "manager" so that "capi-machineset" can own these fields and can work with SSA.
// Otherwise fields would be co-owned by our "old" "manager" and "capi-machineset" and then we would not be
// able to e.g. drop labels and annotations.
func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.MachineSet, machines []*clusterv1.Machine) error {
	log := ctrl.LoggerFrom(ctx)
	for i := range machines {
		m := machines[i]
		// If the machine is already being deleted, we don't need to update it.
		if !m.DeletionTimestamp.IsZero() {
			continue
		}

		// Cleanup managed fields of all Machines.
		// We do this so that Machines that were created/patched before the controller adopted Server-Side-Apply (SSA)
		// (< v1.4.0) can also work with SSA. Otherwise, fields would be co-owned by our "old" "manager" and
		// "capi-machineset" and then we would not be able to e.g. drop labels and annotations.
		if err := ssa.CleanUpManagedFieldsForSSAAdoption(ctx, r.Client, m, machineSetManagerName); err != nil {
			return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the Machine %q", m.Name)
		}

		// Update Machine to propagate in-place mutable fields from the MachineSet.
		updatedMachine := r.computeDesiredMachine(machineSet, m)
		err := ssa.Patch(ctx, r.Client, machineSetManagerName, updatedMachine, ssa.WithCachingProxy{Cache: r.ssaCache, Original: m})
		if err != nil {
			log.Error(err, "failed to update Machine", "Machine", klog.KObj(updatedMachine))
			return errors.Wrapf(err, "failed to update Machine %q", klog.KObj(updatedMachine))
		}
		machines[i] = updatedMachine

		infraMachine, err := external.Get(ctx, r.UnstructuredCachingClient, &updatedMachine.Spec.InfrastructureRef, updatedMachine.Namespace)
		if err != nil {
			return errors.Wrapf(err, "failed to get InfrastructureMachine %s",
				klog.KRef(updatedMachine.Spec.InfrastructureRef.Namespace, updatedMachine.Spec.InfrastructureRef.Name))
		}
		// Cleanup managed fields of all InfrastructureMachines to drop ownership of labels and annotations
		// from "manager". We do this so that InfrastructureMachines that are created using the Create method
		// can also work with SSA. Otherwise, labels and annotations would be co-owned by our "old" "manager"
		// and "capi-machineset" and then we would not be able to e.g. drop labels and annotations.
		labelsAndAnnotationsManagedFieldPaths := []contract.Path{
			{"f:metadata", "f:annotations"},
			{"f:metadata", "f:labels"},
		}
		if err := ssa.DropManagedFields(ctx, r.Client, infraMachine, machineSetManagerName, labelsAndAnnotationsManagedFieldPaths); err != nil {
			return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the InfrastructureMachine %s", klog.KObj(infraMachine))
		}
		// Update in-place mutating fields on InfrastructureMachine.
		if err := r.updateExternalObject(ctx, infraMachine, machineSet); err != nil {
			return errors.Wrapf(err, "failed to update InfrastructureMachine %s", klog.KObj(infraMachine))
		}

		if updatedMachine.Spec.Bootstrap.ConfigRef != nil {
			bootstrapConfig, err := external.Get(ctx, r.UnstructuredCachingClient, updatedMachine.Spec.Bootstrap.ConfigRef, updatedMachine.Namespace)
			if err != nil {
				return errors.Wrapf(err, "failed to get BootstrapConfig %s",
					klog.KRef(updatedMachine.Spec.Bootstrap.ConfigRef.Namespace, updatedMachine.Spec.Bootstrap.ConfigRef.Name))
			}
			// Cleanup managed fields of all BootstrapConfigs to drop ownership of labels and annotations
			// from "manager". We do this so that BootstrapConfigs that are created using the Create method
			// can also work with SSA. Otherwise, labels and annotations would be co-owned by our "old" "manager"
			// and "capi-machineset" and then we would not be able to e.g. drop labels and annotations.
			if err := ssa.DropManagedFields(ctx, r.Client, bootstrapConfig, machineSetManagerName, labelsAndAnnotationsManagedFieldPaths); err != nil {
				return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the BootstrapConfig %s", klog.KObj(bootstrapConfig))
			}
			// Update in-place mutating fields on BootstrapConfig.
			if err := r.updateExternalObject(ctx, bootstrapConfig, machineSet); err != nil {
				return errors.Wrapf(err, "failed to update BootstrapConfig %s", klog.KObj(bootstrapConfig))
			}
		}
	}
	return nil
}
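
// Illustrative sketch (not part of the upstream file): the in-place mutable fields
// propagated by syncMachines above are the ones recomputed by computeDesiredMachine,
// roughly:
//
//	updated := r.computeDesiredMachine(machineSet, existing)
//	// updated keeps existing.Name, existing.UID and the infra/bootstrap refs, but takes
//	// labels, annotations, NodeDrainTimeout, NodeDeletionTimeout and
//	// NodeVolumeDetachTimeout from machineSet.Spec.Template, and is then applied with
//	// ssa.Patch using the "capi-machineset" field manager.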
// syncReplicas scales Machine resources up or down.
func (r *Reconciler) syncReplicas(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, machines []*clusterv1.Machine) (ctrl.Result, error) {
	log := ctrl.LoggerFrom(ctx)
	if ms.Spec.Replicas == nil {
		return ctrl.Result{}, errors.Errorf("the Replicas field in Spec for machineset %v is nil, this should not be allowed", ms.Name)
	}
	diff := len(machines) - int(*(ms.Spec.Replicas))
	switch {
	case diff < 0:
		diff *= -1
		log.Info(fmt.Sprintf("MachineSet is scaling up to %d replicas by creating %d machines", *(ms.Spec.Replicas), diff), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines))
		if ms.Annotations != nil {
			if _, ok := ms.Annotations[clusterv1.DisableMachineCreateAnnotation]; ok {
				log.Info("Automatic creation of new machines disabled for machine set")
				return ctrl.Result{}, nil
			}
		}

		result, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up")
		if err != nil || !result.IsZero() {
			if err != nil {
				// If the error is not nil use that as the message for the condition.
				preflightCheckErrMessage = err.Error()
			}
			conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, preflightCheckErrMessage)
			return result, err
		}

		var (
			machineList []*clusterv1.Machine
			errs        []error
		)

		for i := 0; i < diff; i++ {
			// Create a new logger so the global logger is not modified.
			log := log
			machine := r.computeDesiredMachine(ms, nil)
			// Clone and set the infrastructure and bootstrap references.
			var (
				infraRef, bootstrapRef *corev1.ObjectReference
				err                    error
			)

			// Create the BootstrapConfig if necessary.
			if ms.Spec.Template.Spec.Bootstrap.ConfigRef != nil {
				bootstrapRef, err = external.CreateFromTemplate(ctx, &external.CreateFromTemplateInput{
					Client:      r.UnstructuredCachingClient,
					TemplateRef: ms.Spec.Template.Spec.Bootstrap.ConfigRef,
					Namespace:   machine.Namespace,
					ClusterName: machine.Spec.ClusterName,
					Labels:      machine.Labels,
					Annotations: machine.Annotations,
					OwnerRef: &metav1.OwnerReference{
						APIVersion: clusterv1.GroupVersion.String(),
						Kind:       "MachineSet",
						Name:       ms.Name,
						UID:        ms.UID,
					},
				})
				if err != nil {
					conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.BootstrapTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error())
					return ctrl.Result{}, errors.Wrapf(err, "failed to clone bootstrap configuration from %s %s while creating a machine",
						ms.Spec.Template.Spec.Bootstrap.ConfigRef.Kind,
						klog.KRef(ms.Spec.Template.Spec.Bootstrap.ConfigRef.Namespace, ms.Spec.Template.Spec.Bootstrap.ConfigRef.Name))
				}
				machine.Spec.Bootstrap.ConfigRef = bootstrapRef
				log = log.WithValues(bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name))
			}

			// Create the InfraMachine.
			infraRef, err = external.CreateFromTemplate(ctx, &external.CreateFromTemplateInput{
				Client:      r.UnstructuredCachingClient,
				TemplateRef: &ms.Spec.Template.Spec.InfrastructureRef,
				Namespace:   machine.Namespace,
				ClusterName: machine.Spec.ClusterName,
				Labels:      machine.Labels,
				Annotations: machine.Annotations,
				OwnerRef: &metav1.OwnerReference{
					APIVersion: clusterv1.GroupVersion.String(),
					Kind:       "MachineSet",
					Name:       ms.Name,
					UID:        ms.UID,
				},
			})
			if err != nil {
				conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.InfrastructureTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error())
				return ctrl.Result{}, errors.Wrapf(err, "failed to clone infrastructure machine from %s %s while creating a machine",
					ms.Spec.Template.Spec.InfrastructureRef.Kind,
					klog.KRef(ms.Spec.Template.Spec.InfrastructureRef.Namespace, ms.Spec.Template.Spec.InfrastructureRef.Name))
			}
			log = log.WithValues(infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name))
			machine.Spec.InfrastructureRef = *infraRef

			// Create the Machine.
			if err := ssa.Patch(ctx, r.Client, machineSetManagerName, machine); err != nil {
				log.Error(err, "Error while creating a machine")
				r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedCreate", "Failed to create machine: %v", err)
				errs = append(errs, err)
				conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.MachineCreationFailedReason,
					clusterv1.ConditionSeverityError, err.Error())

				// Try to cleanup the external objects if the Machine creation failed.
				if err := r.Client.Delete(ctx, util.ObjectReferenceToUnstructured(*infraRef)); !apierrors.IsNotFound(err) {
					log.Error(err, "Failed to cleanup infrastructure machine object after Machine creation error", infraRef.Kind, klog.KRef(infraRef.Namespace, infraRef.Name))
				}
				if bootstrapRef != nil {
					if err := r.Client.Delete(ctx, util.ObjectReferenceToUnstructured(*bootstrapRef)); !apierrors.IsNotFound(err) {
						log.Error(err, "Failed to cleanup bootstrap configuration object after Machine creation error", bootstrapRef.Kind, klog.KRef(bootstrapRef.Namespace, bootstrapRef.Name))
					}
				}
				continue
			}

			log.Info(fmt.Sprintf("Created machine %d of %d", i+1, diff), "Machine", klog.KObj(machine))
			r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulCreate", "Created machine %q", machine.Name)
			machineList = append(machineList, machine)
		}

		if len(errs) > 0 {
			return ctrl.Result{}, kerrors.NewAggregate(errs)
		}
		return ctrl.Result{}, r.waitForMachineCreation(ctx, machineList)
	case diff > 0:
		log.Info(fmt.Sprintf("MachineSet is scaling down to %d replicas by deleting %d machines", *(ms.Spec.Replicas), diff), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines), "deletePolicy", ms.Spec.DeletePolicy)

		deletePriorityFunc, err := getDeletePriorityFunc(ms)
		if err != nil {
			return ctrl.Result{}, err
		}

		var errs []error
		machinesToDelete := getMachinesToDeletePrioritized(machines, diff, deletePriorityFunc)
		for i, machine := range machinesToDelete {
			log := log.WithValues("Machine", klog.KObj(machine))
			if machine.GetDeletionTimestamp().IsZero() {
				log.Info(fmt.Sprintf("Deleting machine %d of %d", i+1, diff))
				if err := r.Client.Delete(ctx, machine); err != nil {
					log.Error(err, "Unable to delete Machine")
					r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedDelete", "Failed to delete machine %q: %v", machine.Name, err)
					errs = append(errs, err)
					continue
				}
				r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted machine %q", machine.Name)
			} else {
				log.Info(fmt.Sprintf("Waiting for machine %d of %d to be deleted", i+1, diff))
			}
		}

		if len(errs) > 0 {
			return ctrl.Result{}, kerrors.NewAggregate(errs)
		}
		return ctrl.Result{}, r.waitForMachineDeletion(ctx, machinesToDelete)
	}

	return ctrl.Result{}, nil
}

// computeDesiredMachine computes the desired Machine.
// This Machine will be used during reconciliation to:
// * create a Machine
// * update an existing Machine
// Because we are using Server-Side-Apply we always have to calculate the full object.
// There are small differences in how we calculate the Machine depending on whether it
// is a create or an update. Example: for a new Machine we have to calculate a new name,
// while for an existing Machine we have to use the name of the existing Machine.
func (r *Reconciler) computeDesiredMachine(machineSet *clusterv1.MachineSet, existingMachine *clusterv1.Machine) *clusterv1.Machine {
	desiredMachine := &clusterv1.Machine{
		TypeMeta: metav1.TypeMeta{
			APIVersion: clusterv1.GroupVersion.String(),
			Kind:       "Machine",
		},
		ObjectMeta: metav1.ObjectMeta{
			Name:      names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%s-", machineSet.Name)),
			Namespace: machineSet.Namespace,
			// Note: By setting the ownerRef on creation we signal to the Machine controller that this is not a stand-alone Machine.
			OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(machineSet, machineSetKind)},
			Labels:          map[string]string{},
			Annotations:     map[string]string{},
			Finalizers:      []string{clusterv1.MachineFinalizer},
		},
		Spec: *machineSet.Spec.Template.Spec.DeepCopy(),
	}
	// Set ClusterName.
	desiredMachine.Spec.ClusterName = machineSet.Spec.ClusterName

	// Clean up the refs to the incorrect objects.
	// The InfrastructureRef and the Bootstrap.ConfigRef in Machine should point to the InfrastructureMachine
	// and the BootstrapConfig objects. In the MachineSet these values point to the InfrastructureMachineTemplate
	// and the BootstrapConfigTemplate. Drop the values that were copied over from the MachineSet during DeepCopy
	// to make sure we do not point to incorrect refs.
	// Note: During Machine creation, these refs will be updated with the correct values after the corresponding
	// objects are created.
	desiredMachine.Spec.InfrastructureRef = corev1.ObjectReference{}
	desiredMachine.Spec.Bootstrap.ConfigRef = nil

	// If we are updating an existing Machine, reuse the name, uid, infrastructureRef and bootstrap.configRef
	// from the existingMachine.
	// Note: we use UID to force SSA to update the existing Machine and to not accidentally create a new Machine.
	// infrastructureRef and bootstrap.configRef remain the same for an existing Machine.
	if existingMachine != nil {
		desiredMachine.SetName(existingMachine.Name)
		desiredMachine.SetUID(existingMachine.UID)
		desiredMachine.Spec.Bootstrap.ConfigRef = existingMachine.Spec.Bootstrap.ConfigRef
		desiredMachine.Spec.InfrastructureRef = existingMachine.Spec.InfrastructureRef
	}

	// Set the in-place mutable fields.
	// When we create a new Machine we will just create the Machine with those fields.
	// When we update an existing Machine we will update the fields on the existing Machine (in-place mutate).

	// Set Labels.
	desiredMachine.Labels = machineLabelsFromMachineSet(machineSet)

	// Set Annotations.
	desiredMachine.Annotations = machineAnnotationsFromMachineSet(machineSet)

	// Set all other in-place mutable fields.
	desiredMachine.Spec.NodeDrainTimeout = machineSet.Spec.Template.Spec.NodeDrainTimeout
	desiredMachine.Spec.NodeDeletionTimeout = machineSet.Spec.Template.Spec.NodeDeletionTimeout
	desiredMachine.Spec.NodeVolumeDetachTimeout = machineSet.Spec.Template.Spec.NodeVolumeDetachTimeout

	return desiredMachine
}

// updateExternalObject updates the external object passed in with the
// updated labels and annotations from the MachineSet.
func (r *Reconciler) updateExternalObject(ctx context.Context, obj client.Object, machineSet *clusterv1.MachineSet) error {
	updatedObject := &unstructured.Unstructured{}
	updatedObject.SetGroupVersionKind(obj.GetObjectKind().GroupVersionKind())
	updatedObject.SetNamespace(obj.GetNamespace())
	updatedObject.SetName(obj.GetName())
	// Set the UID to ensure that Server-Side-Apply only performs an update
	// and does not perform an accidental create.
	updatedObject.SetUID(obj.GetUID())

	updatedObject.SetLabels(machineLabelsFromMachineSet(machineSet))
	updatedObject.SetAnnotations(machineAnnotationsFromMachineSet(machineSet))

	if err := ssa.Patch(ctx, r.Client, machineSetManagerName, updatedObject, ssa.WithCachingProxy{Cache: r.ssaCache, Original: obj}); err != nil {
		return errors.Wrapf(err, "failed to update %s", klog.KObj(obj))
	}
	return nil
}

// machineLabelsFromMachineSet computes the labels the Machine created from this MachineSet should have.
func machineLabelsFromMachineSet(machineSet *clusterv1.MachineSet) map[string]string {
	machineLabels := map[string]string{}
	// Note: We can't just set `machineSet.Spec.Template.Labels` directly and thus "share" the labels
	// map between Machine and machineSet.Spec.Template.Labels. This would mean that adding the
	// MachineSetNameLabel and MachineDeploymentNameLabel later on the Machine would also add the labels
	// to machineSet.Spec.Template.Labels and thus modify the labels of the MachineSet.
	for k, v := range machineSet.Spec.Template.Labels {
		machineLabels[k] = v
	}
	// Always set the MachineSetNameLabel.
	// Note: If a client tries to create a MachineSet without a selector, the MachineSet webhook
	// will add this label automatically. But we want this label to always be present even if the MachineSet
	// has a selector which doesn't include it. Therefore, we have to set it here explicitly.
	machineLabels[clusterv1.MachineSetNameLabel] = format.MustFormatValue(machineSet.Name)
	// Propagate the MachineDeploymentNameLabel from MachineSet to Machine if it exists.
	if mdName, ok := machineSet.Labels[clusterv1.MachineDeploymentNameLabel]; ok {
		machineLabels[clusterv1.MachineDeploymentNameLabel] = mdName
	}
	return machineLabels
}
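
// Illustrative sketch (assumption, not part of the upstream file): for a MachineSet
// named "md-1-abc12" that carries the MachineDeploymentNameLabel value "md-1", the
// computed Machine labels are the template labels plus roughly:
//
//	labels := machineLabelsFromMachineSet(ms)
//	// labels[clusterv1.MachineSetNameLabel]        -> "md-1-abc12"
//	// labels[clusterv1.MachineDeploymentNameLabel] -> "md-1" (only if set on the MachineSet)
//	// Names longer than a valid label value are hashed by format.MustFormatValue.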
// machineAnnotationsFromMachineSet computes the annotations the Machine created from this MachineSet should have.
func machineAnnotationsFromMachineSet(machineSet *clusterv1.MachineSet) map[string]string {
	annotations := map[string]string{}
	for k, v := range machineSet.Spec.Template.Annotations {
		annotations[k] = v
	}
	return annotations
}

// shouldExcludeMachine returns true if the machine should be filtered out, false otherwise.
func shouldExcludeMachine(machineSet *clusterv1.MachineSet, machine *clusterv1.Machine) bool {
	if metav1.GetControllerOf(machine) != nil && !metav1.IsControlledBy(machine, machineSet) {
		return true
	}

	return false
}

// adoptOrphan sets the MachineSet as a controller OwnerReference to the Machine.
func (r *Reconciler) adoptOrphan(ctx context.Context, machineSet *clusterv1.MachineSet, machine *clusterv1.Machine) error {
	patch := client.MergeFrom(machine.DeepCopy())
	newRef := *metav1.NewControllerRef(machineSet, machineSetKind)
	machine.SetOwnerReferences(util.EnsureOwnerRef(machine.GetOwnerReferences(), newRef))
	return r.Client.Patch(ctx, machine, patch)
}

func (r *Reconciler) waitForMachineCreation(ctx context.Context, machineList []*clusterv1.Machine) error {
	log := ctrl.LoggerFrom(ctx)

	for i := 0; i < len(machineList); i++ {
		machine := machineList[i]
		pollErr := wait.PollUntilContextTimeout(ctx, stateConfirmationInterval, stateConfirmationTimeout, true, func(ctx context.Context) (bool, error) {
			key := client.ObjectKey{Namespace: machine.Namespace, Name: machine.Name}
			if err := r.Client.Get(ctx, key, &clusterv1.Machine{}); err != nil {
				if apierrors.IsNotFound(err) {
					return false, nil
				}
				return false, err
			}

			return true, nil
		})

		if pollErr != nil {
			log.Error(pollErr, "Failed waiting for machine object to be created")
			return errors.Wrap(pollErr, "failed waiting for machine object to be created")
		}
	}

	return nil
}

func (r *Reconciler) waitForMachineDeletion(ctx context.Context, machineList []*clusterv1.Machine) error {
	log := ctrl.LoggerFrom(ctx)

	for i := 0; i < len(machineList); i++ {
		machine := machineList[i]
		pollErr := wait.PollUntilContextTimeout(ctx, stateConfirmationInterval, stateConfirmationTimeout, true, func(ctx context.Context) (bool, error) {
			m := &clusterv1.Machine{}
			key := client.ObjectKey{Namespace: machine.Namespace, Name: machine.Name}
			err := r.Client.Get(ctx, key, m)
			if apierrors.IsNotFound(err) || !m.DeletionTimestamp.IsZero() {
				return true, nil
			}
			return false, err
		})

		if pollErr != nil {
			log.Error(pollErr, "Failed waiting for machine object to be deleted")
			return errors.Wrap(pollErr, "failed waiting for machine object to be deleted")
		}
	}
	return nil
}

// MachineToMachineSets is a handler.ToRequestsFunc to be used to enqueue requests for reconciliation
// for MachineSets that might adopt an orphaned Machine.
func (r *Reconciler) MachineToMachineSets(ctx context.Context, o client.Object) []ctrl.Request {
	result := []ctrl.Request{}

	m, ok := o.(*clusterv1.Machine)
	if !ok {
		panic(fmt.Sprintf("Expected a Machine but got a %T", o))
	}

	log := ctrl.LoggerFrom(ctx, "Machine", klog.KObj(m))

	// Check if the controller reference is already set and
	// return an empty result when one is found.
	for _, ref := range m.ObjectMeta.GetOwnerReferences() {
		if ref.Controller != nil && *ref.Controller {
			return result
		}
	}

	mss, err := r.getMachineSetsForMachine(ctx, m)
	if err != nil {
		log.Error(err, "Failed getting MachineSets for Machine")
		return nil
	}
	if len(mss) == 0 {
		return nil
	}

	for _, ms := range mss {
		name := client.ObjectKey{Namespace: ms.Namespace, Name: ms.Name}
		result = append(result, ctrl.Request{NamespacedName: name})
	}

	return result
}

func (r *Reconciler) getMachineSetsForMachine(ctx context.Context, m *clusterv1.Machine) ([]*clusterv1.MachineSet, error) {
	if len(m.Labels) == 0 {
		return nil, fmt.Errorf("machine %v has no labels, this is unexpected", client.ObjectKeyFromObject(m))
	}

	msList := &clusterv1.MachineSetList{}
	if err := r.Client.List(ctx, msList, client.InNamespace(m.Namespace)); err != nil {
		return nil, errors.Wrapf(err, "failed to list MachineSets")
	}

	var mss []*clusterv1.MachineSet
	for idx := range msList.Items {
		ms := &msList.Items[idx]
		if machine.HasMatchingLabels(ms.Spec.Selector, m.Labels) {
			mss = append(mss, ms)
		}
	}

	return mss, nil
}

// shouldAdopt returns true if the MachineSet should be adopted as a stand-alone MachineSet directly owned by the Cluster.
func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool {
	// If the MachineSet is controlled by a MachineDeployment, or if it is a stand-alone MachineSet directly owned by the Cluster, then no-op.
	if util.HasOwner(ms.GetOwnerReferences(), clusterv1.GroupVersion.String(), []string{"MachineDeployment", "Cluster"}) {
		return false
	}

	// If the MachineSet originated from a MachineDeployment object, it should not be adopted directly by the Cluster as a stand-alone MachineSet.
	// Note: this is required because after restore from a backup both the MachineSet controller and the
	// MachineDeployment controller are racing to adopt MachineSets, see https://github.com/kubernetes-sigs/cluster-api/issues/7529
	if _, ok := ms.Labels[clusterv1.MachineDeploymentNameLabel]; ok {
		return false
	}
	return true
}

// updateStatus updates the Status field for the MachineSet.
// It checks for the current state of the replicas and updates the Status of the MachineSet.
func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error {
	log := ctrl.LoggerFrom(ctx)
	newStatus := ms.Status.DeepCopy()

	// Copy label selector to its status counterpart in string format.
	// This is necessary for CRDs including scale subresources.
	selector, err := metav1.LabelSelectorAsSelector(&ms.Spec.Selector)
	if err != nil {
		return errors.Wrapf(err, "failed to update status for MachineSet %s/%s", ms.Namespace, ms.Name)
	}
	newStatus.Selector = selector.String()

	// Count the number of machines that have labels matching the labels of the machine
	// template of the MachineSet; the matching machines may have more labels than are in
	// the template. Because the labels of the machine template are a superset of the
	// selector of the MachineSet, the matching machines must be part of filteredMachines.
	fullyLabeledReplicasCount := 0
	readyReplicasCount := 0
	availableReplicasCount := 0
	desiredReplicas := *ms.Spec.Replicas
	templateLabel := labels.Set(ms.Spec.Template.Labels).AsSelectorPreValidated()

	for _, machine := range filteredMachines {
		log := log.WithValues("Machine", klog.KObj(machine))

		if templateLabel.Matches(labels.Set(machine.Labels)) {
			fullyLabeledReplicasCount++
		}

		if machine.Status.NodeRef == nil {
			log.V(4).Info("Waiting for the machine controller to set status.NodeRef on the Machine")
			continue
		}

		node, err := r.getMachineNode(ctx, cluster, machine)
		if err != nil && machine.GetDeletionTimestamp().IsZero() {
			log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
			continue
		}

		if noderefutil.IsNodeReady(node) {
			readyReplicasCount++
			if noderefutil.IsNodeAvailable(node, ms.Spec.MinReadySeconds, metav1.Now()) {
				availableReplicasCount++
			}
		} else if machine.GetDeletionTimestamp().IsZero() {
			log.V(4).Info("Waiting for the Kubernetes node on the machine to report ready state")
		}
	}

	newStatus.Replicas = int32(len(filteredMachines))
	newStatus.FullyLabeledReplicas = int32(fullyLabeledReplicasCount)
	newStatus.ReadyReplicas = int32(readyReplicasCount)
	newStatus.AvailableReplicas = int32(availableReplicasCount)

	// Copy the newly calculated status into the MachineSet.
	if ms.Status.Replicas != newStatus.Replicas ||
		ms.Status.FullyLabeledReplicas != newStatus.FullyLabeledReplicas ||
		ms.Status.ReadyReplicas != newStatus.ReadyReplicas ||
		ms.Status.AvailableReplicas != newStatus.AvailableReplicas ||
		ms.Generation != ms.Status.ObservedGeneration {
		log.V(4).Info("Updating status: " +
			fmt.Sprintf("replicas %d->%d (need %d), ", ms.Status.Replicas, newStatus.Replicas, desiredReplicas) +
			fmt.Sprintf("fullyLabeledReplicas %d->%d, ", ms.Status.FullyLabeledReplicas, newStatus.FullyLabeledReplicas) +
			fmt.Sprintf("readyReplicas %d->%d, ", ms.Status.ReadyReplicas, newStatus.ReadyReplicas) +
			fmt.Sprintf("availableReplicas %d->%d, ", ms.Status.AvailableReplicas, newStatus.AvailableReplicas) +
			fmt.Sprintf("observedGeneration %v->%v", ms.Status.ObservedGeneration, ms.Generation))

		// Save the generation number we acted on, otherwise we might wrongfully indicate
		// that we've seen a spec update when we retry.
		newStatus.ObservedGeneration = ms.Generation
		newStatus.DeepCopyInto(&ms.Status)
	}
	switch {
	// We are scaling up
	case newStatus.Replicas < desiredReplicas:
		conditions.MarkFalse(ms, clusterv1.ResizedCondition, clusterv1.ScalingUpReason, clusterv1.ConditionSeverityWarning, "Scaling up MachineSet to %d replicas (actual %d)", desiredReplicas, newStatus.Replicas)
	// We are scaling down
	case newStatus.Replicas > desiredReplicas:
		conditions.MarkFalse(ms, clusterv1.ResizedCondition, clusterv1.ScalingDownReason, clusterv1.ConditionSeverityWarning, "Scaling down MachineSet to %d replicas (actual %d)", desiredReplicas, newStatus.Replicas)
		// This means that there was no error in generating the desired number of machine objects
		conditions.MarkTrue(ms, clusterv1.MachinesCreatedCondition)
	default:
		// Make sure last resize operation is marked as completed.
		// NOTE: we are checking the number of machines ready so we report resize completed only when the machines
		// are actually provisioned (vs reporting completed immediately after the last machine object is created). This convention is also used by KCP.
		if newStatus.ReadyReplicas == newStatus.Replicas {
			if conditions.IsFalse(ms, clusterv1.ResizedCondition) {
				log.Info("All the replicas are ready", "replicas", newStatus.ReadyReplicas)
			}
			conditions.MarkTrue(ms, clusterv1.ResizedCondition)
		}
		// This means that there was no error in generating the desired number of machine objects
		conditions.MarkTrue(ms, clusterv1.MachinesCreatedCondition)
	}

	// Aggregate the operational state of all the machines; while aggregating we are adding the
	// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
	conditions.SetAggregate(ms, clusterv1.MachinesReadyCondition, collections.FromMachines(filteredMachines...).ConditionGetters(), conditions.AddSourceRef(), conditions.WithStepCounterIf(false))

	return nil
}

func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (*corev1.Node, error) {
	remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
	if err != nil {
		return nil, err
	}
	node := &corev1.Node{}
	if err := remoteClient.Get(ctx, client.ObjectKey{Name: machine.Status.NodeRef.Name}, node); err != nil {
		return nil, errors.Wrapf(err, "error retrieving node %s for machine %s/%s", machine.Status.NodeRef.Name, machine.Namespace, machine.Name)
	}
	return node, nil
}

func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) (ctrl.Result, error) {
	log := ctrl.LoggerFrom(ctx)
	// List all unhealthy machines.
	machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines))
	for _, m := range filteredMachines {
		// filteredMachines contains machines in deleting status to calculate correct status.
		// Skip remediation for those in deleting status.
		if !m.DeletionTimestamp.IsZero() {
			continue
		}
		if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) {
			machinesToRemediate = append(machinesToRemediate, m)
		}
	}

	// If there are no machines to remediate return early.
	if len(machinesToRemediate) == 0 {
		return ctrl.Result{}, nil
	}

	preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine Remediation")
	if err != nil {
		// If err is not nil use that as the preflightCheckErrMessage
		preflightCheckErrMessage = err.Error()
	}

	preflightChecksFailed := err != nil || !preflightChecksResult.IsZero()
	if preflightChecksFailed {
		// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
		// WaitingForRemediationReason reason.
		var errs []error
		for _, m := range machinesToRemediate {
			patchHelper, err := patch.NewHelper(m, r.Client)
			if err != nil {
				errs = append(errs, errors.Wrapf(err, "failed to create patch helper for Machine %s", klog.KObj(m)))
				continue
			}
			conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, preflightCheckErrMessage)
			if err := patchHelper.Patch(ctx, m); err != nil {
				errs = append(errs, errors.Wrapf(err, "failed to patch Machine %s", klog.KObj(m)))
			}
		}

		if len(errs) > 0 {
			return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch unhealthy Machines")
		}
		return preflightChecksResult, nil
	}

	// PreflightChecks passed, so it is safe to remediate unhealthy machines.
	// Remediate unhealthy machines by deleting them.
	var errs []error
	for _, m := range machinesToRemediate {
		log.Info(fmt.Sprintf("Deleting Machine %s because it was marked as unhealthy by the MachineHealthCheck controller", klog.KObj(m)))
		patch := client.MergeFrom(m.DeepCopy())
		if err := r.Client.Delete(ctx, m); err != nil {
			errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m)))
			continue
		}
		conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition)
		if err := r.Client.Status().Patch(ctx, m, patch); err != nil && !apierrors.IsNotFound(err) {
			errs = append(errs, errors.Wrapf(err, "failed to update status of Machine %s", klog.KObj(m)))
		}
	}

	if len(errs) > 0 {
		return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines")
	}

	return ctrl.Result{}, nil
}

func reconcileExternalTemplateReference(ctx context.Context, c client.Client, cluster *clusterv1.Cluster, ref *corev1.ObjectReference) error {
	if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) {
		return nil
	}

	if err := utilconversion.UpdateReferenceAPIContract(ctx, c, ref); err != nil {
		return err
	}

	obj, err := external.Get(ctx, c, ref, cluster.Namespace)
	if err != nil {
		return err
	}

	patchHelper, err := patch.NewHelper(obj, c)
	if err != nil {
		return err
	}

	obj.SetOwnerReferences(util.EnsureOwnerRef(obj.GetOwnerReferences(), metav1.OwnerReference{
		APIVersion: clusterv1.GroupVersion.String(),
		Kind:       "Cluster",
		Name:       cluster.Name,
		UID:        cluster.UID,
	}))

	return patchHelper.Patch(ctx, obj)
}
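
// Illustrative sketch (not part of the upstream file): wiring the Reconciler into a
// controller-runtime manager. The variable names (mgr, unstructuredCachingClient,
// tracker, watchFilterValue, setupLog) and the options shown are assumptions, not the
// exact setup used by the cluster-api main package.
//
//	r := &Reconciler{
//		Client:                    mgr.GetClient(),
//		UnstructuredCachingClient: unstructuredCachingClient,
//		APIReader:                 mgr.GetAPIReader(),
//		Tracker:                   tracker, // *remote.ClusterCacheTracker
//		WatchFilterValue:          watchFilterValue,
//	}
//	if err := r.SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 10}); err != nil {
//		setupLog.Error(err, "unable to create controller", "controller", "MachineSet")
//		os.Exit(1)
//	}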