sigs.k8s.io/cluster-api@v1.7.1/internal/controllers/topology/cluster/reconcile_state.go (about)
1	/*
2	Copyright 2021 The Kubernetes Authors.
3
4	Licensed under the Apache License, Version 2.0 (the "License");
5	you may not use this file except in compliance with the License.
6	You may obtain a copy of the License at
7
8	    http://www.apache.org/licenses/LICENSE-2.0
9
10	Unless required by applicable law or agreed to in writing, software
11	distributed under the License is distributed on an "AS IS" BASIS,
12	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	See the License for the specific language governing permissions and
14	limitations under the License.
15	*/
16
17	package cluster
18
19	import (
20		"context"
21		"fmt"
22		"strings"
23		"time"
24
25		"github.com/pkg/errors"
26		corev1 "k8s.io/api/core/v1"
27		apierrors "k8s.io/apimachinery/pkg/api/errors"
28		"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
29		kerrors "k8s.io/apimachinery/pkg/util/errors"
30		"k8s.io/apimachinery/pkg/util/sets"
31		"k8s.io/apimachinery/pkg/util/validation/field"
32		"k8s.io/apimachinery/pkg/util/wait"
33		"k8s.io/apiserver/pkg/storage/names"
34		"k8s.io/klog/v2"
35		ctrl "sigs.k8s.io/controller-runtime"
36		"sigs.k8s.io/controller-runtime/pkg/client"
37
38		clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
39		expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
40		runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1"
41		"sigs.k8s.io/cluster-api/exp/topology/scope"
42		"sigs.k8s.io/cluster-api/feature"
43		"sigs.k8s.io/cluster-api/internal/contract"
44		"sigs.k8s.io/cluster-api/internal/controllers/topology/cluster/structuredmerge"
45		"sigs.k8s.io/cluster-api/internal/hooks"
46		tlog "sigs.k8s.io/cluster-api/internal/log"
47		"sigs.k8s.io/cluster-api/internal/topology/check"
48		"sigs.k8s.io/cluster-api/internal/topology/clustershim"
49		topologynames "sigs.k8s.io/cluster-api/internal/topology/names"
50		"sigs.k8s.io/cluster-api/internal/topology/ownerrefs"
51		"sigs.k8s.io/cluster-api/util"
52	)
53
54	const (
55		createEventReason = "TopologyCreate"
56		updateEventReason = "TopologyUpdate"
57		deleteEventReason = "TopologyDelete"
58	)
59
60	// reconcileState reconciles the current and desired state of the managed Cluster topology.
61	// NOTE: We are assuming all the required objects are provided as input; also, in case of any error,
62	// the entire reconcile operation will fail. This might be improved in the future if support for reconciling
63	// a subset of a topology is implemented.
64	func (r *Reconciler) reconcileState(ctx context.Context, s *scope.Scope) error {
65		log := tlog.LoggerFrom(ctx)
66		log.Infof("Reconciling state for topology owned objects")
67
68		// Reconcile the Cluster shim, a temporary object used as a means to collect
69		// objects/templates that can be orphaned in case of errors during the
70		// remaining part of the reconcile process.
71		if err := r.reconcileClusterShim(ctx, s); err != nil {
72			return err
73		}
74
75		if feature.Gates.Enabled(feature.RuntimeSDK) {
76			if err := r.callAfterHooks(ctx, s); err != nil {
77				return err
78			}
79		}
80
81		// Reconcile desired state of the InfrastructureCluster object.
82		createdInfraCluster, errInfraCluster := r.reconcileInfrastructureCluster(ctx, s)
83		if errInfraCluster != nil {
84			return errInfraCluster
85		}
86
87		// Reconcile desired state of the ControlPlane object.
88		createdControlPlane, errControlPlane := r.reconcileControlPlane(ctx, s)
89		if errControlPlane != nil {
90			// NOTE: report the control plane error immediately only if we did not just create the infrastructure cluster; otherwise attempt to reconcile the cluster before returning.
91			if !createdInfraCluster {
92				return errControlPlane
93			}
94
95			// In this case (reconcileInfrastructureCluster reported creation of the infrastructure cluster object, reconcileControlPlane - which is expected to create the control plane object - failed),
96			// if the creation of the control plane actually did not happen, blank out ControlPlaneRef from the desired cluster.
97			if s.Current.Cluster.Spec.ControlPlaneRef == nil && !createdControlPlane {
98				s.Desired.Cluster.Spec.ControlPlaneRef = nil
99			}
100		}
101
102		// Reconcile desired state of the Cluster object.
103		errCluster := r.reconcileCluster(ctx, s)
104		err := kerrors.NewAggregate([]error{errControlPlane, errCluster})
105		if err != nil {
106			return err
107		}
108
109		// Reconcile desired state of the MachineDeployment objects.
110		if err := r.reconcileMachineDeployments(ctx, s); err != nil {
111			return err
112		}
113
114		// Reconcile desired state of the MachinePool objects and return.
115		return r.reconcileMachinePools(ctx, s)
116	}
117
118	// Reconcile the Cluster shim, a temporary object used as a means to collect objects/templates
119	// that might be orphaned in case of errors during the remaining part of the reconcile process.
120	func (r *Reconciler) reconcileClusterShim(ctx context.Context, s *scope.Scope) error {
121		shim := clustershim.New(s.Current.Cluster)
122
123		// If we are going to create the InfrastructureCluster or the ControlPlane object, then
124		// add a temporary cluster-shim object and use it as an additional owner.
125		// This will ensure the objects will be garbage collected in case of errors in between
126		// creating InfrastructureCluster/ControlPlane objects and updating the Cluster with the
127		// references to the above objects.
128		if s.Current.InfrastructureCluster == nil || s.Current.ControlPlane.Object == nil {
129			// Given that the cluster shim is a temporary object which is only modified
130			// by this controller, it is not necessary to use the SSA patch helper.
131			if err := r.Client.Create(ctx, shim); err != nil {
132				if !apierrors.IsAlreadyExists(err) {
133					return errors.Wrap(err, "failed to create the cluster shim object")
134				}
135				if err := r.Client.Get(ctx, client.ObjectKeyFromObject(shim), shim); err != nil {
136					return errors.Wrapf(err, "failed to read the cluster shim object")
137				}
138			}
139
140			// Enforce type meta back given that it gets blanked out by Get.
141			shim.Kind = "Secret"
142			shim.APIVersion = corev1.SchemeGroupVersion.String()
143
144			// Add the shim as a temporary owner for the InfrastructureCluster.
145			s.Desired.InfrastructureCluster.SetOwnerReferences(
146				util.EnsureOwnerRef(s.Desired.InfrastructureCluster.GetOwnerReferences(),
147					*ownerrefs.OwnerReferenceTo(shim, corev1.SchemeGroupVersion.WithKind("Secret")),
148				),
149			)
150
151			// Add the shim as a temporary owner for the ControlPlane.
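		// Illustrative sketch (field values depend on the shim object): the owner reference built by
		// ownerrefs.OwnerReferenceTo above and below is roughly
		//
		//	metav1.OwnerReference{
		//		APIVersion: "v1",
		//		Kind:       "Secret",
		//		Name:       shim.GetName(),
		//		UID:        shim.GetUID(),
		//	}
		//
		// so that, until the Cluster itself becomes an owner, deleting the shim is enough for the
		// garbage collector to clean up an InfrastructureCluster/ControlPlane left behind by a failed reconcile.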
152			s.Desired.ControlPlane.Object.SetOwnerReferences(
153				util.EnsureOwnerRef(s.Desired.ControlPlane.Object.GetOwnerReferences(),
154					*ownerrefs.OwnerReferenceTo(shim, corev1.SchemeGroupVersion.WithKind("Secret")),
155				),
156			)
157		}
158
159		// If the InfrastructureCluster and the ControlPlane objects have already been created
160		// in a previous reconciliation, check if they have already been reconciled by the ClusterController
161		// by verifying the ownerReference for the Cluster is present.
162		//
163		// When the Cluster and the shim object are both owners,
164		// it's safe for us to remove the shim and garbage collect any potential orphaned resource.
165		if s.Current.InfrastructureCluster != nil && s.Current.ControlPlane.Object != nil {
166			clusterOwnsAll := ownerrefs.HasOwnerReferenceFrom(s.Current.InfrastructureCluster, s.Current.Cluster) &&
167				ownerrefs.HasOwnerReferenceFrom(s.Current.ControlPlane.Object, s.Current.Cluster)
168			shimOwnsAtLeastOne := ownerrefs.HasOwnerReferenceFrom(s.Current.InfrastructureCluster, shim) ||
169				ownerrefs.HasOwnerReferenceFrom(s.Current.ControlPlane.Object, shim)
170
171			if clusterOwnsAll && shimOwnsAtLeastOne {
172				if err := r.Client.Delete(ctx, shim); err != nil {
173					if !apierrors.IsNotFound(err) {
174						return errors.Wrapf(err, "failed to delete the cluster shim object")
175					}
176				}
177			}
178		}
179		return nil
180	}
181
182	func (r *Reconciler) callAfterHooks(ctx context.Context, s *scope.Scope) error {
183		if err := r.callAfterControlPlaneInitialized(ctx, s); err != nil {
184			return err
185		}
186
187		return r.callAfterClusterUpgrade(ctx, s)
188	}
189
190	func (r *Reconciler) callAfterControlPlaneInitialized(ctx context.Context, s *scope.Scope) error {
191		// If the cluster topology is being created then track the intent to call the AfterControlPlaneInitialized hook so that we can call it later.
192		if s.Current.Cluster.Spec.InfrastructureRef == nil && s.Current.Cluster.Spec.ControlPlaneRef == nil {
193			if err := hooks.MarkAsPending(ctx, r.Client, s.Current.Cluster, runtimehooksv1.AfterControlPlaneInitialized); err != nil {
194				return err
195			}
196		}
197
198		// Call the hook only if we are tracking the intent to do so. If it is not tracked it means we don't need to call the
199		// hook because we already called the hook after the control plane was initialized.
200		if hooks.IsPending(runtimehooksv1.AfterControlPlaneInitialized, s.Current.Cluster) {
201			if isControlPlaneInitialized(s.Current.Cluster) {
202				// The control plane is initialized for the first time. Call all the registered extensions for the hook.
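			// Hedged note on the bookkeeping above: hooks.MarkAsPending / hooks.IsPending /
			// hooks.MarkAsDone persist the "hook still needs to be called" intent on the Cluster
			// object itself (as an annotation), so the intent survives controller restarts and is
			// cleared only after the registered extensions below have actually been called.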
203				hookRequest := &runtimehooksv1.AfterControlPlaneInitializedRequest{
204					Cluster: *s.Current.Cluster,
205				}
206				hookResponse := &runtimehooksv1.AfterControlPlaneInitializedResponse{}
207				if err := r.RuntimeClient.CallAllExtensions(ctx, runtimehooksv1.AfterControlPlaneInitialized, s.Current.Cluster, hookRequest, hookResponse); err != nil {
208					return err
209				}
210				s.HookResponseTracker.Add(runtimehooksv1.AfterControlPlaneInitialized, hookResponse)
211				if err := hooks.MarkAsDone(ctx, r.Client, s.Current.Cluster, runtimehooksv1.AfterControlPlaneInitialized); err != nil {
212					return err
213				}
214			}
215		}
216
217		return nil
218	}
219
220	func isControlPlaneInitialized(cluster *clusterv1.Cluster) bool {
221		for _, condition := range cluster.GetConditions() {
222			if condition.Type == clusterv1.ControlPlaneInitializedCondition {
223				if condition.Status == corev1.ConditionTrue {
224					return true
225				}
226			}
227		}
228		return false
229	}
230
231	func (r *Reconciler) callAfterClusterUpgrade(ctx context.Context, s *scope.Scope) error {
232		// Call the hook only if we are tracking the intent to do so. If it is not tracked it means we don't need to call the
233		// hook because we didn't go through an upgrade or we already called the hook after the upgrade.
234		if hooks.IsPending(runtimehooksv1.AfterClusterUpgrade, s.Current.Cluster) {
235			// Call the registered extensions for the hook after the cluster is fully upgraded.
236			// A cluster is considered fully upgraded if:
237			// - Control plane is stable (not upgrading, not scaling, not about to upgrade)
238			// - MachineDeployments/MachinePools are not currently upgrading
239			// - MachineDeployments/MachinePools are not pending an upgrade
240			// - MachineDeployments/MachinePools are not pending create
241			if s.UpgradeTracker.ControlPlane.IsControlPlaneStable() && // Control Plane stable checks
242				len(s.UpgradeTracker.MachineDeployments.UpgradingNames()) == 0 && // MachineDeployments are neither upgrading nor about to upgrade
243				!s.UpgradeTracker.MachineDeployments.IsAnyPendingCreate() && // No MachineDeployments are pending create
244				!s.UpgradeTracker.MachineDeployments.IsAnyPendingUpgrade() && // No MachineDeployments are pending an upgrade
245				!s.UpgradeTracker.MachineDeployments.DeferredUpgrade() && // No MachineDeployments have deferred an upgrade
246				len(s.UpgradeTracker.MachinePools.UpgradingNames()) == 0 && // MachinePools are neither upgrading nor about to upgrade
247				!s.UpgradeTracker.MachinePools.IsAnyPendingCreate() && // No MachinePools are pending create
248				!s.UpgradeTracker.MachinePools.IsAnyPendingUpgrade() && // No MachinePools are pending an upgrade
249				!s.UpgradeTracker.MachinePools.DeferredUpgrade() { // No MachinePools have deferred an upgrade
250				// Everything is stable and the cluster can be considered fully upgraded.
251				hookRequest := &runtimehooksv1.AfterClusterUpgradeRequest{
252					Cluster: *s.Current.Cluster,
253					KubernetesVersion: s.Current.Cluster.Spec.Topology.Version,
254				}
255				hookResponse := &runtimehooksv1.AfterClusterUpgradeResponse{}
256				if err := r.RuntimeClient.CallAllExtensions(ctx, runtimehooksv1.AfterClusterUpgrade, s.Current.Cluster, hookRequest, hookResponse); err != nil {
257					return err
258				}
259				s.HookResponseTracker.Add(runtimehooksv1.AfterClusterUpgrade, hookResponse)
260				// The hook was successfully called; we can remove this hook from the list of pending hooks.
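			// Note on the flow above: had CallAllExtensions returned an error, we would have
			// returned before reaching MarkAsDone below, leaving AfterClusterUpgrade in the
			// pending list so the call is retried on a subsequent reconcile.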
261 if err := hooks.MarkAsDone(ctx, r.Client, s.Current.Cluster, runtimehooksv1.AfterClusterUpgrade); err != nil { 262 return err 263 } 264 } 265 } 266 267 return nil 268 } 269 270 // reconcileInfrastructureCluster reconciles the desired state of the InfrastructureCluster object. 271 func (r *Reconciler) reconcileInfrastructureCluster(ctx context.Context, s *scope.Scope) (bool, error) { 272 ctx, _ = tlog.LoggerFrom(ctx).WithObject(s.Desired.InfrastructureCluster).Into(ctx) 273 274 ignorePaths, err := contract.InfrastructureCluster().IgnorePaths(s.Desired.InfrastructureCluster) 275 if err != nil { 276 return false, errors.Wrap(err, "failed to calculate ignore paths") 277 } 278 279 return r.reconcileReferencedObject(ctx, reconcileReferencedObjectInput{ 280 cluster: s.Current.Cluster, 281 current: s.Current.InfrastructureCluster, 282 desired: s.Desired.InfrastructureCluster, 283 ignorePaths: ignorePaths, 284 }) 285 } 286 287 // reconcileControlPlane works to bring the current state of a managed topology in line with the desired state. This involves 288 // updating the cluster where needed. 289 func (r *Reconciler) reconcileControlPlane(ctx context.Context, s *scope.Scope) (bool, error) { 290 // If the ControlPlane has defined a current or desired MachineHealthCheck attempt to reconcile it. 291 // MHC changes are not Kubernetes version dependent, therefore proceed with MHC reconciliation 292 // even if the Control Plane is pending an upgrade. 293 if s.Desired.ControlPlane.MachineHealthCheck != nil || s.Current.ControlPlane.MachineHealthCheck != nil { 294 // Reconcile the current and desired state of the MachineHealthCheck. 295 if err := r.reconcileMachineHealthCheck(ctx, s.Current.ControlPlane.MachineHealthCheck, s.Desired.ControlPlane.MachineHealthCheck); err != nil { 296 return false, err 297 } 298 } 299 300 // Return early if the control plane is pending an upgrade. 301 // Do not reconcile the control plane yet to avoid updating the control plane while it is still pending a 302 // version upgrade. This will prevent the control plane from performing a double rollout. 303 if s.UpgradeTracker.ControlPlane.IsPendingUpgrade { 304 return false, nil 305 } 306 // If the clusterClass mandates the controlPlane has infrastructureMachines, reconcile it. 307 infrastructureMachineCleanupFunc := func() {} 308 if s.Blueprint.HasControlPlaneInfrastructureMachine() { 309 ctx, _ := tlog.LoggerFrom(ctx).WithObject(s.Desired.ControlPlane.InfrastructureMachineTemplate).Into(ctx) 310 311 cpInfraRef, err := contract.ControlPlane().MachineTemplate().InfrastructureRef().Get(s.Desired.ControlPlane.Object) 312 if err != nil { 313 return false, errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: s.Desired.ControlPlane.InfrastructureMachineTemplate}) 314 } 315 316 // Create or update the MachineInfrastructureTemplate of the control plane. 
317 createdInfrastructureTemplate, err := r.reconcileReferencedTemplate(ctx, reconcileReferencedTemplateInput{ 318 cluster: s.Current.Cluster, 319 ref: cpInfraRef, 320 current: s.Current.ControlPlane.InfrastructureMachineTemplate, 321 desired: s.Desired.ControlPlane.InfrastructureMachineTemplate, 322 compatibilityChecker: check.ObjectsAreCompatible, 323 templateNamePrefix: topologynames.ControlPlaneInfrastructureMachineTemplateNamePrefix(s.Current.Cluster.Name), 324 }) 325 if err != nil { 326 return false, err 327 } 328 329 if createdInfrastructureTemplate { 330 infrastructureMachineCleanupFunc = func() { 331 // Best effort cleanup of the InfrastructureMachineTemplate; 332 // If this fails, the object will be garbage collected when the cluster is deleted. 333 if err := r.Client.Delete(ctx, s.Desired.ControlPlane.InfrastructureMachineTemplate); err != nil { 334 log := tlog.LoggerFrom(ctx). 335 WithValues(s.Desired.ControlPlane.InfrastructureMachineTemplate.GetObjectKind().GroupVersionKind().Kind, s.Desired.ControlPlane.InfrastructureMachineTemplate.GetName()). 336 WithValues("err", err.Error()) 337 log.Infof("WARNING! Failed to cleanup InfrastructureMachineTemplate for control plane while handling creation or update error. The object will be garbage collected when the cluster is deleted.") 338 } 339 } 340 } 341 342 // The controlPlaneObject.Spec.machineTemplate.infrastructureRef has to be updated in the desired object 343 err = contract.ControlPlane().MachineTemplate().InfrastructureRef().Set(s.Desired.ControlPlane.Object, refToUnstructured(cpInfraRef)) 344 if err != nil { 345 // Best effort cleanup of the InfrastructureMachineTemplate (only on creation). 346 infrastructureMachineCleanupFunc() 347 return false, errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: s.Desired.ControlPlane.Object}) 348 } 349 } 350 351 // Create or update the ControlPlaneObject for the ControlPlaneState. 352 ctx, _ = tlog.LoggerFrom(ctx).WithObject(s.Desired.ControlPlane.Object).Into(ctx) 353 created, err := r.reconcileReferencedObject(ctx, reconcileReferencedObjectInput{ 354 cluster: s.Current.Cluster, 355 current: s.Current.ControlPlane.Object, 356 desired: s.Desired.ControlPlane.Object, 357 versionGetter: contract.ControlPlane().Version().Get, 358 }) 359 if err != nil { 360 // Best effort cleanup of the InfrastructureMachineTemplate (only on creation). 361 infrastructureMachineCleanupFunc() 362 return created, err 363 } 364 365 // If the controlPlane has infrastructureMachines and the InfrastructureMachineTemplate has changed on this reconcile 366 // delete the old template. 367 // This is a best effort deletion only and may leak templates if an error occurs during reconciliation. 
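	// Sketch of the rotation that ends here (assuming a template change was detected earlier in
	// this func): a new InfrastructureMachineTemplate was created under a generated name derived
	// from topologynames.ControlPlaneInfrastructureMachineTemplateNamePrefix, the ControlPlane
	// object was patched to reference it, and the now-unreferenced old template is deleted below,
	// best effort.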
368		if s.Blueprint.HasControlPlaneInfrastructureMachine() && s.Current.ControlPlane.InfrastructureMachineTemplate != nil {
369			if s.Current.ControlPlane.InfrastructureMachineTemplate.GetName() != s.Desired.ControlPlane.InfrastructureMachineTemplate.GetName() {
370				if err := r.Client.Delete(ctx, s.Current.ControlPlane.InfrastructureMachineTemplate); err != nil {
371					return created, errors.Wrapf(err, "failed to delete old infrastructure machine template %s of control plane %s",
372						tlog.KObj{Obj: s.Current.ControlPlane.InfrastructureMachineTemplate},
373						tlog.KObj{Obj: s.Current.ControlPlane.Object},
374					)
375				}
376			}
377		}
378
379		return created, nil
380	}
381
382	// reconcileMachineHealthCheck creates, updates, deletes or leaves untouched a MachineHealthCheck depending on the difference between the
383	// current state and the desired state.
384	func (r *Reconciler) reconcileMachineHealthCheck(ctx context.Context, current, desired *clusterv1.MachineHealthCheck) error {
385		log := tlog.LoggerFrom(ctx)
386
387		// If a current MachineHealthCheck doesn't exist but there is a desired MachineHealthCheck, attempt to create it.
388		if current == nil && desired != nil {
389			log.Infof("Creating %s", tlog.KObj{Obj: desired})
390			helper, err := r.patchHelperFactory(ctx, nil, desired)
391			if err != nil {
392				return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: desired})
393			}
394			if err := helper.Patch(ctx); err != nil {
395				return errors.Wrapf(err, "failed to create %s", tlog.KObj{Obj: desired})
396			}
397			r.recorder.Eventf(desired, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: desired})
398			return nil
399		}
400
401		// If a current MachineHealthCheck exists but there is no desired MachineHealthCheck, attempt to delete it.
402		if current != nil && desired == nil {
403			log.Infof("Deleting %s", tlog.KObj{Obj: current})
404			if err := r.Client.Delete(ctx, current); err != nil {
405				// If the object to be deleted is not found, don't return an error.
406				if !apierrors.IsNotFound(err) {
407					return errors.Wrapf(err, "failed to delete %s", tlog.KObj{Obj: current})
408				}
409			}
410			r.recorder.Eventf(current, corev1.EventTypeNormal, deleteEventReason, "Deleted %q", tlog.KObj{Obj: current})
411			return nil
412		}
413
414		ctx, log = log.WithObject(current).Into(ctx)
415
416		// Check differences between current and desired MachineHealthChecks, and patch if required.
417		// NOTE: we want to be authoritative on the entire spec because users are
418		// expected to change MHC fields via the ClusterClass only.
419		patchHelper, err := r.patchHelperFactory(ctx, current, desired)
420		if err != nil {
421			return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: current})
422		}
423		if !patchHelper.HasChanges() {
424			log.V(3).Infof("No changes for %s", tlog.KObj{Obj: current})
425			return nil
426		}
427
428		log.Infof("Patching %s", tlog.KObj{Obj: current})
429		if err := patchHelper.Patch(ctx); err != nil {
430			return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: current})
431		}
432		r.recorder.Eventf(current, corev1.EventTypeNormal, updateEventReason, "Updated %q", tlog.KObj{Obj: current})
433		return nil
434	}
435
436	// reconcileCluster reconciles the desired state of the Cluster object.
437	// NOTE: this assumes reconcileInfrastructureCluster and reconcileControlPlane have already completed;
438	// more specifically, after a Cluster is created it is assumed that the references to the InfrastructureCluster /
439	// ControlPlane objects should never change (only the content of the objects can change).
440	func (r *Reconciler) reconcileCluster(ctx context.Context, s *scope.Scope) error {
441		ctx, log := tlog.LoggerFrom(ctx).WithObject(s.Desired.Cluster).Into(ctx)
442
443		// Check differences between current and desired state, and eventually patch the current object.
444		patchHelper, err := r.patchHelperFactory(ctx, s.Current.Cluster, s.Desired.Cluster)
445		if err != nil {
446			return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: s.Current.Cluster})
447		}
448		if !patchHelper.HasChanges() {
449			log.V(3).Infof("No changes for %s", tlog.KObj{Obj: s.Current.Cluster})
450			return nil
451		}
452
453		log.Infof("Patching %s", tlog.KObj{Obj: s.Current.Cluster})
454		if err := patchHelper.Patch(ctx); err != nil {
455			return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: s.Current.Cluster})
456		}
457		r.recorder.Eventf(s.Current.Cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q", tlog.KObj{Obj: s.Current.Cluster})
458
459		// Wait until Cluster is updated in the cache.
460		// Note: We have to do this because otherwise using a cached client in the Reconcile func could
461		// return a stale state of the Cluster we just patched (because the cache might be stale).
462		// Note: It is good enough to check that the resource version changed. Other controllers might have updated the
463		// Cluster as well, but the combination of the patch call above without a conflict and a changed resource
464		// version here guarantees that we see the changes of our own update.
465		err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
466			key := client.ObjectKey{Namespace: s.Current.Cluster.GetNamespace(), Name: s.Current.Cluster.GetName()}
467			cachedCluster := &clusterv1.Cluster{}
468			if err := r.Client.Get(ctx, key, cachedCluster); err != nil {
469				return false, err
470			}
471			return s.Current.Cluster.GetResourceVersion() != cachedCluster.GetResourceVersion(), nil
472		})
473		if err != nil {
474			return errors.Wrapf(err, "failed waiting for Cluster %s to be updated in the cache after patch", tlog.KObj{Obj: s.Current.Cluster})
475		}
476		return nil
477	}
478
479	// reconcileMachineDeployments reconciles the desired state of the MachineDeployment objects.
480	func (r *Reconciler) reconcileMachineDeployments(ctx context.Context, s *scope.Scope) error {
481		diff := calculateMachineDeploymentDiff(s.Current.MachineDeployments, s.Desired.MachineDeployments)
482
483		// Create MachineDeployments.
484		if len(diff.toCreate) > 0 {
485			// In the current state we only got the MD list via a cached call.
486			// As a consequence, in order to prevent the creation of duplicate MDs due to stale reads,
487			// we are now using a live client to double-check here that the MachineDeployment
488			// to be created doesn't exist yet.
489			currentMDTopologyNames, err := r.getCurrentMachineDeployments(ctx, s)
490			if err != nil {
491				return err
492			}
493			for _, mdTopologyName := range diff.toCreate {
494				md := s.Desired.MachineDeployments[mdTopologyName]
495
496				// Skip the MD creation if the MD already exists.
497 if currentMDTopologyNames.Has(mdTopologyName) { 498 log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object) 499 log.V(3).Infof(fmt.Sprintf("Skipping creation of MachineDeployment %s because MachineDeployment for topology %s already exists (only considered creation because of stale cache)", tlog.KObj{Obj: md.Object}, mdTopologyName)) 500 continue 501 } 502 503 if err := r.createMachineDeployment(ctx, s, md); err != nil { 504 return err 505 } 506 } 507 } 508 509 // Update MachineDeployments. 510 for _, mdTopologyName := range diff.toUpdate { 511 currentMD := s.Current.MachineDeployments[mdTopologyName] 512 desiredMD := s.Desired.MachineDeployments[mdTopologyName] 513 if err := r.updateMachineDeployment(ctx, s, mdTopologyName, currentMD, desiredMD); err != nil { 514 return err 515 } 516 } 517 518 // Delete MachineDeployments. 519 for _, mdTopologyName := range diff.toDelete { 520 md := s.Current.MachineDeployments[mdTopologyName] 521 if err := r.deleteMachineDeployment(ctx, s.Current.Cluster, md); err != nil { 522 return err 523 } 524 } 525 return nil 526 } 527 528 // getCurrentMachineDeployments gets the current list of MachineDeployments via the APIReader. 529 func (r *Reconciler) getCurrentMachineDeployments(ctx context.Context, s *scope.Scope) (sets.Set[string], error) { 530 // TODO: We should consider using PartialObjectMetadataList here. Currently this doesn't work as our 531 // implementation for topology dryrun doesn't support PartialObjectMetadataList. 532 mdList := &clusterv1.MachineDeploymentList{} 533 err := r.APIReader.List(ctx, mdList, 534 client.MatchingLabels{ 535 clusterv1.ClusterNameLabel: s.Current.Cluster.Name, 536 clusterv1.ClusterTopologyOwnedLabel: "", 537 }, 538 client.InNamespace(s.Current.Cluster.Namespace), 539 ) 540 if err != nil { 541 return nil, errors.Wrap(err, "failed to read MachineDeployments for managed topology") 542 } 543 544 currentMDs := sets.Set[string]{} 545 for _, md := range mdList.Items { 546 mdTopologyName, ok := md.ObjectMeta.Labels[clusterv1.ClusterTopologyMachineDeploymentNameLabel] 547 if ok || mdTopologyName != "" { 548 currentMDs.Insert(mdTopologyName) 549 } 550 } 551 return currentMDs, nil 552 } 553 554 // createMachineDeployment creates a MachineDeployment and the corresponding Templates. 555 func (r *Reconciler) createMachineDeployment(ctx context.Context, s *scope.Scope, md *scope.MachineDeploymentState) error { 556 // Do not create the MachineDeployment if it is marked as pending create. 557 // This will also block MHC creation because creating the MHC without the corresponding 558 // MachineDeployment is unnecessary. 559 mdTopologyName, ok := md.Object.Labels[clusterv1.ClusterTopologyMachineDeploymentNameLabel] 560 if !ok || mdTopologyName == "" { 561 // Note: This is only an additional safety check and should not happen. The label will always be added when computing 562 // the desired MachineDeployment. 563 return errors.Errorf("new MachineDeployment is missing the %q label", clusterv1.ClusterTopologyMachineDeploymentNameLabel) 564 } 565 // Return early if the MachineDeployment is pending create. 
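	// For reference (label keys as used above and in getCurrentMachineDeployments; values are
	// illustrative), a topology-owned MachineDeployment is expected to carry labels such as:
	//
	//	cluster.x-k8s.io/cluster-name: my-cluster
	//	topology.cluster.x-k8s.io/owned: ""
	//	topology.cluster.x-k8s.io/deployment-name: md-0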
566 if s.UpgradeTracker.MachineDeployments.IsPendingCreate(mdTopologyName) { 567 return nil 568 } 569 570 log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object) 571 cluster := s.Current.Cluster 572 infraCtx, _ := log.WithObject(md.InfrastructureMachineTemplate).Into(ctx) 573 infrastructureMachineCleanupFunc := func() {} 574 createdInfra, err := r.reconcileReferencedTemplate(infraCtx, reconcileReferencedTemplateInput{ 575 cluster: cluster, 576 desired: md.InfrastructureMachineTemplate, 577 }) 578 if err != nil { 579 return errors.Wrapf(err, "failed to create %s", md.Object.Kind) 580 } 581 582 if createdInfra { 583 infrastructureMachineCleanupFunc = func() { 584 // Best effort cleanup of the InfrastructureMachineTemplate; 585 // If this fails, the object will be garbage collected when the cluster is deleted. 586 if err := r.Client.Delete(ctx, md.InfrastructureMachineTemplate); err != nil { 587 log := tlog.LoggerFrom(ctx). 588 WithValues(md.InfrastructureMachineTemplate.GetObjectKind().GroupVersionKind().Kind, md.InfrastructureMachineTemplate.GetName()). 589 WithValues("err", err.Error()) 590 log.Infof("WARNING! Failed to cleanup InfrastructureMachineTemplate for MachineDeployment while handling creation error. The object will be garbage collected when the cluster is deleted.") 591 } 592 } 593 } 594 595 bootstrapCtx, _ := log.WithObject(md.BootstrapTemplate).Into(ctx) 596 bootstrapCleanupFunc := func() {} 597 createdBootstrap, err := r.reconcileReferencedTemplate(bootstrapCtx, reconcileReferencedTemplateInput{ 598 cluster: cluster, 599 desired: md.BootstrapTemplate, 600 }) 601 if err != nil { 602 // Best effort cleanup of the InfrastructureMachineTemplate (only on creation). 603 infrastructureMachineCleanupFunc() 604 return errors.Wrapf(err, "failed to create %s", md.Object.Kind) 605 } 606 607 if createdBootstrap { 608 bootstrapCleanupFunc = func() { 609 // Best effort cleanup of the BootstrapTemplate; 610 // If this fails, the object will be garbage collected when the cluster is deleted. 611 if err := r.Client.Delete(ctx, md.BootstrapTemplate); err != nil { 612 log := tlog.LoggerFrom(ctx). 613 WithValues(md.BootstrapTemplate.GetObjectKind().GroupVersionKind().Kind, md.BootstrapTemplate.GetName()). 614 WithValues("err", err.Error()) 615 log.Infof("WARNING! Failed to cleanup BootstrapTemplate for MachineDeployment while handling creation error. The object will be garbage collected when the cluster is deleted.") 616 } 617 } 618 } 619 620 log = log.WithObject(md.Object) 621 log.Infof(fmt.Sprintf("Creating %s", tlog.KObj{Obj: md.Object})) 622 helper, err := r.patchHelperFactory(ctx, nil, md.Object) 623 if err != nil { 624 // Best effort cleanup of the InfrastructureMachineTemplate & BootstrapTemplate (only on creation). 625 infrastructureMachineCleanupFunc() 626 bootstrapCleanupFunc() 627 return createErrorWithoutObjectName(ctx, err, md.Object) 628 } 629 if err := helper.Patch(ctx); err != nil { 630 // Best effort cleanup of the InfrastructureMachineTemplate & BootstrapTemplate (only on creation). 631 infrastructureMachineCleanupFunc() 632 bootstrapCleanupFunc() 633 return createErrorWithoutObjectName(ctx, err, md.Object) 634 } 635 r.recorder.Eventf(cluster, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: md.Object}) 636 637 // Wait until MachineDeployment is visible in the cache. 638 // Note: We have to do this because otherwise using a cached client in current state could 639 // miss a newly created MachineDeployment (because the cache might be stale). 
640 err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) { 641 key := client.ObjectKey{Namespace: md.Object.Namespace, Name: md.Object.Name} 642 if err := r.Client.Get(ctx, key, &clusterv1.MachineDeployment{}); err != nil { 643 if apierrors.IsNotFound(err) { 644 return false, nil 645 } 646 return false, err 647 } 648 return true, nil 649 }) 650 if err != nil { 651 return errors.Wrapf(err, "failed waiting for MachineDeployment %s to be visible in the cache after create", md.Object.Kind) 652 } 653 654 // If the MachineDeployment has defined a MachineHealthCheck reconcile it. 655 if md.MachineHealthCheck != nil { 656 if err := r.reconcileMachineHealthCheck(ctx, nil, md.MachineHealthCheck); err != nil { 657 return err 658 } 659 } 660 return nil 661 } 662 663 // updateMachineDeployment updates a MachineDeployment. Also rotates the corresponding Templates if necessary. 664 func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope, mdTopologyName string, currentMD, desiredMD *scope.MachineDeploymentState) error { 665 log := tlog.LoggerFrom(ctx).WithMachineDeployment(desiredMD.Object) 666 667 // Patch MachineHealthCheck for the MachineDeployment. 668 // MHC changes are not Kubernetes version dependent, therefore proceed with MHC reconciliation 669 // even if the MachineDeployment is pending an upgrade. 670 if desiredMD.MachineHealthCheck != nil || currentMD.MachineHealthCheck != nil { 671 if err := r.reconcileMachineHealthCheck(ctx, currentMD.MachineHealthCheck, desiredMD.MachineHealthCheck); err != nil { 672 return err 673 } 674 } 675 676 // Return early if the MachineDeployment is pending an upgrade. 677 // Do not reconcile the MachineDeployment yet to avoid updating the MachineDeployment while it is still pending a 678 // version upgrade. This will prevent the MachineDeployment from performing a double rollout. 679 if s.UpgradeTracker.MachineDeployments.IsPendingUpgrade(currentMD.Object.Name) { 680 return nil 681 } 682 683 cluster := s.Current.Cluster 684 infraCtx, _ := log.WithObject(desiredMD.InfrastructureMachineTemplate).Into(ctx) 685 infrastructureMachineCleanupFunc := func() {} 686 createdInfra, err := r.reconcileReferencedTemplate(infraCtx, reconcileReferencedTemplateInput{ 687 cluster: cluster, 688 ref: &desiredMD.Object.Spec.Template.Spec.InfrastructureRef, 689 current: currentMD.InfrastructureMachineTemplate, 690 desired: desiredMD.InfrastructureMachineTemplate, 691 templateNamePrefix: topologynames.InfrastructureMachineTemplateNamePrefix(cluster.Name, mdTopologyName), 692 compatibilityChecker: check.ObjectsAreCompatible, 693 }) 694 if err != nil { 695 return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: currentMD.Object}) 696 } 697 698 if createdInfra { 699 infrastructureMachineCleanupFunc = func() { 700 // Best effort cleanup of the InfrastructureMachineTemplate; 701 // If this fails, the object will be garbage collected when the cluster is deleted. 702 if err := r.Client.Delete(ctx, desiredMD.InfrastructureMachineTemplate); err != nil { 703 log := tlog.LoggerFrom(ctx). 704 WithValues(desiredMD.InfrastructureMachineTemplate.GetObjectKind().GroupVersionKind().Kind, desiredMD.InfrastructureMachineTemplate.GetName()). 705 WithValues("err", err.Error()) 706 log.Infof("WARNING! Failed to cleanup InfrastructureMachineTemplate for MachineDeployment while handling update error. 
The object will be garbage collected when the cluster is deleted.") 707 } 708 } 709 } 710 711 bootstrapCtx, _ := log.WithObject(desiredMD.BootstrapTemplate).Into(ctx) 712 bootstrapCleanupFunc := func() {} 713 createdBootstrap, err := r.reconcileReferencedTemplate(bootstrapCtx, reconcileReferencedTemplateInput{ 714 cluster: cluster, 715 ref: desiredMD.Object.Spec.Template.Spec.Bootstrap.ConfigRef, 716 current: currentMD.BootstrapTemplate, 717 desired: desiredMD.BootstrapTemplate, 718 templateNamePrefix: topologynames.BootstrapTemplateNamePrefix(cluster.Name, mdTopologyName), 719 compatibilityChecker: check.ObjectsAreInTheSameNamespace, 720 }) 721 if err != nil { 722 // Best effort cleanup of the InfrastructureMachineTemplate (only on template rotation). 723 infrastructureMachineCleanupFunc() 724 return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: currentMD.Object}) 725 } 726 727 if createdBootstrap { 728 bootstrapCleanupFunc = func() { 729 // Best effort cleanup of the BootstrapTemplate; 730 // If this fails, the object will be garbage collected when the cluster is deleted. 731 if err := r.Client.Delete(ctx, desiredMD.BootstrapTemplate); err != nil { 732 log := tlog.LoggerFrom(ctx). 733 WithValues(desiredMD.BootstrapTemplate.GetObjectKind().GroupVersionKind().Kind, desiredMD.BootstrapTemplate.GetName()). 734 WithValues("err", err.Error()) 735 log.Infof("WARNING! Failed to cleanup BootstrapTemplate for MachineDeployment while handling update error. The object will be garbage collected when the cluster is deleted.") 736 } 737 } 738 } 739 740 // Check differences between current and desired MachineDeployment, and eventually patch the current object. 741 log = log.WithObject(desiredMD.Object) 742 patchHelper, err := r.patchHelperFactory(ctx, currentMD.Object, desiredMD.Object) 743 if err != nil { 744 // Best effort cleanup of the InfrastructureMachineTemplate & BootstrapTemplate (only on template rotation). 745 infrastructureMachineCleanupFunc() 746 bootstrapCleanupFunc() 747 return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: currentMD.Object}) 748 } 749 if !patchHelper.HasChanges() { 750 log.V(3).Infof("No changes for %s", tlog.KObj{Obj: currentMD.Object}) 751 return nil 752 } 753 754 log.Infof("Patching %s", tlog.KObj{Obj: currentMD.Object}) 755 if err := patchHelper.Patch(ctx); err != nil { 756 // Best effort cleanup of the InfrastructureMachineTemplate & BootstrapTemplate (only on template rotation). 757 infrastructureMachineCleanupFunc() 758 bootstrapCleanupFunc() 759 return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: currentMD.Object}) 760 } 761 r.recorder.Eventf(cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q%s", tlog.KObj{Obj: currentMD.Object}, logMachineDeploymentVersionChange(currentMD.Object, desiredMD.Object)) 762 763 // Wait until MachineDeployment is updated in the cache. 764 // Note: We have to do this because otherwise using a cached client in current state could 765 // return a stale state of a MachineDeployment we just patched (because the cache might be stale). 766 // Note: It is good enough to check that the resource version changed. Other controllers might have updated the 767 // MachineDeployment as well, but the combination of the patch call above without a conflict and a changed resource 768 // version here guarantees that we see the changes of our own update. 
769 err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) { 770 key := client.ObjectKey{Namespace: currentMD.Object.GetNamespace(), Name: currentMD.Object.GetName()} 771 cachedMD := &clusterv1.MachineDeployment{} 772 if err := r.Client.Get(ctx, key, cachedMD); err != nil { 773 return false, err 774 } 775 return currentMD.Object.GetResourceVersion() != cachedMD.GetResourceVersion(), nil 776 }) 777 if err != nil { 778 return errors.Wrapf(err, "failed waiting for MachineDeployment %s to be updated in the cache after patch", tlog.KObj{Obj: currentMD.Object}) 779 } 780 781 // We want to call both cleanup functions even if one of them fails to clean up as much as possible. 782 return nil 783 } 784 785 func logMachineDeploymentVersionChange(current, desired *clusterv1.MachineDeployment) string { 786 if current.Spec.Template.Spec.Version == nil || desired.Spec.Template.Spec.Version == nil { 787 return "" 788 } 789 790 if *current.Spec.Template.Spec.Version != *desired.Spec.Template.Spec.Version { 791 return fmt.Sprintf(" with version change from %s to %s", *current.Spec.Template.Spec.Version, *desired.Spec.Template.Spec.Version) 792 } 793 return "" 794 } 795 796 // deleteMachineDeployment deletes a MachineDeployment. 797 func (r *Reconciler) deleteMachineDeployment(ctx context.Context, cluster *clusterv1.Cluster, md *scope.MachineDeploymentState) error { 798 log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object).WithObject(md.Object) 799 800 // delete MachineHealthCheck for the MachineDeployment. 801 if md.MachineHealthCheck != nil { 802 if err := r.reconcileMachineHealthCheck(ctx, md.MachineHealthCheck, nil); err != nil { 803 return err 804 } 805 } 806 log.Infof("Deleting %s", tlog.KObj{Obj: md.Object}) 807 if err := r.Client.Delete(ctx, md.Object); err != nil && !apierrors.IsNotFound(err) { 808 return errors.Wrapf(err, "failed to delete %s", tlog.KObj{Obj: md.Object}) 809 } 810 r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted %q", tlog.KObj{Obj: md.Object}) 811 return nil 812 } 813 814 // reconcileMachinePools reconciles the desired state of the MachinePool objects. 815 func (r *Reconciler) reconcileMachinePools(ctx context.Context, s *scope.Scope) error { 816 diff := calculateMachinePoolDiff(s.Current.MachinePools, s.Desired.MachinePools) 817 818 // Create MachinePools. 819 if len(diff.toCreate) > 0 { 820 // In current state we only got the MP list via a cached call. 821 // As a consequence, in order to prevent the creation of duplicate MP due to stale reads, 822 // we are now using a live client to double-check here that the MachinePool 823 // to be created doesn't exist yet. 824 currentMPTopologyNames, err := r.getCurrentMachinePools(ctx, s) 825 if err != nil { 826 return err 827 } 828 for _, mpTopologyName := range diff.toCreate { 829 mp := s.Desired.MachinePools[mpTopologyName] 830 831 // Skip the MP creation if the MP already exists. 832 if currentMPTopologyNames.Has(mpTopologyName) { 833 log := tlog.LoggerFrom(ctx).WithMachinePool(mp.Object) 834 log.V(3).Infof(fmt.Sprintf("Skipping creation of MachinePool %s because MachinePool for topology %s already exists (only considered creation because of stale cache)", tlog.KObj{Obj: mp.Object}, mpTopologyName)) 835 continue 836 } 837 838 if err := r.createMachinePool(ctx, s, mp); err != nil { 839 return err 840 } 841 } 842 } 843 844 // Update MachinePools. 
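	// Diff semantics, with hypothetical topology names: given current = {"pool-a", "pool-b"} and
	// desired = {"pool-b", "pool-c"}, calculateMachinePoolDiff (defined further below) returns
	// toCreate=["pool-c"], toUpdate=["pool-b"] (handled by this loop) and toDelete=["pool-a"].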
845 for _, mpTopologyName := range diff.toUpdate { 846 currentMP := s.Current.MachinePools[mpTopologyName] 847 desiredMP := s.Desired.MachinePools[mpTopologyName] 848 if err := r.updateMachinePool(ctx, s, currentMP, desiredMP); err != nil { 849 return err 850 } 851 } 852 853 // Delete MachinePools. 854 for _, mpTopologyName := range diff.toDelete { 855 mp := s.Current.MachinePools[mpTopologyName] 856 if err := r.deleteMachinePool(ctx, s.Current.Cluster, mp); err != nil { 857 return err 858 } 859 } 860 861 return nil 862 } 863 864 // getCurrentMachinePools gets the current list of MachinePools via the APIReader. 865 func (r *Reconciler) getCurrentMachinePools(ctx context.Context, s *scope.Scope) (sets.Set[string], error) { 866 // TODO: We should consider using PartialObjectMetadataList here. Currently this doesn't work as our 867 // implementation for topology dryrun doesn't support PartialObjectMetadataList. 868 mpList := &expv1.MachinePoolList{} 869 err := r.APIReader.List(ctx, mpList, 870 client.MatchingLabels{ 871 clusterv1.ClusterNameLabel: s.Current.Cluster.Name, 872 clusterv1.ClusterTopologyOwnedLabel: "", 873 }, 874 client.InNamespace(s.Current.Cluster.Namespace), 875 ) 876 if err != nil { 877 return nil, errors.Wrap(err, "failed to read MachinePools for managed topology") 878 } 879 880 currentMPs := sets.Set[string]{} 881 for _, mp := range mpList.Items { 882 mpTopologyName, ok := mp.ObjectMeta.Labels[clusterv1.ClusterTopologyMachinePoolNameLabel] 883 if ok || mpTopologyName != "" { 884 currentMPs.Insert(mpTopologyName) 885 } 886 } 887 return currentMPs, nil 888 } 889 890 // createMachinePool creates a MachinePool and the corresponding templates. 891 func (r *Reconciler) createMachinePool(ctx context.Context, s *scope.Scope, mp *scope.MachinePoolState) error { 892 // Do not create the MachinePool if it is marked as pending create. 893 mpTopologyName, ok := mp.Object.Labels[clusterv1.ClusterTopologyMachinePoolNameLabel] 894 if !ok || mpTopologyName == "" { 895 // Note: This is only an additional safety check and should not happen. The label will always be added when computing 896 // the desired MachinePool. 897 return errors.Errorf("new MachinePool is missing the %q label", clusterv1.ClusterTopologyMachinePoolNameLabel) 898 } 899 // Return early if the MachinePool is pending create. 900 if s.UpgradeTracker.MachinePools.IsPendingCreate(mpTopologyName) { 901 return nil 902 } 903 904 log := tlog.LoggerFrom(ctx).WithMachinePool(mp.Object) 905 cluster := s.Current.Cluster 906 infraCtx, _ := log.WithObject(mp.InfrastructureMachinePoolObject).Into(ctx) 907 infrastructureMachineMachinePoolCleanupFunc := func() {} 908 createdInfrastructureMachinePool, err := r.reconcileReferencedObject(infraCtx, reconcileReferencedObjectInput{ 909 cluster: cluster, 910 desired: mp.InfrastructureMachinePoolObject, 911 }) 912 if err != nil { 913 return errors.Wrapf(err, "failed to create %s", mp.Object.Kind) 914 } 915 916 if createdInfrastructureMachinePool { 917 infrastructureMachineMachinePoolCleanupFunc = func() { 918 // Best effort cleanup of the InfrastructureMachinePool; 919 // If this fails, the object will be garbage collected when the cluster is deleted. 920 if err := r.Client.Delete(ctx, mp.InfrastructureMachinePoolObject); err != nil { 921 log := tlog.LoggerFrom(ctx). 922 WithValues(mp.InfrastructureMachinePoolObject.GetObjectKind().GroupVersionKind().Kind, mp.InfrastructureMachinePoolObject.GetName()). 923 WithValues("err", err.Error()) 924 log.Infof("WARNING! 
Failed to cleanup InfrastructureMachinePoolObject for MachinePool while handling creation error. The object will be garbage collected when the cluster is deleted.") 925 } 926 } 927 } 928 929 bootstrapCtx, _ := log.WithObject(mp.BootstrapObject).Into(ctx) 930 bootstrapCleanupFunc := func() {} 931 createdBootstrap, err := r.reconcileReferencedObject(bootstrapCtx, reconcileReferencedObjectInput{ 932 cluster: cluster, 933 desired: mp.BootstrapObject, 934 }) 935 if err != nil { 936 // Best effort cleanup of the InfrastructureMachinePool (only on creation). 937 infrastructureMachineMachinePoolCleanupFunc() 938 return errors.Wrapf(err, "failed to create %s", mp.Object.Kind) 939 } 940 941 if createdBootstrap { 942 bootstrapCleanupFunc = func() { 943 // Best effort cleanup of the BootstrapConfig; 944 // If this fails, the object will be garbage collected when the cluster is deleted. 945 if err := r.Client.Delete(ctx, mp.BootstrapObject); err != nil { 946 log := tlog.LoggerFrom(ctx). 947 WithValues(mp.BootstrapObject.GetObjectKind().GroupVersionKind().Kind, mp.BootstrapObject.GetName()). 948 WithValues("err", err.Error()) 949 log.Infof("WARNING! Failed to cleanup BootstrapObject for MachinePool while handling creation error. The object will be garbage collected when the cluster is deleted.") 950 } 951 } 952 } 953 954 log = log.WithObject(mp.Object) 955 log.Infof(fmt.Sprintf("Creating %s", tlog.KObj{Obj: mp.Object})) 956 helper, err := r.patchHelperFactory(ctx, nil, mp.Object) 957 if err != nil { 958 // Best effort cleanup of the InfrastructureMachinePool & BootstrapConfig (only on creation). 959 infrastructureMachineMachinePoolCleanupFunc() 960 bootstrapCleanupFunc() 961 return createErrorWithoutObjectName(ctx, err, mp.Object) 962 } 963 if err := helper.Patch(ctx); err != nil { 964 // Best effort cleanup of the InfrastructureMachinePool & BootstrapConfig (only on creation). 965 infrastructureMachineMachinePoolCleanupFunc() 966 bootstrapCleanupFunc() 967 return createErrorWithoutObjectName(ctx, err, mp.Object) 968 } 969 r.recorder.Eventf(cluster, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: mp.Object}) 970 971 // Wait until MachinePool is visible in the cache. 972 // Note: We have to do this because otherwise using a cached client in current state could 973 // miss a newly created MachinePool (because the cache might be stale). 974 err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) { 975 key := client.ObjectKey{Namespace: mp.Object.Namespace, Name: mp.Object.Name} 976 if err := r.Client.Get(ctx, key, &expv1.MachinePool{}); err != nil { 977 if apierrors.IsNotFound(err) { 978 return false, nil 979 } 980 return false, err 981 } 982 return true, nil 983 }) 984 if err != nil { 985 return errors.Wrapf(err, "failed waiting for MachinePool %s to be visible in the cache after create", mp.Object.Kind) 986 } 987 988 return nil 989 } 990 991 // updateMachinePool updates a MachinePool. Also updates the corresponding objects if necessary. 992 func (r *Reconciler) updateMachinePool(ctx context.Context, s *scope.Scope, currentMP, desiredMP *scope.MachinePoolState) error { 993 log := tlog.LoggerFrom(ctx).WithMachinePool(desiredMP.Object) 994 995 // Return early if the MachinePool is pending an upgrade. 996 // Do not reconcile the MachinePool yet to avoid updating the MachinePool while it is still pending a 997 // version upgrade. This will prevent the MachinePool from performing a double rollout. 
998 if s.UpgradeTracker.MachinePools.IsPendingUpgrade(currentMP.Object.Name) { 999 return nil 1000 } 1001 1002 cluster := s.Current.Cluster 1003 infraCtx, _ := log.WithObject(desiredMP.InfrastructureMachinePoolObject).Into(ctx) 1004 if _, err := r.reconcileReferencedObject(infraCtx, reconcileReferencedObjectInput{ 1005 cluster: cluster, 1006 current: currentMP.InfrastructureMachinePoolObject, 1007 desired: desiredMP.InfrastructureMachinePoolObject, 1008 }); err != nil { 1009 return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: currentMP.Object}) 1010 } 1011 1012 bootstrapCtx, _ := log.WithObject(desiredMP.BootstrapObject).Into(ctx) 1013 if _, err := r.reconcileReferencedObject(bootstrapCtx, reconcileReferencedObjectInput{ 1014 cluster: cluster, 1015 current: currentMP.BootstrapObject, 1016 desired: desiredMP.BootstrapObject, 1017 }); err != nil { 1018 return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: currentMP.Object}) 1019 } 1020 1021 // Check differences between current and desired MachinePool, and eventually patch the current object. 1022 log = log.WithObject(desiredMP.Object) 1023 patchHelper, err := r.patchHelperFactory(ctx, currentMP.Object, desiredMP.Object) 1024 if err != nil { 1025 return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: currentMP.Object}) 1026 } 1027 if !patchHelper.HasChanges() { 1028 log.V(3).Infof("No changes for %s", tlog.KObj{Obj: currentMP.Object}) 1029 return nil 1030 } 1031 1032 log.Infof("Patching %s", tlog.KObj{Obj: currentMP.Object}) 1033 if err := patchHelper.Patch(ctx); err != nil { 1034 return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: currentMP.Object}) 1035 } 1036 r.recorder.Eventf(cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q%s", tlog.KObj{Obj: currentMP.Object}, logMachinePoolVersionChange(currentMP.Object, desiredMP.Object)) 1037 1038 // Wait until MachinePool is updated in the cache. 1039 // Note: We have to do this because otherwise using a cached client in current state could 1040 // return a stale state of a MachinePool we just patched (because the cache might be stale). 1041 // Note: It is good enough to check that the resource version changed. Other controllers might have updated the 1042 // MachinePool as well, but the combination of the patch call above without a conflict and a changed resource 1043 // version here guarantees that we see the changes of our own update. 1044 err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) { 1045 key := client.ObjectKey{Namespace: currentMP.Object.GetNamespace(), Name: currentMP.Object.GetName()} 1046 cachedMP := &expv1.MachinePool{} 1047 if err := r.Client.Get(ctx, key, cachedMP); err != nil { 1048 return false, err 1049 } 1050 return currentMP.Object.GetResourceVersion() != cachedMP.GetResourceVersion(), nil 1051 }) 1052 if err != nil { 1053 return errors.Wrapf(err, "failed waiting for MachinePool %s to be updated in the cache after patch", tlog.KObj{Obj: currentMP.Object}) 1054 } 1055 1056 // We want to call both cleanup functions even if one of them fails to clean up as much as possible. 
1057 return nil 1058 } 1059 1060 func logMachinePoolVersionChange(current, desired *expv1.MachinePool) string { 1061 if current.Spec.Template.Spec.Version == nil || desired.Spec.Template.Spec.Version == nil { 1062 return "" 1063 } 1064 1065 if *current.Spec.Template.Spec.Version != *desired.Spec.Template.Spec.Version { 1066 return fmt.Sprintf(" with version change from %s to %s", *current.Spec.Template.Spec.Version, *desired.Spec.Template.Spec.Version) 1067 } 1068 return "" 1069 } 1070 1071 // deleteMachinePool deletes a MachinePool. 1072 func (r *Reconciler) deleteMachinePool(ctx context.Context, cluster *clusterv1.Cluster, mp *scope.MachinePoolState) error { 1073 log := tlog.LoggerFrom(ctx).WithMachinePool(mp.Object).WithObject(mp.Object) 1074 log.Infof("Deleting %s", tlog.KObj{Obj: mp.Object}) 1075 if err := r.Client.Delete(ctx, mp.Object); err != nil && !apierrors.IsNotFound(err) { 1076 return errors.Wrapf(err, "failed to delete %s", tlog.KObj{Obj: mp.Object}) 1077 } 1078 r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted %q", tlog.KObj{Obj: mp.Object}) 1079 return nil 1080 } 1081 1082 type machineDiff struct { 1083 toCreate, toUpdate, toDelete []string 1084 } 1085 1086 // calculateMachineDeploymentDiff compares two maps of MachineDeploymentState and calculates which 1087 // MachineDeployments should be created, updated or deleted. 1088 func calculateMachineDeploymentDiff(current, desired map[string]*scope.MachineDeploymentState) machineDiff { 1089 var diff machineDiff 1090 1091 for md := range desired { 1092 if _, ok := current[md]; ok { 1093 diff.toUpdate = append(diff.toUpdate, md) 1094 } else { 1095 diff.toCreate = append(diff.toCreate, md) 1096 } 1097 } 1098 1099 for md := range current { 1100 if _, ok := desired[md]; !ok { 1101 diff.toDelete = append(diff.toDelete, md) 1102 } 1103 } 1104 1105 return diff 1106 } 1107 1108 // calculateMachinePoolDiff compares two maps of MachinePoolState and calculates which 1109 // MachinePools should be created, updated or deleted. 1110 func calculateMachinePoolDiff(current, desired map[string]*scope.MachinePoolState) machineDiff { 1111 var diff machineDiff 1112 1113 for mp := range desired { 1114 if _, ok := current[mp]; ok { 1115 diff.toUpdate = append(diff.toUpdate, mp) 1116 } else { 1117 diff.toCreate = append(diff.toCreate, mp) 1118 } 1119 } 1120 1121 for mp := range current { 1122 if _, ok := desired[mp]; !ok { 1123 diff.toDelete = append(diff.toDelete, mp) 1124 } 1125 } 1126 1127 return diff 1128 } 1129 1130 type unstructuredVersionGetter func(obj *unstructured.Unstructured) (*string, error) 1131 1132 type reconcileReferencedObjectInput struct { 1133 cluster *clusterv1.Cluster 1134 current *unstructured.Unstructured 1135 desired *unstructured.Unstructured 1136 versionGetter unstructuredVersionGetter 1137 ignorePaths []contract.Path 1138 } 1139 1140 // reconcileReferencedObject reconciles the desired state of the referenced object. 1141 // Returns true if the referencedObject is created. 1142 // NOTE: After a referenced object is created it is assumed that the reference should 1143 // never change (only the content of the object can eventually change). Thus, we are checking for strict compatibility. 1144 func (r *Reconciler) reconcileReferencedObject(ctx context.Context, in reconcileReferencedObjectInput) (bool, error) { 1145 log := tlog.LoggerFrom(ctx) 1146 1147 // If there is no current object, create it. 
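	// Hedged note on in.ignorePaths: the paths listed there are excluded from the desired object
	// before the patch helper computes changes, so fields owned by other controllers (for the
	// InfrastructureCluster, for example the provider-managed control plane endpoint) are not
	// continuously reverted by the topology controller.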
1148 if in.current == nil { 1149 log.Infof("Creating %s", tlog.KObj{Obj: in.desired}) 1150 helper, err := r.patchHelperFactory(ctx, nil, in.desired, structuredmerge.IgnorePaths(in.ignorePaths)) 1151 if err != nil { 1152 return false, errors.Wrap(createErrorWithoutObjectName(ctx, err, in.desired), "failed to create patch helper") 1153 } 1154 if err := helper.Patch(ctx); err != nil { 1155 return false, createErrorWithoutObjectName(ctx, err, in.desired) 1156 } 1157 r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: in.desired}) 1158 return true, nil 1159 } 1160 1161 // Check if the current and desired referenced object are compatible. 1162 if allErrs := check.ObjectsAreStrictlyCompatible(in.current, in.desired); len(allErrs) > 0 { 1163 return false, allErrs.ToAggregate() 1164 } 1165 1166 // Check differences between current and desired state, and eventually patch the current object. 1167 patchHelper, err := r.patchHelperFactory(ctx, in.current, in.desired, structuredmerge.IgnorePaths(in.ignorePaths)) 1168 if err != nil { 1169 return false, errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: in.current}) 1170 } 1171 if !patchHelper.HasChanges() { 1172 log.V(3).Infof("No changes for %s", tlog.KObj{Obj: in.desired}) 1173 return false, nil 1174 } 1175 1176 log.Infof("Patching %s", tlog.KObj{Obj: in.desired}) 1177 if err := patchHelper.Patch(ctx); err != nil { 1178 return false, errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: in.current}) 1179 } 1180 r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q%s", tlog.KObj{Obj: in.desired}, logUnstructuredVersionChange(in.current, in.desired, in.versionGetter)) 1181 return false, nil 1182 } 1183 1184 func logUnstructuredVersionChange(current, desired *unstructured.Unstructured, versionGetter unstructuredVersionGetter) string { 1185 if versionGetter == nil { 1186 return "" 1187 } 1188 1189 currentVersion, err := versionGetter(current) 1190 if err != nil || currentVersion == nil { 1191 return "" 1192 } 1193 desiredVersion, err := versionGetter(desired) 1194 if err != nil || desiredVersion == nil { 1195 return "" 1196 } 1197 1198 if *currentVersion != *desiredVersion { 1199 return fmt.Sprintf(" with version change from %s to %s", *currentVersion, *desiredVersion) 1200 } 1201 return "" 1202 } 1203 1204 type reconcileReferencedTemplateInput struct { 1205 cluster *clusterv1.Cluster 1206 ref *corev1.ObjectReference 1207 current *unstructured.Unstructured 1208 desired *unstructured.Unstructured 1209 templateNamePrefix string 1210 compatibilityChecker func(current, desired client.Object) field.ErrorList 1211 } 1212 1213 // reconcileReferencedTemplate reconciles the desired state of a referenced Template. 1214 // Returns true if the referencedTemplate is created. 1215 // NOTE: According to Cluster API operational practices, when a referenced Template changes a template rotation is required: 1216 // 1. create a new Template 1217 // 2. update the reference 1218 // 3. delete the old Template 1219 // This function specifically takes care of the first step and updates the reference locally. So the remaining steps 1220 // can be executed afterwards. 1221 // NOTE: This func has a side effect in case of template rotation, changing both the desired object and the object reference. 
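// Illustrative example with hypothetical names: if the current template "my-cluster-md-0-abc12"
// differs in spec from the desired one, a replacement such as "my-cluster-md-0-x7k2q" is created
// (its name generated from in.templateNamePrefix via names.SimpleNameGenerator), in.desired and
// in.ref.Name are switched to the new name, and deleting "my-cluster-md-0-abc12" is left to the
// caller once the object holding the reference has been patched.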
func (r *Reconciler) reconcileReferencedTemplate(ctx context.Context, in reconcileReferencedTemplateInput) (bool, error) {
	log := tlog.LoggerFrom(ctx)

	// If there is no current object, create the desired object.
	if in.current == nil {
		log.Infof("Creating %s", tlog.KObj{Obj: in.desired})
		helper, err := r.patchHelperFactory(ctx, nil, in.desired)
		if err != nil {
			return false, errors.Wrap(createErrorWithoutObjectName(ctx, err, in.desired), "failed to create patch helper")
		}
		if err := helper.Patch(ctx); err != nil {
			return false, createErrorWithoutObjectName(ctx, err, in.desired)
		}
		r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: in.desired})
		return true, nil
	}

	if in.ref == nil {
		return false, errors.Errorf("failed to rotate %s: ref should not be nil", in.desired.GroupVersionKind())
	}

	// Check if the current and desired referenced objects are compatible.
	if allErrs := in.compatibilityChecker(in.current, in.desired); len(allErrs) > 0 {
		return false, allErrs.ToAggregate()
	}

	// Check for differences between the current and desired objects and, if there are changes, start the template rotation.
	patchHelper, err := r.patchHelperFactory(ctx, in.current, in.desired)
	if err != nil {
		return false, errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: in.current})
	}

	// Return if no changes are detected.
	if !patchHelper.HasChanges() {
		log.V(3).Infof("No changes for %s", tlog.KObj{Obj: in.desired})
		return false, nil
	}

	// If there are no changes in the spec, and thus only changes in metadata, patch the object in place
	// instead of doing a full template rotation. This avoids recreating machines.
	if !patchHelper.HasSpecChanges() {
		log.Infof("Patching %s", tlog.KObj{Obj: in.desired})
		if err := patchHelper.Patch(ctx); err != nil {
			return false, errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: in.desired})
		}
		r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q (metadata changes)", tlog.KObj{Obj: in.desired})
		return false, nil
	}

	// Create the new template.

	// NOTE: A new name must be assigned because, during compute, the desired object name is enforced to be equal to the current one.
	// TODO: find a way to make the side effect more explicit.
	newName := names.SimpleNameGenerator.GenerateName(in.templateNamePrefix)
	in.desired.SetName(newName)

	log.Infof("Rotating %s, new name %s", tlog.KObj{Obj: in.current}, newName)
	log.Infof("Creating %s", tlog.KObj{Obj: in.desired})
	helper, err := r.patchHelperFactory(ctx, nil, in.desired)
	if err != nil {
		return false, errors.Wrap(createErrorWithoutObjectName(ctx, err, in.desired), "failed to create patch helper")
	}
	if err := helper.Patch(ctx); err != nil {
		return false, createErrorWithoutObjectName(ctx, err, in.desired)
	}
	r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, createEventReason, "Created %q as a replacement for %q (template rotation)", tlog.KObj{Obj: in.desired}, in.ref.Name)

	// Update the reference with the new name.
	// NOTE: Updating the object hosting the reference to the template is executed outside this func.
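	// Only the ObjectReference passed in via in.ref is updated below; persisting the new name on the object
	// that holds the reference (e.g. the ControlPlane or a MachineDeployment) is the caller's responsibility.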
	// TODO: find a way to make the side effect more explicit.
	in.ref.Name = newName

	return true, nil
}

// createErrorWithoutObjectName removes the name of the object from the error message. As each new Create call involves an
// object with a unique generated name, each error would otherwise appear to be a different error. As the errors are being
// surfaced in a condition on the Cluster, the name is removed here to prevent each creation error from triggering a new reconciliation.
func createErrorWithoutObjectName(ctx context.Context, err error, obj client.Object) error {
	log := ctrl.LoggerFrom(ctx)
	if obj != nil {
		log = log.WithValues(obj.GetObjectKind().GroupVersionKind().Kind, klog.KObj(obj))
	}
	log.Error(err, "Failed to create object")

	var statusError *apierrors.StatusError
	if errors.As(err, &statusError) {
		var msg string
		if statusError.Status().Details != nil {
			var causes []string
			for _, cause := range statusError.Status().Details.Causes {
				causes = append(causes, fmt.Sprintf("%s: %s: %s", cause.Type, cause.Field, cause.Message))
			}
			if len(causes) > 0 {
				msg = fmt.Sprintf("failed to create %s.%s: %s", statusError.Status().Details.Kind, statusError.Status().Details.Group, strings.Join(causes, " "))
			} else {
				msg = fmt.Sprintf("failed to create %s.%s", statusError.Status().Details.Kind, statusError.Status().Details.Group)
			}
			statusError.ErrStatus.Message = msg
			return statusError
		}

		if statusError.Status().Message != "" {
			if obj != nil {
				msg = fmt.Sprintf("failed to create %s", obj.GetObjectKind().GroupVersionKind().GroupKind().String())
			} else {
				msg = "failed to create object"
			}
		}
		statusError.ErrStatus.Message = msg
		return statusError
	}

	// If this isn't a StatusError, return a more generic error with the object details.
	if obj != nil {
		return errors.Errorf("failed to create %s", obj.GetObjectKind().GroupVersionKind().GroupKind().String())
	}
	return errors.New("failed to create object")
}
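
// The two functions below are illustrative sketches added for documentation purposes only; they are not part
// of the upstream reconciler, and all topology, object and field names they use are hypothetical.

// exampleMachineDeploymentDiff shows the behavior of calculateMachineDeploymentDiff: given a current state
// with MachineDeployment topologies "md-a" and "md-b" and a desired state with "md-b" and "md-c", the
// resulting diff reports "md-c" to create, "md-b" to update and "md-a" to delete.
func exampleMachineDeploymentDiff() machineDiff {
	current := map[string]*scope.MachineDeploymentState{"md-a": {}, "md-b": {}}
	desired := map[string]*scope.MachineDeploymentState{"md-b": {}, "md-c": {}}
	return calculateMachineDeploymentDiff(current, desired)
}

// exampleCreateErrorWithoutObjectName shows how createErrorWithoutObjectName rewrites an Invalid StatusError
// returned by a create call: the unique generated object name is dropped and only the kind, group and failure
// causes are kept, so repeated create failures surface as a stable message in the Cluster condition.
func exampleCreateErrorWithoutObjectName(ctx context.Context) error {
	obj := &unstructured.Unstructured{}
	obj.SetAPIVersion("infrastructure.cluster.x-k8s.io/v1beta1")
	obj.SetKind("DockerMachineTemplate")
	obj.SetName("md-0-infra-abc123") // unique generated name that should not leak into the message

	// A typical admission failure returned by the API server on create.
	err := apierrors.NewInvalid(
		obj.GroupVersionKind().GroupKind(),
		obj.GetName(),
		field.ErrorList{field.Invalid(field.NewPath("spec", "template", "spec", "customImage"), "not-an-image", "invalid image")},
	)

	// The returned message reads
	//   "failed to create DockerMachineTemplate.infrastructure.cluster.x-k8s.io: FieldValueInvalid: spec.template.spec.customImage: ..."
	// regardless of the generated object name.
	return createErrorWithoutObjectName(ctx, err, obj)
}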