sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/topology/cluster/reconcile_state.go

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cluster
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  	"time"
    24  
    25  	"github.com/pkg/errors"
    26  	corev1 "k8s.io/api/core/v1"
    27  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    30  	"k8s.io/apimachinery/pkg/util/sets"
    31  	"k8s.io/apimachinery/pkg/util/validation/field"
    32  	"k8s.io/apimachinery/pkg/util/wait"
    33  	"k8s.io/apiserver/pkg/storage/names"
    34  	"k8s.io/klog/v2"
    35  	ctrl "sigs.k8s.io/controller-runtime"
    36  	"sigs.k8s.io/controller-runtime/pkg/client"
    37  
    38  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    39  	expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
    40  	runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1"
    41  	"sigs.k8s.io/cluster-api/feature"
    42  	"sigs.k8s.io/cluster-api/internal/contract"
    43  	"sigs.k8s.io/cluster-api/internal/controllers/topology/cluster/scope"
    44  	"sigs.k8s.io/cluster-api/internal/controllers/topology/cluster/structuredmerge"
    45  	"sigs.k8s.io/cluster-api/internal/hooks"
    46  	tlog "sigs.k8s.io/cluster-api/internal/log"
    47  	"sigs.k8s.io/cluster-api/internal/topology/check"
    48  )
    49  
    50  const (
    51  	createEventReason = "TopologyCreate"
    52  	updateEventReason = "TopologyUpdate"
    53  	deleteEventReason = "TopologyDelete"
    54  )
    55  
    56  // reconcileState reconciles the current and desired state of the managed Cluster topology.
    57  // NOTE: We are assuming all the required objects are provided as input; also, in case of any error,
    58  // the entire reconcile operation will fail. This might be improved in the future if support for reconciling
    59  // a subset of a topology is implemented.
    60  func (r *Reconciler) reconcileState(ctx context.Context, s *scope.Scope) error {
    61  	log := tlog.LoggerFrom(ctx)
    62  	log.Infof("Reconciling state for topology owned objects")
    63  
    64  	// Reconcile the Cluster shim, a temporary object used as a means to collect
    65  	// objects/templates that can be orphaned in case of errors during the
    66  	// remaining part of the reconcile process.
    67  	if err := r.reconcileClusterShim(ctx, s); err != nil {
    68  		return err
    69  	}
    70  
    71  	if feature.Gates.Enabled(feature.RuntimeSDK) {
    72  		if err := r.callAfterHooks(ctx, s); err != nil {
    73  			return err
    74  		}
    75  	}
    76  
    77  	// Reconcile desired state of the InfrastructureCluster object.
    78  	if err := r.reconcileInfrastructureCluster(ctx, s); err != nil {
    79  		return err
    80  	}
    81  
    82  	// Reconcile desired state of the ControlPlane object.
    83  	if err := r.reconcileControlPlane(ctx, s); err != nil {
    84  		return err
    85  	}
    86  
    87  	// Reconcile desired state of the Cluster object.
    88  	if err := r.reconcileCluster(ctx, s); err != nil {
    89  		return err
    90  	}
    91  
    92  	// Reconcile desired state of the MachineDeployment objects.
    93  	if err := r.reconcileMachineDeployments(ctx, s); err != nil {
    94  		return err
    95  	}
    96  
    97  	// Reconcile desired state of the MachinePool objects and return.
    98  	return r.reconcileMachinePools(ctx, s)
    99  }
   100  
   101  // Reconcile the Cluster shim, a temporary object used as a means to collect objects/templates
   102  // that might be orphaned in case of errors during the remaining part of the reconcile process.
   103  func (r *Reconciler) reconcileClusterShim(ctx context.Context, s *scope.Scope) error {
   104  	shim := clusterShim(s.Current.Cluster)
   105  
   106  	// If we are going to create the InfrastructureCluster or the ControlPlane object, then
   107  	// add a temporary cluster-shim object and use it as an additional owner.
   108  	// This will ensure the objects will be garbage collected in case of errors in between
   109  	// creating InfrastructureCluster/ControlPlane objects and updating the Cluster with the
   110  	// references to above objects.
   111  	if s.Current.InfrastructureCluster == nil || s.Current.ControlPlane.Object == nil {
   112  		// Given that the cluster shim is a temporary object which is only modified
   113  		// by this controller, it is not necessary to use the SSA patch helper.
   114  		if err := r.Client.Create(ctx, shim); err != nil {
   115  			if !apierrors.IsAlreadyExists(err) {
   116  				return errors.Wrap(err, "failed to create the cluster shim object")
   117  			}
   118  			if err := r.Client.Get(ctx, client.ObjectKeyFromObject(shim), shim); err != nil {
   119  				return errors.Wrapf(err, "failed to read the cluster shim object")
   120  			}
   121  		}
   122  
   123  		// Set the TypeMeta back, given that it gets blanked out by Get.
   124  		shim.Kind = "Secret"
   125  		shim.APIVersion = corev1.SchemeGroupVersion.String()
   126  
   127  		// Add the shim as a temporary owner for the InfrastructureCluster.
   128  		ownerRefs := s.Desired.InfrastructureCluster.GetOwnerReferences()
   129  		ownerRefs = append(ownerRefs, *ownerReferenceTo(shim))
   130  		s.Desired.InfrastructureCluster.SetOwnerReferences(ownerRefs)
   131  
   132  		// Add the shim as a temporary owner for the ControlPlane.
   133  		ownerRefs = s.Desired.ControlPlane.Object.GetOwnerReferences()
   134  		ownerRefs = append(ownerRefs, *ownerReferenceTo(shim))
   135  		s.Desired.ControlPlane.Object.SetOwnerReferences(ownerRefs)
   136  	}
   137  
   138  	// If the InfrastructureCluster and the ControlPlane objects have already been created
   139  	// in a previous reconciliation, check if they have already been reconciled by the ClusterController
   140  	// by verifying that the ownerReference for the Cluster is present.
   141  	//
   142  	// When the Cluster and the shim object are both owners,
   143  	// it's safe for us to remove the shim and garbage collect any potential orphaned resource.
   144  	if s.Current.InfrastructureCluster != nil && s.Current.ControlPlane.Object != nil {
   145  		clusterOwnsAll := hasOwnerReferenceFrom(s.Current.InfrastructureCluster, s.Current.Cluster) &&
   146  			hasOwnerReferenceFrom(s.Current.ControlPlane.Object, s.Current.Cluster)
   147  		shimOwnsAtLeastOne := hasOwnerReferenceFrom(s.Current.InfrastructureCluster, shim) ||
   148  			hasOwnerReferenceFrom(s.Current.ControlPlane.Object, shim)
   149  
   150  		if clusterOwnsAll && shimOwnsAtLeastOne {
   151  			if err := r.Client.Delete(ctx, shim); err != nil {
   152  				if !apierrors.IsNotFound(err) {
   153  					return errors.Wrapf(err, "failed to delete the cluster shim object")
   154  				}
   155  			}
   156  		}
   157  	}
   158  	return nil
   159  }
   160  
   161  func clusterShim(c *clusterv1.Cluster) *corev1.Secret {
   162  	shim := &corev1.Secret{
   163  		TypeMeta: metav1.TypeMeta{
   164  			Kind:       "Secret",
   165  			APIVersion: corev1.SchemeGroupVersion.String(),
   166  		},
   167  		ObjectMeta: metav1.ObjectMeta{
   168  			Name:      fmt.Sprintf("%s-shim", c.Name),
   169  			Namespace: c.Namespace,
   170  			OwnerReferences: []metav1.OwnerReference{
   171  				*ownerReferenceTo(c),
   172  			},
   173  		},
   174  		Type: clusterv1.ClusterSecretType,
   175  	}
   176  	return shim
   177  }
   178  
   179  func hasOwnerReferenceFrom(obj, owner client.Object) bool {
   180  	for _, o := range obj.GetOwnerReferences() {
   181  		if o.Kind == owner.GetObjectKind().GroupVersionKind().Kind && o.Name == owner.GetName() {
   182  			return true
   183  		}
   184  	}
   185  	return false
   186  }
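
// Illustrative sketch (not part of the original file): hasOwnerReferenceFrom matches owner
// references by Kind and Name only, so a freshly constructed shim Secret (same name and
// namespace, no UID needed) is enough to detect whether an object read from the API server
// still carries the temporary shim ownership. The function name below is hypothetical.
func exampleShimStillOwns(infraCluster *unstructured.Unstructured, cluster *clusterv1.Cluster) bool {
	// clusterShim builds the Secret named "<cluster-name>-shim" in the Cluster's namespace.
	shim := clusterShim(cluster)
	// True while the InfrastructureCluster has not yet been adopted past the shim phase.
	return hasOwnerReferenceFrom(infraCluster, shim)
}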
   187  
   188  func getOwnerReferenceFrom(obj, owner client.Object) *metav1.OwnerReference {
   189  	for _, o := range obj.GetOwnerReferences() {
   190  		if o.Kind == owner.GetObjectKind().GroupVersionKind().Kind && o.Name == owner.GetName() {
   191  			return &o
   192  		}
   193  	}
   194  	return nil
   195  }
   196  
   197  func (r *Reconciler) callAfterHooks(ctx context.Context, s *scope.Scope) error {
   198  	if err := r.callAfterControlPlaneInitialized(ctx, s); err != nil {
   199  		return err
   200  	}
   201  
   202  	return r.callAfterClusterUpgrade(ctx, s)
   203  }
   204  
   205  func (r *Reconciler) callAfterControlPlaneInitialized(ctx context.Context, s *scope.Scope) error {
   206  	// If the cluster topology is being created, track the intent to call the AfterControlPlaneInitialized hook so that we can call it later.
   207  	if s.Current.Cluster.Spec.InfrastructureRef == nil && s.Current.Cluster.Spec.ControlPlaneRef == nil {
   208  		if err := hooks.MarkAsPending(ctx, r.Client, s.Current.Cluster, runtimehooksv1.AfterControlPlaneInitialized); err != nil {
   209  			return err
   210  		}
   211  	}
   212  
   213  	// Call the hook only if we are tracking the intent to do so. If it is not tracked, it means we don't need to call the
   214  	// hook because we already called it after the control plane was initialized.
   215  	if hooks.IsPending(runtimehooksv1.AfterControlPlaneInitialized, s.Current.Cluster) {
   216  		if isControlPlaneInitialized(s.Current.Cluster) {
   217  			// The control plane is initialized for the first time. Call all the registered extensions for the hook.
   218  			hookRequest := &runtimehooksv1.AfterControlPlaneInitializedRequest{
   219  				Cluster: *s.Current.Cluster,
   220  			}
   221  			hookResponse := &runtimehooksv1.AfterControlPlaneInitializedResponse{}
   222  			if err := r.RuntimeClient.CallAllExtensions(ctx, runtimehooksv1.AfterControlPlaneInitialized, s.Current.Cluster, hookRequest, hookResponse); err != nil {
   223  				return err
   224  			}
   225  			s.HookResponseTracker.Add(runtimehooksv1.AfterControlPlaneInitialized, hookResponse)
   226  			if err := hooks.MarkAsDone(ctx, r.Client, s.Current.Cluster, runtimehooksv1.AfterControlPlaneInitialized); err != nil {
   227  				return err
   228  			}
   229  		}
   230  	}
   231  
   232  	return nil
   233  }
   234  
   235  func isControlPlaneInitialized(cluster *clusterv1.Cluster) bool {
   236  	for _, condition := range cluster.GetConditions() {
   237  		if condition.Type == clusterv1.ControlPlaneInitializedCondition {
   238  			if condition.Status == corev1.ConditionTrue {
   239  				return true
   240  			}
   241  		}
   242  	}
   243  	return false
   244  }
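
// Illustrative sketch (not part of the original file): the AfterControlPlaneInitialized hook
// follows a mark-pending / call / mark-done lifecycle. The hypothetical helper below only
// restates the gating condition used in callAfterControlPlaneInitialized above.
func exampleAfterControlPlaneInitializedIsDue(cluster *clusterv1.Cluster) bool {
	// The hook is due when the intent is still tracked and the control plane has just
	// reached the initialized state.
	return hooks.IsPending(runtimehooksv1.AfterControlPlaneInitialized, cluster) &&
		isControlPlaneInitialized(cluster)
}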
   245  
   246  func (r *Reconciler) callAfterClusterUpgrade(ctx context.Context, s *scope.Scope) error {
   247  	// Call the hook only if we are tracking the intent to do so. If it is not tracked it means we don't need to call the
   248  	// hook because we didn't go through an upgrade or we already called the hook after the upgrade.
   249  	if hooks.IsPending(runtimehooksv1.AfterClusterUpgrade, s.Current.Cluster) {
   250  		// Call the registered extensions for the hook after the cluster is fully upgraded.
   251  		// A cluster is considered fully upgraded if:
   252  		// - Control plane is stable (not upgrading, not scaling, not about to upgrade)
   253  		// - MachineDeployments/MachinePools are not currently upgrading
   254  		// - MachineDeployments/MachinePools are not pending an upgrade
   255  		// - MachineDeployments/MachinePools are not pending create
   256  		if isControlPlaneStable(s) && // Control Plane stable checks
   257  			len(s.UpgradeTracker.MachineDeployments.UpgradingNames()) == 0 && // Machine deployments are not upgrading or not about to upgrade
   258  			!s.UpgradeTracker.MachineDeployments.IsAnyPendingCreate() && // No MachineDeployments are pending create
   259  			!s.UpgradeTracker.MachineDeployments.IsAnyPendingUpgrade() && // No MachineDeployments are pending an upgrade
   260  			!s.UpgradeTracker.MachineDeployments.DeferredUpgrade() && // No MachineDeployments have deferred an upgrade
   261  			len(s.UpgradeTracker.MachinePools.UpgradingNames()) == 0 && // Machine pools are not upgrading or not about to upgrade
   262  			!s.UpgradeTracker.MachinePools.IsAnyPendingCreate() && // No MachinePools are pending create
   263  			!s.UpgradeTracker.MachinePools.IsAnyPendingUpgrade() && // No MachinePools are pending an upgrade
   264  			!s.UpgradeTracker.MachinePools.DeferredUpgrade() { // No MachinePools have deferred an upgrade
   265  			// Everything is stable and the cluster can be considered fully upgraded.
   266  			hookRequest := &runtimehooksv1.AfterClusterUpgradeRequest{
   267  				Cluster:           *s.Current.Cluster,
   268  				KubernetesVersion: s.Current.Cluster.Spec.Topology.Version,
   269  			}
   270  			hookResponse := &runtimehooksv1.AfterClusterUpgradeResponse{}
   271  			if err := r.RuntimeClient.CallAllExtensions(ctx, runtimehooksv1.AfterClusterUpgrade, s.Current.Cluster, hookRequest, hookResponse); err != nil {
   272  				return err
   273  			}
   274  			s.HookResponseTracker.Add(runtimehooksv1.AfterClusterUpgrade, hookResponse)
   275  			// The hook is successfully called; we can remove this hook from the list of pending-hooks.
   276  			if err := hooks.MarkAsDone(ctx, r.Client, s.Current.Cluster, runtimehooksv1.AfterClusterUpgrade); err != nil {
   277  				return err
   278  			}
   279  		}
   280  	}
   281  
   282  	return nil
   283  }
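
// Illustrative sketch (not part of the original file): a hypothetical helper bundling the
// MachineDeployment stability checks used in callAfterClusterUpgrade above; the MachinePool
// checks follow the same pattern.
func exampleMachineDeploymentsStable(s *scope.Scope) bool {
	mds := s.UpgradeTracker.MachineDeployments
	return len(mds.UpgradingNames()) == 0 && // not upgrading or about to upgrade
		!mds.IsAnyPendingCreate() && // none pending create
		!mds.IsAnyPendingUpgrade() && // none pending an upgrade
		!mds.DeferredUpgrade() // none have deferred an upgrade
}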
   284  
   285  // reconcileInfrastructureCluster reconciles the desired state of the InfrastructureCluster object.
   286  func (r *Reconciler) reconcileInfrastructureCluster(ctx context.Context, s *scope.Scope) error {
   287  	ctx, _ = tlog.LoggerFrom(ctx).WithObject(s.Desired.InfrastructureCluster).Into(ctx)
   288  
   289  	ignorePaths, err := contract.InfrastructureCluster().IgnorePaths(s.Desired.InfrastructureCluster)
   290  	if err != nil {
   291  		return errors.Wrap(err, "failed to calculate ignore paths")
   292  	}
   293  
   294  	return r.reconcileReferencedObject(ctx, reconcileReferencedObjectInput{
   295  		cluster:     s.Current.Cluster,
   296  		current:     s.Current.InfrastructureCluster,
   297  		desired:     s.Desired.InfrastructureCluster,
   298  		ignorePaths: ignorePaths,
   299  	})
   300  }
   301  
   302  // reconcileControlPlane works to bring the current state of a managed topology in line with the desired state. This involves
   303  // updating the cluster where needed.
   304  func (r *Reconciler) reconcileControlPlane(ctx context.Context, s *scope.Scope) error {
   305  	// If the ControlPlane has defined a current or desired MachineHealthCheck, attempt to reconcile it.
   306  	// MHC changes are not Kubernetes version dependent, therefore proceed with MHC reconciliation
   307  	// even if the Control Plane is pending an upgrade.
   308  	if s.Desired.ControlPlane.MachineHealthCheck != nil || s.Current.ControlPlane.MachineHealthCheck != nil {
   309  		// Reconcile the current and desired state of the MachineHealthCheck.
   310  		if err := r.reconcileMachineHealthCheck(ctx, s.Current.ControlPlane.MachineHealthCheck, s.Desired.ControlPlane.MachineHealthCheck); err != nil {
   311  			return err
   312  		}
   313  	}
   314  
   315  	// Return early if the control plane is pending an upgrade.
   316  	// Do not reconcile the control plane yet to avoid updating the control plane while it is still pending a
   317  	// version upgrade. This will prevent the control plane from performing a double rollout.
   318  	if s.UpgradeTracker.ControlPlane.IsPendingUpgrade {
   319  		return nil
   320  	}
   321  	// If the clusterClass mandates that the controlPlane has infrastructureMachines, reconcile it.
   322  	if s.Blueprint.HasControlPlaneInfrastructureMachine() {
   323  		ctx, _ := tlog.LoggerFrom(ctx).WithObject(s.Desired.ControlPlane.InfrastructureMachineTemplate).Into(ctx)
   324  
   325  		cpInfraRef, err := contract.ControlPlane().MachineTemplate().InfrastructureRef().Get(s.Desired.ControlPlane.Object)
   326  		if err != nil {
   327  			return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: s.Desired.ControlPlane.InfrastructureMachineTemplate})
   328  		}
   329  
   330  		// Create or update the MachineInfrastructureTemplate of the control plane.
   331  		if err = r.reconcileReferencedTemplate(ctx, reconcileReferencedTemplateInput{
   332  			cluster:              s.Current.Cluster,
   333  			ref:                  cpInfraRef,
   334  			current:              s.Current.ControlPlane.InfrastructureMachineTemplate,
   335  			desired:              s.Desired.ControlPlane.InfrastructureMachineTemplate,
   336  			compatibilityChecker: check.ObjectsAreCompatible,
   337  			templateNamePrefix:   controlPlaneInfrastructureMachineTemplateNamePrefix(s.Current.Cluster.Name),
   338  		},
   339  		); err != nil {
   340  			return err
   341  		}
   342  
   343  		// The controlPlaneObject.Spec.machineTemplate.infrastructureRef has to be updated in the desired object
   344  		err = contract.ControlPlane().MachineTemplate().InfrastructureRef().Set(s.Desired.ControlPlane.Object, refToUnstructured(cpInfraRef))
   345  		if err != nil {
   346  			return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: s.Desired.ControlPlane.Object})
   347  		}
   348  	}
   349  
   350  	// Create or update the ControlPlaneObject for the ControlPlaneState.
   351  	ctx, _ = tlog.LoggerFrom(ctx).WithObject(s.Desired.ControlPlane.Object).Into(ctx)
   352  	if err := r.reconcileReferencedObject(ctx, reconcileReferencedObjectInput{
   353  		cluster:       s.Current.Cluster,
   354  		current:       s.Current.ControlPlane.Object,
   355  		desired:       s.Desired.ControlPlane.Object,
   356  		versionGetter: contract.ControlPlane().Version().Get,
   357  	}); err != nil {
   358  		return err
   359  	}
   360  
   361  	// If the controlPlane has infrastructureMachines and the InfrastructureMachineTemplate has changed on this reconcile,
   362  	// delete the old template.
   363  	// This is a best effort deletion only and may leak templates if an error occurs during reconciliation.
   364  	if s.Blueprint.HasControlPlaneInfrastructureMachine() && s.Current.ControlPlane.InfrastructureMachineTemplate != nil {
   365  		if s.Current.ControlPlane.InfrastructureMachineTemplate.GetName() != s.Desired.ControlPlane.InfrastructureMachineTemplate.GetName() {
   366  			if err := r.Client.Delete(ctx, s.Current.ControlPlane.InfrastructureMachineTemplate); err != nil {
   367  				return errors.Wrapf(err, "failed to delete old infrastructure machine template %s of control plane %s",
   368  					tlog.KObj{Obj: s.Current.ControlPlane.InfrastructureMachineTemplate},
   369  					tlog.KObj{Obj: s.Current.ControlPlane.Object},
   370  				)
   371  			}
   372  		}
   373  	}
   374  
   375  	return nil
   376  }
   377  
   378  // reconcileMachineHealthCheck creates, updates, deletes or leaves untouched a MachineHealthCheck depending on the difference between the
   379  // current state and the desired state.
   380  func (r *Reconciler) reconcileMachineHealthCheck(ctx context.Context, current, desired *clusterv1.MachineHealthCheck) error {
   381  	log := tlog.LoggerFrom(ctx)
   382  
   383  	// If a current MachineHealthCheck doesn't exist but there is a desired MachineHealthCheck, attempt to create it.
   384  	if current == nil && desired != nil {
   385  		log.Infof("Creating %s", tlog.KObj{Obj: desired})
   386  		helper, err := r.patchHelperFactory(ctx, nil, desired)
   387  		if err != nil {
   388  			return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: desired})
   389  		}
   390  		if err := helper.Patch(ctx); err != nil {
   391  			return errors.Wrapf(err, "failed to create %s", tlog.KObj{Obj: desired})
   392  		}
   393  		r.recorder.Eventf(desired, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: desired})
   394  		return nil
   395  	}
   396  
   397  	// If a current MachineHealthCheck exists but there is no desired MachineHealthCheck, attempt to delete it.
   398  	if current != nil && desired == nil {
   399  		log.Infof("Deleting %s", tlog.KObj{Obj: current})
   400  		if err := r.Client.Delete(ctx, current); err != nil {
   401  			// If the object to be deleted is not found, don't return an error.
   402  			if !apierrors.IsNotFound(err) {
   403  				return errors.Wrapf(err, "failed to delete %s", tlog.KObj{Obj: current})
   404  			}
   405  		}
   406  		r.recorder.Eventf(current, corev1.EventTypeNormal, deleteEventReason, "Deleted %q", tlog.KObj{Obj: current})
   407  		return nil
   408  	}
   409  
   410  	ctx, log = log.WithObject(current).Into(ctx)
   411  
   412  	// Check differences between current and desired MachineHealthChecks, and patch if required.
   413  	// NOTE: we want to be authoritative on the entire spec because the users are
   414  	// expected to change MHC fields from the ClusterClass only.
   415  	patchHelper, err := r.patchHelperFactory(ctx, current, desired)
   416  	if err != nil {
   417  		return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: current})
   418  	}
   419  	if !patchHelper.HasChanges() {
   420  		log.V(3).Infof("No changes for %s", tlog.KObj{Obj: current})
   421  		return nil
   422  	}
   423  
   424  	log.Infof("Patching %s", tlog.KObj{Obj: current})
   425  	if err := patchHelper.Patch(ctx); err != nil {
   426  		return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: current})
   427  	}
   428  	r.recorder.Eventf(current, corev1.EventTypeNormal, updateEventReason, "Updated %q", tlog.KObj{Obj: current})
   429  	return nil
   430  }
   431  
   432  // reconcileCluster reconciles the desired state of the Cluster object.
   433  // NOTE: this assumes reconcileInfrastructureCluster and reconcileControlPlane have already completed;
   434  // more specifically, after a Cluster is created it is assumed that the references to the InfrastructureCluster /
   435  // ControlPlane objects should never change (only the content of the objects can change).
   436  func (r *Reconciler) reconcileCluster(ctx context.Context, s *scope.Scope) error {
   437  	ctx, log := tlog.LoggerFrom(ctx).WithObject(s.Desired.Cluster).Into(ctx)
   438  
   439  	// Check differences between current and desired state, and eventually patch the current object.
   440  	patchHelper, err := r.patchHelperFactory(ctx, s.Current.Cluster, s.Desired.Cluster)
   441  	if err != nil {
   442  		return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: s.Current.Cluster})
   443  	}
   444  	if !patchHelper.HasChanges() {
   445  		log.V(3).Infof("No changes for %s", tlog.KObj{Obj: s.Current.Cluster})
   446  		return nil
   447  	}
   448  
   449  	log.Infof("Patching %s", tlog.KObj{Obj: s.Current.Cluster})
   450  	if err := patchHelper.Patch(ctx); err != nil {
   451  		return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: s.Current.Cluster})
   452  	}
   453  	r.recorder.Eventf(s.Current.Cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q", tlog.KObj{Obj: s.Current.Cluster})
   454  
   455  	// Wait until Cluster is updated in the cache.
   456  	// Note: We have to do this because otherwise using a cached client in the Reconcile func could
   457  	// return a stale state of the Cluster we just patched (because the cache might be stale).
   458  	// Note: It is good enough to check that the resource version changed. Other controllers might have updated the
   459  	// Cluster as well, but the combination of the patch call above without a conflict and a changed resource
   460  	// version here guarantees that we see the changes of our own update.
   461  	err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
   462  		key := client.ObjectKey{Namespace: s.Current.Cluster.GetNamespace(), Name: s.Current.Cluster.GetName()}
   463  		cachedCluster := &clusterv1.Cluster{}
   464  		if err := r.Client.Get(ctx, key, cachedCluster); err != nil {
   465  			return false, err
   466  		}
   467  		return s.Current.Cluster.GetResourceVersion() != cachedCluster.GetResourceVersion(), nil
   468  	})
   469  	if err != nil {
   470  		return errors.Wrapf(err, "failed waiting for Cluster %s to be updated in the cache after patch", tlog.KObj{Obj: s.Current.Cluster})
   471  	}
   472  	return nil
   473  }
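
// Illustrative sketch (not part of the original file): the wait-until-the-cache-caught-up
// pattern used above (and again for MachineDeployments/MachinePools below) can be expressed
// as a small helper; waitForResourceVersionChange is a hypothetical name.
func waitForResourceVersionChange(ctx context.Context, c client.Reader, obj client.Object, oldResourceVersion string) error {
	return wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
		// Re-read the object until its resourceVersion moves past the one we patched.
		if err := c.Get(ctx, client.ObjectKeyFromObject(obj), obj); err != nil {
			return false, err
		}
		return obj.GetResourceVersion() != oldResourceVersion, nil
	})
}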
   474  
   475  // reconcileMachineDeployments reconciles the desired state of the MachineDeployment objects.
   476  func (r *Reconciler) reconcileMachineDeployments(ctx context.Context, s *scope.Scope) error {
   477  	diff := calculateMachineDeploymentDiff(s.Current.MachineDeployments, s.Desired.MachineDeployments)
   478  
   479  	// Create MachineDeployments.
   480  	if len(diff.toCreate) > 0 {
   481  		// In the current state we only got the MD list via a cached call.
   482  		// As a consequence, in order to prevent the creation of duplicate MDs due to stale reads,
   483  		// we now use a live client to double-check here that the MachineDeployment
   484  		// to be created doesn't exist yet.
   485  		currentMDTopologyNames, err := r.getCurrentMachineDeployments(ctx, s)
   486  		if err != nil {
   487  			return err
   488  		}
   489  		for _, mdTopologyName := range diff.toCreate {
   490  			md := s.Desired.MachineDeployments[mdTopologyName]
   491  
   492  			// Skip the MD creation if the MD already exists.
   493  			if currentMDTopologyNames.Has(mdTopologyName) {
   494  				log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object)
   495  				log.V(3).Infof("Skipping creation of MachineDeployment %s because MachineDeployment for topology %s already exists (only considered creation because of stale cache)", tlog.KObj{Obj: md.Object}, mdTopologyName)
   496  				continue
   497  			}
   498  
   499  			if err := r.createMachineDeployment(ctx, s, md); err != nil {
   500  				return err
   501  			}
   502  		}
   503  	}
   504  
   505  	// Update MachineDeployments.
   506  	for _, mdTopologyName := range diff.toUpdate {
   507  		currentMD := s.Current.MachineDeployments[mdTopologyName]
   508  		desiredMD := s.Desired.MachineDeployments[mdTopologyName]
   509  		if err := r.updateMachineDeployment(ctx, s, mdTopologyName, currentMD, desiredMD); err != nil {
   510  			return err
   511  		}
   512  	}
   513  
   514  	// Delete MachineDeployments.
   515  	for _, mdTopologyName := range diff.toDelete {
   516  		md := s.Current.MachineDeployments[mdTopologyName]
   517  		if err := r.deleteMachineDeployment(ctx, s.Current.Cluster, md); err != nil {
   518  			return err
   519  		}
   520  	}
   521  	return nil
   522  }
   523  
   524  // getCurrentMachineDeployments gets the current list of MachineDeployments via the APIReader.
   525  func (r *Reconciler) getCurrentMachineDeployments(ctx context.Context, s *scope.Scope) (sets.Set[string], error) {
   526  	// TODO: We should consider using PartialObjectMetadataList here. Currently this doesn't work as our
   527  	// implementation for topology dryrun doesn't support PartialObjectMetadataList.
   528  	mdList := &clusterv1.MachineDeploymentList{}
   529  	err := r.APIReader.List(ctx, mdList,
   530  		client.MatchingLabels{
   531  			clusterv1.ClusterNameLabel:          s.Current.Cluster.Name,
   532  			clusterv1.ClusterTopologyOwnedLabel: "",
   533  		},
   534  		client.InNamespace(s.Current.Cluster.Namespace),
   535  	)
   536  	if err != nil {
   537  		return nil, errors.Wrap(err, "failed to read MachineDeployments for managed topology")
   538  	}
   539  
   540  	currentMDs := sets.Set[string]{}
   541  	for _, md := range mdList.Items {
   542  		mdTopologyName, ok := md.ObjectMeta.Labels[clusterv1.ClusterTopologyMachineDeploymentNameLabel]
   543  		if ok && mdTopologyName != "" {
   544  			currentMDs.Insert(mdTopologyName)
   545  		}
   546  	}
   547  	return currentMDs, nil
   548  }
   549  
   550  // createMachineDeployment creates a MachineDeployment and the corresponding Templates.
   551  func (r *Reconciler) createMachineDeployment(ctx context.Context, s *scope.Scope, md *scope.MachineDeploymentState) error {
   552  	// Do not create the MachineDeployment if it is marked as pending create.
   553  	// This will also block MHC creation because creating the MHC without the corresponding
   554  	// MachineDeployment is unnecessary.
   555  	mdTopologyName, ok := md.Object.Labels[clusterv1.ClusterTopologyMachineDeploymentNameLabel]
   556  	if !ok || mdTopologyName == "" {
   557  		// Note: This is only an additional safety check and should not happen. The label will always be added when computing
   558  		// the desired MachineDeployment.
   559  		return errors.Errorf("new MachineDeployment is missing the %q label", clusterv1.ClusterTopologyMachineDeploymentNameLabel)
   560  	}
   561  	// Return early if the MachineDeployment is pending create.
   562  	if s.UpgradeTracker.MachineDeployments.IsPendingCreate(mdTopologyName) {
   563  		return nil
   564  	}
   565  
   566  	log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object)
   567  	cluster := s.Current.Cluster
   568  	infraCtx, _ := log.WithObject(md.InfrastructureMachineTemplate).Into(ctx)
   569  	if err := r.reconcileReferencedTemplate(infraCtx, reconcileReferencedTemplateInput{
   570  		cluster: cluster,
   571  		desired: md.InfrastructureMachineTemplate,
   572  	}); err != nil {
   573  		return errors.Wrapf(err, "failed to create %s", md.Object.Kind)
   574  	}
   575  
   576  	bootstrapCtx, _ := log.WithObject(md.BootstrapTemplate).Into(ctx)
   577  	if err := r.reconcileReferencedTemplate(bootstrapCtx, reconcileReferencedTemplateInput{
   578  		cluster: cluster,
   579  		desired: md.BootstrapTemplate,
   580  	}); err != nil {
   581  		return errors.Wrapf(err, "failed to create %s", md.Object.Kind)
   582  	}
   583  
   584  	log = log.WithObject(md.Object)
   585  	log.Infof("Creating %s", tlog.KObj{Obj: md.Object})
   586  	helper, err := r.patchHelperFactory(ctx, nil, md.Object)
   587  	if err != nil {
   588  		return createErrorWithoutObjectName(ctx, err, md.Object)
   589  	}
   590  	if err := helper.Patch(ctx); err != nil {
   591  		return createErrorWithoutObjectName(ctx, err, md.Object)
   592  	}
   593  	r.recorder.Eventf(cluster, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: md.Object})
   594  
   595  	// Wait until MachineDeployment is visible in the cache.
   596  	// Note: We have to do this because otherwise using a cached client in current state could
   597  	// miss a newly created MachineDeployment (because the cache might be stale).
   598  	err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
   599  		key := client.ObjectKey{Namespace: md.Object.Namespace, Name: md.Object.Name}
   600  		if err := r.Client.Get(ctx, key, &clusterv1.MachineDeployment{}); err != nil {
   601  			if apierrors.IsNotFound(err) {
   602  				return false, nil
   603  			}
   604  			return false, err
   605  		}
   606  		return true, nil
   607  	})
   608  	if err != nil {
   609  		return errors.Wrapf(err, "failed waiting for MachineDeployment %s to be visible in the cache after create", md.Object.Kind)
   610  	}
   611  
   612  	// If the MachineDeployment has defined a MachineHealthCheck reconcile it.
   613  	if md.MachineHealthCheck != nil {
   614  		if err := r.reconcileMachineHealthCheck(ctx, nil, md.MachineHealthCheck); err != nil {
   615  			return err
   616  		}
   617  	}
   618  	return nil
   619  }
   620  
   621  // updateMachineDeployment updates a MachineDeployment. Also rotates the corresponding Templates if necessary.
   622  func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope, mdTopologyName string, currentMD, desiredMD *scope.MachineDeploymentState) error {
   623  	log := tlog.LoggerFrom(ctx).WithMachineDeployment(desiredMD.Object)
   624  
   625  	// Patch MachineHealthCheck for the MachineDeployment.
   626  	// MHC changes are not Kubernetes version dependent, therefore proceed with MHC reconciliation
   627  	// even if the MachineDeployment is pending an upgrade.
   628  	if desiredMD.MachineHealthCheck != nil || currentMD.MachineHealthCheck != nil {
   629  		if err := r.reconcileMachineHealthCheck(ctx, currentMD.MachineHealthCheck, desiredMD.MachineHealthCheck); err != nil {
   630  			return err
   631  		}
   632  	}
   633  
   634  	// Return early if the MachineDeployment is pending an upgrade.
   635  	// Do not reconcile the MachineDeployment yet to avoid updating the MachineDeployment while it is still pending a
   636  	// version upgrade. This will prevent the MachineDeployment from performing a double rollout.
   637  	if s.UpgradeTracker.MachineDeployments.IsPendingUpgrade(currentMD.Object.Name) {
   638  		return nil
   639  	}
   640  
   641  	cluster := s.Current.Cluster
   642  	infraCtx, _ := log.WithObject(desiredMD.InfrastructureMachineTemplate).Into(ctx)
   643  	if err := r.reconcileReferencedTemplate(infraCtx, reconcileReferencedTemplateInput{
   644  		cluster:              cluster,
   645  		ref:                  &desiredMD.Object.Spec.Template.Spec.InfrastructureRef,
   646  		current:              currentMD.InfrastructureMachineTemplate,
   647  		desired:              desiredMD.InfrastructureMachineTemplate,
   648  		templateNamePrefix:   infrastructureMachineTemplateNamePrefix(cluster.Name, mdTopologyName),
   649  		compatibilityChecker: check.ObjectsAreCompatible,
   650  	}); err != nil {
   651  		return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: currentMD.Object})
   652  	}
   653  
   654  	bootstrapCtx, _ := log.WithObject(desiredMD.BootstrapTemplate).Into(ctx)
   655  	if err := r.reconcileReferencedTemplate(bootstrapCtx, reconcileReferencedTemplateInput{
   656  		cluster:              cluster,
   657  		ref:                  desiredMD.Object.Spec.Template.Spec.Bootstrap.ConfigRef,
   658  		current:              currentMD.BootstrapTemplate,
   659  		desired:              desiredMD.BootstrapTemplate,
   660  		templateNamePrefix:   bootstrapTemplateNamePrefix(cluster.Name, mdTopologyName),
   661  		compatibilityChecker: check.ObjectsAreInTheSameNamespace,
   662  	}); err != nil {
   663  		return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: currentMD.Object})
   664  	}
   665  
   666  	// Check differences between current and desired MachineDeployment, and eventually patch the current object.
   667  	log = log.WithObject(desiredMD.Object)
   668  	patchHelper, err := r.patchHelperFactory(ctx, currentMD.Object, desiredMD.Object)
   669  	if err != nil {
   670  		return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: currentMD.Object})
   671  	}
   672  	if !patchHelper.HasChanges() {
   673  		log.V(3).Infof("No changes for %s", tlog.KObj{Obj: currentMD.Object})
   674  		return nil
   675  	}
   676  
   677  	log.Infof("Patching %s", tlog.KObj{Obj: currentMD.Object})
   678  	if err := patchHelper.Patch(ctx); err != nil {
   679  		return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: currentMD.Object})
   680  	}
   681  	r.recorder.Eventf(cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q%s", tlog.KObj{Obj: currentMD.Object}, logMachineDeploymentVersionChange(currentMD.Object, desiredMD.Object))
   682  
   683  	// Wait until MachineDeployment is updated in the cache.
   684  	// Note: We have to do this because otherwise using a cached client in current state could
   685  	// return a stale state of a MachineDeployment we just patched (because the cache might be stale).
   686  	// Note: It is good enough to check that the resource version changed. Other controllers might have updated the
   687  	// MachineDeployment as well, but the combination of the patch call above without a conflict and a changed resource
   688  	// version here guarantees that we see the changes of our own update.
   689  	err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
   690  		key := client.ObjectKey{Namespace: currentMD.Object.GetNamespace(), Name: currentMD.Object.GetName()}
   691  		cachedMD := &clusterv1.MachineDeployment{}
   692  		if err := r.Client.Get(ctx, key, cachedMD); err != nil {
   693  			return false, err
   694  		}
   695  		return currentMD.Object.GetResourceVersion() != cachedMD.GetResourceVersion(), nil
   696  	})
   697  	if err != nil {
   698  		return errors.Wrapf(err, "failed waiting for MachineDeployment %s to be updated in the cache after patch", tlog.KObj{Obj: currentMD.Object})
   699  	}
   700  
   701  	// We want to call both cleanup functions even if one of them fails to clean up as much as possible.
   702  	return nil
   703  }
   704  
   705  func logMachineDeploymentVersionChange(current, desired *clusterv1.MachineDeployment) string {
   706  	if current.Spec.Template.Spec.Version == nil || desired.Spec.Template.Spec.Version == nil {
   707  		return ""
   708  	}
   709  
   710  	if *current.Spec.Template.Spec.Version != *desired.Spec.Template.Spec.Version {
   711  		return fmt.Sprintf(" with version change from %s to %s", *current.Spec.Template.Spec.Version, *desired.Spec.Template.Spec.Version)
   712  	}
   713  	return ""
   714  }
   715  
   716  // deleteMachineDeployment deletes a MachineDeployment.
   717  func (r *Reconciler) deleteMachineDeployment(ctx context.Context, cluster *clusterv1.Cluster, md *scope.MachineDeploymentState) error {
   718  	log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object).WithObject(md.Object)
   719  
   720  	// delete MachineHealthCheck for the MachineDeployment.
   721  	if md.MachineHealthCheck != nil {
   722  		if err := r.reconcileMachineHealthCheck(ctx, md.MachineHealthCheck, nil); err != nil {
   723  			return err
   724  		}
   725  	}
   726  	log.Infof("Deleting %s", tlog.KObj{Obj: md.Object})
   727  	if err := r.Client.Delete(ctx, md.Object); err != nil && !apierrors.IsNotFound(err) {
   728  		return errors.Wrapf(err, "failed to delete %s", tlog.KObj{Obj: md.Object})
   729  	}
   730  	r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted %q", tlog.KObj{Obj: md.Object})
   731  	return nil
   732  }
   733  
   734  // reconcileMachinePools reconciles the desired state of the MachinePool objects.
   735  func (r *Reconciler) reconcileMachinePools(ctx context.Context, s *scope.Scope) error {
   736  	diff := calculateMachinePoolDiff(s.Current.MachinePools, s.Desired.MachinePools)
   737  
   738  	// Create MachinePools.
   739  	if len(diff.toCreate) > 0 {
   740  		// In the current state we only got the MP list via a cached call.
   741  		// As a consequence, in order to prevent the creation of duplicate MPs due to stale reads,
   742  		// we now use a live client to double-check here that the MachinePool
   743  		// to be created doesn't exist yet.
   744  		currentMPTopologyNames, err := r.getCurrentMachinePools(ctx, s)
   745  		if err != nil {
   746  			return err
   747  		}
   748  		for _, mpTopologyName := range diff.toCreate {
   749  			mp := s.Desired.MachinePools[mpTopologyName]
   750  
   751  			// Skip the MP creation if the MP already exists.
   752  			if currentMPTopologyNames.Has(mpTopologyName) {
   753  				log := tlog.LoggerFrom(ctx).WithMachinePool(mp.Object)
   754  				log.V(3).Infof("Skipping creation of MachinePool %s because MachinePool for topology %s already exists (only considered creation because of stale cache)", tlog.KObj{Obj: mp.Object}, mpTopologyName)
   755  				continue
   756  			}
   757  
   758  			if err := r.createMachinePool(ctx, s, mp); err != nil {
   759  				return err
   760  			}
   761  		}
   762  	}
   763  
   764  	// Update MachinePools.
   765  	for _, mpTopologyName := range diff.toUpdate {
   766  		currentMP := s.Current.MachinePools[mpTopologyName]
   767  		desiredMP := s.Desired.MachinePools[mpTopologyName]
   768  		if err := r.updateMachinePool(ctx, s, currentMP, desiredMP); err != nil {
   769  			return err
   770  		}
   771  	}
   772  
   773  	// Delete MachinePools.
   774  	for _, mpTopologyName := range diff.toDelete {
   775  		mp := s.Current.MachinePools[mpTopologyName]
   776  		if err := r.deleteMachinePool(ctx, s.Current.Cluster, mp); err != nil {
   777  			return err
   778  		}
   779  	}
   780  
   781  	return nil
   782  }
   783  
   784  // getCurrentMachinePools gets the current list of MachinePools via the APIReader.
   785  func (r *Reconciler) getCurrentMachinePools(ctx context.Context, s *scope.Scope) (sets.Set[string], error) {
   786  	// TODO: We should consider using PartialObjectMetadataList here. Currently this doesn't work as our
   787  	// implementation for topology dryrun doesn't support PartialObjectMetadataList.
   788  	mpList := &expv1.MachinePoolList{}
   789  	err := r.APIReader.List(ctx, mpList,
   790  		client.MatchingLabels{
   791  			clusterv1.ClusterNameLabel:          s.Current.Cluster.Name,
   792  			clusterv1.ClusterTopologyOwnedLabel: "",
   793  		},
   794  		client.InNamespace(s.Current.Cluster.Namespace),
   795  	)
   796  	if err != nil {
   797  		return nil, errors.Wrap(err, "failed to read MachinePools for managed topology")
   798  	}
   799  
   800  	currentMPs := sets.Set[string]{}
   801  	for _, mp := range mpList.Items {
   802  		mpTopologyName, ok := mp.ObjectMeta.Labels[clusterv1.ClusterTopologyMachinePoolNameLabel]
   803  		if ok && mpTopologyName != "" {
   804  			currentMPs.Insert(mpTopologyName)
   805  		}
   806  	}
   807  	return currentMPs, nil
   808  }
   809  
   810  // createMachinePool creates a MachinePool and the corresponding templates.
   811  func (r *Reconciler) createMachinePool(ctx context.Context, s *scope.Scope, mp *scope.MachinePoolState) error {
   812  	// Do not create the MachinePool if it is marked as pending create.
   813  	mpTopologyName, ok := mp.Object.Labels[clusterv1.ClusterTopologyMachinePoolNameLabel]
   814  	if !ok || mpTopologyName == "" {
   815  		// Note: This is only an additional safety check and should not happen. The label will always be added when computing
   816  		// the desired MachinePool.
   817  		return errors.Errorf("new MachinePool is missing the %q label", clusterv1.ClusterTopologyMachinePoolNameLabel)
   818  	}
   819  	// Return early if the MachinePool is pending create.
   820  	if s.UpgradeTracker.MachinePools.IsPendingCreate(mpTopologyName) {
   821  		return nil
   822  	}
   823  
   824  	log := tlog.LoggerFrom(ctx).WithMachinePool(mp.Object)
   825  	cluster := s.Current.Cluster
   826  	infraCtx, _ := log.WithObject(mp.InfrastructureMachinePoolObject).Into(ctx)
   827  	if err := r.reconcileReferencedObject(infraCtx, reconcileReferencedObjectInput{
   828  		cluster: cluster,
   829  		desired: mp.InfrastructureMachinePoolObject,
   830  	}); err != nil {
   831  		return errors.Wrapf(err, "failed to create %s", mp.Object.Kind)
   832  	}
   833  
   834  	bootstrapCtx, _ := log.WithObject(mp.BootstrapObject).Into(ctx)
   835  	if err := r.reconcileReferencedObject(bootstrapCtx, reconcileReferencedObjectInput{
   836  		cluster: cluster,
   837  		desired: mp.BootstrapObject,
   838  	}); err != nil {
   839  		return errors.Wrapf(err, "failed to create %s", mp.Object.Kind)
   840  	}
   841  
   842  	log = log.WithObject(mp.Object)
   843  	log.Infof("Creating %s", tlog.KObj{Obj: mp.Object})
   844  	helper, err := r.patchHelperFactory(ctx, nil, mp.Object)
   845  	if err != nil {
   846  		return createErrorWithoutObjectName(ctx, err, mp.Object)
   847  	}
   848  	if err := helper.Patch(ctx); err != nil {
   849  		return createErrorWithoutObjectName(ctx, err, mp.Object)
   850  	}
   851  	r.recorder.Eventf(cluster, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: mp.Object})
   852  
   853  	// Wait until MachinePool is visible in the cache.
   854  	// Note: We have to do this because otherwise using a cached client in current state could
   855  	// miss a newly created MachinePool (because the cache might be stale).
   856  	err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
   857  		key := client.ObjectKey{Namespace: mp.Object.Namespace, Name: mp.Object.Name}
   858  		if err := r.Client.Get(ctx, key, &expv1.MachinePool{}); err != nil {
   859  			if apierrors.IsNotFound(err) {
   860  				return false, nil
   861  			}
   862  			return false, err
   863  		}
   864  		return true, nil
   865  	})
   866  	if err != nil {
   867  		return errors.Wrapf(err, "failed waiting for MachinePool %s to be visible in the cache after create", mp.Object.Kind)
   868  	}
   869  
   870  	return nil
   871  }
   872  
   873  // updateMachinePool updates a MachinePool. Also updates the corresponding objects if necessary.
   874  func (r *Reconciler) updateMachinePool(ctx context.Context, s *scope.Scope, currentMP, desiredMP *scope.MachinePoolState) error {
   875  	log := tlog.LoggerFrom(ctx).WithMachinePool(desiredMP.Object)
   876  
   877  	// Return early if the MachinePool is pending an upgrade.
   878  	// Do not reconcile the MachinePool yet to avoid updating the MachinePool while it is still pending a
   879  	// version upgrade. This will prevent the MachinePool from performing a double rollout.
   880  	if s.UpgradeTracker.MachinePools.IsPendingUpgrade(currentMP.Object.Name) {
   881  		return nil
   882  	}
   883  
   884  	cluster := s.Current.Cluster
   885  	infraCtx, _ := log.WithObject(desiredMP.InfrastructureMachinePoolObject).Into(ctx)
   886  	if err := r.reconcileReferencedObject(infraCtx, reconcileReferencedObjectInput{
   887  		cluster: cluster,
   888  		current: currentMP.InfrastructureMachinePoolObject,
   889  		desired: desiredMP.InfrastructureMachinePoolObject,
   890  	}); err != nil {
   891  		return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: currentMP.Object})
   892  	}
   893  
   894  	bootstrapCtx, _ := log.WithObject(desiredMP.BootstrapObject).Into(ctx)
   895  	if err := r.reconcileReferencedObject(bootstrapCtx, reconcileReferencedObjectInput{
   896  		cluster: cluster,
   897  		current: currentMP.BootstrapObject,
   898  		desired: desiredMP.BootstrapObject,
   899  	}); err != nil {
   900  		return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: currentMP.Object})
   901  	}
   902  
   903  	// Check differences between current and desired MachinePool, and eventually patch the current object.
   904  	log = log.WithObject(desiredMP.Object)
   905  	patchHelper, err := r.patchHelperFactory(ctx, currentMP.Object, desiredMP.Object)
   906  	if err != nil {
   907  		return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: currentMP.Object})
   908  	}
   909  	if !patchHelper.HasChanges() {
   910  		log.V(3).Infof("No changes for %s", tlog.KObj{Obj: currentMP.Object})
   911  		return nil
   912  	}
   913  
   914  	log.Infof("Patching %s", tlog.KObj{Obj: currentMP.Object})
   915  	if err := patchHelper.Patch(ctx); err != nil {
   916  		return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: currentMP.Object})
   917  	}
   918  	r.recorder.Eventf(cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q%s", tlog.KObj{Obj: currentMP.Object}, logMachinePoolVersionChange(currentMP.Object, desiredMP.Object))
   919  
   920  	// Wait until MachinePool is updated in the cache.
   921  	// Note: We have to do this because otherwise using a cached client in current state could
   922  	// return a stale state of a MachinePool we just patched (because the cache might be stale).
   923  	// Note: It is good enough to check that the resource version changed. Other controllers might have updated the
   924  	// MachinePool as well, but the combination of the patch call above without a conflict and a changed resource
   925  	// version here guarantees that we see the changes of our own update.
   926  	err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
   927  		key := client.ObjectKey{Namespace: currentMP.Object.GetNamespace(), Name: currentMP.Object.GetName()}
   928  		cachedMP := &expv1.MachinePool{}
   929  		if err := r.Client.Get(ctx, key, cachedMP); err != nil {
   930  			return false, err
   931  		}
   932  		return currentMP.Object.GetResourceVersion() != cachedMP.GetResourceVersion(), nil
   933  	})
   934  	if err != nil {
   935  		return errors.Wrapf(err, "failed waiting for MachinePool %s to be updated in the cache after patch", tlog.KObj{Obj: currentMP.Object})
   936  	}
   937  
   938  	// We want to call both cleanup functions even if one of them fails to clean up as much as possible.
   939  	return nil
   940  }
   941  
   942  func logMachinePoolVersionChange(current, desired *expv1.MachinePool) string {
   943  	if current.Spec.Template.Spec.Version == nil || desired.Spec.Template.Spec.Version == nil {
   944  		return ""
   945  	}
   946  
   947  	if *current.Spec.Template.Spec.Version != *desired.Spec.Template.Spec.Version {
   948  		return fmt.Sprintf(" with version change from %s to %s", *current.Spec.Template.Spec.Version, *desired.Spec.Template.Spec.Version)
   949  	}
   950  	return ""
   951  }
   952  
   953  // deleteMachinePool deletes a MachinePool.
   954  func (r *Reconciler) deleteMachinePool(ctx context.Context, cluster *clusterv1.Cluster, mp *scope.MachinePoolState) error {
   955  	log := tlog.LoggerFrom(ctx).WithMachinePool(mp.Object).WithObject(mp.Object)
   956  	log.Infof("Deleting %s", tlog.KObj{Obj: mp.Object})
   957  	if err := r.Client.Delete(ctx, mp.Object); err != nil && !apierrors.IsNotFound(err) {
   958  		return errors.Wrapf(err, "failed to delete %s", tlog.KObj{Obj: mp.Object})
   959  	}
   960  	r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted %q", tlog.KObj{Obj: mp.Object})
   961  	return nil
   962  }
   963  
   964  type machineDiff struct {
   965  	toCreate, toUpdate, toDelete []string
   966  }
   967  
   968  // calculateMachineDeploymentDiff compares two maps of MachineDeploymentState and calculates which
   969  // MachineDeployments should be created, updated or deleted.
   970  func calculateMachineDeploymentDiff(current, desired map[string]*scope.MachineDeploymentState) machineDiff {
   971  	var diff machineDiff
   972  
   973  	for md := range desired {
   974  		if _, ok := current[md]; ok {
   975  			diff.toUpdate = append(diff.toUpdate, md)
   976  		} else {
   977  			diff.toCreate = append(diff.toCreate, md)
   978  		}
   979  	}
   980  
   981  	for md := range current {
   982  		if _, ok := desired[md]; !ok {
   983  			diff.toDelete = append(diff.toDelete, md)
   984  		}
   985  	}
   986  
   987  	return diff
   988  }
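
// Illustrative sketch (not part of the original file): the diff is keyed by the MachineDeployment
// topology name; the names below are hypothetical.
func exampleMachineDeploymentDiff() machineDiff {
	current := map[string]*scope.MachineDeploymentState{"md-a": {}, "md-b": {}}
	desired := map[string]*scope.MachineDeploymentState{"md-b": {}, "md-c": {}}
	// Result: toCreate = ["md-c"], toUpdate = ["md-b"], toDelete = ["md-a"] (order is not guaranteed).
	return calculateMachineDeploymentDiff(current, desired)
}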
   989  
   990  // calculateMachinePoolDiff compares two maps of MachinePoolState and calculates which
   991  // MachinePools should be created, updated or deleted.
   992  func calculateMachinePoolDiff(current, desired map[string]*scope.MachinePoolState) machineDiff {
   993  	var diff machineDiff
   994  
   995  	for mp := range desired {
   996  		if _, ok := current[mp]; ok {
   997  			diff.toUpdate = append(diff.toUpdate, mp)
   998  		} else {
   999  			diff.toCreate = append(diff.toCreate, mp)
  1000  		}
  1001  	}
  1002  
  1003  	for mp := range current {
  1004  		if _, ok := desired[mp]; !ok {
  1005  			diff.toDelete = append(diff.toDelete, mp)
  1006  		}
  1007  	}
  1008  
  1009  	return diff
  1010  }
  1011  
  1012  type unstructuredVersionGetter func(obj *unstructured.Unstructured) (*string, error)
  1013  
  1014  type reconcileReferencedObjectInput struct {
  1015  	cluster       *clusterv1.Cluster
  1016  	current       *unstructured.Unstructured
  1017  	desired       *unstructured.Unstructured
  1018  	versionGetter unstructuredVersionGetter
  1019  	ignorePaths   []contract.Path
  1020  }
  1021  
  1022  // reconcileReferencedObject reconciles the desired state of the referenced object.
  1023  // NOTE: After a referenced object is created it is assumed that the reference should
  1024  // never change (only the content of the object can change). Thus, we are checking for strict compatibility.
  1025  func (r *Reconciler) reconcileReferencedObject(ctx context.Context, in reconcileReferencedObjectInput) error {
  1026  	log := tlog.LoggerFrom(ctx)
  1027  
  1028  	// If there is no current object, create it.
  1029  	if in.current == nil {
  1030  		log.Infof("Creating %s", tlog.KObj{Obj: in.desired})
  1031  		helper, err := r.patchHelperFactory(ctx, nil, in.desired, structuredmerge.IgnorePaths(in.ignorePaths))
  1032  		if err != nil {
  1033  			return errors.Wrap(createErrorWithoutObjectName(ctx, err, in.desired), "failed to create patch helper")
  1034  		}
  1035  		if err := helper.Patch(ctx); err != nil {
  1036  			return createErrorWithoutObjectName(ctx, err, in.desired)
  1037  		}
  1038  		r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: in.desired})
  1039  		return nil
  1040  	}
  1041  
  1042  	// Check if the current and desired referenced object are compatible.
  1043  	if allErrs := check.ObjectsAreStrictlyCompatible(in.current, in.desired); len(allErrs) > 0 {
  1044  		return allErrs.ToAggregate()
  1045  	}
  1046  
  1047  	// Check differences between current and desired state, and eventually patch the current object.
  1048  	patchHelper, err := r.patchHelperFactory(ctx, in.current, in.desired, structuredmerge.IgnorePaths(in.ignorePaths))
  1049  	if err != nil {
  1050  		return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: in.current})
  1051  	}
  1052  	if !patchHelper.HasChanges() {
  1053  		log.V(3).Infof("No changes for %s", tlog.KObj{Obj: in.desired})
  1054  		return nil
  1055  	}
  1056  
  1057  	log.Infof("Patching %s", tlog.KObj{Obj: in.desired})
  1058  	if err := patchHelper.Patch(ctx); err != nil {
  1059  		return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: in.current})
  1060  	}
  1061  	r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q%s", tlog.KObj{Obj: in.desired}, logUnstructuredVersionChange(in.current, in.desired, in.versionGetter))
  1062  	return nil
  1063  }
  1064  
  1065  func logUnstructuredVersionChange(current, desired *unstructured.Unstructured, versionGetter unstructuredVersionGetter) string {
  1066  	if versionGetter == nil {
  1067  		return ""
  1068  	}
  1069  
  1070  	currentVersion, err := versionGetter(current)
  1071  	if err != nil || currentVersion == nil {
  1072  		return ""
  1073  	}
  1074  	desiredVersion, err := versionGetter(desired)
  1075  	if err != nil || desiredVersion == nil {
  1076  		return ""
  1077  	}
  1078  
  1079  	if *currentVersion != *desiredVersion {
  1080  		return fmt.Sprintf(" with version change from %s to %s", *currentVersion, *desiredVersion)
  1081  	}
  1082  	return ""
  1083  }
  1084  
  1085  type reconcileReferencedTemplateInput struct {
  1086  	cluster              *clusterv1.Cluster
  1087  	ref                  *corev1.ObjectReference
  1088  	current              *unstructured.Unstructured
  1089  	desired              *unstructured.Unstructured
  1090  	templateNamePrefix   string
  1091  	compatibilityChecker func(current, desired client.Object) field.ErrorList
  1092  }
  1093  
  1094  // reconcileReferencedTemplate reconciles the desired state of a referenced Template.
  1095  // NOTE: According to Cluster API operational practices, when a referenced Template changes, a template rotation is required:
  1096  // 1. create a new Template
  1097  // 2. update the reference
  1098  // 3. delete the old Template
  1099  // This function specifically takes care of the first step and updates the reference locally. So the remaining steps
  1100  // can be executed afterwards.
  1101  // NOTE: This func has a side effect in case of template rotation, changing both the desired object and the object reference.
  1102  func (r *Reconciler) reconcileReferencedTemplate(ctx context.Context, in reconcileReferencedTemplateInput) error {
  1103  	log := tlog.LoggerFrom(ctx)
  1104  
  1105  	// If there is no current object, create the desired object.
  1106  	if in.current == nil {
  1107  		log.Infof("Creating %s", tlog.KObj{Obj: in.desired})
  1108  		helper, err := r.patchHelperFactory(ctx, nil, in.desired)
  1109  		if err != nil {
  1110  			return errors.Wrap(createErrorWithoutObjectName(ctx, err, in.desired), "failed to create patch helper")
  1111  		}
  1112  		if err := helper.Patch(ctx); err != nil {
  1113  			return createErrorWithoutObjectName(ctx, err, in.desired)
  1114  		}
  1115  		r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: in.desired})
  1116  		return nil
  1117  	}
  1118  
  1119  	if in.ref == nil {
  1120  		return errors.Errorf("failed to rotate %s: ref should not be nil", in.desired.GroupVersionKind())
  1121  	}
  1122  
  1123  	// Check if the current and desired referenced object are compatible.
  1124  	if allErrs := in.compatibilityChecker(in.current, in.desired); len(allErrs) > 0 {
  1125  		return allErrs.ToAggregate()
  1126  	}
  1127  
  1128  	// Check differences between current and desired objects, and if there are changes eventually start the template rotation.
  1129  	patchHelper, err := r.patchHelperFactory(ctx, in.current, in.desired)
  1130  	if err != nil {
  1131  		return errors.Wrapf(err, "failed to create patch helper for %s", tlog.KObj{Obj: in.current})
  1132  	}
  1133  
  1134  	// Return if no changes are detected.
  1135  	if !patchHelper.HasChanges() {
  1136  		log.V(3).Infof("No changes for %s", tlog.KObj{Obj: in.desired})
  1137  		return nil
  1138  	}
  1139  
  1140  	// If there are no changes in the spec, and thus only changes in metadata, instead of doing a full template
  1141  	// rotation we patch the object in place. This avoids recreating machines.
  1142  	if !patchHelper.HasSpecChanges() {
  1143  		log.Infof("Patching %s", tlog.KObj{Obj: in.desired})
  1144  		if err := patchHelper.Patch(ctx); err != nil {
  1145  			return errors.Wrapf(err, "failed to patch %s", tlog.KObj{Obj: in.desired})
  1146  		}
  1147  		r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q (metadata changes)", tlog.KObj{Obj: in.desired})
  1148  		return nil
  1149  	}
  1150  
  1151  	// Create the new template.
  1152  
  1153  	// NOTE: it is required to assign a new name, because when computing the desired state the object name is enforced to be equal to the current one.
  1154  	// TODO: find a way to make side effect more explicit
  1155  	newName := names.SimpleNameGenerator.GenerateName(in.templateNamePrefix)
  1156  	in.desired.SetName(newName)
  1157  
  1158  	log.Infof("Rotating %s, new name %s", tlog.KObj{Obj: in.current}, newName)
  1159  	log.Infof("Creating %s", tlog.KObj{Obj: in.desired})
  1160  	helper, err := r.patchHelperFactory(ctx, nil, in.desired)
  1161  	if err != nil {
  1162  		return errors.Wrap(createErrorWithoutObjectName(ctx, err, in.desired), "failed to create patch helper")
  1163  	}
  1164  	if err := helper.Patch(ctx); err != nil {
  1165  		return createErrorWithoutObjectName(ctx, err, in.desired)
  1166  	}
  1167  	r.recorder.Eventf(in.cluster, corev1.EventTypeNormal, createEventReason, "Created %q as a replacement for %q (template rotation)", tlog.KObj{Obj: in.desired}, in.ref.Name)
  1168  
  1169  	// Update the reference with the new name.
  1170  	// NOTE: Updating the object hosting reference to the template is executed outside this func.
  1171  	// TODO: find a way to make side effect more explicit
  1172  	in.ref.Name = newName
  1173  
  1174  	return nil
  1175  }
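
// Illustrative sketch (not part of the original file): on rotation a new template name is generated
// from the prefix and both the desired object and the in-memory reference are renamed; the prefix
// and variable names below are hypothetical. The object holding the reference is patched by the caller.
func exampleTemplateRotationNaming(ref *corev1.ObjectReference, desired *unstructured.Unstructured) {
	newName := names.SimpleNameGenerator.GenerateName("cluster1-md-0-") // e.g. "cluster1-md-0-abc12"
	desired.SetName(newName)
	ref.Name = newName
}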
  1176  
  1177  // createErrorWithoutObjectName removes the name of the object from the error message. As each new Create call involves an
  1178  // object with a unique generated name, each error appears to be a different error. As the errors are being surfaced in a condition
  1179  // on the Cluster, the name is removed here to prevent each creation error from triggering a new reconciliation.
  1180  func createErrorWithoutObjectName(ctx context.Context, err error, obj client.Object) error {
  1181  	log := ctrl.LoggerFrom(ctx)
  1182  	if obj != nil {
  1183  		log = log.WithValues(obj.GetObjectKind().GroupVersionKind().Kind, klog.KObj(obj))
  1184  	}
  1185  	log.Error(err, "Failed to create object")
  1186  
  1187  	var statusError *apierrors.StatusError
  1188  	if errors.As(err, &statusError) {
  1189  		var msg string
  1190  		if statusError.Status().Details != nil {
  1191  			var causes []string
  1192  			for _, cause := range statusError.Status().Details.Causes {
  1193  				causes = append(causes, fmt.Sprintf("%s: %s: %s", cause.Type, cause.Field, cause.Message))
  1194  			}
  1195  			if len(causes) > 0 {
  1196  				msg = fmt.Sprintf("failed to create %s.%s: %s", statusError.Status().Details.Kind, statusError.Status().Details.Group, strings.Join(causes, " "))
  1197  			} else {
  1198  				msg = fmt.Sprintf("failed to create %s.%s", statusError.Status().Details.Kind, statusError.Status().Details.Group)
  1199  			}
  1200  			statusError.ErrStatus.Message = msg
  1201  			return statusError
  1202  		}
  1203  
  1204  		if statusError.Status().Message != "" {
  1205  			if obj != nil {
  1206  				msg = fmt.Sprintf("failed to create %s", obj.GetObjectKind().GroupVersionKind().GroupKind().String())
  1207  			} else {
  1208  				msg = "failed to create object"
  1209  			}
  1210  		}
  1211  		statusError.ErrStatus.Message = msg
  1212  		return statusError
  1213  	}
  1214  	// If this isn't a StatusError return a more generic error with the object details.
  1215  	if obj != nil {
  1216  		return errors.Errorf("failed to create %s", obj.GetObjectKind().GroupVersionKind().GroupKind().String())
  1217  	}
  1218  	return errors.New("failed to create object")
  1219  }
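
// Illustrative sketch (not part of the original file): for a plain StatusError without details the
// helper replaces the server-provided message, so the randomly generated object name does not leak
// into Cluster conditions and retrigger reconciliation. NewBadRequest is only used here to fabricate
// an error for the sketch; the object name in the message is hypothetical.
func exampleCreateErrorWithoutObjectName(ctx context.Context, md *clusterv1.MachineDeployment) error {
	err := apierrors.NewBadRequest(`MachineDeployment "cluster1-md-0-abc12" is forbidden`)
	// The returned error message is rewritten and no longer contains the generated name.
	return createErrorWithoutObjectName(ctx, err, md)
}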