sigs.k8s.io/cluster-api@v1.7.1/internal/webhooks/cluster.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package webhooks
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"net"
    23  	"strconv"
    24  	"strings"
    25  	"time"
    26  
    27  	"github.com/blang/semver/v4"
    28  	"github.com/pkg/errors"
    29  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    30  	"k8s.io/apimachinery/pkg/runtime"
    31  	kerrors "k8s.io/apimachinery/pkg/util/errors"
    32  	"k8s.io/apimachinery/pkg/util/validation"
    33  	"k8s.io/apimachinery/pkg/util/validation/field"
    34  	"k8s.io/apimachinery/pkg/util/wait"
    35  	"k8s.io/klog/v2"
    36  	ctrl "sigs.k8s.io/controller-runtime"
    37  	"sigs.k8s.io/controller-runtime/pkg/client"
    38  	"sigs.k8s.io/controller-runtime/pkg/webhook"
    39  	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
    40  
    41  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    42  	"sigs.k8s.io/cluster-api/controllers/external"
    43  	expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
    44  	"sigs.k8s.io/cluster-api/feature"
    45  	"sigs.k8s.io/cluster-api/internal/contract"
    46  	"sigs.k8s.io/cluster-api/internal/topology/check"
    47  	"sigs.k8s.io/cluster-api/internal/topology/variables"
    48  	"sigs.k8s.io/cluster-api/util/conditions"
    49  	"sigs.k8s.io/cluster-api/util/version"
    50  )
    51  
    52  // SetupWebhookWithManager sets up Cluster webhooks.
    53  func (webhook *Cluster) SetupWebhookWithManager(mgr ctrl.Manager) error {
    54  	return ctrl.NewWebhookManagedBy(mgr).
    55  		For(&clusterv1.Cluster{}).
    56  		WithDefaulter(webhook).
    57  		WithValidator(webhook).
    58  		Complete()
    59  }
    60  
    61  // +kubebuilder:webhook:verbs=create;update;delete,path=/validate-cluster-x-k8s-io-v1beta1-cluster,mutating=false,failurePolicy=fail,matchPolicy=Equivalent,groups=cluster.x-k8s.io,resources=clusters,versions=v1beta1,name=validation.cluster.cluster.x-k8s.io,sideEffects=None,admissionReviewVersions=v1;v1beta1
    62  // +kubebuilder:webhook:verbs=create;update,path=/mutate-cluster-x-k8s-io-v1beta1-cluster,mutating=true,failurePolicy=fail,matchPolicy=Equivalent,groups=cluster.x-k8s.io,resources=clusters,versions=v1beta1,name=default.cluster.cluster.x-k8s.io,sideEffects=None,admissionReviewVersions=v1;v1beta1
    63  
// ClusterCacheTrackerReader is a scoped-down interface from ClusterCacheTracker that only allows to get a reader client.
// In this file it is used to obtain a reader for the workload cluster when checking MachinePool upgrades.
type ClusterCacheTrackerReader interface {
	// GetReader returns a client.Reader for the cluster identified by the given object key,
	// or an error if such a reader cannot be provided.
	GetReader(ctx context.Context, cluster client.ObjectKey) (client.Reader, error)
}
    68  
// Cluster implements a validating and defaulting webhook for Cluster.
type Cluster struct {
	// Client is the reader used by the version checks in this file to fetch the ControlPlane object
	// and to list MachineDeployments and MachinePools.
	Client  client.Reader
	// Tracker hands out reader clients for workload clusters; it is used when checking
	// whether MachinePools are still upgrading.
	Tracker ClusterCacheTrackerReader
}
    74  
    75  var _ webhook.CustomDefaulter = &Cluster{}
    76  var _ webhook.CustomValidator = &Cluster{}
    77  
    78  var errClusterClassNotReconciled = errors.New("ClusterClass is not up to date")
    79  
    80  // Default satisfies the defaulting webhook interface.
    81  func (webhook *Cluster) Default(ctx context.Context, obj runtime.Object) error {
    82  	// We gather all defaulting errors and return them together.
    83  	var allErrs field.ErrorList
    84  
    85  	cluster, ok := obj.(*clusterv1.Cluster)
    86  	if !ok {
    87  		return apierrors.NewBadRequest(fmt.Sprintf("expected a Cluster but got a %T", obj))
    88  	}
    89  
    90  	if cluster.Spec.InfrastructureRef != nil && cluster.Spec.InfrastructureRef.Namespace == "" {
    91  		cluster.Spec.InfrastructureRef.Namespace = cluster.Namespace
    92  	}
    93  
    94  	if cluster.Spec.ControlPlaneRef != nil && cluster.Spec.ControlPlaneRef.Namespace == "" {
    95  		cluster.Spec.ControlPlaneRef.Namespace = cluster.Namespace
    96  	}
    97  
    98  	// Additional defaulting if the Cluster uses a managed topology.
    99  	if cluster.Spec.Topology != nil {
   100  		// Tolerate version strings without a "v" prefix: prepend it if it's not there.
   101  		if !strings.HasPrefix(cluster.Spec.Topology.Version, "v") {
   102  			cluster.Spec.Topology.Version = "v" + cluster.Spec.Topology.Version
   103  		}
   104  		clusterClass, err := webhook.pollClusterClassForCluster(ctx, cluster)
   105  		if err != nil {
   106  			// If the ClusterClass can't be found or is not up to date ignore the error.
   107  			if apierrors.IsNotFound(err) || errors.Is(err, errClusterClassNotReconciled) {
   108  				return nil
   109  			}
   110  			return apierrors.NewInternalError(errors.Wrapf(err, "Cluster %s can't be defaulted. ClusterClass %s can not be retrieved", cluster.Name, cluster.Spec.Topology.Class))
   111  		}
   112  
   113  		// Doing both defaulting and validating here prevents a race condition where the ClusterClass could be
   114  		// different in the defaulting and validating webhook.
   115  		allErrs = append(allErrs, DefaultAndValidateVariables(cluster, clusterClass)...)
   116  
   117  		if len(allErrs) > 0 {
   118  			return apierrors.NewInvalid(clusterv1.GroupVersion.WithKind("Cluster").GroupKind(), cluster.Name, allErrs)
   119  		}
   120  	}
   121  	return nil
   122  }
   123  
   124  // ValidateCreate implements webhook.CustomValidator so a webhook will be registered for the type.
   125  func (webhook *Cluster) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
   126  	cluster, ok := obj.(*clusterv1.Cluster)
   127  	if !ok {
   128  		return nil, apierrors.NewBadRequest(fmt.Sprintf("expected a Cluster but got a %T", obj))
   129  	}
   130  	return webhook.validate(ctx, nil, cluster)
   131  }
   132  
   133  // ValidateUpdate implements webhook.CustomValidator so a webhook will be registered for the type.
   134  func (webhook *Cluster) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
   135  	newCluster, ok := newObj.(*clusterv1.Cluster)
   136  	if !ok {
   137  		return nil, apierrors.NewBadRequest(fmt.Sprintf("expected a Cluster but got a %T", newObj))
   138  	}
   139  	oldCluster, ok := oldObj.(*clusterv1.Cluster)
   140  	if !ok {
   141  		return nil, apierrors.NewBadRequest(fmt.Sprintf("expected a Cluster but got a %T", oldObj))
   142  	}
   143  	return webhook.validate(ctx, oldCluster, newCluster)
   144  }
   145  
// ValidateDelete implements webhook.CustomValidator so a webhook will be registered for the type.
// Deletion is never rejected here: the method ignores its arguments and returns no warnings and no error.
func (webhook *Cluster) ValidateDelete(_ context.Context, _ runtime.Object) (admission.Warnings, error) {
	return nil, nil
}
   150  
   151  func (webhook *Cluster) validate(ctx context.Context, oldCluster, newCluster *clusterv1.Cluster) (admission.Warnings, error) {
   152  	var allErrs field.ErrorList
   153  	var allWarnings admission.Warnings
   154  	// The Cluster name is used as a label value. This check ensures that names which are not valid label values are rejected.
   155  	if errs := validation.IsValidLabelValue(newCluster.Name); len(errs) != 0 {
   156  		for _, err := range errs {
   157  			allErrs = append(
   158  				allErrs,
   159  				field.Invalid(
   160  					field.NewPath("metadata", "name"),
   161  					newCluster.Name,
   162  					fmt.Sprintf("must be a valid label value %s", err),
   163  				),
   164  			)
   165  		}
   166  	}
   167  	specPath := field.NewPath("spec")
   168  	if newCluster.Spec.InfrastructureRef != nil && newCluster.Spec.InfrastructureRef.Namespace != newCluster.Namespace {
   169  		allErrs = append(
   170  			allErrs,
   171  			field.Invalid(
   172  				specPath.Child("infrastructureRef", "namespace"),
   173  				newCluster.Spec.InfrastructureRef.Namespace,
   174  				"must match metadata.namespace",
   175  			),
   176  		)
   177  	}
   178  
   179  	if newCluster.Spec.ControlPlaneRef != nil && newCluster.Spec.ControlPlaneRef.Namespace != newCluster.Namespace {
   180  		allErrs = append(
   181  			allErrs,
   182  			field.Invalid(
   183  				specPath.Child("controlPlaneRef", "namespace"),
   184  				newCluster.Spec.ControlPlaneRef.Namespace,
   185  				"must match metadata.namespace",
   186  			),
   187  		)
   188  	}
   189  	if newCluster.Spec.ClusterNetwork != nil {
   190  		// Ensure that the CIDR blocks defined under ClusterNetwork are valid.
   191  		if newCluster.Spec.ClusterNetwork.Pods != nil {
   192  			allErrs = append(allErrs, validateCIDRBlocks(specPath.Child("clusterNetwork", "pods", "cidrBlocks"),
   193  				newCluster.Spec.ClusterNetwork.Pods.CIDRBlocks)...)
   194  		}
   195  
   196  		if newCluster.Spec.ClusterNetwork.Services != nil {
   197  			allErrs = append(allErrs, validateCIDRBlocks(specPath.Child("clusterNetwork", "services", "cidrBlocks"),
   198  				newCluster.Spec.ClusterNetwork.Services.CIDRBlocks)...)
   199  		}
   200  	}
   201  
   202  	topologyPath := specPath.Child("topology")
   203  
   204  	// Validate the managed topology, if defined.
   205  	if newCluster.Spec.Topology != nil {
   206  		topologyWarnings, topologyErrs := webhook.validateTopology(ctx, oldCluster, newCluster, topologyPath)
   207  		allWarnings = append(allWarnings, topologyWarnings...)
   208  		allErrs = append(allErrs, topologyErrs...)
   209  	}
   210  
   211  	// On update.
   212  	if oldCluster != nil {
   213  		// Error if the update moves the cluster from Managed to Unmanaged i.e. the managed topology is removed on update.
   214  		if oldCluster.Spec.Topology != nil && newCluster.Spec.Topology == nil {
   215  			allErrs = append(allErrs, field.Forbidden(
   216  				topologyPath,
   217  				"cannot be removed from an existing Cluster",
   218  			))
   219  		}
   220  	}
   221  
   222  	if len(allErrs) > 0 {
   223  		return allWarnings, apierrors.NewInvalid(clusterv1.GroupVersion.WithKind("Cluster").GroupKind(), newCluster.Name, allErrs)
   224  	}
   225  	return allWarnings, nil
   226  }
   227  
   228  func (webhook *Cluster) validateTopology(ctx context.Context, oldCluster, newCluster *clusterv1.Cluster, fldPath *field.Path) (admission.Warnings, field.ErrorList) {
   229  	var allWarnings admission.Warnings
   230  
   231  	// NOTE: ClusterClass and managed topologies are behind ClusterTopology feature gate flag; the web hook
   232  	// must prevent the usage of Cluster.Topology in case the feature flag is disabled.
   233  	if !feature.Gates.Enabled(feature.ClusterTopology) {
   234  		return allWarnings, field.ErrorList{
   235  			field.Forbidden(
   236  				fldPath,
   237  				"can be set only if the ClusterTopology feature flag is enabled",
   238  			),
   239  		}
   240  	}
   241  
   242  	var allErrs field.ErrorList
   243  
   244  	// class should be defined.
   245  	if newCluster.Spec.Topology.Class == "" {
   246  		allErrs = append(
   247  			allErrs,
   248  			field.Required(
   249  				fldPath.Child("class"),
   250  				"class cannot be empty",
   251  			),
   252  		)
   253  		// Return early if there is no defined class to validate.
   254  		return allWarnings, allErrs
   255  	}
   256  
   257  	// version should be valid.
   258  	if !version.KubeSemver.MatchString(newCluster.Spec.Topology.Version) {
   259  		allErrs = append(
   260  			allErrs,
   261  			field.Invalid(
   262  				fldPath.Child("version"),
   263  				newCluster.Spec.Topology.Version,
   264  				"version must be a valid semantic version",
   265  			),
   266  		)
   267  	}
   268  
   269  	// metadata in topology should be valid
   270  	allErrs = append(allErrs, validateTopologyMetadata(newCluster.Spec.Topology, fldPath)...)
   271  
   272  	// upgrade concurrency should be a numeric value.
   273  	if concurrency, ok := newCluster.Annotations[clusterv1.ClusterTopologyUpgradeConcurrencyAnnotation]; ok {
   274  		concurrencyAnnotationField := field.NewPath("metadata", "annotations", clusterv1.ClusterTopologyUpgradeConcurrencyAnnotation)
   275  		concurrencyInt, err := strconv.Atoi(concurrency)
   276  		if err != nil {
   277  			allErrs = append(allErrs, field.Invalid(
   278  				concurrencyAnnotationField,
   279  				concurrency,
   280  				errors.Wrap(err, "could not parse the value of the annotation").Error(),
   281  			))
   282  		} else if concurrencyInt < 1 {
   283  			allErrs = append(allErrs, field.Invalid(
   284  				concurrencyAnnotationField,
   285  				concurrency,
   286  				"value cannot be less than 1",
   287  			))
   288  		}
   289  	}
   290  
   291  	// Get the ClusterClass referenced in the Cluster.
   292  	clusterClass, warnings, clusterClassPollErr := webhook.validateClusterClassExistsAndIsReconciled(ctx, newCluster)
   293  	// If the error is anything other than "NotFound" or "NotReconciled" return all errors.
   294  	if clusterClassPollErr != nil && !(apierrors.IsNotFound(clusterClassPollErr) || errors.Is(clusterClassPollErr, errClusterClassNotReconciled)) {
   295  		allErrs = append(
   296  			allErrs, field.InternalError(
   297  				fldPath.Child("class"),
   298  				clusterClassPollErr))
   299  		return allWarnings, allErrs
   300  	}
   301  
   302  	// Add the warnings if no error was returned.
   303  	allWarnings = append(allWarnings, warnings...)
   304  
   305  	// If there's no error validate the Cluster based on the ClusterClass.
   306  	if clusterClassPollErr == nil {
   307  		allErrs = append(allErrs, ValidateClusterForClusterClass(newCluster, clusterClass)...)
   308  	}
   309  	if oldCluster != nil { // On update
   310  		// The ClusterClass must exist to proceed with update validation. Return an error if the ClusterClass was
   311  		// not found.
   312  		if apierrors.IsNotFound(clusterClassPollErr) {
   313  			allErrs = append(
   314  				allErrs, field.InternalError(
   315  					fldPath.Child("class"),
   316  					clusterClassPollErr))
   317  			return allWarnings, allErrs
   318  		}
   319  
   320  		// Topology or Class can not be added on update unless ClusterTopologyUnsafeUpdateClassNameAnnotation is set.
   321  		if oldCluster.Spec.Topology == nil || oldCluster.Spec.Topology.Class == "" {
   322  			if _, ok := newCluster.Annotations[clusterv1.ClusterTopologyUnsafeUpdateClassNameAnnotation]; ok {
   323  				return allWarnings, allErrs
   324  			}
   325  
   326  			allErrs = append(
   327  				allErrs,
   328  				field.Forbidden(
   329  					fldPath.Child("class"),
   330  					"class cannot be set on an existing Cluster",
   331  				),
   332  			)
   333  			// return early here if there is no class to compare.
   334  			return allWarnings, allErrs
   335  		}
   336  
   337  		inVersion, err := semver.ParseTolerant(newCluster.Spec.Topology.Version)
   338  		if err != nil {
   339  			allErrs = append(
   340  				allErrs,
   341  				field.Invalid(
   342  					fldPath.Child("version"),
   343  					newCluster.Spec.Topology.Version,
   344  					"version must be a valid semantic version",
   345  				),
   346  			)
   347  		}
   348  		oldVersion, err := semver.ParseTolerant(oldCluster.Spec.Topology.Version)
   349  		if err != nil {
   350  			// NOTE: this should never happen. Nevertheless, handling this for extra caution.
   351  			allErrs = append(
   352  				allErrs,
   353  				field.Invalid(
   354  					fldPath.Child("version"),
   355  					oldCluster.Spec.Topology.Version,
   356  					"old version must be a valid semantic version",
   357  				),
   358  			)
   359  		}
   360  
   361  		if _, ok := newCluster.GetAnnotations()[clusterv1.ClusterTopologyUnsafeUpdateVersionAnnotation]; ok {
   362  			log := ctrl.LoggerFrom(ctx)
   363  			warningMsg := fmt.Sprintf("Skipping version validation for Cluster because annotation %q is set.", clusterv1.ClusterTopologyUnsafeUpdateVersionAnnotation)
   364  			log.Info(warningMsg)
   365  			allWarnings = append(allWarnings, warningMsg)
   366  		} else {
   367  			if err := webhook.validateTopologyVersion(ctx, fldPath.Child("version"), newCluster.Spec.Topology.Version, inVersion, oldVersion, oldCluster); err != nil {
   368  				allErrs = append(allErrs, err)
   369  			}
   370  		}
   371  
   372  		// If the ClusterClass referenced in the Topology has changed compatibility checks are needed.
   373  		if oldCluster.Spec.Topology.Class != newCluster.Spec.Topology.Class {
   374  			// Check to see if the ClusterClass referenced in the old version of the Cluster exists.
   375  			oldClusterClass, err := webhook.pollClusterClassForCluster(ctx, oldCluster)
   376  			if err != nil {
   377  				allErrs = append(
   378  					allErrs, field.Forbidden(
   379  						fldPath.Child("class"),
   380  						fmt.Sprintf("valid ClusterClass with name %q could not be retrieved, change from class %[1]q to class %q cannot be validated. Error: %s",
   381  							oldCluster.Spec.Topology.Class, newCluster.Spec.Topology.Class, err.Error())))
   382  
   383  				// Return early with errors if the ClusterClass can't be retrieved.
   384  				return allWarnings, allErrs
   385  			}
   386  
   387  			// Check if the new and old ClusterClasses are compatible with one another.
   388  			allErrs = append(allErrs, check.ClusterClassesAreCompatible(oldClusterClass, clusterClass)...)
   389  		}
   390  	}
   391  	return allWarnings, allErrs
   392  }
   393  
   394  func (webhook *Cluster) validateTopologyVersion(ctx context.Context, fldPath *field.Path, fldValue string, inVersion, oldVersion semver.Version, oldCluster *clusterv1.Cluster) *field.Error {
   395  	// Version could only be increased.
   396  	if inVersion.NE(semver.Version{}) && oldVersion.NE(semver.Version{}) && version.Compare(inVersion, oldVersion, version.WithBuildTags()) == -1 {
   397  		return field.Invalid(
   398  			fldPath,
   399  			fldValue,
   400  			fmt.Sprintf("version cannot be decreased from %q to %q", oldVersion, inVersion),
   401  		)
   402  	}
   403  
   404  	// A +2 minor version upgrade is not allowed.
   405  	ceilVersion := semver.Version{
   406  		Major: oldVersion.Major,
   407  		Minor: oldVersion.Minor + 2,
   408  		Patch: 0,
   409  	}
   410  	if inVersion.GTE(ceilVersion) {
   411  		return field.Invalid(
   412  			fldPath,
   413  			fldValue,
   414  			fmt.Sprintf("version cannot be increased from %q to %q", oldVersion, inVersion),
   415  		)
   416  	}
   417  
   418  	// Only check the following cases if the minor version increases by 1 (we already return above for >= 2).
   419  	ceilVersion = semver.Version{
   420  		Major: oldVersion.Major,
   421  		Minor: oldVersion.Minor + 1,
   422  		Patch: 0,
   423  	}
   424  
   425  	// Return early if its not a minor version upgrade.
   426  	if !inVersion.GTE(ceilVersion) {
   427  		return nil
   428  	}
   429  
   430  	allErrs := []error{}
   431  	// minor version cannot be increased if control plane is upgrading or not yet on the current version
   432  	if err := validateTopologyControlPlaneVersion(ctx, webhook.Client, oldCluster, oldVersion); err != nil {
   433  		allErrs = append(allErrs, fmt.Errorf("blocking version update due to ControlPlane version check: %v", err))
   434  	}
   435  
   436  	// minor version cannot be increased if MachineDeployments are upgrading or not yet on the current version
   437  	if err := validateTopologyMachineDeploymentVersions(ctx, webhook.Client, oldCluster, oldVersion); err != nil {
   438  		allErrs = append(allErrs, fmt.Errorf("blocking version update due to MachineDeployment version check: %v", err))
   439  	}
   440  
   441  	// minor version cannot be increased if MachinePools are upgrading or not yet on the current version
   442  	if err := validateTopologyMachinePoolVersions(ctx, webhook.Client, webhook.Tracker, oldCluster, oldVersion); err != nil {
   443  		allErrs = append(allErrs, fmt.Errorf("blocking version update due to MachinePool version check: %v", err))
   444  	}
   445  
   446  	if len(allErrs) > 0 {
   447  		return field.Invalid(
   448  			fldPath,
   449  			fldValue,
   450  			fmt.Sprintf("minor version update cannot happen at this time: %v", kerrors.NewAggregate(allErrs)),
   451  		)
   452  	}
   453  
   454  	return nil
   455  }
   456  
   457  func validateTopologyControlPlaneVersion(ctx context.Context, ctrlClient client.Reader, oldCluster *clusterv1.Cluster, oldVersion semver.Version) error {
   458  	cp, err := external.Get(ctx, ctrlClient, oldCluster.Spec.ControlPlaneRef, oldCluster.Namespace)
   459  	if err != nil {
   460  		return errors.Wrap(err, "failed to get ControlPlane object")
   461  	}
   462  
   463  	cpVersionString, err := contract.ControlPlane().Version().Get(cp)
   464  	if err != nil {
   465  		return errors.Wrap(err, "failed to get ControlPlane version")
   466  	}
   467  
   468  	cpVersion, err := semver.ParseTolerant(*cpVersionString)
   469  	if err != nil {
   470  		// NOTE: this should never happen. Nevertheless, handling this for extra caution.
   471  		return errors.New("failed to parse version of ControlPlane")
   472  	}
   473  	if cpVersion.NE(oldVersion) {
   474  		return fmt.Errorf("ControlPlane version %q does not match the current version %q", cpVersion, oldVersion)
   475  	}
   476  
   477  	provisioning, err := contract.ControlPlane().IsProvisioning(cp)
   478  	if err != nil {
   479  		return errors.Wrap(err, "failed to check if ControlPlane is provisioning")
   480  	}
   481  
   482  	if provisioning {
   483  		return errors.New("ControlPlane is currently provisioning")
   484  	}
   485  
   486  	upgrading, err := contract.ControlPlane().IsUpgrading(cp)
   487  	if err != nil {
   488  		return errors.Wrap(err, "failed to check if ControlPlane is upgrading")
   489  	}
   490  
   491  	if upgrading {
   492  		return errors.New("ControlPlane is still completing a previous upgrade")
   493  	}
   494  
   495  	return nil
   496  }
   497  
   498  func validateTopologyMachineDeploymentVersions(ctx context.Context, ctrlClient client.Reader, oldCluster *clusterv1.Cluster, oldVersion semver.Version) error {
   499  	// List all the machine deployments in the current cluster and in a managed topology.
   500  	// FROM: current_state.go getCurrentMachineDeploymentState
   501  	mds := &clusterv1.MachineDeploymentList{}
   502  	err := ctrlClient.List(ctx, mds,
   503  		client.MatchingLabels{
   504  			clusterv1.ClusterNameLabel:          oldCluster.Name,
   505  			clusterv1.ClusterTopologyOwnedLabel: "",
   506  		},
   507  		client.InNamespace(oldCluster.Namespace),
   508  	)
   509  	if err != nil {
   510  		return errors.Wrap(err, "failed to read MachineDeployments for managed topology")
   511  	}
   512  
   513  	if len(mds.Items) == 0 {
   514  		return nil
   515  	}
   516  
   517  	mdUpgradingNames := []string{}
   518  
   519  	for i := range mds.Items {
   520  		md := &mds.Items[i]
   521  
   522  		mdVersion, err := semver.ParseTolerant(*md.Spec.Template.Spec.Version)
   523  		if err != nil {
   524  			// NOTE: this should never happen. Nevertheless, handling this for extra caution.
   525  			return errors.Wrapf(err, "failed to parse MachineDeployment's %q version %q", klog.KObj(md), *md.Spec.Template.Spec.Version)
   526  		}
   527  
   528  		if mdVersion.NE(oldVersion) {
   529  			mdUpgradingNames = append(mdUpgradingNames, md.Name)
   530  			continue
   531  		}
   532  
   533  		upgrading, err := check.IsMachineDeploymentUpgrading(ctx, ctrlClient, md)
   534  		if err != nil {
   535  			return errors.Wrap(err, "failed to check if MachineDeployment is upgrading")
   536  		}
   537  		if upgrading {
   538  			mdUpgradingNames = append(mdUpgradingNames, md.Name)
   539  		}
   540  	}
   541  
   542  	if len(mdUpgradingNames) > 0 {
   543  		return fmt.Errorf("there are MachineDeployments still completing a previous upgrade: [%s]", strings.Join(mdUpgradingNames, ", "))
   544  	}
   545  
   546  	return nil
   547  }
   548  
   549  func validateTopologyMachinePoolVersions(ctx context.Context, ctrlClient client.Reader, tracker ClusterCacheTrackerReader, oldCluster *clusterv1.Cluster, oldVersion semver.Version) error {
   550  	// List all the machine pools in the current cluster and in a managed topology.
   551  	// FROM: current_state.go getCurrentMachinePoolState
   552  	mps := &expv1.MachinePoolList{}
   553  	err := ctrlClient.List(ctx, mps,
   554  		client.MatchingLabels{
   555  			clusterv1.ClusterNameLabel:          oldCluster.Name,
   556  			clusterv1.ClusterTopologyOwnedLabel: "",
   557  		},
   558  		client.InNamespace(oldCluster.Namespace),
   559  	)
   560  	if err != nil {
   561  		return errors.Wrap(err, "failed to read MachinePools for managed topology")
   562  	}
   563  
   564  	// Return early
   565  	if len(mps.Items) == 0 {
   566  		return nil
   567  	}
   568  
   569  	wlClient, err := tracker.GetReader(ctx, client.ObjectKeyFromObject(oldCluster))
   570  	if err != nil {
   571  		return errors.Wrap(err, "unable to get client for workload cluster")
   572  	}
   573  
   574  	mpUpgradingNames := []string{}
   575  
   576  	for i := range mps.Items {
   577  		mp := &mps.Items[i]
   578  
   579  		mpVersion, err := semver.ParseTolerant(*mp.Spec.Template.Spec.Version)
   580  		if err != nil {
   581  			// NOTE: this should never happen. Nevertheless, handling this for extra caution.
   582  			return errors.Wrapf(err, "failed to parse MachinePool's %q version %q", klog.KObj(mp), *mp.Spec.Template.Spec.Version)
   583  		}
   584  
   585  		if mpVersion.NE(oldVersion) {
   586  			mpUpgradingNames = append(mpUpgradingNames, mp.Name)
   587  			continue
   588  		}
   589  
   590  		upgrading, err := check.IsMachinePoolUpgrading(ctx, wlClient, mp)
   591  		if err != nil {
   592  			return errors.Wrap(err, "failed to check if MachinePool is upgrading")
   593  		}
   594  		if upgrading {
   595  			mpUpgradingNames = append(mpUpgradingNames, mp.Name)
   596  		}
   597  	}
   598  
   599  	if len(mpUpgradingNames) > 0 {
   600  		return fmt.Errorf("there are MachinePools still completing a previous upgrade: [%s]", strings.Join(mpUpgradingNames, ", "))
   601  	}
   602  
   603  	return nil
   604  }
   605  
   606  func validateMachineHealthChecks(cluster *clusterv1.Cluster, clusterClass *clusterv1.ClusterClass) field.ErrorList {
   607  	var allErrs field.ErrorList
   608  
   609  	if cluster.Spec.Topology.ControlPlane.MachineHealthCheck != nil {
   610  		fldPath := field.NewPath("spec", "topology", "controlPlane", "machineHealthCheck")
   611  
   612  		// Validate ControlPlane MachineHealthCheck if defined.
   613  		if !cluster.Spec.Topology.ControlPlane.MachineHealthCheck.MachineHealthCheckClass.IsZero() {
   614  			// Ensure ControlPlane does not define a MachineHealthCheck if the ClusterClass does not define MachineInfrastructure.
   615  			if clusterClass.Spec.ControlPlane.MachineInfrastructure == nil {
   616  				allErrs = append(allErrs, field.Forbidden(
   617  					fldPath,
   618  					"can be set only if spec.controlPlane.machineInfrastructure is set in ClusterClass",
   619  				))
   620  			}
   621  			allErrs = append(allErrs, validateMachineHealthCheckClass(fldPath, cluster.Namespace,
   622  				&cluster.Spec.Topology.ControlPlane.MachineHealthCheck.MachineHealthCheckClass)...)
   623  		}
   624  
   625  		// If MachineHealthCheck is explicitly enabled then make sure that a MachineHealthCheck definition is
   626  		// available either in the Cluster topology or in the ClusterClass.
   627  		// (One of these definitions will be used in the controller to create the MachineHealthCheck)
   628  
   629  		// Check if the machineHealthCheck is explicitly enabled in the ControlPlaneTopology.
   630  		if cluster.Spec.Topology.ControlPlane.MachineHealthCheck.Enable != nil && *cluster.Spec.Topology.ControlPlane.MachineHealthCheck.Enable {
   631  			// Ensure the MHC is defined in at least one of the ControlPlaneTopology of the Cluster or the ControlPlaneClass of the ClusterClass.
   632  			if cluster.Spec.Topology.ControlPlane.MachineHealthCheck.MachineHealthCheckClass.IsZero() && clusterClass.Spec.ControlPlane.MachineHealthCheck == nil {
   633  				allErrs = append(allErrs, field.Forbidden(
   634  					fldPath.Child("enable"),
   635  					fmt.Sprintf("cannot be set to %t as MachineHealthCheck definition is not available in the Cluster topology or the ClusterClass", *cluster.Spec.Topology.ControlPlane.MachineHealthCheck.Enable),
   636  				))
   637  			}
   638  		}
   639  	}
   640  
   641  	if cluster.Spec.Topology.Workers != nil {
   642  		for i := range cluster.Spec.Topology.Workers.MachineDeployments {
   643  			md := cluster.Spec.Topology.Workers.MachineDeployments[i]
   644  			if md.MachineHealthCheck != nil {
   645  				fldPath := field.NewPath("spec", "topology", "workers", "machineDeployments", "machineHealthCheck").Index(i)
   646  
   647  				// Validate the MachineDeployment MachineHealthCheck if defined.
   648  				if !md.MachineHealthCheck.MachineHealthCheckClass.IsZero() {
   649  					allErrs = append(allErrs, validateMachineHealthCheckClass(fldPath, cluster.Namespace,
   650  						&md.MachineHealthCheck.MachineHealthCheckClass)...)
   651  				}
   652  
   653  				// If MachineHealthCheck is explicitly enabled then make sure that a MachineHealthCheck definition is
   654  				// available either in the Cluster topology or in the ClusterClass.
   655  				// (One of these definitions will be used in the controller to create the MachineHealthCheck)
   656  				mdClass := machineDeploymentClassOfName(clusterClass, md.Class)
   657  				if mdClass != nil { // Note: we skip handling the nil case here as it is already handled in previous validations.
   658  					// Check if the machineHealthCheck is explicitly enabled in the machineDeploymentTopology.
   659  					if md.MachineHealthCheck.Enable != nil && *md.MachineHealthCheck.Enable {
   660  						// Ensure the MHC is defined in at least one of the MachineDeploymentTopology of the Cluster or the MachineDeploymentClass of the ClusterClass.
   661  						if md.MachineHealthCheck.MachineHealthCheckClass.IsZero() && mdClass.MachineHealthCheck == nil {
   662  							allErrs = append(allErrs, field.Forbidden(
   663  								fldPath.Child("enable"),
   664  								fmt.Sprintf("cannot be set to %t as MachineHealthCheck definition is not available in the Cluster topology or the ClusterClass", *md.MachineHealthCheck.Enable),
   665  							))
   666  						}
   667  					}
   668  				}
   669  			}
   670  		}
   671  	}
   672  
   673  	return allErrs
   674  }
   675  
   676  // machineDeploymentClassOfName find a MachineDeploymentClass of the given name in the provided ClusterClass.
   677  // Returns nil if it can not find one.
   678  // TODO: Check if there is already a helper function that can do this.
   679  func machineDeploymentClassOfName(clusterClass *clusterv1.ClusterClass, name string) *clusterv1.MachineDeploymentClass {
   680  	for _, mdClass := range clusterClass.Spec.Workers.MachineDeployments {
   681  		if mdClass.Class == name {
   682  			return &mdClass
   683  		}
   684  	}
   685  	return nil
   686  }
   687  
   688  // validateCIDRBlocks ensures the passed CIDR is valid.
   689  func validateCIDRBlocks(fldPath *field.Path, cidrs []string) field.ErrorList {
   690  	var allErrs field.ErrorList
   691  	for i, cidr := range cidrs {
   692  		if _, _, err := net.ParseCIDR(cidr); err != nil {
   693  			allErrs = append(allErrs, field.Invalid(
   694  				fldPath.Index(i),
   695  				cidr,
   696  				err.Error()))
   697  		}
   698  	}
   699  	return allErrs
   700  }
   701  
   702  // DefaultAndValidateVariables defaults and validates variables in the Cluster and MachineDeployment/MachinePool topologies based
   703  // on the definitions in the ClusterClass.
   704  func DefaultAndValidateVariables(cluster *clusterv1.Cluster, clusterClass *clusterv1.ClusterClass) field.ErrorList {
   705  	var allErrs field.ErrorList
   706  	allErrs = append(allErrs, DefaultVariables(cluster, clusterClass)...)
   707  
   708  	// Variables must be validated in the defaulting webhook. Variable definitions are stored in the ClusterClass status
   709  	// and are patched in the ClusterClass reconcile.
   710  	allErrs = append(allErrs, variables.ValidateClusterVariables(cluster.Spec.Topology.Variables, clusterClass.Status.Variables,
   711  		field.NewPath("spec", "topology", "variables"))...)
   712  	if cluster.Spec.Topology.Workers != nil {
   713  		for i, md := range cluster.Spec.Topology.Workers.MachineDeployments {
   714  			// Continue if there are no variable overrides.
   715  			if md.Variables == nil || len(md.Variables.Overrides) == 0 {
   716  				continue
   717  			}
   718  			allErrs = append(allErrs, variables.ValidateMachineVariables(md.Variables.Overrides, clusterClass.Status.Variables,
   719  				field.NewPath("spec", "topology", "workers", "machineDeployments").Index(i).Child("variables", "overrides"))...)
   720  		}
   721  		for i, mp := range cluster.Spec.Topology.Workers.MachinePools {
   722  			// Continue if there are no variable overrides.
   723  			if mp.Variables == nil || len(mp.Variables.Overrides) == 0 {
   724  				continue
   725  			}
   726  			allErrs = append(allErrs, variables.ValidateMachineVariables(mp.Variables.Overrides, clusterClass.Status.Variables,
   727  				field.NewPath("spec", "topology", "workers", "machinePools").Index(i).Child("variables", "overrides"))...)
   728  		}
   729  	}
   730  	return allErrs
   731  }
   732  
   733  // DefaultVariables defaults variables in the Cluster based on information in the ClusterClass.
   734  func DefaultVariables(cluster *clusterv1.Cluster, clusterClass *clusterv1.ClusterClass) field.ErrorList {
   735  	var allErrs field.ErrorList
   736  	if cluster == nil {
   737  		return field.ErrorList{field.InternalError(field.NewPath(""), errors.New("Cluster can not be nil"))}
   738  	}
   739  	if clusterClass == nil {
   740  		return field.ErrorList{field.InternalError(field.NewPath(""), errors.New("ClusterClass can not be nil"))}
   741  	}
   742  	defaultedVariables, errs := variables.DefaultClusterVariables(cluster.Spec.Topology.Variables, clusterClass.Status.Variables,
   743  		field.NewPath("spec", "topology", "variables"))
   744  	if len(errs) > 0 {
   745  		allErrs = append(allErrs, errs...)
   746  	} else {
   747  		cluster.Spec.Topology.Variables = defaultedVariables
   748  	}
   749  
   750  	if cluster.Spec.Topology.Workers != nil {
   751  		for i, md := range cluster.Spec.Topology.Workers.MachineDeployments {
   752  			// Continue if there are no variable overrides.
   753  			if md.Variables == nil || len(md.Variables.Overrides) == 0 {
   754  				continue
   755  			}
   756  			defaultedVariables, errs := variables.DefaultMachineVariables(md.Variables.Overrides, clusterClass.Status.Variables,
   757  				field.NewPath("spec", "topology", "workers", "machineDeployments").Index(i).Child("variables", "overrides"))
   758  			if len(errs) > 0 {
   759  				allErrs = append(allErrs, errs...)
   760  			} else {
   761  				md.Variables.Overrides = defaultedVariables
   762  			}
   763  		}
   764  		for i, mp := range cluster.Spec.Topology.Workers.MachinePools {
   765  			// Continue if there are no variable overrides.
   766  			if mp.Variables == nil || len(mp.Variables.Overrides) == 0 {
   767  				continue
   768  			}
   769  			defaultedVariables, errs := variables.DefaultMachineVariables(mp.Variables.Overrides, clusterClass.Status.Variables,
   770  				field.NewPath("spec", "topology", "workers", "machinePools").Index(i).Child("variables", "overrides"))
   771  			if len(errs) > 0 {
   772  				allErrs = append(allErrs, errs...)
   773  			} else {
   774  				mp.Variables.Overrides = defaultedVariables
   775  			}
   776  		}
   777  	}
   778  	return allErrs
   779  }
   780  
   781  // ValidateClusterForClusterClass uses information in the ClusterClass to validate the Cluster.
   782  func ValidateClusterForClusterClass(cluster *clusterv1.Cluster, clusterClass *clusterv1.ClusterClass) field.ErrorList {
   783  	var allErrs field.ErrorList
   784  	if cluster == nil {
   785  		return field.ErrorList{field.InternalError(field.NewPath(""), errors.New("Cluster can not be nil"))}
   786  	}
   787  	if clusterClass == nil {
   788  		return field.ErrorList{field.InternalError(field.NewPath(""), errors.New("ClusterClass can not be nil"))}
   789  	}
   790  	allErrs = append(allErrs, check.MachineDeploymentTopologiesAreValidAndDefinedInClusterClass(cluster, clusterClass)...)
   791  
   792  	allErrs = append(allErrs, check.MachinePoolTopologiesAreValidAndDefinedInClusterClass(cluster, clusterClass)...)
   793  
   794  	// Validate the MachineHealthChecks defined in the cluster topology.
   795  	allErrs = append(allErrs, validateMachineHealthChecks(cluster, clusterClass)...)
   796  	return allErrs
   797  }
   798  
   799  // validateClusterClassExistsAndIsReconciled will try to get the ClusterClass referenced in the Cluster. If it does not exist or is not reconciled it will add a warning.
   800  // In any other case it will return an error.
   801  func (webhook *Cluster) validateClusterClassExistsAndIsReconciled(ctx context.Context, newCluster *clusterv1.Cluster) (*clusterv1.ClusterClass, admission.Warnings, error) {
   802  	var allWarnings admission.Warnings
   803  	clusterClass, clusterClassPollErr := webhook.pollClusterClassForCluster(ctx, newCluster)
   804  	if clusterClassPollErr != nil {
   805  		// Add a warning if the Class does not exist or if it has not been successfully reconciled.
   806  		switch {
   807  		case apierrors.IsNotFound(clusterClassPollErr):
   808  			allWarnings = append(allWarnings,
   809  				fmt.Sprintf(
   810  					"Cluster refers to ClusterClass %s in the topology but it does not exist. "+
   811  						"Cluster topology has not been fully validated. "+
   812  						"The ClusterClass must be created to reconcile the Cluster", newCluster.Spec.Topology.Class),
   813  			)
   814  		case errors.Is(clusterClassPollErr, errClusterClassNotReconciled):
   815  			allWarnings = append(allWarnings,
   816  				fmt.Sprintf(
   817  					"Cluster refers to ClusterClass %s but this object which hasn't yet been reconciled. "+
   818  						"Cluster topology has not been fully validated. ", newCluster.Spec.Topology.Class),
   819  			)
   820  		// If there's any other error return a generic warning with the error message.
   821  		default:
   822  			allWarnings = append(allWarnings,
   823  				fmt.Sprintf(
   824  					"Cluster refers to ClusterClass %s in the topology but it could not be retrieved. "+
   825  						"Cluster topology has not been fully validated: %s", newCluster.Spec.Topology.Class, clusterClassPollErr.Error()),
   826  			)
   827  		}
   828  	}
   829  	return clusterClass, allWarnings, clusterClassPollErr
   830  }
   831  
   832  // pollClusterClassForCluster will retry getting the ClusterClass referenced in the Cluster for two seconds.
   833  func (webhook *Cluster) pollClusterClassForCluster(ctx context.Context, cluster *clusterv1.Cluster) (*clusterv1.ClusterClass, error) {
   834  	clusterClass := &clusterv1.ClusterClass{}
   835  	var clusterClassPollErr error
   836  	_ = wait.PollUntilContextTimeout(ctx, 200*time.Millisecond, 2*time.Second, true, func(ctx context.Context) (bool, error) {
   837  		if clusterClassPollErr = webhook.Client.Get(ctx, client.ObjectKey{Namespace: cluster.Namespace, Name: cluster.Spec.Topology.Class}, clusterClass); clusterClassPollErr != nil {
   838  			return false, nil //nolint:nilerr
   839  		}
   840  
   841  		if clusterClassPollErr = clusterClassIsReconciled(clusterClass); clusterClassPollErr != nil {
   842  			return false, nil //nolint:nilerr
   843  		}
   844  		clusterClassPollErr = nil
   845  		return true, nil
   846  	})
   847  	if clusterClassPollErr != nil {
   848  		return nil, clusterClassPollErr
   849  	}
   850  	return clusterClass, nil
   851  }
   852  
   853  // clusterClassIsReconciled returns errClusterClassNotReconciled if the ClusterClass has not successfully reconciled or if the
   854  // ClusterClass variables have not been successfully reconciled.
   855  func clusterClassIsReconciled(clusterClass *clusterv1.ClusterClass) error {
   856  	// If the clusterClass metadata generation does not match the status observed generation, the ClusterClass has not been successfully reconciled.
   857  	if clusterClass.Generation != clusterClass.Status.ObservedGeneration {
   858  		return errClusterClassNotReconciled
   859  	}
   860  	// If the clusterClass does not have ClusterClassVariablesReconciled==True, the ClusterClass has not been successfully reconciled.
   861  	if !conditions.Has(clusterClass, clusterv1.ClusterClassVariablesReconciledCondition) ||
   862  		conditions.IsFalse(clusterClass, clusterv1.ClusterClassVariablesReconciledCondition) {
   863  		return errClusterClassNotReconciled
   864  	}
   865  	return nil
   866  }
   867  
   868  func validateTopologyMetadata(topology *clusterv1.Topology, fldPath *field.Path) field.ErrorList {
   869  	var allErrs field.ErrorList
   870  	allErrs = append(allErrs, topology.ControlPlane.Metadata.Validate(fldPath.Child("controlPlane", "metadata"))...)
   871  	if topology.Workers != nil {
   872  		for idx, md := range topology.Workers.MachineDeployments {
   873  			allErrs = append(allErrs, md.Metadata.Validate(
   874  				fldPath.Child("workers", "machineDeployments").Index(idx).Child("metadata"),
   875  			)...)
   876  		}
   877  		for idx, mp := range topology.Workers.MachinePools {
   878  			allErrs = append(allErrs, mp.Metadata.Validate(
   879  				fldPath.Child("workers", "machinePools").Index(idx).Child("metadata"),
   880  			)...)
   881  		}
   882  	}
   883  	return allErrs
   884  }