sigs.k8s.io/cluster-api@v1.7.1/internal/controllers/topology/cluster/cluster_controller.go

/*
Copyright 2021 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cluster

import (
	"context"
	"fmt"
	"time"

	"github.com/pkg/errors"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/api/v1beta1/index"
	"sigs.k8s.io/cluster-api/controllers/external"
	"sigs.k8s.io/cluster-api/controllers/remote"
	expv1 "sigs.k8s.io/cluster-api/exp/api/v1beta1"
	runtimecatalog "sigs.k8s.io/cluster-api/exp/runtime/catalog"
	runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1"
	"sigs.k8s.io/cluster-api/exp/topology/desiredstate"
	"sigs.k8s.io/cluster-api/exp/topology/scope"
	"sigs.k8s.io/cluster-api/feature"
	"sigs.k8s.io/cluster-api/internal/controllers/topology/cluster/structuredmerge"
	"sigs.k8s.io/cluster-api/internal/hooks"
	tlog "sigs.k8s.io/cluster-api/internal/log"
	runtimeclient "sigs.k8s.io/cluster-api/internal/runtime/client"
	"sigs.k8s.io/cluster-api/internal/util/ssa"
	"sigs.k8s.io/cluster-api/internal/webhooks"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/cluster-api/util/predicates"
)

// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io;bootstrap.cluster.x-k8s.io;controlplane.cluster.x-k8s.io,resources=*,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters;clusters/status,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusterclasses,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinedeployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinepools,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinehealthchecks,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;create;delete

// Reconciler reconciles a managed topology for a Cluster object.
type Reconciler struct {
	Client  client.Client
	Tracker *remote.ClusterCacheTracker
	// APIReader is used to list MachineSets directly via the API server to avoid
	// race conditions caused by an outdated cache.
	APIReader client.Reader

	RuntimeClient runtimeclient.Client

	// WatchFilterValue is the label value used to filter events prior to reconciliation.
	WatchFilterValue string

	// UnstructuredCachingClient provides a client that forces caching of unstructured objects,
	// thus allowing us to optimize reads for templates or provider-specific objects in a managed topology.
	UnstructuredCachingClient client.Client

	externalTracker external.ObjectTracker
	recorder        record.EventRecorder

	// desiredStateGenerator is used to generate the desired state.
	desiredStateGenerator desiredstate.Generator

	patchHelperFactory structuredmerge.PatchHelperFactoryFunc
}

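// SetupWithManager sets up the Reconciler with a controller-runtime Manager, watching Clusters that
// use a managed topology as well as ClusterClasses and topology-owned MachineDeployments and MachinePools.
//
// A minimal wiring sketch (hypothetical caller code; the tracker, runtime client and unstructured
// caching client are assumed to be configured elsewhere):
//
//	r := &Reconciler{
//		Client:                    mgr.GetClient(),
//		APIReader:                 mgr.GetAPIReader(),
//		UnstructuredCachingClient: unstructuredCachingClient,
//		Tracker:                   tracker,
//		RuntimeClient:             runtimeClient,
//	}
//	if err := r.SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 10}); err != nil {
//		return err
//	}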
func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
	c, err := ctrl.NewControllerManagedBy(mgr).
		For(&clusterv1.Cluster{}, builder.WithPredicates(
			// Only reconcile Cluster with topology.
			predicates.ClusterHasTopology(ctrl.LoggerFrom(ctx)),
		)).
		Named("topology/cluster").
		Watches(
			&clusterv1.ClusterClass{},
			handler.EnqueueRequestsFromMapFunc(r.clusterClassToCluster),
		).
		Watches(
			&clusterv1.MachineDeployment{},
			handler.EnqueueRequestsFromMapFunc(r.machineDeploymentToCluster),
			// Only trigger Cluster reconciliation if the MachineDeployment is topology owned.
			builder.WithPredicates(predicates.ResourceIsTopologyOwned(ctrl.LoggerFrom(ctx))),
		).
		Watches(
			&expv1.MachinePool{},
			handler.EnqueueRequestsFromMapFunc(r.machinePoolToCluster),
			// Only trigger Cluster reconciliation if the MachinePool is topology owned.
			builder.WithPredicates(predicates.ResourceIsTopologyOwned(ctrl.LoggerFrom(ctx))),
		).
		WithOptions(options).
		WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)).
		Build(r)

	if err != nil {
		return errors.Wrap(err, "failed setting up with a controller manager")
	}

	r.externalTracker = external.ObjectTracker{
		Controller: c,
		Cache:      mgr.GetCache(),
	}
	r.desiredStateGenerator = desiredstate.NewGenerator(r.Client, r.Tracker, r.RuntimeClient)
	r.recorder = mgr.GetEventRecorderFor("topology/cluster-controller")
	if r.patchHelperFactory == nil {
		r.patchHelperFactory = serverSideApplyPatchHelperFactory(r.Client, ssa.NewCache())
	}
	return nil
}

// SetupForDryRun prepares the Reconciler for a dry run execution.
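// It replaces the server-side-apply based patch helper with a two-way patch helper that does not
// depend on managed fields, such as when computing a topology plan without applying changes.
// A hypothetical usage sketch (the client, runtime client and Cluster are assumed to exist):
//
//	r := &Reconciler{Client: c, APIReader: c, RuntimeClient: runtimeClient}
//	r.SetupForDryRun(record.NewFakeRecorder(32))
//	_, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(cluster)})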
func (r *Reconciler) SetupForDryRun(recorder record.EventRecorder) {
	r.desiredStateGenerator = desiredstate.NewGenerator(r.Client, r.Tracker, r.RuntimeClient)
	r.recorder = recorder
	r.patchHelperFactory = dryRunPatchHelperFactory(r.Client)
}

func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
	log := ctrl.LoggerFrom(ctx)

	// Fetch the Cluster instance.
	cluster := &clusterv1.Cluster{}
	if err := r.Client.Get(ctx, req.NamespacedName, cluster); err != nil {
		if apierrors.IsNotFound(err) {
			return ctrl.Result{}, nil
		}
		// Error reading the object - requeue the request.
		return ctrl.Result{}, err
	}
	cluster.APIVersion = clusterv1.GroupVersion.String()
	cluster.Kind = "Cluster"

	// Return early if the Cluster does not use a managed topology.
	// NOTE: We're already filtering events, but this is a safeguard for cases where MachineDeployments
	// carry the topology owned label while the corresponding Cluster is not topology owned.
	if cluster.Spec.Topology == nil {
		return ctrl.Result{}, nil
	}

	// Return early if the Cluster is paused.
	// TODO: What should we do if the cluster class is paused?
	if annotations.IsPaused(cluster, cluster) {
		log.Info("Reconciliation is paused for this object")
		return ctrl.Result{}, nil
	}

	patchHelper, err := patch.NewHelper(cluster, r.Client)
	if err != nil {
		return ctrl.Result{}, err
	}

	// Create a scope initialized with only the cluster; during reconcile
	// additional information will be added about the Cluster blueprint, current state and desired state.
	s := scope.New(cluster)

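	// Always patch the Cluster on the way out: reconcile the topology conditions first and then persist
	// them via the patch helper, aggregating any error with the one returned by the main reconcile path.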
	defer func() {
		if err := r.reconcileConditions(s, cluster, reterr); err != nil {
			reterr = kerrors.NewAggregate([]error{reterr, errors.Wrap(err, "failed to reconcile cluster topology conditions")})
			return
		}
		options := []patch.Option{
			patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
				clusterv1.TopologyReconciledCondition,
			}},
			patch.WithForceOverwriteConditions{},
		}
		if err := patchHelper.Patch(ctx, cluster, options...); err != nil {
			reterr = kerrors.NewAggregate([]error{reterr, err})
			return
		}
	}()

	// In case the object is deleted, the managed topology stops reconciling;
	// the other controllers will take care of deletion.
	if !cluster.ObjectMeta.DeletionTimestamp.IsZero() {
		return r.reconcileDelete(ctx, cluster)
	}

	// Handle normal reconciliation loop.
	result, err := r.reconcile(ctx, s)
	if err != nil {
		// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
		// the current cluster due to concurrent access.
		if errors.Is(err, remote.ErrClusterLocked) {
			log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
			return ctrl.Result{RequeueAfter: time.Minute}, nil
		}
	}
	return result, err
}

// reconcile handles cluster reconciliation.
func (r *Reconciler) reconcile(ctx context.Context, s *scope.Scope) (ctrl.Result, error) {
	var err error

	// Get ClusterClass.
	clusterClass := &clusterv1.ClusterClass{}
	key := client.ObjectKey{Name: s.Current.Cluster.Spec.Topology.Class, Namespace: s.Current.Cluster.Namespace}
	if err := r.Client.Get(ctx, key, clusterClass); err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "failed to retrieve ClusterClass %s", s.Current.Cluster.Spec.Topology.Class)
	}

	s.Blueprint.ClusterClass = clusterClass
	// If the ClusterClass `metadata.Generation` doesn't match `status.ObservedGeneration`, return early
	// because the ClusterClass is not up to date.
	// Note: This doesn't require a requeue, as a change to the ClusterClass observedGeneration will cause
	// an additional reconcile of the Cluster.
	if clusterClass.GetGeneration() != clusterClass.Status.ObservedGeneration {
		return ctrl.Result{}, nil
	}

	// Default and Validate the Cluster variables based on information from the ClusterClass.
	// This step is needed because, if the ClusterClass did not exist at Cluster creation time, some fields
	// may not have been defaulted or validated by the webhook.
	if errs := webhooks.DefaultAndValidateVariables(s.Current.Cluster, clusterClass); len(errs) > 0 {
		return ctrl.Result{}, apierrors.NewInvalid(clusterv1.GroupVersion.WithKind("Cluster").GroupKind(), s.Current.Cluster.Name, errs)
	}

	// Gets the blueprint with the ClusterClass and the referenced templates
	// and stores it in the request scope.
	s.Blueprint, err = r.getBlueprint(ctx, s.Current.Cluster, s.Blueprint.ClusterClass)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "error reading the ClusterClass")
	}

	// Gets the current state of the Cluster and stores it in the request scope.
	s.Current, err = r.getCurrentState(ctx, s)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "error reading current state of the Cluster topology")
	}

	// If the cluster topology is yet to be created, call the BeforeClusterCreate hook before proceeding.
	if feature.Gates.Enabled(feature.RuntimeSDK) {
		res, err := r.callBeforeClusterCreateHook(ctx, s)
		if err != nil {
			return reconcile.Result{}, err
		}
		if !res.IsZero() {
			return res, nil
		}
	}

	// Setup watches for InfrastructureCluster and ControlPlane CRs when they exist.
	if err := r.setupDynamicWatches(ctx, s); err != nil {
		return ctrl.Result{}, errors.Wrap(err, "error creating dynamic watch")
	}

	// Computes the desired state of the Cluster and stores it in the request scope.
	s.Desired, err = r.desiredStateGenerator.Generate(ctx, s)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "error computing the desired state of the Cluster topology")
	}

	// Reconciles the current and desired state of the Cluster.
	if err := r.reconcileState(ctx, s); err != nil {
		return ctrl.Result{}, errors.Wrap(err, "error reconciling the Cluster topology")
	}

	// requeueAfter will not be 0 if any of the runtime hooks returns a blocking response.
	requeueAfter := s.HookResponseTracker.AggregateRetryAfter()
	if requeueAfter != 0 {
		return ctrl.Result{RequeueAfter: requeueAfter}, nil
	}

	return ctrl.Result{}, nil
}

// setupDynamicWatches creates watches for InfrastructureCluster and ControlPlane CRs when they exist.
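// Watches registered through the externalTracker are assumed to be deduplicated per object kind by
// external.ObjectTracker, so calling this on every reconcile should not accumulate duplicate watches.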
func (r *Reconciler) setupDynamicWatches(ctx context.Context, s *scope.Scope) error {
	if s.Current.InfrastructureCluster != nil {
		if err := r.externalTracker.Watch(ctrl.LoggerFrom(ctx), s.Current.InfrastructureCluster,
			handler.EnqueueRequestForOwner(r.Client.Scheme(), r.Client.RESTMapper(), &clusterv1.Cluster{}),
			// Only trigger Cluster reconciliation if the InfrastructureCluster is topology owned.
			predicates.ResourceIsTopologyOwned(ctrl.LoggerFrom(ctx))); err != nil {
			return errors.Wrap(err, "error watching Infrastructure CR")
		}
	}
	if s.Current.ControlPlane.Object != nil {
		if err := r.externalTracker.Watch(ctrl.LoggerFrom(ctx), s.Current.ControlPlane.Object,
			handler.EnqueueRequestForOwner(r.Client.Scheme(), r.Client.RESTMapper(), &clusterv1.Cluster{}),
			// Only trigger Cluster reconciliation if the ControlPlane is topology owned.
			predicates.ResourceIsTopologyOwned(ctrl.LoggerFrom(ctx))); err != nil {
			return errors.Wrap(err, "error watching ControlPlane CR")
		}
	}
	return nil
}

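// callBeforeClusterCreateHook calls the BeforeClusterCreate hook while the Cluster is still in its
// creation phase, i.e. while spec.infrastructureRef and spec.controlPlaneRef are not set yet.
//
// For reference, a blocking hook response carries a payload roughly like the following
// (a hypothetical illustration of the runtimehooksv1 response):
//
//	{
//	  "status": "Success",
//	  "retryAfterSeconds": 30
//	}
//
// A retryAfterSeconds greater than zero blocks creation and causes a requeue; zero lets it proceed.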
func (r *Reconciler) callBeforeClusterCreateHook(ctx context.Context, s *scope.Scope) (reconcile.Result, error) {
	// If the cluster objects (InfraCluster, ControlPlane, etc.) are not yet created, we are in the creation phase.
	// Call the BeforeClusterCreate hook before proceeding.
	log := tlog.LoggerFrom(ctx)
	if s.Current.Cluster.Spec.InfrastructureRef == nil && s.Current.Cluster.Spec.ControlPlaneRef == nil {
		hookRequest := &runtimehooksv1.BeforeClusterCreateRequest{
			Cluster: *s.Current.Cluster,
		}
		hookResponse := &runtimehooksv1.BeforeClusterCreateResponse{}
		if err := r.RuntimeClient.CallAllExtensions(ctx, runtimehooksv1.BeforeClusterCreate, s.Current.Cluster, hookRequest, hookResponse); err != nil {
			return ctrl.Result{}, err
		}
		s.HookResponseTracker.Add(runtimehooksv1.BeforeClusterCreate, hookResponse)
		if hookResponse.RetryAfterSeconds != 0 {
			log.Infof("Creation of Cluster topology is blocked by %s hook", runtimecatalog.HookName(runtimehooksv1.BeforeClusterCreate))
			return ctrl.Result{RequeueAfter: time.Duration(hookResponse.RetryAfterSeconds) * time.Second}, nil
		}
	}
	return ctrl.Result{}, nil
}

// clusterClassToCluster is a handler.ToRequestsFunc used to enqueue reconcile requests for the
// Clusters that use a ClusterClass when that ClusterClass gets updated.
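// Note: the lookup relies on the ClusterClassNameField index being registered on the manager's cache
// (in cluster-api this is typically done via index.AddDefaultIndexes at manager setup); if the index
// is missing, the List call below fails and no Clusters are enqueued.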
func (r *Reconciler) clusterClassToCluster(ctx context.Context, o client.Object) []ctrl.Request {
	clusterClass, ok := o.(*clusterv1.ClusterClass)
	if !ok {
		panic(fmt.Sprintf("Expected a ClusterClass but got a %T", o))
	}

	clusterList := &clusterv1.ClusterList{}
	if err := r.Client.List(
		ctx,
		clusterList,
		client.MatchingFields{index.ClusterClassNameField: clusterClass.Name},
		client.InNamespace(clusterClass.Namespace),
	); err != nil {
		return nil
	}

	// There can be more than one Cluster using the same ClusterClass;
	// create a request for each of them.
	requests := []ctrl.Request{}
	for i := range clusterList.Items {
		requests = append(requests, ctrl.Request{NamespacedName: util.ObjectKey(&clusterList.Items[i])})
	}
	return requests
}

// machineDeploymentToCluster is a handler.ToRequestsFunc used to enqueue a reconcile request for a
// Cluster when one of its own MachineDeployments gets updated.
func (r *Reconciler) machineDeploymentToCluster(_ context.Context, o client.Object) []ctrl.Request {
	md, ok := o.(*clusterv1.MachineDeployment)
	if !ok {
		panic(fmt.Sprintf("Expected a MachineDeployment but got a %T", o))
	}
	if md.Spec.ClusterName == "" {
		return nil
	}

	return []ctrl.Request{{
		NamespacedName: types.NamespacedName{
			Namespace: md.Namespace,
			Name:      md.Spec.ClusterName,
		},
	}}
}

// machinePoolToCluster is a handler.ToRequestsFunc used to enqueue a reconcile request for a
// Cluster when one of its own MachinePools gets updated.
func (r *Reconciler) machinePoolToCluster(_ context.Context, o client.Object) []ctrl.Request {
	mp, ok := o.(*expv1.MachinePool)
	if !ok {
		panic(fmt.Sprintf("Expected a MachinePool but got a %T", o))
	}
	if mp.Spec.ClusterName == "" {
		return nil
	}

	return []ctrl.Request{{
		NamespacedName: types.NamespacedName{
			Namespace: mp.Namespace,
			Name:      mp.Spec.ClusterName,
		},
	}}
}

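// reconcileDelete handles the deletion phase of a Cluster with a managed topology.
// Note: hooks.MarkAsOkToDelete is expected to persist the ok-to-delete annotation on the Cluster, so
// subsequent reconciles skip calling the BeforeClusterDelete hook again once a non-blocking response
// has been received.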
func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Cluster) (ctrl.Result, error) {
	// Call the BeforeClusterDelete hook if the 'ok-to-delete' annotation is not set
	// and add the annotation to the cluster after receiving a successful non-blocking response.
	log := tlog.LoggerFrom(ctx)
	if feature.Gates.Enabled(feature.RuntimeSDK) {
		if !hooks.IsOkToDelete(cluster) {
			hookRequest := &runtimehooksv1.BeforeClusterDeleteRequest{
				Cluster: *cluster,
			}
			hookResponse := &runtimehooksv1.BeforeClusterDeleteResponse{}
			if err := r.RuntimeClient.CallAllExtensions(ctx, runtimehooksv1.BeforeClusterDelete, cluster, hookRequest, hookResponse); err != nil {
				return ctrl.Result{}, err
			}
			if hookResponse.RetryAfterSeconds != 0 {
				log.Infof("Cluster deletion is blocked by %q hook", runtimecatalog.HookName(runtimehooksv1.BeforeClusterDelete))
				return ctrl.Result{RequeueAfter: time.Duration(hookResponse.RetryAfterSeconds) * time.Second}, nil
			}
			// The BeforeClusterDelete hook returned a non-blocking response. Now the cluster is ready to be deleted.
			// Let's mark the cluster as `ok-to-delete`.
			if err := hooks.MarkAsOkToDelete(ctx, r.Client, cluster); err != nil {
				return ctrl.Result{}, err
			}
		}
	}
	return ctrl.Result{}, nil
}

// serverSideApplyPatchHelperFactory makes use of managed fields provided by server-side apply and is used by the controller.
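//
// A helper produced by the returned factory is then used by the topology state reconciliation code
// roughly as follows (a sketch based on the structuredmerge.PatchHelper interface as understood here;
// original and modified stand for the current and desired objects):
//
//	factory := serverSideApplyPatchHelperFactory(c, ssa.NewCache())
//	helper, err := factory(ctx, original, modified)
//	if err != nil {
//		return err
//	}
//	if helper.HasChanges() {
//		return helper.Patch(ctx)
//	}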
func serverSideApplyPatchHelperFactory(c client.Client, ssaCache ssa.Cache) structuredmerge.PatchHelperFactoryFunc {
	return func(ctx context.Context, original, modified client.Object, opts ...structuredmerge.HelperOption) (structuredmerge.PatchHelper, error) {
		return structuredmerge.NewServerSidePatchHelper(ctx, original, modified, c, ssaCache, opts...)
	}
}

// dryRunPatchHelperFactory makes use of a two-way patch and is used in situations where we cannot rely on managed fields.
func dryRunPatchHelperFactory(c client.Client) structuredmerge.PatchHelperFactoryFunc {
	return func(_ context.Context, original, modified client.Object, opts ...structuredmerge.HelperOption) (structuredmerge.PatchHelper, error) {
		return structuredmerge.NewTwoWaysPatchHelper(original, modified, c, opts...)
	}
}