sigs.k8s.io/cluster-api-provider-azure@v1.14.3/exp/controllers/azuremachinepoolmachine_controller.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controllers
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"time"
    23  
    24  	"github.com/pkg/errors"
    25  	corev1 "k8s.io/api/core/v1"
    26  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    27  	"k8s.io/apimachinery/pkg/runtime"
    28  	"k8s.io/client-go/tools/record"
    29  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    30  	"sigs.k8s.io/cluster-api-provider-azure/azure"
    31  	"sigs.k8s.io/cluster-api-provider-azure/azure/scope"
    32  	"sigs.k8s.io/cluster-api-provider-azure/azure/services/scalesetvms"
    33  	infracontroller "sigs.k8s.io/cluster-api-provider-azure/controllers"
    34  	infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
    35  	"sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing"
    36  	"sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
    37  	"sigs.k8s.io/cluster-api-provider-azure/util/tele"
    38  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    39  	capierrors "sigs.k8s.io/cluster-api/errors"
    40  	"sigs.k8s.io/cluster-api/util"
    41  	"sigs.k8s.io/cluster-api/util/annotations"
    42  	"sigs.k8s.io/cluster-api/util/conditions"
    43  	"sigs.k8s.io/cluster-api/util/predicates"
    44  	ctrl "sigs.k8s.io/controller-runtime"
    45  	"sigs.k8s.io/controller-runtime/pkg/client"
    46  	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    47  	"sigs.k8s.io/controller-runtime/pkg/handler"
    48  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    49  	"sigs.k8s.io/controller-runtime/pkg/source"
    50  )
    51  
    52  type (
    53  	azureMachinePoolMachineReconcilerFactory func(*scope.MachinePoolMachineScope) (azure.Reconciler, error)
    54  
    55  	// AzureMachinePoolMachineController handles Kubernetes change events for AzureMachinePoolMachine resources.
    56  	AzureMachinePoolMachineController struct {
    57  		client.Client
    58  		Scheme            *runtime.Scheme
    59  		Recorder          record.EventRecorder
    60  		Timeouts          reconciler.Timeouts
    61  		WatchFilterValue  string
    62  		reconcilerFactory azureMachinePoolMachineReconcilerFactory
    63  	}
    64  
    65  	azureMachinePoolMachineReconciler struct {
    66  		Scope              *scope.MachinePoolMachineScope
    67  		scalesetVMsService *scalesetvms.Service
    68  	}
    69  )
    70  
    71  // NewAzureMachinePoolMachineController creates a new AzureMachinePoolMachineController to handle updates to Azure Machine Pool Machines.
    72  func NewAzureMachinePoolMachineController(c client.Client, recorder record.EventRecorder, timeouts reconciler.Timeouts, watchFilterValue string) *AzureMachinePoolMachineController {
    73  	return &AzureMachinePoolMachineController{
    74  		Client:            c,
    75  		Recorder:          recorder,
    76  		Timeouts:          timeouts,
    77  		WatchFilterValue:  watchFilterValue,
    78  		reconcilerFactory: newAzureMachinePoolMachineReconciler,
    79  	}
    80  }
    81  
    82  // SetupWithManager initializes this controller with a manager.
    83  func (ampmr *AzureMachinePoolMachineController) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options infracontroller.Options) error {
    84  	ctx, log, done := tele.StartSpanWithLogger(ctx,
    85  		"controllers.AzureMachinePoolMachineController.SetupWithManager",
    86  		tele.KVP("controller", "AzureMachinePoolMachine"),
    87  	)
    88  	defer done()
    89  
    90  	var r reconcile.Reconciler = ampmr
    91  	if options.Cache != nil {
    92  		r = coalescing.NewReconciler(ampmr, options.Cache, log)
    93  	}
    94  
    95  	c, err := ctrl.NewControllerManagedBy(mgr).
    96  		WithOptions(options.Options).
    97  		For(&infrav1exp.AzureMachinePoolMachine{}).
    98  		WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(log, ampmr.WatchFilterValue)).
    99  		Build(r)
   100  	if err != nil {
   101  		return errors.Wrapf(err, "error creating controller")
   102  	}
   103  
   104  	// Add a watch on AzureMachinePool for model changes
   105  	if err := c.Watch(
   106  		source.Kind(mgr.GetCache(), &infrav1exp.AzureMachinePool{}),
   107  		handler.EnqueueRequestsFromMapFunc(AzureMachinePoolToAzureMachinePoolMachines(ctx, mgr.GetClient(), log)),
   108  		MachinePoolModelHasChanged(log),
   109  		predicates.ResourceNotPausedAndHasFilterLabel(log, ampmr.WatchFilterValue),
   110  	); err != nil {
   111  		return errors.Wrapf(err, "failed adding a watch for AzureMachinePool model changes")
   112  	}
   113  
   114  	// Add a watch on CAPI Machines for MachinePool Machines
   115  	if err := c.Watch(
   116  		source.Kind(mgr.GetCache(), &clusterv1.Machine{}),
   117  		handler.EnqueueRequestsFromMapFunc(util.MachineToInfrastructureMapFunc(infrav1exp.GroupVersion.WithKind("AzureMachinePoolMachine"))),
   118  		predicates.ResourceNotPausedAndHasFilterLabel(log, ampmr.WatchFilterValue),
   119  	); err != nil {
   120  		return errors.Wrapf(err, "failed adding a watch for Machine model changes")
   121  	}
   122  
   123  	return nil
   124  }
   125  
   126  // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachinepools,verbs=get;list;watch
   127  // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachinepools/status,verbs=get
   128  // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachinepoolmachines,verbs=get;list;watch;create;update;patch;delete
   129  // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachinepoolmachines/status,verbs=get;update;patch
   130  // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinepools;machinepools/status,verbs=get
   131  // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;delete
   132  // +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch
   133  // +kubebuilder:rbac:groups="",resources=secrets;,verbs=get;list;watch
   134  // +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch
   135  
   136  // Reconcile idempotently gets, creates, and updates a machine pool.
   137  func (ampmr *AzureMachinePoolMachineController) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
   138  	ctx, logger, done := tele.StartSpanWithLogger(
   139  		ctx,
   140  		"controllers.AzureMachinePoolMachineController.Reconcile",
   141  		tele.KVP("namespace", req.Namespace),
   142  		tele.KVP("name", req.Name),
   143  		tele.KVP("kind", "AzureMachinePoolMachine"),
   144  	)
   145  	defer done()
   146  
   147  	logger = logger.WithValues("namespace", req.Namespace, "azureMachinePoolMachine", req.Name)
   148  
   149  	ctx, cancel := context.WithTimeout(ctx, ampmr.Timeouts.DefaultedLoopTimeout())
   150  	defer cancel()
   151  
   152  	azureMachine := &infrav1exp.AzureMachinePoolMachine{}
   153  	err := ampmr.Get(ctx, req.NamespacedName, azureMachine)
   154  	if err != nil {
   155  		if apierrors.IsNotFound(err) {
   156  			return reconcile.Result{}, nil
   157  		}
   158  		return reconcile.Result{}, err
   159  	}
   160  	logger.V(2).Info("Fetching cluster for AzureMachinePoolMachine", "ampm", azureMachine.Name)
   161  
   162  	// Fetch the Cluster.
   163  	cluster, err := util.GetClusterFromMetadata(ctx, ampmr.Client, azureMachine.ObjectMeta)
   164  	if err != nil {
   165  		logger.Info("AzureMachinePoolMachine is missing cluster label or cluster does not exist")
   166  		return reconcile.Result{}, nil
   167  	}
   168  
   169  	logger = logger.WithValues("cluster", cluster.Name)
   170  
   171  	// Return early if the object or Cluster is paused.
   172  	if annotations.IsPaused(cluster, azureMachine) {
   173  		logger.Info("AzureMachinePoolMachine or linked Cluster is marked as paused. Won't reconcile")
   174  		return ctrl.Result{}, nil
   175  	}
   176  
   177  	clusterScope, err := infracontroller.GetClusterScoper(ctx, logger, ampmr.Client, cluster, ampmr.Timeouts)
   178  	if err != nil {
   179  		return reconcile.Result{}, errors.Wrapf(err, "failed to create cluster scope for cluster %s/%s", cluster.Namespace, cluster.Name)
   180  	}
   181  
   182  	logger.V(2).Info("Fetching AzureMachinePool with object meta", "meta", azureMachine.ObjectMeta)
   183  	// Fetch the owning AzureMachinePool (VMSS)
   184  	azureMachinePool, err := infracontroller.GetOwnerAzureMachinePool(ctx, ampmr.Client, azureMachine.ObjectMeta)
   185  	if err != nil {
   186  		if apierrors.IsNotFound(err) {
   187  			logger.Info("AzureMachinePool not found error missing, removing finalizer", "azureMachinePoolMachine", azureMachine.Name)
   188  			controllerutil.RemoveFinalizer(azureMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
   189  			return reconcile.Result{}, ampmr.Client.Update(ctx, azureMachine)
   190  		}
   191  		return reconcile.Result{}, err
   192  	}
   193  	if azureMachinePool == nil {
   194  		logger.Info("AzureMachinePool not found error missing, removing finalizer", "azureMachinePoolMachine", azureMachine.Name)
   195  		controllerutil.RemoveFinalizer(azureMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
   196  		return reconcile.Result{}, ampmr.Client.Update(ctx, azureMachine)
   197  	}
   198  
   199  	logger = logger.WithValues("azureMachinePool", azureMachinePool.Name)
   200  
   201  	// Fetch the CAPI MachinePool.
   202  	machinePool, err := infracontroller.GetOwnerMachinePool(ctx, ampmr.Client, azureMachinePool.ObjectMeta)
   203  	if err != nil && !apierrors.IsNotFound(err) {
   204  		return reconcile.Result{}, err
   205  	}
   206  
   207  	if machinePool != nil {
   208  		logger = logger.WithValues("machinePool", machinePool.Name)
   209  	}
   210  
   211  	// Fetch the CAPI Machine.
   212  	machine, err := util.GetOwnerMachine(ctx, ampmr.Client, azureMachine.ObjectMeta)
   213  	if err != nil && !apierrors.IsNotFound(err) {
   214  		return reconcile.Result{}, err
   215  	}
   216  
   217  	if machine != nil {
   218  		logger = logger.WithValues("machine", machine.Name)
   219  	} else {
   220  		logger.Info("Waiting for Machine Controller to set OwnerRef on AzureMachinePoolMachine")
   221  		return reconcile.Result{}, nil
   222  	}
   223  
   224  	// Create the machine pool scope
   225  	machineScope, err := scope.NewMachinePoolMachineScope(scope.MachinePoolMachineScopeParams{
   226  		Client:                  ampmr.Client,
   227  		MachinePool:             machinePool,
   228  		AzureMachinePool:        azureMachinePool,
   229  		AzureMachinePoolMachine: azureMachine,
   230  		Machine:                 machine,
   231  		ClusterScope:            clusterScope,
   232  	})
   233  	if err != nil {
   234  		return reconcile.Result{}, errors.Wrap(err, "failed to create scope")
   235  	}
   236  
   237  	// Always close the scope when exiting this function so we can persist any AzureMachine changes.
   238  	defer func() {
   239  		if err := machineScope.Close(ctx); err != nil && reterr == nil {
   240  			reterr = err
   241  		}
   242  	}()
   243  
   244  	// Handle deleted machine pools machine
   245  	if !azureMachine.ObjectMeta.DeletionTimestamp.IsZero() {
   246  		return ampmr.reconcileDelete(ctx, machineScope, clusterScope)
   247  	}
   248  
   249  	if !cluster.Status.InfrastructureReady {
   250  		logger.Info("Cluster infrastructure is not ready yet")
   251  		return reconcile.Result{}, nil
   252  	}
   253  
   254  	// Handle non-deleted machine pools
   255  	return ampmr.reconcileNormal(ctx, machineScope)
   256  }
   257  
   258  func (ampmr *AzureMachinePoolMachineController) reconcileNormal(ctx context.Context, machineScope *scope.MachinePoolMachineScope) (_ reconcile.Result, reterr error) {
   259  	ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachinePoolMachineController.reconcileNormal")
   260  	defer done()
   261  
   262  	log.Info("Reconciling AzureMachinePoolMachine")
   263  	// If the AzureMachine is in an error state, return early.
   264  	if machineScope.AzureMachinePool.Status.FailureReason != nil || machineScope.AzureMachinePool.Status.FailureMessage != nil {
   265  		log.Info("Error state detected, skipping reconciliation")
   266  		return reconcile.Result{}, nil
   267  	}
   268  
   269  	ampms, err := ampmr.reconcilerFactory(machineScope)
   270  	if err != nil {
   271  		return reconcile.Result{}, errors.Wrap(err, "failed to create AzureMachinePoolMachine reconciler")
   272  	}
   273  	if err := ampms.Reconcile(ctx); err != nil {
   274  		// Handle transient and terminal errors
   275  		var reconcileError azure.ReconcileError
   276  		if errors.As(err, &reconcileError) {
   277  			if reconcileError.IsTerminal() {
   278  				log.Error(err, "failed to reconcile AzureMachinePool", "name", machineScope.Name())
   279  				return reconcile.Result{}, nil
   280  			}
   281  
   282  			if reconcileError.IsTransient() {
   283  				log.V(4).Info("failed to reconcile AzureMachinePoolMachine", "name", machineScope.Name(), "transient_error", err)
   284  				return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
   285  			}
   286  
   287  			return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachinePool")
   288  		}
   289  
   290  		return reconcile.Result{}, err
   291  	}
   292  
   293  	state := machineScope.ProvisioningState()
   294  	switch state {
   295  	case infrav1.Failed:
   296  		ampmr.Recorder.Eventf(machineScope.AzureMachinePoolMachine, corev1.EventTypeWarning, "FailedVMState", "Azure scale set VM is in failed state")
   297  		machineScope.SetFailureReason(capierrors.UpdateMachineError)
   298  		machineScope.SetFailureMessage(errors.Errorf("Azure VM state is %s", state))
   299  	case infrav1.Deleting:
   300  		if err := ampmr.Client.Delete(ctx, machineScope.AzureMachinePoolMachine); err != nil {
   301  			return reconcile.Result{}, errors.Wrap(err, "machine pool machine failed to be deleted when deleting")
   302  		}
   303  	}
   304  
   305  	log.V(2).Info(fmt.Sprintf("Scale Set VM is %s", state), "id", machineScope.ProviderID())
   306  
   307  	bootstrappingCondition := conditions.Get(machineScope.AzureMachinePoolMachine, infrav1.BootstrapSucceededCondition)
   308  	if bootstrappingCondition != nil && bootstrappingCondition.Reason == infrav1.BootstrapFailedReason {
   309  		return reconcile.Result{}, nil
   310  	}
   311  
   312  	if !infrav1.IsTerminalProvisioningState(state) || !machineScope.IsReady() {
   313  		log.V(2).Info("Requeuing", "state", state, "ready", machineScope.IsReady())
   314  		// we are in a non-terminal state, retry in a bit
   315  		return reconcile.Result{
   316  			RequeueAfter: 30 * time.Second,
   317  		}, nil
   318  	}
   319  
   320  	return reconcile.Result{}, nil
   321  }
   322  
   323  func (ampmr *AzureMachinePoolMachineController) reconcileDelete(ctx context.Context, machineScope *scope.MachinePoolMachineScope, clusterScope infracontroller.ClusterScoper) (_ reconcile.Result, reterr error) {
   324  	ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachinePoolMachineController.reconcileDelete")
   325  	defer done()
   326  
   327  	if !infracontroller.ShouldDeleteIndividualResources(ctx, clusterScope) {
   328  		log.Info("Skipping VMSS VM deletion as the whole resource group is being deleted")
   329  
   330  		controllerutil.RemoveFinalizer(machineScope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
   331  		return reconcile.Result{}, nil
   332  	}
   333  
   334  	if !machineScope.AzureMachinePool.ObjectMeta.DeletionTimestamp.IsZero() {
   335  		log.Info("Skipping VMSS VM deletion as VMSS delete will delete individual instances")
   336  
   337  		controllerutil.RemoveFinalizer(machineScope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
   338  		return reconcile.Result{}, nil
   339  	}
   340  
   341  	log.Info("Deleting AzureMachinePoolMachine")
   342  
   343  	// deleting a single machine
   344  	// 1) delete the infrastructure, node drain already done by owner Machine
   345  	// 2) remove finalizer
   346  
   347  	ampms, err := ampmr.reconcilerFactory(machineScope)
   348  	if err != nil {
   349  		return reconcile.Result{}, errors.Wrap(err, "failed to create AzureMachinePoolMachine reconciler")
   350  	}
   351  	if err := ampms.Delete(ctx); err != nil {
   352  		// Handle transient and terminal errors
   353  		var reconcileError azure.ReconcileError
   354  		if errors.As(err, &reconcileError) {
   355  			if reconcileError.IsTerminal() {
   356  				log.Error(err, "failed to delete AzureMachinePoolMachine", "name", machineScope.Name())
   357  				return reconcile.Result{}, nil
   358  			}
   359  
   360  			if reconcileError.IsTransient() {
   361  				log.V(4).Info("failed to delete AzureMachinePoolMachine", "name", machineScope.Name(), "transient_error", err)
   362  				return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
   363  			}
   364  
   365  			return reconcile.Result{}, errors.Wrapf(err, "failed to reconcile AzureMachinePool")
   366  		}
   367  
   368  		return reconcile.Result{}, err
   369  	}
   370  
   371  	return reconcile.Result{}, nil
   372  }
   373  
   374  func newAzureMachinePoolMachineReconciler(scope *scope.MachinePoolMachineScope) (azure.Reconciler, error) {
   375  	scaleSetVMsSvc, err := scalesetvms.NewService(scope)
   376  	if err != nil {
   377  		return nil, err
   378  	}
   379  	return &azureMachinePoolMachineReconciler{
   380  		Scope:              scope,
   381  		scalesetVMsService: scaleSetVMsSvc,
   382  	}, nil
   383  }
   384  
   385  // Reconcile will reconcile the state of the Machine Pool Machine with the state of the Azure VMSS VM.
   386  func (r *azureMachinePoolMachineReconciler) Reconcile(ctx context.Context) error {
   387  	ctx, _, done := tele.StartSpanWithLogger(ctx, "controllers.azureMachinePoolMachineReconciler.Reconcile")
   388  	defer done()
   389  
   390  	if err := r.scalesetVMsService.Reconcile(ctx); err != nil {
   391  		return errors.Wrap(err, "failed to reconcile scalesetVMs")
   392  	}
   393  
   394  	if err := r.Scope.UpdateNodeStatus(ctx); err != nil {
   395  		return errors.Wrap(err, "failed to update VMSS VM node status")
   396  	}
   397  
   398  	if err := r.Scope.UpdateInstanceStatus(ctx); err != nil {
   399  		return errors.Wrap(err, "failed to update VMSS VM instance status")
   400  	}
   401  
   402  	return nil
   403  }
   404  
   405  // Delete will attempt to drain and delete the Azure VMSS VM.
   406  func (r *azureMachinePoolMachineReconciler) Delete(ctx context.Context) error {
   407  	ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.azureMachinePoolMachineReconciler.Delete")
   408  	defer done()
   409  
   410  	defer func() {
   411  		if err := r.Scope.UpdateNodeStatus(ctx); err != nil {
   412  			log.V(4).Info("failed to update VMSS VM node status during delete")
   413  		}
   414  
   415  		if err := r.Scope.UpdateInstanceStatus(ctx); err != nil {
   416  			log.V(4).Info("failed to update VMSS VM instance status during delete")
   417  		}
   418  	}()
   419  
   420  	if err := r.scalesetVMsService.Delete(ctx); err != nil {
   421  		return errors.Wrap(err, "failed to reconcile scalesetVMs")
   422  	}
   423  
   424  	// no long running operation, so we are finished deleting the resource. Remove the finalizer.
   425  	controllerutil.RemoveFinalizer(r.Scope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
   426  
   427  	return nil
   428  }