sigs.k8s.io/cluster-api-provider-azure@v1.14.3/controllers/azuremachine_controller.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"
	"fmt"

	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/client-go/tools/record"
	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
	"sigs.k8s.io/cluster-api-provider-azure/azure"
	"sigs.k8s.io/cluster-api-provider-azure/azure/scope"
	"sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing"
	"sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
	"sigs.k8s.io/cluster-api-provider-azure/util/tele"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	capierrors "sigs.k8s.io/cluster-api/errors"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/predicates"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/source"
)

// AzureMachineReconciler reconciles an AzureMachine object.
type AzureMachineReconciler struct {
	client.Client
	Recorder                  record.EventRecorder
	Timeouts                  reconciler.Timeouts
	WatchFilterValue          string
	createAzureMachineService azureMachineServiceCreator
}

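// azureMachineServiceCreator is the constructor used to build the azureMachineService for a
// given MachineScope. Keeping it as a field on the reconciler makes it possible to substitute
// a different constructor, e.g. one returning a fake service in unit tests. A minimal sketch
// (illustrative only; fakeAMS is a hypothetical test double, not part of this file):
//
//	amr.createAzureMachineService = func(machineScope *scope.MachineScope) (*azureMachineService, error) {
//		return fakeAMS, nil
//	}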
type azureMachineServiceCreator func(machineScope *scope.MachineScope) (*azureMachineService, error)

// NewAzureMachineReconciler returns a new AzureMachineReconciler instance.
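//
// A minimal wiring sketch (illustrative only; mgr, ctx, watchFilterValue, and the Options
// value are assumptions about the caller, typically the provider's main, not part of this file):
//
//	amr := NewAzureMachineReconciler(
//		mgr.GetClient(),
//		mgr.GetEventRecorderFor("azuremachine-reconciler"),
//		reconciler.Timeouts{},
//		watchFilterValue,
//	)
//	if err := amr.SetupWithManager(ctx, mgr, Options{}); err != nil {
//		// handle setup error
//	}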
func NewAzureMachineReconciler(client client.Client, recorder record.EventRecorder, timeouts reconciler.Timeouts, watchFilterValue string) *AzureMachineReconciler {
	amr := &AzureMachineReconciler{
		Client:           client,
		Recorder:         recorder,
		Timeouts:         timeouts,
		WatchFilterValue: watchFilterValue,
	}

	amr.createAzureMachineService = newAzureMachineService

	return amr
}

// SetupWithManager initializes this controller with a manager.
func (amr *AzureMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options Options) error {
	ctx, log, done := tele.StartSpanWithLogger(ctx,
		"controllers.AzureMachineReconciler.SetupWithManager",
		tele.KVP("controller", "AzureMachine"),
	)
	defer done()

	var r reconcile.Reconciler = amr
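	// When a coalescing cache is configured, wrap the reconciler so that requests for the
	// same object arriving in quick succession are coalesced rather than each triggering a
	// full reconcile.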
	if options.Cache != nil {
		r = coalescing.NewReconciler(amr, options.Cache, log)
	}

	// create mapper to transform incoming AzureClusters into AzureMachine requests
	azureClusterToAzureMachinesMapper, err := AzureClusterToAzureMachinesMapper(ctx, amr.Client, &infrav1.AzureMachineList{}, mgr.GetScheme(), log)
	if err != nil {
		return errors.Wrap(err, "failed to create AzureCluster to AzureMachines mapper")
	}

	c, err := ctrl.NewControllerManagedBy(mgr).
		WithOptions(options.Options).
		For(&infrav1.AzureMachine{}).
		WithEventFilter(predicates.ResourceHasFilterLabel(log, amr.WatchFilterValue)).
		// watch for changes in CAPI Machine resources
		Watches(
			&clusterv1.Machine{},
			handler.EnqueueRequestsFromMapFunc(util.MachineToInfrastructureMapFunc(infrav1.GroupVersion.WithKind("AzureMachine"))),
		).
		// watch for changes in AzureCluster
		Watches(
			&infrav1.AzureCluster{},
			handler.EnqueueRequestsFromMapFunc(azureClusterToAzureMachinesMapper),
		).
		Build(r)
	if err != nil {
		return errors.Wrap(err, "error creating controller")
	}

	azureMachineMapper, err := util.ClusterToTypedObjectsMapper(amr.Client, &infrav1.AzureMachineList{}, mgr.GetScheme())
	if err != nil {
		return errors.Wrap(err, "failed to create mapper for Cluster to AzureMachines")
	}

	// Add a watch on clusterv1.Cluster object for pause/unpause & ready notifications.
	if err := c.Watch(
		source.Kind(mgr.GetCache(), &clusterv1.Cluster{}),
		handler.EnqueueRequestsFromMapFunc(azureMachineMapper),
		ClusterPauseChangeAndInfrastructureReady(log),
		predicates.ResourceHasFilterLabel(log, amr.WatchFilterValue),
	); err != nil {
		return errors.Wrap(err, "failed adding a watch for ready clusters")
	}

	return nil
}

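// The kubebuilder markers below are consumed by controller-gen to generate the RBAC rules
// (ClusterRole) that this controller needs at runtime.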
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachines,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachines/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch
// +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch
// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch

// Reconcile idempotently gets, creates, and updates a machine.
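// It also dispatches to reconcilePause when the object or its Cluster is paused, and to
// reconcileDelete when the AzureMachine has a deletion timestamp.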
func (amr *AzureMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
	ctx, cancel := context.WithTimeout(ctx, amr.Timeouts.DefaultedLoopTimeout())
	defer cancel()

	ctx, log, done := tele.StartSpanWithLogger(
		ctx,
		"controllers.AzureMachineReconciler.Reconcile",
		tele.KVP("namespace", req.Namespace),
		tele.KVP("name", req.Name),
		tele.KVP("kind", "AzureMachine"),
	)
	defer done()

	// Fetch the AzureMachine VM.
	azureMachine := &infrav1.AzureMachine{}
	err := amr.Get(ctx, req.NamespacedName, azureMachine)
	if err != nil {
		if apierrors.IsNotFound(err) {
			return reconcile.Result{}, nil
		}
		return reconcile.Result{}, err
	}

	// Fetch the Machine.
	machine, err := util.GetOwnerMachine(ctx, amr.Client, azureMachine.ObjectMeta)
	if err != nil {
		return reconcile.Result{}, err
	}
	if machine == nil {
		amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "Machine controller dependency not yet met", "Machine Controller has not yet set OwnerRef")
		log.Info("Machine Controller has not yet set OwnerRef")
		return reconcile.Result{}, nil
	}

	log = log.WithValues("machine", machine.Name)

	// Fetch the Cluster.
	cluster, err := util.GetClusterFromMetadata(ctx, amr.Client, machine.ObjectMeta)
	if err != nil {
		amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "Unable to get cluster from metadata", "Machine is missing cluster label or cluster does not exist")
		log.Info("Machine is missing cluster label or cluster does not exist")
		return reconcile.Result{}, nil
	}

	log = log.WithValues("cluster", cluster.Name)

	log = log.WithValues("AzureCluster", cluster.Spec.InfrastructureRef.Name)
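	// Fetch the AzureCluster that backs this Cluster's infrastructure reference.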
	azureClusterName := client.ObjectKey{
		Namespace: azureMachine.Namespace,
		Name:      cluster.Spec.InfrastructureRef.Name,
	}
	azureCluster := &infrav1.AzureCluster{}
	if err := amr.Client.Get(ctx, azureClusterName, azureCluster); err != nil {
		amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "AzureCluster unavailable", "AzureCluster is not available yet")
		log.Info("AzureCluster is not available yet")
		return reconcile.Result{}, nil
	}

	// Create the cluster scope
	clusterScope, err := scope.NewClusterScope(ctx, scope.ClusterScopeParams{
		Client:       amr.Client,
		Cluster:      cluster,
		AzureCluster: azureCluster,
		Timeouts:     amr.Timeouts,
	})
	if err != nil {
		amr.Recorder.Eventf(azureCluster, corev1.EventTypeWarning, "Error creating the cluster scope", err.Error())
		return reconcile.Result{}, err
	}

	// Create the machine scope
	machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{
		Client:       amr.Client,
		Machine:      machine,
		AzureMachine: azureMachine,
		ClusterScope: clusterScope,
	})
	if err != nil {
		amr.Recorder.Eventf(azureMachine, corev1.EventTypeWarning, "Error creating the machine scope", err.Error())
		return reconcile.Result{}, errors.Wrap(err, "failed to create scope")
	}

	// Always close the scope when exiting this function so we can persist any AzureMachine changes.
	defer func() {
		if err := machineScope.Close(ctx); err != nil && reterr == nil {
			reterr = err
		}
	}()

	// Return early if the object or Cluster is paused.
	if annotations.IsPaused(cluster, azureMachine) {
		log.Info("AzureMachine or linked Cluster is marked as paused. Won't reconcile normally")
		return amr.reconcilePause(ctx, machineScope)
	}

	// Handle deleted machines
	if !azureMachine.ObjectMeta.DeletionTimestamp.IsZero() {
		return amr.reconcileDelete(ctx, machineScope, clusterScope)
	}

	// Handle non-deleted machines
	return amr.reconcileNormal(ctx, machineScope, clusterScope)
}

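// reconcileNormal handles an AzureMachine that is neither paused nor being deleted: it ensures
// the finalizer and block-move annotation are present, waits for the cluster infrastructure and
// bootstrap data to be ready, and then drives the azureMachineService to create or update the
// underlying Azure resources.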
func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineScope *scope.MachineScope, clusterScope *scope.ClusterScope) (reconcile.Result, error) {
	ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.reconcileNormal")
	defer done()

	log.Info("Reconciling AzureMachine")
	// If the AzureMachine is in an error state, return early.
	if machineScope.AzureMachine.Status.FailureReason != nil || machineScope.AzureMachine.Status.FailureMessage != nil {
		log.Info("Error state detected, skipping reconciliation")
		return reconcile.Result{}, nil
	}

	// Register our finalizer immediately to avoid orphaning Azure resources on delete
	needsPatch := controllerutil.AddFinalizer(machineScope.AzureMachine, infrav1.MachineFinalizer)
	// Register the block-move annotation immediately to avoid moving un-paused ASO resources
	needsPatch = AddBlockMoveAnnotation(machineScope.AzureMachine) || needsPatch
	if needsPatch {
		if err := machineScope.PatchObject(ctx); err != nil {
			return reconcile.Result{}, err
		}
	}

	// Make sure the Cluster Infrastructure is ready.
	if !clusterScope.Cluster.Status.InfrastructureReady {
		log.Info("Cluster infrastructure is not ready yet")
		conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
		return reconcile.Result{}, nil
	}

	// Make sure bootstrap data is available and populated.
	if machineScope.Machine.Spec.Bootstrap.DataSecretName == nil {
		log.Info("Bootstrap data secret reference is not yet available")
		conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "")
		return reconcile.Result{}, nil
	}

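	// reconcileError is matched with errors.As below to distinguish provider-specific terminal
	// errors (which mark the machine as failed) from transient ones (which requeue).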
	var reconcileError azure.ReconcileError

	// Initialize the cache to be used by the AzureMachine services.
	err := machineScope.InitMachineCache(ctx)
	if err != nil {
		if errors.As(err, &reconcileError) && reconcileError.IsTerminal() {
			amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "SKUNotFound", errors.Wrap(err, "failed to initialize machine cache").Error())
			log.Error(err, "Failed to initialize machine cache")
			machineScope.SetFailureReason(capierrors.InvalidConfigurationMachineError)
			machineScope.SetFailureMessage(err)
			machineScope.SetNotReady()
			return reconcile.Result{}, nil
		}
		return reconcile.Result{}, errors.Wrap(err, "failed to init machine scope cache")
	}

	// Mark the AzureMachine as failed if the identities are not ready.
	cond := conditions.Get(machineScope.AzureMachine, infrav1.VMIdentitiesReadyCondition)
	if cond != nil && cond.Status == corev1.ConditionFalse && cond.Reason == infrav1.UserAssignedIdentityMissingReason {
		amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, infrav1.UserAssignedIdentityMissingReason, "VM is unhealthy")
		machineScope.SetFailureReason(capierrors.UnsupportedChangeMachineError)
		machineScope.SetFailureMessage(errors.New("VM identities are not ready"))
		return reconcile.Result{}, errors.New("VM identities are not ready")
	}

	ams, err := amr.createAzureMachineService(machineScope)
	if err != nil {
		return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service")
	}

	if err := ams.Reconcile(ctx); err != nil {
		// This means that a VM was created and managed by this controller, but is not present anymore.
		// In this case, we mark it as failed and leave remediation to the MachineHealthCheck (MHC) controller.
		if errors.As(err, &azure.VMDeletedError{}) {
			amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "VMDeleted", errors.Wrap(err, "failed to reconcile AzureMachine").Error())
			machineScope.SetFailureReason(capierrors.UpdateMachineError)
			machineScope.SetFailureMessage(err)
			machineScope.SetNotReady()
			machineScope.SetVMState(infrav1.Deleted)
			return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachine")
		}

		// Handle transient and terminal errors
		if errors.As(err, &reconcileError) {
			if reconcileError.IsTerminal() {
				amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "ReconcileError", errors.Wrapf(err, "failed to reconcile AzureMachine").Error())
				log.Error(err, "failed to reconcile AzureMachine", "name", machineScope.Name())
				machineScope.SetFailureReason(capierrors.CreateMachineError)
				machineScope.SetFailureMessage(err)
				machineScope.SetNotReady()
				machineScope.SetVMState(infrav1.Failed)
				return reconcile.Result{}, nil
			}

			if reconcileError.IsTransient() {
				if azure.IsOperationNotDoneError(reconcileError) {
					log.V(2).Info(fmt.Sprintf("AzureMachine reconcile not done: %s", reconcileError.Error()))
				} else {
					log.V(2).Info(fmt.Sprintf("transient failure to reconcile AzureMachine, retrying: %s", reconcileError.Error()))
				}
				return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
			}
		}
		amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "ReconcileError", errors.Wrapf(err, "failed to reconcile AzureMachine").Error())
		return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachine")
	}

	machineScope.SetReady()

	return reconcile.Result{}, nil
}

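// reconcilePause pauses the Azure services backing this AzureMachine and then clears the
// block-move annotation so that the object can be moved (e.g. by clusterctl move) while paused.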
//nolint:unparam // Always returns an empty struct for reconcile.Result
func (amr *AzureMachineReconciler) reconcilePause(ctx context.Context, machineScope *scope.MachineScope) (reconcile.Result, error) {
	ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachine.reconcilePause")
	defer done()

	log.Info("Reconciling AzureMachine pause")

	ams, err := amr.createAzureMachineService(machineScope)
	if err != nil {
		return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service")
	}

	if err := ams.Pause(ctx); err != nil {
		return reconcile.Result{}, errors.Wrap(err, "failed to pause azure machine services")
	}
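	// The services are paused now, so it is safe to lift the block-move annotation that was
	// added in reconcileNormal.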
	RemoveBlockMoveAnnotation(machineScope.AzureMachine)

	return reconcile.Result{}, nil
}

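// reconcileDelete tears down the Azure resources backing a deleted AzureMachine (unless the
// whole resource group is going to be deleted anyway) and then removes the finalizer.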
func (amr *AzureMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.MachineScope, clusterScope *scope.ClusterScope) (reconcile.Result, error) {
	ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.reconcileDelete")
	defer done()

	log.Info("Handling deleted AzureMachine")
	conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
	if err := machineScope.PatchObject(ctx); err != nil {
		return reconcile.Result{}, err
	}

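	// Only delete the machine's resources one by one when the resource group is not going to
	// be deleted as a whole; otherwise deleting the group cleans them up.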
	if ShouldDeleteIndividualResources(ctx, clusterScope) {
		log.Info("Deleting AzureMachine")
		ams, err := amr.createAzureMachineService(machineScope)
		if err != nil {
			return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service")
		}

		if err := ams.Delete(ctx); err != nil {
			// Handle transient errors
			var reconcileError azure.ReconcileError
			if errors.As(err, &reconcileError) {
				if reconcileError.IsTransient() {
					if azure.IsOperationNotDoneError(reconcileError) {
						log.V(2).Info(fmt.Sprintf("AzureMachine delete not done: %s", reconcileError.Error()))
					} else {
						log.V(2).Info("transient failure to delete AzureMachine, retrying")
					}
					return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
				}
			}

			amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "Error deleting AzureMachine", errors.Wrapf(err, "error deleting AzureMachine %s/%s", machineScope.Namespace(), machineScope.Name()).Error())
			return reconcile.Result{}, errors.Wrapf(err, "error deleting AzureMachine %s/%s", machineScope.Namespace(), machineScope.Name())
		}
	} else {
		log.Info("Skipping AzureMachine Deletion; will delete whole resource group.")
	}

	// we're done deleting this AzureMachine so remove the finalizer.
	log.Info("Removing finalizer from AzureMachine")
	controllerutil.RemoveFinalizer(machineScope.AzureMachine, infrav1.MachineFinalizer)

	return reconcile.Result{}, nil
}