sigs.k8s.io/cluster-api-provider-azure@v1.17.0/controllers/azuremachine_controller.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controllers
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  
    23  	"github.com/pkg/errors"
    24  	corev1 "k8s.io/api/core/v1"
    25  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    26  	"k8s.io/client-go/tools/record"
    27  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    28  	"sigs.k8s.io/cluster-api-provider-azure/azure"
    29  	"sigs.k8s.io/cluster-api-provider-azure/azure/scope"
    30  	"sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing"
    31  	"sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
    32  	"sigs.k8s.io/cluster-api-provider-azure/util/tele"
    33  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    34  	capierrors "sigs.k8s.io/cluster-api/errors"
    35  	"sigs.k8s.io/cluster-api/util"
    36  	"sigs.k8s.io/cluster-api/util/annotations"
    37  	"sigs.k8s.io/cluster-api/util/conditions"
    38  	"sigs.k8s.io/cluster-api/util/predicates"
    39  	ctrl "sigs.k8s.io/controller-runtime"
    40  	"sigs.k8s.io/controller-runtime/pkg/builder"
    41  	"sigs.k8s.io/controller-runtime/pkg/client"
    42  	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    43  	"sigs.k8s.io/controller-runtime/pkg/handler"
    44  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    45  )
    46  
// AzureMachineReconciler reconciles an AzureMachine object.
//
// It embeds a controller-runtime client.Client, which is used to fetch the
// AzureMachine and the objects it is linked to. Recorder emits Kubernetes
// events describing reconcile progress and failures. Timeouts supplies the
// per-reconcile loop timeout and is also handed to the cluster scope.
// WatchFilterValue is the value passed to the filter-label predicates when the
// controller's watches are set up. createAzureMachineService is the factory
// used to build the azureMachineService for a machine scope; it is unexported
// and set by NewAzureMachineReconciler.
type AzureMachineReconciler struct {
	client.Client
	Recorder                  record.EventRecorder
	Timeouts                  reconciler.Timeouts
	WatchFilterValue          string
	createAzureMachineService azureMachineServiceCreator
}
    55  
// azureMachineServiceCreator builds the azureMachineService that operates on
// the given machine scope, or returns an error if the service cannot be created.
type azureMachineServiceCreator func(machineScope *scope.MachineScope) (*azureMachineService, error)
    57  
    58  // NewAzureMachineReconciler returns a new AzureMachineReconciler instance.
    59  func NewAzureMachineReconciler(client client.Client, recorder record.EventRecorder, timeouts reconciler.Timeouts, watchFilterValue string) *AzureMachineReconciler {
    60  	amr := &AzureMachineReconciler{
    61  		Client:           client,
    62  		Recorder:         recorder,
    63  		Timeouts:         timeouts,
    64  		WatchFilterValue: watchFilterValue,
    65  	}
    66  
    67  	amr.createAzureMachineService = newAzureMachineService
    68  
    69  	return amr
    70  }
    71  
    72  // SetupWithManager initializes this controller with a manager.
    73  func (amr *AzureMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options Options) error {
    74  	ctx, log, done := tele.StartSpanWithLogger(ctx,
    75  		"controllers.AzureMachineReconciler.SetupWithManager",
    76  		tele.KVP("controller", "AzureMachine"),
    77  	)
    78  	defer done()
    79  
    80  	var r reconcile.Reconciler = amr
    81  	if options.Cache != nil {
    82  		r = coalescing.NewReconciler(amr, options.Cache, log)
    83  	}
    84  
    85  	// create mapper to transform incoming AzureClusters into AzureMachine requests
    86  	azureClusterToAzureMachinesMapper, err := AzureClusterToAzureMachinesMapper(ctx, amr.Client, &infrav1.AzureMachineList{}, mgr.GetScheme(), log)
    87  	if err != nil {
    88  		return errors.Wrap(err, "failed to create AzureCluster to AzureMachines mapper")
    89  	}
    90  
    91  	azureMachineMapper, err := util.ClusterToTypedObjectsMapper(amr.Client, &infrav1.AzureMachineList{}, mgr.GetScheme())
    92  	if err != nil {
    93  		return errors.Wrap(err, "failed to create mapper for Cluster to AzureMachines")
    94  	}
    95  
    96  	return ctrl.NewControllerManagedBy(mgr).
    97  		WithOptions(options.Options).
    98  		For(&infrav1.AzureMachine{}).
    99  		WithEventFilter(predicates.ResourceHasFilterLabel(log, amr.WatchFilterValue)).
   100  		// watch for changes in CAPI Machine resources
   101  		Watches(
   102  			&clusterv1.Machine{},
   103  			handler.EnqueueRequestsFromMapFunc(util.MachineToInfrastructureMapFunc(infrav1.GroupVersion.WithKind("AzureMachine"))),
   104  		).
   105  		// watch for changes in AzureCluster
   106  		Watches(
   107  			&infrav1.AzureCluster{},
   108  			handler.EnqueueRequestsFromMapFunc(azureClusterToAzureMachinesMapper),
   109  		).
   110  		// Add a watch on clusterv1.Cluster object for pause/unpause & ready notifications.
   111  		Watches(
   112  			&clusterv1.Cluster{},
   113  			handler.EnqueueRequestsFromMapFunc(azureMachineMapper),
   114  			builder.WithPredicates(
   115  				ClusterPauseChangeAndInfrastructureReady(log),
   116  				predicates.ResourceHasFilterLabel(log, amr.WatchFilterValue),
   117  			),
   118  		).
   119  		Complete(r)
   120  }
   121  
   122  // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachines,verbs=get;list;watch;create;update;patch;delete
   123  // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachines/status,verbs=get;update;patch
   124  // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch
   125  // +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch
   126  // +kubebuilder:rbac:groups="",resources=secrets;,verbs=get;list;watch
   127  
   128  // Reconcile idempotently gets, creates, and updates a machine.
   129  func (amr *AzureMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
   130  	ctx, cancel := context.WithTimeout(ctx, amr.Timeouts.DefaultedLoopTimeout())
   131  	defer cancel()
   132  
   133  	ctx, log, done := tele.StartSpanWithLogger(
   134  		ctx,
   135  		"controllers.AzureMachineReconciler.Reconcile",
   136  		tele.KVP("namespace", req.Namespace),
   137  		tele.KVP("name", req.Name),
   138  		tele.KVP("kind", "AzureMachine"),
   139  	)
   140  	defer done()
   141  
   142  	// Fetch the AzureMachine VM.
   143  	azureMachine := &infrav1.AzureMachine{}
   144  	err := amr.Get(ctx, req.NamespacedName, azureMachine)
   145  	if err != nil {
   146  		if apierrors.IsNotFound(err) {
   147  			return reconcile.Result{}, nil
   148  		}
   149  		return reconcile.Result{}, err
   150  	}
   151  
   152  	// Fetch the Machine.
   153  	machine, err := util.GetOwnerMachine(ctx, amr.Client, azureMachine.ObjectMeta)
   154  	if err != nil {
   155  		return reconcile.Result{}, err
   156  	}
   157  	if machine == nil {
   158  		amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "Machine controller dependency not yet met", "Machine Controller has not yet set OwnerRef")
   159  		log.Info("Machine Controller has not yet set OwnerRef")
   160  		return reconcile.Result{}, nil
   161  	}
   162  
   163  	log = log.WithValues("machine", machine.Name)
   164  
   165  	// Fetch the Cluster.
   166  	cluster, err := util.GetClusterFromMetadata(ctx, amr.Client, machine.ObjectMeta)
   167  	if err != nil {
   168  		amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "Unable to get cluster from metadata", "Machine is missing cluster label or cluster does not exist")
   169  		log.Info("Machine is missing cluster label or cluster does not exist")
   170  		return reconcile.Result{}, nil
   171  	}
   172  
   173  	log = log.WithValues("cluster", cluster.Name)
   174  
   175  	log = log.WithValues("AzureCluster", cluster.Spec.InfrastructureRef.Name)
   176  	azureClusterName := client.ObjectKey{
   177  		Namespace: azureMachine.Namespace,
   178  		Name:      cluster.Spec.InfrastructureRef.Name,
   179  	}
   180  	azureCluster := &infrav1.AzureCluster{}
   181  	if err := amr.Client.Get(ctx, azureClusterName, azureCluster); err != nil {
   182  		amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "AzureCluster unavailable", "AzureCluster is not available yet")
   183  		log.Info("AzureCluster is not available yet")
   184  		return reconcile.Result{}, nil
   185  	}
   186  
   187  	// Create the cluster scope
   188  	clusterScope, err := scope.NewClusterScope(ctx, scope.ClusterScopeParams{
   189  		Client:       amr.Client,
   190  		Cluster:      cluster,
   191  		AzureCluster: azureCluster,
   192  		Timeouts:     amr.Timeouts,
   193  	})
   194  	if err != nil {
   195  		amr.Recorder.Eventf(azureCluster, corev1.EventTypeWarning, "Error creating the cluster scope", err.Error())
   196  		return reconcile.Result{}, err
   197  	}
   198  
   199  	// Create the machine scope
   200  	machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{
   201  		Client:       amr.Client,
   202  		Machine:      machine,
   203  		AzureMachine: azureMachine,
   204  		ClusterScope: clusterScope,
   205  	})
   206  	if err != nil {
   207  		amr.Recorder.Eventf(azureMachine, corev1.EventTypeWarning, "Error creating the machine scope", err.Error())
   208  		return reconcile.Result{}, errors.Wrap(err, "failed to create scope")
   209  	}
   210  
   211  	// Always close the scope when exiting this function so we can persist any AzureMachine changes.
   212  	defer func() {
   213  		if err := machineScope.Close(ctx); err != nil && reterr == nil {
   214  			reterr = err
   215  		}
   216  	}()
   217  
   218  	// Return early if the object or Cluster is paused.
   219  	if annotations.IsPaused(cluster, azureMachine) {
   220  		log.Info("AzureMachine or linked Cluster is marked as paused. Won't reconcile normally")
   221  		return amr.reconcilePause(ctx, machineScope)
   222  	}
   223  
   224  	// Handle deleted machines
   225  	if !azureMachine.ObjectMeta.DeletionTimestamp.IsZero() {
   226  		return amr.reconcileDelete(ctx, machineScope, clusterScope)
   227  	}
   228  
   229  	// Handle non-deleted machines
   230  	return amr.reconcileNormal(ctx, machineScope, clusterScope)
   231  }
   232  
   233  func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineScope *scope.MachineScope, clusterScope *scope.ClusterScope) (reconcile.Result, error) {
   234  	ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.reconcileNormal")
   235  	defer done()
   236  
   237  	log.Info("Reconciling AzureMachine")
   238  	// If the AzureMachine is in an error state, return early.
   239  	if machineScope.AzureMachine.Status.FailureReason != nil || machineScope.AzureMachine.Status.FailureMessage != nil {
   240  		log.Info("Error state detected, skipping reconciliation")
   241  		return reconcile.Result{}, nil
   242  	}
   243  
   244  	// Register our finalizer immediately to avoid orphaning Azure resources on delete
   245  	needsPatch := controllerutil.AddFinalizer(machineScope.AzureMachine, infrav1.MachineFinalizer)
   246  	// Register the block-move annotation immediately to avoid moving un-paused ASO resources
   247  	needsPatch = AddBlockMoveAnnotation(machineScope.AzureMachine) || needsPatch
   248  	if needsPatch {
   249  		if err := machineScope.PatchObject(ctx); err != nil {
   250  			return reconcile.Result{}, err
   251  		}
   252  	}
   253  
   254  	// Make sure the Cluster Infrastructure is ready.
   255  	if !clusterScope.Cluster.Status.InfrastructureReady {
   256  		log.Info("Cluster infrastructure is not ready yet")
   257  		conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
   258  		return reconcile.Result{}, nil
   259  	}
   260  
   261  	// Make sure bootstrap data is available and populated.
   262  	if machineScope.Machine.Spec.Bootstrap.DataSecretName == nil {
   263  		log.Info("Bootstrap data secret reference is not yet available")
   264  		conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "")
   265  		return reconcile.Result{}, nil
   266  	}
   267  
   268  	var reconcileError azure.ReconcileError
   269  
   270  	// Initialize the cache to be used by the AzureMachine services.
   271  	err := machineScope.InitMachineCache(ctx)
   272  	if err != nil {
   273  		if errors.As(err, &reconcileError) && reconcileError.IsTerminal() {
   274  			amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "SKUNotFound", errors.Wrap(err, "failed to initialize machine cache").Error())
   275  			log.Error(err, "Failed to initialize machine cache")
   276  			machineScope.SetFailureReason(capierrors.InvalidConfigurationMachineError)
   277  			machineScope.SetFailureMessage(err)
   278  			machineScope.SetNotReady()
   279  			return reconcile.Result{}, nil
   280  		}
   281  		return reconcile.Result{}, errors.Wrap(err, "failed to init machine scope cache")
   282  	}
   283  
   284  	// Mark the AzureMachine as failed if the identities are not ready.
   285  	cond := conditions.Get(machineScope.AzureMachine, infrav1.VMIdentitiesReadyCondition)
   286  	if cond != nil && cond.Status == corev1.ConditionFalse && cond.Reason == infrav1.UserAssignedIdentityMissingReason {
   287  		amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, infrav1.UserAssignedIdentityMissingReason, "VM is unhealthy")
   288  		machineScope.SetFailureReason(capierrors.UnsupportedChangeMachineError)
   289  		machineScope.SetFailureMessage(errors.New("VM identities are not ready"))
   290  		return reconcile.Result{}, errors.New("VM identities are not ready")
   291  	}
   292  
   293  	ams, err := amr.createAzureMachineService(machineScope)
   294  	if err != nil {
   295  		return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service")
   296  	}
   297  
   298  	if err := ams.Reconcile(ctx); err != nil {
   299  		// This means that a VM was created and managed by this controller, but is not present anymore.
   300  		// In this case, we mark it as failed and leave it to MHC for remediation
   301  		if errors.As(err, &azure.VMDeletedError{}) {
   302  			amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "VMDeleted", errors.Wrap(err, "failed to reconcile AzureMachine").Error())
   303  			machineScope.SetFailureReason(capierrors.UpdateMachineError)
   304  			machineScope.SetFailureMessage(err)
   305  			machineScope.SetNotReady()
   306  			machineScope.SetVMState(infrav1.Deleted)
   307  			return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachine")
   308  		}
   309  
   310  		// Handle transient and terminal errors
   311  		if errors.As(err, &reconcileError) {
   312  			if reconcileError.IsTerminal() {
   313  				amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "ReconcileError", errors.Wrapf(err, "failed to reconcile AzureMachine").Error())
   314  				log.Error(err, "failed to reconcile AzureMachine", "name", machineScope.Name())
   315  				machineScope.SetFailureReason(capierrors.CreateMachineError)
   316  				machineScope.SetFailureMessage(err)
   317  				machineScope.SetNotReady()
   318  				machineScope.SetVMState(infrav1.Failed)
   319  				return reconcile.Result{}, nil
   320  			}
   321  
   322  			if reconcileError.IsTransient() {
   323  				if azure.IsOperationNotDoneError(reconcileError) {
   324  					log.V(2).Info(fmt.Sprintf("AzureMachine reconcile not done: %s", reconcileError.Error()))
   325  				} else {
   326  					log.V(2).Info(fmt.Sprintf("transient failure to reconcile AzureMachine, retrying: %s", reconcileError.Error()))
   327  				}
   328  				return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
   329  			}
   330  		}
   331  		amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "ReconcileError", errors.Wrapf(err, "failed to reconcile AzureMachine").Error())
   332  		return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachine")
   333  	}
   334  
   335  	machineScope.SetReady()
   336  
   337  	return reconcile.Result{}, nil
   338  }
   339  
   340  //nolint:unparam // Always returns an empty struct for reconcile.Result
   341  func (amr *AzureMachineReconciler) reconcilePause(ctx context.Context, machineScope *scope.MachineScope) (reconcile.Result, error) {
   342  	ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachine.reconcilePause")
   343  	defer done()
   344  
   345  	log.Info("Reconciling AzureMachine pause")
   346  
   347  	ams, err := amr.createAzureMachineService(machineScope)
   348  	if err != nil {
   349  		return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service")
   350  	}
   351  
   352  	if err := ams.Pause(ctx); err != nil {
   353  		return reconcile.Result{}, errors.Wrap(err, "failed to pause azure machine services")
   354  	}
   355  	RemoveBlockMoveAnnotation(machineScope.AzureMachine)
   356  
   357  	return reconcile.Result{}, nil
   358  }
   359  
   360  func (amr *AzureMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.MachineScope, clusterScope *scope.ClusterScope) (reconcile.Result, error) {
   361  	ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.reconcileDelete")
   362  	defer done()
   363  
   364  	log.Info("Handling deleted AzureMachine")
   365  	conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
   366  	if err := machineScope.PatchObject(ctx); err != nil {
   367  		return reconcile.Result{}, err
   368  	}
   369  
   370  	if ShouldDeleteIndividualResources(ctx, clusterScope) {
   371  		log.Info("Deleting AzureMachine")
   372  		ams, err := amr.createAzureMachineService(machineScope)
   373  		if err != nil {
   374  			return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service")
   375  		}
   376  
   377  		if err := ams.Delete(ctx); err != nil {
   378  			// Handle transient errors
   379  			var reconcileError azure.ReconcileError
   380  			if errors.As(err, &reconcileError) {
   381  				if reconcileError.IsTransient() {
   382  					if azure.IsOperationNotDoneError(reconcileError) {
   383  						log.V(2).Info(fmt.Sprintf("AzureMachine delete not done: %s", reconcileError.Error()))
   384  					} else {
   385  						log.V(2).Info("transient failure to delete AzureMachine, retrying")
   386  					}
   387  					return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
   388  				}
   389  			}
   390  
   391  			amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "Error deleting AzureMachine", errors.Wrapf(err, "error deleting AzureMachine %s/%s", machineScope.Namespace(), machineScope.Name()).Error())
   392  			return reconcile.Result{}, errors.Wrapf(err, "error deleting AzureMachine %s/%s", machineScope.Namespace(), machineScope.Name())
   393  		}
   394  	} else {
   395  		log.Info("Skipping AzureMachine Deletion; will delete whole resource group.")
   396  	}
   397  
   398  	// we're done deleting this AzureMachine so remove the finalizer.
   399  	log.Info("Removing finalizer from AzureMachine")
   400  	controllerutil.RemoveFinalizer(machineScope.AzureMachine, infrav1.MachineFinalizer)
   401  
   402  	return reconcile.Result{}, nil
   403  }