sigs.k8s.io/cluster-api-provider-azure@v1.14.3/controllers/azuremachine_controller.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package controllers 18 19 import ( 20 "context" 21 "fmt" 22 23 "github.com/pkg/errors" 24 corev1 "k8s.io/api/core/v1" 25 apierrors "k8s.io/apimachinery/pkg/api/errors" 26 "k8s.io/client-go/tools/record" 27 infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" 28 "sigs.k8s.io/cluster-api-provider-azure/azure" 29 "sigs.k8s.io/cluster-api-provider-azure/azure/scope" 30 "sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing" 31 "sigs.k8s.io/cluster-api-provider-azure/util/reconciler" 32 "sigs.k8s.io/cluster-api-provider-azure/util/tele" 33 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 34 capierrors "sigs.k8s.io/cluster-api/errors" 35 "sigs.k8s.io/cluster-api/util" 36 "sigs.k8s.io/cluster-api/util/annotations" 37 "sigs.k8s.io/cluster-api/util/conditions" 38 "sigs.k8s.io/cluster-api/util/predicates" 39 ctrl "sigs.k8s.io/controller-runtime" 40 "sigs.k8s.io/controller-runtime/pkg/client" 41 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 42 "sigs.k8s.io/controller-runtime/pkg/handler" 43 "sigs.k8s.io/controller-runtime/pkg/reconcile" 44 "sigs.k8s.io/controller-runtime/pkg/source" 45 ) 46 47 // AzureMachineReconciler reconciles an AzureMachine object. 48 type AzureMachineReconciler struct { 49 client.Client 50 Recorder record.EventRecorder 51 Timeouts reconciler.Timeouts 52 WatchFilterValue string 53 createAzureMachineService azureMachineServiceCreator 54 } 55 56 type azureMachineServiceCreator func(machineScope *scope.MachineScope) (*azureMachineService, error) 57 58 // NewAzureMachineReconciler returns a new AzureMachineReconciler instance. 59 func NewAzureMachineReconciler(client client.Client, recorder record.EventRecorder, timeouts reconciler.Timeouts, watchFilterValue string) *AzureMachineReconciler { 60 amr := &AzureMachineReconciler{ 61 Client: client, 62 Recorder: recorder, 63 Timeouts: timeouts, 64 WatchFilterValue: watchFilterValue, 65 } 66 67 amr.createAzureMachineService = newAzureMachineService 68 69 return amr 70 } 71 72 // SetupWithManager initializes this controller with a manager. 73 func (amr *AzureMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options Options) error { 74 ctx, log, done := tele.StartSpanWithLogger(ctx, 75 "controllers.AzureMachineReconciler.SetupWithManager", 76 tele.KVP("controller", "AzureMachine"), 77 ) 78 defer done() 79 80 var r reconcile.Reconciler = amr 81 if options.Cache != nil { 82 r = coalescing.NewReconciler(amr, options.Cache, log) 83 } 84 85 // create mapper to transform incoming AzureClusters into AzureMachine requests 86 azureClusterToAzureMachinesMapper, err := AzureClusterToAzureMachinesMapper(ctx, amr.Client, &infrav1.AzureMachineList{}, mgr.GetScheme(), log) 87 if err != nil { 88 return errors.Wrap(err, "failed to create AzureCluster to AzureMachines mapper") 89 } 90 91 c, err := ctrl.NewControllerManagedBy(mgr). 92 WithOptions(options.Options). 93 For(&infrav1.AzureMachine{}). 94 WithEventFilter(predicates.ResourceHasFilterLabel(log, amr.WatchFilterValue)). 95 // watch for changes in CAPI Machine resources 96 Watches( 97 &clusterv1.Machine{}, 98 handler.EnqueueRequestsFromMapFunc(util.MachineToInfrastructureMapFunc(infrav1.GroupVersion.WithKind("AzureMachine"))), 99 ). 100 // watch for changes in AzureCluster 101 Watches( 102 &infrav1.AzureCluster{}, 103 handler.EnqueueRequestsFromMapFunc(azureClusterToAzureMachinesMapper), 104 ). 105 Build(r) 106 if err != nil { 107 return errors.Wrap(err, "error creating controller") 108 } 109 110 azureMachineMapper, err := util.ClusterToTypedObjectsMapper(amr.Client, &infrav1.AzureMachineList{}, mgr.GetScheme()) 111 if err != nil { 112 return errors.Wrap(err, "failed to create mapper for Cluster to AzureMachines") 113 } 114 115 // Add a watch on clusterv1.Cluster object for pause/unpause & ready notifications. 116 if err := c.Watch( 117 source.Kind(mgr.GetCache(), &clusterv1.Cluster{}), 118 handler.EnqueueRequestsFromMapFunc(azureMachineMapper), 119 ClusterPauseChangeAndInfrastructureReady(log), 120 predicates.ResourceHasFilterLabel(log, amr.WatchFilterValue), 121 ); err != nil { 122 return errors.Wrap(err, "failed adding a watch for ready clusters") 123 } 124 125 return nil 126 } 127 128 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachines,verbs=get;list;watch;create;update;patch;delete 129 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachines/status,verbs=get;update;patch 130 // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch 131 // +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch 132 // +kubebuilder:rbac:groups="",resources=secrets;,verbs=get;list;watch 133 134 // Reconcile idempotently gets, creates, and updates a machine. 135 func (amr *AzureMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { 136 ctx, cancel := context.WithTimeout(ctx, amr.Timeouts.DefaultedLoopTimeout()) 137 defer cancel() 138 139 ctx, log, done := tele.StartSpanWithLogger( 140 ctx, 141 "controllers.AzureMachineReconciler.Reconcile", 142 tele.KVP("namespace", req.Namespace), 143 tele.KVP("name", req.Name), 144 tele.KVP("kind", "AzureMachine"), 145 ) 146 defer done() 147 148 // Fetch the AzureMachine VM. 149 azureMachine := &infrav1.AzureMachine{} 150 err := amr.Get(ctx, req.NamespacedName, azureMachine) 151 if err != nil { 152 if apierrors.IsNotFound(err) { 153 return reconcile.Result{}, nil 154 } 155 return reconcile.Result{}, err 156 } 157 158 // Fetch the Machine. 159 machine, err := util.GetOwnerMachine(ctx, amr.Client, azureMachine.ObjectMeta) 160 if err != nil { 161 return reconcile.Result{}, err 162 } 163 if machine == nil { 164 amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "Machine controller dependency not yet met", "Machine Controller has not yet set OwnerRef") 165 log.Info("Machine Controller has not yet set OwnerRef") 166 return reconcile.Result{}, nil 167 } 168 169 log = log.WithValues("machine", machine.Name) 170 171 // Fetch the Cluster. 172 cluster, err := util.GetClusterFromMetadata(ctx, amr.Client, machine.ObjectMeta) 173 if err != nil { 174 amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "Unable to get cluster from metadata", "Machine is missing cluster label or cluster does not exist") 175 log.Info("Machine is missing cluster label or cluster does not exist") 176 return reconcile.Result{}, nil 177 } 178 179 log = log.WithValues("cluster", cluster.Name) 180 181 log = log.WithValues("AzureCluster", cluster.Spec.InfrastructureRef.Name) 182 azureClusterName := client.ObjectKey{ 183 Namespace: azureMachine.Namespace, 184 Name: cluster.Spec.InfrastructureRef.Name, 185 } 186 azureCluster := &infrav1.AzureCluster{} 187 if err := amr.Client.Get(ctx, azureClusterName, azureCluster); err != nil { 188 amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "AzureCluster unavailable", "AzureCluster is not available yet") 189 log.Info("AzureCluster is not available yet") 190 return reconcile.Result{}, nil 191 } 192 193 // Create the cluster scope 194 clusterScope, err := scope.NewClusterScope(ctx, scope.ClusterScopeParams{ 195 Client: amr.Client, 196 Cluster: cluster, 197 AzureCluster: azureCluster, 198 Timeouts: amr.Timeouts, 199 }) 200 if err != nil { 201 amr.Recorder.Eventf(azureCluster, corev1.EventTypeWarning, "Error creating the cluster scope", err.Error()) 202 return reconcile.Result{}, err 203 } 204 205 // Create the machine scope 206 machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{ 207 Client: amr.Client, 208 Machine: machine, 209 AzureMachine: azureMachine, 210 ClusterScope: clusterScope, 211 }) 212 if err != nil { 213 amr.Recorder.Eventf(azureMachine, corev1.EventTypeWarning, "Error creating the machine scope", err.Error()) 214 return reconcile.Result{}, errors.Wrap(err, "failed to create scope") 215 } 216 217 // Always close the scope when exiting this function so we can persist any AzureMachine changes. 218 defer func() { 219 if err := machineScope.Close(ctx); err != nil && reterr == nil { 220 reterr = err 221 } 222 }() 223 224 // Return early if the object or Cluster is paused. 225 if annotations.IsPaused(cluster, azureMachine) { 226 log.Info("AzureMachine or linked Cluster is marked as paused. Won't reconcile normally") 227 return amr.reconcilePause(ctx, machineScope) 228 } 229 230 // Handle deleted machines 231 if !azureMachine.ObjectMeta.DeletionTimestamp.IsZero() { 232 return amr.reconcileDelete(ctx, machineScope, clusterScope) 233 } 234 235 // Handle non-deleted machines 236 return amr.reconcileNormal(ctx, machineScope, clusterScope) 237 } 238 239 func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineScope *scope.MachineScope, clusterScope *scope.ClusterScope) (reconcile.Result, error) { 240 ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.reconcileNormal") 241 defer done() 242 243 log.Info("Reconciling AzureMachine") 244 // If the AzureMachine is in an error state, return early. 245 if machineScope.AzureMachine.Status.FailureReason != nil || machineScope.AzureMachine.Status.FailureMessage != nil { 246 log.Info("Error state detected, skipping reconciliation") 247 return reconcile.Result{}, nil 248 } 249 250 // Register our finalizer immediately to avoid orphaning Azure resources on delete 251 needsPatch := controllerutil.AddFinalizer(machineScope.AzureMachine, infrav1.MachineFinalizer) 252 // Register the block-move annotation immediately to avoid moving un-paused ASO resources 253 needsPatch = AddBlockMoveAnnotation(machineScope.AzureMachine) || needsPatch 254 if needsPatch { 255 if err := machineScope.PatchObject(ctx); err != nil { 256 return reconcile.Result{}, err 257 } 258 } 259 260 // Make sure the Cluster Infrastructure is ready. 261 if !clusterScope.Cluster.Status.InfrastructureReady { 262 log.Info("Cluster infrastructure is not ready yet") 263 conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "") 264 return reconcile.Result{}, nil 265 } 266 267 // Make sure bootstrap data is available and populated. 268 if machineScope.Machine.Spec.Bootstrap.DataSecretName == nil { 269 log.Info("Bootstrap data secret reference is not yet available") 270 conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "") 271 return reconcile.Result{}, nil 272 } 273 274 var reconcileError azure.ReconcileError 275 276 // Initialize the cache to be used by the AzureMachine services. 277 err := machineScope.InitMachineCache(ctx) 278 if err != nil { 279 if errors.As(err, &reconcileError) && reconcileError.IsTerminal() { 280 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "SKUNotFound", errors.Wrap(err, "failed to initialize machine cache").Error()) 281 log.Error(err, "Failed to initialize machine cache") 282 machineScope.SetFailureReason(capierrors.InvalidConfigurationMachineError) 283 machineScope.SetFailureMessage(err) 284 machineScope.SetNotReady() 285 return reconcile.Result{}, nil 286 } 287 return reconcile.Result{}, errors.Wrap(err, "failed to init machine scope cache") 288 } 289 290 // Mark the AzureMachine as failed if the identities are not ready. 291 cond := conditions.Get(machineScope.AzureMachine, infrav1.VMIdentitiesReadyCondition) 292 if cond != nil && cond.Status == corev1.ConditionFalse && cond.Reason == infrav1.UserAssignedIdentityMissingReason { 293 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, infrav1.UserAssignedIdentityMissingReason, "VM is unhealthy") 294 machineScope.SetFailureReason(capierrors.UnsupportedChangeMachineError) 295 machineScope.SetFailureMessage(errors.New("VM identities are not ready")) 296 return reconcile.Result{}, errors.New("VM identities are not ready") 297 } 298 299 ams, err := amr.createAzureMachineService(machineScope) 300 if err != nil { 301 return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service") 302 } 303 304 if err := ams.Reconcile(ctx); err != nil { 305 // This means that a VM was created and managed by this controller, but is not present anymore. 306 // In this case, we mark it as failed and leave it to MHC for remediation 307 if errors.As(err, &azure.VMDeletedError{}) { 308 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "VMDeleted", errors.Wrap(err, "failed to reconcile AzureMachine").Error()) 309 machineScope.SetFailureReason(capierrors.UpdateMachineError) 310 machineScope.SetFailureMessage(err) 311 machineScope.SetNotReady() 312 machineScope.SetVMState(infrav1.Deleted) 313 return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachine") 314 } 315 316 // Handle transient and terminal errors 317 if errors.As(err, &reconcileError) { 318 if reconcileError.IsTerminal() { 319 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "ReconcileError", errors.Wrapf(err, "failed to reconcile AzureMachine").Error()) 320 log.Error(err, "failed to reconcile AzureMachine", "name", machineScope.Name()) 321 machineScope.SetFailureReason(capierrors.CreateMachineError) 322 machineScope.SetFailureMessage(err) 323 machineScope.SetNotReady() 324 machineScope.SetVMState(infrav1.Failed) 325 return reconcile.Result{}, nil 326 } 327 328 if reconcileError.IsTransient() { 329 if azure.IsOperationNotDoneError(reconcileError) { 330 log.V(2).Info(fmt.Sprintf("AzureMachine reconcile not done: %s", reconcileError.Error())) 331 } else { 332 log.V(2).Info(fmt.Sprintf("transient failure to reconcile AzureMachine, retrying: %s", reconcileError.Error())) 333 } 334 return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil 335 } 336 } 337 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "ReconcileError", errors.Wrapf(err, "failed to reconcile AzureMachine").Error()) 338 return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachine") 339 } 340 341 machineScope.SetReady() 342 343 return reconcile.Result{}, nil 344 } 345 346 //nolint:unparam // Always returns an empty struct for reconcile.Result 347 func (amr *AzureMachineReconciler) reconcilePause(ctx context.Context, machineScope *scope.MachineScope) (reconcile.Result, error) { 348 ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachine.reconcilePause") 349 defer done() 350 351 log.Info("Reconciling AzureMachine pause") 352 353 ams, err := amr.createAzureMachineService(machineScope) 354 if err != nil { 355 return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service") 356 } 357 358 if err := ams.Pause(ctx); err != nil { 359 return reconcile.Result{}, errors.Wrap(err, "failed to pause azure machine services") 360 } 361 RemoveBlockMoveAnnotation(machineScope.AzureMachine) 362 363 return reconcile.Result{}, nil 364 } 365 366 func (amr *AzureMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.MachineScope, clusterScope *scope.ClusterScope) (reconcile.Result, error) { 367 ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.reconcileDelete") 368 defer done() 369 370 log.Info("Handling deleted AzureMachine") 371 conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "") 372 if err := machineScope.PatchObject(ctx); err != nil { 373 return reconcile.Result{}, err 374 } 375 376 if ShouldDeleteIndividualResources(ctx, clusterScope) { 377 log.Info("Deleting AzureMachine") 378 ams, err := amr.createAzureMachineService(machineScope) 379 if err != nil { 380 return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service") 381 } 382 383 if err := ams.Delete(ctx); err != nil { 384 // Handle transient errors 385 var reconcileError azure.ReconcileError 386 if errors.As(err, &reconcileError) { 387 if reconcileError.IsTransient() { 388 if azure.IsOperationNotDoneError(reconcileError) { 389 log.V(2).Info(fmt.Sprintf("AzureMachine delete not done: %s", reconcileError.Error())) 390 } else { 391 log.V(2).Info("transient failure to delete AzureMachine, retrying") 392 } 393 return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil 394 } 395 } 396 397 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "Error deleting AzureMachine", errors.Wrapf(err, "error deleting AzureMachine %s/%s", machineScope.Namespace(), machineScope.Name()).Error()) 398 return reconcile.Result{}, errors.Wrapf(err, "error deleting AzureMachine %s/%s", machineScope.Namespace(), machineScope.Name()) 399 } 400 } else { 401 log.Info("Skipping AzureMachine Deletion; will delete whole resource group.") 402 } 403 404 // we're done deleting this AzureMachine so remove the finalizer. 405 log.Info("Removing finalizer from AzureMachine") 406 controllerutil.RemoveFinalizer(machineScope.AzureMachine, infrav1.MachineFinalizer) 407 408 return reconcile.Result{}, nil 409 }