sigs.k8s.io/cluster-api-provider-azure@v1.17.0/controllers/azuremachine_controller.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package controllers 18 19 import ( 20 "context" 21 "fmt" 22 23 "github.com/pkg/errors" 24 corev1 "k8s.io/api/core/v1" 25 apierrors "k8s.io/apimachinery/pkg/api/errors" 26 "k8s.io/client-go/tools/record" 27 infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" 28 "sigs.k8s.io/cluster-api-provider-azure/azure" 29 "sigs.k8s.io/cluster-api-provider-azure/azure/scope" 30 "sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing" 31 "sigs.k8s.io/cluster-api-provider-azure/util/reconciler" 32 "sigs.k8s.io/cluster-api-provider-azure/util/tele" 33 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 34 capierrors "sigs.k8s.io/cluster-api/errors" 35 "sigs.k8s.io/cluster-api/util" 36 "sigs.k8s.io/cluster-api/util/annotations" 37 "sigs.k8s.io/cluster-api/util/conditions" 38 "sigs.k8s.io/cluster-api/util/predicates" 39 ctrl "sigs.k8s.io/controller-runtime" 40 "sigs.k8s.io/controller-runtime/pkg/builder" 41 "sigs.k8s.io/controller-runtime/pkg/client" 42 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 43 "sigs.k8s.io/controller-runtime/pkg/handler" 44 "sigs.k8s.io/controller-runtime/pkg/reconcile" 45 ) 46 47 // AzureMachineReconciler reconciles an AzureMachine object. 48 type AzureMachineReconciler struct { 49 client.Client 50 Recorder record.EventRecorder 51 Timeouts reconciler.Timeouts 52 WatchFilterValue string 53 createAzureMachineService azureMachineServiceCreator 54 } 55 56 type azureMachineServiceCreator func(machineScope *scope.MachineScope) (*azureMachineService, error) 57 58 // NewAzureMachineReconciler returns a new AzureMachineReconciler instance. 59 func NewAzureMachineReconciler(client client.Client, recorder record.EventRecorder, timeouts reconciler.Timeouts, watchFilterValue string) *AzureMachineReconciler { 60 amr := &AzureMachineReconciler{ 61 Client: client, 62 Recorder: recorder, 63 Timeouts: timeouts, 64 WatchFilterValue: watchFilterValue, 65 } 66 67 amr.createAzureMachineService = newAzureMachineService 68 69 return amr 70 } 71 72 // SetupWithManager initializes this controller with a manager. 73 func (amr *AzureMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options Options) error { 74 ctx, log, done := tele.StartSpanWithLogger(ctx, 75 "controllers.AzureMachineReconciler.SetupWithManager", 76 tele.KVP("controller", "AzureMachine"), 77 ) 78 defer done() 79 80 var r reconcile.Reconciler = amr 81 if options.Cache != nil { 82 r = coalescing.NewReconciler(amr, options.Cache, log) 83 } 84 85 // create mapper to transform incoming AzureClusters into AzureMachine requests 86 azureClusterToAzureMachinesMapper, err := AzureClusterToAzureMachinesMapper(ctx, amr.Client, &infrav1.AzureMachineList{}, mgr.GetScheme(), log) 87 if err != nil { 88 return errors.Wrap(err, "failed to create AzureCluster to AzureMachines mapper") 89 } 90 91 azureMachineMapper, err := util.ClusterToTypedObjectsMapper(amr.Client, &infrav1.AzureMachineList{}, mgr.GetScheme()) 92 if err != nil { 93 return errors.Wrap(err, "failed to create mapper for Cluster to AzureMachines") 94 } 95 96 return ctrl.NewControllerManagedBy(mgr). 97 WithOptions(options.Options). 98 For(&infrav1.AzureMachine{}). 99 WithEventFilter(predicates.ResourceHasFilterLabel(log, amr.WatchFilterValue)). 100 // watch for changes in CAPI Machine resources 101 Watches( 102 &clusterv1.Machine{}, 103 handler.EnqueueRequestsFromMapFunc(util.MachineToInfrastructureMapFunc(infrav1.GroupVersion.WithKind("AzureMachine"))), 104 ). 105 // watch for changes in AzureCluster 106 Watches( 107 &infrav1.AzureCluster{}, 108 handler.EnqueueRequestsFromMapFunc(azureClusterToAzureMachinesMapper), 109 ). 110 // Add a watch on clusterv1.Cluster object for pause/unpause & ready notifications. 111 Watches( 112 &clusterv1.Cluster{}, 113 handler.EnqueueRequestsFromMapFunc(azureMachineMapper), 114 builder.WithPredicates( 115 ClusterPauseChangeAndInfrastructureReady(log), 116 predicates.ResourceHasFilterLabel(log, amr.WatchFilterValue), 117 ), 118 ). 119 Complete(r) 120 } 121 122 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachines,verbs=get;list;watch;create;update;patch;delete 123 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachines/status,verbs=get;update;patch 124 // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch 125 // +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch 126 // +kubebuilder:rbac:groups="",resources=secrets;,verbs=get;list;watch 127 128 // Reconcile idempotently gets, creates, and updates a machine. 129 func (amr *AzureMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { 130 ctx, cancel := context.WithTimeout(ctx, amr.Timeouts.DefaultedLoopTimeout()) 131 defer cancel() 132 133 ctx, log, done := tele.StartSpanWithLogger( 134 ctx, 135 "controllers.AzureMachineReconciler.Reconcile", 136 tele.KVP("namespace", req.Namespace), 137 tele.KVP("name", req.Name), 138 tele.KVP("kind", "AzureMachine"), 139 ) 140 defer done() 141 142 // Fetch the AzureMachine VM. 143 azureMachine := &infrav1.AzureMachine{} 144 err := amr.Get(ctx, req.NamespacedName, azureMachine) 145 if err != nil { 146 if apierrors.IsNotFound(err) { 147 return reconcile.Result{}, nil 148 } 149 return reconcile.Result{}, err 150 } 151 152 // Fetch the Machine. 153 machine, err := util.GetOwnerMachine(ctx, amr.Client, azureMachine.ObjectMeta) 154 if err != nil { 155 return reconcile.Result{}, err 156 } 157 if machine == nil { 158 amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "Machine controller dependency not yet met", "Machine Controller has not yet set OwnerRef") 159 log.Info("Machine Controller has not yet set OwnerRef") 160 return reconcile.Result{}, nil 161 } 162 163 log = log.WithValues("machine", machine.Name) 164 165 // Fetch the Cluster. 166 cluster, err := util.GetClusterFromMetadata(ctx, amr.Client, machine.ObjectMeta) 167 if err != nil { 168 amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "Unable to get cluster from metadata", "Machine is missing cluster label or cluster does not exist") 169 log.Info("Machine is missing cluster label or cluster does not exist") 170 return reconcile.Result{}, nil 171 } 172 173 log = log.WithValues("cluster", cluster.Name) 174 175 log = log.WithValues("AzureCluster", cluster.Spec.InfrastructureRef.Name) 176 azureClusterName := client.ObjectKey{ 177 Namespace: azureMachine.Namespace, 178 Name: cluster.Spec.InfrastructureRef.Name, 179 } 180 azureCluster := &infrav1.AzureCluster{} 181 if err := amr.Client.Get(ctx, azureClusterName, azureCluster); err != nil { 182 amr.Recorder.Eventf(azureMachine, corev1.EventTypeNormal, "AzureCluster unavailable", "AzureCluster is not available yet") 183 log.Info("AzureCluster is not available yet") 184 return reconcile.Result{}, nil 185 } 186 187 // Create the cluster scope 188 clusterScope, err := scope.NewClusterScope(ctx, scope.ClusterScopeParams{ 189 Client: amr.Client, 190 Cluster: cluster, 191 AzureCluster: azureCluster, 192 Timeouts: amr.Timeouts, 193 }) 194 if err != nil { 195 amr.Recorder.Eventf(azureCluster, corev1.EventTypeWarning, "Error creating the cluster scope", err.Error()) 196 return reconcile.Result{}, err 197 } 198 199 // Create the machine scope 200 machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{ 201 Client: amr.Client, 202 Machine: machine, 203 AzureMachine: azureMachine, 204 ClusterScope: clusterScope, 205 }) 206 if err != nil { 207 amr.Recorder.Eventf(azureMachine, corev1.EventTypeWarning, "Error creating the machine scope", err.Error()) 208 return reconcile.Result{}, errors.Wrap(err, "failed to create scope") 209 } 210 211 // Always close the scope when exiting this function so we can persist any AzureMachine changes. 212 defer func() { 213 if err := machineScope.Close(ctx); err != nil && reterr == nil { 214 reterr = err 215 } 216 }() 217 218 // Return early if the object or Cluster is paused. 219 if annotations.IsPaused(cluster, azureMachine) { 220 log.Info("AzureMachine or linked Cluster is marked as paused. Won't reconcile normally") 221 return amr.reconcilePause(ctx, machineScope) 222 } 223 224 // Handle deleted machines 225 if !azureMachine.ObjectMeta.DeletionTimestamp.IsZero() { 226 return amr.reconcileDelete(ctx, machineScope, clusterScope) 227 } 228 229 // Handle non-deleted machines 230 return amr.reconcileNormal(ctx, machineScope, clusterScope) 231 } 232 233 func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineScope *scope.MachineScope, clusterScope *scope.ClusterScope) (reconcile.Result, error) { 234 ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.reconcileNormal") 235 defer done() 236 237 log.Info("Reconciling AzureMachine") 238 // If the AzureMachine is in an error state, return early. 239 if machineScope.AzureMachine.Status.FailureReason != nil || machineScope.AzureMachine.Status.FailureMessage != nil { 240 log.Info("Error state detected, skipping reconciliation") 241 return reconcile.Result{}, nil 242 } 243 244 // Register our finalizer immediately to avoid orphaning Azure resources on delete 245 needsPatch := controllerutil.AddFinalizer(machineScope.AzureMachine, infrav1.MachineFinalizer) 246 // Register the block-move annotation immediately to avoid moving un-paused ASO resources 247 needsPatch = AddBlockMoveAnnotation(machineScope.AzureMachine) || needsPatch 248 if needsPatch { 249 if err := machineScope.PatchObject(ctx); err != nil { 250 return reconcile.Result{}, err 251 } 252 } 253 254 // Make sure the Cluster Infrastructure is ready. 255 if !clusterScope.Cluster.Status.InfrastructureReady { 256 log.Info("Cluster infrastructure is not ready yet") 257 conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "") 258 return reconcile.Result{}, nil 259 } 260 261 // Make sure bootstrap data is available and populated. 262 if machineScope.Machine.Spec.Bootstrap.DataSecretName == nil { 263 log.Info("Bootstrap data secret reference is not yet available") 264 conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "") 265 return reconcile.Result{}, nil 266 } 267 268 var reconcileError azure.ReconcileError 269 270 // Initialize the cache to be used by the AzureMachine services. 271 err := machineScope.InitMachineCache(ctx) 272 if err != nil { 273 if errors.As(err, &reconcileError) && reconcileError.IsTerminal() { 274 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "SKUNotFound", errors.Wrap(err, "failed to initialize machine cache").Error()) 275 log.Error(err, "Failed to initialize machine cache") 276 machineScope.SetFailureReason(capierrors.InvalidConfigurationMachineError) 277 machineScope.SetFailureMessage(err) 278 machineScope.SetNotReady() 279 return reconcile.Result{}, nil 280 } 281 return reconcile.Result{}, errors.Wrap(err, "failed to init machine scope cache") 282 } 283 284 // Mark the AzureMachine as failed if the identities are not ready. 285 cond := conditions.Get(machineScope.AzureMachine, infrav1.VMIdentitiesReadyCondition) 286 if cond != nil && cond.Status == corev1.ConditionFalse && cond.Reason == infrav1.UserAssignedIdentityMissingReason { 287 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, infrav1.UserAssignedIdentityMissingReason, "VM is unhealthy") 288 machineScope.SetFailureReason(capierrors.UnsupportedChangeMachineError) 289 machineScope.SetFailureMessage(errors.New("VM identities are not ready")) 290 return reconcile.Result{}, errors.New("VM identities are not ready") 291 } 292 293 ams, err := amr.createAzureMachineService(machineScope) 294 if err != nil { 295 return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service") 296 } 297 298 if err := ams.Reconcile(ctx); err != nil { 299 // This means that a VM was created and managed by this controller, but is not present anymore. 300 // In this case, we mark it as failed and leave it to MHC for remediation 301 if errors.As(err, &azure.VMDeletedError{}) { 302 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "VMDeleted", errors.Wrap(err, "failed to reconcile AzureMachine").Error()) 303 machineScope.SetFailureReason(capierrors.UpdateMachineError) 304 machineScope.SetFailureMessage(err) 305 machineScope.SetNotReady() 306 machineScope.SetVMState(infrav1.Deleted) 307 return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachine") 308 } 309 310 // Handle transient and terminal errors 311 if errors.As(err, &reconcileError) { 312 if reconcileError.IsTerminal() { 313 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "ReconcileError", errors.Wrapf(err, "failed to reconcile AzureMachine").Error()) 314 log.Error(err, "failed to reconcile AzureMachine", "name", machineScope.Name()) 315 machineScope.SetFailureReason(capierrors.CreateMachineError) 316 machineScope.SetFailureMessage(err) 317 machineScope.SetNotReady() 318 machineScope.SetVMState(infrav1.Failed) 319 return reconcile.Result{}, nil 320 } 321 322 if reconcileError.IsTransient() { 323 if azure.IsOperationNotDoneError(reconcileError) { 324 log.V(2).Info(fmt.Sprintf("AzureMachine reconcile not done: %s", reconcileError.Error())) 325 } else { 326 log.V(2).Info(fmt.Sprintf("transient failure to reconcile AzureMachine, retrying: %s", reconcileError.Error())) 327 } 328 return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil 329 } 330 } 331 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "ReconcileError", errors.Wrapf(err, "failed to reconcile AzureMachine").Error()) 332 return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachine") 333 } 334 335 machineScope.SetReady() 336 337 return reconcile.Result{}, nil 338 } 339 340 //nolint:unparam // Always returns an empty struct for reconcile.Result 341 func (amr *AzureMachineReconciler) reconcilePause(ctx context.Context, machineScope *scope.MachineScope) (reconcile.Result, error) { 342 ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachine.reconcilePause") 343 defer done() 344 345 log.Info("Reconciling AzureMachine pause") 346 347 ams, err := amr.createAzureMachineService(machineScope) 348 if err != nil { 349 return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service") 350 } 351 352 if err := ams.Pause(ctx); err != nil { 353 return reconcile.Result{}, errors.Wrap(err, "failed to pause azure machine services") 354 } 355 RemoveBlockMoveAnnotation(machineScope.AzureMachine) 356 357 return reconcile.Result{}, nil 358 } 359 360 func (amr *AzureMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.MachineScope, clusterScope *scope.ClusterScope) (reconcile.Result, error) { 361 ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.reconcileDelete") 362 defer done() 363 364 log.Info("Handling deleted AzureMachine") 365 conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "") 366 if err := machineScope.PatchObject(ctx); err != nil { 367 return reconcile.Result{}, err 368 } 369 370 if ShouldDeleteIndividualResources(ctx, clusterScope) { 371 log.Info("Deleting AzureMachine") 372 ams, err := amr.createAzureMachineService(machineScope) 373 if err != nil { 374 return reconcile.Result{}, errors.Wrap(err, "failed to create azure machine service") 375 } 376 377 if err := ams.Delete(ctx); err != nil { 378 // Handle transient errors 379 var reconcileError azure.ReconcileError 380 if errors.As(err, &reconcileError) { 381 if reconcileError.IsTransient() { 382 if azure.IsOperationNotDoneError(reconcileError) { 383 log.V(2).Info(fmt.Sprintf("AzureMachine delete not done: %s", reconcileError.Error())) 384 } else { 385 log.V(2).Info("transient failure to delete AzureMachine, retrying") 386 } 387 return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil 388 } 389 } 390 391 amr.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "Error deleting AzureMachine", errors.Wrapf(err, "error deleting AzureMachine %s/%s", machineScope.Namespace(), machineScope.Name()).Error()) 392 return reconcile.Result{}, errors.Wrapf(err, "error deleting AzureMachine %s/%s", machineScope.Namespace(), machineScope.Name()) 393 } 394 } else { 395 log.Info("Skipping AzureMachine Deletion; will delete whole resource group.") 396 } 397 398 // we're done deleting this AzureMachine so remove the finalizer. 399 log.Info("Removing finalizer from AzureMachine") 400 controllerutil.RemoveFinalizer(machineScope.AzureMachine, infrav1.MachineFinalizer) 401 402 return reconcile.Result{}, nil 403 }