sigs.k8s.io/cluster-api-provider-azure@v1.14.3/exp/controllers/azuremachinepoolmachine_controller.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package controllers 18 19 import ( 20 "context" 21 "fmt" 22 "time" 23 24 "github.com/pkg/errors" 25 corev1 "k8s.io/api/core/v1" 26 apierrors "k8s.io/apimachinery/pkg/api/errors" 27 "k8s.io/apimachinery/pkg/runtime" 28 "k8s.io/client-go/tools/record" 29 infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" 30 "sigs.k8s.io/cluster-api-provider-azure/azure" 31 "sigs.k8s.io/cluster-api-provider-azure/azure/scope" 32 "sigs.k8s.io/cluster-api-provider-azure/azure/services/scalesetvms" 33 infracontroller "sigs.k8s.io/cluster-api-provider-azure/controllers" 34 infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1" 35 "sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing" 36 "sigs.k8s.io/cluster-api-provider-azure/util/reconciler" 37 "sigs.k8s.io/cluster-api-provider-azure/util/tele" 38 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 39 capierrors "sigs.k8s.io/cluster-api/errors" 40 "sigs.k8s.io/cluster-api/util" 41 "sigs.k8s.io/cluster-api/util/annotations" 42 "sigs.k8s.io/cluster-api/util/conditions" 43 "sigs.k8s.io/cluster-api/util/predicates" 44 ctrl "sigs.k8s.io/controller-runtime" 45 "sigs.k8s.io/controller-runtime/pkg/client" 46 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 47 "sigs.k8s.io/controller-runtime/pkg/handler" 48 "sigs.k8s.io/controller-runtime/pkg/reconcile" 49 "sigs.k8s.io/controller-runtime/pkg/source" 50 ) 51 52 type ( 53 azureMachinePoolMachineReconcilerFactory func(*scope.MachinePoolMachineScope) (azure.Reconciler, error) 54 55 // AzureMachinePoolMachineController handles Kubernetes change events for AzureMachinePoolMachine resources. 56 AzureMachinePoolMachineController struct { 57 client.Client 58 Scheme *runtime.Scheme 59 Recorder record.EventRecorder 60 Timeouts reconciler.Timeouts 61 WatchFilterValue string 62 reconcilerFactory azureMachinePoolMachineReconcilerFactory 63 } 64 65 azureMachinePoolMachineReconciler struct { 66 Scope *scope.MachinePoolMachineScope 67 scalesetVMsService *scalesetvms.Service 68 } 69 ) 70 71 // NewAzureMachinePoolMachineController creates a new AzureMachinePoolMachineController to handle updates to Azure Machine Pool Machines. 72 func NewAzureMachinePoolMachineController(c client.Client, recorder record.EventRecorder, timeouts reconciler.Timeouts, watchFilterValue string) *AzureMachinePoolMachineController { 73 return &AzureMachinePoolMachineController{ 74 Client: c, 75 Recorder: recorder, 76 Timeouts: timeouts, 77 WatchFilterValue: watchFilterValue, 78 reconcilerFactory: newAzureMachinePoolMachineReconciler, 79 } 80 } 81 82 // SetupWithManager initializes this controller with a manager. 83 func (ampmr *AzureMachinePoolMachineController) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options infracontroller.Options) error { 84 ctx, log, done := tele.StartSpanWithLogger(ctx, 85 "controllers.AzureMachinePoolMachineController.SetupWithManager", 86 tele.KVP("controller", "AzureMachinePoolMachine"), 87 ) 88 defer done() 89 90 var r reconcile.Reconciler = ampmr 91 if options.Cache != nil { 92 r = coalescing.NewReconciler(ampmr, options.Cache, log) 93 } 94 95 c, err := ctrl.NewControllerManagedBy(mgr). 96 WithOptions(options.Options). 97 For(&infrav1exp.AzureMachinePoolMachine{}). 98 WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(log, ampmr.WatchFilterValue)). 99 Build(r) 100 if err != nil { 101 return errors.Wrapf(err, "error creating controller") 102 } 103 104 // Add a watch on AzureMachinePool for model changes 105 if err := c.Watch( 106 source.Kind(mgr.GetCache(), &infrav1exp.AzureMachinePool{}), 107 handler.EnqueueRequestsFromMapFunc(AzureMachinePoolToAzureMachinePoolMachines(ctx, mgr.GetClient(), log)), 108 MachinePoolModelHasChanged(log), 109 predicates.ResourceNotPausedAndHasFilterLabel(log, ampmr.WatchFilterValue), 110 ); err != nil { 111 return errors.Wrapf(err, "failed adding a watch for AzureMachinePool model changes") 112 } 113 114 // Add a watch on CAPI Machines for MachinePool Machines 115 if err := c.Watch( 116 source.Kind(mgr.GetCache(), &clusterv1.Machine{}), 117 handler.EnqueueRequestsFromMapFunc(util.MachineToInfrastructureMapFunc(infrav1exp.GroupVersion.WithKind("AzureMachinePoolMachine"))), 118 predicates.ResourceNotPausedAndHasFilterLabel(log, ampmr.WatchFilterValue), 119 ); err != nil { 120 return errors.Wrapf(err, "failed adding a watch for Machine model changes") 121 } 122 123 return nil 124 } 125 126 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachinepools,verbs=get;list;watch 127 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachinepools/status,verbs=get 128 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachinepoolmachines,verbs=get;list;watch;create;update;patch;delete 129 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=azuremachinepoolmachines/status,verbs=get;update;patch 130 // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinepools;machinepools/status,verbs=get 131 // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;delete 132 // +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch 133 // +kubebuilder:rbac:groups="",resources=secrets;,verbs=get;list;watch 134 // +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch 135 136 // Reconcile idempotently gets, creates, and updates a machine pool. 137 func (ampmr *AzureMachinePoolMachineController) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { 138 ctx, logger, done := tele.StartSpanWithLogger( 139 ctx, 140 "controllers.AzureMachinePoolMachineController.Reconcile", 141 tele.KVP("namespace", req.Namespace), 142 tele.KVP("name", req.Name), 143 tele.KVP("kind", "AzureMachinePoolMachine"), 144 ) 145 defer done() 146 147 logger = logger.WithValues("namespace", req.Namespace, "azureMachinePoolMachine", req.Name) 148 149 ctx, cancel := context.WithTimeout(ctx, ampmr.Timeouts.DefaultedLoopTimeout()) 150 defer cancel() 151 152 azureMachine := &infrav1exp.AzureMachinePoolMachine{} 153 err := ampmr.Get(ctx, req.NamespacedName, azureMachine) 154 if err != nil { 155 if apierrors.IsNotFound(err) { 156 return reconcile.Result{}, nil 157 } 158 return reconcile.Result{}, err 159 } 160 logger.V(2).Info("Fetching cluster for AzureMachinePoolMachine", "ampm", azureMachine.Name) 161 162 // Fetch the Cluster. 163 cluster, err := util.GetClusterFromMetadata(ctx, ampmr.Client, azureMachine.ObjectMeta) 164 if err != nil { 165 logger.Info("AzureMachinePoolMachine is missing cluster label or cluster does not exist") 166 return reconcile.Result{}, nil 167 } 168 169 logger = logger.WithValues("cluster", cluster.Name) 170 171 // Return early if the object or Cluster is paused. 172 if annotations.IsPaused(cluster, azureMachine) { 173 logger.Info("AzureMachinePoolMachine or linked Cluster is marked as paused. Won't reconcile") 174 return ctrl.Result{}, nil 175 } 176 177 clusterScope, err := infracontroller.GetClusterScoper(ctx, logger, ampmr.Client, cluster, ampmr.Timeouts) 178 if err != nil { 179 return reconcile.Result{}, errors.Wrapf(err, "failed to create cluster scope for cluster %s/%s", cluster.Namespace, cluster.Name) 180 } 181 182 logger.V(2).Info("Fetching AzureMachinePool with object meta", "meta", azureMachine.ObjectMeta) 183 // Fetch the owning AzureMachinePool (VMSS) 184 azureMachinePool, err := infracontroller.GetOwnerAzureMachinePool(ctx, ampmr.Client, azureMachine.ObjectMeta) 185 if err != nil { 186 if apierrors.IsNotFound(err) { 187 logger.Info("AzureMachinePool not found error missing, removing finalizer", "azureMachinePoolMachine", azureMachine.Name) 188 controllerutil.RemoveFinalizer(azureMachine, infrav1exp.AzureMachinePoolMachineFinalizer) 189 return reconcile.Result{}, ampmr.Client.Update(ctx, azureMachine) 190 } 191 return reconcile.Result{}, err 192 } 193 if azureMachinePool == nil { 194 logger.Info("AzureMachinePool not found error missing, removing finalizer", "azureMachinePoolMachine", azureMachine.Name) 195 controllerutil.RemoveFinalizer(azureMachine, infrav1exp.AzureMachinePoolMachineFinalizer) 196 return reconcile.Result{}, ampmr.Client.Update(ctx, azureMachine) 197 } 198 199 logger = logger.WithValues("azureMachinePool", azureMachinePool.Name) 200 201 // Fetch the CAPI MachinePool. 202 machinePool, err := infracontroller.GetOwnerMachinePool(ctx, ampmr.Client, azureMachinePool.ObjectMeta) 203 if err != nil && !apierrors.IsNotFound(err) { 204 return reconcile.Result{}, err 205 } 206 207 if machinePool != nil { 208 logger = logger.WithValues("machinePool", machinePool.Name) 209 } 210 211 // Fetch the CAPI Machine. 212 machine, err := util.GetOwnerMachine(ctx, ampmr.Client, azureMachine.ObjectMeta) 213 if err != nil && !apierrors.IsNotFound(err) { 214 return reconcile.Result{}, err 215 } 216 217 if machine != nil { 218 logger = logger.WithValues("machine", machine.Name) 219 } else { 220 logger.Info("Waiting for Machine Controller to set OwnerRef on AzureMachinePoolMachine") 221 return reconcile.Result{}, nil 222 } 223 224 // Create the machine pool scope 225 machineScope, err := scope.NewMachinePoolMachineScope(scope.MachinePoolMachineScopeParams{ 226 Client: ampmr.Client, 227 MachinePool: machinePool, 228 AzureMachinePool: azureMachinePool, 229 AzureMachinePoolMachine: azureMachine, 230 Machine: machine, 231 ClusterScope: clusterScope, 232 }) 233 if err != nil { 234 return reconcile.Result{}, errors.Wrap(err, "failed to create scope") 235 } 236 237 // Always close the scope when exiting this function so we can persist any AzureMachine changes. 238 defer func() { 239 if err := machineScope.Close(ctx); err != nil && reterr == nil { 240 reterr = err 241 } 242 }() 243 244 // Handle deleted machine pools machine 245 if !azureMachine.ObjectMeta.DeletionTimestamp.IsZero() { 246 return ampmr.reconcileDelete(ctx, machineScope, clusterScope) 247 } 248 249 if !cluster.Status.InfrastructureReady { 250 logger.Info("Cluster infrastructure is not ready yet") 251 return reconcile.Result{}, nil 252 } 253 254 // Handle non-deleted machine pools 255 return ampmr.reconcileNormal(ctx, machineScope) 256 } 257 258 func (ampmr *AzureMachinePoolMachineController) reconcileNormal(ctx context.Context, machineScope *scope.MachinePoolMachineScope) (_ reconcile.Result, reterr error) { 259 ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachinePoolMachineController.reconcileNormal") 260 defer done() 261 262 log.Info("Reconciling AzureMachinePoolMachine") 263 // If the AzureMachine is in an error state, return early. 264 if machineScope.AzureMachinePool.Status.FailureReason != nil || machineScope.AzureMachinePool.Status.FailureMessage != nil { 265 log.Info("Error state detected, skipping reconciliation") 266 return reconcile.Result{}, nil 267 } 268 269 ampms, err := ampmr.reconcilerFactory(machineScope) 270 if err != nil { 271 return reconcile.Result{}, errors.Wrap(err, "failed to create AzureMachinePoolMachine reconciler") 272 } 273 if err := ampms.Reconcile(ctx); err != nil { 274 // Handle transient and terminal errors 275 var reconcileError azure.ReconcileError 276 if errors.As(err, &reconcileError) { 277 if reconcileError.IsTerminal() { 278 log.Error(err, "failed to reconcile AzureMachinePool", "name", machineScope.Name()) 279 return reconcile.Result{}, nil 280 } 281 282 if reconcileError.IsTransient() { 283 log.V(4).Info("failed to reconcile AzureMachinePoolMachine", "name", machineScope.Name(), "transient_error", err) 284 return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil 285 } 286 287 return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachinePool") 288 } 289 290 return reconcile.Result{}, err 291 } 292 293 state := machineScope.ProvisioningState() 294 switch state { 295 case infrav1.Failed: 296 ampmr.Recorder.Eventf(machineScope.AzureMachinePoolMachine, corev1.EventTypeWarning, "FailedVMState", "Azure scale set VM is in failed state") 297 machineScope.SetFailureReason(capierrors.UpdateMachineError) 298 machineScope.SetFailureMessage(errors.Errorf("Azure VM state is %s", state)) 299 case infrav1.Deleting: 300 if err := ampmr.Client.Delete(ctx, machineScope.AzureMachinePoolMachine); err != nil { 301 return reconcile.Result{}, errors.Wrap(err, "machine pool machine failed to be deleted when deleting") 302 } 303 } 304 305 log.V(2).Info(fmt.Sprintf("Scale Set VM is %s", state), "id", machineScope.ProviderID()) 306 307 bootstrappingCondition := conditions.Get(machineScope.AzureMachinePoolMachine, infrav1.BootstrapSucceededCondition) 308 if bootstrappingCondition != nil && bootstrappingCondition.Reason == infrav1.BootstrapFailedReason { 309 return reconcile.Result{}, nil 310 } 311 312 if !infrav1.IsTerminalProvisioningState(state) || !machineScope.IsReady() { 313 log.V(2).Info("Requeuing", "state", state, "ready", machineScope.IsReady()) 314 // we are in a non-terminal state, retry in a bit 315 return reconcile.Result{ 316 RequeueAfter: 30 * time.Second, 317 }, nil 318 } 319 320 return reconcile.Result{}, nil 321 } 322 323 func (ampmr *AzureMachinePoolMachineController) reconcileDelete(ctx context.Context, machineScope *scope.MachinePoolMachineScope, clusterScope infracontroller.ClusterScoper) (_ reconcile.Result, reterr error) { 324 ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachinePoolMachineController.reconcileDelete") 325 defer done() 326 327 if !infracontroller.ShouldDeleteIndividualResources(ctx, clusterScope) { 328 log.Info("Skipping VMSS VM deletion as the whole resource group is being deleted") 329 330 controllerutil.RemoveFinalizer(machineScope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer) 331 return reconcile.Result{}, nil 332 } 333 334 if !machineScope.AzureMachinePool.ObjectMeta.DeletionTimestamp.IsZero() { 335 log.Info("Skipping VMSS VM deletion as VMSS delete will delete individual instances") 336 337 controllerutil.RemoveFinalizer(machineScope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer) 338 return reconcile.Result{}, nil 339 } 340 341 log.Info("Deleting AzureMachinePoolMachine") 342 343 // deleting a single machine 344 // 1) delete the infrastructure, node drain already done by owner Machine 345 // 2) remove finalizer 346 347 ampms, err := ampmr.reconcilerFactory(machineScope) 348 if err != nil { 349 return reconcile.Result{}, errors.Wrap(err, "failed to create AzureMachinePoolMachine reconciler") 350 } 351 if err := ampms.Delete(ctx); err != nil { 352 // Handle transient and terminal errors 353 var reconcileError azure.ReconcileError 354 if errors.As(err, &reconcileError) { 355 if reconcileError.IsTerminal() { 356 log.Error(err, "failed to delete AzureMachinePoolMachine", "name", machineScope.Name()) 357 return reconcile.Result{}, nil 358 } 359 360 if reconcileError.IsTransient() { 361 log.V(4).Info("failed to delete AzureMachinePoolMachine", "name", machineScope.Name(), "transient_error", err) 362 return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil 363 } 364 365 return reconcile.Result{}, errors.Wrapf(err, "failed to reconcile AzureMachinePool") 366 } 367 368 return reconcile.Result{}, err 369 } 370 371 return reconcile.Result{}, nil 372 } 373 374 func newAzureMachinePoolMachineReconciler(scope *scope.MachinePoolMachineScope) (azure.Reconciler, error) { 375 scaleSetVMsSvc, err := scalesetvms.NewService(scope) 376 if err != nil { 377 return nil, err 378 } 379 return &azureMachinePoolMachineReconciler{ 380 Scope: scope, 381 scalesetVMsService: scaleSetVMsSvc, 382 }, nil 383 } 384 385 // Reconcile will reconcile the state of the Machine Pool Machine with the state of the Azure VMSS VM. 386 func (r *azureMachinePoolMachineReconciler) Reconcile(ctx context.Context) error { 387 ctx, _, done := tele.StartSpanWithLogger(ctx, "controllers.azureMachinePoolMachineReconciler.Reconcile") 388 defer done() 389 390 if err := r.scalesetVMsService.Reconcile(ctx); err != nil { 391 return errors.Wrap(err, "failed to reconcile scalesetVMs") 392 } 393 394 if err := r.Scope.UpdateNodeStatus(ctx); err != nil { 395 return errors.Wrap(err, "failed to update VMSS VM node status") 396 } 397 398 if err := r.Scope.UpdateInstanceStatus(ctx); err != nil { 399 return errors.Wrap(err, "failed to update VMSS VM instance status") 400 } 401 402 return nil 403 } 404 405 // Delete will attempt to drain and delete the Azure VMSS VM. 406 func (r *azureMachinePoolMachineReconciler) Delete(ctx context.Context) error { 407 ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.azureMachinePoolMachineReconciler.Delete") 408 defer done() 409 410 defer func() { 411 if err := r.Scope.UpdateNodeStatus(ctx); err != nil { 412 log.V(4).Info("failed to update VMSS VM node status during delete") 413 } 414 415 if err := r.Scope.UpdateInstanceStatus(ctx); err != nil { 416 log.V(4).Info("failed to update VMSS VM instance status during delete") 417 } 418 }() 419 420 if err := r.scalesetVMsService.Delete(ctx); err != nil { 421 return errors.Wrap(err, "failed to reconcile scalesetVMs") 422 } 423 424 // no long running operation, so we are finished deleting the resource. Remove the finalizer. 425 controllerutil.RemoveFinalizer(r.Scope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer) 426 427 return nil 428 }