sigs.k8s.io/cluster-api-provider-azure@v1.17.0/azure/scope/strategies/machinepool_deployments/machinepool_deployment_strategy.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package machinepool 18 19 import ( 20 "context" 21 "math/rand" 22 "sort" 23 "time" 24 25 "github.com/pkg/errors" 26 "k8s.io/apimachinery/pkg/util/intstr" 27 infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" 28 infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1" 29 "sigs.k8s.io/cluster-api-provider-azure/util/tele" 30 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 31 ctrl "sigs.k8s.io/controller-runtime" 32 ) 33 34 type ( 35 // Surger is the ability to surge a number of replica. 36 Surger interface { 37 Surge(desiredReplicaCount int) (int, error) 38 } 39 40 // DeleteSelector is the ability to select nodes to be delete with respect to a desired number of replicas. 41 DeleteSelector interface { 42 SelectMachinesToDelete(ctx context.Context, desiredReplicas int32, machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) ([]infrav1exp.AzureMachinePoolMachine, error) 43 } 44 45 // TypedDeleteSelector is the ability to select nodes to be deleted with respect to a desired number of nodes, and 46 // the ability to describe the underlying type of the deployment strategy. 47 TypedDeleteSelector interface { 48 DeleteSelector 49 Type() infrav1exp.AzureMachinePoolDeploymentStrategyType 50 } 51 52 rollingUpdateStrategy struct { 53 infrav1exp.MachineRollingUpdateDeployment 54 } 55 ) 56 57 // NewMachinePoolDeploymentStrategy constructs a strategy implementation described in the AzureMachinePoolDeploymentStrategy 58 // specification. 59 func NewMachinePoolDeploymentStrategy(strategy infrav1exp.AzureMachinePoolDeploymentStrategy) TypedDeleteSelector { 60 switch strategy.Type { 61 case infrav1exp.RollingUpdateAzureMachinePoolDeploymentStrategyType: 62 rollingUpdate := strategy.RollingUpdate 63 if rollingUpdate == nil { 64 rollingUpdate = &infrav1exp.MachineRollingUpdateDeployment{} 65 } 66 67 return &rollingUpdateStrategy{ 68 MachineRollingUpdateDeployment: *rollingUpdate, 69 } 70 default: 71 // default to a rolling update strategy if unknown type 72 return &rollingUpdateStrategy{ 73 MachineRollingUpdateDeployment: infrav1exp.MachineRollingUpdateDeployment{}, 74 } 75 } 76 } 77 78 // Type is the AzureMachinePoolDeploymentStrategyType for the strategy. 79 func (rollingUpdateStrategy *rollingUpdateStrategy) Type() infrav1exp.AzureMachinePoolDeploymentStrategyType { 80 return infrav1exp.RollingUpdateAzureMachinePoolDeploymentStrategyType 81 } 82 83 // Surge calculates the number of replicas that can be added during an upgrade operation. 84 func (rollingUpdateStrategy *rollingUpdateStrategy) Surge(desiredReplicaCount int) (int, error) { 85 if rollingUpdateStrategy.MaxSurge == nil { 86 return 1, nil 87 } 88 89 return intstr.GetScaledValueFromIntOrPercent(rollingUpdateStrategy.MaxSurge, desiredReplicaCount, true) 90 } 91 92 // maxUnavailable calculates the maximum number of replicas which can be unavailable at any time. 93 func (rollingUpdateStrategy *rollingUpdateStrategy) maxUnavailable(desiredReplicaCount int) (int, error) { 94 if rollingUpdateStrategy.MaxUnavailable != nil { 95 val, err := intstr.GetScaledValueFromIntOrPercent(rollingUpdateStrategy.MaxUnavailable, desiredReplicaCount, false) 96 if err != nil { 97 return 0, errors.Wrap(err, "failed to get scaled value or int from maxUnavailable") 98 } 99 100 return val, nil 101 } 102 103 return 0, nil 104 } 105 106 // SelectMachinesToDelete selects the machines to delete based on the machine state, desired replica count, and 107 // the DeletePolicy. 108 func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx context.Context, desiredReplicaCount int32, machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) ([]infrav1exp.AzureMachinePoolMachine, error) { 109 ctx, _, done := tele.StartSpanWithLogger( 110 ctx, 111 "strategies.rollingUpdateStrategy.SelectMachinesToDelete", 112 ) 113 defer done() 114 115 maxUnavailable, err := rollingUpdateStrategy.maxUnavailable(int(desiredReplicaCount)) 116 if err != nil { 117 return nil, err 118 } 119 120 var ( 121 order = func() func(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 122 switch rollingUpdateStrategy.DeletePolicy { 123 case infrav1exp.OldestDeletePolicyType: 124 return orderByOldest 125 case infrav1exp.NewestDeletePolicyType: 126 return orderByNewest 127 default: 128 return orderRandom 129 } 130 }() 131 log = ctrl.LoggerFrom(ctx).V(4) 132 deleteAnnotatedMachines = order(getDeleteAnnotatedMachines(machinesByProviderID)) 133 failedMachines = order(getFailedMachines(machinesByProviderID)) 134 deletingMachines = order(getDeletingMachines(machinesByProviderID)) 135 readyMachines = order(getReadyMachines(machinesByProviderID)) 136 machinesWithoutLatestModel = order(getMachinesWithoutLatestModel(machinesByProviderID)) 137 overProvisionCount = len(readyMachines) - int(desiredReplicaCount) 138 disruptionBudget = func() int { 139 if maxUnavailable > int(desiredReplicaCount) { 140 return int(desiredReplicaCount) 141 } 142 143 return len(readyMachines) - int(desiredReplicaCount) + maxUnavailable 144 }() 145 ) 146 147 // Order AzureMachinePoolMachines with the clusterv1.DeleteMachineAnnotation to the front so that they have delete priority. 148 // This allows MachinePool Machines to work with the autoscaler. 149 failedMachines = orderByDeleteMachineAnnotation(failedMachines) 150 deletingMachines = orderByDeleteMachineAnnotation(deletingMachines) 151 152 log.Info("selecting machines to delete", 153 "readyMachines", len(readyMachines), 154 "desiredReplicaCount", desiredReplicaCount, 155 "maxUnavailable", maxUnavailable, 156 "disruptionBudget", disruptionBudget, 157 "machinesWithoutTheLatestModel", len(machinesWithoutLatestModel), 158 "deleteAnnotatedMachines", len(deleteAnnotatedMachines), 159 "failedMachines", len(failedMachines), 160 "deletingMachines", len(deletingMachines), 161 ) 162 163 // if we have failed or deleting machines, remove them 164 if len(failedMachines) > 0 || len(deletingMachines) > 0 { 165 log.Info("failed or deleting machines", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "failedMachines", getProviderIDs(failedMachines), "deletingMachines", getProviderIDs(deletingMachines)) 166 return append(failedMachines, deletingMachines...), nil 167 } 168 169 // if we have machines annotated with delete machine, remove them 170 if len(deleteAnnotatedMachines) > 0 { 171 log.Info("delete annotated machines", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "deleteAnnotatedMachines", getProviderIDs(deleteAnnotatedMachines)) 172 return deleteAnnotatedMachines, nil 173 } 174 175 // if we have not yet reached our desired count, don't try to delete anything 176 if len(readyMachines) < int(desiredReplicaCount) { 177 log.Info("not enough ready machines", "desiredReplicaCount", desiredReplicaCount, "readyMachinesCount", len(readyMachines), "machinesByProviderID", len(machinesByProviderID)) 178 return []infrav1exp.AzureMachinePoolMachine{}, nil 179 } 180 181 // we have too many machines, let's choose the oldest to remove 182 if overProvisionCount > 0 { 183 var toDelete []infrav1exp.AzureMachinePoolMachine 184 log.Info("over-provisioned", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "machinesWithoutLatestModel", getProviderIDs(machinesWithoutLatestModel)) 185 // we are over-provisioned try to remove old models 186 for _, v := range machinesWithoutLatestModel { 187 if len(toDelete) >= overProvisionCount { 188 return toDelete, nil 189 } 190 191 toDelete = append(toDelete, v) 192 } 193 194 log.Info("over-provisioned ready", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "readyMachines", getProviderIDs(readyMachines)) 195 // remove ready machines 196 for _, v := range readyMachines { 197 if len(toDelete) >= overProvisionCount { 198 return toDelete, nil 199 } 200 201 toDelete = append(toDelete, v) 202 } 203 204 return toDelete, nil 205 } 206 207 if len(machinesWithoutLatestModel) == 0 { 208 log.Info("nothing more to do since all the AzureMachinePoolMachine(s) are the latest model and not over-provisioned") 209 return []infrav1exp.AzureMachinePoolMachine{}, nil 210 } 211 212 if disruptionBudget <= 0 { 213 log.Info("exit early since disruption budget is less than or equal to zero", "disruptionBudget", disruptionBudget, "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines)) 214 return []infrav1exp.AzureMachinePoolMachine{}, nil 215 } 216 217 var toDelete []infrav1exp.AzureMachinePoolMachine 218 log.Info("removing ready machines within disruption budget", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines)) 219 for _, v := range readyMachines { 220 if len(toDelete) >= disruptionBudget { 221 return toDelete, nil 222 } 223 224 if !v.Status.LatestModelApplied { 225 toDelete = append(toDelete, v) 226 } 227 } 228 229 log.Info("completed without filling toDelete", "toDelete", getProviderIDs(toDelete), "numToDelete", len(toDelete)) 230 return toDelete, nil 231 } 232 233 func getDeleteAnnotatedMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 234 var machines []infrav1exp.AzureMachinePoolMachine 235 for _, v := range machinesByProviderID { 236 if v.Annotations != nil { 237 if _, hasDeleteAnnotation := v.Annotations[clusterv1.DeleteMachineAnnotation]; hasDeleteAnnotation { 238 machines = append(machines, v) 239 } 240 } 241 } 242 return machines 243 } 244 245 func getFailedMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 246 var machines []infrav1exp.AzureMachinePoolMachine 247 for _, v := range machinesByProviderID { 248 // ready status, with provisioning state Succeeded, and not marked for delete 249 if v.Status.ProvisioningState != nil && *v.Status.ProvisioningState == infrav1.Failed { 250 machines = append(machines, v) 251 } 252 } 253 254 return machines 255 } 256 257 // getDeletingMachines is responsible for identifying machines whose VMs are in an active state of deletion 258 // but whose corresponding AzureMachinePoolMachine resource has not yet been marked for deletion. 259 func getDeletingMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 260 var machines []infrav1exp.AzureMachinePoolMachine 261 for _, v := range machinesByProviderID { 262 if v.Status.ProvisioningState != nil && 263 // provisioning state is Deleting 264 *v.Status.ProvisioningState == infrav1.Deleting && 265 // Ensure that the machine has not already been marked for deletion 266 v.DeletionTimestamp.IsZero() { 267 machines = append(machines, v) 268 } 269 } 270 271 return machines 272 } 273 274 func getReadyMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 275 var readyMachines []infrav1exp.AzureMachinePoolMachine 276 for _, v := range machinesByProviderID { 277 // ready status, with provisioning state Succeeded, and not marked for delete 278 if v.Status.Ready && 279 (v.Status.ProvisioningState != nil && *v.Status.ProvisioningState == infrav1.Succeeded) && 280 // Don't include machines that have already been marked for delete 281 v.DeletionTimestamp.IsZero() && 282 // Don't include machines whose VMs are in an active state of deleting 283 *v.Status.ProvisioningState != infrav1.Deleting { 284 readyMachines = append(readyMachines, v) 285 } 286 } 287 288 return readyMachines 289 } 290 291 func getMachinesWithoutLatestModel(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 292 var machinesWithLatestModel []infrav1exp.AzureMachinePoolMachine 293 for _, v := range machinesByProviderID { 294 if !v.Status.LatestModelApplied { 295 machinesWithLatestModel = append(machinesWithLatestModel, v) 296 } 297 } 298 299 return machinesWithLatestModel 300 } 301 302 func orderByNewest(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 303 sort.Slice(machines, func(i, j int) bool { 304 return machines[i].ObjectMeta.CreationTimestamp.After(machines[j].ObjectMeta.CreationTimestamp.Time) 305 }) 306 307 return machines 308 } 309 310 func orderByOldest(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 311 sort.Slice(machines, func(i, j int) bool { 312 return machines[j].ObjectMeta.CreationTimestamp.After(machines[i].ObjectMeta.CreationTimestamp.Time) 313 }) 314 315 return machines 316 } 317 318 func orderRandom(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 319 //nolint:gosec // We don't need a cryptographically appropriate random number here 320 r := rand.New(rand.NewSource(time.Now().UnixNano())) 321 r.Shuffle(len(machines), func(i, j int) { machines[i], machines[j] = machines[j], machines[i] }) 322 return machines 323 } 324 325 // orderByDeleteMachineAnnotation will sort AzureMachinePoolMachines with the clusterv1.DeleteMachineAnnotation to the front of the list. 326 // It will preserve the existing order of the list otherwise so that it respects the existing delete priority otherwise. 327 func orderByDeleteMachineAnnotation(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 328 sort.SliceStable(machines, func(i, j int) bool { 329 _, iHasAnnotation := machines[i].Annotations[clusterv1.DeleteMachineAnnotation] 330 331 return iHasAnnotation 332 }) 333 334 return machines 335 } 336 337 func getProviderIDs(machines []infrav1exp.AzureMachinePoolMachine) []string { 338 ids := make([]string, len(machines)) 339 for i, machine := range machines { 340 ids[i] = machine.Spec.ProviderID 341 } 342 343 return ids 344 }