sigs.k8s.io/cluster-api-provider-azure@v1.14.3/azure/scope/strategies/machinepool_deployments/machinepool_deployment_strategy.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package machinepool 18 19 import ( 20 "context" 21 "math/rand" 22 "sort" 23 "time" 24 25 "github.com/pkg/errors" 26 "k8s.io/apimachinery/pkg/util/intstr" 27 infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" 28 infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1" 29 "sigs.k8s.io/cluster-api-provider-azure/util/tele" 30 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" 31 ctrl "sigs.k8s.io/controller-runtime" 32 ) 33 34 type ( 35 // Surger is the ability to surge a number of replica. 36 Surger interface { 37 Surge(desiredReplicaCount int) (int, error) 38 } 39 40 // DeleteSelector is the ability to select nodes to be delete with respect to a desired number of replicas. 41 DeleteSelector interface { 42 SelectMachinesToDelete(ctx context.Context, desiredReplicas int32, machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) ([]infrav1exp.AzureMachinePoolMachine, error) 43 } 44 45 // TypedDeleteSelector is the ability to select nodes to be deleted with respect to a desired number of nodes, and 46 // the ability to describe the underlying type of the deployment strategy. 47 TypedDeleteSelector interface { 48 DeleteSelector 49 Type() infrav1exp.AzureMachinePoolDeploymentStrategyType 50 } 51 52 rollingUpdateStrategy struct { 53 infrav1exp.MachineRollingUpdateDeployment 54 } 55 ) 56 57 // NewMachinePoolDeploymentStrategy constructs a strategy implementation described in the AzureMachinePoolDeploymentStrategy 58 // specification. 59 func NewMachinePoolDeploymentStrategy(strategy infrav1exp.AzureMachinePoolDeploymentStrategy) TypedDeleteSelector { 60 switch strategy.Type { 61 case infrav1exp.RollingUpdateAzureMachinePoolDeploymentStrategyType: 62 rollingUpdate := strategy.RollingUpdate 63 if rollingUpdate == nil { 64 rollingUpdate = &infrav1exp.MachineRollingUpdateDeployment{} 65 } 66 67 return &rollingUpdateStrategy{ 68 MachineRollingUpdateDeployment: *rollingUpdate, 69 } 70 default: 71 // default to a rolling update strategy if unknown type 72 return &rollingUpdateStrategy{ 73 MachineRollingUpdateDeployment: infrav1exp.MachineRollingUpdateDeployment{}, 74 } 75 } 76 } 77 78 // Type is the AzureMachinePoolDeploymentStrategyType for the strategy. 79 func (rollingUpdateStrategy *rollingUpdateStrategy) Type() infrav1exp.AzureMachinePoolDeploymentStrategyType { 80 return infrav1exp.RollingUpdateAzureMachinePoolDeploymentStrategyType 81 } 82 83 // Surge calculates the number of replicas that can be added during an upgrade operation. 84 func (rollingUpdateStrategy *rollingUpdateStrategy) Surge(desiredReplicaCount int) (int, error) { 85 if rollingUpdateStrategy.MaxSurge == nil { 86 return 1, nil 87 } 88 89 return intstr.GetScaledValueFromIntOrPercent(rollingUpdateStrategy.MaxSurge, desiredReplicaCount, true) 90 } 91 92 // maxUnavailable calculates the maximum number of replicas which can be unavailable at any time. 93 func (rollingUpdateStrategy *rollingUpdateStrategy) maxUnavailable(desiredReplicaCount int) (int, error) { 94 if rollingUpdateStrategy.MaxUnavailable != nil { 95 val, err := intstr.GetScaledValueFromIntOrPercent(rollingUpdateStrategy.MaxUnavailable, desiredReplicaCount, false) 96 if err != nil { 97 return 0, errors.Wrap(err, "failed to get scaled value or int from maxUnavailable") 98 } 99 100 return val, nil 101 } 102 103 return 0, nil 104 } 105 106 // SelectMachinesToDelete selects the machines to delete based on the machine state, desired replica count, and 107 // the DeletePolicy. 108 func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx context.Context, desiredReplicaCount int32, machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) ([]infrav1exp.AzureMachinePoolMachine, error) { 109 ctx, _, done := tele.StartSpanWithLogger( 110 ctx, 111 "strategies.rollingUpdateStrategy.SelectMachinesToDelete", 112 ) 113 defer done() 114 115 maxUnavailable, err := rollingUpdateStrategy.maxUnavailable(int(desiredReplicaCount)) 116 if err != nil { 117 return nil, err 118 } 119 120 var ( 121 order = func() func(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 122 switch rollingUpdateStrategy.DeletePolicy { 123 case infrav1exp.OldestDeletePolicyType: 124 return orderByOldest 125 case infrav1exp.NewestDeletePolicyType: 126 return orderByNewest 127 default: 128 return orderRandom 129 } 130 }() 131 log = ctrl.LoggerFrom(ctx).V(4) 132 failedMachines = order(getFailedMachines(machinesByProviderID)) 133 deletingMachines = order(getDeletingMachines(machinesByProviderID)) 134 readyMachines = order(getReadyMachines(machinesByProviderID)) 135 machinesWithoutLatestModel = order(getMachinesWithoutLatestModel(machinesByProviderID)) 136 overProvisionCount = len(readyMachines) - int(desiredReplicaCount) 137 disruptionBudget = func() int { 138 if maxUnavailable > int(desiredReplicaCount) { 139 return int(desiredReplicaCount) 140 } 141 142 return len(readyMachines) - int(desiredReplicaCount) + maxUnavailable 143 }() 144 ) 145 146 // Order AzureMachinePoolMachines with the clutserv1.DeleteMachineAnnotation to the front so that they have delete priority. 147 // This allows MachinePool Machines to work with the autoscaler. 148 failedMachines = orderByDeleteMachineAnnotation(failedMachines) 149 deletingMachines = orderByDeleteMachineAnnotation(deletingMachines) 150 readyMachines = orderByDeleteMachineAnnotation(readyMachines) 151 machinesWithoutLatestModel = orderByDeleteMachineAnnotation(machinesWithoutLatestModel) 152 153 log.Info("selecting machines to delete", 154 "readyMachines", len(readyMachines), 155 "desiredReplicaCount", desiredReplicaCount, 156 "maxUnavailable", maxUnavailable, 157 "disruptionBudget", disruptionBudget, 158 "machinesWithoutTheLatestModel", len(machinesWithoutLatestModel), 159 "failedMachines", len(failedMachines), 160 "deletingMachines", len(deletingMachines), 161 ) 162 163 // if we have failed or deleting machines, remove them 164 if len(failedMachines) > 0 || len(deletingMachines) > 0 { 165 log.Info("failed or deleting machines", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "failedMachines", getProviderIDs(failedMachines), "deletingMachines", getProviderIDs(deletingMachines)) 166 return append(failedMachines, deletingMachines...), nil 167 } 168 169 // if we have not yet reached our desired count, don't try to delete anything 170 if len(readyMachines) < int(desiredReplicaCount) { 171 log.Info("not enough ready machines", "desiredReplicaCount", desiredReplicaCount, "readyMachinesCount", len(readyMachines), "machinesByProviderID", len(machinesByProviderID)) 172 return []infrav1exp.AzureMachinePoolMachine{}, nil 173 } 174 175 // we have too many machines, let's choose the oldest to remove 176 if overProvisionCount > 0 { 177 var toDelete []infrav1exp.AzureMachinePoolMachine 178 log.Info("over-provisioned", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "machinesWithoutLatestModel", getProviderIDs(machinesWithoutLatestModel)) 179 // we are over-provisioned try to remove old models 180 for _, v := range machinesWithoutLatestModel { 181 if len(toDelete) >= overProvisionCount { 182 return toDelete, nil 183 } 184 185 toDelete = append(toDelete, v) 186 } 187 188 log.Info("over-provisioned ready", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "readyMachines", getProviderIDs(readyMachines)) 189 // remove ready machines 190 for _, v := range readyMachines { 191 if len(toDelete) >= overProvisionCount { 192 return toDelete, nil 193 } 194 195 toDelete = append(toDelete, v) 196 } 197 198 return toDelete, nil 199 } 200 201 if len(machinesWithoutLatestModel) == 0 { 202 log.Info("nothing more to do since all the AzureMachinePoolMachine(s) are the latest model and not over-provisioned") 203 return []infrav1exp.AzureMachinePoolMachine{}, nil 204 } 205 206 if disruptionBudget <= 0 { 207 log.Info("exit early since disruption budget is less than or equal to zero", "disruptionBudget", disruptionBudget, "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines)) 208 return []infrav1exp.AzureMachinePoolMachine{}, nil 209 } 210 211 var toDelete []infrav1exp.AzureMachinePoolMachine 212 log.Info("removing ready machines within disruption budget", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines)) 213 for _, v := range readyMachines { 214 if len(toDelete) >= disruptionBudget { 215 return toDelete, nil 216 } 217 218 if !v.Status.LatestModelApplied { 219 toDelete = append(toDelete, v) 220 } 221 } 222 223 log.Info("completed without filling toDelete", "toDelete", getProviderIDs(toDelete), "numToDelete", len(toDelete)) 224 return toDelete, nil 225 } 226 227 func getFailedMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 228 var machines []infrav1exp.AzureMachinePoolMachine 229 for _, v := range machinesByProviderID { 230 // ready status, with provisioning state Succeeded, and not marked for delete 231 if v.Status.ProvisioningState != nil && *v.Status.ProvisioningState == infrav1.Failed { 232 machines = append(machines, v) 233 } 234 } 235 236 return machines 237 } 238 239 // getDeletingMachines is responsible for identifying machines whose VMs are in an active state of deletion 240 // but whose corresponding AzureMachinePoolMachine resource has not yet been marked for deletion. 241 func getDeletingMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 242 var machines []infrav1exp.AzureMachinePoolMachine 243 for _, v := range machinesByProviderID { 244 if v.Status.ProvisioningState != nil && 245 // provisioning state is Deleting 246 *v.Status.ProvisioningState == infrav1.Deleting && 247 // Ensure that the machine has not already been marked for deletion 248 v.DeletionTimestamp.IsZero() { 249 machines = append(machines, v) 250 } 251 } 252 253 return machines 254 } 255 256 func getReadyMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 257 var readyMachines []infrav1exp.AzureMachinePoolMachine 258 for _, v := range machinesByProviderID { 259 // ready status, with provisioning state Succeeded, and not marked for delete 260 if v.Status.Ready && 261 (v.Status.ProvisioningState != nil && *v.Status.ProvisioningState == infrav1.Succeeded) && 262 // Don't include machines that have already been marked for delete 263 v.DeletionTimestamp.IsZero() && 264 // Don't include machines whose VMs are in an active state of deleting 265 *v.Status.ProvisioningState != infrav1.Deleting { 266 readyMachines = append(readyMachines, v) 267 } 268 } 269 270 return readyMachines 271 } 272 273 func getMachinesWithoutLatestModel(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 274 var machinesWithLatestModel []infrav1exp.AzureMachinePoolMachine 275 for _, v := range machinesByProviderID { 276 if !v.Status.LatestModelApplied { 277 machinesWithLatestModel = append(machinesWithLatestModel, v) 278 } 279 } 280 281 return machinesWithLatestModel 282 } 283 284 func orderByNewest(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 285 sort.Slice(machines, func(i, j int) bool { 286 return machines[i].ObjectMeta.CreationTimestamp.After(machines[j].ObjectMeta.CreationTimestamp.Time) 287 }) 288 289 return machines 290 } 291 292 func orderByOldest(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 293 sort.Slice(machines, func(i, j int) bool { 294 return machines[j].ObjectMeta.CreationTimestamp.After(machines[i].ObjectMeta.CreationTimestamp.Time) 295 }) 296 297 return machines 298 } 299 300 func orderRandom(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 301 //nolint:gosec // We don't need a cryptographically appropriate random number here 302 r := rand.New(rand.NewSource(time.Now().UnixNano())) 303 r.Shuffle(len(machines), func(i, j int) { machines[i], machines[j] = machines[j], machines[i] }) 304 return machines 305 } 306 307 // orderByDeleteMachineAnnotation will sort AzureMachinePoolMachines with the clusterv1.DeleteMachineAnnotation to the front of the list. 308 // It will preserve the existing order of the list otherwise so that it respects the existing delete priority otherwise. 309 func orderByDeleteMachineAnnotation(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine { 310 sort.SliceStable(machines, func(i, j int) bool { 311 _, iHasAnnotation := machines[i].Annotations[clusterv1.DeleteMachineAnnotation] 312 313 return iHasAnnotation 314 }) 315 316 return machines 317 } 318 319 func getProviderIDs(machines []infrav1exp.AzureMachinePoolMachine) []string { 320 ids := make([]string, len(machines)) 321 for i, machine := range machines { 322 ids[i] = machine.Spec.ProviderID 323 } 324 325 return ids 326 }