sigs.k8s.io/cluster-api-provider-azure@v1.17.0/azure/scope/strategies/machinepool_deployments/machinepool_deployment_strategy.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package machinepool
    18  
    19  import (
    20  	"context"
    21  	"math/rand"
    22  	"sort"
    23  	"time"
    24  
    25  	"github.com/pkg/errors"
    26  	"k8s.io/apimachinery/pkg/util/intstr"
    27  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    28  	infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
    29  	"sigs.k8s.io/cluster-api-provider-azure/util/tele"
    30  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    31  	ctrl "sigs.k8s.io/controller-runtime"
    32  )
    33  
    34  type (
    35  	// Surger is the ability to surge a number of replica.
    36  	Surger interface {
    37  		Surge(desiredReplicaCount int) (int, error)
    38  	}
    39  
    40  	// DeleteSelector is the ability to select nodes to be delete with respect to a desired number of replicas.
    41  	DeleteSelector interface {
    42  		SelectMachinesToDelete(ctx context.Context, desiredReplicas int32, machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) ([]infrav1exp.AzureMachinePoolMachine, error)
    43  	}
    44  
    45  	// TypedDeleteSelector is the ability to select nodes to be deleted with respect to a desired number of nodes, and
    46  	// the ability to describe the underlying type of the deployment strategy.
    47  	TypedDeleteSelector interface {
    48  		DeleteSelector
    49  		Type() infrav1exp.AzureMachinePoolDeploymentStrategyType
    50  	}
    51  
    52  	rollingUpdateStrategy struct {
    53  		infrav1exp.MachineRollingUpdateDeployment
    54  	}
    55  )
    56  
    57  // NewMachinePoolDeploymentStrategy constructs a strategy implementation described in the AzureMachinePoolDeploymentStrategy
    58  // specification.
    59  func NewMachinePoolDeploymentStrategy(strategy infrav1exp.AzureMachinePoolDeploymentStrategy) TypedDeleteSelector {
    60  	switch strategy.Type {
    61  	case infrav1exp.RollingUpdateAzureMachinePoolDeploymentStrategyType:
    62  		rollingUpdate := strategy.RollingUpdate
    63  		if rollingUpdate == nil {
    64  			rollingUpdate = &infrav1exp.MachineRollingUpdateDeployment{}
    65  		}
    66  
    67  		return &rollingUpdateStrategy{
    68  			MachineRollingUpdateDeployment: *rollingUpdate,
    69  		}
    70  	default:
    71  		// default to a rolling update strategy if unknown type
    72  		return &rollingUpdateStrategy{
    73  			MachineRollingUpdateDeployment: infrav1exp.MachineRollingUpdateDeployment{},
    74  		}
    75  	}
    76  }
    77  
    78  // Type is the AzureMachinePoolDeploymentStrategyType for the strategy.
    79  func (rollingUpdateStrategy *rollingUpdateStrategy) Type() infrav1exp.AzureMachinePoolDeploymentStrategyType {
    80  	return infrav1exp.RollingUpdateAzureMachinePoolDeploymentStrategyType
    81  }
    82  
    83  // Surge calculates the number of replicas that can be added during an upgrade operation.
    84  func (rollingUpdateStrategy *rollingUpdateStrategy) Surge(desiredReplicaCount int) (int, error) {
    85  	if rollingUpdateStrategy.MaxSurge == nil {
    86  		return 1, nil
    87  	}
    88  
    89  	return intstr.GetScaledValueFromIntOrPercent(rollingUpdateStrategy.MaxSurge, desiredReplicaCount, true)
    90  }
    91  
    92  // maxUnavailable calculates the maximum number of replicas which can be unavailable at any time.
    93  func (rollingUpdateStrategy *rollingUpdateStrategy) maxUnavailable(desiredReplicaCount int) (int, error) {
    94  	if rollingUpdateStrategy.MaxUnavailable != nil {
    95  		val, err := intstr.GetScaledValueFromIntOrPercent(rollingUpdateStrategy.MaxUnavailable, desiredReplicaCount, false)
    96  		if err != nil {
    97  			return 0, errors.Wrap(err, "failed to get scaled value or int from maxUnavailable")
    98  		}
    99  
   100  		return val, nil
   101  	}
   102  
   103  	return 0, nil
   104  }
   105  
   106  // SelectMachinesToDelete selects the machines to delete based on the machine state, desired replica count, and
   107  // the DeletePolicy.
   108  func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx context.Context, desiredReplicaCount int32, machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) ([]infrav1exp.AzureMachinePoolMachine, error) {
   109  	ctx, _, done := tele.StartSpanWithLogger(
   110  		ctx,
   111  		"strategies.rollingUpdateStrategy.SelectMachinesToDelete",
   112  	)
   113  	defer done()
   114  
   115  	maxUnavailable, err := rollingUpdateStrategy.maxUnavailable(int(desiredReplicaCount))
   116  	if err != nil {
   117  		return nil, err
   118  	}
   119  
   120  	var (
   121  		order = func() func(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   122  			switch rollingUpdateStrategy.DeletePolicy {
   123  			case infrav1exp.OldestDeletePolicyType:
   124  				return orderByOldest
   125  			case infrav1exp.NewestDeletePolicyType:
   126  				return orderByNewest
   127  			default:
   128  				return orderRandom
   129  			}
   130  		}()
   131  		log                        = ctrl.LoggerFrom(ctx).V(4)
   132  		deleteAnnotatedMachines    = order(getDeleteAnnotatedMachines(machinesByProviderID))
   133  		failedMachines             = order(getFailedMachines(machinesByProviderID))
   134  		deletingMachines           = order(getDeletingMachines(machinesByProviderID))
   135  		readyMachines              = order(getReadyMachines(machinesByProviderID))
   136  		machinesWithoutLatestModel = order(getMachinesWithoutLatestModel(machinesByProviderID))
   137  		overProvisionCount         = len(readyMachines) - int(desiredReplicaCount)
   138  		disruptionBudget           = func() int {
   139  			if maxUnavailable > int(desiredReplicaCount) {
   140  				return int(desiredReplicaCount)
   141  			}
   142  
   143  			return len(readyMachines) - int(desiredReplicaCount) + maxUnavailable
   144  		}()
   145  	)
   146  
   147  	// Order AzureMachinePoolMachines with the clusterv1.DeleteMachineAnnotation to the front so that they have delete priority.
   148  	// This allows MachinePool Machines to work with the autoscaler.
   149  	failedMachines = orderByDeleteMachineAnnotation(failedMachines)
   150  	deletingMachines = orderByDeleteMachineAnnotation(deletingMachines)
   151  
   152  	log.Info("selecting machines to delete",
   153  		"readyMachines", len(readyMachines),
   154  		"desiredReplicaCount", desiredReplicaCount,
   155  		"maxUnavailable", maxUnavailable,
   156  		"disruptionBudget", disruptionBudget,
   157  		"machinesWithoutTheLatestModel", len(machinesWithoutLatestModel),
   158  		"deleteAnnotatedMachines", len(deleteAnnotatedMachines),
   159  		"failedMachines", len(failedMachines),
   160  		"deletingMachines", len(deletingMachines),
   161  	)
   162  
   163  	// if we have failed or deleting machines, remove them
   164  	if len(failedMachines) > 0 || len(deletingMachines) > 0 {
   165  		log.Info("failed or deleting machines", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "failedMachines", getProviderIDs(failedMachines), "deletingMachines", getProviderIDs(deletingMachines))
   166  		return append(failedMachines, deletingMachines...), nil
   167  	}
   168  
   169  	// if we have machines annotated with delete machine, remove them
   170  	if len(deleteAnnotatedMachines) > 0 {
   171  		log.Info("delete annotated machines", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "deleteAnnotatedMachines", getProviderIDs(deleteAnnotatedMachines))
   172  		return deleteAnnotatedMachines, nil
   173  	}
   174  
   175  	// if we have not yet reached our desired count, don't try to delete anything
   176  	if len(readyMachines) < int(desiredReplicaCount) {
   177  		log.Info("not enough ready machines", "desiredReplicaCount", desiredReplicaCount, "readyMachinesCount", len(readyMachines), "machinesByProviderID", len(machinesByProviderID))
   178  		return []infrav1exp.AzureMachinePoolMachine{}, nil
   179  	}
   180  
   181  	// we have too many machines, let's choose the oldest to remove
   182  	if overProvisionCount > 0 {
   183  		var toDelete []infrav1exp.AzureMachinePoolMachine
   184  		log.Info("over-provisioned", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "machinesWithoutLatestModel", getProviderIDs(machinesWithoutLatestModel))
   185  		// we are over-provisioned try to remove old models
   186  		for _, v := range machinesWithoutLatestModel {
   187  			if len(toDelete) >= overProvisionCount {
   188  				return toDelete, nil
   189  			}
   190  
   191  			toDelete = append(toDelete, v)
   192  		}
   193  
   194  		log.Info("over-provisioned ready", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "readyMachines", getProviderIDs(readyMachines))
   195  		// remove ready machines
   196  		for _, v := range readyMachines {
   197  			if len(toDelete) >= overProvisionCount {
   198  				return toDelete, nil
   199  			}
   200  
   201  			toDelete = append(toDelete, v)
   202  		}
   203  
   204  		return toDelete, nil
   205  	}
   206  
   207  	if len(machinesWithoutLatestModel) == 0 {
   208  		log.Info("nothing more to do since all the AzureMachinePoolMachine(s) are the latest model and not over-provisioned")
   209  		return []infrav1exp.AzureMachinePoolMachine{}, nil
   210  	}
   211  
   212  	if disruptionBudget <= 0 {
   213  		log.Info("exit early since disruption budget is less than or equal to zero", "disruptionBudget", disruptionBudget, "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines))
   214  		return []infrav1exp.AzureMachinePoolMachine{}, nil
   215  	}
   216  
   217  	var toDelete []infrav1exp.AzureMachinePoolMachine
   218  	log.Info("removing ready machines within disruption budget", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines))
   219  	for _, v := range readyMachines {
   220  		if len(toDelete) >= disruptionBudget {
   221  			return toDelete, nil
   222  		}
   223  
   224  		if !v.Status.LatestModelApplied {
   225  			toDelete = append(toDelete, v)
   226  		}
   227  	}
   228  
   229  	log.Info("completed without filling toDelete", "toDelete", getProviderIDs(toDelete), "numToDelete", len(toDelete))
   230  	return toDelete, nil
   231  }
   232  
   233  func getDeleteAnnotatedMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   234  	var machines []infrav1exp.AzureMachinePoolMachine
   235  	for _, v := range machinesByProviderID {
   236  		if v.Annotations != nil {
   237  			if _, hasDeleteAnnotation := v.Annotations[clusterv1.DeleteMachineAnnotation]; hasDeleteAnnotation {
   238  				machines = append(machines, v)
   239  			}
   240  		}
   241  	}
   242  	return machines
   243  }
   244  
   245  func getFailedMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   246  	var machines []infrav1exp.AzureMachinePoolMachine
   247  	for _, v := range machinesByProviderID {
   248  		// ready status, with provisioning state Succeeded, and not marked for delete
   249  		if v.Status.ProvisioningState != nil && *v.Status.ProvisioningState == infrav1.Failed {
   250  			machines = append(machines, v)
   251  		}
   252  	}
   253  
   254  	return machines
   255  }
   256  
   257  // getDeletingMachines is responsible for identifying machines whose VMs are in an active state of deletion
   258  // but whose corresponding AzureMachinePoolMachine resource has not yet been marked for deletion.
   259  func getDeletingMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   260  	var machines []infrav1exp.AzureMachinePoolMachine
   261  	for _, v := range machinesByProviderID {
   262  		if v.Status.ProvisioningState != nil &&
   263  			// provisioning state is Deleting
   264  			*v.Status.ProvisioningState == infrav1.Deleting &&
   265  			// Ensure that the machine has not already been marked for deletion
   266  			v.DeletionTimestamp.IsZero() {
   267  			machines = append(machines, v)
   268  		}
   269  	}
   270  
   271  	return machines
   272  }
   273  
   274  func getReadyMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   275  	var readyMachines []infrav1exp.AzureMachinePoolMachine
   276  	for _, v := range machinesByProviderID {
   277  		// ready status, with provisioning state Succeeded, and not marked for delete
   278  		if v.Status.Ready &&
   279  			(v.Status.ProvisioningState != nil && *v.Status.ProvisioningState == infrav1.Succeeded) &&
   280  			// Don't include machines that have already been marked for delete
   281  			v.DeletionTimestamp.IsZero() &&
   282  			// Don't include machines whose VMs are in an active state of deleting
   283  			*v.Status.ProvisioningState != infrav1.Deleting {
   284  			readyMachines = append(readyMachines, v)
   285  		}
   286  	}
   287  
   288  	return readyMachines
   289  }
   290  
   291  func getMachinesWithoutLatestModel(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   292  	var machinesWithLatestModel []infrav1exp.AzureMachinePoolMachine
   293  	for _, v := range machinesByProviderID {
   294  		if !v.Status.LatestModelApplied {
   295  			machinesWithLatestModel = append(machinesWithLatestModel, v)
   296  		}
   297  	}
   298  
   299  	return machinesWithLatestModel
   300  }
   301  
   302  func orderByNewest(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   303  	sort.Slice(machines, func(i, j int) bool {
   304  		return machines[i].ObjectMeta.CreationTimestamp.After(machines[j].ObjectMeta.CreationTimestamp.Time)
   305  	})
   306  
   307  	return machines
   308  }
   309  
   310  func orderByOldest(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   311  	sort.Slice(machines, func(i, j int) bool {
   312  		return machines[j].ObjectMeta.CreationTimestamp.After(machines[i].ObjectMeta.CreationTimestamp.Time)
   313  	})
   314  
   315  	return machines
   316  }
   317  
   318  func orderRandom(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   319  	//nolint:gosec // We don't need a cryptographically appropriate random number here
   320  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
   321  	r.Shuffle(len(machines), func(i, j int) { machines[i], machines[j] = machines[j], machines[i] })
   322  	return machines
   323  }
   324  
   325  // orderByDeleteMachineAnnotation will sort AzureMachinePoolMachines with the clusterv1.DeleteMachineAnnotation to the front of the list.
   326  // It will preserve the existing order of the list otherwise so that it respects the existing delete priority otherwise.
   327  func orderByDeleteMachineAnnotation(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   328  	sort.SliceStable(machines, func(i, j int) bool {
   329  		_, iHasAnnotation := machines[i].Annotations[clusterv1.DeleteMachineAnnotation]
   330  
   331  		return iHasAnnotation
   332  	})
   333  
   334  	return machines
   335  }
   336  
   337  func getProviderIDs(machines []infrav1exp.AzureMachinePoolMachine) []string {
   338  	ids := make([]string, len(machines))
   339  	for i, machine := range machines {
   340  		ids[i] = machine.Spec.ProviderID
   341  	}
   342  
   343  	return ids
   344  }