sigs.k8s.io/cluster-api-provider-azure@v1.14.3/azure/scope/strategies/machinepool_deployments/machinepool_deployment_strategy.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package machinepool
    18  
    19  import (
    20  	"context"
    21  	"math/rand"
    22  	"sort"
    23  	"time"
    24  
    25  	"github.com/pkg/errors"
    26  	"k8s.io/apimachinery/pkg/util/intstr"
    27  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    28  	infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
    29  	"sigs.k8s.io/cluster-api-provider-azure/util/tele"
    30  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    31  	ctrl "sigs.k8s.io/controller-runtime"
    32  )
    33  
    34  type (
    35  	// Surger is the ability to surge a number of replica.
    36  	Surger interface {
    37  		Surge(desiredReplicaCount int) (int, error)
    38  	}
    39  
    40  	// DeleteSelector is the ability to select nodes to be delete with respect to a desired number of replicas.
    41  	DeleteSelector interface {
    42  		SelectMachinesToDelete(ctx context.Context, desiredReplicas int32, machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) ([]infrav1exp.AzureMachinePoolMachine, error)
    43  	}
    44  
    45  	// TypedDeleteSelector is the ability to select nodes to be deleted with respect to a desired number of nodes, and
    46  	// the ability to describe the underlying type of the deployment strategy.
    47  	TypedDeleteSelector interface {
    48  		DeleteSelector
    49  		Type() infrav1exp.AzureMachinePoolDeploymentStrategyType
    50  	}
    51  
    52  	rollingUpdateStrategy struct {
    53  		infrav1exp.MachineRollingUpdateDeployment
    54  	}
    55  )
    56  
    57  // NewMachinePoolDeploymentStrategy constructs a strategy implementation described in the AzureMachinePoolDeploymentStrategy
    58  // specification.
    59  func NewMachinePoolDeploymentStrategy(strategy infrav1exp.AzureMachinePoolDeploymentStrategy) TypedDeleteSelector {
    60  	switch strategy.Type {
    61  	case infrav1exp.RollingUpdateAzureMachinePoolDeploymentStrategyType:
    62  		rollingUpdate := strategy.RollingUpdate
    63  		if rollingUpdate == nil {
    64  			rollingUpdate = &infrav1exp.MachineRollingUpdateDeployment{}
    65  		}
    66  
    67  		return &rollingUpdateStrategy{
    68  			MachineRollingUpdateDeployment: *rollingUpdate,
    69  		}
    70  	default:
    71  		// default to a rolling update strategy if unknown type
    72  		return &rollingUpdateStrategy{
    73  			MachineRollingUpdateDeployment: infrav1exp.MachineRollingUpdateDeployment{},
    74  		}
    75  	}
    76  }
    77  
    78  // Type is the AzureMachinePoolDeploymentStrategyType for the strategy.
    79  func (rollingUpdateStrategy *rollingUpdateStrategy) Type() infrav1exp.AzureMachinePoolDeploymentStrategyType {
    80  	return infrav1exp.RollingUpdateAzureMachinePoolDeploymentStrategyType
    81  }
    82  
    83  // Surge calculates the number of replicas that can be added during an upgrade operation.
    84  func (rollingUpdateStrategy *rollingUpdateStrategy) Surge(desiredReplicaCount int) (int, error) {
    85  	if rollingUpdateStrategy.MaxSurge == nil {
    86  		return 1, nil
    87  	}
    88  
    89  	return intstr.GetScaledValueFromIntOrPercent(rollingUpdateStrategy.MaxSurge, desiredReplicaCount, true)
    90  }
    91  
    92  // maxUnavailable calculates the maximum number of replicas which can be unavailable at any time.
    93  func (rollingUpdateStrategy *rollingUpdateStrategy) maxUnavailable(desiredReplicaCount int) (int, error) {
    94  	if rollingUpdateStrategy.MaxUnavailable != nil {
    95  		val, err := intstr.GetScaledValueFromIntOrPercent(rollingUpdateStrategy.MaxUnavailable, desiredReplicaCount, false)
    96  		if err != nil {
    97  			return 0, errors.Wrap(err, "failed to get scaled value or int from maxUnavailable")
    98  		}
    99  
   100  		return val, nil
   101  	}
   102  
   103  	return 0, nil
   104  }
   105  
   106  // SelectMachinesToDelete selects the machines to delete based on the machine state, desired replica count, and
   107  // the DeletePolicy.
   108  func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx context.Context, desiredReplicaCount int32, machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) ([]infrav1exp.AzureMachinePoolMachine, error) {
   109  	ctx, _, done := tele.StartSpanWithLogger(
   110  		ctx,
   111  		"strategies.rollingUpdateStrategy.SelectMachinesToDelete",
   112  	)
   113  	defer done()
   114  
   115  	maxUnavailable, err := rollingUpdateStrategy.maxUnavailable(int(desiredReplicaCount))
   116  	if err != nil {
   117  		return nil, err
   118  	}
   119  
   120  	var (
   121  		order = func() func(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   122  			switch rollingUpdateStrategy.DeletePolicy {
   123  			case infrav1exp.OldestDeletePolicyType:
   124  				return orderByOldest
   125  			case infrav1exp.NewestDeletePolicyType:
   126  				return orderByNewest
   127  			default:
   128  				return orderRandom
   129  			}
   130  		}()
   131  		log                        = ctrl.LoggerFrom(ctx).V(4)
   132  		failedMachines             = order(getFailedMachines(machinesByProviderID))
   133  		deletingMachines           = order(getDeletingMachines(machinesByProviderID))
   134  		readyMachines              = order(getReadyMachines(machinesByProviderID))
   135  		machinesWithoutLatestModel = order(getMachinesWithoutLatestModel(machinesByProviderID))
   136  		overProvisionCount         = len(readyMachines) - int(desiredReplicaCount)
   137  		disruptionBudget           = func() int {
   138  			if maxUnavailable > int(desiredReplicaCount) {
   139  				return int(desiredReplicaCount)
   140  			}
   141  
   142  			return len(readyMachines) - int(desiredReplicaCount) + maxUnavailable
   143  		}()
   144  	)
   145  
   146  	// Order AzureMachinePoolMachines with the clutserv1.DeleteMachineAnnotation to the front so that they have delete priority.
   147  	// This allows MachinePool Machines to work with the autoscaler.
   148  	failedMachines = orderByDeleteMachineAnnotation(failedMachines)
   149  	deletingMachines = orderByDeleteMachineAnnotation(deletingMachines)
   150  	readyMachines = orderByDeleteMachineAnnotation(readyMachines)
   151  	machinesWithoutLatestModel = orderByDeleteMachineAnnotation(machinesWithoutLatestModel)
   152  
   153  	log.Info("selecting machines to delete",
   154  		"readyMachines", len(readyMachines),
   155  		"desiredReplicaCount", desiredReplicaCount,
   156  		"maxUnavailable", maxUnavailable,
   157  		"disruptionBudget", disruptionBudget,
   158  		"machinesWithoutTheLatestModel", len(machinesWithoutLatestModel),
   159  		"failedMachines", len(failedMachines),
   160  		"deletingMachines", len(deletingMachines),
   161  	)
   162  
   163  	// if we have failed or deleting machines, remove them
   164  	if len(failedMachines) > 0 || len(deletingMachines) > 0 {
   165  		log.Info("failed or deleting machines", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "failedMachines", getProviderIDs(failedMachines), "deletingMachines", getProviderIDs(deletingMachines))
   166  		return append(failedMachines, deletingMachines...), nil
   167  	}
   168  
   169  	// if we have not yet reached our desired count, don't try to delete anything
   170  	if len(readyMachines) < int(desiredReplicaCount) {
   171  		log.Info("not enough ready machines", "desiredReplicaCount", desiredReplicaCount, "readyMachinesCount", len(readyMachines), "machinesByProviderID", len(machinesByProviderID))
   172  		return []infrav1exp.AzureMachinePoolMachine{}, nil
   173  	}
   174  
   175  	// we have too many machines, let's choose the oldest to remove
   176  	if overProvisionCount > 0 {
   177  		var toDelete []infrav1exp.AzureMachinePoolMachine
   178  		log.Info("over-provisioned", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "machinesWithoutLatestModel", getProviderIDs(machinesWithoutLatestModel))
   179  		// we are over-provisioned try to remove old models
   180  		for _, v := range machinesWithoutLatestModel {
   181  			if len(toDelete) >= overProvisionCount {
   182  				return toDelete, nil
   183  			}
   184  
   185  			toDelete = append(toDelete, v)
   186  		}
   187  
   188  		log.Info("over-provisioned ready", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "readyMachines", getProviderIDs(readyMachines))
   189  		// remove ready machines
   190  		for _, v := range readyMachines {
   191  			if len(toDelete) >= overProvisionCount {
   192  				return toDelete, nil
   193  			}
   194  
   195  			toDelete = append(toDelete, v)
   196  		}
   197  
   198  		return toDelete, nil
   199  	}
   200  
   201  	if len(machinesWithoutLatestModel) == 0 {
   202  		log.Info("nothing more to do since all the AzureMachinePoolMachine(s) are the latest model and not over-provisioned")
   203  		return []infrav1exp.AzureMachinePoolMachine{}, nil
   204  	}
   205  
   206  	if disruptionBudget <= 0 {
   207  		log.Info("exit early since disruption budget is less than or equal to zero", "disruptionBudget", disruptionBudget, "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines))
   208  		return []infrav1exp.AzureMachinePoolMachine{}, nil
   209  	}
   210  
   211  	var toDelete []infrav1exp.AzureMachinePoolMachine
   212  	log.Info("removing ready machines within disruption budget", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines))
   213  	for _, v := range readyMachines {
   214  		if len(toDelete) >= disruptionBudget {
   215  			return toDelete, nil
   216  		}
   217  
   218  		if !v.Status.LatestModelApplied {
   219  			toDelete = append(toDelete, v)
   220  		}
   221  	}
   222  
   223  	log.Info("completed without filling toDelete", "toDelete", getProviderIDs(toDelete), "numToDelete", len(toDelete))
   224  	return toDelete, nil
   225  }
   226  
   227  func getFailedMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   228  	var machines []infrav1exp.AzureMachinePoolMachine
   229  	for _, v := range machinesByProviderID {
   230  		// ready status, with provisioning state Succeeded, and not marked for delete
   231  		if v.Status.ProvisioningState != nil && *v.Status.ProvisioningState == infrav1.Failed {
   232  			machines = append(machines, v)
   233  		}
   234  	}
   235  
   236  	return machines
   237  }
   238  
   239  // getDeletingMachines is responsible for identifying machines whose VMs are in an active state of deletion
   240  // but whose corresponding AzureMachinePoolMachine resource has not yet been marked for deletion.
   241  func getDeletingMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   242  	var machines []infrav1exp.AzureMachinePoolMachine
   243  	for _, v := range machinesByProviderID {
   244  		if v.Status.ProvisioningState != nil &&
   245  			// provisioning state is Deleting
   246  			*v.Status.ProvisioningState == infrav1.Deleting &&
   247  			// Ensure that the machine has not already been marked for deletion
   248  			v.DeletionTimestamp.IsZero() {
   249  			machines = append(machines, v)
   250  		}
   251  	}
   252  
   253  	return machines
   254  }
   255  
   256  func getReadyMachines(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   257  	var readyMachines []infrav1exp.AzureMachinePoolMachine
   258  	for _, v := range machinesByProviderID {
   259  		// ready status, with provisioning state Succeeded, and not marked for delete
   260  		if v.Status.Ready &&
   261  			(v.Status.ProvisioningState != nil && *v.Status.ProvisioningState == infrav1.Succeeded) &&
   262  			// Don't include machines that have already been marked for delete
   263  			v.DeletionTimestamp.IsZero() &&
   264  			// Don't include machines whose VMs are in an active state of deleting
   265  			*v.Status.ProvisioningState != infrav1.Deleting {
   266  			readyMachines = append(readyMachines, v)
   267  		}
   268  	}
   269  
   270  	return readyMachines
   271  }
   272  
   273  func getMachinesWithoutLatestModel(machinesByProviderID map[string]infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   274  	var machinesWithLatestModel []infrav1exp.AzureMachinePoolMachine
   275  	for _, v := range machinesByProviderID {
   276  		if !v.Status.LatestModelApplied {
   277  			machinesWithLatestModel = append(machinesWithLatestModel, v)
   278  		}
   279  	}
   280  
   281  	return machinesWithLatestModel
   282  }
   283  
   284  func orderByNewest(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   285  	sort.Slice(machines, func(i, j int) bool {
   286  		return machines[i].ObjectMeta.CreationTimestamp.After(machines[j].ObjectMeta.CreationTimestamp.Time)
   287  	})
   288  
   289  	return machines
   290  }
   291  
   292  func orderByOldest(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   293  	sort.Slice(machines, func(i, j int) bool {
   294  		return machines[j].ObjectMeta.CreationTimestamp.After(machines[i].ObjectMeta.CreationTimestamp.Time)
   295  	})
   296  
   297  	return machines
   298  }
   299  
   300  func orderRandom(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   301  	//nolint:gosec // We don't need a cryptographically appropriate random number here
   302  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
   303  	r.Shuffle(len(machines), func(i, j int) { machines[i], machines[j] = machines[j], machines[i] })
   304  	return machines
   305  }
   306  
   307  // orderByDeleteMachineAnnotation will sort AzureMachinePoolMachines with the clusterv1.DeleteMachineAnnotation to the front of the list.
   308  // It will preserve the existing order of the list otherwise so that it respects the existing delete priority otherwise.
   309  func orderByDeleteMachineAnnotation(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
   310  	sort.SliceStable(machines, func(i, j int) bool {
   311  		_, iHasAnnotation := machines[i].Annotations[clusterv1.DeleteMachineAnnotation]
   312  
   313  		return iHasAnnotation
   314  	})
   315  
   316  	return machines
   317  }
   318  
   319  func getProviderIDs(machines []infrav1exp.AzureMachinePoolMachine) []string {
   320  	ids := make([]string, len(machines))
   321  	for i, machine := range machines {
   322  		ids[i] = machine.Spec.ProviderID
   323  	}
   324  
   325  	return ids
   326  }