github.com/smithx10/nomad@v0.9.1-rc1/scheduler/reconcile.go

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"sort"
     8  
     9  	log "github.com/hashicorp/go-hclog"
    10  
    11  	"github.com/hashicorp/nomad/helper"
    12  	"github.com/hashicorp/nomad/helper/uuid"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  )
    15  
    16  const (
    17  	// batchedFailedAllocWindowSize is the window size used
    18  	// to batch up failed allocations before creating an eval
    19  	batchedFailedAllocWindowSize = 5 * time.Second
    20  
    21  	// rescheduleWindowSize is the window size relative to
    22  	// current time within which reschedulable allocations are placed.
    23  	// This helps protect against small clock drifts between servers
    24  	rescheduleWindowSize = 1 * time.Second
    25  )
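        // As an illustration of the batching window (a hypothetical timeline, not
        // taken from this file): allocations that become eligible for rescheduling
        // at t+1s, t+3s and t+8s are grouped by handleDelayedReschedules into two
        // follow-up evaluations, one waiting until t+1s (covering t+1s and t+3s,
        // which fall within the 5 second window) and one waiting until t+8s.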
    26  
    27  // allocUpdateType takes an existing allocation and a new job definition and
    28  // returns whether the allocation can ignore the change, requires a destructive
    29  // update, or can be inplace updated. If it can be inplace updated, an updated
    30  // allocation that has the new resources and alloc metrics attached will be
    31  // returned.
    32  type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
    33  	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)
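        // A minimal sketch of an allocUpdateType implementation (hypothetical; the
        // update functions actually passed to the reconciler are defined elsewhere
        // in the scheduler package) that treats every change as destructive:
        //
        //	func updateAllDestructive(existing *structs.Allocation, newJob *structs.Job,
        //		newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
        //		return false, true, nil
        //	}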
    34  
    35  // allocReconciler is used to determine the set of allocations that require
    36  // placement, inplace updating or stopping given the job specification and
    37  // existing cluster state. The reconciler should only be used for batch and
    38  // service jobs.
    39  type allocReconciler struct {
    40  	// logger is used to log debug information. Logging should be kept to a
    41  	// minimum here
    42  	logger log.Logger
    43  
    44  	// allocUpdateFn is used to check if the allocation can be upgraded in place
    45  	allocUpdateFn allocUpdateType
    46  
    47  	// batch marks whether the job is a batch job
    48  	batch bool
    49  
    50  	// job is the job being operated on; it may be nil if the job is being
    51  	// stopped via a purge
    52  	job *structs.Job
    53  
    54  	// jobID is the ID of the job being operated on. The job may be nil if it is
    55  	// being stopped, so we require this separately.
    56  	jobID string
    57  
    58  	// oldDeployment is the last deployment for the job
    59  	oldDeployment *structs.Deployment
    60  
    61  	// deployment is the current deployment for the job
    62  	deployment *structs.Deployment
    63  
    64  	// deploymentPaused marks whether the deployment is paused
    65  	deploymentPaused bool
    66  
    67  	// deploymentFailed marks whether the deployment is failed
    68  	deploymentFailed bool
    69  
    70  	// taintedNodes contains a map of nodes that are tainted
    71  	taintedNodes map[string]*structs.Node
    72  
    73  	// existingAllocs is the set of non-terminal existing allocations
    74  	existingAllocs []*structs.Allocation
    75  
    76  	// evalID is the ID of the evaluation that triggered the reconciler
    77  	evalID string
    78  
    79  	// now is the time used when determining rescheduling eligibility.
    80  	// It defaults to time.Now and is overridden in unit tests.
    81  	now time.Time
    82  
    83  	// result holds the results of the reconciliation. During computation it can
    84  	// be used to store intermediate state
    85  	result *reconcileResults
    86  }
    87  
    88  // reconcileResults contains the results of the reconciliation and should be
    89  // applied by the scheduler.
    90  type reconcileResults struct {
    91  	// deployment is the deployment that should be created or updated as a
    92  	// result of scheduling
    93  	deployment *structs.Deployment
    94  
    95  	// deploymentUpdates contains a set of deployment updates that should be
    96  	// applied as a result of scheduling
    97  	deploymentUpdates []*structs.DeploymentStatusUpdate
    98  
    99  	// place is the set of allocations to place by the scheduler
   100  	place []allocPlaceResult
   101  
   102  	// destructiveUpdate is the set of allocations to apply a destructive update to
   103  	destructiveUpdate []allocDestructiveResult
   104  
   105  	// inplaceUpdate is the set of allocations to apply an inplace update to
   106  	inplaceUpdate []*structs.Allocation
   107  
   108  	// stop is the set of allocations to stop
   109  	stop []allocStopResult
   110  
   111  	// attributeUpdates are updates to allocations that do not come from a
   112  	// jobspec change.
   113  	attributeUpdates map[string]*structs.Allocation
   114  
   115  	// desiredTGUpdates captures the desired set of changes to make for each
   116  	// task group.
   117  	desiredTGUpdates map[string]*structs.DesiredUpdates
   118  
   119  	// desiredFollowupEvals is the map of follow-up evaluations to create per task group
   120  	// This is used to create a delayed evaluation for rescheduling failed allocations.
   121  	desiredFollowupEvals map[string][]*structs.Evaluation
   122  }
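        // A rough sketch of how a scheduler might consume these results
        // (illustrative only; real consumers do considerably more bookkeeping):
        //
        //	results := reconciler.Compute()
        //	for _, p := range results.place {
        //		// create an allocation named p.name for task group p.taskGroup
        //	}
        //	for _, s := range results.stop {
        //		// mark s.alloc as stopped with s.statusDescription
        //	}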
   123  
   124  // delayedRescheduleInfo contains the allocation ID and the time at which it is eligible to be rescheduled.
   125  // This is used to create follow-up evaluations.
   126  type delayedRescheduleInfo struct {
   127  
   128  	// allocID is the ID of the allocation eligible to be rescheduled
   129  	allocID string
   130  
   131  	// rescheduleTime is the time to use in the delayed evaluation
   132  	rescheduleTime time.Time
   133  }
   134  
   135  func (r *reconcileResults) GoString() string {
   136  	base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
   137  		len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))
   138  
   139  	if r.deployment != nil {
   140  		base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
   141  	}
   142  	for _, u := range r.deploymentUpdates {
   143  		base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
   144  			u.DeploymentID, u.Status, u.StatusDescription)
   145  	}
   146  	for tg, u := range r.desiredTGUpdates {
   147  		base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
   148  	}
   149  	return base
   150  }
   151  
   152  // Changes returns the number of total changes
   153  func (r *reconcileResults) Changes() int {
   154  	return len(r.place) + len(r.inplaceUpdate) + len(r.stop)
   155  }
   156  
   157  // NewAllocReconciler creates a new reconciler that should be used to determine
   158  // the changes required to bring the cluster state in line with the declared jobspec
   159  func NewAllocReconciler(logger log.Logger, allocUpdateFn allocUpdateType, batch bool,
   160  	jobID string, job *structs.Job, deployment *structs.Deployment,
   161  	existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
   162  	return &allocReconciler{
   163  		logger:         logger.Named("reconciler"),
   164  		allocUpdateFn:  allocUpdateFn,
   165  		batch:          batch,
   166  		jobID:          jobID,
   167  		job:            job,
   168  		deployment:     deployment.Copy(),
   169  		existingAllocs: existingAllocs,
   170  		taintedNodes:   taintedNodes,
   171  		evalID:         evalID,
   172  		now:            time.Now(),
   173  		result: &reconcileResults{
   174  			desiredTGUpdates:     make(map[string]*structs.DesiredUpdates),
   175  			desiredFollowupEvals: make(map[string][]*structs.Evaluation),
   176  		},
   177  	}
   178  }
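        // Typical construction and use (an illustrative sketch; logger, updateFn,
        // job, deployment, allocs, tainted and eval are assumed to exist in the
        // caller and are not defined in this file):
        //
        //	reconciler := NewAllocReconciler(logger, updateFn, job.Type == structs.JobTypeBatch,
        //		job.ID, job, deployment, allocs, tainted, eval.ID)
        //	results := reconciler.Compute()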
   179  
   180  // Compute reconciles the existing cluster state and returns the set of changes
   181  // required to converge the job spec and state
   182  func (a *allocReconciler) Compute() *reconcileResults {
   183  	// Create the allocation matrix
   184  	m := newAllocMatrix(a.job, a.existingAllocs)
   185  
   186  	// Handle stopping unneeded deployments
   187  	a.cancelDeployments()
   188  
   189  	// If we are just stopping a job we do not need to do anything more than
   190  	// stopping all running allocs
   191  	if a.job.Stopped() {
   192  		a.handleStop(m)
   193  		return a.result
   194  	}
   195  
   196  	// Detect if the deployment is paused
   197  	if a.deployment != nil {
   198  		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
   199  		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
   200  	}
   201  
   202  	// Reconcile each group
   203  	complete := true
   204  	for group, as := range m {
   205  		groupComplete := a.computeGroup(group, as)
   206  		complete = complete && groupComplete
   207  	}
   208  
   209  	// Mark the deployment as complete if possible
   210  	if a.deployment != nil && complete {
   211  		a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   212  			DeploymentID:      a.deployment.ID,
   213  			Status:            structs.DeploymentStatusSuccessful,
   214  			StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
   215  		})
   216  	}
   217  
   218  	// Set the description of a created deployment
   219  	if d := a.result.deployment; d != nil {
   220  		if d.RequiresPromotion() {
   221  			d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
   222  		}
   223  	}
   224  
   225  	return a.result
   226  }
   227  
   228  // cancelDeployments cancels any deployment that is not needed
   229  func (a *allocReconciler) cancelDeployments() {
   230  	// If the job is stopped and there is a non-terminal deployment, cancel it
   231  	if a.job.Stopped() {
   232  		if a.deployment != nil && a.deployment.Active() {
   233  			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   234  				DeploymentID:      a.deployment.ID,
   235  				Status:            structs.DeploymentStatusCancelled,
   236  				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
   237  			})
   238  		}
   239  
   240  		// Nothing else to do
   241  		a.oldDeployment = a.deployment
   242  		a.deployment = nil
   243  		return
   244  	}
   245  
   246  	d := a.deployment
   247  	if d == nil {
   248  		return
   249  	}
   250  
   251  	// If the deployment is active but references an older job, cancel it
   252  	if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
   253  		if d.Active() {
   254  			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   255  				DeploymentID:      a.deployment.ID,
   256  				Status:            structs.DeploymentStatusCancelled,
   257  				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
   258  			})
   259  		}
   260  
   261  		a.oldDeployment = d
   262  		a.deployment = nil
   263  	}
   264  
   265  	// Clear it as the current deployment if it is successful
   266  	if d.Status == structs.DeploymentStatusSuccessful {
   267  		a.oldDeployment = d
   268  		a.deployment = nil
   269  	}
   270  }
   271  
   272  // handleStop marks all allocations to be stopped, handling the lost case
   273  func (a *allocReconciler) handleStop(m allocMatrix) {
   274  	for group, as := range m {
   275  		as = filterByTerminal(as)
   276  		untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
   277  		a.markStop(untainted, "", allocNotNeeded)
   278  		a.markStop(migrate, "", allocNotNeeded)
   279  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   280  		desiredChanges := new(structs.DesiredUpdates)
   281  		desiredChanges.Stop = uint64(len(as))
   282  		a.result.desiredTGUpdates[group] = desiredChanges
   283  	}
   284  }
   285  
   286  // markStop is a helper for marking a set of allocations to be stopped with a
   287  // particular client status and description.
   288  func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
   289  	for _, alloc := range allocs {
   290  		a.result.stop = append(a.result.stop, allocStopResult{
   291  			alloc:             alloc,
   292  			clientStatus:      clientStatus,
   293  			statusDescription: statusDescription,
   294  		})
   295  	}
   296  }
   297  
   298  // computeGroup reconciles state for a particular task group. It returns whether
   299  // the associated deployment is complete with regard to the task group.
   300  func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
   301  	// Create the desired update object for the group
   302  	desiredChanges := new(structs.DesiredUpdates)
   303  	a.result.desiredTGUpdates[group] = desiredChanges
   304  
   305  	// Get the task group. The task group may be nil if the job was updated such
   306  	// that the task group no longer exists
   307  	tg := a.job.LookupTaskGroup(group)
   308  
   309  	// If the task group is nil, then the task group has been removed so all we
   310  	// need to do is stop everything
   311  	if tg == nil {
   312  		untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
   313  		a.markStop(untainted, "", allocNotNeeded)
   314  		a.markStop(migrate, "", allocNotNeeded)
   315  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   316  		desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
   317  		return true
   318  	}
   319  
   320  	// Get the deployment state for the group
   321  	var dstate *structs.DeploymentState
   322  	existingDeployment := false
   323  	if a.deployment != nil {
   324  		dstate, existingDeployment = a.deployment.TaskGroups[group]
   325  	}
   326  	if !existingDeployment {
   327  		dstate = &structs.DeploymentState{}
   328  		if tg.Update != nil {
   329  			dstate.AutoRevert = tg.Update.AutoRevert
   330  			dstate.ProgressDeadline = tg.Update.ProgressDeadline
   331  		}
   332  	}
   333  
   334  	// Filter allocations that do not need to be considered because they are
   335  	// from an older job version and are terminal.
   336  	all, ignore := a.filterOldTerminalAllocs(all)
   337  	desiredChanges.Ignore += uint64(len(ignore))
   338  
   339  	// canaries is the set of canaries for the current deployment, and all is the
   340  	// set of all allocs including the canaries
   341  	canaries, all := a.handleGroupCanaries(all, desiredChanges)
   342  
   343  	// Determine what set of allocations are on tainted nodes
   344  	untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
   345  
   346  	// Determine what set of terminal allocations need to be rescheduled
   347  	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID, a.deployment)
   348  
   349  	// Create batched follow up evaluations for allocations that are
   350  	// reschedulable later and mark the allocations for in place updating
   351  	a.handleDelayedReschedules(rescheduleLater, all, tg.Name)
   352  
   353  	// Create a structure for choosing names. Seed it with the taken names, which is
   354  	// the union of the untainted, migrating, and reschedule-now allocations (includes canaries)
   355  	nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))
   356  
   357  	// Stop any unneeded allocations and update the untainted set to not
   358  	// include stopped allocations.
   359  	canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
   360  	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
   361  	desiredChanges.Stop += uint64(len(stop))
   362  	untainted = untainted.difference(stop)
   363  
   364  	// Do inplace upgrades where possible and capture the set of upgrades that
   365  	// need to be done destructively.
   366  	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
   367  	desiredChanges.Ignore += uint64(len(ignore))
   368  	desiredChanges.InPlaceUpdate += uint64(len(inplace))
   369  	if !existingDeployment {
   370  		dstate.DesiredTotal += len(destructive) + len(inplace)
   371  	}
   372  
   373  	// Remove the canaries now that we have handled rescheduling so that we do
   374  	// not consider them when making placement decisions.
   375  	if canaryState {
   376  		untainted = untainted.difference(canaries)
   377  	}
   378  
   379  	// Having destructive updates and fewer canaries than desired means we need
   380  	// to create canaries
   381  	numDestructive := len(destructive)
   382  	strategy := tg.Update
   383  	canariesPromoted := dstate != nil && dstate.Promoted
   384  	requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
   385  	if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
   386  		number := strategy.Canary - len(canaries)
   387  		desiredChanges.Canary += uint64(number)
   388  		if !existingDeployment {
   389  			dstate.DesiredCanaries = strategy.Canary
   390  		}
   391  
   392  		for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
   393  			a.result.place = append(a.result.place, allocPlaceResult{
   394  				name:      name,
   395  				canary:    true,
   396  				taskGroup: tg,
   397  			})
   398  		}
   399  	}
   400  
   401  	// Determine how many we can place
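        	// canaryState is recomputed because the block above may have just set the
        	// desired canaries for a new deployment; while canarying, computeLimit
        	// returns a limit of 0.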
   402  	canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
   403  	limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)
   404  
   405  	// Place if:
   406  	// * The deployment is not paused or failed
   407  	// * We are not placing any canaries
   408  	// * Any existing canaries have been promoted
   409  	place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
   410  	if !existingDeployment {
   411  		dstate.DesiredTotal += len(place)
   412  	}
   413  
   414  	// deploymentPlaceReady tracks whether the deployment is in a state where
   415  	// placements can be made without any other consideration.
   416  	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState
   417  
   418  	if deploymentPlaceReady {
   419  		desiredChanges.Place += uint64(len(place))
   420  		for _, p := range place {
   421  			a.result.place = append(a.result.place, p)
   422  		}
   423  
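        		// Consume the placement limit so that the destructive updates handled
        		// below stay within the update strategy's MaxParallel.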
   424  		min := helper.IntMin(len(place), limit)
   425  		limit -= min
   426  	} else if !deploymentPlaceReady {
   427  		// We do not want to place additional allocations, but if we have lost
   428  		// allocations or allocations that require rescheduling now, we place
   429  		// them regardless to avoid odd user experiences.
   430  		if len(lost) != 0 {
   431  			allowed := helper.IntMin(len(lost), len(place))
   432  			desiredChanges.Place += uint64(allowed)
   433  			for _, p := range place[:allowed] {
   434  				a.result.place = append(a.result.place, p)
   435  			}
   436  		}
   437  
   438  		// Handle rescheduling of failed allocations even if the deployment is
   439  		// failed. We do not reschedule if the allocation is part of the failed
   440  		// deployment.
   441  		if now := len(rescheduleNow); now != 0 {
   442  			for _, p := range place {
   443  				prev := p.PreviousAllocation()
   444  				if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
   445  					a.result.place = append(a.result.place, p)
   446  					desiredChanges.Place++
   447  				}
   448  			}
   449  		}
   450  	}
   451  
   452  	if deploymentPlaceReady {
   453  		// Do all destructive updates
   454  		min := helper.IntMin(len(destructive), limit)
   455  		desiredChanges.DestructiveUpdate += uint64(min)
   456  		desiredChanges.Ignore += uint64(len(destructive) - min)
   457  		for _, alloc := range destructive.nameOrder()[:min] {
   458  			a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
   459  				placeName:             alloc.Name,
   460  				placeTaskGroup:        tg,
   461  				stopAlloc:             alloc,
   462  				stopStatusDescription: allocUpdating,
   463  			})
   464  		}
   465  	} else {
   466  		desiredChanges.Ignore += uint64(len(destructive))
   467  	}
   468  
   469  	// Migrate all the allocations
   470  	desiredChanges.Migrate += uint64(len(migrate))
   471  	for _, alloc := range migrate.nameOrder() {
   472  		a.result.stop = append(a.result.stop, allocStopResult{
   473  			alloc:             alloc,
   474  			statusDescription: allocMigrating,
   475  		})
   476  		a.result.place = append(a.result.place, allocPlaceResult{
   477  			name:          alloc.Name,
   478  			canary:        false,
   479  			taskGroup:     tg,
   480  			previousAlloc: alloc,
   481  		})
   482  	}
   483  
   484  	// Create new deployment if:
   485  	// 1. Updating a job specification
   486  	// 2. No running allocations (first time running a job)
   487  	updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
   488  	hadRunning := false
   489  	for _, alloc := range all {
   490  		if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex {
   491  			hadRunning = true
   492  			break
   493  		}
   494  	}
   495  
   496  	// Create a new deployment if necessary
   497  	if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
   498  		// A previous group may have made the deployment already
   499  		if a.deployment == nil {
   500  			a.deployment = structs.NewDeployment(a.job)
   501  			a.result.deployment = a.deployment
   502  		}
   503  
   504  		// Attach the groups deployment state to the deployment
   505  		a.deployment.TaskGroups[group] = dstate
   506  	}
   507  
   508  	// deploymentComplete is whether the deployment is complete, which largely
   509  	// means that no placements were made or desired to be made
   510  	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requireCanary
   511  
   512  	// As a final check on whether the deployment is complete, ensure everything
   513  	// is healthy
   514  	if deploymentComplete && a.deployment != nil {
   515  		if dstate, ok := a.deployment.TaskGroups[group]; ok {
   516  			if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
   517  				(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
   518  				deploymentComplete = false
   519  			}
   520  		}
   521  	}
   522  
   523  	return deploymentComplete
   524  }
   525  
   526  // filterOldTerminalAllocs filters allocations that should be ignored since they
   527  // are allocations that are terminal from a previous job version.
   528  func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
   529  	if !a.batch {
   530  		return all, nil
   531  	}
   532  
   533  	filtered = filtered.union(all)
   534  	ignored := make(map[string]*structs.Allocation)
   535  
   536  	// Ignore terminal batch jobs from older versions
   537  	for id, alloc := range filtered {
   538  		older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
   539  		if older && alloc.TerminalStatus() {
   540  			delete(filtered, id)
   541  			ignored[id] = alloc
   542  		}
   543  	}
   544  
   545  	return filtered, ignored
   546  }
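        // For example (hypothetical): for a batch job at version 3, a completed
        // allocation that ran at version 2 is moved to the ignore set rather than
        // being considered again for placement or rescheduling.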
   547  
   548  // handleGroupCanaries handles the canaries for the group by stopping the
   549  // unneeded ones and returning the current set of canaries and the updated total
   550  // set of allocs for the group
   551  func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
   552  	// Stop any canary from an older deployment or from a failed one
   553  	var stop []string
   554  
   555  	// Cancel any non-promoted canaries from the older deployment
   556  	if a.oldDeployment != nil {
   557  		for _, s := range a.oldDeployment.TaskGroups {
   558  			if !s.Promoted {
   559  				stop = append(stop, s.PlacedCanaries...)
   560  			}
   561  		}
   562  	}
   563  
   564  	// Cancel any non-promoted canaries from a failed deployment
   565  	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
   566  		for _, s := range a.deployment.TaskGroups {
   567  			if !s.Promoted {
   568  				stop = append(stop, s.PlacedCanaries...)
   569  			}
   570  		}
   571  	}
   572  
   573  	// stopSet is the allocSet that contains the canaries we desire to stop from
   574  	// above.
   575  	stopSet := all.fromKeys(stop)
   576  	a.markStop(stopSet, "", allocNotNeeded)
   577  	desiredChanges.Stop += uint64(len(stopSet))
   578  	all = all.difference(stopSet)
   579  
   580  	// Capture our current set of canaries and handle any migrations that are
   581  	// needed by just stopping them.
   582  	if a.deployment != nil {
   583  		var canaryIDs []string
   584  		for _, s := range a.deployment.TaskGroups {
   585  			canaryIDs = append(canaryIDs, s.PlacedCanaries...)
   586  		}
   587  
   588  		canaries = all.fromKeys(canaryIDs)
   589  		untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
   590  		a.markStop(migrate, "", allocMigrating)
   591  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   592  
   593  		canaries = untainted
   594  		all = all.difference(migrate, lost)
   595  	}
   596  
   597  	return canaries, all
   598  }
   599  
   600  // computeLimit returns the placement limit for a particular group. The inputs
   601  // are the group definition, the untainted, destructive, and migrate allocation
   602  // sets, and whether we are in a canary state.
   603  func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
   604  	// If there is no update strategy for the group, or nothing is being updated
   605  	// destructively or migrated, we can deploy as many as the group has
   606  	if group.Update == nil || len(destructive)+len(migrate) == 0 {
   607  		return group.Count
   608  	} else if a.deploymentPaused || a.deploymentFailed {
   609  		// If the deployment is paused or failed, do not create anything else
   610  		return 0
   611  	}
   612  
   613  	// If we have canaries and they have not been promoted the limit is 0
   614  	if canaryState {
   615  		return 0
   616  	}
   617  
   618  	// If we have been promoted or there are no canaries, the limit is the
   619  	// configured MaxParallel minus any outstanding non-healthy alloc for the
   620  	// deployment
   621  	limit := group.Update.MaxParallel
   622  	if a.deployment != nil {
   623  		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
   624  		for _, alloc := range partOf {
   625  			// An unhealthy allocation means nothing else should happen.
   626  			if alloc.DeploymentStatus.IsUnhealthy() {
   627  				return 0
   628  			}
   629  
   630  			if !alloc.DeploymentStatus.IsHealthy() {
   631  				limit--
   632  			}
   633  		}
   634  	}
   635  
   636  	// The limit can be less than zero in the case that the job was changed such
   637  	// that it required destructive changes and the count was scaled up.
   638  	if limit < 0 {
   639  		return 0
   640  	}
   641  
   642  	return limit
   643  }
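        // For example (hypothetical numbers): with MaxParallel of 2, destructive
        // updates pending, a promoted (or canary-free) deployment, and one
        // allocation from the current deployment still awaiting its health status,
        // the returned limit is 1.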
   644  
   645  // computePlacements returns the set of allocations to place given the group
   646  // definition and the sets of untainted, migrating and rescheduling allocations for the group.
   647  func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
   648  	nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {
   649  
   650  	// Add rescheduled placement results
   651  	var place []allocPlaceResult
   652  	for _, alloc := range reschedule {
   653  		place = append(place, allocPlaceResult{
   654  			name:          alloc.Name,
   655  			taskGroup:     group,
   656  			previousAlloc: alloc,
   657  			reschedule:    true,
   658  			canary:        alloc.DeploymentStatus.IsCanary(),
   659  		})
   660  	}
   661  
   662  	// Hot path the nothing to do case
   663  	existing := len(untainted) + len(migrate) + len(reschedule)
   664  	if existing >= group.Count {
   665  		return place
   666  	}
   667  
   668  	// Add remaining placement results
   669  	if existing < group.Count {
   670  		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
   671  			place = append(place, allocPlaceResult{
   672  				name:      name,
   673  				taskGroup: group,
   674  			})
   675  		}
   676  	}
   677  
   678  	return place
   679  }
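        // For example (hypothetical numbers): with a group count of 5, three
        // untainted allocations, one migrating allocation and nothing to
        // reschedule, a single additional placement is generated from the name
        // index.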
   680  
   681  // computeStop returns the set of allocations that are marked for stopping given
   682  // the group definition, the set of allocations in various states and whether we
   683  // are canarying.
   684  func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
   685  	untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {
   686  
   687  	// Mark all lost allocations for stop. Previous allocation doesn't matter
   688  	// here since it is on a lost node
   689  	var stop allocSet
   690  	stop = stop.union(lost)
   691  	a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   692  
   693  	// If we are still deploying or creating canaries, don't stop them
   694  	if canaryState {
   695  		untainted = untainted.difference(canaries)
   696  	}
   697  
   698  	// Hot path the nothing to do case
   699  	remove := len(untainted) + len(migrate) - group.Count
   700  	if remove <= 0 {
   701  		return stop
   702  	}
   703  
   704  	// Filter out any terminal allocations from the untainted set
   705  	// This is so that we don't try to mark them as stopped redundantly
   706  	untainted = filterByTerminal(untainted)
   707  
   708  	// Prefer stopping any alloc that has the same name as the canaries if we
   709  	// are promoted
   710  	if !canaryState && len(canaries) != 0 {
   711  		canaryNames := canaries.nameSet()
   712  		for id, alloc := range untainted.difference(canaries) {
   713  			if _, match := canaryNames[alloc.Name]; match {
   714  				stop[id] = alloc
   715  				a.result.stop = append(a.result.stop, allocStopResult{
   716  					alloc:             alloc,
   717  					statusDescription: allocNotNeeded,
   718  				})
   719  				delete(untainted, id)
   720  
   721  				remove--
   722  				if remove == 0 {
   723  					return stop
   724  				}
   725  			}
   726  		}
   727  	}
   728  
   729  	// Prefer selecting from the migrating set before stopping existing allocs
   730  	if len(migrate) != 0 {
   731  		mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
   732  		removeNames := mNames.Highest(uint(remove))
   733  		for id, alloc := range migrate {
   734  			if _, match := removeNames[alloc.Name]; !match {
   735  				continue
   736  			}
   737  			a.result.stop = append(a.result.stop, allocStopResult{
   738  				alloc:             alloc,
   739  				statusDescription: allocNotNeeded,
   740  			})
   741  			delete(migrate, id)
   742  			stop[id] = alloc
   743  			nameIndex.UnsetIndex(alloc.Index())
   744  
   745  			remove--
   746  			if remove == 0 {
   747  				return stop
   748  			}
   749  		}
   750  	}
   751  
   752  	// Select the allocs with the highest count to remove
   753  	removeNames := nameIndex.Highest(uint(remove))
   754  	for id, alloc := range untainted {
   755  		if _, ok := removeNames[alloc.Name]; ok {
   756  			stop[id] = alloc
   757  			a.result.stop = append(a.result.stop, allocStopResult{
   758  				alloc:             alloc,
   759  				statusDescription: allocNotNeeded,
   760  			})
   761  			delete(untainted, id)
   762  
   763  			remove--
   764  			if remove == 0 {
   765  				return stop
   766  			}
   767  		}
   768  	}
   769  
   770  	// It is possible that we didn't stop as many as we should have if there
   771  	// were allocations with duplicate names.
   772  	for id, alloc := range untainted {
   773  		stop[id] = alloc
   774  		a.result.stop = append(a.result.stop, allocStopResult{
   775  			alloc:             alloc,
   776  			statusDescription: allocNotNeeded,
   777  		})
   778  		delete(untainted, id)
   779  
   780  		remove--
   781  		if remove == 0 {
   782  			return stop
   783  		}
   784  	}
   785  
   786  	return stop
   787  }
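        // For example (hypothetical numbers): if the group count was lowered from 5
        // to 3 and nothing is migrating, lost or canarying, remove is 2 and the two
        // allocations with the highest name indexes are marked for stopping.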
   788  
   789  // computeUpdates determines which allocations for the passed group require
   790  // updates. Three groups are returned:
   791  // 1. Those that require no upgrades
   792  // 2. Those that can be upgraded in-place. These are added to the results
   793  // automatically since the function contains the correct state to do so.
   794  // 3. Those that require destructive updates
   795  func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
   796  	// Determine the set of allocations that need to be updated
   797  	ignore = make(map[string]*structs.Allocation)
   798  	inplace = make(map[string]*structs.Allocation)
   799  	destructive = make(map[string]*structs.Allocation)
   800  
   801  	for _, alloc := range untainted {
   802  		ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
   803  		if ignoreChange {
   804  			ignore[alloc.ID] = alloc
   805  		} else if destructiveChange {
   806  			destructive[alloc.ID] = alloc
   807  		} else {
   808  			inplace[alloc.ID] = alloc
   809  			a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
   810  		}
   811  	}
   812  
   813  	return
   814  }
   815  
   816  // handleDelayedReschedules creates batched follow-up evaluations with the WaitUntil field set
   817  // for allocations that are eligible to be rescheduled later
   818  func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
   819  	if len(rescheduleLater) == 0 {
   820  		return
   821  	}
   822  
   823  	// Sort by time
   824  	sort.Slice(rescheduleLater, func(i, j int) bool {
   825  		return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
   826  	})
   827  
   828  	var evals []*structs.Evaluation
   829  	nextReschedTime := rescheduleLater[0].rescheduleTime
   830  	allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))
   831  
   832  	// Create a new eval for the first batch
   833  	eval := &structs.Evaluation{
   834  		ID:                uuid.Generate(),
   835  		Namespace:         a.job.Namespace,
   836  		Priority:          a.job.Priority,
   837  		Type:              a.job.Type,
   838  		TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
   839  		JobID:             a.job.ID,
   840  		JobModifyIndex:    a.job.ModifyIndex,
   841  		Status:            structs.EvalStatusPending,
   842  		StatusDescription: reschedulingFollowupEvalDesc,
   843  		WaitUntil:         nextReschedTime,
   844  	}
   845  	evals = append(evals, eval)
   846  
   847  	for _, allocReschedInfo := range rescheduleLater {
   848  		if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
   849  			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
   850  		} else {
   851  			// Start a new batch
   852  			nextReschedTime = allocReschedInfo.rescheduleTime
   853  			// Create a new eval for the new batch
   854  			eval = &structs.Evaluation{
   855  				ID:             uuid.Generate(),
   856  				Namespace:      a.job.Namespace,
   857  				Priority:       a.job.Priority,
   858  				Type:           a.job.Type,
   859  				TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
   860  				JobID:          a.job.ID,
   861  				JobModifyIndex: a.job.ModifyIndex,
   862  				Status:         structs.EvalStatusPending,
   863  				WaitUntil:      nextReschedTime,
   864  			}
   865  			evals = append(evals, eval)
   866  			// Set the evalID for the first alloc in this new batch
   867  			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
   868  		}
   869  	}
   870  
   871  	a.result.desiredFollowupEvals[tgName] = evals
   872  
   873  	// Initialize the annotations
   874  	if len(allocIDToFollowupEvalID) != 0 && a.result.attributeUpdates == nil {
   875  		a.result.attributeUpdates = make(map[string]*structs.Allocation)
   876  	}
   877  
   878  	// Create in-place updates for every alloc ID that needs to be updated with its follow-up eval ID
   879  	for allocID, evalID := range allocIDToFollowupEvalID {
   880  		existingAlloc := all[allocID]
   881  		updatedAlloc := existingAlloc.Copy()
   882  		updatedAlloc.FollowupEvalID = evalID
   883  		a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
   884  	}
   885  }