github.com/anuvu/nomad@v0.8.7-atom1/scheduler/reconcile.go

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"time"
     7  
     8  	"sort"
     9  
    10  	"github.com/hashicorp/nomad/helper"
    11  	"github.com/hashicorp/nomad/helper/uuid"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
    15  const (
    16  	// batchedFailedAllocWindowSize is the window size used
    17  	// to batch up failed allocations before creating an eval
    18  	batchedFailedAllocWindowSize = 5 * time.Second
    19  
    20  	// rescheduleWindowSize is the window size relative to
    21  	// current time within which reschedulable allocations are placed.
    22  	// This helps protect against small clock drifts between servers
    23  	rescheduleWindowSize = 1 * time.Second
    24  )
    25  
    26  // allocUpdateType takes an existing allocation and a new job definition and
    27  // returns whether the allocation can ignore the change, requires a destructive
    28  // update, or can be inplace updated. If it can be inplace updated, an updated
    29  // allocation that has the new resources and alloc metrics attached will be
    30  // returned.
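        //
        // A minimal illustrative sketch of such a function, one that simply ignores
        // every change (not one of the real scheduler implementations), could look like:
        //
        //	func ignoreAll(existing *structs.Allocation, newJob *structs.Job,
        //		newTG *structs.TaskGroup) (bool, bool, *structs.Allocation) {
        //		return true, false, nil
        //	}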
    31  type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
    32  	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)
    33  
    34  // allocReconciler is used to determine the set of allocations that require
    35  // placement, inplace updating or stopping given the job specification and
    36  // existing cluster state. The reconciler should only be used for batch and
    37  // service jobs.
    38  type allocReconciler struct {
    39  	// logger is used to log debug information. Logging should be kept at a
    40  	// minimum here
    41  	logger *log.Logger
    42  
    43  	// allocUpdateFn is used to check if the allocation can be updated in place
    44  	allocUpdateFn allocUpdateType
    45  
    46  	// batch marks whether the job is a batch job
    47  	batch bool
    48  
    49  	// job is the job being operated on, it may be nil if the job is being
    50  	// stopped via a purge
    51  	job *structs.Job
    52  
    53  	// jobID is the ID of the job being operated on. The job may be nil if it is
    54  	// being stopped so we require this separately.
    55  	jobID string
    56  
    57  	// oldDeployment is the last deployment for the job
    58  	oldDeployment *structs.Deployment
    59  
    60  	// deployment is the current deployment for the job
    61  	deployment *structs.Deployment
    62  
    63  	// deploymentPaused marks whether the deployment is paused
    64  	deploymentPaused bool
    65  
    66  	// deploymentFailed marks whether the deployment is failed
    67  	deploymentFailed bool
    68  
    69  	// taintedNodes contains a map of nodes that are tainted
    70  	taintedNodes map[string]*structs.Node
    71  
    72  	// existingAllocs is the set of non-terminal existing allocations
    73  	existingAllocs []*structs.Allocation
    74  
    75  	// evalID is the ID of the evaluation that triggered the reconciler
    76  	evalID string
    77  
    78  	// now is the time used when determining rescheduling eligibility
    79  	// defaults to time.Now and is overridden in unit tests
    80  	now time.Time
    81  
    82  	// result holds the results of the reconciliation. During computation it can
    83  	// be used to store intermediate state
    84  	result *reconcileResults
    85  }
    86  
    87  // reconcileResults contains the results of the reconciliation and should be
    88  // applied by the scheduler.
    89  type reconcileResults struct {
    90  	// deployment is the deployment that should be created or updated as a
    91  	// result of scheduling
    92  	deployment *structs.Deployment
    93  
    94  	// deploymentUpdates contains a set of deployment updates that should be
    95  	// applied as a result of scheduling
    96  	deploymentUpdates []*structs.DeploymentStatusUpdate
    97  
    98  	// place is the set of allocations to place by the scheduler
    99  	place []allocPlaceResult
   100  
   101  	// destructiveUpdate is the set of allocations to apply a destructive update to
   102  	destructiveUpdate []allocDestructiveResult
   103  
   104  	// inplaceUpdate is the set of allocations to apply an inplace update to
   105  	inplaceUpdate []*structs.Allocation
   106  
   107  	// stop is the set of allocations to stop
   108  	stop []allocStopResult
   109  
   110  	// attributeUpdates are updates to the allocation that are not from a
   111  	// jobspec change.
   112  	attributeUpdates map[string]*structs.Allocation
   113  
   114  	// desiredTGUpdates captures the desired set of changes to make for each
   115  	// task group.
   116  	desiredTGUpdates map[string]*structs.DesiredUpdates
   117  
   118  	// desiredFollowupEvals is the map of follow up evaluations to create per task group
   119  	// This is used to create a delayed evaluation for rescheduling failed allocations.
   120  	desiredFollowupEvals map[string][]*structs.Evaluation
   121  }
   122  
   123  // delayedRescheduleInfo contains the allocation ID and the time at which it is eligible to be rescheduled.
   124  // This is used to create follow-up evaluations
   125  type delayedRescheduleInfo struct {
   126  
   127  	// allocID is the ID of the allocation eligible to be rescheduled
   128  	allocID string
   129  
   130  	// rescheduleTime is the time to use in the delayed evaluation
   131  	rescheduleTime time.Time
   132  }
   133  
   134  func (r *reconcileResults) GoString() string {
   135  	base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
   136  		len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))
   137  
   138  	if r.deployment != nil {
   139  		base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
   140  	}
   141  	for _, u := range r.deploymentUpdates {
   142  		base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
   143  			u.DeploymentID, u.Status, u.StatusDescription)
   144  	}
   145  	for tg, u := range r.desiredTGUpdates {
   146  		base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
   147  	}
   148  	return base
   149  }
   150  
   151  // Changes returns the number of total changes
   152  func (r *reconcileResults) Changes() int {
   153  	return len(r.place) + len(r.inplaceUpdate) + len(r.stop)
   154  }
   155  
   156  // NewAllocReconciler creates a new reconciler that should be used to determine
   157  // the changes required to bring the cluster state in line with the declared jobspec
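        //
        // A typical invocation from a scheduler might look roughly like the sketch
        // below, where logger, allocUpdateFn, job, deployment, allocs, taintedNodes
        // and eval are assumed to already be in scope:
        //
        //	r := NewAllocReconciler(logger, allocUpdateFn, job.Type == structs.JobTypeBatch,
        //		job.ID, job, deployment, allocs, taintedNodes, eval.ID)
        //	results := r.Compute()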
   158  func NewAllocReconciler(logger *log.Logger, allocUpdateFn allocUpdateType, batch bool,
   159  	jobID string, job *structs.Job, deployment *structs.Deployment,
   160  	existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
   161  	return &allocReconciler{
   162  		logger:         logger,
   163  		allocUpdateFn:  allocUpdateFn,
   164  		batch:          batch,
   165  		jobID:          jobID,
   166  		job:            job,
   167  		deployment:     deployment.Copy(),
   168  		existingAllocs: existingAllocs,
   169  		taintedNodes:   taintedNodes,
   170  		evalID:         evalID,
   171  		now:            time.Now(),
   172  		result: &reconcileResults{
   173  			desiredTGUpdates:     make(map[string]*structs.DesiredUpdates),
   174  			desiredFollowupEvals: make(map[string][]*structs.Evaluation),
   175  		},
   176  	}
   177  }
   178  
   179  // Compute reconciles the existing cluster state and returns the set of changes
   180  // required to converge the job spec and state
   181  func (a *allocReconciler) Compute() *reconcileResults {
   182  	// Create the allocation matrix
   183  	m := newAllocMatrix(a.job, a.existingAllocs)
   184  
   185  	// Handle stopping unneeded deployments
   186  	a.cancelDeployments()
   187  
   188  	// If we are just stopping a job we do not need to do anything more than
   189  	// stopping all running allocs
   190  	if a.job.Stopped() {
   191  		a.handleStop(m)
   192  		return a.result
   193  	}
   194  
   195  	// Detect if the deployment is paused
   196  	if a.deployment != nil {
   197  		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
   198  		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
   199  	}
   200  
   201  	// Reconcile each group
   202  	complete := true
   203  	for group, as := range m {
   204  		groupComplete := a.computeGroup(group, as)
   205  		complete = complete && groupComplete
   206  	}
   207  
   208  	// Mark the deployment as complete if possible
   209  	if a.deployment != nil && complete {
   210  		a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   211  			DeploymentID:      a.deployment.ID,
   212  			Status:            structs.DeploymentStatusSuccessful,
   213  			StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
   214  		})
   215  	}
   216  
   217  	// Set the description of a created deployment
   218  	if d := a.result.deployment; d != nil {
   219  		if d.RequiresPromotion() {
   220  			d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
   221  		}
   222  	}
   223  
   224  	return a.result
   225  }
   226  
   227  // cancelDeployments cancels any deployment that is not needed
   228  func (a *allocReconciler) cancelDeployments() {
   229  	// If the job is stopped and there is a non-terminal deployment, cancel it
   230  	if a.job.Stopped() {
   231  		if a.deployment != nil && a.deployment.Active() {
   232  			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   233  				DeploymentID:      a.deployment.ID,
   234  				Status:            structs.DeploymentStatusCancelled,
   235  				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
   236  			})
   237  		}
   238  
   239  		// Nothing else to do
   240  		a.oldDeployment = a.deployment
   241  		a.deployment = nil
   242  		return
   243  	}
   244  
   245  	d := a.deployment
   246  	if d == nil {
   247  		return
   248  	}
   249  
   250  	// Check if the deployment is active and referencing an older job and cancel it
   251  	if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
   252  		if d.Active() {
   253  			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   254  				DeploymentID:      a.deployment.ID,
   255  				Status:            structs.DeploymentStatusCancelled,
   256  				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
   257  			})
   258  		}
   259  
   260  		a.oldDeployment = d
   261  		a.deployment = nil
   262  	}
   263  
   264  	// Clear it as the current deployment if it is successful
   265  	if d.Status == structs.DeploymentStatusSuccessful {
   266  		a.oldDeployment = d
   267  		a.deployment = nil
   268  	}
   269  }
   270  
   271  // handleStop marks all allocations to be stopped, handling the lost case
   272  func (a *allocReconciler) handleStop(m allocMatrix) {
   273  	for group, as := range m {
   274  		as = filterByTerminal(as)
   275  		untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
   276  		a.markStop(untainted, "", allocNotNeeded)
   277  		a.markStop(migrate, "", allocNotNeeded)
   278  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   279  		desiredChanges := new(structs.DesiredUpdates)
   280  		desiredChanges.Stop = uint64(len(as))
   281  		a.result.desiredTGUpdates[group] = desiredChanges
   282  	}
   283  }
   284  
   285  // markStop is a helper for marking a set of allocations for stop with a
   286  // particular client status and description.
   287  func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
   288  	for _, alloc := range allocs {
   289  		a.result.stop = append(a.result.stop, allocStopResult{
   290  			alloc:             alloc,
   291  			clientStatus:      clientStatus,
   292  			statusDescription: statusDescription,
   293  		})
   294  	}
   295  }
   296  
   297  // computeGroup reconciles state for a particular task group. It returns whether
   298  // the deployment it is for is complete with regard to the task group.
   299  func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
   300  	// Create the desired update object for the group
   301  	desiredChanges := new(structs.DesiredUpdates)
   302  	a.result.desiredTGUpdates[group] = desiredChanges
   303  
   304  	// Get the task group. The task group may be nil if the job was updated such
   305  	// that the task group no longer exists
   306  	tg := a.job.LookupTaskGroup(group)
   307  
   308  	// If the task group is nil, then the task group has been removed so all we
   309  	// need to do is stop everything
   310  	if tg == nil {
   311  		untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
   312  		a.markStop(untainted, "", allocNotNeeded)
   313  		a.markStop(migrate, "", allocNotNeeded)
   314  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   315  		desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
   316  		return true
   317  	}
   318  
   319  	// Get the deployment state for the group
   320  	var dstate *structs.DeploymentState
   321  	existingDeployment := false
   322  	if a.deployment != nil {
   323  		dstate, existingDeployment = a.deployment.TaskGroups[group]
   324  	}
   325  	if !existingDeployment {
   326  		dstate = &structs.DeploymentState{}
   327  		if tg.Update != nil {
   328  			dstate.AutoRevert = tg.Update.AutoRevert
   329  			dstate.ProgressDeadline = tg.Update.ProgressDeadline
   330  		}
   331  	}
   332  
   333  	// Filter allocations that do not need to be considered because they are
   334  	// from an older job version and are terminal.
   335  	all, ignore := a.filterOldTerminalAllocs(all)
   336  	desiredChanges.Ignore += uint64(len(ignore))
   337  
   338  	// canaries is the set of canaries for the current deployment and all is all
   339  	// allocs including the canaries
   340  	canaries, all := a.handleGroupCanaries(all, desiredChanges)
   341  
   342  	// Determine what set of allocations are on tainted nodes
   343  	untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
   344  
   345  	// Determine what set of terminal allocations need to be rescheduled
   346  	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID, a.deployment)
   347  
   348  	// Create batched follow up evaluations for allocations that are
   349  	// reschedulable later and mark the allocations for in place updating
   350  	a.handleDelayedReschedules(rescheduleLater, all, tg.Name)
   351  
   352  	// Create a structure for choosing names. Seed with the taken names which is
   353  	// the union of the untainted, migrating and reschedule-now allocations (includes canaries)
   354  	nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))
   355  
   356  	// Stop any unneeded allocations and update the untainted set to not
   357  	// included stopped allocations.
   358  	canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
   359  	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
   360  	desiredChanges.Stop += uint64(len(stop))
   361  	untainted = untainted.difference(stop)
   362  
   363  	// Do inplace upgrades where possible and capture the set of upgrades that
   364  	// need to be done destructively.
   365  	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
   366  	desiredChanges.Ignore += uint64(len(ignore))
   367  	desiredChanges.InPlaceUpdate += uint64(len(inplace))
   368  	if !existingDeployment {
   369  		dstate.DesiredTotal += len(destructive) + len(inplace)
   370  	}
   371  
   372  	// Remove the canaries now that we have handled rescheduling so that we do
   373  	// not consider them when making placement decisions.
   374  	if canaryState {
   375  		untainted = untainted.difference(canaries)
   376  	}
   377  
   378  	// The fact that we have destructive updates and fewer canaries than are
   379  	// desired means we need to create canaries
   380  	numDestructive := len(destructive)
   381  	strategy := tg.Update
   382  	canariesPromoted := dstate != nil && dstate.Promoted
   383  	requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
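        	// For example, with strategy.Canary = 2, one existing canary and pending
        	// destructive updates, requireCanary is true and a single extra canary
        	// placement is created below (provided the deployment is not paused or failed).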
   384  	if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
   385  		number := strategy.Canary - len(canaries)
   386  		desiredChanges.Canary += uint64(number)
   387  		if !existingDeployment {
   388  			dstate.DesiredCanaries = strategy.Canary
   389  		}
   390  
   391  		for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
   392  			a.result.place = append(a.result.place, allocPlaceResult{
   393  				name:      name,
   394  				canary:    true,
   395  				taskGroup: tg,
   396  			})
   397  		}
   398  	}
   399  
   400  	// Determine how many we can place
   401  	canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
   402  	limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)
   403  
   404  	// Place if:
   405  	// * The deployment is not paused or failed
   406  	// * Not placing any canaries
   407  	// * Any canaries that exist have been promoted
   408  	place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
   409  	if !existingDeployment {
   410  		dstate.DesiredTotal += len(place)
   411  	}
   412  
   413  	// deploymentPlaceReady tracks whether the deployment is in a state where
   414  	// placements can be made without any other consideration.
   415  	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState
   416  
   417  	if deploymentPlaceReady {
   418  		desiredChanges.Place += uint64(len(place))
   419  		for _, p := range place {
   420  			a.result.place = append(a.result.place, p)
   421  		}
   422  
   423  		min := helper.IntMin(len(place), limit)
   424  		limit -= min
   425  	} else {
   426  		// We do not want to place additional allocations, but when we have lost
   427  		// allocations or allocations that require rescheduling now, we place them
   428  		// regardless to avoid odd user experiences.
   429  		if len(lost) != 0 {
   430  			allowed := helper.IntMin(len(lost), len(place))
   431  			desiredChanges.Place += uint64(allowed)
   432  			for _, p := range place[:allowed] {
   433  				a.result.place = append(a.result.place, p)
   434  			}
   435  		}
   436  
   437  		// Handle rescheduling of failed allocations even if the deployment is
   438  		// failed. We do not reschedule if the allocation is part of the failed
   439  		// deployment.
   440  		if now := len(rescheduleNow); now != 0 {
   441  			for _, p := range place {
   442  				prev := p.PreviousAllocation()
   443  				if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
   444  					a.result.place = append(a.result.place, p)
   445  					desiredChanges.Place++
   446  				}
   447  			}
   448  		}
   449  	}
   450  
   451  	if deploymentPlaceReady {
   452  		// Do all destructive updates
   453  		min := helper.IntMin(len(destructive), limit)
   454  		desiredChanges.DestructiveUpdate += uint64(min)
   455  		desiredChanges.Ignore += uint64(len(destructive) - min)
   456  		for _, alloc := range destructive.nameOrder()[:min] {
   457  			a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
   458  				placeName:             alloc.Name,
   459  				placeTaskGroup:        tg,
   460  				stopAlloc:             alloc,
   461  				stopStatusDescription: allocUpdating,
   462  			})
   463  		}
   464  	} else {
   465  		desiredChanges.Ignore += uint64(len(destructive))
   466  	}
   467  
   468  	// Migrate all the allocations
   469  	desiredChanges.Migrate += uint64(len(migrate))
   470  	for _, alloc := range migrate.nameOrder() {
   471  		a.result.stop = append(a.result.stop, allocStopResult{
   472  			alloc:             alloc,
   473  			statusDescription: allocMigrating,
   474  		})
   475  		a.result.place = append(a.result.place, allocPlaceResult{
   476  			name:          alloc.Name,
   477  			canary:        false,
   478  			taskGroup:     tg,
   479  			previousAlloc: alloc,
   480  		})
   481  	}
   482  
   483  	// Create new deployment if:
   484  	// 1. Updating a job specification
   485  	// 2. No running allocations (first time running a job)
   486  	updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
   487  	hadRunning := false
   488  	for _, alloc := range all {
   489  		if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex {
   490  			hadRunning = true
   491  			break
   492  		}
   493  	}
   494  
   495  	// Create a new deployment if necessary
   496  	if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
   497  		// A previous group may have made the deployment already
   498  		if a.deployment == nil {
   499  			a.deployment = structs.NewDeployment(a.job)
   500  			a.result.deployment = a.deployment
   501  		}
   502  
   503  		// Attach the groups deployment state to the deployment
   504  		a.deployment.TaskGroups[group] = dstate
   505  	}
   506  
   507  	// deploymentComplete is whether the deployment is complete, which largely
   508  	// means that no placements were made or desired to be made
   509  	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requireCanary
   510  
   511  	// The final check for whether the deployment is complete is to ensure that
   512  	// everything is healthy
   513  	if deploymentComplete && a.deployment != nil {
   514  		if dstate, ok := a.deployment.TaskGroups[group]; ok {
   515  			if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
   516  				(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
   517  				deploymentComplete = false
   518  			}
   519  		}
   520  	}
   521  
   522  	return deploymentComplete
   523  }
   524  
   525  // filterOldTerminalAllocs filters out allocations that should be ignored because
   526  // they are terminal allocations from a previous job version.
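        // For example, for a batch job this drops a completed allocation left over from
        // job version 1 when reconciling version 2, so it is neither stopped nor replaced.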
   527  func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
   528  	if !a.batch {
   529  		return all, nil
   530  	}
   531  
   532  	filtered = filtered.union(all)
   533  	ignored := make(map[string]*structs.Allocation)
   534  
   535  	// Ignore terminal batch jobs from older versions
   536  	for id, alloc := range filtered {
   537  		older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
   538  		if older && alloc.TerminalStatus() {
   539  			delete(filtered, id)
   540  			ignored[id] = alloc
   541  		}
   542  	}
   543  
   544  	return filtered, ignored
   545  }
   546  
   547  // handleGroupCanaries handles the canaries for the group by stopping the
   548  // unneeded ones and returning the current set of canaries and the updated total
   549  // set of allocs for the group
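        // For instance, non-promoted canaries left behind by an older or failed
        // deployment are marked for stop here and removed from the returned set, so
        // they do not count toward the group's total in the placement math that follows.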
   550  func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
   551  	// Stop any canary from an older deployment or from a failed one
   552  	var stop []string
   553  
   554  	// Cancel any non-promoted canaries from the older deployment
   555  	if a.oldDeployment != nil {
   556  		for _, s := range a.oldDeployment.TaskGroups {
   557  			if !s.Promoted {
   558  				stop = append(stop, s.PlacedCanaries...)
   559  			}
   560  		}
   561  	}
   562  
   563  	// Cancel any non-promoted canaries from a failed deployment
   564  	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
   565  		for _, s := range a.deployment.TaskGroups {
   566  			if !s.Promoted {
   567  				stop = append(stop, s.PlacedCanaries...)
   568  			}
   569  		}
   570  	}
   571  
   572  	// stopSet is the allocSet that contains the canaries we desire to stop from
   573  	// above.
   574  	stopSet := all.fromKeys(stop)
   575  	a.markStop(stopSet, "", allocNotNeeded)
   576  	desiredChanges.Stop += uint64(len(stopSet))
   577  	all = all.difference(stopSet)
   578  
   579  	// Capture our current set of canaries and handle any migrations that are
   580  	// needed by just stopping them.
   581  	if a.deployment != nil {
   582  		var canaryIDs []string
   583  		for _, s := range a.deployment.TaskGroups {
   584  			canaryIDs = append(canaryIDs, s.PlacedCanaries...)
   585  		}
   586  
   587  		canaries = all.fromKeys(canaryIDs)
   588  		untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
   589  		a.markStop(migrate, "", allocMigrating)
   590  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   591  
   592  		canaries = untainted
   593  		all = all.difference(migrate, lost)
   594  	}
   595  
   596  	return canaries, all
   597  }
   598  
   599  // computeLimit returns the placement limit for a particular group. The inputs
   600  // are the group definition, the untainted, destructive, and migrate allocation
   601  // set and whether we are in a canary state.
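        //
        // As a rough worked example, assuming the deployment is neither paused nor
        // failed and no unpromoted canaries are outstanding: with MaxParallel = 2 and
        // one allocation in the current deployment that is placed but not yet reported
        // healthy, the returned limit is 1; if any allocation in the deployment is
        // unhealthy, the limit is 0.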
   602  func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
   603  	// If there is no update strategy or deployment for the group we can deploy
   604  	// as many as the group has
   605  	if group.Update == nil || len(destructive)+len(migrate) == 0 {
   606  		return group.Count
   607  	} else if a.deploymentPaused || a.deploymentFailed {
   608  		// If the deployment is paused or failed, do not create anything else
   609  		return 0
   610  	}
   611  
   612  	// If we have canaries and they have not been promoted the limit is 0
   613  	if canaryState {
   614  		return 0
   615  	}
   616  
   617  	// If we have been promoted or there are no canaries, the limit is the
   618  	// configured MaxParallel minus any outstanding non-healthy alloc for the
   619  	// deployment
   620  	limit := group.Update.MaxParallel
   621  	if a.deployment != nil {
   622  		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
   623  		for _, alloc := range partOf {
   624  			// An unhealthy allocation means nothing else should happen.
   625  			if alloc.DeploymentStatus.IsUnhealthy() {
   626  				return 0
   627  			}
   628  
   629  			if !alloc.DeploymentStatus.IsHealthy() {
   630  				limit--
   631  			}
   632  		}
   633  	}
   634  
   635  	// The limit can be less than zero in the case that the job was changed such
   636  	// that it required destructive changes and the count was scaled up.
   637  	if limit < 0 {
   638  		return 0
   639  	}
   640  
   641  	return limit
   642  }
   643  
   644  // computePlacements returns the set of allocations to place given the group
   645  // definition, the set of untainted, migrating and reschedule allocations for the group.
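        //
        // For instance, with group.Count = 5, three untainted allocations, none
        // migrating and one reschedulable allocation, the rescheduled allocation is
        // placed first and one further placement is drawn from the name index
        // (3 + 0 + 1 = 4 existing, one short of the desired count).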
   646  func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
   647  	nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {
   648  
   649  	// Add rescheduled placement results
   650  	var place []allocPlaceResult
   651  	for _, alloc := range reschedule {
   652  		place = append(place, allocPlaceResult{
   653  			name:          alloc.Name,
   654  			taskGroup:     group,
   655  			previousAlloc: alloc,
   656  			reschedule:    true,
   657  			canary:        alloc.DeploymentStatus.IsCanary(),
   658  		})
   659  	}
   660  
   661  	// Hot path the nothing to do case
   662  	existing := len(untainted) + len(migrate) + len(reschedule)
   663  	if existing >= group.Count {
   664  		return place
   665  	}
   666  
   667  	// Add remaining placement results
   668  	if existing < group.Count {
   669  		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
   670  			place = append(place, allocPlaceResult{
   671  				name:      name,
   672  				taskGroup: group,
   673  			})
   674  		}
   675  	}
   676  
   677  	return place
   678  }
   679  
   680  // computeStop returns the set of allocations that are marked for stopping given
   681  // the group definition, the set of allocations in various states and whether we
   682  // are canarying.
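        //
        // As a sketch of the selection order: if group.Count is 3 and there are four
        // untainted plus one migrating allocation, two must be stopped; duplicates of
        // canary names are preferred first, then migrating allocations, then the
        // allocations with the highest name indexes.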
   683  func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
   684  	untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {
   685  
   686  	// Mark all lost allocations for stop. Previous allocation doesn't matter
   687  	// here since it is on a lost node
   688  	var stop allocSet
   689  	stop = stop.union(lost)
   690  	a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   691  
   692  	// If we are still deploying or creating canaries, don't stop them
   693  	if canaryState {
   694  		untainted = untainted.difference(canaries)
   695  	}
   696  
   697  	// Hot path the nothing to do case
   698  	remove := len(untainted) + len(migrate) - group.Count
   699  	if remove <= 0 {
   700  		return stop
   701  	}
   702  
   703  	// Filter out any terminal allocations from the untainted set
   704  	// This is so that we don't try to mark them as stopped redundantly
   705  	untainted = filterByTerminal(untainted)
   706  
   707  	// Prefer stopping any alloc that has the same name as the canaries if we
   708  	// are promoted
   709  	if !canaryState && len(canaries) != 0 {
   710  		canaryNames := canaries.nameSet()
   711  		for id, alloc := range untainted.difference(canaries) {
   712  			if _, match := canaryNames[alloc.Name]; match {
   713  				stop[id] = alloc
   714  				a.result.stop = append(a.result.stop, allocStopResult{
   715  					alloc:             alloc,
   716  					statusDescription: allocNotNeeded,
   717  				})
   718  				delete(untainted, id)
   719  
   720  				remove--
   721  				if remove == 0 {
   722  					return stop
   723  				}
   724  			}
   725  		}
   726  	}
   727  
   728  	// Prefer selecting from the migrating set before stopping existing allocs
   729  	if len(migrate) != 0 {
   730  		mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
   731  		removeNames := mNames.Highest(uint(remove))
   732  		for id, alloc := range migrate {
   733  			if _, match := removeNames[alloc.Name]; !match {
   734  				continue
   735  			}
   736  			a.result.stop = append(a.result.stop, allocStopResult{
   737  				alloc:             alloc,
   738  				statusDescription: allocNotNeeded,
   739  			})
   740  			delete(migrate, id)
   741  			stop[id] = alloc
   742  			nameIndex.UnsetIndex(alloc.Index())
   743  
   744  			remove--
   745  			if remove == 0 {
   746  				return stop
   747  			}
   748  		}
   749  	}
   750  
   751  	// Select the allocs with the highest count to remove
   752  	removeNames := nameIndex.Highest(uint(remove))
   753  	for id, alloc := range untainted {
   754  		if _, ok := removeNames[alloc.Name]; ok {
   755  			stop[id] = alloc
   756  			a.result.stop = append(a.result.stop, allocStopResult{
   757  				alloc:             alloc,
   758  				statusDescription: allocNotNeeded,
   759  			})
   760  			delete(untainted, id)
   761  
   762  			remove--
   763  			if remove == 0 {
   764  				return stop
   765  			}
   766  		}
   767  	}
   768  
   769  	// It is possible that we didn't stop as many as we should have if there
   770  	// were allocations with duplicate names.
   771  	for id, alloc := range untainted {
   772  		stop[id] = alloc
   773  		a.result.stop = append(a.result.stop, allocStopResult{
   774  			alloc:             alloc,
   775  			statusDescription: allocNotNeeded,
   776  		})
   777  		delete(untainted, id)
   778  
   779  		remove--
   780  		if remove == 0 {
   781  			return stop
   782  		}
   783  	}
   784  
   785  	return stop
   786  }
   787  
   788  // computeUpdates determines which allocations for the passed group require
   789  // updates. Three groups are returned:
   790  // 1. Those that require no upgrades
   791  // 2. Those that can be upgraded in-place. These are added to the results
   792  // automatically since the function contains the correct state to do so.
   793  // 3. Those that require destructive updates
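        //
        // Which set an allocation lands in depends on the allocUpdateFn supplied to
        // the reconciler; typically a count-only change leaves existing allocations
        // in the ignore set, while a change to a task's configuration forces a
        // destructive update.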
   794  func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
   795  	// Determine the set of allocations that need to be updated
   796  	ignore = make(map[string]*structs.Allocation)
   797  	inplace = make(map[string]*structs.Allocation)
   798  	destructive = make(map[string]*structs.Allocation)
   799  
   800  	for _, alloc := range untainted {
   801  		ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
   802  		if ignoreChange {
   803  			ignore[alloc.ID] = alloc
   804  		} else if destructiveChange {
   805  			destructive[alloc.ID] = alloc
   806  		} else {
   807  			inplace[alloc.ID] = alloc
   808  			a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
   809  		}
   810  	}
   811  
   812  	return
   813  }
   814  
   815  // handleDelayedReschedules creates batched followup evaluations with the WaitUntil field set
   816  // for allocations that are eligible to be rescheduled later
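        //
        // For example, with the 5 second batching window, reschedule times of
        // 10:00:01, 10:00:03 and 10:00:09 produce two follow-up evaluations: one with
        // WaitUntil 10:00:01 covering the first two allocations, and one with
        // WaitUntil 10:00:09 covering the third.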
   817  func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
   818  	if len(rescheduleLater) == 0 {
   819  		return
   820  	}
   821  
   822  	// Sort by time
   823  	sort.Slice(rescheduleLater, func(i, j int) bool {
   824  		return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
   825  	})
   826  
   827  	var evals []*structs.Evaluation
   828  	nextReschedTime := rescheduleLater[0].rescheduleTime
   829  	allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))
   830  
   831  	// Create a new eval for the first batch
   832  	eval := &structs.Evaluation{
   833  		ID:                uuid.Generate(),
   834  		Namespace:         a.job.Namespace,
   835  		Priority:          a.job.Priority,
   836  		Type:              a.job.Type,
   837  		TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
   838  		JobID:             a.job.ID,
   839  		JobModifyIndex:    a.job.ModifyIndex,
   840  		Status:            structs.EvalStatusPending,
   841  		StatusDescription: reschedulingFollowupEvalDesc,
   842  		WaitUntil:         nextReschedTime,
   843  	}
   844  	evals = append(evals, eval)
   845  
   846  	for _, allocReschedInfo := range rescheduleLater {
   847  		if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
   848  			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
   849  		} else {
   850  			// Start a new batch
   851  			nextReschedTime = allocReschedInfo.rescheduleTime
   852  			// Create a new eval for the new batch
   853  			eval = &structs.Evaluation{
   854  				ID:             uuid.Generate(),
   855  				Namespace:      a.job.Namespace,
   856  				Priority:       a.job.Priority,
   857  				Type:           a.job.Type,
   858  				TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
   859  				JobID:          a.job.ID,
   860  				JobModifyIndex: a.job.ModifyIndex,
   861  				Status:         structs.EvalStatusPending,
   862  				WaitUntil:      nextReschedTime,
   863  			}
   864  			evals = append(evals, eval)
   865  			// Set the evalID for the first alloc in this new batch
   866  			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
   867  		}
   868  	}
   869  
   870  	a.result.desiredFollowupEvals[tgName] = evals
   871  
   872  	// Initialize the annotations
   873  	if len(allocIDToFollowupEvalID) != 0 && a.result.attributeUpdates == nil {
   874  		a.result.attributeUpdates = make(map[string]*structs.Allocation)
   875  	}
   876  
   877  	// Create in-place updates for every alloc ID that needs to be updated with its follow up eval ID
   878  	for allocID, evalID := range allocIDToFollowupEvalID {
   879  		existingAlloc := all[allocID]
   880  		updatedAlloc := existingAlloc.Copy()
   881  		updatedAlloc.FollowupEvalID = evalID
   882  		a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
   883  	}
   884  }