github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/scheduler/reconcile.go

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"time"
     7  
     8  	"sort"
     9  
    10  	"github.com/hashicorp/nomad/helper"
    11  	"github.com/hashicorp/nomad/helper/uuid"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
    15  const (
    16  	// batchedFailedAllocWindowSize is the window size used
    17  	// to batch up failed allocations before creating an eval
    18  	batchedFailedAllocWindowSize = 5 * time.Second
    19  
    20  	// rescheduleWindowSize is the window size relative to
    21  	// current time within which reschedulable allocations are placed.
    22  	// This helps protect against small clock drifts between servers
    23  	rescheduleWindowSize = 1 * time.Second
    24  )
    25  
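        // As a worked example of the batching window: with reschedule times of t,
        // t+2s, and t+7s, the first follow-up evaluation gets WaitUntil=t and covers
        // both t and t+2s (each within batchedFailedAllocWindowSize of t), while
        // t+7s starts a second evaluation with WaitUntil=t+7s. See
        // handleDelayedReschedules below.
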
    26  // allocUpdateType takes an existing allocation and a new job definition and
    27  // returns whether the allocation can ignore the change, requires a destructive
    28  // update, or can be inplace updated. If it can be inplace updated, an updated
    29  // allocation that has the new resources and alloc metrics attached will be
    30  // returned.
    31  type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
    32  	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)
    33  
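        // A minimal sketch of an allocUpdateType implementation (illustrative only;
        // the real update functions are supplied by the schedulers) that treats every
        // change as destructive:
        //
        //	func allocUpdateFnDestructive(existing *structs.Allocation, newJob *structs.Job,
        //		newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
        //		return false, true, nil
        //	}
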
    34  // allocReconciler is used to determine the set of allocations that require
    35  // placement, inplace updating or stopping given the job specification and
    36  // existing cluster state. The reconciler should only be used for batch and
    37  // service jobs.
    38  type allocReconciler struct {
    39  	// logger is used to log debug information. Logging should be kept to a
    40  	// minimum here
    41  	logger *log.Logger
    42  
    43  	// allocUpdateFn is used to check if the allocation can be updated in place
    44  	allocUpdateFn allocUpdateType
    45  
    46  	// batch marks whether the job is a batch job
    47  	batch bool
    48  
    49  	// job is the job being operated on; it may be nil if the job is being
    50  	// stopped via a purge
    51  	job *structs.Job
    52  
    53  	// jobID is the ID of the job being operated on. The job may be nil if it is
    54  	// being stopped, so we require this separately.
    55  	jobID string
    56  
    57  	// oldDeployment is the last deployment for the job
    58  	oldDeployment *structs.Deployment
    59  
    60  	// deployment is the current deployment for the job
    61  	deployment *structs.Deployment
    62  
    63  	// deploymentPaused marks whether the deployment is paused
    64  	deploymentPaused bool
    65  
    66  	// deploymentFailed marks whether the deployment is failed
    67  	deploymentFailed bool
    68  
    69  	// taintedNodes contains a map of nodes that are tainted
    70  	taintedNodes map[string]*structs.Node
    71  
    72  	// existingAllocs is the set of non-terminal existing allocations
    73  	existingAllocs []*structs.Allocation
    74  
    75  	// evalID is the ID of the evaluation that triggered the reconciler
    76  	evalID string
    77  
    78  	// now is the time used when determining rescheduling eligibility.
    79  	// It defaults to time.Now and is overridden in unit tests
    80  	now time.Time
    81  
    82  	// result holds the results of the reconciliation. During computation it can be
    83  	// used to store intermediate state
    84  	result *reconcileResults
    85  }
    86  
    87  // reconcileResults contains the results of the reconciliation and should be
    88  // applied by the scheduler.
    89  type reconcileResults struct {
    90  	// deployment is the deployment that should be created or updated as a
    91  	// result of scheduling
    92  	deployment *structs.Deployment
    93  
    94  	// deploymentUpdates contains a set of deployment updates that should be
    95  	// applied as a result of scheduling
    96  	deploymentUpdates []*structs.DeploymentStatusUpdate
    97  
    98  	// place is the set of allocations to place by the scheduler
    99  	place []allocPlaceResult
   100  
   101  	// destructiveUpdate is the set of allocations to apply a destructive update to
   102  	destructiveUpdate []allocDestructiveResult
   103  
   104  	// inplaceUpdate is the set of allocations to apply an inplace update to
   105  	inplaceUpdate []*structs.Allocation
   106  
   107  	// stop is the set of allocations to stop
   108  	stop []allocStopResult
   109  
   110  	// attributeUpdates are updates to the allocation that are not from a
   111  	// jobspec change.
   112  	attributeUpdates map[string]*structs.Allocation
   113  
   114  	// desiredTGUpdates captures the desired set of changes to make for each
   115  	// task group.
   116  	desiredTGUpdates map[string]*structs.DesiredUpdates
   117  
   118  	// desiredFollowupEvals is the map of follow-up evaluations to create per task group.
   119  	// This is used to create a delayed evaluation for rescheduling failed allocations.
   120  	desiredFollowupEvals map[string][]*structs.Evaluation
   121  }
   122  
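        // As a rough sketch of how these results are consumed (by the generic and
        // batch schedulers, outside this file): placements and destructive updates
        // become allocations in the plan, stops become allocation updates with a
        // stopped desired status, deployment creations and updates are attached to
        // the plan, and desiredFollowupEvals are submitted as new evaluations with
        // WaitUntil set.
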
   123  // delayedRescheduleInfo contains the allocation ID and a time when it is eligible to be rescheduled.
   124  // This is used to create follow-up evaluations.
   125  type delayedRescheduleInfo struct {
   126  
   127  	// allocID is the ID of the allocation eligible to be rescheduled
   128  	allocID string
   129  
   130  	// rescheduleTime is the time to use in the delayed evaluation
   131  	rescheduleTime time.Time
   132  }
   133  
   134  func (r *reconcileResults) GoString() string {
   135  	base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
   136  		len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))
   137  
   138  	if r.deployment != nil {
   139  		base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
   140  	}
   141  	for _, u := range r.deploymentUpdates {
   142  		base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
   143  			u.DeploymentID, u.Status, u.StatusDescription)
   144  	}
   145  	for tg, u := range r.desiredTGUpdates {
   146  		base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
   147  	}
   148  	return base
   149  }
   150  
   151  // Changes returns the total number of changes
   152  func (r *reconcileResults) Changes() int {
   153  	return len(r.place) + len(r.inplaceUpdate) + len(r.stop)
   154  }
   155  
   156  // NewAllocReconciler creates a new reconciler that should be used to determine
   157  // the changes required to bring the cluster state in line with the declared jobspec
   158  func NewAllocReconciler(logger *log.Logger, allocUpdateFn allocUpdateType, batch bool,
   159  	jobID string, job *structs.Job, deployment *structs.Deployment,
   160  	existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
   161  	return &allocReconciler{
   162  		logger:         logger,
   163  		allocUpdateFn:  allocUpdateFn,
   164  		batch:          batch,
   165  		jobID:          jobID,
   166  		job:            job,
   167  		deployment:     deployment.Copy(),
   168  		existingAllocs: existingAllocs,
   169  		taintedNodes:   taintedNodes,
   170  		evalID:         evalID,
   171  		now:            time.Now(),
   172  		result: &reconcileResults{
   173  			desiredTGUpdates:     make(map[string]*structs.DesiredUpdates),
   174  			desiredFollowupEvals: make(map[string][]*structs.Evaluation),
   175  		},
   176  	}
   177  }
   178  
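        // A hedged usage sketch; logger, updateFn, job, deployment, allocs,
        // taintedNodes, and eval are assumed to be in scope, and updateFn is any
        // allocUpdateType:
        //
        //	reconciler := NewAllocReconciler(logger, updateFn, false, job.ID, job,
        //		deployment, allocs, taintedNodes, eval.ID)
        //	results := reconciler.Compute()
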
   179  // Compute reconciles the existing cluster state and returns the set of changes
   180  // required to converge the job spec and state
   181  func (a *allocReconciler) Compute() *reconcileResults {
   182  	// Create the allocation matrix
   183  	m := newAllocMatrix(a.job, a.existingAllocs)
   184  
   185  	// Handle stopping unneeded deployments
   186  	a.cancelDeployments()
   187  
   188  	// If we are just stopping a job we do not need to do anything more than
   189  	// stopping all running allocs
   190  	if a.job.Stopped() {
   191  		a.handleStop(m)
   192  		return a.result
   193  	}
   194  
   195  	// Detect if the deployment is paused
   196  	if a.deployment != nil {
   197  		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
   198  		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
   199  	}
   200  
   201  	// Reconcile each group
   202  	complete := true
   203  	for group, as := range m {
   204  		groupComplete := a.computeGroup(group, as)
   205  		complete = complete && groupComplete
   206  	}
   207  
   208  	// Mark the deployment as complete if possible
   209  	if a.deployment != nil && complete {
   210  		a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   211  			DeploymentID:      a.deployment.ID,
   212  			Status:            structs.DeploymentStatusSuccessful,
   213  			StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
   214  		})
   215  	}
   216  
   217  	// Set the description of a created deployment
   218  	if d := a.result.deployment; d != nil {
   219  		if d.RequiresPromotion() {
   220  			d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
   221  		}
   222  	}
   223  
   224  	return a.result
   225  }
   226  
   227  // cancelDeployments cancels any deployment that is not needed
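        // For example, an active deployment created for job version 3 is cancelled
        // when the reconciler runs for version 4 of the same job.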
   228  func (a *allocReconciler) cancelDeployments() {
   229  	// If the job is stopped and there is a non-terminal deployment, cancel it
   230  	if a.job.Stopped() {
   231  		if a.deployment != nil && a.deployment.Active() {
   232  			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   233  				DeploymentID:      a.deployment.ID,
   234  				Status:            structs.DeploymentStatusCancelled,
   235  				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
   236  			})
   237  		}
   238  
   239  		// Nothing else to do
   240  		a.oldDeployment = a.deployment
   241  		a.deployment = nil
   242  		return
   243  	}
   244  
   245  	d := a.deployment
   246  	if d == nil {
   247  		return
   248  	}
   249  
   250  	// Check if the deployment is active and referencing an older job and cancel it
   251  	if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
   252  		if d.Active() {
   253  			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   254  				DeploymentID:      a.deployment.ID,
   255  				Status:            structs.DeploymentStatusCancelled,
   256  				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
   257  			})
   258  		}
   259  
   260  		a.oldDeployment = d
   261  		a.deployment = nil
   262  	}
   263  
   264  	// Clear it as the current deployment if it is successful
   265  	if d.Status == structs.DeploymentStatusSuccessful {
   266  		a.oldDeployment = d
   267  		a.deployment = nil
   268  	}
   269  }
   270  
   271  // handleStop marks all allocations to be stopped, handling the lost case
   272  func (a *allocReconciler) handleStop(m allocMatrix) {
   273  	for group, as := range m {
   274  		untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
   275  		a.markStop(untainted, "", allocNotNeeded)
   276  		a.markStop(migrate, "", allocNotNeeded)
   277  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   278  		desiredChanges := new(structs.DesiredUpdates)
   279  		desiredChanges.Stop = uint64(len(as))
   280  		a.result.desiredTGUpdates[group] = desiredChanges
   281  	}
   282  }
   283  
   284  // markStop is a helper for marking a set of allocations for stop with a
   285  // particular client status and description.
   286  func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
   287  	for _, alloc := range allocs {
   288  		a.result.stop = append(a.result.stop, allocStopResult{
   289  			alloc:             alloc,
   290  			clientStatus:      clientStatus,
   291  			statusDescription: statusDescription,
   292  		})
   293  	}
   294  }
   295  
   296  // computeGroup reconciles state for a particular task group. It returns whether
   297  // the deployment it is for is complete with regard to the task group.
   298  func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
   299  	// Create the desired update object for the group
   300  	desiredChanges := new(structs.DesiredUpdates)
   301  	a.result.desiredTGUpdates[group] = desiredChanges
   302  
   303  	// Get the task group. The task group may be nil if the job was updated such
   304  	// that the task group no longer exists
   305  	tg := a.job.LookupTaskGroup(group)
   306  
   307  	// If the task group is nil, then the task group has been removed so all we
   308  	// need to do is stop everything
   309  	if tg == nil {
   310  		untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
   311  		a.markStop(untainted, "", allocNotNeeded)
   312  		a.markStop(migrate, "", allocNotNeeded)
   313  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   314  		desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
   315  		return true
   316  	}
   317  
   318  	// Get the deployment state for the group
   319  	var dstate *structs.DeploymentState
   320  	existingDeployment := false
   321  	if a.deployment != nil {
   322  		dstate, existingDeployment = a.deployment.TaskGroups[group]
   323  	}
   324  	if !existingDeployment {
   325  		dstate = &structs.DeploymentState{}
   326  		if tg.Update != nil {
   327  			dstate.AutoRevert = tg.Update.AutoRevert
   328  			dstate.ProgressDeadline = tg.Update.ProgressDeadline
   329  		}
   330  	}
   331  
   332  	// Filter allocations that do not need to be considered because they are
   333  	// from an older job version and are terminal.
   334  	all, ignore := a.filterOldTerminalAllocs(all)
   335  	desiredChanges.Ignore += uint64(len(ignore))
   336  
   337  	// canaries is the set of canaries for the current deployment, and all is
   338  	// the set of all allocs including the canaries
   339  	canaries, all := a.handleGroupCanaries(all, desiredChanges)
   340  
   341  	// Determine what set of allocations are on tainted nodes
   342  	untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
   343  
   344  	// Determine what set of terminal allocations need to be rescheduled
   345  	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID, a.deployment)
   346  
   347  	// Create batched follow up evaluations for allocations that are
   348  	// reschedulable later and mark the allocations for in place updating
   349  	a.handleDelayedReschedules(rescheduleLater, all, tg.Name)
   350  
   351  	// Create a structure for choosing names. Seed with the taken names, which are
   352  	// the union of untainted, migrating, and rescheduling allocations (includes canaries)
   353  	nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))
   354  
   355  	// Stop any unneeded allocations and update the untainted set to not
   356  	// include stopped allocations.
   357  	canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
   358  	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
   359  	desiredChanges.Stop += uint64(len(stop))
   360  	untainted = untainted.difference(stop)
   361  
   362  	// Do inplace upgrades where possible and capture the set of upgrades that
   363  	// need to be done destructively.
   364  	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
   365  	desiredChanges.Ignore += uint64(len(ignore))
   366  	desiredChanges.InPlaceUpdate += uint64(len(inplace))
   367  	if !existingDeployment {
   368  		dstate.DesiredTotal += len(destructive) + len(inplace)
   369  	}
   370  
   371  	// Remove the canaries now that we have handled rescheduling so that we do
   372  	// not consider them when making placement decisions.
   373  	if canaryState {
   374  		untainted = untainted.difference(canaries)
   375  	}
   376  
   377  	// The fact that we have destructive updates and fewer canaries than
   378  	// desired means we need to create canaries
   379  	numDestructive := len(destructive)
   380  	strategy := tg.Update
   381  	canariesPromoted := dstate != nil && dstate.Promoted
   382  	requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
   383  	if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
   384  		number := strategy.Canary - len(canaries)
   385  		desiredChanges.Canary += uint64(number)
   386  		if !existingDeployment {
   387  			dstate.DesiredCanaries = strategy.Canary
   388  		}
   389  
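        		// For example, with an update strategy of Canary=3 and one existing
        		// canary, number is 2 and two additional canary placements are created.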
   390  		for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
   391  			a.result.place = append(a.result.place, allocPlaceResult{
   392  				name:      name,
   393  				canary:    true,
   394  				taskGroup: tg,
   395  			})
   396  		}
   397  	}
   398  
   399  	// Determine how many we can place
   400  	canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
   401  	limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)
   402  
   403  	// Place if:
   404  	// * The deployment is not paused or failed
   405  	// * Not placing any canaries
   406  	// * If there are any canaries, they have been promoted
   407  	place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
   408  	if !existingDeployment {
   409  		dstate.DesiredTotal += len(place)
   410  	}
   411  
   412  	// deploymentPlaceReady tracks whether the deployment is in a state where
   413  	// placements can be made without any other consideration.
   414  	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState
   415  
   416  	if deploymentPlaceReady {
   417  		desiredChanges.Place += uint64(len(place))
   418  		for _, p := range place {
   419  			a.result.place = append(a.result.place, p)
   420  		}
   421  
   422  		min := helper.IntMin(len(place), limit)
   423  		limit -= min
   424  	} else if !deploymentPlaceReady {
   425  		// We do not want to place additional allocations but in the case we
   426  		// have lost allocations or allocations that require rescheduling now,
   427  		// we do so regardless to avoid odd user experiences.
   428  		if len(lost) != 0 {
   429  			allowed := helper.IntMin(len(lost), len(place))
   430  			desiredChanges.Place += uint64(allowed)
   431  			for _, p := range place[:allowed] {
   432  				a.result.place = append(a.result.place, p)
   433  			}
   434  		}
   435  
   436  		// Handle rescheduling of failed allocations even if the deployment is
   437  		// failed. We do not reschedule if the allocation is part of the failed
   438  		// deployment.
   439  		if now := len(rescheduleNow); now != 0 {
   440  			for _, p := range place {
   441  				prev := p.PreviousAllocation()
   442  				if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
   443  					a.result.place = append(a.result.place, p)
   444  					desiredChanges.Place++
   445  				}
   446  			}
   447  		}
   448  	}
   449  
   450  	if deploymentPlaceReady {
   451  		// Do all destructive updates
   452  		min := helper.IntMin(len(destructive), limit)
   453  		desiredChanges.DestructiveUpdate += uint64(min)
   454  		desiredChanges.Ignore += uint64(len(destructive) - min)
   455  		for _, alloc := range destructive.nameOrder()[:min] {
   456  			a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
   457  				placeName:             alloc.Name,
   458  				placeTaskGroup:        tg,
   459  				stopAlloc:             alloc,
   460  				stopStatusDescription: allocUpdating,
   461  			})
   462  		}
   463  	} else {
   464  		desiredChanges.Ignore += uint64(len(destructive))
   465  	}
   466  
   467  	// Calculate the allowed number of changes and set the desired changes
   468  	// accordingly.
   469  	if !a.deploymentFailed && !a.deploymentPaused {
   470  		desiredChanges.Migrate += uint64(len(migrate))
   471  	} else {
   472  		desiredChanges.Stop += uint64(len(migrate))
   473  	}
   474  
   475  	for _, alloc := range migrate.nameOrder() {
   476  		// If the deployment is failed or paused, don't replace it, just mark as stop.
   477  		if a.deploymentFailed || a.deploymentPaused {
   478  			a.result.stop = append(a.result.stop, allocStopResult{
   479  				alloc:             alloc,
   480  				statusDescription: allocNodeTainted,
   481  			})
   482  			continue
   483  		}
   484  
   485  		a.result.stop = append(a.result.stop, allocStopResult{
   486  			alloc:             alloc,
   487  			statusDescription: allocMigrating,
   488  		})
   489  		a.result.place = append(a.result.place, allocPlaceResult{
   490  			name:          alloc.Name,
   491  			canary:        false,
   492  			taskGroup:     tg,
   493  			previousAlloc: alloc,
   494  		})
   495  	}
   496  
   497  	// Create new deployment if:
   498  	// 1. Updating a job specification
   499  	// 2. No running allocations (first time running a job)
   500  	updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
   501  	hadRunning := false
   502  	for _, alloc := range all {
   503  		if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex {
   504  			hadRunning = true
   505  			break
   506  		}
   507  	}
   508  
   509  	// Create a new deployment if necessary
   510  	if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
   511  		// A previous group may have made the deployment already
   512  		if a.deployment == nil {
   513  			a.deployment = structs.NewDeployment(a.job)
   514  			a.result.deployment = a.deployment
   515  		}
   516  
   517  		// Attach the group's deployment state to the deployment
   518  		a.deployment.TaskGroups[group] = dstate
   519  	}
   520  
   521  	// deploymentComplete is whether the deployment is complete, which largely
   522  	// means that no placements were made or desired to be made
   523  	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requireCanary
   524  
   525  	// The final check to see if the deployment is complete is to ensure everything
   526  	// is healthy
   527  	if deploymentComplete && a.deployment != nil {
   528  		if dstate, ok := a.deployment.TaskGroups[group]; ok {
   529  			if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
   530  				(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
   531  				deploymentComplete = false
   532  			}
   533  		}
   534  	}
   535  
   536  	return deploymentComplete
   537  }
   538  
   539  // filterOldTerminalAllocs filters allocations that should be ignored since they
   540  // are allocations that are terminal from a previous job version.
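        // For example, for a batch job at version 2, a completed allocation from
        // version 1 is moved to the ignore set rather than being replaced.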
   541  func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
   542  	if !a.batch {
   543  		return all, nil
   544  	}
   545  
   546  	filtered = filtered.union(all)
   547  	ignored := make(map[string]*structs.Allocation)
   548  
   549  	// Ignore terminal batch jobs from older versions
   550  	for id, alloc := range filtered {
   551  		older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
   552  		if older && alloc.TerminalStatus() {
   553  			delete(filtered, id)
   554  			ignored[id] = alloc
   555  		}
   556  	}
   557  
   558  	return filtered, ignored
   559  }
   560  
   561  // handleGroupCanaries handles the canaries for the group by stopping the
   562  // unneeded ones and returning the current set of canaries and the updated total
   563  // set of allocs for the group
   564  func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
   565  	// Stop any canary from an older deployment or from a failed one
   566  	var stop []string
   567  
   568  	// Cancel any non-promoted canaries from the older deployment
   569  	if a.oldDeployment != nil {
   570  		for _, s := range a.oldDeployment.TaskGroups {
   571  			if !s.Promoted {
   572  				stop = append(stop, s.PlacedCanaries...)
   573  			}
   574  		}
   575  	}
   576  
   577  	// Cancel any non-promoted canaries from a failed deployment
   578  	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
   579  		for _, s := range a.deployment.TaskGroups {
   580  			if !s.Promoted {
   581  				stop = append(stop, s.PlacedCanaries...)
   582  			}
   583  		}
   584  	}
   585  
   586  	// stopSet is the allocSet that contains the canaries we want to stop,
   587  	// as determined above.
   588  	stopSet := all.fromKeys(stop)
   589  	a.markStop(stopSet, "", allocNotNeeded)
   590  	desiredChanges.Stop += uint64(len(stopSet))
   591  	all = all.difference(stopSet)
   592  
   593  	// Capture our current set of canaries and handle any migrations that are
   594  	// needed by just stopping them.
   595  	if a.deployment != nil {
   596  		var canaryIDs []string
   597  		for _, s := range a.deployment.TaskGroups {
   598  			canaryIDs = append(canaryIDs, s.PlacedCanaries...)
   599  		}
   600  
   601  		canaries = all.fromKeys(canaryIDs)
   602  		untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
   603  		a.markStop(migrate, "", allocMigrating)
   604  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   605  
   606  		canaries = untainted
   607  		all = all.difference(migrate, lost)
   608  	}
   609  
   610  	return canaries, all
   611  }
   612  
   613  // computeLimit returns the placement limit for a particular group. The inputs
   614  // are the group definition, the untainted, destructive, and migrate allocation
   615  // sets, and whether we are in a canary state.
   616  func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
   617  	// If there is no update strategy, or there is nothing to update or
   618  	// migrate, we can place as many as the group has
   619  	if group.Update == nil || len(destructive)+len(migrate) == 0 {
   620  		return group.Count
   621  	} else if a.deploymentPaused || a.deploymentFailed {
   622  		// If the deployment is paused or failed, do not create anything else
   623  		return 0
   624  	}
   625  
   626  	// If we have canaries and they have not been promoted the limit is 0
   627  	if canaryState {
   628  		return 0
   629  	}
   630  
   631  	// If we have been promoted or there are no canaries, the limit is the
   632  	// configured MaxParallel minus any outstanding non-healthy alloc for the
   633  	// deployment
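        	// For example, with MaxParallel=2 and one allocation in the current
        	// deployment whose health has not yet been determined, the limit is 1.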
   634  	limit := group.Update.MaxParallel
   635  	if a.deployment != nil {
   636  		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
   637  		for _, alloc := range partOf {
   638  			// An unhealthy allocation means nothing else should happen.
   639  			if alloc.DeploymentStatus.IsUnhealthy() {
   640  				return 0
   641  			}
   642  
   643  			if !alloc.DeploymentStatus.IsHealthy() {
   644  				limit--
   645  			}
   646  		}
   647  	}
   648  
   649  	// The limit can be less than zero in the case that the job was changed such
   650  	// that it required destructive changes and the count was scaled up.
   651  	if limit < 0 {
   652  		return 0
   653  	}
   654  
   655  	return limit
   656  }
   657  
   658  // computePlacements returns the set of allocations to place given the group
   659  // definition and the sets of untainted, migrating, and rescheduling allocations for the group.
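        // For example, with a group count of 5, three untainted allocations, no
        // migrations, and one rescheduling allocation, one placement is created for
        // the rescheduled allocation and one more for the remaining unused name.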
   660  func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
   661  	nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {
   662  
   663  	// Add rescheduled placement results
   664  	var place []allocPlaceResult
   665  	for _, alloc := range reschedule {
   666  		place = append(place, allocPlaceResult{
   667  			name:          alloc.Name,
   668  			taskGroup:     group,
   669  			previousAlloc: alloc,
   670  			reschedule:    true,
   671  			canary:        alloc.DeploymentStatus.IsCanary(),
   672  		})
   673  	}
   674  
   675  	// Hot path the nothing to do case
   676  	existing := len(untainted) + len(migrate) + len(reschedule)
   677  	if existing >= group.Count {
   678  		return place
   679  	}
   680  
   681  	// Add remaining placement results
   682  	if existing < group.Count {
   683  		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
   684  			place = append(place, allocPlaceResult{
   685  				name:      name,
   686  				taskGroup: group,
   687  			})
   688  		}
   689  	}
   690  
   691  	return place
   692  }
   693  
   694  // computeStop returns the set of allocations that are marked for stopping given
   695  // the group definition, the set of allocations in various states and whether we
   696  // are canarying.
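        // For example, with a group count of 3 and five untainted allocations on
        // healthy nodes, the two allocations with the highest name indexes are
        // selected to stop.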
   697  func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
   698  	untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {
   699  
   700  	// Mark all lost allocations for stop. Previous allocation doesn't matter
   701  	// here since it is on a lost node
   702  	var stop allocSet
   703  	stop = stop.union(lost)
   704  	a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   705  
   706  	// If we are still deploying or creating canaries, don't stop them
   707  	if canaryState {
   708  		untainted = untainted.difference(canaries)
   709  	}
   710  
   711  	// Hot path the nothing to do case
   712  	remove := len(untainted) + len(migrate) - group.Count
   713  	if remove <= 0 {
   714  		return stop
   715  	}
   716  
   717  	// Filter out any terminal allocations from the untainted set
   718  	// This is so that we don't try to mark them as stopped redundantly
   719  	untainted = filterByTerminal(untainted)
   720  
   721  	// Prefer stopping any alloc that has the same name as the canaries if we
   722  	// are promoted
   723  	if !canaryState && len(canaries) != 0 {
   724  		canaryNames := canaries.nameSet()
   725  		for id, alloc := range untainted.difference(canaries) {
   726  			if _, match := canaryNames[alloc.Name]; match {
   727  				stop[id] = alloc
   728  				a.result.stop = append(a.result.stop, allocStopResult{
   729  					alloc:             alloc,
   730  					statusDescription: allocNotNeeded,
   731  				})
   732  				delete(untainted, id)
   733  
   734  				remove--
   735  				if remove == 0 {
   736  					return stop
   737  				}
   738  			}
   739  		}
   740  	}
   741  
   742  	// Prefer selecting from the migrating set before stopping existing allocs
   743  	if len(migrate) != 0 {
   744  		mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
   745  		removeNames := mNames.Highest(uint(remove))
   746  		for id, alloc := range migrate {
   747  			if _, match := removeNames[alloc.Name]; !match {
   748  				continue
   749  			}
   750  			a.result.stop = append(a.result.stop, allocStopResult{
   751  				alloc:             alloc,
   752  				statusDescription: allocNotNeeded,
   753  			})
   754  			delete(migrate, id)
   755  			stop[id] = alloc
   756  			nameIndex.UnsetIndex(alloc.Index())
   757  
   758  			remove--
   759  			if remove == 0 {
   760  				return stop
   761  			}
   762  		}
   763  	}
   764  
   765  	// Select the allocs with the highest name index to remove
   766  	removeNames := nameIndex.Highest(uint(remove))
   767  	for id, alloc := range untainted {
   768  		if _, ok := removeNames[alloc.Name]; ok {
   769  			stop[id] = alloc
   770  			a.result.stop = append(a.result.stop, allocStopResult{
   771  				alloc:             alloc,
   772  				statusDescription: allocNotNeeded,
   773  			})
   774  			delete(untainted, id)
   775  
   776  			remove--
   777  			if remove == 0 {
   778  				return stop
   779  			}
   780  		}
   781  	}
   782  
   783  	// It is possible that we didn't stop as many as we should have if there
   784  	// were allocations with duplicate names.
   785  	for id, alloc := range untainted {
   786  		stop[id] = alloc
   787  		a.result.stop = append(a.result.stop, allocStopResult{
   788  			alloc:             alloc,
   789  			statusDescription: allocNotNeeded,
   790  		})
   791  		delete(untainted, id)
   792  
   793  		remove--
   794  		if remove == 0 {
   795  			return stop
   796  		}
   797  	}
   798  
   799  	return stop
   800  }
   801  
   802  // computeUpdates determines which allocations for the passed group require
   803  // updates. Three groups are returned:
   804  // 1. Those that require no upgrades
   805  // 2. Those that can be upgraded in-place. These are added to the results
   806  // automatically since the function contains the correct state to do so.
   807  // 3. Those that require destructive updates
   808  func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
   809  	// Determine the set of allocations that need to be updated
   810  	ignore = make(map[string]*structs.Allocation)
   811  	inplace = make(map[string]*structs.Allocation)
   812  	destructive = make(map[string]*structs.Allocation)
   813  
   814  	for _, alloc := range untainted {
   815  		ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
   816  		if ignoreChange {
   817  			ignore[alloc.ID] = alloc
   818  		} else if destructiveChange {
   819  			destructive[alloc.ID] = alloc
   820  		} else {
   821  			inplace[alloc.ID] = alloc
   822  			a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
   823  		}
   824  	}
   825  
   826  	return
   827  }
   828  
   829  // handleDelayedReschedules creates batched follow-up evaluations with the WaitUntil field set
   830  // for allocations that are eligible to be rescheduled later
   831  func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
   832  	if len(rescheduleLater) == 0 {
   833  		return
   834  	}
   835  
   836  	// Sort by time
   837  	sort.Slice(rescheduleLater, func(i, j int) bool {
   838  		return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
   839  	})
   840  
   841  	var evals []*structs.Evaluation
   842  	nextReschedTime := rescheduleLater[0].rescheduleTime
   843  	allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))
   844  
   845  	// Create a new eval for the first batch
   846  	eval := &structs.Evaluation{
   847  		ID:                uuid.Generate(),
   848  		Namespace:         a.job.Namespace,
   849  		Priority:          a.job.Priority,
   850  		Type:              a.job.Type,
   851  		TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
   852  		JobID:             a.job.ID,
   853  		JobModifyIndex:    a.job.ModifyIndex,
   854  		Status:            structs.EvalStatusPending,
   855  		StatusDescription: reschedulingFollowupEvalDesc,
   856  		WaitUntil:         nextReschedTime,
   857  	}
   858  	evals = append(evals, eval)
   859  
   860  	for _, allocReschedInfo := range rescheduleLater {
   861  		if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
   862  			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
   863  		} else {
   864  			// Start a new batch
   865  			nextReschedTime = allocReschedInfo.rescheduleTime
   866  			// Create a new eval for the new batch
   867  			eval = &structs.Evaluation{
   868  				ID:             uuid.Generate(),
   869  				Namespace:      a.job.Namespace,
   870  				Priority:       a.job.Priority,
   871  				Type:           a.job.Type,
   872  				TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
   873  				JobID:          a.job.ID,
   874  				JobModifyIndex: a.job.ModifyIndex,
   875  				Status:         structs.EvalStatusPending,
   876  				WaitUntil:      nextReschedTime,
   877  			}
   878  			evals = append(evals, eval)
   879  			// Set the evalID for the first alloc in this new batch
   880  			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
   881  		}
   882  	}
   883  
   884  	a.result.desiredFollowupEvals[tgName] = evals
   885  
   886  	// Initialize the annotations
   887  	if len(allocIDToFollowupEvalID) != 0 && a.result.attributeUpdates == nil {
   888  		a.result.attributeUpdates = make(map[string]*structs.Allocation)
   889  	}
   890  
   891  	// Create in-place updates for every alloc ID that needs to be updated with its follow up eval ID
   892  	for allocID, evalID := range allocIDToFollowupEvalID {
   893  		existingAlloc := all[allocID]
   894  		updatedAlloc := existingAlloc.Copy()
   895  		updatedAlloc.FollowupEvalID = evalID
   896  		a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
   897  	}
   898  }