github.com/emate/nomad@v0.8.2-wo-binpacking/scheduler/reconcile.go

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"time"
     7  
     8  	"sort"
     9  
    10  	"github.com/hashicorp/nomad/helper"
    11  	"github.com/hashicorp/nomad/helper/uuid"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
    15  const (
    16  	// batchedFailedAllocWindowSize is the window size used
    17  	// to batch up failed allocations before creating an eval
    18  	batchedFailedAllocWindowSize = 5 * time.Second
    19  
    20  	// rescheduleWindowSize is the window size relative to
    21  	// current time within which reschedulable allocations are placed.
    22  	// This helps protect against small clock drifts between servers
    23  	rescheduleWindowSize = 1 * time.Second
    24  )
    25  
    26  // allocUpdateType takes an existing allocation and a new job definition and
    27  // returns whether the allocation can ignore the change, requires a destructive
    28  // update, or can be inplace updated. If it can be inplace updated, an updated
    29  // allocation that has the new resources and alloc metrics attached will be
    30  // returned.
    31  type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
    32  	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)
    33  
    34  // allocReconciler is used to determine the set of allocations that require
    35  // placement, inplace updating or stopping given the job specification and
    36  // existing cluster state. The reconciler should only be used for batch and
    37  // service jobs.
    38  type allocReconciler struct {
    39  	// logger is used to log debug information. Logging should be kept to a
    40  	// minimum here
    41  	logger *log.Logger
    42  
    43  	// allocUpdateFn is used to check if the allocation can be inplace updated
    44  	allocUpdateFn allocUpdateType
    45  
    46  	// batch marks whether the job is a batch job
    47  	batch bool
    48  
    49  	// job is the job being operated on; it may be nil if the job is being
    50  	// stopped via a purge
    51  	job *structs.Job
    52  
    53  	// jobID is the ID of the job being operated on. The job may be nil if it is
    54  	// being stopped so we require this separately.
    55  	jobID string
    56  
    57  	// oldDeployment is the last deployment for the job
    58  	oldDeployment *structs.Deployment
    59  
    60  	// deployment is the current deployment for the job
    61  	deployment *structs.Deployment
    62  
    63  	// deploymentPaused marks whether the deployment is paused
    64  	deploymentPaused bool
    65  
    66  	// deploymentFailed marks whether the deployment is failed
    67  	deploymentFailed bool
    68  
    69  	// taintedNodes contains a map of nodes that are tainted
    70  	taintedNodes map[string]*structs.Node
    71  
    72  	// existingAllocs is the set of non-terminal existing allocations
    73  	existingAllocs []*structs.Allocation
    74  
    75  	// evalID is the ID of the evaluation that triggered the reconciler
    76  	evalID string
    77  
    78  	// now is the time used when determining rescheduling eligibility.
    79  	// It defaults to time.Now and is overridden in unit tests
    80  	now time.Time
    81  
    82  	// result holds the results of the reconciliation. During computation it
    83  	// can be used to store intermediate state
    84  	result *reconcileResults
    85  }
    86  
    87  // reconcileResults contains the results of the reconciliation and should be
    88  // applied by the scheduler.
    89  type reconcileResults struct {
    90  	// deployment is the deployment that should be created or updated as a
    91  	// result of scheduling
    92  	deployment *structs.Deployment
    93  
    94  	// deploymentUpdates contains a set of deployment updates that should be
    95  	// applied as a result of scheduling
    96  	deploymentUpdates []*structs.DeploymentStatusUpdate
    97  
    98  	// place is the set of allocations to place by the scheduler
    99  	place []allocPlaceResult
   100  
   101  	// destructiveUpdate is the set of allocations to apply a destructive update to
   102  	destructiveUpdate []allocDestructiveResult
   103  
   104  	// inplaceUpdate is the set of allocations to apply an inplace update to
   105  	inplaceUpdate []*structs.Allocation
   106  
   107  	// stop is the set of allocations to stop
   108  	stop []allocStopResult
   109  
   110  	// attributeUpdates are updates to the allocation that are not from a
   111  	// jobspec change.
   112  	attributeUpdates map[string]*structs.Allocation
   113  
   114  	// desiredTGUpdates captures the desired set of changes to make for each
   115  	// task group.
   116  	desiredTGUpdates map[string]*structs.DesiredUpdates
   117  
   118  	// desiredFollowupEvals is the map of follow-up evaluations to create per task group.
   119  	// This is used to create delayed evaluations for rescheduling failed allocations.
   120  	desiredFollowupEvals map[string][]*structs.Evaluation
   121  }
   122  
   123  // delayedRescheduleInfo contains the allocation ID and the time when it is eligible to be rescheduled.
   124  // This is used to create follow-up evaluations.
   125  type delayedRescheduleInfo struct {
   126  
   127  	// allocID is the ID of the allocation eligible to be rescheduled
   128  	allocID string
   129  
   130  	// rescheduleTime is the time to use in the delayed evaluation
   131  	rescheduleTime time.Time
   132  }
   133  
   134  func (r *reconcileResults) GoString() string {
   135  	base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
   136  		len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))
   137  
   138  	if r.deployment != nil {
   139  		base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
   140  	}
   141  	for _, u := range r.deploymentUpdates {
   142  		base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
   143  			u.DeploymentID, u.Status, u.StatusDescription)
   144  	}
   145  	for tg, u := range r.desiredTGUpdates {
   146  		base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
   147  	}
   148  	return base
   149  }
   150  
   151  // Changes returns the total number of changes
   152  func (r *reconcileResults) Changes() int {
   153  	return len(r.place) + len(r.inplaceUpdate) + len(r.stop)
   154  }
   155  
   156  // NewAllocReconciler creates a new reconciler that should be used to determine
   157  // the changes required to bring the cluster state in line with the declared jobspec
   158  func NewAllocReconciler(logger *log.Logger, allocUpdateFn allocUpdateType, batch bool,
   159  	jobID string, job *structs.Job, deployment *structs.Deployment,
   160  	existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
   161  	return &allocReconciler{
   162  		logger:         logger,
   163  		allocUpdateFn:  allocUpdateFn,
   164  		batch:          batch,
   165  		jobID:          jobID,
   166  		job:            job,
   167  		deployment:     deployment.Copy(),
   168  		existingAllocs: existingAllocs,
   169  		taintedNodes:   taintedNodes,
   170  		evalID:         evalID,
   171  		now:            time.Now(),
   172  		result: &reconcileResults{
   173  			desiredTGUpdates:     make(map[string]*structs.DesiredUpdates),
   174  			desiredFollowupEvals: make(map[string][]*structs.Evaluation),
   175  		},
   176  	}
   177  }
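        // An illustrative sketch of how a scheduler might drive the reconciler; the
        // surrounding values (logger, allocUpdateFn, job, deployment, allocs,
        // taintedNodes, eval) are assumed for this example rather than taken from
        // this file:
        //
        //	r := NewAllocReconciler(logger, allocUpdateFn, job.Type == structs.JobTypeBatch,
        //		job.ID, job, deployment, allocs, taintedNodes, eval.ID)
        //	results := r.Compute()
        //	logger.Printf("[DEBUG] sched: %s", results.GoString())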
   178  
   179  // Compute reconciles the existing cluster state and returns the set of changes
   180  // required to converge the job spec and state
   181  func (a *allocReconciler) Compute() *reconcileResults {
   182  	// Create the allocation matrix
   183  	m := newAllocMatrix(a.job, a.existingAllocs)
   184  
   185  	// Handle stopping unneeded deployments
   186  	a.cancelDeployments()
   187  
   188  	// If we are just stopping a job we do not need to do anything more than
   189  	// stopping all running allocs
   190  	if a.job.Stopped() {
   191  		a.handleStop(m)
   192  		return a.result
   193  	}
   194  
   195  	// Detect if the deployment is paused or failed
   196  	if a.deployment != nil {
   197  		// Detect if any allocs associated with this deploy have failed
   198  		// Failed allocations could edge trigger an evaluation before the deployment watcher
   199  		// runs and marks the deploy as failed. This block makes sure that is still
   200  		// considered a failed deploy
   201  		failedAllocsInDeploy := false
   202  		for _, as := range m {
   203  			for _, alloc := range as {
   204  				if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
   205  					failedAllocsInDeploy = true
   206  				}
   207  			}
   208  		}
   209  		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
   210  		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
   211  	}
   212  
   213  	// Reconcile each group
   214  	complete := true
   215  	for group, as := range m {
   216  		groupComplete := a.computeGroup(group, as)
   217  		complete = complete && groupComplete
   218  	}
   219  
   220  	// Mark the deployment as complete if possible
   221  	if a.deployment != nil && complete {
   222  		a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   223  			DeploymentID:      a.deployment.ID,
   224  			Status:            structs.DeploymentStatusSuccessful,
   225  			StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
   226  		})
   227  	}
   228  
   229  	// Set the description of a created deployment
   230  	if d := a.result.deployment; d != nil {
   231  		if d.RequiresPromotion() {
   232  			d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
   233  		}
   234  	}
   235  
   236  	return a.result
   237  }
   238  
   239  // cancelDeployments cancels any deployment that is not needed
   240  func (a *allocReconciler) cancelDeployments() {
   241  	// If the job is stopped and there is a non-terminal deployment, cancel it
   242  	if a.job.Stopped() {
   243  		if a.deployment != nil && a.deployment.Active() {
   244  			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   245  				DeploymentID:      a.deployment.ID,
   246  				Status:            structs.DeploymentStatusCancelled,
   247  				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
   248  			})
   249  		}
   250  
   251  		// Nothing else to do
   252  		a.oldDeployment = a.deployment
   253  		a.deployment = nil
   254  		return
   255  	}
   256  
   257  	d := a.deployment
   258  	if d == nil {
   259  		return
   260  	}
   261  
   262  	// If the deployment is active and references an older job, cancel it
   263  	if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
   264  		if d.Active() {
   265  			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
   266  				DeploymentID:      a.deployment.ID,
   267  				Status:            structs.DeploymentStatusCancelled,
   268  				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
   269  			})
   270  		}
   271  
   272  		a.oldDeployment = d
   273  		a.deployment = nil
   274  	}
   275  
   276  	// Clear it as the current deployment if it is successful
   277  	if d.Status == structs.DeploymentStatusSuccessful {
   278  		a.oldDeployment = d
   279  		a.deployment = nil
   280  	}
   281  }
   282  
   283  // handleStop marks all allocations to be stopped, handling the lost case
   284  func (a *allocReconciler) handleStop(m allocMatrix) {
   285  	for group, as := range m {
   286  		untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
   287  		a.markStop(untainted, "", allocNotNeeded)
   288  		a.markStop(migrate, "", allocNotNeeded)
   289  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   290  		desiredChanges := new(structs.DesiredUpdates)
   291  		desiredChanges.Stop = uint64(len(as))
   292  		a.result.desiredTGUpdates[group] = desiredChanges
   293  	}
   294  }
   295  
   296  // markStop is a helper for marking a set of allocations for stop with a
   297  // particular client status and description.
   298  func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
   299  	for _, alloc := range allocs {
   300  		a.result.stop = append(a.result.stop, allocStopResult{
   301  			alloc:             alloc,
   302  			clientStatus:      clientStatus,
   303  			statusDescription: statusDescription,
   304  		})
   305  	}
   306  }
   307  
   308  // computeGroup reconciles state for a particular task group. It returns whether
   309  // the deployment is complete with regard to the task group.
   310  func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
   311  	// Create the desired update object for the group
   312  	desiredChanges := new(structs.DesiredUpdates)
   313  	a.result.desiredTGUpdates[group] = desiredChanges
   314  
   315  	// Get the task group. The task group may be nil if the job was updated such
   316  	// that the task group no longer exists
   317  	tg := a.job.LookupTaskGroup(group)
   318  
   319  	// If the task group is nil, then the task group has been removed so all we
   320  	// need to do is stop everything
   321  	if tg == nil {
   322  		untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
   323  		a.markStop(untainted, "", allocNotNeeded)
   324  		a.markStop(migrate, "", allocNotNeeded)
   325  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   326  		desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
   327  		return true
   328  	}
   329  
   330  	// Get the deployment state for the group
   331  	var dstate *structs.DeploymentState
   332  	existingDeployment := false
   333  	if a.deployment != nil {
   334  		dstate, existingDeployment = a.deployment.TaskGroups[group]
   335  	}
   336  	if !existingDeployment {
   337  		autorevert := false
   338  		if tg.Update != nil && tg.Update.AutoRevert {
   339  			autorevert = true
   340  		}
   341  		dstate = &structs.DeploymentState{
   342  			AutoRevert: autorevert,
   343  		}
   344  	}
   345  
   346  	// Filter allocations that do not need to be considered because they are
   347  	// from an older job version and are terminal.
   348  	all, ignore := a.filterOldTerminalAllocs(all)
   349  	desiredChanges.Ignore += uint64(len(ignore))
   350  
   351  	canaries, all := a.handleGroupCanaries(all, desiredChanges)
   352  
   353  	// Determine what set of allocations are on tainted nodes
   354  	untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
   355  
   356  	// Determine what set of terminal allocations need to be rescheduled
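        	// rescheduleNow holds allocations that should be replaced as part of this
        	// evaluation, while rescheduleLater holds allocations whose reschedule
        	// time has not yet arrived; the latter get delayed follow-up evaluations
        	// via handleDelayedReschedules below.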
   357  	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID)
   358  
   359  	// Create batched follow-up evaluations for allocations that are
   360  	// reschedulable later and mark the allocations for in-place updating
   361  	a.handleDelayedReschedules(rescheduleLater, all, tg.Name)
   362  
   363  	// Create a structure for choosing names. Seed with the taken names, which is
   364  	// the union of the untainted, migrating and rescheduling-now allocations (includes canaries)
   365  	nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))
   366  
   367  	// Stop any unneeded allocations and update the untainted set to not
   368  	// include stopped allocations.
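        	// canaryState is true while the deployment has requested canaries for this
        	// group that have not yet been promoted; in that state canaries are spared
        	// from computeStop and regular placements are held back further below.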
   369  	canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
   370  	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
   371  	desiredChanges.Stop += uint64(len(stop))
   372  	untainted = untainted.difference(stop)
   373  
   374  	// Having stopped unneeded allocations, append the canaries to the existing
   375  	// set of untainted because they have been promoted (or none were desired).
   376  	// This will cause them to be treated like non-canaries
   377  	if !canaryState {
   378  		untainted = untainted.union(canaries)
   379  		nameIndex.Set(canaries)
   380  	}
   381  
   382  	// Do inplace upgrades where possible and capture the set of upgrades that
   383  	// need to be done destructively.
   384  	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
   385  	desiredChanges.Ignore += uint64(len(ignore))
   386  	desiredChanges.InPlaceUpdate += uint64(len(inplace))
   387  	if !existingDeployment {
   388  		dstate.DesiredTotal += len(destructive) + len(inplace)
   389  	}
   390  
   391  	// The fact that we have destructive updates and fewer canaries than
   392  	// desired means we need to create canaries
   393  	numDestructive := len(destructive)
   394  	strategy := tg.Update
   395  	canariesPromoted := dstate != nil && dstate.Promoted
   396  	requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
   397  	if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
   398  		number := strategy.Canary - len(canaries)
   399  		number = helper.IntMin(numDestructive, number)
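        		// For example, with strategy.Canary = 2, one existing canary, and four
        		// pending destructive updates, number = min(4, 2-1) = 1 new canary.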
   400  		desiredChanges.Canary += uint64(number)
   401  		if !existingDeployment {
   402  			dstate.DesiredCanaries = strategy.Canary
   403  		}
   404  
   405  		for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
   406  			a.result.place = append(a.result.place, allocPlaceResult{
   407  				name:      name,
   408  				canary:    true,
   409  				taskGroup: tg,
   410  			})
   411  		}
   412  	}
   413  
   414  	// Determine how many we can place
   415  	canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
   416  	limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)
   417  
   418  	// Place if:
   419  	// * The deployment is not paused or failed
   420  	// * Not placing any canaries
   421  	// * If there are any canaries, they have been promoted
   422  	place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
   423  	if !existingDeployment {
   424  		dstate.DesiredTotal += len(place)
   425  	}
   426  
   427  	// deploymentPlaceReady tracks whether the deployment is in a state where
   428  	// placements can be made without any other consideration.
   429  	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState
   430  
   431  	if deploymentPlaceReady {
   432  		desiredChanges.Place += uint64(len(place))
   433  		for _, p := range place {
   434  			a.result.place = append(a.result.place, p)
   435  		}
   436  
   437  		min := helper.IntMin(len(place), limit)
   438  		limit -= min
   439  	} else if !deploymentPlaceReady && len(lost) != 0 {
   440  		// We are in a situation where we shouldn't be placing more than we need
   441  		// to, but we have lost allocations. It is a very poor user experience if
   442  		// a node goes down and Nomad doesn't replace the allocations because the
   443  		// deployment is paused/failed, so we place only enough to recover the
   444  		// lost allocations.
   445  		allowed := helper.IntMin(len(lost), len(place))
   446  		desiredChanges.Place += uint64(allowed)
   447  		for _, p := range place[:allowed] {
   448  			a.result.place = append(a.result.place, p)
   449  		}
   450  	}
   451  
   452  	if deploymentPlaceReady {
   453  		// Do all destructive updates
   454  		min := helper.IntMin(len(destructive), limit)
   455  		desiredChanges.DestructiveUpdate += uint64(min)
   456  		desiredChanges.Ignore += uint64(len(destructive) - min)
   457  		for _, alloc := range destructive.nameOrder()[:min] {
   458  			a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
   459  				placeName:             alloc.Name,
   460  				placeTaskGroup:        tg,
   461  				stopAlloc:             alloc,
   462  				stopStatusDescription: allocUpdating,
   463  			})
   464  		}
   465  	} else {
   466  		desiredChanges.Ignore += uint64(len(destructive))
   467  	}
   468  
   469  	// Calculate the allowed number of changes and set the desired changes
   470  	// accordingly.
   471  	if !a.deploymentFailed && !a.deploymentPaused {
   472  		desiredChanges.Migrate += uint64(len(migrate))
   473  	} else {
   474  		desiredChanges.Stop += uint64(len(migrate))
   475  	}
   476  
   477  	for _, alloc := range migrate.nameOrder() {
   478  		// If the deployment is failed or paused, don't replace it, just mark as stop.
   479  		if a.deploymentFailed || a.deploymentPaused {
   480  			a.result.stop = append(a.result.stop, allocStopResult{
   481  				alloc:             alloc,
   482  				statusDescription: allocNodeTainted,
   483  			})
   484  			continue
   485  		}
   486  
   487  		a.result.stop = append(a.result.stop, allocStopResult{
   488  			alloc:             alloc,
   489  			statusDescription: allocMigrating,
   490  		})
   491  		a.result.place = append(a.result.place, allocPlaceResult{
   492  			name:          alloc.Name,
   493  			canary:        false,
   494  			taskGroup:     tg,
   495  			previousAlloc: alloc,
   496  		})
   497  	}
   498  
   499  	// Create new deployment if:
   500  	// 1. Updating a job specification
   501  	// 2. No running allocations (first time running a job)
   502  	updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
   503  	hadRunning := false
   504  	for _, alloc := range all {
   505  		if alloc.Job.Version == a.job.Version {
   506  			hadRunning = true
   507  			break
   508  		}
   509  	}
   510  
   511  	// Create a new deployment if necessary
   512  	if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
   513  		// A previous group may have made the deployment already
   514  		if a.deployment == nil {
   515  			a.deployment = structs.NewDeployment(a.job)
   516  			a.result.deployment = a.deployment
   517  		}
   518  
   519  		// Attach the group's deployment state to the deployment
   520  		a.deployment.TaskGroups[group] = dstate
   521  	}
   522  
   523  	// deploymentComplete is whether the deployment is complete, which largely
   524  	// means that no placements were made or desired to be made
   525  	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate) == 0 && !requireCanary
   526  
   527  	// The final check for whether the deployment is complete is to ensure that
   528  	// everything in it is healthy
   529  	if deploymentComplete && a.deployment != nil {
   530  		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
   531  		for _, alloc := range partOf {
   532  			if !alloc.DeploymentStatus.IsHealthy() {
   533  				deploymentComplete = false
   534  				break
   535  			}
   536  		}
   537  	}
   538  
   539  	return deploymentComplete
   540  }
   541  
   542  // filterOldTerminalAllocs filters out allocations that should be ignored because
   543  // they are terminal allocations from a previous job version.
   544  func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
   545  	if !a.batch {
   546  		return all, nil
   547  	}
   548  
   549  	filtered = filtered.union(all)
   550  	ignored := make(map[string]*structs.Allocation)
   551  
   552  	// Ignore terminal allocations from older job versions
   553  	for id, alloc := range filtered {
   554  		older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
   555  		if older && alloc.TerminalStatus() {
   556  			delete(filtered, id)
   557  			ignored[id] = alloc
   558  		}
   559  	}
   560  
   561  	return filtered, ignored
   562  }
   563  
   564  // handleGroupCanaries handles the canaries for the group by stopping the
   565  // unneeded ones and returning the current set of canaries and the updated total
   566  // set of allocs for the group
   567  func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
   568  	// Stop any canary from an older deployment or from a failed one
   569  	var stop []string
   570  
   571  	// Cancel any non-promoted canaries from the older deployment
   572  	if a.oldDeployment != nil {
   573  		for _, s := range a.oldDeployment.TaskGroups {
   574  			if !s.Promoted {
   575  				stop = append(stop, s.PlacedCanaries...)
   576  			}
   577  		}
   578  	}
   579  
   580  	// Cancel any non-promoted canaries from a failed deployment
   581  	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
   582  		for _, s := range a.deployment.TaskGroups {
   583  			if !s.Promoted {
   584  				stop = append(stop, s.PlacedCanaries...)
   585  			}
   586  		}
   587  	}
   588  
   589  	// stopSet is the allocSet that contains the canaries we desire to stop from
   590  	// above.
   591  	stopSet := all.fromKeys(stop)
   592  	a.markStop(stopSet, "", allocNotNeeded)
   593  	desiredChanges.Stop += uint64(len(stopSet))
   594  	all = all.difference(stopSet)
   595  
   596  	// Capture our current set of canaries and handle any migrations that are
   597  	// needed by just stopping them.
   598  	if a.deployment != nil {
   599  		var canaryIDs []string
   600  		for _, s := range a.deployment.TaskGroups {
   601  			canaryIDs = append(canaryIDs, s.PlacedCanaries...)
   602  		}
   603  
   604  		canaries = all.fromKeys(canaryIDs)
   605  		untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
   606  		a.markStop(migrate, "", allocMigrating)
   607  		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   608  
   609  		canaries = untainted
   610  		all = all.difference(migrate, lost)
   611  	}
   612  
   613  	return canaries, all
   614  }
   615  
   616  // computeLimit returns the placement limit for a particular group. The inputs
   617  // are the group definition, the untainted, destructive, and migrate allocation
   618  // sets and whether we are in a canary state.
   619  func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
   620  	// If there is no update strategy, or nothing requires a destructive update
   621  	// or migration, we can deploy as many as the group has
   622  	if group.Update == nil || len(destructive)+len(migrate) == 0 {
   623  		return group.Count
   624  	} else if a.deploymentPaused || a.deploymentFailed {
   625  		// If the deployment is paused or failed, do not create anything else
   626  		return 0
   627  	}
   628  
   629  	// If we have canaries and they have not been promoted the limit is 0
   630  	if canaryState {
   631  		return 0
   632  	}
   633  
   634  	// If we have been promoted or there are no canaries, the limit is the
   635  	// configured MaxParallel minus any outstanding non-healthy alloc for the
   636  	// deployment
   637  	limit := group.Update.MaxParallel
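        	// For example, with MaxParallel = 3 and two allocations in the current
        	// deployment that are placed but not yet marked healthy (and none marked
        	// unhealthy), the loop below leaves limit = 3 - 2 = 1.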
   638  	if a.deployment != nil {
   639  		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
   640  		for _, alloc := range partOf {
   641  			// An unhealthy allocation means nothing else should happen.
   642  			if alloc.DeploymentStatus.IsUnhealthy() {
   643  				return 0
   644  			}
   645  
   646  			if !alloc.DeploymentStatus.IsHealthy() {
   647  				limit--
   648  			}
   649  		}
   650  	}
   651  
   652  	// The limit can be less than zero in the case that the job was changed such
   653  	// that it required destructive changes and the count was scaled up.
   654  	if limit < 0 {
   655  		return 0
   656  	}
   657  
   658  	return limit
   659  }
   660  
   661  // computePlacements returns the set of allocations to place given the group
   662  // definition, the set of untainted, migrating and reschedule allocations for the group.
   663  func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
   664  	nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {
   665  
   666  	// Hot path the nothing to do case
   667  	existing := len(untainted) + len(migrate)
   668  	if existing >= group.Count {
   669  		return nil
   670  	}
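        	// For example, with group.Count = 5, three untainted allocations, no
        	// migrations, and two allocations to reschedule now, the two reschedules
        	// below consume the remaining slots and no new names are drawn from
        	// nameIndex.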
   671  	var place []allocPlaceResult
   672  	// Add rescheduled placement results.
   673  	// Any allocations being rescheduled will remain at DesiredStatusRun, ClientStatusFailed
   674  	for _, alloc := range reschedule {
   675  		place = append(place, allocPlaceResult{
   676  			name:          alloc.Name,
   677  			taskGroup:     group,
   678  			previousAlloc: alloc,
   679  			reschedule:    true,
   680  		})
   681  		existing += 1
   682  		if existing == group.Count {
   683  			break
   684  		}
   685  	}
   686  	// Add remaining placement results
   687  	if existing < group.Count {
   688  		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
   689  			place = append(place, allocPlaceResult{
   690  				name:      name,
   691  				taskGroup: group,
   692  			})
   693  		}
   694  	}
   695  
   696  	return place
   697  }
   698  
   699  // computeStop returns the set of allocations that are marked for stopping given
   700  // the group definition, the set of allocations in various states and whether we
   701  // are canarying.
   702  func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
   703  	untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {
   704  
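        	// Lost allocations are always stopped. Any surplus beyond group.Count is
        	// then removed in a preference order: (once canaries are promoted) any
        	// non-canary allocation sharing a canary's name, then allocations that
        	// would otherwise migrate, and finally the untainted allocations with the
        	// highest name indexes.
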
   705  	// Mark all lost allocations for stop. Previous allocation doesn't matter
   706  	// here since it is on a lost node
   707  	var stop allocSet
   708  	stop = stop.union(lost)
   709  	a.markStop(lost, structs.AllocClientStatusLost, allocLost)
   710  
   711  	// If we are still deploying or creating canaries, don't stop them
   712  	if canaryState {
   713  		untainted = untainted.difference(canaries)
   714  	}
   715  
   716  	// Hot path the nothing to do case
   717  	remove := len(untainted) + len(migrate) - group.Count
   718  	if remove <= 0 {
   719  		return stop
   720  	}
   721  
   722  	// Filter out any terminal allocations from the untainted set
   723  	// This is so that we don't try to mark them as stopped redundantly
   724  	untainted = filterByTerminal(untainted)
   725  
   726  	// Prefer stopping any alloc that has the same name as the canaries if we
   727  	// are promoted
   728  	if !canaryState && len(canaries) != 0 {
   729  		canaryNames := canaries.nameSet()
   730  		for id, alloc := range untainted.difference(canaries) {
   731  			if _, match := canaryNames[alloc.Name]; match {
   732  				stop[id] = alloc
   733  				a.result.stop = append(a.result.stop, allocStopResult{
   734  					alloc:             alloc,
   735  					statusDescription: allocNotNeeded,
   736  				})
   737  				delete(untainted, id)
   738  
   739  				remove--
   740  				if remove == 0 {
   741  					return stop
   742  				}
   743  			}
   744  		}
   745  	}
   746  
   747  	// Prefer selecting from the migrating set before stopping existing allocs
   748  	if len(migrate) != 0 {
   749  		mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
   750  		removeNames := mNames.Highest(uint(remove))
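        		// mNames.Highest picks the highest-indexed names from the migrating set,
        		// so the surplus migrations are stopped here rather than being replaced
        		// on another node.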
   751  		for id, alloc := range migrate {
   752  			if _, match := removeNames[alloc.Name]; !match {
   753  				continue
   754  			}
   755  			a.result.stop = append(a.result.stop, allocStopResult{
   756  				alloc:             alloc,
   757  				statusDescription: allocNotNeeded,
   758  			})
   759  			delete(migrate, id)
   760  			stop[id] = alloc
   761  			nameIndex.UnsetIndex(alloc.Index())
   762  
   763  			remove--
   764  			if remove == 0 {
   765  				return stop
   766  			}
   767  		}
   768  	}
   769  
   770  	// Select the allocs with the highest name indexes to remove
   771  	removeNames := nameIndex.Highest(uint(remove))
   772  	for id, alloc := range untainted {
   773  		if _, ok := removeNames[alloc.Name]; ok {
   774  			stop[id] = alloc
   775  			a.result.stop = append(a.result.stop, allocStopResult{
   776  				alloc:             alloc,
   777  				statusDescription: allocNotNeeded,
   778  			})
   779  			delete(untainted, id)
   780  
   781  			remove--
   782  			if remove == 0 {
   783  				return stop
   784  			}
   785  		}
   786  	}
   787  
   788  	// It is possible that we didn't stop as many as we should have if there
   789  	// were allocations with duplicate names.
   790  	for id, alloc := range untainted {
   791  		stop[id] = alloc
   792  		a.result.stop = append(a.result.stop, allocStopResult{
   793  			alloc:             alloc,
   794  			statusDescription: allocNotNeeded,
   795  		})
   796  		delete(untainted, id)
   797  
   798  		remove--
   799  		if remove == 0 {
   800  			return stop
   801  		}
   802  	}
   803  
   804  	return stop
   805  }
   806  
   807  // computeUpdates determines which allocations for the passed group require
   808  // updates. Three groups are returned:
   809  // 1. Those that require no upgrades
   810  // 2. Those that can be upgraded in-place. These are added to the results
   811  // automatically since the function contains the correct state to do so.
   812  // 3. Those that require destructive updates
   813  func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
   814  	// Determine the set of allocations that need to be updated
   815  	ignore = make(map[string]*structs.Allocation)
   816  	inplace = make(map[string]*structs.Allocation)
   817  	destructive = make(map[string]*structs.Allocation)
   818  
   819  	for _, alloc := range untainted {
   820  		ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
   821  		if ignoreChange {
   822  			ignore[alloc.ID] = alloc
   823  		} else if destructiveChange {
   824  			destructive[alloc.ID] = alloc
   825  		} else {
   826  			inplace[alloc.ID] = alloc
   827  			a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
   828  		}
   829  	}
   830  
   831  	return
   832  }
   833  
   834  // handleDelayedReschedules creates batched follow-up evaluations with the WaitUntil field set
   835  // for allocations that are eligible to be rescheduled later
   836  func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
   837  	if len(rescheduleLater) == 0 {
   838  		return
   839  	}
   840  
   841  	// Sort by time
   842  	sort.Slice(rescheduleLater, func(i, j int) bool {
   843  		return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
   844  	})
   845  
   846  	var evals []*structs.Evaluation
   847  	nextReschedTime := rescheduleLater[0].rescheduleTime
   848  	allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))
   849  
   850  	// Create a new eval for the first batch
   851  	eval := &structs.Evaluation{
   852  		ID:                uuid.Generate(),
   853  		Namespace:         a.job.Namespace,
   854  		Priority:          a.job.Priority,
   855  		Type:              a.job.Type,
   856  		TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
   857  		JobID:             a.job.ID,
   858  		JobModifyIndex:    a.job.ModifyIndex,
   859  		Status:            structs.EvalStatusPending,
   860  		StatusDescription: reschedulingFollowupEvalDesc,
   861  		WaitUntil:         nextReschedTime,
   862  	}
   863  	evals = append(evals, eval)
   864  
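        	// Walk the allocations in reschedule-time order; any allocation whose
        	// reschedule time falls within batchedFailedAllocWindowSize of the current
        	// batch's start time shares that batch's follow-up evaluation, otherwise a
        	// new batch (and eval) is started.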
   865  	for _, allocReschedInfo := range rescheduleLater {
   866  		if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
   867  			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
   868  		} else {
   869  			// Start a new batch
   870  			nextReschedTime = allocReschedInfo.rescheduleTime
   871  			// Create a new eval for the new batch
   872  			eval = &structs.Evaluation{
   873  				ID:             uuid.Generate(),
   874  				Namespace:      a.job.Namespace,
   875  				Priority:       a.job.Priority,
   876  				Type:           a.job.Type,
   877  				TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
   878  				JobID:          a.job.ID,
   879  				JobModifyIndex: a.job.ModifyIndex,
   880  				Status:         structs.EvalStatusPending,
   881  				WaitUntil:      nextReschedTime,
   882  			}
   883  			evals = append(evals, eval)
   884  			// Set the evalID for the first alloc in this new batch
   885  			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
   886  		}
   887  	}
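        	// For example, reschedule times of t+10s, t+12s, and t+20s produce two
        	// follow-up evaluations given the 5s window: one with WaitUntil = t+10s
        	// covering the first two allocations, and one with WaitUntil = t+20s for
        	// the third.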
   888  
   889  	a.result.desiredFollowupEvals[tgName] = evals
   890  
   891  	// Initialize the annotations
   892  	if len(allocIDToFollowupEvalID) != 0 && a.result.attributeUpdates == nil {
   893  		a.result.attributeUpdates = make(map[string]*structs.Allocation)
   894  	}
   895  
   896  	// Create in-place updates for every alloc ID that needs to be updated with its follow up eval ID
   897  	for allocID, evalID := range allocIDToFollowupEvalID {
   898  		existingAlloc := all[allocID]
   899  		updatedAlloc := existingAlloc.Copy()
   900  		updatedAlloc.FollowupEvalID = evalID
   901  		a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
   902  	}
   903  }